mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
factored out arabic_utilities.py as a separate file
This commit is contained in:
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# utilities for Arabic script
|
||||
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
|
||||
# as specified by the command line argument:
|
||||
#
|
||||
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
|
||||
# % diff MorphoDictAra.gf b.tmp
|
||||
# %
|
||||
|
||||
def is_arabic(s):
|
||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
||||
|
||||
|
||||
def get_arabic(s):
|
||||
return ''.join([c for c in s if is_arabic(c)])
|
||||
|
||||
|
||||
def unvocalize(s):
|
||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
||||
|
||||
|
||||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||||
buckwalter_dict = {
|
||||
0x621: "'", # ء
|
||||
0x622: '|', # آ
|
||||
0x623: '>', # أ
|
||||
0x624: '&', # ؤ
|
||||
0x625: '<', # إ
|
||||
0x626: '}', # ئ
|
||||
0x627: 'A', # ا
|
||||
0x628: 'b', # ب
|
||||
0x629: 'p', # ة
|
||||
0x62a: 't', # ت
|
||||
0x62b: 'v', # ث
|
||||
0x62c: 'j', # ج
|
||||
0x62d: 'H', # ح
|
||||
0x62e: 'x', # خ
|
||||
0x62f: 'd', # د
|
||||
0x630: '*', # ذ
|
||||
0x631: 'r', # ر
|
||||
0x632: 'z', # ز
|
||||
0x633: 's', # س
|
||||
0x634: '$', # ش
|
||||
0x635: 'S', # ص
|
||||
0x636: 'D', # ض
|
||||
0x637: 'T', # ط
|
||||
0x638: 'Z', # ظ
|
||||
0x639: 'E', # ع
|
||||
0x63a: 'g', # غ
|
||||
0x641: 'f', # ف
|
||||
0x642: 'q', # ق
|
||||
0x643: 'k', # ك
|
||||
0x644: 'l', # ل
|
||||
0x645: 'm', # م
|
||||
0x646: 'n', # ن
|
||||
0x647: 'h', # ه
|
||||
0x648: 'w', # و
|
||||
0x649: 'Y', # ى
|
||||
0x64a: 'y', # ي
|
||||
0x64b: 'F', # ً
|
||||
0x64c: 'N', # ٌ
|
||||
0x64d: 'K', # ٍ
|
||||
0x64e: 'a', # َ
|
||||
0x64f: 'u', # ُ
|
||||
0x650: 'i', # ِ
|
||||
0x651: '~', # ّ
|
||||
0x652: 'o', # ْ
|
||||
0x670: '`', # '
|
||||
0x671: '{' # ٱ
|
||||
}
|
||||
|
||||
|
||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||
|
||||
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||
|
||||
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
|
||||
|
||||
def to_buckwalter(s):
|
||||
return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
|
||||
|
||||
|
||||
def from_buckwalter(s):
|
||||
return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
|
||||
|
||||
|
||||
def drop_final_vowel(s):
|
||||
if s[-1] in arabic_vowels:
|
||||
return s[:-1]
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
def normal(s):
|
||||
return unicodedata.normalize('NFD', s)
|
||||
|
||||
# heuristic for finding the three radicals from certain forms
|
||||
# works only for sound (strong) 3-radical roots, otherwise None
|
||||
def get_sound_trigram_root(s):
|
||||
sounds = [c for c in s if c in sound_consonants]
|
||||
if len(sounds) == 3:
|
||||
return ''.join(sounds)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
||||
# one more condition: each of the root letters occurs exactly ones
|
||||
# TODO: better use the given root of the lex entry
|
||||
def get_sound_fcl_pattern(s):
|
||||
if root := get_sound_trigram_root(s):
|
||||
if len([c in s for c in root]) == 3:
|
||||
p = list(s)
|
||||
r = s.find(root[0])
|
||||
p[r] = chr(0x641)
|
||||
r += s[r+1:].find(root[1]) + 1
|
||||
p[r] = chr(0x639)
|
||||
r += s[r+1:].find(root[2]) + 1
|
||||
p[r] = chr(0x644)
|
||||
p = ''.join(p)
|
||||
## print('---PATT', s, root, p)
|
||||
return p
|
||||
|
||||
|
||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||
# GF uses shadda+vowel which is linguistically correct
|
||||
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
||||
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
|
||||
## todo: more direct implementation
|
||||
def reorder_shadda(s):
|
||||
return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
|
||||
|
||||
|
||||
# quote word forms but not parameters
|
||||
def quote_if(s, cond=is_arabic, change=reorder_shadda):
|
||||
if cond(s):
|
||||
return '"' + change(s) + '"'
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
# for a string, change each string literal in "..." with a change function
|
||||
# leaving other characters as they are; print the string to stdout as you go
|
||||
def change_literals(s, change):
|
||||
inliteral = False
|
||||
literal = ''
|
||||
for c in s:
|
||||
if c == '"' and inliteral:
|
||||
print('"'+change(literal)+'"', end='')
|
||||
inliteral = False
|
||||
literal = ''
|
||||
elif c == '"':
|
||||
inliteral = True
|
||||
elif inliteral:
|
||||
literal += c
|
||||
else:
|
||||
print(c, end='')
|
||||
|
||||
|
||||
# convert literals in stdin 'to' or 'from' Buckwalter
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
mode = sys.argv[1]
|
||||
for line in sys.stdin:
|
||||
if mode == 'from':
|
||||
change_literals(line, from_buckwalter)
|
||||
elif mode == 'to':
|
||||
change_literals(line, to_buckwalter)
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import json
|
||||
import sys
|
||||
import unicodedata
|
||||
import pgf
|
||||
|
||||
from arabic_utilities import *
|
||||
|
||||
# data from https://kaikki.org/dictionary/rawdata.html
|
||||
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||||
@@ -110,132 +110,6 @@ if MODE == 'error-analysis':
|
||||
for labverdict, n in sorted(list(evals.items())):
|
||||
print(labverdict, n)
|
||||
|
||||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||||
buckwalter_dict = {
|
||||
0x621: "'", # ء
|
||||
0x622: '|', # آ
|
||||
0x623: '>', # أ
|
||||
0x624: '&', # ؤ
|
||||
0x625: '<', # إ
|
||||
0x626: '}', # ئ
|
||||
0x627: 'A', # ا
|
||||
0x628: 'b', # ب
|
||||
0x629: 'p', # ة
|
||||
0x62a: 't', # ت
|
||||
0x62b: 'v', # ث
|
||||
0x62c: 'j', # ج
|
||||
0x62d: 'H', # ح
|
||||
0x62e: 'x', # خ
|
||||
0x62f: 'd', # د
|
||||
0x630: '*', # ذ
|
||||
0x631: 'r', # ر
|
||||
0x632: 'z', # ز
|
||||
0x633: 's', # س
|
||||
0x634: '$', # ش
|
||||
0x635: 'S', # ص
|
||||
0x636: 'D', # ض
|
||||
0x637: 'T', # ط
|
||||
0x638: 'Z', # ظ
|
||||
0x639: 'E', # ع
|
||||
0x63a: 'g', # غ
|
||||
0x641: 'f', # ف
|
||||
0x642: 'q', # ق
|
||||
0x643: 'k', # ك
|
||||
0x644: 'l', # ل
|
||||
0x645: 'm', # م
|
||||
0x646: 'n', # ن
|
||||
0x647: 'h', # ه
|
||||
0x648: 'w', # و
|
||||
0x649: 'Y', # ى
|
||||
0x64a: 'y', # ي
|
||||
0x64b: 'F', # ً
|
||||
0x64c: 'N', # ٌ
|
||||
0x64d: 'K', # ٍ
|
||||
0x64e: 'a', # َ
|
||||
0x64f: 'u', # ُ
|
||||
0x650: 'i', # ِ
|
||||
0x651: '~', # ّ
|
||||
0x652: 'o', # ْ
|
||||
0x670: '`', # '
|
||||
0x671: '{' # ٱ
|
||||
}
|
||||
|
||||
|
||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||
|
||||
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||
|
||||
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
|
||||
|
||||
def to_buckwalter(s):
|
||||
return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
|
||||
|
||||
|
||||
def from_buckwalter(s):
|
||||
return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
|
||||
|
||||
|
||||
def unvocalize(s):
|
||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
||||
|
||||
|
||||
def drop_final_vowel(s):
|
||||
if s[-1] in arabic_vowels:
|
||||
return s[:-1]
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
def is_arabic(s):
|
||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
||||
|
||||
def normal(s):
|
||||
return unicodedata.normalize('NFD', s)
|
||||
|
||||
# heuristic for finding the three radicals from certain forms
|
||||
# works only for sound (strong) 3-radical roots, otherwise None
|
||||
def get_sound_trigram_root(s):
|
||||
sounds = [c for c in s if c in sound_consonants]
|
||||
if len(sounds) == 3:
|
||||
return ''.join(sounds)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
||||
# one more condition: each of the root letters occurs exactly ones
|
||||
# TODO: better use the given root of the lex entry
|
||||
def get_sound_fcl_pattern(s):
|
||||
if root := get_sound_trigram_root(s):
|
||||
if len([c in s for c in root]) == 3:
|
||||
p = list(s)
|
||||
r = s.find(root[0])
|
||||
p[r] = chr(0x641)
|
||||
r += s[r+1:].find(root[1]) + 1
|
||||
p[r] = chr(0x639)
|
||||
r += s[r+1:].find(root[2]) + 1
|
||||
p[r] = chr(0x644)
|
||||
p = ''.join(p)
|
||||
## print('---PATT', s, root, p)
|
||||
return p
|
||||
|
||||
|
||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||
# GF uses shadda+vowel which is linguistically correct
|
||||
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
||||
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
|
||||
## todo: more direct implementation
|
||||
def reorder_shadda(s):
|
||||
return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
|
||||
|
||||
|
||||
# quote word forms but not parameters
|
||||
def quote_if(s, cond=is_arabic, change=reorder_shadda):
|
||||
if cond(s):
|
||||
return '"' + change(s) + '"'
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
# generate word_d_C functions starting with d=0, but show d only when >= 1
|
||||
def gf_fun(s, pos, disamb=0):
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import csv
|
||||
import json
|
||||
|
||||
from arabic_utilities import *
|
||||
|
||||
# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
|
||||
# the following are assumed
|
||||
|
||||
@@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv' # Zarzoura
|
||||
# built as explained in ./read_wiktionary.py
|
||||
MORPHO_GF = 'MorphoDictAraAbs.gf'
|
||||
|
||||
def is_arabic(s):
|
||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
||||
|
||||
def get_arabic(s):
|
||||
return ''.join([c for c in s if is_arabic(c)])
|
||||
|
||||
def unvocalize(s):
|
||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
||||
|
||||
|
||||
# fun 'دُبُ_N' : N ; -- 10 [['bear']]
|
||||
funmap = {}
|
||||
|
||||
Reference in New Issue
Block a user