factored out arabic_utilities.py as a separate file

2026-05-27 08:58:55 -06:00 · 2023-09-25 09:22:21 +02:00
parent 561a8c130d
commit 1c355ce9dd
3 changed files with 172 additions and 136 deletions
--- a/src/arabic/wiktionary/arabic_utilities.py
+++ b/src/arabic/wiktionary/arabic_utilities.py
@@ -0,0 +1,169 @@
 # utilities for Arabic script
 # in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
 # as specified by the command line argument:
 #
 #   % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
 #   % diff MorphoDictAra.gf b.tmp 
 #   % 
 def is_arabic(s):
     """Return True if s contains at least one character in U+0626..U+0650.
     Always returns a bool: empty or None input gives False (previously the
     falsy operand itself was returned).
     """
     # NOTE(review): the range starts at U+0626, so U+0621-U+0625 on their own
     # do not make a string count as Arabic -- confirm this is intended
     return bool(s) and any(0x626 <= ord(c) <= 0x650 for c in s)
 def get_arabic(s):
     """Return the characters of s that is_arabic accepts, in their original order."""
     return ''.join(filter(is_arabic, s))
 def unvocalize(s):
     """Keep only the characters of s in U+0621..U+064A, dropping everything else."""
     return ''.join(ch for ch in s if '\u0621' <= ch <= '\u064a')
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 # Arabic code point -> Buckwalter ASCII symbol
 buckwalter_dict = {
  0x621: "'",  # ء
  0x622: '|',  # آ
  0x623: '>',  # أ
  0x624: '&',  # ؤ
  0x625: '<',  # إ
  0x626: '}',  # ئ
  0x627: 'A',  # ا
  0x628: 'b',  # ب
  0x629: 'p',  # ة
  0x62a: 't',  # ت
  0x62b: 'v',  # ث
  0x62c: 'j',  # ج
  0x62d: 'H',  # ح
  0x62e: 'x',  # خ
  0x62f: 'd',  # د
  0x630: '*',  # ذ
  0x631: 'r',  # ر
  0x632: 'z',  # ز
  0x633: 's',  # س
  0x634: '$',  # ش
  0x635: 'S',  # ص
  0x636: 'D',  # ض
  0x637: 'T',  # ط
  0x638: 'Z',  # ظ
  0x639: 'E',  # ع
  0x63a: 'g',  # غ
  0x641: 'f',  # ف
  0x642: 'q',  # ق
  0x643: 'k',  # ك
  0x644: 'l',  # ل
  0x645: 'm',  # م
  0x646: 'n',  # ن
  0x647: 'h',  # ه
  0x648: 'w',  # و
  0x649: 'Y',  # ى
  0x64a: 'y',  # ي
  0x64b: 'F',  # ً  (fathatan)
  0x64c: 'N',  # ٌ  (dammatan)
  0x64d: 'K',  # ٍ  (kasratan)
  0x64e: 'a',  # َ  (fatha)
  0x64f: 'u',  # ُ  (damma)
  0x650: 'i',  # ِ  (kasra)
  0x651: '~',  # ّ  (shadda)
  0x652: 'o',  # ْ  (sukun)
  0x670: '`',  # ٰ  (superscript alef)
  0x671: '{'   # ٱ  (alef wasla)
  }
 # reverse table: Buckwalter ASCII symbol -> Arabic character
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 # the three tanwin marks (0x64b-0x64d) and the three short vowel marks (0x64e-0x650)
 arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
 # every code point from 0x628 up to and including 0x647
 # NOTE(review): this range also contains 0x629 (ta marbuta) and 0x63b-0x640
 # (including tatweel 0x640) -- confirm these are meant to count as radicals
 sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
 def to_buckwalter(s):
     """Replace every character of s that has a Buckwalter symbol by that symbol."""
     # str.translate looks characters up by code point, which is how
     # buckwalter_dict is keyed; unmapped characters pass through unchanged
     return s.translate(buckwalter_dict)
 def from_buckwalter(s):
     """Replace every Buckwalter symbol in s by its Arabic character; keep the rest."""
     converted = []
     for symbol in s:
         converted.append(buckwalter_dict_rev.get(symbol, symbol))
     return ''.join(converted)
 def drop_final_vowel(s):
     """Return s without its last character if that character is in arabic_vowels.
     Any other string, including the empty string, is returned unchanged.
     """
     # slicing instead of indexing: s[-1:] is '' for empty input (never in the
     # set), whereas s[-1] raised IndexError
     if s[-1:] in arabic_vowels:
         return s[:-1]
     return s
 def normal(s):
     """Return s in Unicode canonical decomposition form (NFD)."""
     # imported here because this module has no top-level import of unicodedata;
     # without it the call below raised NameError
     import unicodedata
     return unicodedata.normalize('NFD', s)
 # heuristic for finding the three radicals from certain forms
 # works only for sound (strong) 3-radical roots, otherwise None
 def get_sound_trigram_root(s):
     """Return the characters of s found in sound_consonants if there are exactly three.
     The characters are returned in order as one string; otherwise the result is None.
     """
     radicals = ''.join(ch for ch in s if ch in sound_consonants)
     return radicals if len(radicals) == 3 else None
 # reverse engineer fcl pattern from a given form, with a sound trigram root
 # one more condition: each of the root letters occurs exactly once
 # TODO: better use the given root of the lex entry
 def get_sound_fcl_pattern(s):
     """Return s with its three root consonants replaced by the fcl template letters.
     The root is taken from get_sound_trigram_root(s); its first, second and third
     letters are replaced, in order of occurrence, by U+0641, U+0639 and U+0644.
     Returns None when no root is found or when a root letter is repeated.
     """
     root = get_sound_trigram_root(s)
     # each root letter must occur exactly once, i.e. the three radicals are
     # distinct; the former test len([c in s for c in root]) == 3 only measured
     # the length of a 3-element list and so was true for every root
     if not root or len(set(root)) != 3:
         return None
     pattern = list(s)
     pos = -1
     for radical, placeholder in zip(root, '\u0641\u0639\u0644'):
         # search strictly after the previous radical so the positions stay ordered
         pos = s.index(radical, pos + 1)
         pattern[pos] = placeholder
     return ''.join(pattern)
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
 # see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
 # unicodedata.normalize does this wrong, as noted by Ariel Gutman 
 ## todo: more direct implementation
 def reorder_shadda(s):
     """Rewrite each short vowel + shadda pair in s as shadda + short vowel.
     Works directly on the Arabic code points. The earlier round trip through
     Buckwalter also converted any ASCII character that happens to be a
     Buckwalter symbol into Arabic, corrupting mixed strings.
     """
     shadda = '\u0651'
     # fatha, damma, kasra -- replaced one after another in this fixed order
     for vowel in ('\u064e', '\u064f', '\u0650'):
         s = s.replace(vowel + shadda, shadda + vowel)
     return s
 # quote word forms but not parameters
 def quote_if(s, cond=is_arabic, change=reorder_shadda):
     """Return change(s) wrapped in double quotes when cond(s) holds, else s itself."""
     if not cond(s):
         return s
     return '"' + change(s) + '"'
 # for a string, change each string literal in "..." with a change function
 # leaving other characters as they are; print the string to stdout as you go
 def change_literals(s, change):
     """Print s to stdout with every "..." literal replaced by its changed form.
     Characters outside literals are printed as they are. Each complete literal
     is printed as '"' + change(content) + '"'. A literal that is still open when
     s ends is printed unchanged (it used to be dropped from the output).
     Nothing is returned and no newline is added.
     """
     pending = None  # characters of the literal being read; None outside a literal
     for c in s:
         if c != '"':
             if pending is None:
                 print(c, end='')
             else:
                 pending.append(c)
         elif pending is None:
             # opening quote: start collecting
             pending = []
         else:
             # closing quote: emit the converted literal
             print('"' + change(''.join(pending)) + '"', end='')
             pending = None
     if pending is not None:
         # unterminated literal: pass it through untouched rather than losing it
         print('"' + ''.join(pending), end='')
 # convert literals in stdin 'to' or 'from' Buckwalter
 if __name__ == '__main__':
     import sys
     # command line mode -> conversion applied to each string literal
     converters = {'from': from_buckwalter, 'to': to_buckwalter}
     # a missing argument used to raise IndexError and an unknown mode silently
     # consumed stdin without printing anything; report both as a usage error
     if len(sys.argv) < 2 or sys.argv[1] not in converters:
         sys.exit('usage: python3 arabic_utilities.py (to|from) <input >output')
     convert = converters[sys.argv[1]]
     for line in sys.stdin:
         change_literals(line, convert)
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -3,7 +3,7 @@ import json
 import sys
 import unicodedata
 import pgf
-
+from arabic_utilities import *
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
@@ -110,132 +110,6 @@ if MODE == 'error-analysis':
    for labverdict, n in sorted(list(evals.items())):
        print(labverdict, n)
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 # Arabic code point -> Buckwalter ASCII symbol
 buckwalter_dict = {
  0x621: "'",  # ء
  0x622: '|',  # آ
  0x623: '>',  # أ
  0x624: '&',  # ؤ
  0x625: '<',  # إ
  0x626: '}',  # ئ
  0x627: 'A',  # ا
  0x628: 'b',  # ب
  0x629: 'p',  # ة
  0x62a: 't',  # ت
  0x62b: 'v',  # ث
  0x62c: 'j',  # ج
  0x62d: 'H',  # ح
  0x62e: 'x',  # خ
  0x62f: 'd',  # د
  0x630: '*',  # ذ
  0x631: 'r',  # ر
  0x632: 'z',  # ز
  0x633: 's',  # س
  0x634: '$',  # ش
  0x635: 'S',  # ص
  0x636: 'D',  # ض
  0x637: 'T',  # ط
  0x638: 'Z',  # ظ
  0x639: 'E',  # ع
  0x63a: 'g',  # غ
  0x641: 'f',  # ف
  0x642: 'q',  # ق
  0x643: 'k',  # ك
  0x644: 'l',  # ل
  0x645: 'm',  # م
  0x646: 'n',  # ن
  0x647: 'h',  # ه
  0x648: 'w',  # و
  0x649: 'Y',  # ى
  0x64a: 'y',  # ي
  0x64b: 'F',  # ً  (fathatan)
  0x64c: 'N',  # ٌ  (dammatan)
  0x64d: 'K',  # ٍ  (kasratan)
  0x64e: 'a',  # َ  (fatha)
  0x64f: 'u',  # ُ  (damma)
  0x650: 'i',  # ِ  (kasra)
  0x651: '~',  # ّ  (shadda)
  0x652: 'o',  # ْ  (sukun)
  0x670: '`',  # ٰ  (superscript alef)
  0x671: '{'   # ٱ  (alef wasla)
  }
 # reverse table: Buckwalter ASCII symbol -> Arabic character
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 # the three tanwin marks (0x64b-0x64d) and the three short vowel marks (0x64e-0x650)
 arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
 # every code point from 0x628 up to and including 0x647
 # NOTE(review): this range also contains 0x629 (ta marbuta) and 0x63b-0x640
 # (including tatweel 0x640) -- confirm these are meant to count as radicals
 sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
 def to_buckwalter(s):
     """Replace every character of s that has a Buckwalter symbol by that symbol."""
     # str.translate looks characters up by code point, which is how
     # buckwalter_dict is keyed; unmapped characters pass through unchanged
     return s.translate(buckwalter_dict)
 def from_buckwalter(s):
     """Replace every Buckwalter symbol in s by its Arabic character; keep the rest."""
     converted = []
     for symbol in s:
         converted.append(buckwalter_dict_rev.get(symbol, symbol))
     return ''.join(converted)
 def unvocalize(s):
     """Keep only the characters of s in U+0621..U+064A, dropping everything else."""
     return ''.join(ch for ch in s if '\u0621' <= ch <= '\u064a')
 def drop_final_vowel(s):
     """Return s without its last character if that character is in arabic_vowels.
     Any other string, including the empty string, is returned unchanged.
     """
     # slicing instead of indexing: s[-1:] is '' for empty input (never in the
     # set), whereas s[-1] raised IndexError
     if s[-1:] in arabic_vowels:
         return s[:-1]
     return s
 def is_arabic(s):
     """Return True if s contains at least one character in U+0626..U+0650.
     Always returns a bool: empty or None input gives False (previously the
     falsy operand itself was returned).
     """
     # NOTE(review): the range starts at U+0626, so U+0621-U+0625 on their own
     # do not make a string count as Arabic -- confirm this is intended
     return bool(s) and any(0x626 <= ord(c) <= 0x650 for c in s)
 def normal(s):
     """Return s in Unicode canonical decomposition form (NFD)."""
     decomposed = unicodedata.normalize('NFD', s)
     return decomposed
 # heuristic for finding the three radicals from certain forms
 # works only for sound (strong) 3-radical roots, otherwise None
 def get_sound_trigram_root(s):
     """Return the characters of s found in sound_consonants if there are exactly three.
     The characters are returned in order as one string; otherwise the result is None.
     """
     radicals = ''.join(ch for ch in s if ch in sound_consonants)
     return radicals if len(radicals) == 3 else None
 # reverse engineer fcl pattern from a given form, with a sound trigram root
 # one more condition: each of the root letters occurs exactly once
 # TODO: better use the given root of the lex entry
 def get_sound_fcl_pattern(s):
     """Return s with its three root consonants replaced by the fcl template letters.
     The root is taken from get_sound_trigram_root(s); its first, second and third
     letters are replaced, in order of occurrence, by U+0641, U+0639 and U+0644.
     Returns None when no root is found or when a root letter is repeated.
     """
     root = get_sound_trigram_root(s)
     # each root letter must occur exactly once, i.e. the three radicals are
     # distinct; the former test len([c in s for c in root]) == 3 only measured
     # the length of a 3-element list and so was true for every root
     if not root or len(set(root)) != 3:
         return None
     pattern = list(s)
     pos = -1
     for radical, placeholder in zip(root, '\u0641\u0639\u0644'):
         # search strictly after the previous radical so the positions stay ordered
         pos = s.index(radical, pos + 1)
         pattern[pos] = placeholder
     return ''.join(pattern)
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
 # see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
 # unicodedata.normalize does this wrong, as noted by Ariel Gutman 
 ## todo: more direct implementation
 def reorder_shadda(s):
     """Rewrite each short vowel + shadda pair in s as shadda + short vowel.
     Works directly on the Arabic code points. The earlier round trip through
     Buckwalter also converted any ASCII character that happens to be a
     Buckwalter symbol into Arabic, corrupting mixed strings.
     """
     shadda = '\u0651'
     # fatha, damma, kasra -- replaced one after another in this fixed order
     for vowel in ('\u064e', '\u064f', '\u0650'):
         s = s.replace(vowel + shadda, shadda + vowel)
     return s
 # quote word forms but not parameters
 def quote_if(s, cond=is_arabic, change=reorder_shadda):
     """Return change(s) wrapped in double quotes when cond(s) holds, else s itself."""
     if not cond(s):
         return s
     return '"' + change(s) + '"'
 # generate word_d_C functions starting with d=0, but show d only when >= 1
 def gf_fun(s, pos, disamb=0):
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -1,6 +1,8 @@
 import csv
 import json
 from arabic_utilities import *
 # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
 # the following are assumed
@@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv'  # Zarzoura
 # built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 def is_arabic(s):
     """Return True if s contains at least one character in U+0626..U+0650.
     Always returns a bool: empty or None input gives False (previously the
     falsy operand itself was returned).
     """
     # NOTE(review): the range starts at U+0626, so U+0621-U+0625 on their own
     # do not make a string count as Arabic -- confirm this is intended
     return bool(s) and any(0x626 <= ord(c) <= 0x650 for c in s)
 def get_arabic(s):
     """Return the characters of s that is_arabic accepts, in their original order."""
     return ''.join(filter(is_arabic, s))
 def unvocalize(s):
     """Keep only the characters of s in U+0621..U+064A, dropping everything else."""
     return ''.join(ch for ch in s if '\u0621' <= ch <= '\u064a')
 # fun 'دُبُ_N' : N ; -- 10 [['bear']]
 funmap = {}