From 1c355ce9dd49d1fd59090bb41a91a573cc9ce1c1 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Mon, 25 Sep 2023 09:22:21 +0200 Subject: [PATCH] factored out arabic_utilities.py as a separate file --- src/arabic/wiktionary/arabic_utilities.py | 169 ++++++++++++++++++++++ src/arabic/wiktionary/read_wiktionary.py | 128 +--------------- src/arabic/wiktionary/to_wordnet.py | 11 +- 3 files changed, 172 insertions(+), 136 deletions(-) create mode 100644 src/arabic/wiktionary/arabic_utilities.py diff --git a/src/arabic/wiktionary/arabic_utilities.py b/src/arabic/wiktionary/arabic_utilities.py new file mode 100644 index 000000000..29a15f105 --- /dev/null +++ b/src/arabic/wiktionary/arabic_utilities.py @@ -0,0 +1,169 @@ +# utilities for Arabic script +# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter +# as specified by the command line argument: +# +# % python3 arabic_utilities.py to b.tmp +# % diff MorphoDictAra.gf b.tmp +# % + +def is_arabic(s): + return s and any(1574 <= ord(c) <= 1616 for c in s) + + +def get_arabic(s): + return ''.join([c for c in s if is_arabic(c)]) + + +def unvocalize(s): + return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + + +# https://en.wikipedia.org/wiki/Buckwalter_transliteration +buckwalter_dict = { + 0x621: "'", # ء + 0x622: '|', # آ + 0x623: '>', # أ + 0x624: '&', # ؤ + 0x625: '<', # إ + 0x626: '}', # ئ + 0x627: 'A', # ا + 0x628: 'b', # ب + 0x629: 'p', # ة + 0x62a: 't', # ت + 0x62b: 'v', # ث + 0x62c: 'j', # ج + 0x62d: 'H', # ح + 0x62e: 'x', # خ + 0x62f: 'd', # د + 0x630: '*', # ذ + 0x631: 'r', # ر + 0x632: 'z', # ز + 0x633: 's', # س + 0x634: '$', # ش + 0x635: 'S', # ص + 0x636: 'D', # ض + 0x637: 'T', # ط + 0x638: 'Z', # ظ + 0x639: 'E', # ع + 0x63a: 'g', # غ + 0x641: 'f', # ف + 0x642: 'q', # ق + 0x643: 'k', # ك + 0x644: 'l', # ل + 0x645: 'm', # م + 0x646: 'n', # ن + 0x647: 'h', # ه + 0x648: 'w', # و + 0x649: 'Y', # ى + 0x64a: 'y', # ي + 0x64b: 'F', # ً + 0x64c: 'N', # ٌ + 0x64d: 'K', # ٍ + 0x64e: 'a', # َ 
+ 0x64f: 'u', # ُ + 0x650: 'i', # ِ + 0x651: '~', # ّ + 0x652: 'o', # ْ + 0x670: '`', # ' + 0x671: '{' # ٱ + } + + +buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} + +arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} + +sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya + +def to_buckwalter(s): + return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) + + +def from_buckwalter(s): + return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) + + +def drop_final_vowel(s): + if s[-1] in arabic_vowels: + return s[:-1] + else: + return s + + +def normal(s): + return unicodedata.normalize('NFD', s) + +# heuristic for finding the three radicals from certain forms +# works only for sound (strong) 3-radical roots, otherwise None +def get_sound_trigram_root(s): + sounds = [c for c in s if c in sound_consonants] + if len(sounds) == 3: + return ''.join(sounds) + else: + return None + + +# reverse engineer fcl pattern from a given form, with a sound trigram root +# one more condition: each of the root letters occurs exactly once +# TODO: better use the given root of the lex entry +def get_sound_fcl_pattern(s): + if root := get_sound_trigram_root(s): + if len([c in s for c in root]) == 3: + p = list(s) + r = s.find(root[0]) + p[r] = chr(0x641) + r += s[r+1:].find(root[1]) + 1 + p[r] = chr(0x639) + r += s[r+1:].find(root[2]) + 1 + p[r] = chr(0x644) + p = ''.join(p) +## print('---PATT', s, root, p) + return p + + +# Wikt uses vowel+shadda which is a Unicode normalization +# GF uses shadda+vowel which is linguistically correct +# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra +# unicodedata.normalize does this wrong, as noted by Ariel Gutman +## todo: more direct implementation +def reorder_shadda(s): + return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) + + +# quote word forms but not 
parameters +def quote_if(s, cond=is_arabic, change=reorder_shadda): + if cond(s): + return '"' + change(s) + '"' + else: + return s + + +# for a string, change each string literal in "..." with a change function +# leaving other characters as they are; print the string to stdout as you go +def change_literals(s, change): + inliteral = False + literal = '' + for c in s: + if c == '"' and inliteral: + print('"'+change(literal)+'"', end='') + inliteral = False + literal = '' + elif c == '"': + inliteral = True + elif inliteral: + literal += c + else: + print(c, end='') + + +# convert literals in stdin 'to' or 'from' Buckwalter +if __name__ == '__main__': + import sys + mode = sys.argv[1] + for line in sys.stdin: + if mode == 'from': + change_literals(line, from_buckwalter) + elif mode == 'to': + change_literals(line, to_buckwalter) + + diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 140852c7a..edfa69603 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -3,7 +3,7 @@ import json import sys import unicodedata import pgf - +from arabic_utilities import * # data from https://kaikki.org/dictionary/rawdata.html # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, @@ -110,132 +110,6 @@ if MODE == 'error-analysis': for labverdict, n in sorted(list(evals.items())): print(labverdict, n) -# https://en.wikipedia.org/wiki/Buckwalter_transliteration -buckwalter_dict = { - 0x621: "'", # ء - 0x622: '|', # آ - 0x623: '>', # أ - 0x624: '&', # ؤ - 0x625: '<', # إ - 0x626: '}', # ئ - 0x627: 'A', # ا - 0x628: 'b', # ب - 0x629: 'p', # ة - 0x62a: 't', # ت - 0x62b: 'v', # ث - 0x62c: 'j', # ج - 0x62d: 'H', # ح - 0x62e: 'x', # خ - 0x62f: 'd', # د - 0x630: '*', # ذ - 0x631: 'r', # ر - 0x632: 'z', # ز - 0x633: 's', # س - 0x634: '$', # ش - 0x635: 'S', # ص - 0x636: 'D', # ض - 0x637: 'T', # ط - 0x638: 'Z', # ظ - 0x639: 'E', # ع - 0x63a: 'g', # غ - 0x641: 'f', # ف 
- 0x642: 'q', # ق - 0x643: 'k', # ك - 0x644: 'l', # ل - 0x645: 'm', # م - 0x646: 'n', # ن - 0x647: 'h', # ه - 0x648: 'w', # و - 0x649: 'Y', # ى - 0x64a: 'y', # ي - 0x64b: 'F', # ً - 0x64c: 'N', # ٌ - 0x64d: 'K', # ٍ - 0x64e: 'a', # َ - 0x64f: 'u', # ُ - 0x650: 'i', # ِ - 0x651: '~', # ّ - 0x652: 'o', # ْ - 0x670: '`', # ' - 0x671: '{' # ٱ - } - - -buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} - -arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} - -sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya - -def to_buckwalter(s): - return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) - - -def from_buckwalter(s): - return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) - - -def unvocalize(s): - return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) - - -def drop_final_vowel(s): - if s[-1] in arabic_vowels: - return s[:-1] - else: - return s - - -def is_arabic(s): - return s and any(1574 <= ord(c) <= 1616 for c in s) - -def normal(s): - return unicodedata.normalize('NFD', s) - -# heuristic for finding the three radicals from certain forms -# works only for sound (strong) 3-radical roots, otherwise None -def get_sound_trigram_root(s): - sounds = [c for c in s if c in sound_consonants] - if len(sounds) == 3: - return ''.join(sounds) - else: - return None - - -# reverse engineer fcl pattern from a given form, with a sound trigram root -# one more condition: each of the root letters occurs exactly ones -# TODO: better use the given root of the lex entry -def get_sound_fcl_pattern(s): - if root := get_sound_trigram_root(s): - if len([c in s for c in root]) == 3: - p = list(s) - r = s.find(root[0]) - p[r] = chr(0x641) - r += s[r+1:].find(root[1]) + 1 - p[r] = chr(0x639) - r += s[r+1:].find(root[2]) + 1 - p[r] = chr(0x644) - p = ''.join(p) -## print('---PATT', s, root, p) - return p - - -# Wikt uses vowel+shadda which is a Unicode normalization -# GF uses shadda+vowel which is 
linguistically correct -# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra -# unicodedata.normalize does this wrong, as noted by Ariel Gutman -## todo: more direct implementation -def reorder_shadda(s): - return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) - - -# quote word forms but not parameters -def quote_if(s, cond=is_arabic, change=reorder_shadda): - if cond(s): - return '"' + change(s) + '"' - else: - return s - # generate word_d_C functions starting with d=0, but show d only when >= 1 def gf_fun(s, pos, disamb=0): diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py index b159c5f18..2aae047db 100644 --- a/src/arabic/wiktionary/to_wordnet.py +++ b/src/arabic/wiktionary/to_wordnet.py @@ -1,6 +1,8 @@ import csv import json +from arabic_utilities import * + # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl # the following are assumed @@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv' # Zarzoura # built as explained in ./read_wiktionary.py MORPHO_GF = 'MorphoDictAraAbs.gf' -def is_arabic(s): - return s and any(1574 <= ord(c) <= 1616 for c in s) - -def get_arabic(s): - return ''.join([c for c in s if is_arabic(c)]) - -def unvocalize(s): - return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) - # fun 'دُبُ_N' : N ; -- 10 [['bear']] funmap = {}