From 6312624a5fa84d0076e46a44befbfccadb0538f0 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Tue, 12 Sep 2023 12:08:31 +0200 Subject: [PATCH 01/19] preparing to read Arabic morpholex from Wiktionary --- src/arabic/wiktionary/read_wiktionary.py | 97 ++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/arabic/wiktionary/read_wiktionary.py diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py new file mode 100644 index 000000000..48a2fca38 --- /dev/null +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -0,0 +1,97 @@ +import gzip +import json + +WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' + + +def get_gzip_json(file, sample=100000, langs=[]): + with gzip.open(file) as decompressed: + n = 0 + for line in decompressed: + n += 1 + if n % sample == 0: + obj = json.loads(line) + if obj.get('lang', None) in langs: + print(line.decode("utf-8")) + print(n) + + +# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) +# python3 read_wiktionary.py >wikt_arabic.jsonl +# 621-671 + +# https://en.wikipedia.org/wiki/Buckwalter_transliteration +buckwalter_dict = { + 0x621: "'", # ء + 0x622: '|', # آ + 0x623: '>', # أ + 0x624: '&', # ؤ + 0x625: '<', # إ + 0x626: '}', # ئ + 0x627: 'A', # ا + 0x628: 'b', # ب + 0x629: 'p', # ة + 0x62a: 't', # ت + 0x62b: 'v', # ث + 0x62c: 'j', # ج + 0x62d: 'H', # ح + 0x62e: 'x', # خ + 0x62f: 'd', # د + 0x630: '*', # ذ + 0x631: 'r', # ر + 0x632: 'z', # ز + 0x633: 's', # س + 0x634: '$', # ش + 0x635: 'S', # ص + 0x636: 'D', # ض + 0x637: 'T', # ط + 0x638: 'Z', # ظ + 0x639: 'E', # ع + 0x63a: 'g', # غ + 0x641: 'f', # ف + 0x642: 'q', # ق + 0x643: 'k', # ك + 0x644: 'l', # ل + 0x645: 'm', # م + 0x646: 'n', # ن + 0x647: 'h', # ه + 0x648: 'w', # و + 0x649: 'Y', # ى + 0x64a: 'y', # ي + 0x64b: 'F', # ً + 0x64c: 'N', # ٌ + 0x64d: 'K', # ٍ + 0x64e: 'a', # َ + 0x64f: 'u', # ُ + 0x650: 'i', # ِ + 0x651: '~', # ّ + 0x652: 'o', # ْ + 0x670: '`', # ' + 0x671: '{' # ٱ + } + +def to_buckwalter(s): + return ''.join(list(map(lambda c: buckwalter_dict.get(ord(c), '?'), s))) + + +def is_arabic(s): + return s and any(1574 <= ord(c) <= 1616 for c in s) + +""" +with open('wikt_arabic.jsonl') as file: + for line in file: + obj = json.loads(line) + if 'Arabic lemmas' in obj.get('categories', []): + entry = { + 'pos': obj['pos'], + 'forms': {form['form']: form.get('tags', []) for + form in obj.get('forms', []) if + 'romanization' not in form.get('tags', []) and + is_arabic(form['form']) + }, + 'senses': obj.get('senses', []) + } + entry['n_forms'] = len(entry['forms']) + print(entry['pos'], entry['n_forms']) +# print(json.dumps(entry, ensure_ascii=False)) +""" From ae1c7f0061ddec572b09acf0fe71b705d39fccb7 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Tue, 12 Sep 2023 16:35:21 +0200 Subject: [PATCH 02/19] extracting Arabic from Wiktionary, next step GF generation --- src/arabic/wiktionary/read_wiktionary.py | 106 ++++++++++++++++++++--- 1 file changed, 94 insertions(+), 12 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 48a2fca38..2520cf5fd 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -2,6 +2,7 @@ import gzip import json WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' +FILTERED_WIKT = 'wikt_arabic.jsonl' def get_gzip_json(file, sample=100000, langs=[]): @@ -71,27 +72,108 @@ buckwalter_dict = { } def to_buckwalter(s): - return ''.join(list(map(lambda c: buckwalter_dict.get(ord(c), '?'), s))) + return ''.join([buckwalter_dict.get(ord(c), '?') for c in s]) +def unvocalize(s): + return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) -""" -with open('wikt_arabic.jsonl') as file: + +def gf_fun(s, pos): + return ''.join(["'", s, "_", pos, "'"]) + + +def forms_for_pos(obj): + forms = { + form['form']: + form.get('tags', []) for + form in obj.get('forms', []) if + 'romanization' not in form.get('tags', []) and + is_arabic(form['form']) + }.items() + if obj['pos'] == 'noun': + lemma = [form[:-1] for form, descr in forms + if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] + return { + 'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, + 'singular': lemma, + 'plural': [form[:-1] for form, descr in forms + if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1], + 'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories'] + else ('Masc' if 'Arabic masculine nouns' in obj['categories'] + else None) + } + elif obj['pos'] == 'verb': + lemma = [form for form, descr in forms + if all([w in descr for + w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] + return { + 'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, + 'perfect': lemma, + 'imperfect': [form for form, descr in forms + if all([w in descr for + w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1], + 'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII',''] + if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])], + key=len) + } + elif obj['pos'] == 'adj': + lemma = [form for form, descr in forms + if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] + return { + 'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, + 'masc_singular': lemma, + 'masc_plural': [form for form, descr in forms + if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1], + 'fem_singular': [form for form, descr in forms + if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1], + 'fem_plural': [form for form, descr in forms + if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1], + } + + else: + return {f: d for f, d in forms} + + +# "root": ["ش ر ح (š-r-ḥ)"] +def find_root(s): + return ''.join([c for c in s if is_arabic(c)]) + + + +with open(FILTERED_WIKT) as file: for line in file: obj = json.loads(line) if 'Arabic lemmas' in obj.get('categories', []): entry = { 'pos': obj['pos'], - 'forms': {form['form']: form.get('tags', []) for - form in obj.get('forms', []) if - 'romanization' not in form.get('tags', []) and - is_arabic(form['form']) - }, - 'senses': obj.get('senses', []) + 'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1], + 'forms': forms_for_pos(obj), + 'senses': [sense['glosses'] for sense in obj.get('senses', []) + if 'glosses' in sense] } - entry['n_forms'] = len(entry['forms']) - print(entry['pos'], entry['n_forms']) -# print(json.dumps(entry, ensure_ascii=False)) +# entry['n_forms'] = len(entry['forms']) +# print(entry['pos'], entry['n_forms']) + print(json.dumps(entry, ensure_ascii=False)) + + +""" +"senses": [ + {"examples": [ + {"text": "10th century, Al-Mutanabbi\nذُو الْعَقْلِ يَشْقَى فِي النَّعِيمِ بِعَقْلِهِ / وَأَخُو الْجَهَالَةِ فِي الشَّقَاوَةِ يَنْعَمُ\nḏū l-ʕaqli yašqā fī an-naʕīmi biʕaqlihi / waʔaḵū l-jahālati fī š-šaqāwati yanʕamu", "english": "(please add an English translation of this quotation)", "type": "quotation"}], + "links": [ + ["bliss", "bliss#English"], ["delight", "delight#English"]], + "categories": ["Arabic terms with quotations", "Requests for translations of Arabic quotations"], + "glosses": ["bliss, delight"] + }, + {"links": [ + ["heaven", "heaven"], ["Heaven", "Heaven"], ["paradise", "paradise"], ["Paradise", "Paradise"]], + "synonyms": [{"word": "فِرْدَوس"}, {"word": "جَنَّة"}], + "antonyms": [{"word": "سَعِير"}, {"word": "لَظَىٰ"}, {"word": "النَّار"}, {"word": "جَهَنَّم"}, {"word": "جَحِيم"}, {"word": "حُطَمَة"}, {"word": "سَقَر"}, {"word": "هَاوِيَة"}], + "raw_glosses": ["(figurative) heaven, the Heaven, paradise, the Paradise"], + "glosses": ["heaven, the Heaven, paradise, the Paradise"], + "tags": ["figuratively"]}] """ From 714d8abac026fd2ec61fa431a61108358a3ef68c Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Tue, 12 Sep 2023 17:04:50 +0200 Subject: [PATCH 03/19] GF abstract dict generation --- src/arabic/wiktionary/read_wiktionary.py | 46 ++++++++++++------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 2520cf5fd..ac5ee59dd 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -98,7 +98,8 @@ def forms_for_pos(obj): lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] return { - 'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, + 'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, + 'gf_cat': 'N', 'singular': lemma, 'plural': [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1], @@ -111,7 +112,8 @@ def forms_for_pos(obj): if all([w in descr for w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] return { - 'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, + 'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, + 'gf_cat': 'V', 'perfect': lemma, 'imperfect': [form for form, descr in forms if all([w in descr for @@ -124,7 +126,8 @@ def forms_for_pos(obj): lemma = [form for form, descr in forms if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] return { - 'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, + 'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, + 'gf_cat': 'A', 'masc_singular': lemma, 'masc_plural': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1], @@ -142,9 +145,14 @@ def forms_for_pos(obj): def find_root(s): return ''.join([c for c in s if is_arabic(c)]) +import sys +MODE = sys.argv[1] +if MODE == 'gf': + print('abstract MorphoDictAraAbs = Cat ** {') with open(FILTERED_WIKT) as file: + seen_gf_funs = set() for line in file: obj = json.loads(line) if 'Arabic lemmas' in obj.get('categories', []): @@ -157,23 +165,17 @@ with open(FILTERED_WIKT) as file: } # entry['n_forms'] = len(entry['forms']) # print(entry['pos'], entry['n_forms']) - print(json.dumps(entry, ensure_ascii=False)) + if MODE == 'json': + print(json.dumps(entry, ensure_ascii=False)) - -""" -"senses": [ - {"examples": [ - {"text": "10th century, Al-Mutanabbi\nذُو الْعَقْلِ يَشْقَى فِي النَّعِيمِ بِعَقْلِهِ / وَأَخُو الْجَهَالَةِ فِي الشَّقَاوَةِ يَنْعَمُ\nḏū l-ʕaqli yašqā fī an-naʕīmi biʕaqlihi / waʔaḵū l-jahālati fī š-šaqāwati yanʕamu", "english": "(please add an English translation of this quotation)", "type": "quotation"}], - "links": [ - ["bliss", "bliss#English"], ["delight", "delight#English"]], - "categories": ["Arabic terms with quotations", "Requests for translations of Arabic quotations"], - "glosses": ["bliss, delight"] - }, - {"links": [ - ["heaven", "heaven"], ["Heaven", "Heaven"], ["paradise", "paradise"], ["Paradise", "Paradise"]], - "synonyms": [{"word": "فِرْدَوس"}, {"word": "جَنَّة"}], - "antonyms": [{"word": "سَعِير"}, {"word": "لَظَىٰ"}, {"word": "النَّار"}, {"word": "جَهَنَّم"}, {"word": "جَحِيم"}, {"word": "حُطَمَة"}, {"word": "سَقَر"}, {"word": "هَاوِيَة"}], - "raw_glosses": ["(figurative) heaven, the Heaven, paradise, the Paradise"], - "glosses": ["heaven, the Heaven, paradise, the Paradise"], - "tags": ["figuratively"]}] -""" + if MODE == 'gf': + + if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']: + if entry['forms']['gf_fun'] not in seen_gf_funs: + print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses']) + seen_gf_funs.add(entry['forms']['gf_fun']) + + # to do: rename duplicate function names: of 13762 names, 12946 are unique + +if MODE == 'gf': + print('}') From 8eceb53643a5a41d53ae61ce4ebb64f70b27010e Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Tue, 12 Sep 2023 19:38:14 +0200 Subject: [PATCH 04/19] compilable MorphoDictAra generation except for V, not yet using all forms --- src/arabic/wiktionary/read_wiktionary.py | 79 ++++++++++++++++-------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index ac5ee59dd..40d14b9fa 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -82,8 +82,9 @@ def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) -def gf_fun(s, pos): - return ''.join(["'", s, "_", pos, "'"]) +def gf_fun(s, pos, disamb=0): + discrim = '_' + str(disamb) if disamb else '' + return ''.join(["'", s, discrim, "_", pos, "'"]) def forms_for_pos(obj): @@ -97,23 +98,25 @@ def forms_for_pos(obj): if obj['pos'] == 'noun': lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] - return { - 'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, - 'gf_cat': 'N', - 'singular': lemma, - 'plural': [form[:-1] for form, descr in forms - if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1], - 'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories'] + plural = [form[:-1] for form, descr in forms + if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1] + gender = ('Fem' if 'Arabic feminine nouns' in obj['categories'] else ('Masc' if 'Arabic masculine nouns' in obj['categories'] - else None) + else None)) + gf_entry = { + 'cat': 'N', + 'lemma': lemma, + 'singular': lemma, + 'plural': plural, + 'gender': gender } elif obj['pos'] == 'verb': lemma = [form for form, descr in forms if all([w in descr for w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] - return { - 'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, - 'gf_cat': 'V', + gf_entry = { + 'cat': 'V', + 'lemma': lemma, 'perfect': lemma, 'imperfect': [form for form, descr in forms if all([w in descr for @@ -125,9 +128,9 @@ def forms_for_pos(obj): elif obj['pos'] == 'adj': lemma = [form for form, descr in forms if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] - return { - 'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, - 'gf_cat': 'A', + gf_entry = { + 'cat': 'A', + 'lemma': lemma, 'masc_singular': lemma, 'masc_plural': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1], @@ -138,9 +141,16 @@ def forms_for_pos(obj): } else: - return {f: d for f, d in forms} + gf_entry = {f: d for f, d in forms} + + if 'lemma' in gf_entry and gf_entry['lemma']: + gf_entry['lemma'] = gf_entry['lemma'][0] + form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma'] + gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"']) + return gf_entry + # "root": ["ش ر ح (š-r-ḥ)"] def find_root(s): return ''.join([c for c in s if is_arabic(c)]) @@ -148,17 +158,23 @@ def find_root(s): import sys MODE = sys.argv[1] -if MODE == 'gf': - print('abstract MorphoDictAraAbs = Cat ** {') +if MODE == 'gf-abs': + print('abstract MorphoDictAraAbs = Cat ** {') +if MODE == 'gf-cnc': + print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') + with open(FILTERED_WIKT) as file: - seen_gf_funs = set() + seen_gf_funs = {} for line in file: obj = json.loads(line) + root = [find_root(t['expansion']) for + t in obj.get('etymology_templates', []) if + t.get('name', None) =='ar-root'][:1] if 'Arabic lemmas' in obj.get('categories', []): entry = { 'pos': obj['pos'], - 'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1], + 'root': root, 'forms': forms_for_pos(obj), 'senses': [sense['glosses'] for sense in obj.get('senses', []) if 'glosses' in sense] @@ -168,14 +184,23 @@ with open(FILTERED_WIKT) as file: if MODE == 'json': print(json.dumps(entry, ensure_ascii=False)) - if MODE == 'gf': + if MODE.startswith('gf'): - if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']: - if entry['forms']['gf_fun'] not in seen_gf_funs: - print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses']) - seen_gf_funs.add(entry['forms']['gf_fun']) + lemma = entry['forms'].get('lemma', None) + if lemma: + cat = entry['forms']['cat'] + lin = entry['forms']['lin'] + discrim = seen_gf_funs.get((lemma, cat), 0) + fun = gf_fun(lemma, cat, discrim) + + if MODE == 'gf-abs': + print('fun', fun, ':', cat, ';', '--', entry['senses']) + if MODE == 'gf-cnc': + print('lin', fun, '=', lin, ';') + + seen_gf_funs[(lemma, cat)] = discrim + 1 # to do: rename duplicate function names: of 13762 names, 12946 are unique -if MODE == 'gf': +if MODE.startswith('gf'): print('}') From afc84a61cbf2ac76e3ac3bf0f8d3654cb40c5c44 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Wed, 13 Sep 2023 09:06:02 +0200 Subject: [PATCH 05/19] arabic/wiktionary using paradigms with records as arguments to cope with heterogeneous information --- src/arabic/wiktionary/read_wiktionary.py | 43 +++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 40d14b9fa..49d3a3c11 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -19,7 +19,6 @@ def get_gzip_json(file, sample=100000, langs=[]): # get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) # python3 read_wiktionary.py >wikt_arabic.jsonl -# 621-671 # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -100,15 +99,17 @@ def forms_for_pos(obj): if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] plural = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1] - gender = ('Fem' if 'Arabic feminine nouns' in obj['categories'] - else ('Masc' if 'Arabic masculine nouns' in obj['categories'] - else None)) + gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories'] + else (['Masc'] if 'Arabic masculine nouns' in obj['categories'] + else [])) gf_entry = { 'cat': 'N', 'lemma': lemma, - 'singular': lemma, - 'plural': plural, - 'gender': gender + 'args': { + 'sg': lemma, + 'pl': plural, + 'g': gender + } } elif obj['pos'] == 'verb': lemma = [form for form, descr in forms @@ -117,13 +118,15 @@ def forms_for_pos(obj): gf_entry = { 'cat': 'V', 'lemma': lemma, - 'perfect': lemma, - 'imperfect': [form for form, descr in forms + 'args': { + 'perfect': lemma, + 'imperfect': [form for form, descr in forms if all([w in descr for w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1], - 'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII',''] + 'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', ''] if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])], - key=len) + key=len)] + } } elif obj['pos'] == 'adj': lemma = [form for form, descr in forms @@ -131,13 +134,15 @@ def forms_for_pos(obj): gf_entry = { 'cat': 'A', 'lemma': lemma, - 'masc_singular': lemma, - 'masc_plural': [form for form, descr in forms + 'args': { + 'masc_sg': lemma, + 'masc_pl': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1], - 'fem_singular': [form for form, descr in forms + 'fem_sg': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1], - 'fem_plural': [form for form, descr in forms + 'fem_pl': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1], + } } else: @@ -145,11 +150,11 @@ def forms_for_pos(obj): if 'lemma' in gf_entry and gf_entry['lemma']: gf_entry['lemma'] = gf_entry['lemma'][0] - form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma'] - gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"']) + gf_entry['args']['root'] = obj['root'] + args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x] + gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' return gf_entry - # "root": ["ش ر ح (š-r-ḥ)"] def find_root(s): @@ -171,10 +176,10 @@ with open(FILTERED_WIKT) as file: root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1] + obj['root'] = root if 'Arabic lemmas' in obj.get('categories', []): entry = { 'pos': obj['pos'], - 'root': root, 'forms': forms_for_pos(obj), 'senses': [sense['glosses'] for sense in obj.get('senses', []) if 'glosses' in sense] From 3c0adada11f9c3055dedb5a26ef4d483585f8a15 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 13 Sep 2023 15:29:28 +0200 Subject: [PATCH 06/19] new function in ParadigmsAra to deal with Wiktionary data; lots of untested guesses --- src/arabic/ParadigmsAra.gf | 67 ++++++++++++++++++++++++ src/arabic/wiktionary/read_wiktionary.py | 64 +++++++++++++++------- 2 files changed, 113 insertions(+), 18 deletions(-) diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index ce479f944..20892fed8 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -868,4 +868,71 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of { param VerbForm = FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ; +-- paradigms for Wiktionary extraction +---- TODO: better usage of information in Wiktionary + +oper + wmkN = overload { + wmkN : {sg, pl : Str ; g : Gender} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str} -> N + = \r -> smartN r.sg ; + wmkN : {sg : Str ; g : Gender ; root : Str} -> N + = \r -> smartN r.sg ** {g = r.g} ; ---- + wmkN : {sg : Str; g : Gender} -> N + = \r -> smartN r.sg ** {g = r.g} ; + wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str; pl : Str} -> N + = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ; + wmkN : {sg : Str; root : Str} -> N + = \r -> smartN r.sg ; + } ; + + wmkA = overload { + wmkA : {root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + } ; + + wmkV = overload { + wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; cls : VerbForm} -> V + = \r -> mkV r.perfect r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V + = \r -> variants {} ; ---- mkV r.imperfect ; ---- + wmkV : {root : Str ; cls : VerbForm} -> V + = \r -> mkV r.root r.cls ; + wmkV : {imperfect : Str} -> V + = \r -> variants {} ; ---- mkV r.imperfect ; + } ; + } ; diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 49d3a3c11..ea8d805fd 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -1,7 +1,22 @@ import gzip import json +import sys +# data from https://kaikki.org/dictionary/rawdata.html +# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, +# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. + +if not sys.argv[1:]: + print('usage: read_wiktionary (raw | gf-cnc | gf-abs)') + exit() + +MODE = sys.argv[1] # + +# step 1: extract data from this file using the raw option WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' + +# the following file is generated. +# in the sequel, use this file with gf-abs or gf-cnc option FILTERED_WIKT = 'wikt_arabic.jsonl' @@ -14,11 +29,12 @@ def get_gzip_json(file, sample=100000, langs=[]): obj = json.loads(line) if obj.get('lang', None) in langs: print(line.decode("utf-8")) - print(n) +# print(n) +if MODE == 'raw': + get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) -# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) -# python3 read_wiktionary.py >wikt_arabic.jsonl +# python3 read_wiktionary.py raw >wikt_arabic.jsonl # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -80,6 +96,12 @@ def unvocalize(s): def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) +# quote forms but not parameters +def quote_if(s, cond=is_arabic): + if cond(s): + return '"' + s + '"' + else: + return s def gf_fun(s, pos, disamb=0): discrim = '_' + str(disamb) if disamb else '' @@ -99,8 +121,8 @@ def forms_for_pos(obj): if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] plural = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1] - gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories'] - else (['Masc'] if 'Arabic masculine nouns' in obj['categories'] + gender = (['fem'] if 'Arabic feminine nouns' in obj['categories'] + else (['masc'] if 'Arabic masculine nouns' in obj['categories'] else [])) gf_entry = { 'cat': 'N', @@ -122,15 +144,20 @@ def forms_for_pos(obj): 'perfect': lemma, 'imperfect': [form for form, descr in forms if all([w in descr for - w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1], - 'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', ''] - if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])], - key=len)] + w in [ + "active", "indicative", "masculine", "non-past", + "imperfective", "singular", "third-person"]])][:1], + 'cls': ['Form' + max([n for n in [ + 'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI',''] + if n in ' '.join([c for c in obj['categories'] + if c.endswith('verbs') and any([n in c for n in 'IVX'])])], + key=len)] # max in RGL is XI, in Wikt XIII } } elif obj['pos'] == 'adj': lemma = [form for form, descr in forms - if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] + if all([w in descr for w in [ + 'indefinite', 'masculine', 'singular', 'informal']])][:1] gf_entry = { 'cat': 'A', 'lemma': lemma, @@ -150,8 +177,9 @@ def forms_for_pos(obj): if 'lemma' in gf_entry and gf_entry['lemma']: gf_entry['lemma'] = gf_entry['lemma'][0] - gf_entry['args']['root'] = obj['root'] - args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x] + if obj['root']: + gf_entry['args']['root'] = obj['root'] + args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' return gf_entry @@ -160,19 +188,19 @@ def forms_for_pos(obj): def find_root(s): return ''.join([c for c in s if is_arabic(c)]) -import sys -MODE = sys.argv[1] - if MODE == 'gf-abs': print('abstract MorphoDictAraAbs = Cat ** {') if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') - -with open(FILTERED_WIKT) as file: +if MODE != 'raw': + with open(FILTERED_WIKT) as file: seen_gf_funs = {} for line in file: - obj = json.loads(line) + try: + obj = json.loads(line) + except: + continue root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1] From 8e029bd8dd24f8bd76c46cc2f811041be9682ab5 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 13 Sep 2023 17:24:21 +0200 Subject: [PATCH 07/19] Arabic Wiktionary: started comparing evaluation --- src/arabic/wiktionary/read_wiktionary.py | 85 ++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index ea8d805fd..574233dda 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -108,14 +108,76 @@ def gf_fun(s, pos, disamb=0): return ''.join(["'", s, discrim, "_", pos, "'"]) -def forms_for_pos(obj): - forms = { +rgl_features = { + # V + 'VPerf': 'perfective', + 'Act': 'active', + 'Pas': 'passive', + 'Per3': 'third-person', + 'Per2': 'second-person', + 'Masc': 'masculine', + 'Fem': 'feminine', + 'Sg': 'singular', + 'Pl': 'plural', + 'Dl': 'dual', + 'VImpf': 'imperfective', + 'Ind': 'indicative', + 'Cnj': 'subjunctive', + 'Jus': 'jussive', + 'VImp': 'imperative', + # N: also Sg, Pl, Dl + 'Def': 'definite', + 'Indef': 'indefinite', + 'Nom': 'nominative', + 'Acc': 'accusative', + 'Gen': 'genitive', +# 'Bare': +# 'Dat': + 'Const': 'construct', +# 'Poss': + #A: also N features + 'APosit': 'positive', + 'AComp': 'comparative' + } + + +# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ +def compare_tables(gf, wikt): + report = {} + for line in gf: + gf_form = line #''.join([c for c in line if 1574 <= ord(c) <= 1616]) + gf_tags = tuple(word for word in + line.replace('(', ' ').replace(')', ' ').split() + if word in rgl_features) + wikt_tags = {rgl_features[tag] for tag in gf_tags} + wikt_form = None + for form, descr in wikt: + if all([tag in descr for tag in wikt_tags]): + wikt_form = form + break + report[gf_tags] = { + 'gf_form': gf_form, + 'wikt_form': wikt_form + } + if wikt_form: + report[gf_tags]['voc_match'] = int(gf_form == wikt_form) + report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form)) + return report + + + +def wikt_forms_for_pos(obj): + return { form['form']: form.get('tags', []) for form in obj.get('forms', []) if 'romanization' not in form.get('tags', []) and is_arabic(form['form']) }.items() + + +def forms_for_pos(obj): + forms = wikt_forms_for_pos(obj) if obj['pos'] == 'noun': lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] @@ -136,7 +198,8 @@ def forms_for_pos(obj): elif obj['pos'] == 'verb': lemma = [form for form, descr in forms if all([w in descr for - w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] + w in ["active", "indicative", "masculine", "past", + "perfective", "singular", "third-person"]])][:1] gf_entry = { 'cat': 'V', 'lemma': lemma, @@ -193,14 +256,16 @@ if MODE == 'gf-abs': if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') -if MODE != 'raw': +if MODE not in ['raw', 'eval']: with open(FILTERED_WIKT) as file: seen_gf_funs = {} + number = 1 for line in file: try: obj = json.loads(line) except: continue + number += 1 root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1] @@ -227,7 +292,7 @@ if MODE != 'raw': fun = gf_fun(lemma, cat, discrim) if MODE == 'gf-abs': - print('fun', fun, ':', cat, ';', '--', entry['senses']) + print('fun', fun, ':', cat, ';', '--', number, entry['senses']) if MODE == 'gf-cnc': print('lin', fun, '=', lin, ';') @@ -237,3 +302,13 @@ if MODE != 'raw': if MODE.startswith('gf'): print('}') + + +if MODE == 'eval': + with open('pot.gftbl') as file: + gf = [line.strip() for line in file] + with open('pot.json') as file: + wikt = wikt_forms_for_pos(json.loads(file.read())) + for line in compare_tables(gf, wikt).items(): + print(line) + From d5e6e7e38987ab98da7fa33b90428e446a730414 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 14 Sep 2023 12:21:48 +0200 Subject: [PATCH 08/19] Arabic Wiktionary: functions for normalization and evaluation --- src/arabic/wiktionary/read_wiktionary.py | 88 ++++++++++++++++++++---- 1 file changed, 75 insertions(+), 13 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 574233dda..bcf902b77 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -1,17 +1,21 @@ import gzip import json import sys +import unicodedata # data from https://kaikki.org/dictionary/rawdata.html # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, # Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. -if not sys.argv[1:]: - print('usage: read_wiktionary (raw | gf-cnc | gf-abs)') - exit() +MODE = '' -MODE = sys.argv[1] # +if __name__ == '__main__': + if not sys.argv[1:]: + print('usage: read_wiktionary (raw | gf-cnc | gf-abs | gf-map | eval | eval-verbose)') + exit() + MODE = sys.argv[1] # + # step 1: extract data from this file using the raw option WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' @@ -19,6 +23,18 @@ WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' # in the sequel, use this file with gf-abs or gf-cnc option FILTERED_WIKT = 'wikt_arabic.jsonl' +# map each successfully extracted GF function to its source record in Wiktionary +# created with option gf-map +FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl' + + +def read_function_source_map(): + with open(FUNCTION_SOURCE_MAP) as file: + sourcemap = {} + for line in file: + obj = json.loads(line) + sourcemap[obj['fun']] = obj['source'] + def get_gzip_json(file, sample=100000, langs=[]): with gzip.open(file) as decompressed: @@ -86,16 +102,37 @@ buckwalter_dict = { 0x671: '{' # ٱ } +buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} + + def to_buckwalter(s): - return ''.join([buckwalter_dict.get(ord(c), '?') for c in s]) + return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) + + +def from_buckwalter(s): + return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) def unvocalize(s): return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) +def normal(s): + return unicodedata.normalize('NFD', s) + + +# Wikt uses vowel+shadda which is a Unicode normalization +# GF uses shadda+vowel which is linguistically correct +# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra +# unicodedata.normalize does this wrong, as noted by Ariel Gutman +## todo: more direct implementation +def reorder_shadda(s): + return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) + + # quote forms but not parameters def quote_if(s, cond=is_arabic): if cond(s): @@ -115,8 +152,11 @@ rgl_features = { 'Pas': 'passive', 'Per3': 'third-person', 'Per2': 'second-person', + 'Per1': 'first-person', 'Masc': 'masculine', 'Fem': 'feminine', + 'Sing': 'singular', + 'Plur': 'plural', 'Sg': 'singular', 'Pl': 'plural', 'Dl': 'dual', @@ -142,26 +182,39 @@ rgl_features = { # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ +# coming from 'l -treebank -table' def compare_tables(gf, wikt): report = {} for line in gf: - gf_form = line #''.join([c for c in line if 1574 <= ord(c) <= 1616]) + gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616]) gf_tags = tuple(word for word in line.replace('(', ' ').replace(')', ' ').split() if word in rgl_features) + if not gf_tags: + continue wikt_tags = {rgl_features[tag] for tag in gf_tags} wikt_form = None + wikt_descr = None for form, descr in wikt: if all([tag in descr for tag in wikt_tags]): - wikt_form = form + wikt_form = reorder_shadda(form) + wikt_descr = descr break report[gf_tags] = { 'gf_form': gf_form, - 'wikt_form': wikt_form + 'wikt_form': wikt_form, + 'gf_form_rom': to_buckwalter(gf_form) if gf_form else None, + 'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None, + 'wikt_descr': wikt_descr } if wikt_form: - report[gf_tags]['voc_match'] = int(gf_form == wikt_form) - report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form)) + report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) + report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) + ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items + report['fun'] = gf[0].split()[-1] + report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) + report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) + report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) return report @@ -295,6 +348,9 @@ if MODE not in ['raw', 'eval']: print('fun', fun, ':', cat, ';', '--', number, entry['senses']) if MODE == 'gf-cnc': print('lin', fun, '=', lin, ';') + if MODE == 'gf-map': + mapitem = {'fun': fun, 'source': obj} + print(json.dumps(mapitem, ensure_ascii=False)) seen_gf_funs[(lemma, cat)] = discrim + 1 @@ -304,11 +360,17 @@ if MODE.startswith('gf'): print('}') -if MODE == 'eval': +if MODE.startswith('eval'): with open('pot.gftbl') as file: gf = [line.strip() for line in file] with open('pot.json') as file: wikt = wikt_forms_for_pos(json.loads(file.read())) - for line in compare_tables(gf, wikt).items(): - print(line) + report = compare_tables(gf, wikt) + + if MODE == 'eval-verbose': + for line in report.items(): + print(line) + else: + print(report['fun'], 'forms', report['total_found'], + 'voc', report['total_voc'], 'unvoc', report['total_unvoc']) From 3e9be76e52be26e046910afbffa196e3f2d64826 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 14 Sep 2023 15:19:05 +0200 Subject: [PATCH 09/19] evaluation of generated lexicon --- src/arabic/wiktionary/read_wiktionary.py | 135 ++++++++++++++++++----- 1 file changed, 110 insertions(+), 25 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index bcf902b77..6db526c33 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -2,10 +2,47 @@ import gzip import json import sys import unicodedata +import pgf + # data from https://kaikki.org/dictionary/rawdata.html # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, -# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. +# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. + +""" +This file converts Wiktionary data to GF morphological dictionary files. +It words for Arabic but some functionalities could be modified to other languges. + +The steps to take are the following: + +fetch data: + + raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html + +filter Arabic entries: + + $ python3 read_wiktionary.py raw >wikt_arabic.jsonl + +create GF files: + + $ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf + $ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf + +automatic evaluation: + + $ gf -make MorphoDictAra.gf + $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl + $ python3 read_wiktionary.py eval + +TODO: +- better generation of GF +- better paradigms to use Wiktionary data +- refactor the code so that it can be used for other languages + +""" + + + MODE = '' @@ -27,13 +64,20 @@ FILTERED_WIKT = 'wikt_arabic.jsonl' # created with option gf-map FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl' +PGF_FILE = 'MorphoDictAraAbs.pgf' +CONCRETE_MODULE = 'MorphoDictAra' + def read_function_source_map(): with open(FUNCTION_SOURCE_MAP) as file: sourcemap = {} for line in file: - obj = json.loads(line) - sourcemap[obj['fun']] = obj['source'] + try: + obj = json.loads(line) + sourcemap[obj['fun']] = obj['source'] + except: + continue + return sourcemap def get_gzip_json(file, sample=100000, langs=[]): @@ -134,9 +178,9 @@ def reorder_shadda(s): # quote forms but not parameters -def quote_if(s, cond=is_arabic): +def quote_if(s, cond=is_arabic, change=reorder_shadda): if cond(s): - return '"' + s + '"' + return '"' + change(s) + '"' else: return s @@ -181,14 +225,19 @@ rgl_features = { } +# obsolote: # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ # coming from 'l -treebank -table' -def compare_tables(gf, wikt): +# now used: +# {'s (AComp Def Bare)': 'الأَيَُونَانِ'} +# coming from tabularLinearize + +def compare_tables(gf, wikt, fun): report = {} - for line in gf: - gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616]) + for pair in gf.items(): + gf_form = pair[1] gf_tags = tuple(word for word in - line.replace('(', ' ').replace(')', ' ').split() + pair[0].replace('(', ' ').replace(')', ' ').split() if word in rgl_features) if not gf_tags: continue @@ -211,7 +260,7 @@ def compare_tables(gf, wikt): report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items - report['fun'] = gf[0].split()[-1] + report['fun'] = fun report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) @@ -293,7 +342,7 @@ def forms_for_pos(obj): if 'lemma' in gf_entry and gf_entry['lemma']: gf_entry['lemma'] = gf_entry['lemma'][0] - if obj['root']: + if obj['root'] and obj['root'][0].strip(): gf_entry['args']['root'] = obj['root'] args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' @@ -309,7 +358,8 @@ if MODE == 'gf-abs': if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') -if MODE not in ['raw', 'eval']: + +if MODE.startswith('gf') or MODE=='json': with open(FILTERED_WIKT) as file: seen_gf_funs = {} number = 1 @@ -360,17 +410,52 @@ if MODE.startswith('gf'): print('}') -if MODE.startswith('eval'): - with open('pot.gftbl') as file: - gf = [line.strip() for line in file] - with open('pot.json') as file: - wikt = wikt_forms_for_pos(json.loads(file.read())) - report = compare_tables(gf, wikt) - - if MODE == 'eval-verbose': - for line in report.items(): - print(line) - else: - print(report['fun'], 'forms', report['total_found'], - 'voc', report['total_voc'], 'unvoc', report['total_unvoc']) +def eval_all(gr, funmap, concrete=CONCRETE_MODULE): + lang = gr.languages[CONCRETE_MODULE] + funs = gr.functions + reports = [] + for fun in funs: + funn = "'" + fun + "'" + if funn not in funmap: + print(funn, 'not found') + continue + wikt = wikt_forms_for_pos(funmap[funn]) + gf = lang.tabularLinearize(pgf.Expr(fun, [])) + report = compare_tables(gf, wikt, fun) + reports.append(report) + return reports + + +def first_error(report): + for f, v in report.items(): + if 'voc_match' in v: + if v['voc_match'] == 0: + return f, v + + +if MODE.startswith('eval'): + gr = pgf.readPGF(PGF_FILE) + print('using', PGF_FILE) + funmap = read_function_source_map() + print(len(funmap), 'functions') + for report in eval_all(gr, funmap): + + if MODE == 'eval-verbose': + for line in report.items(): + print(line) + else: + if report['total_found'] == 0: + verdict = 'NOT_FOUND' + elif report['total_found'] == report['total_voc']: + verdict = 'PERFECT' + elif report['total_found'] == report['total_unvoc']: + verdict = 'PERFECT_UNVOC ' + str(first_error(report)) + elif report['total_voc'] == 0: + verdict = 'TOTALLY_WRONG ' + str(first_error(report)) + else: + verdict = 'PARTIAL ' + str(first_error(report)) + print(report['fun'], 'forms', report['total_found'], + 'voc', report['total_voc'], 'unvoc', report['total_unvoc'], + verdict + ) From edecc3fe57cac46e5b03079d9e87674f73626acb Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 14 Sep 2023 18:21:18 +0200 Subject: [PATCH 10/19] a quick way to extract wordnet morphology --- src/arabic/wiktionary/to_wordnet.py | 46 +++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 src/arabic/wiktionary/to_wordnet.py diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py new file mode 100644 index 000000000..7496e769b --- /dev/null +++ b/src/arabic/wiktionary/to_wordnet.py @@ -0,0 +1,46 @@ +import csv +import json + +# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl +# the following are assumed + +WN_TSV = 'arabic.tsv' +MORPHO_GF = 'MorphoDictAraAbs.gf' + +def is_arabic(s): + return s and any(1574 <= ord(c) <= 1616 for c in s) + +def get_arabic(s): + return ''.join([c for c in s if is_arabic(c)]) + +def unvocalize(s): + return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + + +# fun 'دُبُ_N' : N ; -- 10 [['bear']] +funmap = {} +with open(MORPHO_GF) as gffile: + for line in gffile: + line = line.split() + if line[2:] and line[0] == 'fun': + fun = line[1] + key = unvocalize(fun) + cat = line[3] + sense = ' '.join(line[6:]) + funmap[(key, cat)] = funmap.get((key, cat), []) + funmap[(key, cat)].append({'fun': fun, 'sense': sense}) + + +# abandon_1_V2 ParseAra ترك (1,1,1,3,322,3) +with open(WN_TSV) as wnfile: +## wnreader = csv.reader(wnfile, delimiter='\t') + for row in wnfile: +## word = row[-1].strip() # does not show tha arabic, but the second-last word + word = get_arabic(row) + wnfun = row.split()[0] + cat = [c for c in wnfun if c.isalpha()][-1] # the last letter; the dict only contains N, A, V + funs = funmap.get((word, cat), []) + result = {'wnfun': wnfun, 'sought': word, 'found': funs} + print(json.dumps(result, ensure_ascii=False)) + + From 73f0b8ef00d944580b793020ef9cc94a7064b622 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Fri, 15 Sep 2023 14:48:23 +0200 Subject: [PATCH 11/19] commented and refactored read_wiktionary.py --- src/arabic/wiktionary/read_wiktionary.py | 213 ++++++++++++++--------- src/arabic/wiktionary/to_wordnet.py | 6 +- 2 files changed, 133 insertions(+), 86 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 6db526c33..6ee6e10e8 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -31,7 +31,7 @@ create GF files: automatic evaluation: $ gf -make MorphoDictAra.gf - $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl + $ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl $ python3 read_wiktionary.py eval TODO: @@ -42,8 +42,6 @@ TODO: """ - - MODE = '' if __name__ == '__main__': @@ -53,8 +51,9 @@ if __name__ == '__main__': MODE = sys.argv[1] # -# step 1: extract data from this file using the raw option +# step 1: extract Arabic data from this file using the raw option WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' +EXTRACTED_LANGUAGE = 'Arabic' # the following file is generated. # in the sequel, use this file with gf-abs or gf-cnc option @@ -62,24 +61,18 @@ FILTERED_WIKT = 'wikt_arabic.jsonl' # map each successfully extracted GF function to its source record in Wiktionary # created with option gf-map -FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl' +FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl' +# created with $ gf -make MorphoDictAra.gf PGF_FILE = 'MorphoDictAraAbs.pgf' + +# module to linearize with CONCRETE_MODULE = 'MorphoDictAra' - -def read_function_source_map(): - with open(FUNCTION_SOURCE_MAP) as file: - sourcemap = {} - for line in file: - try: - obj = json.loads(line) - sourcemap[obj['fun']] = obj['source'] - except: - continue - return sourcemap - +# read a gzipped jsonl file (one object per line), +# showing lines where one of a list of languages is present +# This can be sampled to one of 100k lines by default, 1 for total recall. def get_gzip_json(file, sample=100000, langs=[]): with gzip.open(file) as decompressed: n = 0 @@ -91,10 +84,13 @@ def get_gzip_json(file, sample=100000, langs=[]): print(line.decode("utf-8")) # print(n) -if MODE == 'raw': - get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) +# to perform the first step of data extraction, pipe this into a file: # python3 read_wiktionary.py raw >wikt_arabic.jsonl +if MODE == 'raw': + get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE]) + exit() + # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -177,19 +173,22 @@ def reorder_shadda(s): return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) -# quote forms but not parameters +# quote word forms but not parameters def quote_if(s, cond=is_arabic, change=reorder_shadda): if cond(s): return '"' + change(s) + '"' else: return s + +# generate word_d_C functions starting with d=0, but show d only when >= 1 def gf_fun(s, pos, disamb=0): discrim = '_' + str(disamb) if disamb else '' return ''.join(["'", s, discrim, "_", pos, "'"]) -rgl_features = { +# mapping from GF to Wikt features +arabic_rgl_features = { # V 'VPerf': 'perfective', 'Act': 'active', @@ -224,62 +223,22 @@ rgl_features = { 'AComp': 'comparative' } - -# obsolote: -# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ -# coming from 'l -treebank -table' -# now used: -# {'s (AComp Def Bare)': 'الأَيَُونَانِ'} -# coming from tabularLinearize - -def compare_tables(gf, wikt, fun): - report = {} - for pair in gf.items(): - gf_form = pair[1] - gf_tags = tuple(word for word in - pair[0].replace('(', ' ').replace(')', ' ').split() - if word in rgl_features) - if not gf_tags: - continue - wikt_tags = {rgl_features[tag] for tag in gf_tags} - wikt_form = None - wikt_descr = None - for form, descr in wikt: - if all([tag in descr for tag in wikt_tags]): - wikt_form = reorder_shadda(form) - wikt_descr = descr - break - report[gf_tags] = { - 'gf_form': gf_form, - 'wikt_form': wikt_form, - 'gf_form_rom': to_buckwalter(gf_form) if gf_form else None, - 'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None, - 'wikt_descr': wikt_descr - } - if wikt_form: - report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) - report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) - ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items - report['fun'] = fun - report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) - report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) - report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) - return report - - - -def wikt_forms_for_pos(obj): + +# the inflection forms in a wiktionary entry +def wikt_forms_from_obj(obj): return { form['form']: form.get('tags', []) for form in obj.get('forms', []) if 'romanization' not in form.get('tags', []) and is_arabic(form['form']) - }.items() + } +# selection of forms for a given POS from Wikt: noun, adj, or verb +# return a linearization function def forms_for_pos(obj): - forms = wikt_forms_for_pos(obj) + forms = wikt_forms_from_obj(obj).items() if obj['pos'] == 'noun': lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] @@ -345,46 +304,60 @@ def forms_for_pos(obj): if obj['root'] and obj['root'][0].strip(): gf_entry['args']['root'] = obj['root'] args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] - gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' + gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' return gf_entry + # "root": ["ش ر ح (š-r-ḥ)"] def find_root(s): return ''.join([c for c in s if is_arabic(c)]) + +# GF code generation + +# start with the header of the desired GF module + if MODE == 'gf-abs': print('abstract MorphoDictAraAbs = Cat ** {') if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') - +# go through the Arabic Wiktionary entries +# generate functions with unique names + if MODE.startswith('gf') or MODE=='json': with open(FILTERED_WIKT) as file: - seen_gf_funs = {} + seen_gf_funs = {} # to disambiguate names if needed number = 1 for line in file: try: obj = json.loads(line) except: continue - number += 1 + number += 1 # if you find the same word_C again, mark it word_1_C + + # the root (three radicals) is found in this place if at all root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1] obj['root'] = root + + # only take entries that are marked as lemmas if 'Arabic lemmas' in obj.get('categories', []): entry = { 'pos': obj['pos'], 'forms': forms_for_pos(obj), + 'all_forms': wikt_forms_from_obj(obj), 'senses': [sense['glosses'] for sense in obj.get('senses', []) if 'glosses' in sense] } -# entry['n_forms'] = len(entry['forms']) -# print(entry['pos'], entry['n_forms']) + + # if you only want to see the Wikt information used GF generation if MODE == 'json': print(json.dumps(entry, ensure_ascii=False)) - + + # if you want to proceed to GF generation if MODE.startswith('gf'): lemma = entry['forms'].get('lemma', None) @@ -393,23 +366,74 @@ if MODE.startswith('gf') or MODE=='json': lin = entry['forms']['lin'] discrim = seen_gf_funs.get((lemma, cat), 0) fun = gf_fun(lemma, cat, discrim) - + + # abstract syntax, save in MorphoDictAraAbs.gf if MODE == 'gf-abs': print('fun', fun, ':', cat, ';', '--', number, entry['senses']) - if MODE == 'gf-cnc': + + # concrete syntax, save in MorphoDictAra.gf + elif MODE == 'gf-cnc': print('lin', fun, '=', lin, ';') - if MODE == 'gf-map': - mapitem = {'fun': fun, 'source': obj} + + # function-source map, save in source_of_MorphoDictAra.jsonl + elif MODE == 'gf-map': + mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)} print(json.dumps(mapitem, ensure_ascii=False)) - seen_gf_funs[(lemma, cat)] = discrim + 1 + seen_gf_funs[(lemma, cat)] = discrim + 1 # next word_d_C will get a new number - # to do: rename duplicate function names: of 13762 names, 12946 are unique - -if MODE.startswith('gf'): +# terminate the GF file with a closing brace +if MODE in ['gf-abs', 'gf-cnc']: print('}') - + +# evaluation: +# linearize all words to tables +# compare them to the forms found in Wiktionary +# report on matches + +# format of GF table: +# {'s (AComp Def Bare)': 'الأَيَُونَانِ'} +# coming from pgf tabularLinearize + +def compare_tables(gf, wikt, fun, show_buckwalter=True): + report = {} + for pair in gf.items(): + gf_form = pair[1] + gf_params = pair[0] + gf_tags = tuple(word for word in + pair[0].replace('(', ' ').replace(')', ' ').split() + if word in arabic_rgl_features) + if not gf_tags: + continue # if gf_tags match no Wikt tags, do not include this form + wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags} + wikt_form = None + wikt_descr = None + for form, descr in wikt: + if all([tag in descr for tag in wikt_tags]): + wikt_form = reorder_shadda(form) + wikt_descr = descr + break + report[gf_tags] = { # flat param description with only Wikt-relevant tags + 'gf_params': gf_params, # full param description + 'gf_form': gf_form, + 'wikt_form': wikt_form, + 'wikt_descr': wikt_descr + } + if show_buckwalter: + report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None, + report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None, + if wikt_form: + report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) + report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) + ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items + report['fun'] = fun + report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) + report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) + report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) + return report + + def eval_all(gr, funmap, concrete=CONCRETE_MODULE): lang = gr.languages[CONCRETE_MODULE] funs = gr.functions @@ -419,13 +443,14 @@ def eval_all(gr, funmap, concrete=CONCRETE_MODULE): if funn not in funmap: print(funn, 'not found') continue - wikt = wikt_forms_for_pos(funmap[funn]) + wikt = funmap[funn].items() gf = lang.tabularLinearize(pgf.Expr(fun, [])) report = compare_tables(gf, wikt, fun) reports.append(report) return reports +# in the summary report: print the first error if anything gets wrong def first_error(report): for f, v in report.items(): if 'voc_match' in v: @@ -433,6 +458,20 @@ def first_error(report): return f, v +# having stored the Wiktionary object for each GF function +# read it back from a file +def read_function_source_map(): + with open(FUNCTION_SOURCE_MAP) as file: + sourcemap = {} + for line in file: + try: + obj = json.loads(line) + sourcemap[obj['fun']] = obj['source'] + except: + continue + return sourcemap + + if MODE.startswith('eval'): gr = pgf.readPGF(PGF_FILE) print('using', PGF_FILE) @@ -443,6 +482,10 @@ if MODE.startswith('eval'): if MODE == 'eval-verbose': for line in report.items(): print(line) + if MODE == 'eval-tables': + for gftags, value in report.items(): + if v := value['wikt_form']: + print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;') else: if report['total_found'] == 0: verdict = 'NOT_FOUND' diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py index 7496e769b..144e4cc1a 100644 --- a/src/arabic/wiktionary/to_wordnet.py +++ b/src/arabic/wiktionary/to_wordnet.py @@ -4,7 +4,11 @@ import json # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl # the following are assumed + +# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz WN_TSV = 'arabic.tsv' + +# built as explained in ./read_wiktionary.py MORPHO_GF = 'MorphoDictAraAbs.gf' def is_arabic(s): @@ -36,7 +40,7 @@ with open(WN_TSV) as wnfile: ## wnreader = csv.reader(wnfile, delimiter='\t') for row in wnfile: ## word = row[-1].strip() # does not show tha arabic, but the second-last word - word = get_arabic(row) + word = unvocalize(get_arabic(row)) wnfun = row.split()[0] cat = [c for c in wnfun if c.isalpha()][-1] # the last letter; the dict only contains N, A, V funs = funmap.get((word, cat), []) From 9e8c5eaad5699ee2e45e269f2c27a28a36185dd1 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Mon, 18 Sep 2023 08:52:32 +0200 Subject: [PATCH 12/19] arabic/wiktionary: including root in the form list --- src/arabic/wiktionary/read_wiktionary.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 6ee6e10e8..9a1d76fef 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -226,19 +226,28 @@ arabic_rgl_features = { # the inflection forms in a wiktionary entry def wikt_forms_from_obj(obj): - return { + forms = { form['form']: form.get('tags', []) for form in obj.get('forms', []) if 'romanization' not in form.get('tags', []) and is_arabic(form['form']) } + # the root (three radicals) is found in this place if at all + root = [find_root(t['expansion']) for + t in obj.get('etymology_templates', []) if + t.get('name', None) =='ar-root'][:1] + if root and root[0].strip(): + forms['root'] = root[0].strip() + + return forms # selection of forms for a given POS from Wikt: noun, adj, or verb # return a linearization function def forms_for_pos(obj): - forms = wikt_forms_from_obj(obj).items() + dforms = wikt_forms_from_obj(obj) + forms = dforms.items() if obj['pos'] == 'noun': lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] @@ -301,8 +310,8 @@ def forms_for_pos(obj): if 'lemma' in gf_entry and gf_entry['lemma']: gf_entry['lemma'] = gf_entry['lemma'][0] - if obj['root'] and obj['root'][0].strip(): - gf_entry['args']['root'] = obj['root'] + if 'root' in dforms: + gf_entry['args']['root'] = [dforms['root']] args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' @@ -337,12 +346,6 @@ if MODE.startswith('gf') or MODE=='json': continue number += 1 # if you find the same word_C again, mark it word_1_C - # the root (three radicals) is found in this place if at all - root = [find_root(t['expansion']) for - t in obj.get('etymology_templates', []) if - t.get('name', None) =='ar-root'][:1] - obj['root'] = root - # only take entries that are marked as lemmas if 'Arabic lemmas' in obj.get('categories', []): entry = { From abcb3a9f2aa7d421072ed26f171d2cfd46ca688e Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Wed, 20 Sep 2023 11:54:29 +0200 Subject: [PATCH 13/19] improving evaluation of wiktionary generated lexicon --- src/arabic/wiktionary/read_wiktionary.py | 186 ++++++++++++++--------- 1 file changed, 110 insertions(+), 76 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 9a1d76fef..434617231 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -69,6 +69,8 @@ PGF_FILE = 'MorphoDictAraAbs.pgf' # module to linearize with CONCRETE_MODULE = 'MorphoDictAra' +# concrete syntax file, to debug sources of linearizations +CONCRETE_FILE = CONCRETE_MODULE + '.gf' # read a gzipped jsonl file (one object per line), # showing lines where one of a list of languages is present @@ -144,6 +146,9 @@ buckwalter_dict = { buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} +arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} + +sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya def to_buckwalter(s): return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) @@ -157,12 +162,28 @@ def unvocalize(s): return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) +def drop_final_vowel(s): + if s[-1] in arabic_vowels: + return s[:-1] + else: + return s + + def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) def normal(s): return unicodedata.normalize('NFD', s) +# heuristic for finding the three radicals from certain forms +# works only for sound (strong) 3-radical roots, otherwise None +def get_sound_trigram_root(s): + sounds = [c for c in s if c in sound_consonants] + if len(sounds) == 3: + return ''.join(sounds) + else: + return None + # Wikt uses vowel+shadda which is a Unicode normalization # GF uses shadda+vowel which is linguistically correct @@ -216,18 +237,18 @@ arabic_rgl_features = { 'Gen': 'genitive', # 'Bare': # 'Dat': - 'Const': 'construct', + 'Const': 'construct' # 'Poss': - #A: also N features - 'APosit': 'positive', - 'AComp': 'comparative' + #A: also N features; degree features cannot be found +# 'APosit': 'positive', +# 'AComp': 'comparative' } # the inflection forms in a wiktionary entry def wikt_forms_from_obj(obj): forms = { - form['form']: + reorder_shadda(form['form']): form.get('tags', []) for form in obj.get('forms', []) if 'romanization' not in form.get('tags', []) and @@ -249,9 +270,9 @@ def forms_for_pos(obj): dforms = wikt_forms_from_obj(obj) forms = dforms.items() if obj['pos'] == 'noun': - lemma = [form[:-1] for form, descr in forms + lemma = [drop_final_vowel(form) for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] - plural = [form[:-1] for form, descr in forms + plural = [drop_final_vowel(form) for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1] gender = (['fem'] if 'Arabic feminine nouns' in obj['categories'] else (['masc'] if 'Arabic masculine nouns' in obj['categories'] @@ -312,8 +333,11 @@ def forms_for_pos(obj): gf_entry['lemma'] = gf_entry['lemma'][0] if 'root' in dforms: gf_entry['args']['root'] = [dforms['root']] - args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] - gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' + elif root := get_sound_trigram_root(gf_entry['lemma']): + gf_entry['args']['root'] = [root] + args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x]) + gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}' + gf_entry['labels'] = ','.join([r for r, v in args]) return gf_entry @@ -367,6 +391,7 @@ if MODE.startswith('gf') or MODE=='json': if lemma: cat = entry['forms']['cat'] lin = entry['forms']['lin'] + labels = entry['forms']['labels'] discrim = seen_gf_funs.get((lemma, cat), 0) fun = gf_fun(lemma, cat, discrim) @@ -380,7 +405,9 @@ if MODE.startswith('gf') or MODE=='json': # function-source map, save in source_of_MorphoDictAra.jsonl elif MODE == 'gf-map': - mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)} + source = wikt_forms_from_obj(obj) + source['gf_labels'] = labels + mapitem = {'fun': fun, 'source': source} print(json.dumps(mapitem, ensure_ascii=False)) seen_gf_funs[(lemma, cat)] = discrim + 1 # next word_d_C will get a new number @@ -399,6 +426,7 @@ if MODE in ['gf-abs', 'gf-cnc']: # {'s (AComp Def Bare)': 'الأَيَُونَانِ'} # coming from pgf tabularLinearize +# compare the table for one function, returning a report as a dict def compare_tables(gf, wikt, fun, show_buckwalter=True): report = {} for pair in gf.items(): @@ -412,7 +440,7 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True): wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags} wikt_form = None wikt_descr = None - for form, descr in wikt: + for form, descr in wikt.items(): if all([tag in descr for tag in wikt_tags]): wikt_form = reorder_shadda(form) wikt_descr = descr @@ -424,84 +452,90 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True): 'wikt_descr': wikt_descr } if show_buckwalter: - report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None, - report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None, + report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None + report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None if wikt_form: report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items report['fun'] = fun + report['labels'] = wikt['gf_labels'] report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) return report -def eval_all(gr, funmap, concrete=CONCRETE_MODULE): - lang = gr.languages[CONCRETE_MODULE] - funs = gr.functions - reports = [] - for fun in funs: - funn = "'" + fun + "'" - if funn not in funmap: - print(funn, 'not found') - continue - wikt = funmap[funn].items() - gf = lang.tabularLinearize(pgf.Expr(fun, [])) - report = compare_tables(gf, wikt, fun) - reports.append(report) - return reports +# with a given grammar and function, prepare input for compare_tables +# and produce a report, possibly summarizing it +def eval_with_wikt(gr, lang, fun, wikt, verbose=False): + if fun not in gr.functions: + print(fun, 'not found in grammar') + return + gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items() + if p.startswith('s ')} # require the s field, exclude s2 + report = compare_tables(gf, wikt, fun) + if verbose: + return report + else: + if report['total_found'] == 0: + verdict = 'NOT_FOUND' + flaws = False + elif report['total_found'] == report['total_voc']: + verdict = 'PERFECT' + flaws = False + elif report['total_found'] == report['total_unvoc']: + verdict = 'PERFECT_UNVOC' + flaws = True + elif report['total_voc'] == 0: + verdict = 'TOTALLY_WRONG' + flaws = True + else: + verdict = 'PARTIAL' + flaws = True + summary = { + 'fun': report['fun'], + 'forms': report['total_found'], + 'voc': report['total_voc'], + 'unvoc': report['total_unvoc'], + 'verdict': verdict, + 'labels': report['labels'] + } - -# in the summary report: print the first error if anything gets wrong -def first_error(report): - for f, v in report.items(): - if 'voc_match' in v: - if v['voc_match'] == 0: - return f, v - - -# having stored the Wiktionary object for each GF function -# read it back from a file -def read_function_source_map(): - with open(FUNCTION_SOURCE_MAP) as file: - sourcemap = {} - for line in file: - try: - obj = json.loads(line) - sourcemap[obj['fun']] = obj['source'] - except: - continue - return sourcemap + if flaws: + for f, v in report.items(): + if v.get('voc_match', 1) == 0: + summary['first_error'] = v + break + return summary +def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False): + gr = pgf.readPGF(pgffile) + concrete = gr.languages[concretename] + + totals = {'A': {}, 'N': {}, 'V': {}} + + with open(mapfile) as file: + for line in file: + obj = json.loads(line) + fun = obj['fun'][1:-1] + report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose) + + cat = fun[-1] + if 'verdict' in report: + rep = report['verdict'] + totals[cat][rep] = totals[cat].get(rep, 0) + 1 + + if show: + print(report) + + print(totals) + + if MODE.startswith('eval'): - gr = pgf.readPGF(PGF_FILE) - print('using', PGF_FILE) - funmap = read_function_source_map() - print(len(funmap), 'functions') - for report in eval_all(gr, funmap): - - if MODE == 'eval-verbose': - for line in report.items(): - print(line) - if MODE == 'eval-tables': - for gftags, value in report.items(): - if v := value['wikt_form']: - print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;') - else: - if report['total_found'] == 0: - verdict = 'NOT_FOUND' - elif report['total_found'] == report['total_voc']: - verdict = 'PERFECT' - elif report['total_found'] == report['total_unvoc']: - verdict = 'PERFECT_UNVOC ' + str(first_error(report)) - elif report['total_voc'] == 0: - verdict = 'TOTALLY_WRONG ' + str(first_error(report)) - else: - verdict = 'PARTIAL ' + str(first_error(report)) - print(report['fun'], 'forms', report['total_found'], - 'voc', report['total_voc'], 'unvoc', report['total_unvoc'], - verdict - ) + verbose = MODE=='eval-verbose' + show = verbose or MODE=='eval-funs' + eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show, verbose) + From 24199311058e87a72ecde6e9d8bd835a8c143e02 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Wed, 20 Sep 2023 11:54:59 +0200 Subject: [PATCH 14/19] some more paradigms for Arabic Wiktionary generation --- src/arabic/ParadigmsAra.gf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index 20892fed8..1b3cfc85b 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -885,8 +885,10 @@ oper = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt wmkN : {sg : Str; pl : Str} -> N = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ; + wmkN : {sg, pl : Str ; root : Str} -> N + = \r -> mkN r.sg r.pl masc nohum ; ---- wmkN : {sg : Str; root : Str} -> N - = \r -> smartN r.sg ; + = \r -> smartN r.sg ; } ; wmkA = overload { @@ -928,7 +930,7 @@ oper wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V = \r -> mkV r.root r.cls ; ---- wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V - = \r -> variants {} ; ---- mkV r.imperfect ; ---- + = \r -> mkV r.perfect r.cls ; ---- wmkV : {root : Str ; cls : VerbForm} -> V = \r -> mkV r.root r.cls ; wmkV : {imperfect : Str} -> V From fdd7c9641ea6b14af6dfd0bf21456a7071b33332 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 20 Sep 2023 16:05:46 +0200 Subject: [PATCH 15/19] Ara: improving Adj inflection by identifying fcl patterns from concrete forms --- src/arabic/MorphoAra.gf | 6 +++-- src/arabic/ParadigmsAra.gf | 32 +++++++++++++++++++++--- src/arabic/wiktionary/Makefile | 7 ++++++ src/arabic/wiktionary/read_wiktionary.py | 28 +++++++++++++++++++-- 4 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 src/arabic/wiktionary/Makefile diff --git a/src/arabic/MorphoAra.gf b/src/arabic/MorphoAra.gf index 808223b4d..53f7a2608 100644 --- a/src/arabic/MorphoAra.gf +++ b/src/arabic/MorphoAra.gf @@ -153,7 +153,8 @@ oper w + "ف" + x + "ع" + y + "ل" + z => { h = w ; m1 = x; m2 = y; t = z} ; w + "ف" + x + ("ع"|"ل") + y - => { h = w ; m1 = x; m2 = ""; t = y} + => { h = w ; m1 = x; m2 = ""; t = y} ; + _ => Predef.error("cannot get FCL pattern from" ++ pat) } ; --opers to interdigitize (make words out of roots and patterns: @@ -204,7 +205,8 @@ oper => mkAssimilated pat (mkRoot3 rS) ; ? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=> _=> error rS ---- AR error "expected 3--6" - } + } ; + _ => Predef.error("cannot get FCL pattern from" ++ pS) }; ----------------------------------------------------------------------------- diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index 1b3cfc85b..3d1623e14 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -898,12 +898,30 @@ oper = \r -> mkA r.root ; mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A - = \r -> mkA r.root ; + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, root : Str} -> A + = \r -> mkA r.root ; ---- + mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A - = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; mkA : {masc_sg : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A @@ -914,8 +932,14 @@ oper = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A + = \r -> mkA r.sg_patt r.pl_patt ; mkA : {masc_sg : Str; masc_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- mkA : {masc_sg : Str; root : Str} -> A = \r -> mkA r.root ; mkA : {masc_sg : Str} -> A diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile new file mode 100644 index 000000000..80e1da791 --- /dev/null +++ b/src/arabic/wiktionary/Makefile @@ -0,0 +1,7 @@ +all: + python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf + python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf + python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl + gf -make MorphoDictAra.gf + python3 read_wiktionary.py eval-funs >1-eval.txt + python3 to_wordnet.py >wornet-arabic.jsonl diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 434617231..960a592d3 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -122,7 +122,7 @@ buckwalter_dict = { 0x638: 'Z', # ظ 0x639: 'E', # ع 0x63a: 'g', # غ - 0x641: 'f', # ف + 0x641: 'f', # ف 0x642: 'q', # ق 0x643: 'k', # ك 0x644: 'l', # ل @@ -144,6 +144,7 @@ buckwalter_dict = { 0x671: '{' # ٱ } + buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} @@ -184,6 +185,24 @@ def get_sound_trigram_root(s): else: return None + +# reverse engineer fcl pattern from a given form, with a sound trigram root +# one more condition: each of the root letters occurs exactly ones +# TODO: better use the given root of the lex entry +def get_sound_fcl_pattern(s): + if root := get_sound_trigram_root(s): + if len([c in s for c in root]) == 3: + p = list(s) + r = s.find(root[0]) + p[r] = chr(0x641) + r += s[r+1:].find(root[1]) + 1 + p[r] = chr(0x639) + r += s[r+1:].find(root[2]) + 1 + p[r] = chr(0x644) + p = ''.join(p) +## print('---PATT', s, root, p) + return p + # Wikt uses vowel+shadda which is a Unicode normalization # GF uses shadda+vowel which is linguistically correct @@ -324,7 +343,12 @@ def forms_for_pos(obj): 'fem_pl': [form for form, descr in forms if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1], } - } + } + for patt in ['masc_sg', 'masc_pl']: + if patt in gf_entry['args']: + if form := gf_entry['args'][patt]: + if spatt := get_sound_fcl_pattern(form[0]): + gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt else: gf_entry = {f: d for f, d in forms} From 7e383b746e81544dfeee9ae776fa84ac07e3c4f9 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Thu, 21 Sep 2023 15:46:41 +0200 Subject: [PATCH 16/19] moved wikt-specific paradigms to a separate file (for the moment) --- src/arabic/ParadigmsAra.gf | 54 ++++++++++++------------ src/arabic/wiktionary/Makefile | 2 +- src/arabic/wiktionary/read_wiktionary.py | 21 +++++++-- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index 3d1623e14..80506ebbb 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -868,6 +868,8 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of { param VerbForm = FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ; + +{- temporarily moved to wiktionary/MoreAra.gf -- paradigms for Wiktionary extraction ---- TODO: better usage of information in Wiktionary @@ -894,55 +896,55 @@ oper wmkA = overload { wmkA : {root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A = \r -> mkA r.root r.sg_patt r.pl_patt ; - mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A = \r -> mkA r.root r.sg_patt r.pl_patt ; - mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, root, sg_patt : Str} -> A + wmkA : {masc_sg, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A + wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, root : Str} -> A + wmkA : {masc_sg, fem_sg, root : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg : Str; fem_sg : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A = \r -> mkA r.sg_patt r.pl_patt ; - mkA : {masc_sg : Str; masc_pl : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; root : Str} -> A + wmkA : {masc_sg : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str} -> A + wmkA : {masc_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- } ; @@ -960,5 +962,5 @@ oper wmkV : {imperfect : Str} -> V = \r -> variants {} ; ---- mkV r.imperfect ; } ; - +-} } ; diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile index 80e1da791..58fcf2b6d 100644 --- a/src/arabic/wiktionary/Makefile +++ b/src/arabic/wiktionary/Makefile @@ -4,4 +4,4 @@ all: python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl gf -make MorphoDictAra.gf python3 read_wiktionary.py eval-funs >1-eval.txt - python3 to_wordnet.py >wornet-arabic.jsonl + python3 to_wordnet.py >wordnet-arabic.jsonl diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 960a592d3..69099294e 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -71,6 +71,10 @@ CONCRETE_MODULE = 'MorphoDictAra' # concrete syntax file, to debug sources of linearizations CONCRETE_FILE = CONCRETE_MODULE + '.gf' + +# evaluation result file, created with mode eval-funs +EVAL_FILE = 'eval.jsonl' + # read a gzipped jsonl file (one object per line), # showing lines where one of a list of languages is present @@ -93,6 +97,17 @@ if MODE == 'raw': get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE]) exit() + +if MODE == 'error-analysis': + evals = {} + with open(EVAL_FILE) as file: + for line in file: + row = json.loads(line) + if labels := row.get('labels', None): + verdict = row['verdict'] + evals[(labels, verdict)] = evals.get((labels, verdict), 0) + 1 + for labverdict, n in sorted(list(evals.items())): + print(labverdict, n) # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -378,7 +393,7 @@ def find_root(s): if MODE == 'gf-abs': print('abstract MorphoDictAraAbs = Cat ** {') if MODE == 'gf-cnc': - print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') + print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {') # go through the Arabic Wiktionary entries # generate functions with unique names @@ -552,9 +567,9 @@ def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False): totals[cat][rep] = totals[cat].get(rep, 0) + 1 if show: - print(report) + print(json.dumps(report, ensure_ascii=False)) - print(totals) + print(json.dumps(totals, ensure_ascii=False)) if MODE.startswith('eval'): From aa1dff67026918764e0c3c03697120c828e6b4ea Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 21 Sep 2023 17:29:38 +0200 Subject: [PATCH 17/19] added MoreAra.gf --- src/arabic/wiktionary/Makefile | 3 +- src/arabic/wiktionary/MoreAra.gf | 98 ++++++++++++++++++++++++ src/arabic/wiktionary/read_wiktionary.py | 3 +- 3 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 src/arabic/wiktionary/MoreAra.gf diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile index 58fcf2b6d..a14e23e52 100644 --- a/src/arabic/wiktionary/Makefile +++ b/src/arabic/wiktionary/Makefile @@ -3,5 +3,6 @@ all: python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl gf -make MorphoDictAra.gf - python3 read_wiktionary.py eval-funs >1-eval.txt + python3 read_wiktionary.py eval-funs >eval.jsonl python3 to_wordnet.py >wordnet-arabic.jsonl + python3 read_wiktionary.py error-analysis diff --git a/src/arabic/wiktionary/MoreAra.gf b/src/arabic/wiktionary/MoreAra.gf new file mode 100644 index 000000000..e45b49b58 --- /dev/null +++ b/src/arabic/wiktionary/MoreAra.gf @@ -0,0 +1,98 @@ +resource MoreAra = CatAra ** open ParadigmsAra in { + + +-- temporarily moved from ParadigmsAra +-- paradigms for Wiktionary extraction +---- TODO: better usage of information in Wiktionary + +oper + wmkN = overload { + wmkN : {sg, pl : Str ; g : Gender} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str} -> N + = \r -> smartN r.sg ; + wmkN : {sg : Str ; g : Gender ; root : Str} -> N + = \r -> smartN r.sg ** {g = r.g} ; ---- + wmkN : {sg : Str; g : Gender} -> N + = \r -> smartN r.sg ** {g = r.g} ; + wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str; pl : Str} -> N + = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ; + wmkN : {sg, pl : Str ; root : Str} -> N + = \r -> mkN r.sg r.pl masc nohum ; ---- + wmkN : {sg : Str; root : Str} -> N + = \r -> smartN r.sg ; + } ; + + wmkA = overload { + wmkA : {root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A + = \r -> mkA r.root r.sg_patt r.pl_patt ; + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + wmkA : {masc_sg, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A + = \r -> mkA r.root ; ---- + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A + = \r -> mkA r.root ; ---- + wmkA : {masc_sg, fem_sg, root : Str} -> A + = \r -> mkA r.root ; ---- + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A + = \r -> mkA r.root r.sg_patt ; + wmkA : {masc_sg : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A + = \r -> mkA r.sg_patt r.pl_patt ; + wmkA : {masc_sg : Str; masc_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A + = \r -> mkA r.masc_sg ; ---- + wmkA : {masc_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + wmkA : {masc_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + } ; + + wmkV = overload { + wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; cls : VerbForm} -> V + = \r -> mkV r.perfect r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V + = \r -> mkV r.perfect r.cls ; ---- + wmkV : {root : Str ; cls : VerbForm} -> V + = \r -> mkV r.root r.cls ; + wmkV : {imperfect : Str} -> V + = \r -> variants {} ; ---- mkV r.imperfect ; + } ; + +} \ No newline at end of file diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 69099294e..140852c7a 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -104,8 +104,9 @@ if MODE == 'error-analysis': for line in file: row = json.loads(line) if labels := row.get('labels', None): + cat = row['fun'][-1] verdict = row['verdict'] - evals[(labels, verdict)] = evals.get((labels, verdict), 0) + 1 + evals[(cat, labels, verdict)] = evals.get((cat, labels, verdict), 0) + 1 for labverdict, n in sorted(list(evals.items())): print(labverdict, n) From 561a8c130d5b1f99e98a5ced819ff58f4858a65a Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Mon, 25 Sep 2023 08:22:47 +0200 Subject: [PATCH 18/19] to_wordnet applied to a new format of data --- src/arabic/wiktionary/to_wordnet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py index 144e4cc1a..b159c5f18 100644 --- a/src/arabic/wiktionary/to_wordnet.py +++ b/src/arabic/wiktionary/to_wordnet.py @@ -6,7 +6,8 @@ import json # from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz -WN_TSV = 'arabic.tsv' +# WN_TSV = 'arabic.tsv' # Krasimir +WN_TSV = 'ar2en_words_gf.csv' # Zarzoura # built as explained in ./read_wiktionary.py MORPHO_GF = 'MorphoDictAraAbs.gf' @@ -41,7 +42,7 @@ with open(WN_TSV) as wnfile: for row in wnfile: ## word = row[-1].strip() # does not show tha arabic, but the second-last word word = unvocalize(get_arabic(row)) - wnfun = row.split()[0] + wnfun = row.split()[-1] # 0 in Krasimir cat = [c for c in wnfun if c.isalpha()][-1] # the last letter; the dict only contains N, A, V funs = funmap.get((word, cat), []) result = {'wnfun': wnfun, 'sought': word, 'found': funs} From 1c355ce9dd49d1fd59090bb41a91a573cc9ce1c1 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Mon, 25 Sep 2023 09:22:21 +0200 Subject: [PATCH 19/19] factored out arabic_utilities.py as a separate file --- src/arabic/wiktionary/arabic_utilities.py | 169 ++++++++++++++++++++++ src/arabic/wiktionary/read_wiktionary.py | 128 +--------------- src/arabic/wiktionary/to_wordnet.py | 11 +- 3 files changed, 172 insertions(+), 136 deletions(-) create mode 100644 src/arabic/wiktionary/arabic_utilities.py diff --git a/src/arabic/wiktionary/arabic_utilities.py b/src/arabic/wiktionary/arabic_utilities.py new file mode 100644 index 000000000..29a15f105 --- /dev/null +++ b/src/arabic/wiktionary/arabic_utilities.py @@ -0,0 +1,169 @@ +# utilities for Arabic script +# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter +# as specified by the command line argument: +# +# % python3 arabic_utilities.py to b.tmp +# % diff MorphoDictAra.gf b.tmp +# % + +def is_arabic(s): + return s and any(1574 <= ord(c) <= 1616 for c in s) + + +def get_arabic(s): + return ''.join([c for c in s if is_arabic(c)]) + + +def unvocalize(s): + return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + + +# https://en.wikipedia.org/wiki/Buckwalter_transliteration +buckwalter_dict = { + 0x621: "'", # ء + 0x622: '|', # آ + 0x623: '>', # أ + 0x624: '&', # ؤ + 0x625: '<', # إ + 0x626: '}', # ئ + 0x627: 'A', # ا + 0x628: 'b', # ب + 0x629: 'p', # ة + 0x62a: 't', # ت + 0x62b: 'v', # ث + 0x62c: 'j', # ج + 0x62d: 'H', # ح + 0x62e: 'x', # خ + 0x62f: 'd', # د + 0x630: '*', # ذ + 0x631: 'r', # ر + 0x632: 'z', # ز + 0x633: 's', # س + 0x634: '$', # ش + 0x635: 'S', # ص + 0x636: 'D', # ض + 0x637: 'T', # ط + 0x638: 'Z', # ظ + 0x639: 'E', # ع + 0x63a: 'g', # غ + 0x641: 'f', # ف + 0x642: 'q', # ق + 0x643: 'k', # ك + 0x644: 'l', # ل + 0x645: 'm', # م + 0x646: 'n', # ن + 0x647: 'h', # ه + 0x648: 'w', # و + 0x649: 'Y', # ى + 0x64a: 'y', # ي + 0x64b: 'F', # ً + 0x64c: 'N', # ٌ + 0x64d: 'K', # ٍ + 0x64e: 'a', # َ + 0x64f: 'u', # ُ + 0x650: 'i', # ِ + 0x651: '~', # ّ + 0x652: 'o', # ْ + 0x670: '`', # ' + 0x671: '{' # ٱ + } + + +buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} + +arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} + +sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya + +def to_buckwalter(s): + return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) + + +def from_buckwalter(s): + return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) + + +def drop_final_vowel(s): + if s[-1] in arabic_vowels: + return s[:-1] + else: + return s + + +def normal(s): + return unicodedata.normalize('NFD', s) + +# heuristic for finding the three radicals from certain forms +# works only for sound (strong) 3-radical roots, otherwise None +def get_sound_trigram_root(s): + sounds = [c for c in s if c in sound_consonants] + if len(sounds) == 3: + return ''.join(sounds) + else: + return None + + +# reverse engineer fcl pattern from a given form, with a sound trigram root +# one more condition: each of the root letters occurs exactly ones +# TODO: better use the given root of the lex entry +def get_sound_fcl_pattern(s): + if root := get_sound_trigram_root(s): + if len([c in s for c in root]) == 3: + p = list(s) + r = s.find(root[0]) + p[r] = chr(0x641) + r += s[r+1:].find(root[1]) + 1 + p[r] = chr(0x639) + r += s[r+1:].find(root[2]) + 1 + p[r] = chr(0x644) + p = ''.join(p) +## print('---PATT', s, root, p) + return p + + +# Wikt uses vowel+shadda which is a Unicode normalization +# GF uses shadda+vowel which is linguistically correct +# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra +# unicodedata.normalize does this wrong, as noted by Ariel Gutman +## todo: more direct implementation +def reorder_shadda(s): + return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) + + +# quote word forms but not parameters +def quote_if(s, cond=is_arabic, change=reorder_shadda): + if cond(s): + return '"' + change(s) + '"' + else: + return s + + +# for a string, change each string literal in "..." with a change function +# leaving other characters as they are; print the string to stdout as you go +def change_literals(s, change): + inliteral = False + literal = '' + for c in s: + if c == '"' and inliteral: + print('"'+change(literal)+'"', end='') + inliteral = False + literal = '' + elif c == '"': + inliteral = True + elif inliteral: + literal += c + else: + print(c, end='') + + +# convert literals in stdin 'to' or 'from' Buckwalter +if __name__ == '__main__': + import sys + mode = sys.argv[1] + for line in sys.stdin: + if mode == 'from': + change_literals(line, from_buckwalter) + elif mode == 'to': + change_literals(line, to_buckwalter) + + diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 140852c7a..edfa69603 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -3,7 +3,7 @@ import json import sys import unicodedata import pgf - +from arabic_utilities import * # data from https://kaikki.org/dictionary/rawdata.html # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, @@ -110,132 +110,6 @@ if MODE == 'error-analysis': for labverdict, n in sorted(list(evals.items())): print(labverdict, n) -# https://en.wikipedia.org/wiki/Buckwalter_transliteration -buckwalter_dict = { - 0x621: "'", # ء - 0x622: '|', # آ - 0x623: '>', # أ - 0x624: '&', # ؤ - 0x625: '<', # إ - 0x626: '}', # ئ - 0x627: 'A', # ا - 0x628: 'b', # ب - 0x629: 'p', # ة - 0x62a: 't', # ت - 0x62b: 'v', # ث - 0x62c: 'j', # ج - 0x62d: 'H', # ح - 0x62e: 'x', # خ - 0x62f: 'd', # د - 0x630: '*', # ذ - 0x631: 'r', # ر - 0x632: 'z', # ز - 0x633: 's', # س - 0x634: '$', # ش - 0x635: 'S', # ص - 0x636: 'D', # ض - 0x637: 'T', # ط - 0x638: 'Z', # ظ - 0x639: 'E', # ع - 0x63a: 'g', # غ - 0x641: 'f', # ف - 0x642: 'q', # ق - 0x643: 'k', # ك - 0x644: 'l', # ل - 0x645: 'm', # م - 0x646: 'n', # ن - 0x647: 'h', # ه - 0x648: 'w', # و - 0x649: 'Y', # ى - 0x64a: 'y', # ي - 0x64b: 'F', # ً - 0x64c: 'N', # ٌ - 0x64d: 'K', # ٍ - 0x64e: 'a', # َ - 0x64f: 'u', # ُ - 0x650: 'i', # ِ - 0x651: '~', # ّ - 0x652: 'o', # ْ - 0x670: '`', # ' - 0x671: '{' # ٱ - } - - -buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} - -arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}} - -sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya - -def to_buckwalter(s): - return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) - - -def from_buckwalter(s): - return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) - - -def unvocalize(s): - return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) - - -def drop_final_vowel(s): - if s[-1] in arabic_vowels: - return s[:-1] - else: - return s - - -def is_arabic(s): - return s and any(1574 <= ord(c) <= 1616 for c in s) - -def normal(s): - return unicodedata.normalize('NFD', s) - -# heuristic for finding the three radicals from certain forms -# works only for sound (strong) 3-radical roots, otherwise None -def get_sound_trigram_root(s): - sounds = [c for c in s if c in sound_consonants] - if len(sounds) == 3: - return ''.join(sounds) - else: - return None - - -# reverse engineer fcl pattern from a given form, with a sound trigram root -# one more condition: each of the root letters occurs exactly ones -# TODO: better use the given root of the lex entry -def get_sound_fcl_pattern(s): - if root := get_sound_trigram_root(s): - if len([c in s for c in root]) == 3: - p = list(s) - r = s.find(root[0]) - p[r] = chr(0x641) - r += s[r+1:].find(root[1]) + 1 - p[r] = chr(0x639) - r += s[r+1:].find(root[2]) + 1 - p[r] = chr(0x644) - p = ''.join(p) -## print('---PATT', s, root, p) - return p - - -# Wikt uses vowel+shadda which is a Unicode normalization -# GF uses shadda+vowel which is linguistically correct -# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra -# unicodedata.normalize does this wrong, as noted by Ariel Gutman -## todo: more direct implementation -def reorder_shadda(s): - return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) - - -# quote word forms but not parameters -def quote_if(s, cond=is_arabic, change=reorder_shadda): - if cond(s): - return '"' + change(s) + '"' - else: - return s - # generate word_d_C functions starting with d=0, but show d only when >= 1 def gf_fun(s, pos, disamb=0): diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py index b159c5f18..2aae047db 100644 --- a/src/arabic/wiktionary/to_wordnet.py +++ b/src/arabic/wiktionary/to_wordnet.py @@ -1,6 +1,8 @@ import csv import json +from arabic_utilities import * + # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl # the following are assumed @@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv' # Zarzoura # built as explained in ./read_wiktionary.py MORPHO_GF = 'MorphoDictAraAbs.gf' -def is_arabic(s): - return s and any(1574 <= ord(c) <= 1616 for c in s) - -def get_arabic(s): - return ''.join([c for c in s if is_arabic(c)]) - -def unvocalize(s): - return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) - # fun 'دُبُ_N' : N ; -- 10 [['bear']] funmap = {}