From 8e029bd8dd24f8bd76c46cc2f811041be9682ab5 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 13 Sep 2023 17:24:21 +0200 Subject: [PATCH] Arabic Wiktionary: started comparing evaluation --- src/arabic/wiktionary/read_wiktionary.py | 85 ++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index ea8d805fd..574233dda 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -108,14 +108,76 @@ def gf_fun(s, pos, disamb=0): return ''.join(["'", s, discrim, "_", pos, "'"]) -def forms_for_pos(obj): - forms = { +rgl_features = { + # V + 'VPerf': 'perfective', + 'Act': 'active', + 'Pas': 'passive', + 'Per3': 'third-person', + 'Per2': 'second-person', + 'Masc': 'masculine', + 'Fem': 'feminine', + 'Sg': 'singular', + 'Pl': 'plural', + 'Dl': 'dual', + 'VImpf': 'imperfective', + 'Ind': 'indicative', + 'Cnj': 'subjunctive', + 'Jus': 'jussive', + 'VImp': 'imperative', + # N: also Sg, Pl, Dl + 'Def': 'definite', + 'Indef': 'indefinite', + 'Nom': 'nominative', + 'Acc': 'accusative', + 'Gen': 'genitive', +# 'Bare': +# 'Dat': + 'Const': 'construct', +# 'Poss': + #A: also N features + 'APosit': 'positive', + 'AComp': 'comparative' + } + + +# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ +def compare_tables(gf, wikt): + report = {} + for line in gf: + gf_form = line #''.join([c for c in line if 1574 <= ord(c) <= 1616]) + gf_tags = tuple(word for word in + line.replace('(', ' ').replace(')', ' ').split() + if word in rgl_features) + wikt_tags = {rgl_features[tag] for tag in gf_tags} + wikt_form = None + for form, descr in wikt: + if all([tag in descr for tag in wikt_tags]): + wikt_form = form + break + report[gf_tags] = { + 'gf_form': gf_form, + 'wikt_form': wikt_form + } + if wikt_form: + report[gf_tags]['voc_match'] = int(gf_form == wikt_form) + report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form)) + return report + + + +def wikt_forms_for_pos(obj): + return { form['form']: form.get('tags', []) for form in obj.get('forms', []) if 'romanization' not in form.get('tags', []) and is_arabic(form['form']) }.items() + + +def forms_for_pos(obj): + forms = wikt_forms_for_pos(obj) if obj['pos'] == 'noun': lemma = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] @@ -136,7 +198,8 @@ def forms_for_pos(obj): elif obj['pos'] == 'verb': lemma = [form for form, descr in forms if all([w in descr for - w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] + w in ["active", "indicative", "masculine", "past", + "perfective", "singular", "third-person"]])][:1] gf_entry = { 'cat': 'V', 'lemma': lemma, @@ -193,14 +256,16 @@ if MODE == 'gf-abs': if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') -if MODE != 'raw': +if MODE not in ['raw', 'eval']: with open(FILTERED_WIKT) as file: seen_gf_funs = {} + number = 1 for line in file: try: obj = json.loads(line) except: continue + number += 1 root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1] @@ -227,7 +292,7 @@ if MODE != 'raw': fun = gf_fun(lemma, cat, discrim) if MODE == 'gf-abs': - print('fun', fun, ':', cat, ';', '--', entry['senses']) + print('fun', fun, ':', cat, ';', '--', number, entry['senses']) if MODE == 'gf-cnc': print('lin', fun, '=', lin, ';') @@ -237,3 +302,13 @@ if MODE != 'raw': if MODE.startswith('gf'): print('}') + + +if MODE == 'eval': + with open('pot.gftbl') as file: + gf = [line.strip() for line in file] + with open('pot.json') as file: + wikt = wikt_forms_for_pos(json.loads(file.read())) + for line in compare_tables(gf, wikt).items(): + print(line) +