improving evaluation of wiktionary generated lexicon

2026-05-28 01:18:57 -06:00 · 2023-09-20 11:54:29 +02:00
parent 9e8c5eaad5
commit abcb3a9f2a
1 changed files with 110 additions and 76 deletions
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -69,6 +69,8 @@ PGF_FILE = 'MorphoDictAraAbs.pgf'
 # module to linearize with
 CONCRETE_MODULE = 'MorphoDictAra'
 # concrete syntax file, to debug sources of linearizations
 CONCRETE_FILE = CONCRETE_MODULE + '.gf'
 # read a gzipped jsonl file (one object per line),
 # showing lines where one of a list of languages is present
@@ -144,6 +146,9 @@ buckwalter_dict = {
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
 sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
 def to_buckwalter(s):
    return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
@@ -157,12 +162,28 @@ def unvocalize(s):
    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
 def drop_final_vowel(s):
    if s[-1] in arabic_vowels:
        return s[:-1]
    else:
        return s
 def is_arabic(s):
    return s and any(1574 <= ord(c) <= 1616 for c in s)
 def normal(s):
    return unicodedata.normalize('NFD', s)
 # heuristic for finding the three radicals from certain forms
 # works only for sound (strong) 3-radical roots, otherwise None
 def get_sound_trigram_root(s):
    sounds = [c for c in s if c in sound_consonants]
    if len(sounds) == 3:
        return ''.join(sounds)
    else:
        return None
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
@@ -216,18 +237,18 @@ arabic_rgl_features = {
    'Gen': 'genitive',
 #    'Bare':
 #    'Dat':
-    'Const': 'construct',
+    'Const': 'construct'
 #    'Poss':
-    #A: also N features
+    #A: also N features; degree features cannot be found
-    'APosit': 'positive',
+#    'APosit': 'positive',
-    'AComp': 'comparative'
+#    'AComp': 'comparative'
    }
 # the inflection forms in a wiktionary entry
 def wikt_forms_from_obj(obj):
    forms = {
-        form['form']:
+        reorder_shadda(form['form']):
          form.get('tags', []) for
            form in obj.get('forms', []) if
               'romanization' not in form.get('tags', []) and
@@ -249,9 +270,9 @@ def forms_for_pos(obj):
    dforms = wikt_forms_from_obj(obj)
    forms = dforms.items()
    if obj['pos'] == 'noun':
-        lemma = [form[:-1] for form, descr in forms
+        lemma = [drop_final_vowel(form) for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
-        plural = [form[:-1] for form, descr in forms
+        plural = [drop_final_vowel(form) for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                            else (['masc'] if  'Arabic masculine nouns' in obj['categories']
@@ -312,8 +333,11 @@ def forms_for_pos(obj):
        gf_entry['lemma'] = gf_entry['lemma'][0]
        if 'root' in dforms:
            gf_entry['args']['root'] = [dforms['root']]
-        args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
+        elif root := get_sound_trigram_root(gf_entry['lemma']):
-        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' 
+            gf_entry['args']['root'] = [root]
        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
        gf_entry['labels'] = ','.join([r for r, v in args])
    return gf_entry
@@ -367,6 +391,7 @@ if MODE.startswith('gf') or MODE=='json':
                if lemma:
                    cat = entry['forms']['cat']
                    lin = entry['forms']['lin']
                    labels = entry['forms']['labels']
                    discrim = seen_gf_funs.get((lemma, cat), 0)
                    fun = gf_fun(lemma, cat, discrim)
@@ -380,7 +405,9 @@ if MODE.startswith('gf') or MODE=='json':
                    # function-source map, save in source_of_MorphoDictAra.jsonl
                    elif MODE == 'gf-map':
-                        mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)}
+                        source = wikt_forms_from_obj(obj)
                        source['gf_labels'] = labels
                        mapitem = {'fun': fun, 'source': source}
                        print(json.dumps(mapitem, ensure_ascii=False))
                    seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
@@ -399,6 +426,7 @@ if MODE in ['gf-abs', 'gf-cnc']:
 #  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
 # coming from pgf tabularLinearize
 # compare the table for one function, returning a report as a dict
 def compare_tables(gf, wikt, fun, show_buckwalter=True):
    report = {}    
    for pair in gf.items():
@@ -412,7 +440,7 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
        wikt_form = None
        wikt_descr = None
-        for form, descr in wikt:
+        for form, descr in wikt.items():
            if all([tag in descr for tag in wikt_tags]):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
@@ -424,84 +452,90 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
            'wikt_descr': wikt_descr
            }
        if show_buckwalter:
-            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None,
+            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
-            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None,
+            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
        if wikt_form:
            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
    report['fun'] = fun
    report['labels'] = wikt['gf_labels']
    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
    return report
-def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
+# with a given grammar and function, prepare input for compare_tables
-    lang = gr.languages[CONCRETE_MODULE]
+# and produce a report, possibly summarizing it
-    funs = gr.functions
+def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
-    reports = []
+    if fun not in gr.functions:
-    for fun in funs:
+        print(fun, 'not found in grammar')
-        funn = "'" + fun + "'"
+        return
-        if funn not in funmap:
+    gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items()
-            print(funn, 'not found')
+              if p.startswith('s ')}  # require the s field, exclude s2
-            continue
+    report = compare_tables(gf, wikt, fun)
-        wikt = funmap[funn].items()
+    if verbose:
-        gf = lang.tabularLinearize(pgf.Expr(fun, []))
+        return report
-        report = compare_tables(gf, wikt, fun)
+    else:
-        reports.append(report)
+        if report['total_found'] == 0:
-    return reports
+            verdict = 'NOT_FOUND'
            flaws = False
        elif report['total_found'] == report['total_voc']:
            verdict = 'PERFECT'
            flaws = False
        elif report['total_found'] == report['total_unvoc']:
            verdict = 'PERFECT_UNVOC'
            flaws = True
        elif report['total_voc'] == 0:
            verdict = 'TOTALLY_WRONG'
            flaws = True
        else:
            verdict = 'PARTIAL'
            flaws = True
        summary = {
            'fun': report['fun'],
            'forms': report['total_found'],
            'voc': report['total_voc'],
            'unvoc': report['total_unvoc'],
            'verdict': verdict,
            'labels': report['labels']
            }
        if flaws:
            for f, v in report.items():
                if v.get('voc_match', 1) == 0:
                    summary['first_error'] = v
                    break
        return summary
-# in the summary report: print the first error if anything gets wrong
+def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False): 
-def first_error(report):
+    gr = pgf.readPGF(pgffile)
-    for f, v in report.items():
+    concrete = gr.languages[concretename]
        if 'voc_match' in v:
            if v['voc_match'] == 0:
                return f, v
    totals = {'A': {}, 'N': {}, 'V': {}}
-# having stored the Wiktionary object for each GF function
+    with open(mapfile) as file:
 # read it back from a file
 def read_function_source_map():
    with open(FUNCTION_SOURCE_MAP) as file:
        sourcemap = {}
        for line in file:
-            try:
+            obj = json.loads(line)
-                obj = json.loads(line)
+            fun = obj['fun'][1:-1]
-                sourcemap[obj['fun']] = obj['source']
+            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
-            except:
+
-                continue
+            cat = fun[-1]
-    return sourcemap
+            if 'verdict' in report:
                rep = report['verdict']
                totals[cat][rep] = totals[cat].get(rep, 0) + 1 
            if show:
                print(report)
    print(totals)
 if MODE.startswith('eval'):
-    gr = pgf.readPGF(PGF_FILE)
+    verbose = MODE=='eval-verbose'
-    print('using', PGF_FILE)
+    show = verbose or MODE=='eval-funs'
-    funmap = read_function_source_map()
+    eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show, verbose)
    print(len(funmap), 'functions')
    for report in eval_all(gr, funmap):    
        if MODE == 'eval-verbose':
            for line in report.items():
                print(line)
        if MODE == 'eval-tables':
            for gftags, value in report.items():
                if v := value['wikt_form']:
                    print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;')
        else:
            if report['total_found'] == 0:
                verdict = 'NOT_FOUND'
            elif report['total_found'] == report['total_voc']:
                verdict = 'PERFECT'
            elif report['total_found'] == report['total_unvoc']:
                verdict = 'PERFECT_UNVOC ' + str(first_error(report))
            elif report['total_voc'] == 0:
                verdict = 'TOTALLY_WRONG ' + str(first_error(report))
            else:
                verdict = 'PARTIAL ' + str(first_error(report))
            print(report['fun'], 'forms', report['total_found'],
                  'voc', report['total_voc'], 'unvoc', report['total_unvoc'],
                  verdict
                  )