evaluation of generated lexicon

2026-05-28 01:18:57 -06:00 · 2023-09-14 15:19:05 +02:00
parent d5e6e7e389
commit 3e9be76e52
1 changed files with 110 additions and 25 deletions
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -2,10 +2,47 @@ import gzip
 import json
 import sys
 import unicodedata
 import pgf
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
-# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. 
+# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
 """
 This file converts Wiktionary data to GF morphological dictionary files.
 It works for Arabic, but some of the functionality could be adapted to other languages.
 The steps to take are the following:
 fetch data:
  raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
 filter Arabic entries:
  $ python3 read_wiktionary.py raw >wikt_arabic.jsonl
 create GF files:
  $ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
  $ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
 automatic evaluation:
  $ gf -make MorphoDictAra.gf
  $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl
  $ python3 read_wiktionary.py eval
 TODO:
 - better generation of GF
 - better paradigms to use Wiktionary data
 - refactor the code so that it can be used for other languages
 """
 MODE = ''
@@ -27,13 +64,20 @@ FILTERED_WIKT = 'wikt_arabic.jsonl'
 # created with option gf-map
 FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
 PGF_FILE = 'MorphoDictAraAbs.pgf'
 CONCRETE_MODULE = 'MorphoDictAra'
 def read_function_source_map():
    with open(FUNCTION_SOURCE_MAP) as file:
        sourcemap = {}
        for line in file:
-            obj = json.loads(line)
+            try:
-            sourcemap[obj['fun']] = obj['source']
+                obj = json.loads(line)
                sourcemap[obj['fun']] = obj['source']
            except:
                continue
    return sourcemap
 def get_gzip_json(file, sample=100000, langs=[]):
@@ -134,9 +178,9 @@ def reorder_shadda(s):
 # quote forms but not parameters
-def quote_if(s, cond=is_arabic):
+def quote_if(s, cond=is_arabic, change=reorder_shadda):
    if cond(s):
-        return '"' + s + '"'
+        return '"' + change(s) + '"'
    else:
        return s
@@ -181,14 +225,19 @@ rgl_features = {
    }
 # obsolete:
 # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
 # coming from 'l -treebank -table'
-def compare_tables(gf, wikt):
+# now used:
 #  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
 # coming from tabularLinearize
 def compare_tables(gf, wikt, fun):
    report = {}    
-    for line in gf:
+    for pair in gf.items():
-        gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616])
+        gf_form = pair[1]
        gf_tags = tuple(word for word in
-                    line.replace('(', ' ').replace(')', ' ').split()
+                    pair[0].replace('(', ' ').replace(')', ' ').split()
                      if word in rgl_features)
        if not gf_tags:
            continue
@@ -211,7 +260,7 @@ def compare_tables(gf, wikt):
            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
-    report['fun'] = gf[0].split()[-1]
+    report['fun'] = fun
    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
@@ -293,7 +342,7 @@ def forms_for_pos(obj):
    if 'lemma' in gf_entry and gf_entry['lemma']:
        gf_entry['lemma'] = gf_entry['lemma'][0]
-        if obj['root']:
+        if obj['root'] and obj['root'][0].strip():
            gf_entry['args']['root'] = obj['root']
        args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
@@ -309,7 +358,8 @@ if MODE == 'gf-abs':
 if MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
-if MODE not in ['raw', 'eval']:
+    
 if MODE.startswith('gf') or MODE=='json':
  with open(FILTERED_WIKT) as file:
    seen_gf_funs = {}
    number = 1
@@ -360,17 +410,52 @@ if MODE.startswith('gf'):
    print('}')
-if MODE.startswith('eval'):
+def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
-    with open('pot.gftbl') as file:
+    lang = gr.languages[CONCRETE_MODULE]
-        gf = [line.strip() for line in file]
+    funs = gr.functions
-    with open('pot.json') as file:
+    reports = []
-        wikt = wikt_forms_for_pos(json.loads(file.read()))
+    for fun in funs:
-    report = compare_tables(gf, wikt)
+        funn = "'" + fun + "'"
-    
+        if funn not in funmap:
-    if MODE == 'eval-verbose':
+            print(funn, 'not found')
-        for line in report.items():
+            continue
-            print(line)
+        wikt = wikt_forms_for_pos(funmap[funn])
-    else:
+        gf = lang.tabularLinearize(pgf.Expr(fun, []))
-        print(report['fun'], 'forms', report['total_found'],
+        report = compare_tables(gf, wikt, fun)
-              'voc', report['total_voc'], 'unvoc', report['total_unvoc'])
+        reports.append(report)
    return reports
 def first_error(report):
     """Return the first report entry whose vocalized form did not match.

     Iterates over ``report`` in insertion order and returns the first
     ``(key, value)`` pair whose value is a dict with ``'voc_match' == 0``.
     Returns ``None`` when no such entry exists.

     The report dict produced by ``compare_tables`` also stores summary
     entries in the same mapping ('fun' as a string, 'total_found',
     'total_voc' and 'total_unvoc' as ints).  Those are skipped explicitly:
     a membership test against an int would raise ``TypeError``.
     """
     for f, v in report.items():
         # Only per-form entries are dicts; summary values are str/int.
         if isinstance(v, dict) and v.get('voc_match') == 0:
             return f, v
     return None
 if MODE.startswith('eval'):
     # Evaluation driver: load the compiled grammar and the function->source
     # map, then print one line per function comparing GF's forms with the
     # Wiktionary forms (or the full report in 'eval-verbose' mode).
     gr = pgf.readPGF(PGF_FILE)
     print('using', PGF_FILE)
     funmap = read_function_source_map()
     print(len(funmap), 'functions')
     verbose = MODE == 'eval-verbose'
     for report in eval_all(gr, funmap):
         if verbose:
             # Dump every entry of the report, one (key, value) pair per line.
             for line in report.items():
                 print(line)
             continue
         found = report['total_found']
         voc = report['total_voc']
         unvoc = report['total_unvoc']
         if found == 0:
             verdict = 'NOT_FOUND'
         elif found == voc:
             verdict = 'PERFECT'
         else:
             # Every remaining verdict is followed by the first mismatching form.
             if found == unvoc:
                 label = 'PERFECT_UNVOC'
             elif voc == 0:
                 label = 'TOTALLY_WRONG'
             else:
                 label = 'PARTIAL'
             verdict = label + ' ' + str(first_error(report))
         print(report['fun'], 'forms', found,
               'voc', voc, 'unvoc', unvoc,
               verdict
               )