From 8e029bd8dd24f8bd76c46cc2f811041be9682ab5 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Wed, 13 Sep 2023 17:24:21 +0200
Subject: [PATCH] Arabic Wiktionary: started comparing evaluation

---
 src/arabic/wiktionary/read_wiktionary.py | 85 ++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 5 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index ea8d805fd..574233dda 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -108,14 +108,76 @@ def gf_fun(s, pos, disamb=0):
     return ''.join(["'", s, discrim, "_", pos, "'"])
 
 
-def forms_for_pos(obj):
-    forms = {
+rgl_features = {  # GF feature names (keys) -> Wiktionary tags (values)
+    # V: verb features
+    'VPerf': 'perfective',
+    'Act': 'active',
+    'Pas': 'passive',
+    'Per3': 'third-person',
+    'Per2': 'second-person',
+    'Masc': 'masculine',
+    'Fem': 'feminine',
+    'Sg': 'singular',
+    'Pl': 'plural',
+    'Dl': 'dual',
+    'VImpf': 'imperfective',
+    'Ind': 'indicative',
+    'Cnj': 'subjunctive',
+    'Jus': 'jussive',
+    'VImp': 'imperative',
+    # N: also Sg, Pl, Dl
+    'Def': 'definite',
+    'Indef': 'indefinite',
+    'Nom': 'nominative',
+    'Acc': 'accusative',
+    'Gen': 'genitive',
+#    'Bare':
+#    'Dat':
+    'Const': 'construct',
+#    'Poss':
+    # A: also the N features
+    'APosit': 'positive',
+    'AComp': 'comparative'
+    }
+
+
+def compare_tables(gf, wikt):
+    """Compare GF table lines against Wiktionary (form, tags) pairs.
+
+    A GF line looks like: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
+    Returns a dict from GF tag tuples to the two forms and their match flags.
+    """
+    report = {}
+    for line in gf:
+        gf_form = line.rsplit(':', 1)[-1].strip()  # the form follows the last colon
+        words = line.replace('(', ' ').replace(')', ' ').split()
+        gf_tags = tuple(word for word in words if word in rgl_features)
+        if not gf_tags:
+            continue  # no known feature: all() below would accept any form
+        wikt_tags = {rgl_features[tag] for tag in gf_tags}
+        # first Wiktionary form whose description carries every required tag
+        wikt_form = next((form for form, descr in wikt
+                          if all(tag in descr for tag in wikt_tags)), None)
+        report[gf_tags] = {'gf_form': gf_form, 'wikt_form': wikt_form}
+        if wikt_form:
+            report[gf_tags]['voc_match'] = int(gf_form == wikt_form)
+            report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form))
+    return report
+
+
+# (form, tags) pairs of an entry, minus romanizations and forms is_arabic rejects.
+def wikt_forms_for_pos(obj):
+    return {
         form['form']:
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
                    is_arabic(form['form'])
         }.items()
+
+
+def forms_for_pos(obj):
+    forms = wikt_forms_for_pos(obj)
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
@@ -136,7 +198,8 @@ def forms_for_pos(obj):
     elif obj['pos'] == 'verb':
         lemma = [form for form, descr in forms
                       if all([w in descr for
-                              w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
+                              w in ["active", "indicative", "masculine", "past",
+                                        "perfective", "singular", "third-person"]])][:1]
         gf_entry = {
           'cat': 'V',
           'lemma': lemma,
@@ -193,14 +256,16 @@ if MODE == 'gf-abs':
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-if MODE != 'raw':
+if MODE not in ['raw', 'eval']:
   with open(FILTERED_WIKT) as file:
     seen_gf_funs = {}
+    number = 1
     for line in file:
         try:
             obj = json.loads(line)
         except:
             continue
+        number += 1
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]
@@ -227,7 +292,7 @@ if MODE != 'raw':
                     fun = gf_fun(lemma, cat, discrim)
                         
                     if MODE == 'gf-abs':
-                        print('fun', fun, ':', cat, ';', '--', entry['senses'])
+                        print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                     if MODE == 'gf-cnc':
                         print('lin', fun, '=', lin, ';')
                             
@@ -237,3 +302,13 @@ if MODE != 'raw':
 
 if MODE.startswith('gf'):            
     print('}')
+
+
+# Evaluation: print how the GF table in pot.gftbl compares to the Wiktionary entry in pot.json.
+if MODE == 'eval':
+    with open('pot.gftbl', encoding='utf-8') as file:
+        gf = [line.strip() for line in file]
+    with open('pot.json', encoding='utf-8') as file:
+        wikt = wikt_forms_for_pos(json.load(file))
+    for entry in compare_tables(gf, wikt).items():
+        print(entry)