From 73f0b8ef00d944580b793020ef9cc94a7064b622 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Fri, 15 Sep 2023 14:48:23 +0200
Subject: [PATCH] commented and refactored read_wiktionary.py

---
 src/arabic/wiktionary/read_wiktionary.py | 213 ++++++++++++++---------
 src/arabic/wiktionary/to_wordnet.py      |   6 +-
 2 files changed, 133 insertions(+), 86 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 6db526c3..6ee6e10e 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -31,7 +31,7 @@ create GF files:
 automatic evaluation:
 
   $ gf -make MorphoDictAra.gf
-  $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl
+  $ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
   $ python3 read_wiktionary.py eval
 
 TODO:
@@ -42,8 +42,6 @@ TODO:
 """
 
 
-
-
 MODE = ''
 
 if __name__ == '__main__':
@@ -53,8 +51,9 @@ if __name__ == '__main__':
     MODE = sys.argv[1]  # 
 
     
-# step 1: extract data from this file using the raw option
+# step 1: extract Arabic data from this file using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+EXTRACTED_LANGUAGE = 'Arabic'
 
 # the following file is generated.
 # in the sequel, use this file with gf-abs or gf-cnc option
@@ -62,24 +61,18 @@ FILTERED_WIKT = 'wikt_arabic.jsonl'
 
 # map each successfully extracted GF function to its source record in Wiktionary
 # created with option gf-map
-FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
+FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
 
+# created with $ gf -make MorphoDictAra.gf
 PGF_FILE = 'MorphoDictAraAbs.pgf'
+
+# module to linearize with
 CONCRETE_MODULE = 'MorphoDictAra'
 
-
-def read_function_source_map():
-    with open(FUNCTION_SOURCE_MAP) as file:
-        sourcemap = {}
-        for line in file:
-            try:
-                obj = json.loads(line)
-                sourcemap[obj['fun']] = obj['source']
-            except:
-                continue
-    return sourcemap
             
-
+# read a gzipped jsonl file (one object per line),
+# showing lines where one of a list of languages is present
+# This can be sampled to one of 100k lines by default, 1 for total recall.
 def get_gzip_json(file, sample=100000, langs=[]):
     with gzip.open(file) as decompressed:
         n = 0
@@ -91,10 +84,13 @@ def get_gzip_json(file, sample=100000, langs=[]):
                     print(line.decode("utf-8"))
 #        print(n)
 
-if MODE == 'raw':
-    get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
 
+# to perform the first step of data extraction, pipe this into a file:
 # python3 read_wiktionary.py raw >wikt_arabic.jsonl
+if MODE == 'raw':
+    get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE])  # sample=1: keep every matching line
+    raise SystemExit  # stop here; the site-provided exit() is not guaranteed to exist
+
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -177,19 +173,22 @@ def reorder_shadda(s):
     return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
 
 
-# quote forms but not parameters
+# quote word forms but not parameters
 def quote_if(s, cond=is_arabic, change=reorder_shadda):
     if cond(s):
         return '"' + change(s) + '"'
     else:
         return s
 
+
+# generate word_d_C functions starting with d=0, but show d only when >= 1
 def gf_fun(s, pos, disamb=0):
     discrim = '_' + str(disamb) if disamb else ''
     return ''.join(["'", s, discrim, "_", pos, "'"])
 
 
-rgl_features = {
+# mapping from GF to Wikt features
+arabic_rgl_features = {
     # V
     'VPerf': 'perfective',
     'Act': 'active',
@@ -224,62 +223,22 @@ rgl_features = {
     'AComp': 'comparative'
     }
 
-
-# obsolote:
-# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
-# coming from 'l -treebank -table'
-# now used:
-#  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
-# coming from tabularLinearize
-
-def compare_tables(gf, wikt, fun):
-    report = {}    
-    for pair in gf.items():
-        gf_form = pair[1]
-        gf_tags = tuple(word for word in
-                    pair[0].replace('(', ' ').replace(')', ' ').split()
-                      if word in rgl_features)
-        if not gf_tags:
-            continue
-        wikt_tags = {rgl_features[tag] for tag in gf_tags}
-        wikt_form = None
-        wikt_descr = None
-        for form, descr in wikt:
-            if all([tag in descr for tag in wikt_tags]):
-                wikt_form = reorder_shadda(form)
-                wikt_descr = descr
-                break
-        report[gf_tags] = {
-            'gf_form': gf_form,
-            'wikt_form': wikt_form,
-            'gf_form_rom': to_buckwalter(gf_form) if gf_form else None,
-            'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None,
-            'wikt_descr': wikt_descr
-            }
-        if wikt_form:
-            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
-            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
-    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
-    report['fun'] = fun
-    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
-    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
-    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
-    return report
-
-
-
-def wikt_forms_for_pos(obj):
+    
+# the inflection forms in a wiktionary entry
+def wikt_forms_from_obj(obj):
     return {
         form['form']:
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
                    is_arabic(form['form'])
-        }.items()
+        }
 
 
+# selection of forms for a given POS from Wikt: noun, adj, or verb
+# return a linearization function
 def forms_for_pos(obj):
-    forms = wikt_forms_for_pos(obj)
+    forms = wikt_forms_from_obj(obj).items()
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
@@ -345,46 +304,60 @@ def forms_for_pos(obj):
         if obj['root'] and obj['root'][0].strip():
             gf_entry['args']['root'] = obj['root']
         args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
-        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
+        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' 
 
     return gf_entry
+
     
 # "root": ["ش ر ح (š-r-ḥ)"]
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
     
+    
+# GF code generation
+
+# start with the header of the desired GF module
+
 if MODE == 'gf-abs':
     print('abstract MorphoDictAraAbs = Cat ** {')    
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-    
+# go through the Arabic Wiktionary entries
+# generate functions with unique names
+
 if MODE.startswith('gf') or MODE=='json':
   with open(FILTERED_WIKT) as file:
-    seen_gf_funs = {}
+    seen_gf_funs = {}  # to disambiguate names if needed
     number = 1
     for line in file:
         try:
             obj = json.loads(line)
         except:
             continue
-        number += 1
+        number += 1   # running count of parsed lines, shown in the gf-abs comment
+
+        # the root (three radicals) is found in this place if at all
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]
         obj['root'] = root
+
+        # only take entries that are marked as lemmas 
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {
                 'pos': obj['pos'],
                 'forms': forms_for_pos(obj),
+                'all_forms': wikt_forms_from_obj(obj),
                 'senses': [sense['glosses'] for sense in obj.get('senses', [])
                            if 'glosses' in sense]
                 }
-#            entry['n_forms'] = len(entry['forms'])
-#            print(entry['pos'], entry['n_forms'])
+
+            # if you only want to see the Wikt information used in GF generation
             if MODE == 'json':
                 print(json.dumps(entry, ensure_ascii=False))
-
+                
+            # if you want to proceed to GF generation
             if MODE.startswith('gf'):
 
                 lemma = entry['forms'].get('lemma', None)
@@ -393,23 +366,74 @@ if MODE.startswith('gf') or MODE=='json':
                     lin = entry['forms']['lin']
                     discrim = seen_gf_funs.get((lemma, cat), 0)
                     fun = gf_fun(lemma, cat, discrim)
-                        
+
+                    # abstract syntax, save in MorphoDictAraAbs.gf
                     if MODE == 'gf-abs':
                         print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
-                    if MODE == 'gf-cnc':
+                        
+                    # concrete syntax, save in MorphoDictAra.gf
+                    elif MODE == 'gf-cnc':
                         print('lin', fun, '=', lin, ';')
-                    if MODE == 'gf-map':
-                        mapitem = {'fun': fun, 'source': obj}
+                        
+                    # function-source map, save in source_of_MorphoDictAra.jsonl
+                    elif MODE == 'gf-map':
+                        mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)}
                         print(json.dumps(mapitem, ensure_ascii=False))
                             
-                    seen_gf_funs[(lemma, cat)] = discrim + 1
+                    seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
 
-                # to do: rename duplicate function names: of 13762 names, 12946 are unique
-
-if MODE.startswith('gf'):            
+# terminate the GF file with a closing brace
+if MODE in ['gf-abs', 'gf-cnc']:            
     print('}')
 
-    
+
+# evaluation:
+# linearize all words to tables
+# compare them to the forms found in Wiktionary
+# report on matches
+
+# format of GF table:
+#  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
+# coming from pgf tabularLinearize
+
+def compare_tables(gf, wikt, fun, show_buckwalter=True):
+    """Compare a GF linearization table with the Wiktionary forms of one function.
+
+    gf maps parameter descriptions such as 's (AComp Def Bare)' to word forms;
+    wikt is an iterable of (form, tags) pairs. Returns a dict with one entry per
+    Wikt-relevant GF form plus the keys 'fun', 'total_found', 'total_voc' and
+    'total_unvoc'.
+    """
+    report = {}
+    for gf_params, gf_form in gf.items():
+        gf_tags = tuple(word for word in
+                        gf_params.replace('(', ' ').replace(')', ' ').split()
+                        if word in arabic_rgl_features)
+        if not gf_tags:
+            continue  # if gf_tags match no Wikt tags, do not include this form
+        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
+        wikt_form = wikt_descr = None
+        for form, descr in wikt:  # first Wikt form carrying all required tags
+            if all(tag in descr for tag in wikt_tags):
+                wikt_form, wikt_descr = reorder_shadda(form), descr
+                break
+        entry = {'gf_params': gf_params, 'gf_form': gf_form,  # gf_params: full param description
+                 'wikt_form': wikt_form, 'wikt_descr': wikt_descr}
+        if show_buckwalter:  # no trailing commas here: they would store 1-tuples
+            entry['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
+            entry['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
+        if wikt_form:
+            entry['voc_match'] = int(normal(gf_form) == normal(wikt_form))
+            entry['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
+        report[gf_tags] = entry  # keyed by the flat tuple of Wikt-relevant tags
+    entries = tuple(report.values())  # immutable snapshot taken before the summary keys are added
+    report['fun'] = fun
+    report['total_found'] = sum(1 for v in entries if v['wikt_form'] is not None)
+    report['total_voc'] = sum(v.get('voc_match', 0) for v in entries)
+    report['total_unvoc'] = sum(v.get('unvoc_match', 0) for v in entries)
+    return report
+
+
 def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
     lang = gr.languages[CONCRETE_MODULE]
     funs = gr.functions
@@ -419,13 +443,14 @@ def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
         if funn not in funmap:
             print(funn, 'not found')
             continue
-        wikt = wikt_forms_for_pos(funmap[funn])
+        wikt = funmap[funn].items()
         gf = lang.tabularLinearize(pgf.Expr(fun, []))
         report = compare_tables(gf, wikt, fun)
         reports.append(report)
     return reports
 
 
+# in the summary report: print the first error if anything goes wrong
 def first_error(report):
     for f, v in report.items():
         if 'voc_match' in v:
@@ -433,6 +458,20 @@ def first_error(report):
                 return f, v
 
 
+# read back the map stored with option gf-map:
+# GF function name -> Wiktionary forms of its source entry (form -> tags)
+def read_function_source_map():
+    sourcemap = {}
+    with open(FUNCTION_SOURCE_MAP) as file:
+        for line in file:
+            try:
+                obj = json.loads(line)
+                sourcemap[obj['fun']] = obj['source']
+            except (ValueError, KeyError, TypeError):
+                continue  # skip lines that are not well-formed map records
+    return sourcemap
+
+            
 if MODE.startswith('eval'):
     gr = pgf.readPGF(PGF_FILE)
     print('using', PGF_FILE)
@@ -443,6 +482,10 @@ if MODE.startswith('eval'):
         if MODE == 'eval-verbose':
             for line in report.items():
                 print(line)
+        elif MODE == 'eval-tables':  # elif: keep the summary below out of eval-verbose
+            for value in report.values():  # summary values ('fun', totals) are not dicts
+                if isinstance(value, dict) and (v := value['wikt_form']):
+                    print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;')
         else:
             if report['total_found'] == 0:
                 verdict = 'NOT_FOUND'
diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
index 7496e769..144e4cc1 100644
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -4,7 +4,11 @@ import json
 # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
 # the following are assumed
 
+
+# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
 WN_TSV = 'arabic.tsv'
+
+# built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 
 def is_arabic(s):
@@ -36,7 +40,7 @@ with open(WN_TSV) as wnfile:
 ##    wnreader = csv.reader(wnfile, delimiter='\t')
     for row in wnfile:
 ##        word = row[-1].strip()   # does not show tha arabic, but the second-last word
-        word = get_arabic(row)
+        word = unvocalize(get_arabic(row))
         wnfun = row.split()[0]
         cat = [c for c in wnfun if c.isalpha()][-1]  # the last letter; the dict only contains N, A, V
         funs = funmap.get((word, cat), [])