From 6312624a5fa84d0076e46a44befbfccadb0538f0 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 12 Sep 2023 12:08:31 +0200
Subject: [PATCH 01/19] preparing to read Arabic morpholex from Wiktionary

---
 src/arabic/wiktionary/read_wiktionary.py | 97 ++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 src/arabic/wiktionary/read_wiktionary.py

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
new file mode 100644
index 000000000..48a2fca38
--- /dev/null
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -0,0 +1,97 @@
+import gzip
+import json
+
+WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+
+
+def get_gzip_json(file, sample=100000, langs=[]):
+    with gzip.open(file) as decompressed:
+        n = 0
+        for line in decompressed:
+            n += 1
+            if n % sample == 0:
+                obj = json.loads(line)
+                if obj.get('lang', None) in langs:
+                    print(line.decode("utf-8"))
+        print(n)
+
+
+# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
+# python3 read_wiktionary.py >wikt_arabic.jsonl
+# 621-671
+
+# https://en.wikipedia.org/wiki/Buckwalter_transliteration
+buckwalter_dict = {
+  0x621: "'",  # ء
+  0x622: '|',  # آ
+  0x623: '>',  # أ
+  0x624: '&',  # ؤ
+  0x625: '<',  # إ
+  0x626: '}',  # ئ
+  0x627: 'A',  # ا
+  0x628: 'b',  # ب
+  0x629: 'p',  # ة
+  0x62a: 't',  # ت
+  0x62b: 'v',  # ث
+  0x62c: 'j',  # ج
+  0x62d: 'H',  # ح
+  0x62e: 'x',  # خ
+  0x62f: 'd',  # د
+  0x630: '*',  # ذ
+  0x631: 'r',  # ر
+  0x632: 'z',  # ز
+  0x633: 's',  # س
+  0x634: '$',  # ش
+  0x635: 'S',  # ص
+  0x636: 'D',  # ض
+  0x637: 'T',  # ط
+  0x638: 'Z',  # ظ
+  0x639: 'E',  # ع
+  0x63a: 'g',  # غ
+  0x641: 'f',  # ف
+  0x642: 'q',  # ق
+  0x643: 'k',  # ك
+  0x644: 'l',  # ل
+  0x645: 'm',  # م
+  0x646: 'n',  # ن
+  0x647: 'h',  # ه
+  0x648: 'w',  # و
+  0x649: 'Y',  # ى
+  0x64a: 'y',  # ي
+  0x64b: 'F',  # ً
+  0x64c: 'N',  # ٌ
+  0x64d: 'K',  # ٍ
+  0x64e: 'a',  # َ
+  0x64f: 'u',  # ُ
+  0x650: 'i',  # ِ
+  0x651: '~',  # ّ
+  0x652: 'o',  # ْ
+  0x670: '`',  # '
+  0x671: '{'   # ٱ
+  }
+
+def to_buckwalter(s):
+    return ''.join(list(map(lambda c: buckwalter_dict.get(ord(c), '?'), s)))
+
+
+def is_arabic(s):
+    return s and any(1574 <= ord(c) <= 1616 for c in s)
+
+"""
+with open('wikt_arabic.jsonl') as file:
+    for line in file:
+        obj = json.loads(line)
+        if 'Arabic lemmas' in obj.get('categories', []):
+            entry = {
+                'pos': obj['pos'],
+                'forms': {form['form']: form.get('tags', []) for
+                          form in obj.get('forms', []) if
+                          'romanization' not in form.get('tags', []) and
+                          is_arabic(form['form'])
+                          },
+                'senses': obj.get('senses', [])
+                }
+            entry['n_forms'] = len(entry['forms'])
+            print(entry['pos'], entry['n_forms'])
+#            print(json.dumps(entry, ensure_ascii=False))
+"""

From ae1c7f0061ddec572b09acf0fe71b705d39fccb7 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 12 Sep 2023 16:35:21 +0200
Subject: [PATCH 02/19] extracting Arabic from Wiktionary, next step GF
 generation

---
 src/arabic/wiktionary/read_wiktionary.py | 106 ++++++++++++++++++++---
 1 file changed, 94 insertions(+), 12 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 48a2fca38..2520cf5fd 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -2,6 +2,7 @@ import gzip
 import json
 
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+FILTERED_WIKT = 'wikt_arabic.jsonl'
 
 
 def get_gzip_json(file, sample=100000, langs=[]):
@@ -71,27 +72,108 @@ buckwalter_dict = {
   }
 
 def to_buckwalter(s):
-    return ''.join(list(map(lambda c: buckwalter_dict.get(ord(c), '?'), s)))
+    return ''.join([buckwalter_dict.get(ord(c), '?') for c in s])
 
 
+def unvocalize(s):
+    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
+
 def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
-"""
-with open('wikt_arabic.jsonl') as file:
+
+def gf_fun(s, pos):
+    return ''.join(["'", s, "_", pos, "'"])
+
+
+def forms_for_pos(obj):
+    forms = {
+        form['form']:
+          form.get('tags', []) for
+            form in obj.get('forms', []) if
+               'romanization' not in form.get('tags', []) and
+                   is_arabic(form['form'])
+        }.items()
+    if obj['pos'] == 'noun':
+        lemma = [form[:-1] for form, descr in forms
+                         if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
+        return {
+            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, 
+            'singular': lemma,  
+            'plural': [form[:-1] for form, descr in forms
+                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
+            'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories']
+                            else ('Masc' if  'Arabic masculine nouns' in obj['categories']
+                                else None)
+            } 
+    elif obj['pos'] == 'verb':
+        lemma = [form for form, descr in forms
+                      if all([w in descr for
+                              w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
+        return {
+          'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, 
+          'perfect': lemma, 
+          'imperfect': [form for form, descr in forms
+                      if all([w in descr for
+                              w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
+          'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','']
+                            if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
+                           key=len)
+          }
+    elif obj['pos'] == 'adj':
+        lemma = [form for form, descr in forms
+                         if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
+        return {
+            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, 
+            'masc_singular': lemma,   
+            'masc_plural': [form for form, descr in forms
+                         if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
+            'fem_singular': [form for form, descr in forms
+                         if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],  
+            'fem_plural': [form for form, descr in forms
+                         if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
+            } 
+
+    else:
+        return {f: d for f, d in forms}
+
+
+# "root": ["ش ر ح (š-r-ḥ)"]
+def find_root(s):
+    return ''.join([c for c in s if is_arabic(c)])
+    
+
+
+with open(FILTERED_WIKT) as file:
     for line in file:
         obj = json.loads(line)
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {
                 'pos': obj['pos'],
-                'forms': {form['form']: form.get('tags', []) for
-                          form in obj.get('forms', []) if
-                          'romanization' not in form.get('tags', []) and
-                          is_arabic(form['form'])
-                          },
-                'senses': obj.get('senses', [])
+                'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1],
+                'forms': forms_for_pos(obj),
+                'senses': [sense['glosses'] for sense in obj.get('senses', [])
+                           if 'glosses' in sense]
                 }
-            entry['n_forms'] = len(entry['forms'])
-            print(entry['pos'], entry['n_forms'])
-#            print(json.dumps(entry, ensure_ascii=False))
+#            entry['n_forms'] = len(entry['forms'])
+#            print(entry['pos'], entry['n_forms'])
+            print(json.dumps(entry, ensure_ascii=False))
+
+            
+"""
+"senses": [
+    {"examples": [
+        {"text": "10th century, Al-Mutanabbi\nذُو الْعَقْلِ يَشْقَى فِي النَّعِيمِ بِعَقْلِهِ / وَأَخُو الْجَهَالَةِ فِي الشَّقَاوَةِ يَنْعَمُ\nḏū l-ʕaqli yašqā fī an-naʕīmi biʕaqlihi / waʔaḵū l-jahālati fī š-šaqāwati yanʕamu", "english": "(please add an English translation of this quotation)", "type": "quotation"}],
+     "links": [
+         ["bliss", "bliss#English"], ["delight", "delight#English"]],
+     "categories": ["Arabic terms with quotations", "Requests for translations of Arabic quotations"],
+     "glosses": ["bliss, delight"]
+     },
+    {"links": [
+        ["heaven", "heaven"], ["Heaven", "Heaven"], ["paradise", "paradise"], ["Paradise", "Paradise"]],
+     "synonyms": [{"word": "فِرْدَوس"}, {"word": "جَنَّة"}],
+     "antonyms": [{"word": "سَعِير"}, {"word": "لَظَىٰ"}, {"word": "النَّار"}, {"word": "جَهَنَّم"}, {"word": "جَحِيم"}, {"word": "حُطَمَة"}, {"word": "سَقَر"}, {"word": "هَاوِيَة"}],
+     "raw_glosses": ["(figurative) heaven, the Heaven, paradise, the Paradise"],
+     "glosses": ["heaven, the Heaven, paradise, the Paradise"],
+     "tags": ["figuratively"]}]
 """

From 714d8abac026fd2ec61fa431a61108358a3ef68c Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 12 Sep 2023 17:04:50 +0200
Subject: [PATCH 03/19] GF abstract dict generation

---
 src/arabic/wiktionary/read_wiktionary.py | 46 ++++++++++++------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 2520cf5fd..ac5ee59dd 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -98,7 +98,8 @@ def forms_for_pos(obj):
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
         return {
-            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, 
+            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None,
+            'gf_cat': 'N',
             'singular': lemma,  
             'plural': [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
@@ -111,7 +112,8 @@ def forms_for_pos(obj):
                       if all([w in descr for
                               w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
         return {
-          'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, 
+          'gf_fun': gf_fun(lemma[0], 'V') if lemma else None,
+          'gf_cat': 'V',
           'perfect': lemma, 
           'imperfect': [form for form, descr in forms
                       if all([w in descr for
@@ -124,7 +126,8 @@ def forms_for_pos(obj):
         lemma = [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
         return {
-            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, 
+            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None,
+            'gf_cat': 'A',
             'masc_singular': lemma,   
             'masc_plural': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
@@ -142,9 +145,14 @@ def forms_for_pos(obj):
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
     
+import sys
+MODE = sys.argv[1]
 
+if MODE == 'gf':
+    print('abstract MorphoDictAraAbs = Cat ** {') 
 
 with open(FILTERED_WIKT) as file:
+    seen_gf_funs = set()
     for line in file:
         obj = json.loads(line)
         if 'Arabic lemmas' in obj.get('categories', []):
@@ -157,23 +165,17 @@ with open(FILTERED_WIKT) as file:
                 }
 #            entry['n_forms'] = len(entry['forms'])
 #            print(entry['pos'], entry['n_forms'])
-            print(json.dumps(entry, ensure_ascii=False))
+            if MODE == 'json':
+                print(json.dumps(entry, ensure_ascii=False))
 
-            
-"""
-"senses": [
-    {"examples": [
-        {"text": "10th century, Al-Mutanabbi\nذُو الْعَقْلِ يَشْقَى فِي النَّعِيمِ بِعَقْلِهِ / وَأَخُو الْجَهَالَةِ فِي الشَّقَاوَةِ يَنْعَمُ\nḏū l-ʕaqli yašqā fī an-naʕīmi biʕaqlihi / waʔaḵū l-jahālati fī š-šaqāwati yanʕamu", "english": "(please add an English translation of this quotation)", "type": "quotation"}],
-     "links": [
-         ["bliss", "bliss#English"], ["delight", "delight#English"]],
-     "categories": ["Arabic terms with quotations", "Requests for translations of Arabic quotations"],
-     "glosses": ["bliss, delight"]
-     },
-    {"links": [
-        ["heaven", "heaven"], ["Heaven", "Heaven"], ["paradise", "paradise"], ["Paradise", "Paradise"]],
-     "synonyms": [{"word": "فِرْدَوس"}, {"word": "جَنَّة"}],
-     "antonyms": [{"word": "سَعِير"}, {"word": "لَظَىٰ"}, {"word": "النَّار"}, {"word": "جَهَنَّم"}, {"word": "جَحِيم"}, {"word": "حُطَمَة"}, {"word": "سَقَر"}, {"word": "هَاوِيَة"}],
-     "raw_glosses": ["(figurative) heaven, the Heaven, paradise, the Paradise"],
-     "glosses": ["heaven, the Heaven, paradise, the Paradise"],
-     "tags": ["figuratively"]}]
-"""
+            if MODE == 'gf':
+
+                if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']:
+                    if entry['forms']['gf_fun'] not in seen_gf_funs:
+                        print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses'])
+                        seen_gf_funs.add(entry['forms']['gf_fun'])
+
+                # to do: rename duplicate function names: of 13762 names, 12946 are unique
+
+if MODE == 'gf':            
+    print('}')

From 8eceb53643a5a41d53ae61ce4ebb64f70b27010e Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 12 Sep 2023 19:38:14 +0200
Subject: [PATCH 04/19] compilable MorphoDictAra generation except for V, not
 yet using all forms

---
 src/arabic/wiktionary/read_wiktionary.py | 79 ++++++++++++++++--------
 1 file changed, 52 insertions(+), 27 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index ac5ee59dd..40d14b9fa 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -82,8 +82,9 @@ def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
 
-def gf_fun(s, pos):
-    return ''.join(["'", s, "_", pos, "'"])
+def gf_fun(s, pos, disamb=0):
+    discrim = '_' + str(disamb) if disamb else ''
+    return ''.join(["'", s, discrim, "_", pos, "'"])
 
 
 def forms_for_pos(obj):
@@ -97,23 +98,25 @@ def forms_for_pos(obj):
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
-        return {
-            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None,
-            'gf_cat': 'N',
-            'singular': lemma,  
-            'plural': [form[:-1] for form, descr in forms
-                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
-            'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories']
+        plural = [form[:-1] for form, descr in forms
+                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
+        gender = ('Fem' if 'Arabic feminine nouns' in obj['categories']
                             else ('Masc' if  'Arabic masculine nouns' in obj['categories']
-                                else None)
+                                else None))
+        gf_entry = {
+            'cat': 'N',
+            'lemma': lemma,
+            'singular': lemma,  
+            'plural': plural,
+            'gender': gender
             } 
     elif obj['pos'] == 'verb':
         lemma = [form for form, descr in forms
                       if all([w in descr for
                               w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
-        return {
-          'gf_fun': gf_fun(lemma[0], 'V') if lemma else None,
-          'gf_cat': 'V',
+        gf_entry = {
+          'cat': 'V',
+          'lemma': lemma,
           'perfect': lemma, 
           'imperfect': [form for form, descr in forms
                       if all([w in descr for
@@ -125,9 +128,9 @@ def forms_for_pos(obj):
     elif obj['pos'] == 'adj':
         lemma = [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
-        return {
-            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None,
-            'gf_cat': 'A',
+        gf_entry = {
+            'cat': 'A',
+            'lemma': lemma,
             'masc_singular': lemma,   
             'masc_plural': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
@@ -138,9 +141,16 @@ def forms_for_pos(obj):
             } 
 
     else:
-        return {f: d for f, d in forms}
+        gf_entry = {f: d for f, d in forms}
+        
+    if 'lemma' in gf_entry and gf_entry['lemma']:
+        gf_entry['lemma'] = gf_entry['lemma'][0]
+        form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma']
+        gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"']) 
 
+    return gf_entry
 
+    
 # "root": ["ش ر ح (š-r-ḥ)"]
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
@@ -148,17 +158,23 @@ def find_root(s):
 import sys
 MODE = sys.argv[1]
 
-if MODE == 'gf':
-    print('abstract MorphoDictAraAbs = Cat ** {') 
+if MODE == 'gf-abs':
+    print('abstract MorphoDictAraAbs = Cat ** {')    
+if MODE == 'gf-cnc':
+    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
+
 
 with open(FILTERED_WIKT) as file:
-    seen_gf_funs = set()
+    seen_gf_funs = {}
     for line in file:
         obj = json.loads(line)
+        root = [find_root(t['expansion']) for
+                t in obj.get('etymology_templates', []) if
+                t.get('name', None) =='ar-root'][:1]
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {
                 'pos': obj['pos'],
-                'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1],
+                'root': root, 
                 'forms': forms_for_pos(obj),
                 'senses': [sense['glosses'] for sense in obj.get('senses', [])
                            if 'glosses' in sense]
@@ -168,14 +184,23 @@ with open(FILTERED_WIKT) as file:
             if MODE == 'json':
                 print(json.dumps(entry, ensure_ascii=False))
 
-            if MODE == 'gf':
+            if MODE.startswith('gf'):
 
-                if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']:
-                    if entry['forms']['gf_fun'] not in seen_gf_funs:
-                        print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses'])
-                        seen_gf_funs.add(entry['forms']['gf_fun'])
+                lemma = entry['forms'].get('lemma', None)
+                if lemma:
+                    cat = entry['forms']['cat']
+                    lin = entry['forms']['lin']
+                    discrim = seen_gf_funs.get((lemma, cat), 0)
+                    fun = gf_fun(lemma, cat, discrim)
+                        
+                    if MODE == 'gf-abs':
+                        print('fun', fun, ':', cat, ';', '--', entry['senses'])
+                    if MODE == 'gf-cnc':
+                        print('lin', fun, '=', lin, ';')
+                            
+                    seen_gf_funs[(lemma, cat)] = discrim + 1
 
                 # to do: rename duplicate function names: of 13762 names, 12946 are unique
 
-if MODE == 'gf':            
+if MODE.startswith('gf'):            
     print('}')

From afc84a61cbf2ac76e3ac3bf0f8d3654cb40c5c44 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Wed, 13 Sep 2023 09:06:02 +0200
Subject: [PATCH 05/19] arabic/wiktionary using paradigms with records as
 arguments to cope with heterogeneous information

---
 src/arabic/wiktionary/read_wiktionary.py | 43 +++++++++++++-----------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 40d14b9fa..49d3a3c11 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -19,7 +19,6 @@ def get_gzip_json(file, sample=100000, langs=[]):
 
 # get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
 # python3 read_wiktionary.py >wikt_arabic.jsonl
-# 621-671
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -100,15 +99,17 @@ def forms_for_pos(obj):
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
         plural = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
-        gender = ('Fem' if 'Arabic feminine nouns' in obj['categories']
-                            else ('Masc' if  'Arabic masculine nouns' in obj['categories']
-                                else None))
+        gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories']
+                            else (['Masc'] if  'Arabic masculine nouns' in obj['categories']
+                                  else []))
         gf_entry = {
             'cat': 'N',
             'lemma': lemma,
-            'singular': lemma,  
-            'plural': plural,
-            'gender': gender
+            'args': {
+                'sg': lemma,  
+                'pl': plural,
+                'g': gender
+                }
             } 
     elif obj['pos'] == 'verb':
         lemma = [form for form, descr in forms
@@ -117,13 +118,15 @@ def forms_for_pos(obj):
         gf_entry = {
           'cat': 'V',
           'lemma': lemma,
-          'perfect': lemma, 
-          'imperfect': [form for form, descr in forms
+          'args': {
+              'perfect': lemma, 
+              'imperfect': [form for form, descr in forms
                       if all([w in descr for
                               w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
-          'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','']
+              'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', '']
                             if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
-                           key=len)
+                           key=len)]
+              }
           }
     elif obj['pos'] == 'adj':
         lemma = [form for form, descr in forms
@@ -131,13 +134,15 @@ def forms_for_pos(obj):
         gf_entry = {
             'cat': 'A',
             'lemma': lemma,
-            'masc_singular': lemma,   
-            'masc_plural': [form for form, descr in forms
+            'args': {
+                'masc_sg': lemma,   
+                'masc_pl': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
-            'fem_singular': [form for form, descr in forms
+                'fem_sg': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],  
-            'fem_plural': [form for form, descr in forms
+                'fem_pl': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
+                }
             } 
 
     else:
@@ -145,11 +150,11 @@ def forms_for_pos(obj):
         
     if 'lemma' in gf_entry and gf_entry['lemma']:
         gf_entry['lemma'] = gf_entry['lemma'][0]
-        form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma']
-        gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"']) 
+        gf_entry['args']['root'] = obj['root']
+        args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x]
+        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
 
     return gf_entry
-
     
 # "root": ["ش ر ح (š-r-ḥ)"]
 def find_root(s):
@@ -171,10 +176,10 @@ with open(FILTERED_WIKT) as file:
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]
+        obj['root'] = root
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {
                 'pos': obj['pos'],
-                'root': root, 
                 'forms': forms_for_pos(obj),
                 'senses': [sense['glosses'] for sense in obj.get('senses', [])
                            if 'glosses' in sense]

From 3c0adada11f9c3055dedb5a26ef4d483585f8a15 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Wed, 13 Sep 2023 15:29:28 +0200
Subject: [PATCH 06/19] new function in ParadigmsAra to deal with Wiktionary
 data; lots of untested guesses

---
 src/arabic/ParadigmsAra.gf               | 67 ++++++++++++++++++++++++
 src/arabic/wiktionary/read_wiktionary.py | 64 +++++++++++++++-------
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf
index ce479f944..20892fed8 100644
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -868,4 +868,71 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
 param VerbForm =
   FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
 
+-- paradigms for Wiktionary extraction
+---- TODO: better usage of information in Wiktionary
+
+oper
+  wmkN = overload {
+    wmkN : {sg, pl : Str ; g : Gender} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;  --- hum/nohum not in Wikt
+    wmkN : {sg : Str} -> N
+      = \r -> smartN r.sg ; 
+    wmkN : {sg : Str ; g : Gender ; root : Str} -> N
+      = \r -> smartN r.sg ** {g = r.g} ; ----
+    wmkN : {sg : Str; g : Gender} -> N
+      = \r -> smartN r.sg ** {g = r.g} ;
+    wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
+    wmkN : {sg : Str; pl : Str} -> N
+      = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
+    wmkN : {sg : Str; root : Str} -> N 
+      = \r -> smartN r.sg ; 
+    } ;
+
+  wmkA = overload {
+    wmkA : {root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    } ;
+
+  wmkV = overload {
+    wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; cls : VerbForm} -> V
+      = \r -> mkV r.perfect r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
+      = \r -> variants {} ; ---- mkV r.imperfect ; ----
+    wmkV : {root : Str ; cls : VerbForm} -> V
+      = \r -> mkV r.root r.cls ;
+    wmkV : {imperfect : Str} -> V
+      = \r -> variants {} ; ---- mkV r.imperfect ;
+    } ;
+
 } ;
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 49d3a3c11..ea8d805fd 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -1,7 +1,22 @@
 import gzip
 import json
+import sys
 
+# data from https://kaikki.org/dictionary/rawdata.html
+# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
+# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. 
+
+if not sys.argv[1:]:
+    print('usage: read_wiktionary (raw | gf-cnc | gf-abs)')
+    exit()
+
+MODE = sys.argv[1]  # 
+
+# step 1: extract data from this file using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+
+# the following file is generated.
+# in the sequel, use this file with gf-abs or gf-cnc option
 FILTERED_WIKT = 'wikt_arabic.jsonl'
 
 
@@ -14,11 +29,12 @@ def get_gzip_json(file, sample=100000, langs=[]):
                 obj = json.loads(line)
                 if obj.get('lang', None) in langs:
                     print(line.decode("utf-8"))
-        print(n)
+#        print(n)
 
+if MODE == 'raw':
+    get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
 
-# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
-# python3 read_wiktionary.py >wikt_arabic.jsonl
+# python3 read_wiktionary.py raw >wikt_arabic.jsonl
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -80,6 +96,12 @@ def unvocalize(s):
 def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
+# quote forms but not parameters
+def quote_if(s, cond=is_arabic):
+    if cond(s):
+        return '"' + s + '"'
+    else:
+        return s
 
 def gf_fun(s, pos, disamb=0):
     discrim = '_' + str(disamb) if disamb else ''
@@ -99,8 +121,8 @@ def forms_for_pos(obj):
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
         plural = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
-        gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories']
-                            else (['Masc'] if  'Arabic masculine nouns' in obj['categories']
+        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
+                            else (['masc'] if  'Arabic masculine nouns' in obj['categories']
                                   else []))
         gf_entry = {
             'cat': 'N',
@@ -122,15 +144,20 @@ def forms_for_pos(obj):
               'perfect': lemma, 
               'imperfect': [form for form, descr in forms
                       if all([w in descr for
-                              w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
-              'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', '']
-                            if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
-                           key=len)]
+                              w in [
+                                  "active", "indicative", "masculine", "non-past",
+                                  "imperfective", "singular", "third-person"]])][:1],
+              'cls': ['Form' + max([n for n in [
+                  'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
+                            if n in ' '.join([c for c in obj['categories']
+                                if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
+                           key=len)]  # max in RGL is XI, in Wikt XIII
               }
           }
     elif obj['pos'] == 'adj':
         lemma = [form for form, descr in forms
-                         if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
+                    if all([w in descr for w in [
+                        'indefinite', 'masculine', 'singular', 'informal']])][:1]
         gf_entry = {
             'cat': 'A',
             'lemma': lemma,
@@ -150,8 +177,9 @@ def forms_for_pos(obj):
         
     if 'lemma' in gf_entry and gf_entry['lemma']:
         gf_entry['lemma'] = gf_entry['lemma'][0]
-        gf_entry['args']['root'] = obj['root']
-        args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x]
+        if obj['root']:
+            gf_entry['args']['root'] = obj['root']
+        args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
         gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
 
     return gf_entry
@@ -160,19 +188,19 @@ def forms_for_pos(obj):
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
     
-import sys
-MODE = sys.argv[1]
-
 if MODE == 'gf-abs':
     print('abstract MorphoDictAraAbs = Cat ** {')    
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-
-with open(FILTERED_WIKT) as file:
+if MODE != 'raw':
+  with open(FILTERED_WIKT) as file:
     seen_gf_funs = {}
     for line in file:
-        obj = json.loads(line)
+        try:
+            obj = json.loads(line)
+        except:
+            continue
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]

From 8e029bd8dd24f8bd76c46cc2f811041be9682ab5 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Wed, 13 Sep 2023 17:24:21 +0200
Subject: [PATCH 07/19] Arabic Wiktionary: started comparing evaluation

---
 src/arabic/wiktionary/read_wiktionary.py | 85 ++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 5 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index ea8d805fd..574233dda 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -108,14 +108,76 @@ def gf_fun(s, pos, disamb=0):
     return ''.join(["'", s, discrim, "_", pos, "'"])
 
 
-def forms_for_pos(obj):
-    forms = {
+rgl_features = {
+    # V
+    'VPerf': 'perfective',
+    'Act': 'active',
+    'Pas': 'passive',
+    'Per3': 'third-person',
+    'Per2': 'second-person',
+    'Masc': 'masculine',
+    'Fem': 'feminine',
+    'Sg': 'singular',
+    'Pl': 'plural',
+    'Dl': 'dual',
+    'VImpf': 'imperfective',
+    'Ind': 'indicative',
+    'Cnj': 'subjunctive',
+    'Jus': 'jussive',
+    'VImp': 'imperative',
+    # N: also Sg, Pl, Dl
+    'Def': 'definite',
+    'Indef': 'indefinite',
+    'Nom': 'nominative',
+    'Acc': 'accusative',
+    'Gen': 'genitive',
+#    'Bare':
+#    'Dat':
+    'Const': 'construct',
+#    'Poss':
+    #A: also N features
+    'APosit': 'positive',
+    'AComp': 'comparative'
+    }
+
+
+# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
+def compare_tables(gf, wikt):
+    report = {}    
+    for line in gf:
+        gf_form = line  #''.join([c for c in line if 1574 <= ord(c) <= 1616])
+        gf_tags = tuple(word for word in
+                    line.replace('(', ' ').replace(')', ' ').split()
+                      if word in rgl_features)
+        wikt_tags = {rgl_features[tag] for tag in gf_tags}
+        wikt_form = None
+        for form, descr in wikt:
+            if all([tag in descr for tag in wikt_tags]):
+                wikt_form = form
+                break
+        report[gf_tags] = {
+            'gf_form': gf_form,
+            'wikt_form': wikt_form
+            }
+        if wikt_form:
+            report[gf_tags]['voc_match'] = int(gf_form == wikt_form)
+            report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form))
+    return report
+
+
+
+def wikt_forms_for_pos(obj):
+    return {
         form['form']:
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
                    is_arabic(form['form'])
         }.items()
+
+
+def forms_for_pos(obj):
+    forms = wikt_forms_for_pos(obj)
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
@@ -136,7 +198,8 @@ def forms_for_pos(obj):
     elif obj['pos'] == 'verb':
         lemma = [form for form, descr in forms
                       if all([w in descr for
-                              w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
+                              w in ["active", "indicative", "masculine", "past",
+                                        "perfective", "singular", "third-person"]])][:1]
         gf_entry = {
           'cat': 'V',
           'lemma': lemma,
@@ -193,14 +256,16 @@ if MODE == 'gf-abs':
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-if MODE != 'raw':
+if MODE not in ['raw', 'eval']:
   with open(FILTERED_WIKT) as file:
     seen_gf_funs = {}
+    number = 1
     for line in file:
         try:
             obj = json.loads(line)
         except:
             continue
+        number += 1
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]
@@ -227,7 +292,7 @@ if MODE != 'raw':
                     fun = gf_fun(lemma, cat, discrim)
                         
                     if MODE == 'gf-abs':
-                        print('fun', fun, ':', cat, ';', '--', entry['senses'])
+                        print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                     if MODE == 'gf-cnc':
                         print('lin', fun, '=', lin, ';')
                             
@@ -237,3 +302,13 @@ if MODE != 'raw':
 
 if MODE.startswith('gf'):            
     print('}')
+
+    
+if MODE == 'eval':
+    with open('pot.gftbl') as file:
+        gf = [line.strip() for line in file]
+    with open('pot.json') as file:
+        wikt = wikt_forms_for_pos(json.loads(file.read()))
+    for line in compare_tables(gf, wikt).items():
+        print(line)
+

From d5e6e7e38987ab98da7fa33b90428e446a730414 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Thu, 14 Sep 2023 12:21:48 +0200
Subject: [PATCH 08/19] Arabic Wiktionary: functions for normalization and
 evaluation

---
 src/arabic/wiktionary/read_wiktionary.py | 88 ++++++++++++++++++++----
 1 file changed, 75 insertions(+), 13 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 574233dda..bcf902b77 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -1,17 +1,21 @@
 import gzip
 import json
 import sys
+import unicodedata
 
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
 # Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. 
 
-if not sys.argv[1:]:
-    print('usage: read_wiktionary (raw | gf-cnc | gf-abs)')
-    exit()
+MODE = ''
 
-MODE = sys.argv[1]  # 
+if __name__ == '__main__':
+    if not sys.argv[1:]:
+        print('usage: read_wiktionary (raw | gf-cnc | gf-abs | gf-map | eval | eval-verbose)')
+        exit()
+    MODE = sys.argv[1]  # 
 
+    
 # step 1: extract data from this file using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
 
@@ -19,6 +23,18 @@ WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
 # in the sequel, use this file with gf-abs or gf-cnc option
 FILTERED_WIKT = 'wikt_arabic.jsonl'
 
+# map each successfully extracted GF function to its source record in Wiktionary
+# created with option gf-map
+FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
+
+
+def read_function_source_map():
+    with open(FUNCTION_SOURCE_MAP) as file:
+        sourcemap = {}
+        for line in file:
+            obj = json.loads(line)
+            sourcemap[obj['fun']] = obj['source']
+            
 
 def get_gzip_json(file, sample=100000, langs=[]):
     with gzip.open(file) as decompressed:
@@ -86,16 +102,37 @@ buckwalter_dict = {
   0x671: '{'   # ٱ
   }
 
+buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
+
+
 def to_buckwalter(s):
-    return ''.join([buckwalter_dict.get(ord(c), '?') for c in s])
+    return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
+
+
+def from_buckwalter(s):
+    return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
 
 
 def unvocalize(s):
     return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
 
+
 def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
+def normal(s):
+    return unicodedata.normalize('NFD', s)
+
+
+# Wikt uses vowel+shadda which is a Unicode normalization
+# GF uses shadda+vowel which is linguistically correct
+# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
+# unicodedata.normalize does this wrong, as noted by Ariel Gutman 
+## todo: more direct implementation
+def reorder_shadda(s):
+    return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
+
+
 # quote forms but not parameters
 def quote_if(s, cond=is_arabic):
     if cond(s):
@@ -115,8 +152,11 @@ rgl_features = {
     'Pas': 'passive',
     'Per3': 'third-person',
     'Per2': 'second-person',
+    'Per1': 'first-person',
     'Masc': 'masculine',
     'Fem': 'feminine',
+    'Sing': 'singular',
+    'Plur': 'plural',
     'Sg': 'singular',
     'Pl': 'plural',
     'Dl': 'dual',
@@ -142,26 +182,39 @@ rgl_features = {
 
 
 # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
+# coming from 'l -treebank -table'
 def compare_tables(gf, wikt):
     report = {}    
     for line in gf:
-        gf_form = line  #''.join([c for c in line if 1574 <= ord(c) <= 1616])
+        gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616])
         gf_tags = tuple(word for word in
                     line.replace('(', ' ').replace(')', ' ').split()
                       if word in rgl_features)
+        if not gf_tags:
+            continue
         wikt_tags = {rgl_features[tag] for tag in gf_tags}
         wikt_form = None
+        wikt_descr = None
         for form, descr in wikt:
             if all([tag in descr for tag in wikt_tags]):
-                wikt_form = form
+                wikt_form = reorder_shadda(form)
+                wikt_descr = descr
                 break
         report[gf_tags] = {
             'gf_form': gf_form,
-            'wikt_form': wikt_form
+            'wikt_form': wikt_form,
+            'gf_form_rom': to_buckwalter(gf_form) if gf_form else None,
+            'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None,
+            'wikt_descr': wikt_descr
             }
         if wikt_form:
-            report[gf_tags]['voc_match'] = int(gf_form == wikt_form)
-            report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form))
+            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
+            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
+    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
+    report['fun'] = gf[0].split()[-1]
+    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
+    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
+    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
     return report
 
 
@@ -295,6 +348,9 @@ if MODE not in ['raw', 'eval']:
                         print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                     if MODE == 'gf-cnc':
                         print('lin', fun, '=', lin, ';')
+                    if MODE == 'gf-map':
+                        mapitem = {'fun': fun, 'source': obj}
+                        print(json.dumps(mapitem, ensure_ascii=False))
                             
                     seen_gf_funs[(lemma, cat)] = discrim + 1
 
@@ -304,11 +360,17 @@ if MODE.startswith('gf'):
     print('}')
 
     
-if MODE == 'eval':
+if MODE.startswith('eval'):
     with open('pot.gftbl') as file:
         gf = [line.strip() for line in file]
     with open('pot.json') as file:
         wikt = wikt_forms_for_pos(json.loads(file.read()))
-    for line in compare_tables(gf, wikt).items():
-        print(line)
+    report = compare_tables(gf, wikt)
+    
+    if MODE == 'eval-verbose':
+        for line in report.items():
+            print(line)
+    else:
+        print(report['fun'], 'forms', report['total_found'],
+              'voc', report['total_voc'], 'unvoc', report['total_unvoc'])
 

From 3e9be76e52be26e046910afbffa196e3f2d64826 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Thu, 14 Sep 2023 15:19:05 +0200
Subject: [PATCH 09/19] evaluation of generated lexicon

---
 src/arabic/wiktionary/read_wiktionary.py | 135 ++++++++++++++++++-----
 1 file changed, 110 insertions(+), 25 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index bcf902b77..6db526c33 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -2,10 +2,47 @@ import gzip
 import json
 import sys
 import unicodedata
+import pgf
+
 
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
-# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. 
+# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
+
+"""
+This file converts Wiktionary data to GF morphological dictionary files.
+It works for Arabic, but some functionality could be adapted to other languages.
+
+The steps to take are the following:
+
+fetch data:
+
+  raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
+
+filter Arabic entries:
+
+  $ python3 read_wiktionary.py raw >wikt_arabic.jsonl
+
+create GF files:
+
+  $ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
+  $ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
+
+automatic evaluation:
+
+  $ gf -make MorphoDictAra.gf
+  $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl
+  $ python3 read_wiktionary.py eval
+
+TODO:
+- better generation of GF
+- better paradigms to use Wiktionary data
+- refactor the code so that it can be used for other languages
+
+"""
+
+
+
 
 MODE = ''
 
@@ -27,13 +64,20 @@ FILTERED_WIKT = 'wikt_arabic.jsonl'
 # created with option gf-map
 FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
 
+PGF_FILE = 'MorphoDictAraAbs.pgf'
+CONCRETE_MODULE = 'MorphoDictAra'
+
 
 def read_function_source_map():
     with open(FUNCTION_SOURCE_MAP) as file:
         sourcemap = {}
         for line in file:
-            obj = json.loads(line)
-            sourcemap[obj['fun']] = obj['source']
+            try:
+                obj = json.loads(line)
+                sourcemap[obj['fun']] = obj['source']
+            except:
+                continue
+    return sourcemap
             
 
 def get_gzip_json(file, sample=100000, langs=[]):
@@ -134,9 +178,9 @@ def reorder_shadda(s):
 
 
 # quote forms but not parameters
-def quote_if(s, cond=is_arabic):
+def quote_if(s, cond=is_arabic, change=reorder_shadda):
     if cond(s):
-        return '"' + s + '"'
+        return '"' + change(s) + '"'
     else:
         return s
 
@@ -181,14 +225,19 @@ rgl_features = {
     }
 
 
+# obsolote:
 # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
 # coming from 'l -treebank -table'
-def compare_tables(gf, wikt):
+# now used:
+#  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
+# coming from tabularLinearize
+
+def compare_tables(gf, wikt, fun):
     report = {}    
-    for line in gf:
-        gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616])
+    for pair in gf.items():
+        gf_form = pair[1]
         gf_tags = tuple(word for word in
-                    line.replace('(', ' ').replace(')', ' ').split()
+                    pair[0].replace('(', ' ').replace(')', ' ').split()
                       if word in rgl_features)
         if not gf_tags:
             continue
@@ -211,7 +260,7 @@ def compare_tables(gf, wikt):
             report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
             report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
     ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
-    report['fun'] = gf[0].split()[-1]
+    report['fun'] = fun
     report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
     report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
     report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
@@ -293,7 +342,7 @@ def forms_for_pos(obj):
         
     if 'lemma' in gf_entry and gf_entry['lemma']:
         gf_entry['lemma'] = gf_entry['lemma'][0]
-        if obj['root']:
+        if obj['root'] and obj['root'][0].strip():
             gf_entry['args']['root'] = obj['root']
         args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
         gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
@@ -309,7 +358,8 @@ if MODE == 'gf-abs':
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-if MODE not in ['raw', 'eval']:
+    
+if MODE.startswith('gf') or MODE=='json':
   with open(FILTERED_WIKT) as file:
     seen_gf_funs = {}
     number = 1
@@ -360,17 +410,52 @@ if MODE.startswith('gf'):
     print('}')
 
     
-if MODE.startswith('eval'):
-    with open('pot.gftbl') as file:
-        gf = [line.strip() for line in file]
-    with open('pot.json') as file:
-        wikt = wikt_forms_for_pos(json.loads(file.read()))
-    report = compare_tables(gf, wikt)
-    
-    if MODE == 'eval-verbose':
-        for line in report.items():
-            print(line)
-    else:
-        print(report['fun'], 'forms', report['total_found'],
-              'voc', report['total_voc'], 'unvoc', report['total_unvoc'])
+def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
+    lang = gr.languages[CONCRETE_MODULE]
+    funs = gr.functions
+    reports = []
+    for fun in funs:
+        funn = "'" + fun + "'"
+        if funn not in funmap:
+            print(funn, 'not found')
+            continue
+        wikt = wikt_forms_for_pos(funmap[funn])
+        gf = lang.tabularLinearize(pgf.Expr(fun, []))
+        report = compare_tables(gf, wikt, fun)
+        reports.append(report)
+    return reports
+
+
+def first_error(report):
+    for f, v in report.items():
+        if 'voc_match' in v:
+            if v['voc_match'] == 0:
+                return f, v
+
+
+if MODE.startswith('eval'):
+    gr = pgf.readPGF(PGF_FILE)
+    print('using', PGF_FILE)
+    funmap = read_function_source_map()
+    print(len(funmap), 'functions')
+    for report in eval_all(gr, funmap):    
+
+        if MODE == 'eval-verbose':
+            for line in report.items():
+                print(line)
+        else:
+            if report['total_found'] == 0:
+                verdict = 'NOT_FOUND'
+            elif report['total_found'] == report['total_voc']:
+                verdict = 'PERFECT'
+            elif report['total_found'] == report['total_unvoc']:
+                verdict = 'PERFECT_UNVOC ' + str(first_error(report))
+            elif report['total_voc'] == 0:
+                verdict = 'TOTALLY_WRONG ' + str(first_error(report))
+            else:
+                verdict = 'PARTIAL ' + str(first_error(report))
+            print(report['fun'], 'forms', report['total_found'],
+                  'voc', report['total_voc'], 'unvoc', report['total_unvoc'],
+                  verdict
+                  )
 

From edecc3fe57cac46e5b03079d9e87674f73626acb Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Thu, 14 Sep 2023 18:21:18 +0200
Subject: [PATCH 10/19] a quick way to extract wordnet morphology

---
 src/arabic/wiktionary/to_wordnet.py | 46 +++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 src/arabic/wiktionary/to_wordnet.py

diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
new file mode 100644
index 000000000..7496e769b
--- /dev/null
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -0,0 +1,46 @@
+import csv
+import json
+
+# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
+# the following are assumed
+
+WN_TSV = 'arabic.tsv'
+MORPHO_GF = 'MorphoDictAraAbs.gf'
+
+def is_arabic(s):
+    return s and any(1574 <= ord(c) <= 1616 for c in s)
+
+def get_arabic(s):
+    return ''.join([c for c in s if is_arabic(c)])
+
+def unvocalize(s):
+    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
+
+
+# fun 'دُبُ_N' : N ; -- 10 [['bear']]
+funmap = {}
+with open(MORPHO_GF) as gffile:
+    for line in gffile:
+        line = line.split()
+        if line[2:] and line[0] == 'fun':
+            fun = line[1]
+            key = unvocalize(fun)
+            cat = line[3] 
+            sense = ' '.join(line[6:])
+            funmap[(key, cat)] = funmap.get((key, cat), [])
+            funmap[(key, cat)].append({'fun': fun,  'sense': sense})
+
+
+# abandon_1_V2    ParseAra        ترك     (1,1,1,3,322,3)
+with open(WN_TSV) as wnfile:
+##    wnreader = csv.reader(wnfile, delimiter='\t')
+    for row in wnfile:
+##        word = row[-1].strip()   # does not show tha arabic, but the second-last word
+        word = get_arabic(row)
+        wnfun = row.split()[0]
+        cat = [c for c in wnfun if c.isalpha()][-1]  # the last letter; the dict only contains N, A, V
+        funs = funmap.get((word, cat), [])
+        result = {'wnfun': wnfun, 'sought': word, 'found': funs}
+        print(json.dumps(result, ensure_ascii=False))
+ 
+

From 73f0b8ef00d944580b793020ef9cc94a7064b622 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Fri, 15 Sep 2023 14:48:23 +0200
Subject: [PATCH 11/19] commented and refactored read_wiktionary.py

---
 src/arabic/wiktionary/read_wiktionary.py | 213 ++++++++++++++---------
 src/arabic/wiktionary/to_wordnet.py      |   6 +-
 2 files changed, 133 insertions(+), 86 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 6db526c33..6ee6e10e8 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -31,7 +31,7 @@ create GF files:
 automatic evaluation:
 
   $ gf -make MorphoDictAra.gf
-  $ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl
+  $ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
   $ python3 read_wiktionary.py eval
 
 TODO:
@@ -42,8 +42,6 @@ TODO:
 """
 
 
-
-
 MODE = ''
 
 if __name__ == '__main__':
@@ -53,8 +51,9 @@ if __name__ == '__main__':
     MODE = sys.argv[1]  # 
 
     
-# step 1: extract data from this file using the raw option
+# step 1: extract Arabic data from this file using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+EXTRACTED_LANGUAGE = 'Arabic'
 
 # the following file is generated.
 # in the sequel, use this file with gf-abs or gf-cnc option
@@ -62,24 +61,18 @@ FILTERED_WIKT = 'wikt_arabic.jsonl'
 
 # map each successfully extracted GF function to its source record in Wiktionary
 # created with option gf-map
-FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
+FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
 
+# created with $ gf -make MorphoDictAra.gf
 PGF_FILE = 'MorphoDictAraAbs.pgf'
+
+# module to linearize with
 CONCRETE_MODULE = 'MorphoDictAra'
 
-
-def read_function_source_map():
-    with open(FUNCTION_SOURCE_MAP) as file:
-        sourcemap = {}
-        for line in file:
-            try:
-                obj = json.loads(line)
-                sourcemap[obj['fun']] = obj['source']
-            except:
-                continue
-    return sourcemap
             
-
+# read a gzipped jsonl file (one object per line),
+# showing lines where one of a list of languages is present
+# By default only every 100000th line is examined (sample=100000); use sample=1 to examine every line.
 def get_gzip_json(file, sample=100000, langs=[]):
     with gzip.open(file) as decompressed:
         n = 0
@@ -91,10 +84,13 @@ def get_gzip_json(file, sample=100000, langs=[]):
                     print(line.decode("utf-8"))
 #        print(n)
 
-if MODE == 'raw':
-    get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
 
+# to perform the first step of data extraction, pipe this into a file:
 # python3 read_wiktionary.py raw >wikt_arabic.jsonl
+if MODE == 'raw':
+    get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE])
+    exit()
+
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -177,19 +173,22 @@ def reorder_shadda(s):
     return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
 
 
-# quote forms but not parameters
+# quote word forms but not parameters
 def quote_if(s, cond=is_arabic, change=reorder_shadda):
     if cond(s):
         return '"' + change(s) + '"'
     else:
         return s
 
+
+# generate word_d_C functions starting with d=0, but show d only when >= 1
 def gf_fun(s, pos, disamb=0):
     discrim = '_' + str(disamb) if disamb else ''
     return ''.join(["'", s, discrim, "_", pos, "'"])
 
 
-rgl_features = {
+# mapping from GF to Wikt features
+arabic_rgl_features = {
     # V
     'VPerf': 'perfective',
     'Act': 'active',
@@ -224,62 +223,22 @@ rgl_features = {
     'AComp': 'comparative'
     }
 
-
-# obsolote:
-# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ    
-# coming from 'l -treebank -table'
-# now used:
-#  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
-# coming from tabularLinearize
-
-def compare_tables(gf, wikt, fun):
-    report = {}    
-    for pair in gf.items():
-        gf_form = pair[1]
-        gf_tags = tuple(word for word in
-                    pair[0].replace('(', ' ').replace(')', ' ').split()
-                      if word in rgl_features)
-        if not gf_tags:
-            continue
-        wikt_tags = {rgl_features[tag] for tag in gf_tags}
-        wikt_form = None
-        wikt_descr = None
-        for form, descr in wikt:
-            if all([tag in descr for tag in wikt_tags]):
-                wikt_form = reorder_shadda(form)
-                wikt_descr = descr
-                break
-        report[gf_tags] = {
-            'gf_form': gf_form,
-            'wikt_form': wikt_form,
-            'gf_form_rom': to_buckwalter(gf_form) if gf_form else None,
-            'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None,
-            'wikt_descr': wikt_descr
-            }
-        if wikt_form:
-            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
-            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
-    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
-    report['fun'] = fun
-    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
-    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
-    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
-    return report
-
-
-
-def wikt_forms_for_pos(obj):
+    
+# the inflection forms in a wiktionary entry
+def wikt_forms_from_obj(obj):
     return {
         form['form']:
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
                    is_arabic(form['form'])
-        }.items()
+        }
 
 
+# selection of forms for a given POS from Wikt: noun, adj, or verb
+# return a linearization function
 def forms_for_pos(obj):
-    forms = wikt_forms_for_pos(obj)
+    forms = wikt_forms_from_obj(obj).items()
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
@@ -345,46 +304,60 @@ def forms_for_pos(obj):
         if obj['root'] and obj['root'][0].strip():
             gf_entry['args']['root'] = obj['root']
         args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
-        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
+        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' 
 
     return gf_entry
+
     
 # "root": ["ش ر ح (š-r-ḥ)"]
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
     
+    
+# GF code generation
+
+# start with the header of the desired GF module
+
 if MODE == 'gf-abs':
     print('abstract MorphoDictAraAbs = Cat ** {')    
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-    
+# go through the Arabic Wiktionary entries
+# generate functions with unique names
+
 if MODE.startswith('gf') or MODE=='json':
   with open(FILTERED_WIKT) as file:
-    seen_gf_funs = {}
+    seen_gf_funs = {}  # to disambiguate names if needed
     number = 1
     for line in file:
         try:
             obj = json.loads(line)
         except:
             continue
-        number += 1
+        number += 1   # running count of parsed records; printed in the gf-abs comment
+
+        # the root (three radicals) is found in this place if at all
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]
         obj['root'] = root
+
+        # only take entries that are marked as lemmas 
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {
                 'pos': obj['pos'],
                 'forms': forms_for_pos(obj),
+                'all_forms': wikt_forms_from_obj(obj),
                 'senses': [sense['glosses'] for sense in obj.get('senses', [])
                            if 'glosses' in sense]
                 }
-#            entry['n_forms'] = len(entry['forms'])
-#            print(entry['pos'], entry['n_forms'])
+
+            # if you only want to see the Wikt information used for GF generation
             if MODE == 'json':
                 print(json.dumps(entry, ensure_ascii=False))
-
+                
+            # if you want to proceed to GF generation
             if MODE.startswith('gf'):
 
                 lemma = entry['forms'].get('lemma', None)
@@ -393,23 +366,74 @@ if MODE.startswith('gf') or MODE=='json':
                     lin = entry['forms']['lin']
                     discrim = seen_gf_funs.get((lemma, cat), 0)
                     fun = gf_fun(lemma, cat, discrim)
-                        
+
+                    # abstract syntax, save in MorphoDictAraAbs.gf
                     if MODE == 'gf-abs':
                         print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
-                    if MODE == 'gf-cnc':
+                        
+                    # concrete syntax, save in MorphoDictAra.gf
+                    elif MODE == 'gf-cnc':
                         print('lin', fun, '=', lin, ';')
-                    if MODE == 'gf-map':
-                        mapitem = {'fun': fun, 'source': obj}
+                        
+                    # function-source map, save in source_of_MorphoDictAra.jsonl
+                    elif MODE == 'gf-map':
+                        mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)}
                         print(json.dumps(mapitem, ensure_ascii=False))
                             
-                    seen_gf_funs[(lemma, cat)] = discrim + 1
+                    seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
 
-                # to do: rename duplicate function names: of 13762 names, 12946 are unique
-
-if MODE.startswith('gf'):            
+# terminate the GF file with a closing brace
+if MODE in ['gf-abs', 'gf-cnc']:            
     print('}')
 
-    
+
+# evaluation:
+# linearize all words to tables
+# compare them to the forms found in Wiktionary
+# report on matches
+
+# format of GF table:
+#  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
+# coming from pgf tabularLinearize
+
+def compare_tables(gf, wikt, fun, show_buckwalter=True):
+    report = {}    
+    for pair in gf.items():
+        gf_form = pair[1]
+        gf_params = pair[0]
+        gf_tags = tuple(word for word in
+                    pair[0].replace('(', ' ').replace(')', ' ').split()
+                      if word in arabic_rgl_features)
+        if not gf_tags:
+            continue  # if gf_tags match no Wikt tags, do not include this form
+        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
+        wikt_form = None
+        wikt_descr = None
+        for form, descr in wikt:
+            if all([tag in descr for tag in wikt_tags]):
+                wikt_form = reorder_shadda(form)
+                wikt_descr = descr
+                break
+        report[gf_tags] = {          # flat param description with only Wikt-relevant tags
+            'gf_params': gf_params,  # full param description
+            'gf_form': gf_form,
+            'wikt_form': wikt_form,
+            'wikt_descr': wikt_descr
+            }
+        if show_buckwalter:
+            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None,
+            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None,
+        if wikt_form:
+            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
+            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
+    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
+    report['fun'] = fun
+    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
+    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
+    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
+    return report
+
+
 def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
     lang = gr.languages[CONCRETE_MODULE]
     funs = gr.functions
@@ -419,13 +443,14 @@ def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
         if funn not in funmap:
             print(funn, 'not found')
             continue
-        wikt = wikt_forms_for_pos(funmap[funn])
+        wikt = funmap[funn].items()
         gf = lang.tabularLinearize(pgf.Expr(fun, []))
         report = compare_tables(gf, wikt, fun)
         reports.append(report)
     return reports
 
 
+# in the summary report: print the first error if anything gets wrong
 def first_error(report):
     for f, v in report.items():
         if 'voc_match' in v:
@@ -433,6 +458,20 @@ def first_error(report):
                 return f, v
 
 
+# having stored the Wiktionary object for each GF function
+# read it back from a file
+def read_function_source_map():
+    with open(FUNCTION_SOURCE_MAP) as file:
+        sourcemap = {}
+        for line in file:
+            try:
+                obj = json.loads(line)
+                sourcemap[obj['fun']] = obj['source']
+            except:
+                continue
+    return sourcemap
+
+            
 if MODE.startswith('eval'):
     gr = pgf.readPGF(PGF_FILE)
     print('using', PGF_FILE)
@@ -443,6 +482,10 @@ if MODE.startswith('eval'):
         if MODE == 'eval-verbose':
             for line in report.items():
                 print(line)
+        if MODE == 'eval-tables':
+            for gftags, value in report.items():
+                if v := value['wikt_form']:
+                    print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;')
         else:
             if report['total_found'] == 0:
                 verdict = 'NOT_FOUND'
diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
index 7496e769b..144e4cc1a 100644
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -4,7 +4,11 @@ import json
 # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
 # the following are assumed
 
+
+# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
 WN_TSV = 'arabic.tsv'
+
+# built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 
 def is_arabic(s):
@@ -36,7 +40,7 @@ with open(WN_TSV) as wnfile:
 ##    wnreader = csv.reader(wnfile, delimiter='\t')
     for row in wnfile:
 ##        word = row[-1].strip()   # does not show tha arabic, but the second-last word
-        word = get_arabic(row)
+        word = unvocalize(get_arabic(row))
         wnfun = row.split()[0]
         cat = [c for c in wnfun if c.isalpha()][-1]  # the last letter; the dict only contains N, A, V
         funs = funmap.get((word, cat), [])

From 9e8c5eaad5699ee2e45e269f2c27a28a36185dd1 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Mon, 18 Sep 2023 08:52:32 +0200
Subject: [PATCH 12/19] arabic/wiktionary: including root in the form list

---
 src/arabic/wiktionary/read_wiktionary.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 6ee6e10e8..9a1d76fef 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -226,19 +226,28 @@ arabic_rgl_features = {
     
 # the inflection forms in a wiktionary entry
 def wikt_forms_from_obj(obj):
-    return {
+    forms = {
         form['form']:
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
                    is_arabic(form['form'])
         }
+    # the root (three radicals) is found in this place if at all
+    root = [find_root(t['expansion']) for
+                t in obj.get('etymology_templates', []) if
+                t.get('name', None) =='ar-root'][:1]
+    if root and root[0].strip():
+        forms['root'] = root[0].strip()
+
+    return forms
 
 
 # selection of forms for a given POS from Wikt: noun, adj, or verb
 # return a linearization function
 def forms_for_pos(obj):
-    forms = wikt_forms_from_obj(obj).items()
+    dforms = wikt_forms_from_obj(obj)
+    forms = dforms.items()
     if obj['pos'] == 'noun':
         lemma = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
@@ -301,8 +310,8 @@ def forms_for_pos(obj):
         
     if 'lemma' in gf_entry and gf_entry['lemma']:
         gf_entry['lemma'] = gf_entry['lemma'][0]
-        if obj['root'] and obj['root'][0].strip():
-            gf_entry['args']['root'] = obj['root']
+        if 'root' in dforms:
+            gf_entry['args']['root'] = [dforms['root']]
         args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
         gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' 
 
@@ -337,12 +346,6 @@ if MODE.startswith('gf') or MODE=='json':
             continue
         number += 1   # if you find the same word_C again, mark it word_1_C
 
-        # the root (three radicals) is found in this place if at all
-        root = [find_root(t['expansion']) for
-                t in obj.get('etymology_templates', []) if
-                t.get('name', None) =='ar-root'][:1]
-        obj['root'] = root
-
         # only take entries that are marked as lemmas 
         if 'Arabic lemmas' in obj.get('categories', []):
             entry = {

From abcb3a9f2aa7d421072ed26f171d2cfd46ca688e Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Wed, 20 Sep 2023 11:54:29 +0200
Subject: [PATCH 13/19] improving evaluation of wiktionary generated lexicon

---
 src/arabic/wiktionary/read_wiktionary.py | 186 ++++++++++++++---------
 1 file changed, 110 insertions(+), 76 deletions(-)

diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 9a1d76fef..434617231 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -69,6 +69,8 @@ PGF_FILE = 'MorphoDictAraAbs.pgf'
 # module to linearize with
 CONCRETE_MODULE = 'MorphoDictAra'
 
+# concrete syntax file, to debug sources of linearizations
+CONCRETE_FILE = CONCRETE_MODULE + '.gf'
             
 # read a gzipped jsonl file (one object per line),
 # showing lines where one of a list of languages is present
@@ -144,6 +146,9 @@ buckwalter_dict = {
 
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 
+arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
+
+sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
 
 def to_buckwalter(s):
     return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
@@ -157,12 +162,28 @@ def unvocalize(s):
     return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
 
 
+def drop_final_vowel(s):
+    if s[-1] in arabic_vowels:
+        return s[:-1]
+    else:
+        return s
+
+
 def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
 def normal(s):
     return unicodedata.normalize('NFD', s)
 
+# heuristic for finding the three radicals from certain forms
+# works only for sound (strong) 3-radical roots, otherwise None
+def get_sound_trigram_root(s):
+    sounds = [c for c in s if c in sound_consonants]
+    if len(sounds) == 3:
+        return ''.join(sounds)
+    else:
+        return None
+
 
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
@@ -216,18 +237,18 @@ arabic_rgl_features = {
     'Gen': 'genitive',
 #    'Bare':
 #    'Dat':
-    'Const': 'construct',
+    'Const': 'construct'
 #    'Poss':
-    #A: also N features
-    'APosit': 'positive',
-    'AComp': 'comparative'
+    #A: also N features; degree features cannot be found
+#    'APosit': 'positive',
+#    'AComp': 'comparative'
     }
 
     
 # the inflection forms in a wiktionary entry
 def wikt_forms_from_obj(obj):
     forms = {
-        form['form']:
+        reorder_shadda(form['form']):
           form.get('tags', []) for
             form in obj.get('forms', []) if
                'romanization' not in form.get('tags', []) and
@@ -249,9 +270,9 @@ def forms_for_pos(obj):
     dforms = wikt_forms_from_obj(obj)
     forms = dforms.items()
     if obj['pos'] == 'noun':
-        lemma = [form[:-1] for form, descr in forms
+        lemma = [drop_final_vowel(form) for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
-        plural = [form[:-1] for form, descr in forms
+        plural = [drop_final_vowel(form) for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
         gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                             else (['masc'] if  'Arabic masculine nouns' in obj['categories']
@@ -312,8 +333,11 @@ def forms_for_pos(obj):
         gf_entry['lemma'] = gf_entry['lemma'][0]
         if 'root' in dforms:
             gf_entry['args']['root'] = [dforms['root']]
-        args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
-        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}' 
+        elif root := get_sound_trigram_root(gf_entry['lemma']):
+            gf_entry['args']['root'] = [root]
+        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
+        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
+        gf_entry['labels'] = ','.join([r for r, v in args])
 
     return gf_entry
 
@@ -367,6 +391,7 @@ if MODE.startswith('gf') or MODE=='json':
                 if lemma:
                     cat = entry['forms']['cat']
                     lin = entry['forms']['lin']
+                    labels = entry['forms']['labels']
                     discrim = seen_gf_funs.get((lemma, cat), 0)
                     fun = gf_fun(lemma, cat, discrim)
 
@@ -380,7 +405,9 @@ if MODE.startswith('gf') or MODE=='json':
                         
                     # function-source map, save in source_of_MorphoDictAra.jsonl
                     elif MODE == 'gf-map':
-                        mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)}
+                        source = wikt_forms_from_obj(obj)
+                        source['gf_labels'] = labels
+                        mapitem = {'fun': fun, 'source': source}
                         print(json.dumps(mapitem, ensure_ascii=False))
                             
                     seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
@@ -399,6 +426,7 @@ if MODE in ['gf-abs', 'gf-cnc']:
 #  {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
 # coming from pgf tabularLinearize
 
+# compare the table for one function, returning a report as a dict
 def compare_tables(gf, wikt, fun, show_buckwalter=True):
     report = {}    
     for pair in gf.items():
@@ -412,7 +440,7 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
         wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
         wikt_form = None
         wikt_descr = None
-        for form, descr in wikt:
+        for form, descr in wikt.items():
             if all([tag in descr for tag in wikt_tags]):
                 wikt_form = reorder_shadda(form)
                 wikt_descr = descr
@@ -424,84 +452,90 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
             'wikt_descr': wikt_descr
             }
         if show_buckwalter:
-            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None,
-            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None,
+            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
+            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
         if wikt_form:
             report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
             report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
     ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
     report['fun'] = fun
+    report['labels'] = wikt['gf_labels']
     report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
     report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
     report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
     return report
 
 
-def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
-    lang = gr.languages[CONCRETE_MODULE]
-    funs = gr.functions
-    reports = []
-    for fun in funs:
-        funn = "'" + fun + "'"
-        if funn not in funmap:
-            print(funn, 'not found')
-            continue
-        wikt = funmap[funn].items()
-        gf = lang.tabularLinearize(pgf.Expr(fun, []))
-        report = compare_tables(gf, wikt, fun)
-        reports.append(report)
-    return reports
+# with a given grammar and function, prepare input for compare_tables
+# and produce a report, possibly summarizing it
+def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
+    if fun not in gr.functions:
+        print(fun, 'not found in grammar')
+        return
+    gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items()
+              if p.startswith('s ')}  # require the s field, exclude s2
+    report = compare_tables(gf, wikt, fun)
+    if verbose:
+        return report
+    else:
+        if report['total_found'] == 0:
+            verdict = 'NOT_FOUND'
+            flaws = False
+        elif report['total_found'] == report['total_voc']:
+            verdict = 'PERFECT'
+            flaws = False
+        elif report['total_found'] == report['total_unvoc']:
+            verdict = 'PERFECT_UNVOC'
+            flaws = True
+        elif report['total_voc'] == 0:
+            verdict = 'TOTALLY_WRONG'
+            flaws = True
+        else:
+            verdict = 'PARTIAL'
+            flaws = True
+        summary = {
+            'fun': report['fun'],
+            'forms': report['total_found'],
+            'voc': report['total_voc'],
+            'unvoc': report['total_unvoc'],
+            'verdict': verdict,
+            'labels': report['labels']
+            }
 
-
-# in the summary report: print the first error if anything gets wrong
-def first_error(report):
-    for f, v in report.items():
-        if 'voc_match' in v:
-            if v['voc_match'] == 0:
-                return f, v
-
-
-# having stored the Wiktionary object for each GF function
-# read it back from a file
-def read_function_source_map():
-    with open(FUNCTION_SOURCE_MAP) as file:
-        sourcemap = {}
-        for line in file:
-            try:
-                obj = json.loads(line)
-                sourcemap[obj['fun']] = obj['source']
-            except:
-                continue
-    return sourcemap
+        if flaws:
+            for f, v in report.items():
+                if v.get('voc_match', 1) == 0:
+                    summary['first_error'] = v
+                    break
+        return summary
 
             
+def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False): 
+    gr = pgf.readPGF(pgffile)
+    concrete = gr.languages[concretename]
+
+    totals = {'A': {}, 'N': {}, 'V': {}}
+
+    with open(mapfile) as file:
+        for line in file:
+            obj = json.loads(line)
+            fun = obj['fun'][1:-1]
+            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
+
+            cat = fun[-1]
+            if 'verdict' in report:
+                rep = report['verdict']
+                totals[cat][rep] = totals[cat].get(rep, 0) + 1 
+
+            if show:
+                print(report)
+
+    print(totals)
+    
+
 if MODE.startswith('eval'):
-    gr = pgf.readPGF(PGF_FILE)
-    print('using', PGF_FILE)
-    funmap = read_function_source_map()
-    print(len(funmap), 'functions')
-    for report in eval_all(gr, funmap):    
-
-        if MODE == 'eval-verbose':
-            for line in report.items():
-                print(line)
-        if MODE == 'eval-tables':
-            for gftags, value in report.items():
-                if v := value['wikt_form']:
-                    print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;')
-        else:
-            if report['total_found'] == 0:
-                verdict = 'NOT_FOUND'
-            elif report['total_found'] == report['total_voc']:
-                verdict = 'PERFECT'
-            elif report['total_found'] == report['total_unvoc']:
-                verdict = 'PERFECT_UNVOC ' + str(first_error(report))
-            elif report['total_voc'] == 0:
-                verdict = 'TOTALLY_WRONG ' + str(first_error(report))
-            else:
-                verdict = 'PARTIAL ' + str(first_error(report))
-            print(report['fun'], 'forms', report['total_found'],
-                  'voc', report['total_voc'], 'unvoc', report['total_unvoc'],
-                  verdict
-                  )
+    verbose = MODE=='eval-verbose'
+    show = verbose or MODE=='eval-funs'
+    eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show, verbose)
 
+    

From 24199311058e87a72ecde6e9d8bd835a8c143e02 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Wed, 20 Sep 2023 11:54:59 +0200
Subject: [PATCH 14/19] some more paradigms for Arabic Wiktionary generation

---
 src/arabic/ParadigmsAra.gf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf
index 20892fed8..1b3cfc85b 100644
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -885,8 +885,10 @@ oper
       = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
     wmkN : {sg : Str; pl : Str} -> N
       = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
+    wmkN : {sg, pl : Str ; root : Str} -> N
+      = \r -> mkN r.sg r.pl masc nohum ;  ---- 
     wmkN : {sg : Str; root : Str} -> N 
-      = \r -> smartN r.sg ; 
+      = \r -> smartN r.sg ;
     } ;
 
   wmkA = overload {
@@ -928,7 +930,7 @@ oper
     wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
       = \r -> mkV r.root r.cls ; ----
     wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
-      = \r -> variants {} ; ---- mkV r.imperfect ; ----
+      = \r -> mkV r.perfect r.cls ; ----
     wmkV : {root : Str ; cls : VerbForm} -> V
       = \r -> mkV r.root r.cls ;
     wmkV : {imperfect : Str} -> V

From fdd7c9641ea6b14af6dfd0bf21456a7071b33332 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Wed, 20 Sep 2023 16:05:46 +0200
Subject: [PATCH 15/19] Ara: improving Adj inflection by identifying fcl
 patterns from concrete forms

---
 src/arabic/MorphoAra.gf                  |  6 +++--
 src/arabic/ParadigmsAra.gf               | 32 +++++++++++++++++++++---
 src/arabic/wiktionary/Makefile           |  7 ++++++
 src/arabic/wiktionary/read_wiktionary.py | 28 +++++++++++++++++++--
 4 files changed, 65 insertions(+), 8 deletions(-)
 create mode 100644 src/arabic/wiktionary/Makefile

diff --git a/src/arabic/MorphoAra.gf b/src/arabic/MorphoAra.gf
index 808223b4d..53f7a2608 100644
--- a/src/arabic/MorphoAra.gf
+++ b/src/arabic/MorphoAra.gf
@@ -153,7 +153,8 @@ oper
         w + "ف" + x + "ع" + y + "ل" + z
           => { h = w ; m1 = x; m2 = y; t = z} ;
         w + "ف" + x + ("ع"|"ل") + y
-          => { h = w ; m1 = x; m2 = ""; t = y}
+          => { h = w ; m1 = x; m2 = ""; t = y} ;
+	_ => Predef.error("cannot get FCL pattern from" ++ pat)
       } ;
 
   --opers to interdigitize (make words out of roots and patterns:
@@ -204,7 +205,8 @@ oper
                     => mkAssimilated pat (mkRoot3 rS) ;
             ? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
             _=> error rS ---- AR error "expected 3--6"
-        }
+        } ;
+     _ => Predef.error("cannot get FCL pattern from" ++ pS)
     };
 
 -----------------------------------------------------------------------------
diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf
index 1b3cfc85b..3d1623e14 100644
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -898,12 +898,30 @@ oper
       = \r -> mkA r.root ;
     mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
-      = \r -> mkA r.root ;
+    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt r.pl_patt ;
+    mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt r.pl_patt ;
+    mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    mkA : {masc_sg, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
+      = \r -> mkA r.root ; ----
+    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
+      = \r -> mkA r.root ; ----
+    mkA : {masc_sg, fem_sg, root : Str} -> A
+      = \r -> mkA r.root ; ----
+    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
     mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
-      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
     mkA : {masc_sg : Str; fem_sg : Str} -> A
       = \r -> mkA r.masc_sg ; ----
     mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
@@ -914,8 +932,14 @@ oper
       = \r -> mkA r.masc_sg ; ----
     mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
       = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
+      = \r -> mkA r.sg_patt r.pl_patt ;
     mkA : {masc_sg : Str; masc_pl : Str} -> A
       = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
     mkA : {masc_sg : Str; root : Str} -> A
       = \r -> mkA r.root ;
     mkA : {masc_sg : Str} -> A
diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile
new file mode 100644
index 000000000..80e1da791
--- /dev/null
+++ b/src/arabic/wiktionary/Makefile
@@ -0,0 +1,7 @@
+all:
+	python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
+	python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
+	python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl 
+	gf -make MorphoDictAra.gf
+	python3 read_wiktionary.py eval-funs >1-eval.txt 
+	python3 to_wordnet.py >wornet-arabic.jsonl
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 434617231..960a592d3 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -122,7 +122,7 @@ buckwalter_dict = {
   0x638: 'Z',  # ظ
   0x639: 'E',  # ع
   0x63a: 'g',  # غ
-  0x641: 'f',  # ف
+  0x641: 'f',  # ف  
   0x642: 'q',  # ق
   0x643: 'k',  # ك
   0x644: 'l',  # ل
@@ -144,6 +144,7 @@ buckwalter_dict = {
   0x671: '{'   # ٱ
   }
 
+
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 
 arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
@@ -184,6 +185,24 @@ def get_sound_trigram_root(s):
     else:
         return None
 
+    
+# reverse engineer fcl pattern from a given form, with a sound trigram root
+# one more condition: each of the root letters occurs exactly once
+# TODO: better use the given root of the lex entry
+def get_sound_fcl_pattern(s):
+    if root := get_sound_trigram_root(s):
+        if len([c in s for c in root]) == 3:
+            p = list(s)
+            r = s.find(root[0])
+            p[r] = chr(0x641)
+            r += s[r+1:].find(root[1]) + 1
+            p[r] = chr(0x639)
+            r += s[r+1:].find(root[2]) + 1
+            p[r] = chr(0x644)
+            p = ''.join(p)
+##            print('---PATT', s, root, p)
+            return p
+    
 
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
@@ -324,7 +343,12 @@ def forms_for_pos(obj):
                 'fem_pl': [form for form, descr in forms
                          if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                 }
-            } 
+            }
+        for patt in ['masc_sg', 'masc_pl']:
+            if patt in gf_entry['args']:
+                if form := gf_entry['args'][patt]:
+                    if spatt := get_sound_fcl_pattern(form[0]):
+                        gf_entry['args'][patt[5:]+'_patt'] = [spatt]  # sg_patt, pl_patt
 
     else:
         gf_entry = {f: d for f, d in forms}

From 7e383b746e81544dfeee9ae776fa84ac07e3c4f9 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Thu, 21 Sep 2023 15:46:41 +0200
Subject: [PATCH 16/19] moved wikt-specific paradigms to a separate file (for
 the moment)

---
 src/arabic/ParadigmsAra.gf               | 54 ++++++++++++------------
 src/arabic/wiktionary/Makefile           |  2 +-
 src/arabic/wiktionary/read_wiktionary.py | 21 +++++++--
 3 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf
index 3d1623e14..80506ebbb 100644
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -868,6 +868,8 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
 param VerbForm =
   FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
 
+
+{- temporarily moved to wiktionary/MoreAra.gf
 -- paradigms for Wiktionary extraction
 ---- TODO: better usage of information in Wiktionary
 
@@ -894,55 +896,55 @@ oper
   wmkA = overload {
     wmkA : {root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt r.pl_patt ;
-    mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
+    wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt r.pl_patt ;
-    mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
+    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt ;
-    mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
+    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt ;
-    mkA : {masc_sg, root, sg_patt : Str} -> A
+    wmkA : {masc_sg, root, sg_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt ;
-    mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
+    wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt ;
-    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
       = \r -> mkA r.root ; ----
-    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
       = \r -> mkA r.root ; ----
-    mkA : {masc_sg, fem_sg, root : Str} -> A
+    wmkA : {masc_sg, fem_sg, root : Str} -> A
       = \r -> mkA r.root ; ----
-    mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
+    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
+    wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
       = \r -> mkA r.root r.sg_patt ;
-    mkA : {masc_sg : Str; fem_sg : Str} -> A
+    wmkA : {masc_sg : Str; fem_sg : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
       = \r -> mkA r.sg_patt r.pl_patt ;
-    mkA : {masc_sg : Str; masc_pl : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
+    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
       = \r -> mkA r.masc_sg ; ----
-    mkA : {masc_sg : Str; root : Str} -> A
+    wmkA : {masc_sg : Str; root : Str} -> A
       = \r -> mkA r.root ;
-    mkA : {masc_sg : Str} -> A
+    wmkA : {masc_sg : Str} -> A
       = \r -> mkA r.masc_sg ; ----
     } ;
 
@@ -960,5 +962,5 @@ oper
     wmkV : {imperfect : Str} -> V
       = \r -> variants {} ; ---- mkV r.imperfect ;
     } ;
-
+-}
 } ;
diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile
index 80e1da791..58fcf2b6d 100644
--- a/src/arabic/wiktionary/Makefile
+++ b/src/arabic/wiktionary/Makefile
@@ -4,4 +4,4 @@ all:
 	python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl 
 	gf -make MorphoDictAra.gf
 	python3 read_wiktionary.py eval-funs >1-eval.txt 
-	python3 to_wordnet.py >wornet-arabic.jsonl
+	python3 to_wordnet.py >wordnet-arabic.jsonl
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 960a592d3..69099294e 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -71,6 +71,10 @@ CONCRETE_MODULE = 'MorphoDictAra'
 
 # concrete syntax file, to debug sources of linearizations
 CONCRETE_FILE = CONCRETE_MODULE + '.gf'
+
+# evaluation result file, created with mode eval-funs
+EVAL_FILE = 'eval.jsonl'
+
             
 # read a gzipped jsonl file (one object per line),
 # showing lines where one of a list of languages is present
@@ -93,6 +97,17 @@ if MODE == 'raw':
     get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE])
     exit()
 
+    
+if MODE == 'error-analysis':
+    evals = {}
+    with open(EVAL_FILE) as file:
+        for line in file:
+            row = json.loads(line)
+            if labels := row.get('labels', None):
+                verdict = row['verdict']
+                evals[(labels, verdict)] = evals.get((labels, verdict), 0) + 1
+    for labverdict, n in sorted(list(evals.items())):
+        print(labverdict, n)
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -378,7 +393,7 @@ def find_root(s):
 if MODE == 'gf-abs':
     print('abstract MorphoDictAraAbs = Cat ** {')    
 if MODE == 'gf-cnc':
-    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
+    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {') 
 
 # go through the Arabic Wiktionary entries
 # generate functions with unique names
@@ -552,9 +567,9 @@ def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
                 totals[cat][rep] = totals[cat].get(rep, 0) + 1 
 
             if show:
-                print(report)
+                print(json.dumps(report, ensure_ascii=False))
 
-    print(totals)
+        print(json.dumps(totals, ensure_ascii=False))
     
 
 if MODE.startswith('eval'):

From aa1dff67026918764e0c3c03697120c828e6b4ea Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Thu, 21 Sep 2023 17:29:38 +0200
Subject: [PATCH 17/19] added MoreAra.gf

---
 src/arabic/wiktionary/Makefile           |  3 +-
 src/arabic/wiktionary/MoreAra.gf         | 98 ++++++++++++++++++++++++
 src/arabic/wiktionary/read_wiktionary.py |  3 +-
 3 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 src/arabic/wiktionary/MoreAra.gf

diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile
index 58fcf2b6d..a14e23e52 100644
--- a/src/arabic/wiktionary/Makefile
+++ b/src/arabic/wiktionary/Makefile
@@ -3,5 +3,6 @@ all:
 	python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
 	python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl 
 	gf -make MorphoDictAra.gf
-	python3 read_wiktionary.py eval-funs >1-eval.txt 
+	python3 read_wiktionary.py eval-funs >eval.jsonl 
 	python3 to_wordnet.py >wordnet-arabic.jsonl
+	python3 read_wiktionary.py error-analysis
diff --git a/src/arabic/wiktionary/MoreAra.gf b/src/arabic/wiktionary/MoreAra.gf
new file mode 100644
index 000000000..e45b49b58
--- /dev/null
+++ b/src/arabic/wiktionary/MoreAra.gf
@@ -0,0 +1,98 @@
+resource MoreAra = CatAra ** open ParadigmsAra in {
+
+
+-- temporarily moved from ParadigmsAra
+-- paradigms for Wiktionary extraction
+---- TODO: better usage of information in Wiktionary
+
+oper
+  wmkN = overload {
+    wmkN : {sg, pl : Str ; g : Gender} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;  --- hum/nohum not in Wikt
+    wmkN : {sg : Str} -> N
+      = \r -> smartN r.sg ; 
+    wmkN : {sg : Str ; g : Gender ; root : Str} -> N
+      = \r -> smartN r.sg ** {g = r.g} ; ----
+    wmkN : {sg : Str; g : Gender} -> N
+      = \r -> smartN r.sg ** {g = r.g} ;
+    wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
+    wmkN : {sg : Str; pl : Str} -> N
+      = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
+    wmkN : {sg, pl : Str ; root : Str} -> N
+      = \r -> mkN r.sg r.pl masc nohum ;  ---- 
+    wmkN : {sg : Str; root : Str} -> N 
+      = \r -> smartN r.sg ;
+    } ;
+
+  wmkA = overload {
+    wmkA : {root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt r.pl_patt ;
+    wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt r.pl_patt ;
+    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    wmkA : {masc_sg, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
+      = \r -> mkA r.root ; ----
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
+      = \r -> mkA r.root ; ----
+    wmkA : {masc_sg, fem_sg, root : Str} -> A
+      = \r -> mkA r.root ; ----
+    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
+      = \r -> mkA r.root r.sg_patt ;
+    wmkA : {masc_sg : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
+      = \r -> mkA r.root r.sg_patt r.pl_patt ;  -- root was missing, so sg_patt was taken as the root
+    wmkA : {masc_sg : Str; masc_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    wmkA : {masc_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    wmkA : {masc_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    } ;
+
+  wmkV = overload {
+    wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; cls : VerbForm} -> V
+      = \r -> mkV r.perfect r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
+      = \r -> mkV r.perfect r.cls ; ----
+    wmkV : {root : Str ; cls : VerbForm} -> V
+      = \r -> mkV r.root r.cls ;
+    wmkV : {imperfect : Str} -> V
+      = \r -> variants {} ; ---- mkV r.imperfect ;
+    } ;
+
+}
\ No newline at end of file
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 69099294e..140852c7a 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -104,8 +104,9 @@ if MODE == 'error-analysis':
         for line in file:
             row = json.loads(line)
             if labels := row.get('labels', None):
+                cat = row['fun'][-1]
                 verdict = row['verdict']
-                evals[(labels, verdict)] = evals.get((labels, verdict), 0) + 1
+                evals[(cat, labels, verdict)] = evals.get((cat, labels, verdict), 0) + 1
     for labverdict, n in sorted(list(evals.items())):
         print(labverdict, n)
 

From 561a8c130d5b1f99e98a5ced819ff58f4858a65a Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Mon, 25 Sep 2023 08:22:47 +0200
Subject: [PATCH 18/19] to_wordnet applied to a new format of data

---
 src/arabic/wiktionary/to_wordnet.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
index 144e4cc1a..b159c5f18 100644
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -6,7 +6,8 @@ import json
 
 
 # from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
-WN_TSV = 'arabic.tsv'
+# WN_TSV = 'arabic.tsv'  # Krasimir
+WN_TSV = 'ar2en_words_gf.csv'  # Zarzoura
 
 # built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
@@ -41,7 +42,7 @@ with open(WN_TSV) as wnfile:
     for row in wnfile:
 ##        word = row[-1].strip()   # does not show tha arabic, but the second-last word
         word = unvocalize(get_arabic(row))
-        wnfun = row.split()[0]
+        wnfun = row.split()[-1]  # 0 in Krasimir
         cat = [c for c in wnfun if c.isalpha()][-1]  # the last letter; the dict only contains N, A, V
         funs = funmap.get((word, cat), [])
         result = {'wnfun': wnfun, 'sought': word, 'found': funs}

From 1c355ce9dd49d1fd59090bb41a91a573cc9ce1c1 Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Mon, 25 Sep 2023 09:22:21 +0200
Subject: [PATCH 19/19] factored out arabic_utilities.py as a separate file

---
 src/arabic/wiktionary/arabic_utilities.py | 169 ++++++++++++++++++++++
 src/arabic/wiktionary/read_wiktionary.py  | 128 +---------------
 src/arabic/wiktionary/to_wordnet.py       |  11 +-
 3 files changed, 172 insertions(+), 136 deletions(-)
 create mode 100644 src/arabic/wiktionary/arabic_utilities.py

diff --git a/src/arabic/wiktionary/arabic_utilities.py b/src/arabic/wiktionary/arabic_utilities.py
new file mode 100644
index 000000000..29a15f105
--- /dev/null
+++ b/src/arabic/wiktionary/arabic_utilities.py
@@ -0,0 +1,169 @@
+# utilities for Arabic script
+# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
+# as specified by the command line argument:
+#
+#   % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
+#   % diff MorphoDictAra.gf b.tmp 
+#   % 
+
+def is_arabic(s):  # True if s has a character in U+0621..U+0652, U+0670 or U+0671 (the old range skipped hamza letters)
+    return bool(s) and any(0x621 <= ord(c) <= 0x652 or ord(c) in (0x670, 0x671) for c in s)
+
+# keep only the characters of s for which is_arabic holds
+def get_arabic(s):
+    return ''.join(filter(is_arabic, s))
+
+# keep only the characters U+0621..U+064A of s, which drops the vowel marks above that range
+def unvocalize(s):
+    return ''.join(ch for ch in s if '\u0621' <= ch <= '\u064a')
+
+
+# https://en.wikipedia.org/wiki/Buckwalter_transliteration
+buckwalter_dict = {
+  0x621: "'",  # ء
+  0x622: '|',  # آ
+  0x623: '>',  # أ
+  0x624: '&',  # ؤ
+  0x625: '<',  # إ
+  0x626: '}',  # ئ
+  0x627: 'A',  # ا
+  0x628: 'b',  # ب
+  0x629: 'p',  # ة
+  0x62a: 't',  # ت
+  0x62b: 'v',  # ث
+  0x62c: 'j',  # ج
+  0x62d: 'H',  # ح
+  0x62e: 'x',  # خ
+  0x62f: 'd',  # د
+  0x630: '*',  # ذ
+  0x631: 'r',  # ر
+  0x632: 'z',  # ز
+  0x633: 's',  # س
+  0x634: '$',  # ش
+  0x635: 'S',  # ص
+  0x636: 'D',  # ض
+  0x637: 'T',  # ط
+  0x638: 'Z',  # ظ
+  0x639: 'E',  # ع
+  0x63a: 'g',  # غ
+  0x641: 'f',  # ف  
+  0x642: 'q',  # ق
+  0x643: 'k',  # ك
+  0x644: 'l',  # ل
+  0x645: 'm',  # م
+  0x646: 'n',  # ن
+  0x647: 'h',  # ه
+  0x648: 'w',  # و
+  0x649: 'Y',  # ى
+  0x64a: 'y',  # ي
+  0x64b: 'F',  # ً
+  0x64c: 'N',  # ٌ
+  0x64d: 'K',  # ٍ
+  0x64e: 'a',  # َ
+  0x64f: 'u',  # ُ
+  0x650: 'i',  # ِ
+  0x651: '~',  # ّ
+  0x652: 'o',  # ْ
+  0x670: '`',  # '
+  0x671: '{'   # ٱ
+  }
+
+
+buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
+
+arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
+
+sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
+# replace every character whose code point is a key of buckwalter_dict; others stay as they are
+def to_buckwalter(s):
+    return s.translate(buckwalter_dict)
+
+# inverse of to_buckwalter: every character that is a key of buckwalter_dict_rev is replaced
+def from_buckwalter(s):
+    return s.translate(str.maketrans(buckwalter_dict_rev))
+
+
+def drop_final_vowel(s):
+    # remove one trailing member of arabic_vowels; an empty s is returned as it is
+    if s and s[-1] in arabic_vowels:
+        return s[:-1]
+    return s
+
+def normal(s):
+    import unicodedata  # local import: this module has no top-level import of unicodedata
+    return unicodedata.normalize('NFD', s)
+
+# heuristic for finding the three radicals: collect, in order, the characters of s
+# that are in sound_consonants; return them joined if there are exactly three, otherwise None
+def get_sound_trigram_root(s):
+    sounds = [c for c in s if c in sound_consonants]
+    if len(sounds) == 3:
+        return ''.join(sounds)
+    else:
+        return None
+
+    
+# reverse engineer fcl pattern from a given form, with a sound trigram root
+# one more condition: each of the root letters occurs exactly once
+# returns None implicitly when either condition fails
+# TODO: better use the given root of the lex entry
+def get_sound_fcl_pattern(s):
+    if root := get_sound_trigram_root(s):
+        # the old test len([c in s for c in root]) == 3 was always true
+        if len(set(root)) == 3:
+            p = list(s)
+            r = s.find(root[0])
+            p[r] = chr(0x641)  # first radical -> fa
+            r += s[r+1:].find(root[1]) + 1
+            p[r] = chr(0x639)  # second radical -> ain
+            r += s[r+1:].find(root[2]) + 1
+            p[r] = chr(0x644)  # third radical -> lam
+            return ''.join(p)
+    
+
+# Wikt uses vowel+shadda which is a Unicode normalization
+# GF uses shadda+vowel which is linguistically correct
+# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
+# unicodedata.normalize does this wrong, as noted by Ariel Gutman 
+# swaps fatha/damma/kasra + shadda directly, so Latin and other characters in s are left alone
+def reorder_shadda(s):
+    return s.replace('\u064e\u0651', '\u0651\u064e').replace('\u064f\u0651', '\u0651\u064f').replace('\u0650\u0651', '\u0651\u0650')
+
+
+# wrap change(s) in double quotes when cond(s) holds (word forms);
+def quote_if(s, cond=is_arabic, change=reorder_shadda):
+    # anything else (parameters) is returned as it is, unquoted
+    if not cond(s):
+        return s
+    return '"' + change(s) + '"'
+
+
+# for a string, change each string literal in "..." with a change function
+# leaving other characters as they are; print the result to stdout
+# a literal that is opened but not closed within s is printed unchanged
+def change_literals(s, change):
+    pieces = s.split('"')  # pieces at odd positions are inside quotes
+    last = len(pieces) - 1
+    out = []
+    for i, piece in enumerate(pieces):
+        if i % 2 == 0:
+            out.append(piece)
+        elif i < last:
+            out.append('"' + change(piece) + '"')
+        else:
+            # no closing quote: this text used to be dropped silently
+            out.append('"' + piece)
+    print(''.join(out), end='')
+
+
+# convert literals in stdin 'to' or 'from' Buckwalter
+if __name__ == '__main__':
+    import sys
+    converters = {'from': from_buckwalter, 'to': to_buckwalter}
+    if len(sys.argv) < 2 or sys.argv[1] not in converters:
+        # a missing mode used to raise IndexError, an unknown one printed nothing
+        sys.exit('usage: python3 arabic_utilities.py to|from <infile >outfile')
+    for line in sys.stdin:
+        change_literals(line, converters[sys.argv[1]])
+
+        
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 140852c7a..edfa69603 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -3,7 +3,7 @@ import json
 import sys
 import unicodedata
 import pgf
-
+from arabic_utilities import *
 
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
@@ -110,132 +110,6 @@ if MODE == 'error-analysis':
     for labverdict, n in sorted(list(evals.items())):
         print(labverdict, n)
 
-# https://en.wikipedia.org/wiki/Buckwalter_transliteration
-buckwalter_dict = {
-  0x621: "'",  # ء
-  0x622: '|',  # آ
-  0x623: '>',  # أ
-  0x624: '&',  # ؤ
-  0x625: '<',  # إ
-  0x626: '}',  # ئ
-  0x627: 'A',  # ا
-  0x628: 'b',  # ب
-  0x629: 'p',  # ة
-  0x62a: 't',  # ت
-  0x62b: 'v',  # ث
-  0x62c: 'j',  # ج
-  0x62d: 'H',  # ح
-  0x62e: 'x',  # خ
-  0x62f: 'd',  # د
-  0x630: '*',  # ذ
-  0x631: 'r',  # ر
-  0x632: 'z',  # ز
-  0x633: 's',  # س
-  0x634: '$',  # ش
-  0x635: 'S',  # ص
-  0x636: 'D',  # ض
-  0x637: 'T',  # ط
-  0x638: 'Z',  # ظ
-  0x639: 'E',  # ع
-  0x63a: 'g',  # غ
-  0x641: 'f',  # ف  
-  0x642: 'q',  # ق
-  0x643: 'k',  # ك
-  0x644: 'l',  # ل
-  0x645: 'm',  # م
-  0x646: 'n',  # ن
-  0x647: 'h',  # ه
-  0x648: 'w',  # و
-  0x649: 'Y',  # ى
-  0x64a: 'y',  # ي
-  0x64b: 'F',  # ً
-  0x64c: 'N',  # ٌ
-  0x64d: 'K',  # ٍ
-  0x64e: 'a',  # َ
-  0x64f: 'u',  # ُ
-  0x650: 'i',  # ِ
-  0x651: '~',  # ّ
-  0x652: 'o',  # ْ
-  0x670: '`',  # '
-  0x671: '{'   # ٱ
-  }
-
-
-buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
-
-arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
-
-sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
-
-def to_buckwalter(s):
-    return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
-
-
-def from_buckwalter(s):
-    return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
-
-
-def unvocalize(s):
-    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
-
-
-def drop_final_vowel(s):
-    if s[-1] in arabic_vowels:
-        return s[:-1]
-    else:
-        return s
-
-
-def is_arabic(s):
-    return s and any(1574 <= ord(c) <= 1616 for c in s)
-
-def normal(s):
-    return unicodedata.normalize('NFD', s)
-
-# heuristic for finding the three radicals from certain forms
-# works only for sound (strong) 3-radical roots, otherwise None
-def get_sound_trigram_root(s):
-    sounds = [c for c in s if c in sound_consonants]
-    if len(sounds) == 3:
-        return ''.join(sounds)
-    else:
-        return None
-
-    
-# reverse engineer fcl pattern from a given form, with a sound trigram root
-# one more condition: each of the root letters occurs exactly ones
-# TODO: better use the given root of the lex entry
-def get_sound_fcl_pattern(s):
-    if root := get_sound_trigram_root(s):
-        if len([c in s for c in root]) == 3:
-            p = list(s)
-            r = s.find(root[0])
-            p[r] = chr(0x641)
-            r += s[r+1:].find(root[1]) + 1
-            p[r] = chr(0x639)
-            r += s[r+1:].find(root[2]) + 1
-            p[r] = chr(0x644)
-            p = ''.join(p)
-##            print('---PATT', s, root, p)
-            return p
-    
-
-# Wikt uses vowel+shadda which is a Unicode normalization
-# GF uses shadda+vowel which is linguistically correct
-# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
-# unicodedata.normalize does this wrong, as noted by Ariel Gutman 
-## todo: more direct implementation
-def reorder_shadda(s):
-    return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
-
-
-# quote word forms but not parameters
-def quote_if(s, cond=is_arabic, change=reorder_shadda):
-    if cond(s):
-        return '"' + change(s) + '"'
-    else:
-        return s
-
 
 # generate word_d_C functions starting with d=0, but show d only when >= 1
 def gf_fun(s, pos, disamb=0):
diff --git a/src/arabic/wiktionary/to_wordnet.py b/src/arabic/wiktionary/to_wordnet.py
index b159c5f18..2aae047db 100644
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -1,6 +1,8 @@
 import csv
 import json
 
+from arabic_utilities import *
+
 # to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
 # the following are assumed
 
@@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv'  # Zarzoura
 # built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 
-def is_arabic(s):
-    return s and any(1574 <= ord(c) <= 1616 for c in s)
-
-def get_arabic(s):
-    return ''.join([c for c in s if is_arabic(c)])
-
-def unvocalize(s):
-    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
-
 
 # fun 'دُبُ_N' : N ; -- 10 [['bear']]
 funmap = {}