From 3c0adada11f9c3055dedb5a26ef4d483585f8a15 Mon Sep 17 00:00:00 2001
From: aarneranta <aarne@chalmers.se>
Date: Wed, 13 Sep 2023 15:29:28 +0200
Subject: [PATCH] new function in ParadigmsAra to deal with Wiktionary data;
 lots of untested guesses

---
 src/arabic/ParadigmsAra.gf               | 67 ++++++++++++++++++++++++
 src/arabic/wiktionary/read_wiktionary.py | 64 +++++++++++++++-------
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf
index ce479f94..20892fed 100644
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -868,4 +868,71 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
 param VerbForm =
   FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
 
+-- paradigms for Wiktionary extraction
+---- TODO: better usage of information in Wiktionary
+
+oper
+  wmkN = overload {
+    wmkN : {sg, pl : Str ; g : Gender} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;  --- hum/nohum not in Wikt
+    wmkN : {sg : Str} -> N
+      = \r -> smartN r.sg ; 
+    wmkN : {sg : Str ; g : Gender ; root : Str} -> N
+      = \r -> smartN r.sg ** {g = r.g} ; ----
+    wmkN : {sg : Str; g : Gender} -> N
+      = \r -> smartN r.sg ** {g = r.g} ;
+    wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
+      = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
+    wmkN : {sg : Str; pl : Str} -> N
+      = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
+    wmkN : {sg : Str; root : Str} -> N 
+      = \r -> smartN r.sg ; 
+    } ;
+
+  wmkA = overload {
+    wmkA : {root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str; masc_pl : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    mkA : {masc_sg : Str; root : Str} -> A
+      = \r -> mkA r.root ;
+    mkA : {masc_sg : Str} -> A
+      = \r -> mkA r.masc_sg ; ----
+    } ;
+
+  wmkV = overload {
+    wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; cls : VerbForm} -> V
+      = \r -> mkV r.perfect r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
+      = \r -> mkV r.root r.cls ; ----
+    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
+      = \r -> variants {} ; ---- mkV r.imperfect ; ----
+    wmkV : {root : Str ; cls : VerbForm} -> V
+      = \r -> mkV r.root r.cls ;
+    wmkV : {imperfect : Str} -> V
+      = \r -> variants {} ; ---- mkV r.imperfect ;
+    } ;
+
 } ;
diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py
index 49d3a3c1..ea8d805f 100644
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -1,7 +1,22 @@
 import gzip
 import json
+import sys
 
+# data from https://kaikki.org/dictionary/rawdata.html
+# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
+# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. 
+
+if not sys.argv[1:]:
+    print('usage: read_wiktionary (raw | gf-cnc | gf-abs)')
+    exit()
+
+MODE = sys.argv[1]  # 
+
+# step 1: extract data from this file using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+
+# the following file is generated.
+# in the sequel, use this file with gf-abs or gf-cnc option
 FILTERED_WIKT = 'wikt_arabic.jsonl'
 
 
@@ -14,11 +29,12 @@ def get_gzip_json(file, sample=100000, langs=[]):
                 obj = json.loads(line)
                 if obj.get('lang', None) in langs:
                     print(line.decode("utf-8"))
-        print(n)
+#        print(n)
 
+if MODE == 'raw':
+    get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
 
-# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])  
-# python3 read_wiktionary.py >wikt_arabic.jsonl
+# python3 read_wiktionary.py raw >wikt_arabic.jsonl
 
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
@@ -80,6 +96,12 @@ def unvocalize(s):
 def is_arabic(s):
     return s and any(1574 <= ord(c) <= 1616 for c in s)
 
+# quote forms but not parameters
+def quote_if(s, cond=is_arabic):
+    if cond(s):
+        return '"' + s + '"'
+    else:
+        return s
 
 def gf_fun(s, pos, disamb=0):
     discrim = '_' + str(disamb) if disamb else ''
@@ -99,8 +121,8 @@ def forms_for_pos(obj):
                          if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
         plural = [form[:-1] for form, descr in forms
                          if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
-        gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories']
-                            else (['Masc'] if  'Arabic masculine nouns' in obj['categories']
+        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
+                            else (['masc'] if  'Arabic masculine nouns' in obj['categories']
                                   else []))
         gf_entry = {
             'cat': 'N',
@@ -122,15 +144,20 @@ def forms_for_pos(obj):
               'perfect': lemma, 
               'imperfect': [form for form, descr in forms
                       if all([w in descr for
-                              w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
-              'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', '']
-                            if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
-                           key=len)]
+                              w in [
+                                  "active", "indicative", "masculine", "non-past",
+                                  "imperfective", "singular", "third-person"]])][:1],
+              'cls': ['Form' + max([n for n in [
+                  'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
+                            if n in ' '.join([c for c in obj['categories']
+                                if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
+                           key=len)]  # max in RGL is XI, in Wikt XIII
               }
           }
     elif obj['pos'] == 'adj':
         lemma = [form for form, descr in forms
-                         if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
+                    if all([w in descr for w in [
+                        'indefinite', 'masculine', 'singular', 'informal']])][:1]
         gf_entry = {
             'cat': 'A',
             'lemma': lemma,
@@ -150,8 +177,9 @@ def forms_for_pos(obj):
         
     if 'lemma' in gf_entry and gf_entry['lemma']:
         gf_entry['lemma'] = gf_entry['lemma'][0]
-        gf_entry['args']['root'] = obj['root']
-        args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x]
+        if obj['root']:
+            gf_entry['args']['root'] = obj['root']
+        args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
         gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' 
 
     return gf_entry
@@ -160,19 +188,19 @@ def forms_for_pos(obj):
 def find_root(s):
     return ''.join([c for c in s if is_arabic(c)])
     
-import sys
-MODE = sys.argv[1]
-
 if MODE == 'gf-abs':
     print('abstract MorphoDictAraAbs = Cat ** {')    
 if MODE == 'gf-cnc':
     print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') 
 
-
-with open(FILTERED_WIKT) as file:
+if MODE != 'raw':
+  with open(FILTERED_WIKT) as file:
     seen_gf_funs = {}
     for line in file:
-        obj = json.loads(line)
+        try:
+            obj = json.loads(line)
+        except:
+            continue
         root = [find_root(t['expansion']) for
                 t in obj.get('etymology_templates', []) if
                 t.get('name', None) =='ar-root'][:1]