From 14351f2767a447bbe96fd02b76229ffa51d8707d Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 11 Oct 2022 13:39:49 +0200
Subject: [PATCH] improved generation of Hrv lexicon from Wiktionary; still
 work in progress

---
 src/croatian/wiktionary/extract.py | 87 ++++++++++++++++++++++++++++--
 1 file changed, 83 insertions(+), 4 deletions(-)

diff --git a/src/croatian/wiktionary/extract.py b/src/croatian/wiktionary/extract.py
index d4f8c5604..7476f76d9 100644
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -14,6 +14,7 @@ REFLANG = 'English'
 MORPHO_OUTPUT_FILE = 'm.json'
 TRANS_OUTPUT_FILE = 't.json'
 
+MORPHO_FINAL_FILE = 'morpho-hr.json'  # input read by the 'gf' mode of main()
 
 GENDERS = ['masculine', 'feminine', 'neuter']
 
@@ -90,7 +91,36 @@ VERB_FORMS = {
         }
     }
 
+def unaccent(word):
+    """Return word with accent marks removed from vowels and r.
+
+    Works on precomposed and decomposed input alike: the word is
+    decomposed (NFD), the grave, acute, circumflex, macron, double-grave
+    and inverted-breve marks are dropped when their base letter is a
+    Latin vowel or r, and the result is recomposed (NFC).  Marks on any
+    other base are kept, so letters such as č, ć, š and ž survive, as do
+    accented Cyrillic letters.
+    """
+    # Local import: the module's import block is not part of this hunk.
+    import unicodedata
+
+    accents = set('\u0300\u0301\u0302\u0304\u030f\u0311')
+    bases = set('aeiouAEIOUrR')
+    kept = []
+    base = ''  # most recent non-combining character
+    for c in unicodedata.normalize('NFD', word):
+        if not unicodedata.combining(c):
+            base = c
+            kept.append(c)
+        elif c in accents and base in bases:
+            # Accent on a vowel or r: drop the mark instead of mapping
+            # it to a letter, which would insert a spurious character.
+            continue
+        else:
+            kept.append(c)
+    return unicodedata.normalize('NFC', ''.join(kept))
 
+cyrillic = 'ЀЈЉЊЋЍЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшыѐђјљњћѝџӣӯ'  # print_gf_code skips lemmas containing any of these
 
 def get_forms(pos, forms):
     dict = {}
@@ -104,7 +134,18 @@ def get_forms(pos, forms):
                 if num in tags:
                     for case in NOUN_FORMS[num]:
                         if case in tags:
-                            dict[NOUN_FORMS[num][case]] = f['form']
+                            dict[NOUN_FORMS[num][case]] = unaccent(f['form'])
+    elif pos == 'name':
+        for f in forms:
+            for g in GENDERS:
+                if g in f.get('tags', []):
+                    dict['gender'] = g
+            tags = f.get('tags', [])
+            for num in NOUN_FORMS:
+                if num in tags:
+                    for case in NOUN_FORMS[num]:
+                        if case in tags:
+                            dict[NOUN_FORMS[num][case]] = unaccent(f['form'])
     elif pos == 'adj':
         for f in forms:
             tags = f.get('tags', [])
@@ -115,7 +156,7 @@ def get_forms(pos, forms):
                             if n in tags:
                                 for c in ADJ_FORMS[g][n]:
                                     if c in tags:
-                                        dict[ADJ_FORMS[g][n][c]] = f['form']
+                                        dict[ADJ_FORMS[g][n][c]] = unaccent(f['form'])
     elif pos == 'verb':
         for f in forms:
             tags = f.get('tags', [])
@@ -125,10 +166,13 @@ def get_forms(pos, forms):
                         if n in tags:
                             for g in VERB_FORMS[t][n]:
                                if g in tags:
-                                   dict[VERB_FORMS[t][n][g]] = f['form']
+                                   dict[VERB_FORMS[t][n][g]] = unaccent(f['form'])
 
     else:
         dict['forms'] = forms[:10] ####
+        dict['status'] = 'NOFORMS-'+pos
+    if not dict:
+        dict['status'] = 'NOFORMS'
     return dict
 
 
@@ -145,7 +189,8 @@ def morpho(mylang, lines):
             if data.get('lang', '') == mylang and (
                   all([x in data for x in ['pos', 'word', 'forms']])):
                 word, info = lexinfo(data)
-                file.write(json.dumps({word: info})+'\n')
+                json.dump({word: info}, file, ensure_ascii=False)
+                file.write('\n')
 
 
 # write translations from reflang to mylang in t.json
@@ -164,7 +209,34 @@ def translations(mylang, reflang, lines):
                             'sense': t.get('sense')}
                         })+'\n')
 
+# write GF lexical entry
+def print_gf_code(data):
+    """Print GF 'fun' and 'lin' lines for a one-entry {lemma: info} dict.
+
+    Prints nothing if the lemma has a character from cyrillic or if
+    info['pos'] has no category in cats.
+    """
+    def prrec(fs, lemma):
+        # 'status' is bookkeeping; any NOFORMS* status means lemma only.
+        fields = [f + ' = "' + str(fs[f]) + '"' for f in fs if f != 'status']
+        if not fields or str(fs.get('status', '')).startswith('NOFORMS'):
+            return '"' + lemma + '"'
+        return '{' + ' ; '.join(fields) + '}'
 
+    cats = {'noun': 'N', 'adv': 'Adv', 'adj': 'A', 'verb': 'V'}
+    lemma = next(iter(data))
+    if any(c in cyrillic for c in lemma):
+        return
+    pos = data[lemma]['pos']
+    if pos not in cats:
+        return
+    cat = cats[pos]
+    fun = lemma + '_' + cat
+    print(' '.join(['fun', fun, ':', cat, ';']))
+    rec = prrec(data[lemma]['forms'], lemma)
+    print(' '.join(['lin', fun, '=', 'mk' + cat, rec, ';']))
+
+                    
 def main():
     if not sys.argv[1:]:
         print('usage: extract.py (morpho|trans) mylang reflang')
@@ -173,6 +245,13 @@ def main():
     mylang, reflang = MYLANG, REFLANG
     if sys.argv[3:]:
         mylang, reflang = sys.argv[2:]
+        
+    if mode == 'gf':
+        with open(MORPHO_FINAL_FILE, "r", encoding="utf-8") as lines:
+            for line in lines:
+                data = json.loads(line)
+                print_gf_code(data)
+            
     with open(WIKTIONARY_FILE, "r", encoding="utf-8") as lines:
             if mode == 'trans':
                 translations(mylang, reflang, lines)