From 8755f9da655330d4f9799c65ee1d1453c1ac37bd Mon Sep 17 00:00:00 2001
From: Aarne Ranta <aarne@chalmers.se>
Date: Tue, 11 Oct 2022 17:43:36 +0200
Subject: [PATCH] Hrv: generating morpholex from wiktionary, in progress

---
 src/croatian/wiktionary/extract.py | 43 +++++++++++++++++-------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/croatian/wiktionary/extract.py b/src/croatian/wiktionary/extract.py
index 7476f76d..de3745ff 100644
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -210,29 +210,36 @@ def translations(mylang, reflang, lines):
                         })+'\n')
 
 # write GF lexical entry
-def print_gf_code(data):
+def print_gf_code(data, i):
+    """Print the GF 'fun' and 'lin' lines for one {lemma: entry} dict; i is embedded in the fun name."""
+    lemma = list(data.keys())[0]
+    if any([c in cyrillic for c in lemma]):  # skip lemmas with any character from cyrillic
+        return
     
-    def prrec(fs, lemma):
-        if fs.get('status') == 'NOFORMS':
+    cats = {  # pos -> (GF category, expected number of forms)
+        'name': ('PN', 7),
+        'noun': ('N', 11),
+        'adj': ('A', 12),
+        'verb': ('V', 12)
+        }    
+    pos = data[lemma]['pos']
+    # prrec renders fs as a '{f = "v" ; ...}' string, or as the quoted lemma when fs is unusable
+    def prrec(fs, lemma, expected):
+        if fs.get('status') == 'NOFORMS' or len(fs) != expected:  
             return '"' + lemma + '"'
         else:
             s = '{'
             for f in fs:
                 s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
             return s[:-3] + '}'  # removing last ;
-
-    cats = {'noun': 'N', 'adv': 'Adv', 'adj': 'A', 'verb': 'V'}
-    
-    lemma = list(data.keys())[0]
-    
-    if any([c in cyrillic for c in lemma]):
-        return
-    
-    if data[lemma]['pos'] in cats:
-        cat = cats[data[lemma]['pos']]
-        fun =  lemma + '_' + cat
-        print(' '.join(['fun', fun, ':', cat, ';']))
-        print(' '.join(['lin', fun, '=', 'mk'+cat, prrec(data[lemma]['forms'], lemma),';']))
+    # emit only for a pos listed in cats whose number of forms equals the expected count
+    if pos in cats:
+        cat, expected = cats[pos]
+        fun =  "'" + lemma + '_' + str(i) + '_' + cat + "'"
+        if len(data[lemma]['forms']) == expected:
+            print(' '.join(['fun', fun, ':', cat, ';']))
+            print(' '.join(['lin', fun, '=',
+                  'mk'+cat, prrec(data[lemma]['forms'], lemma, expected),';']))
     else:
         pass
 
@@ -248,9 +255,9 @@ def main():
         
     if mode == 'gf':
         with open(MORPHO_FINAL_FILE, "r", encoding="utf-8") as lines:
-            for line in lines:
+            for i, line in enumerate(lines):  # 0-based line index, no cap on the number of entries
                 data = json.loads(line)
-                print_gf_code(data)
+                print_gf_code(data, i)
             
     with open(WIKTIONARY_FILE, "r", encoding="utf-8") as lines:
             if mode == 'trans':