First version of MorphoDictHrv, extracted from Wiktionary; TODO: make better use of the PN and V forms

2026-05-27 08:58:55 -06:00 · 2022-10-12 10:11:12 +02:00
parent 8755f9da65
commit c20e9b6383
6 changed files with 21937 additions and 14 deletions
--- a/src/croatian/NounHrv.gf
+++ b/src/croatian/NounHrv.gf
@@ -30,7 +30,7 @@ lin

    DefArt = {s = \\_,_,_ => []} ;
    IndefArt = {s = \\_,_,_ => []} ;
-    NumPl = {s = \\_,_ => [] ; size = NS_20_} ; ---- size
+    NumPl = {s = \\_,_ => [] ; size = NS_2_4} ; ---- size
    NumSg = {s = \\_,_ => [] ; size = NS_1} ;

    UsePron pron = {
--- a/src/croatian/ParadigmsHrv.gf
+++ b/src/croatian/ParadigmsHrv.gf
@@ -13,6 +13,8 @@ oper
    = Masc Anim ;
  mascInanimate : Gender
    = Masc Inanim ;
+  masculine : Gender
+    = Masc Inanim ;
  feminine : Gender
    = Fem ;
  neuter : Gender
@@ -132,12 +134,27 @@ oper
 	  compar = velikA comp ;
 	  superl = superlAForms (velikA comp)
 	  } ;
+    mkA : (posit : AForms) -> (compar : Str) -> A
+      = \posit,compar -> lin A {
+          posit = posit ;
+	  compar = velikA compar ;
+	  superl = superlAForms (velikA compar)
+	  } ;
    mkA : (posit, compar : AForms) -> A
      = \posit,compar -> lin A {
          posit = posit ;
 	  compar = compar ;
 	  superl = superlAForms compar
 	  } ;
+    mkA : (posit : AForms) -> A
+      = \posit ->
+          let
+            compar = regComparAForms posit
+          in lin A {
+            posit = posit ;
+	    compar = compar ;
+	    superl = superlAForms compar
+	    } ;
    } ;

  invarA : Str -> A
--- a/src/croatian/ResHrv.gf
+++ b/src/croatian/ResHrv.gf
@@ -338,12 +338,12 @@ voicing : Str -> Str = \s -> case s of {
    msins : Str ;        -- nsins, pdat, ploc, pins = msins
    fsins : Str ;               -- no o/e variation like in msdat
    mpnom : Str ;               -- mpvoc = mpnom
-    pgen : Str ;                --
+    mpgen : Str ;               --
    } ;

 invarAdjForms : Str -> AdjForms = \s -> {
    msnom, fsnom, nsnom, msgen, fsgen, msdat,
-    fsdat, fsacc, msins, fsins, mpnom, pgen = s ;
+    fsdat, fsacc, msins, fsins, mpnom, mpgen = s ;
    } ;

 -- used in PositA but will also work in Compar and Superl by calling their record fields
@@ -368,7 +368,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      | <Pl,Dat|Loc|Ins, _>      => afs.msins ;
    <Sg, Ins, Fem>               => afs.fsins ;
    <Pl, Nom|Voc, Masc _>        => afs.mpnom ;
-    <Pl, Gen,_>                  => afs.pgen
+    <Pl, Gen,_>                  => afs.mpgen
    }
    } ;

@@ -395,7 +395,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      msins   = velk + "im" ;
      fsins   = velk + "om" ;
      mpnom   = velk + "i" ;
-      pgen    = velk + "ih" ;
+      mpgen   = velk + "ih" ;
      } ;

  regComparAForms : AdjForms -> AdjForms
--- a/src/croatian/wiktionary/MorphoDictHrv.gf
+++ b/src/croatian/wiktionary/MorphoDictHrv.gf
--- a/src/croatian/wiktionary/MorphoDictHrvAbs.gf
+++ b/src/croatian/wiktionary/MorphoDictHrvAbs.gf
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -40,13 +40,11 @@ ADJ_FORMS = {
         'singular': {
            'nominative': 'msnom',
            'genitive': 'msgen',
-            'dative': 'msdat',
-            'locative': 'msloc',
-            'instrumental': 'msins'
+            'dative': 'msdat'
            },
        'plural': {
            'nominative': 'mpnom',
-            'genitive': 'pgen'
+            'genitive': 'mpgen'
            }
        },
    'feminine': {
@@ -54,7 +52,8 @@ ADJ_FORMS = {
            'nominative': 'fsnom',
            'genitive': 'fsgen',
            'dative': 'fsdat',
-            'accusative': 'fsacc'
+            'accusative': 'fsacc',
+            'instrumental': 'fsins'
            }
        },
    'neuter': {
@@ -122,7 +121,7 @@ def unaccent(word):

 cyrillic = 'ЀЈЉЊЋЍЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшыѐђјљњћѝџӣӯ'

-def get_forms(pos, forms):
+def get_forms(pos, forms, word):
    dict = {}
    if pos == 'noun':
        for f in forms:
@@ -157,6 +156,9 @@ def get_forms(pos, forms):
                                for c in ADJ_FORMS[g][n]:
                                    if c in tags:
                                        dict[ADJ_FORMS[g][n][c]] = unaccent(f['form'])
+            elif all([t in tags for t in [
+                  'comparative', 'masculine', 'singular', 'nominative']]):
+                dict['cmsnom'] = unaccent(f['form'])
    elif pos == 'verb':
        for f in forms:
            tags = f.get('tags', [])
@@ -167,6 +169,8 @@ def get_forms(pos, forms):
                            for g in VERB_FORMS[t][n]:
                               if g in tags:
                                   dict[VERB_FORMS[t][n][g]] = unaccent(f['form'])
+        if dict:
+            dict['infin'] = unaccent(word)

    else:
        dict['forms'] = forms[:10] ####
@@ -178,7 +182,7 @@ def get_forms(pos, forms):

 def lexinfo(data):
    return data['word'], {
-        'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'])}
+        'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'], data['word'])}


 # write morphology of mylang in m.json
@@ -219,7 +223,7 @@ def print_gf_code(data, i):
    cats = {
        'name': ('PN', 7),
        'noun': ('N', 11),
-        'adj': ('A', 12),
+        'adj': ('A', 13),
        'verb': ('V', 12)
        }    
    pos = data[lemma]['pos']
@@ -230,7 +234,10 @@ def print_gf_code(data, i):
        else:
            s = '{'
            for f in fs:
-                s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
+                if f == 'gender':
+                    s += f + ' = P.' + str(fs[f]) + ' ; '
+                else:
+                    s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
            return s[:-3] + '}'  # removing last ;
        
    if pos in cats: