First version of MorphoDictHrv extracted from Wiktionary; TODO: make better use of PN and V forms

2022-10-12 10:11:12 +02:00
parent 8755f9da65
commit c20e9b6383
6 changed files with 21937 additions and 14 deletions
--- a/src/croatian/NounHrv.gf
+++ b/src/croatian/NounHrv.gf
@@ -30,7 +30,7 @@ lin
    DefArt = {s = \\_,_,_ => []} ;
    IndefArt = {s = \\_,_,_ => []} ;
-    NumPl = {s = \\_,_ => [] ; size = NS_20_} ; ---- size
+    NumPl = {s = \\_,_ => [] ; size = NS_2_4} ; ---- size
    NumSg = {s = \\_,_ => [] ; size = NS_1} ;
    UsePron pron = {
--- a/src/croatian/ParadigmsHrv.gf
+++ b/src/croatian/ParadigmsHrv.gf
@@ -13,6 +13,8 @@ oper
    = Masc Anim ;
  mascInanimate : Gender
    = Masc Inanim ;
  masculine : Gender
    = Masc Inanim ;
  feminine : Gender
    = Fem ;
  neuter : Gender
@@ -132,12 +134,27 @@ oper
 	  compar = velikA comp ;
 	  superl = superlAForms (velikA comp)
 	  } ;
    mkA : (posit : AForms) -> (compar : Str) -> A
      = \posit,compar -> lin A {
          posit = posit ;
 	  compar = velikA compar ;
 	  superl = superlAForms (velikA compar)
 	  } ;
    mkA : (posit, compar : AForms) -> A
      = \posit,compar -> lin A {
          posit = posit ;
 	  compar = compar ;
 	  superl = superlAForms compar
 	  } ;
    mkA : (posit : AForms) -> A
      = \posit ->
          let
            compar = regComparAForms posit
          in lin A {
            posit = posit ;
 	    compar = compar ;
 	    superl = superlAForms compar
 	    } ;
    } ;
  invarA : Str -> A
--- a/src/croatian/ResHrv.gf
+++ b/src/croatian/ResHrv.gf
@@ -338,12 +338,12 @@ voicing : Str -> Str = \s -> case s of {
    msins : Str ;        -- nsins, pdat, ploc, pins = msins
    fsins : Str ;               -- no o/e variation like in msdat
    mpnom : Str ;               -- mpvoc = mpnom
-    pgen : Str ;                --
+    mpgen : Str ;               --
    } ;
 invarAdjForms : Str -> AdjForms = \s -> {
    msnom, fsnom, nsnom, msgen, fsgen, msdat,
-    fsdat, fsacc, msins, fsins, mpnom, pgen = s ;
+    fsdat, fsacc, msins, fsins, mpnom, mpgen = s ;
    } ;
 -- used in PositA but will also work in Compar and Superl by calling their record fields
@@ -368,7 +368,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      | <Pl,Dat|Loc|Ins, _>      => afs.msins ;
    <Sg, Ins, Fem>               => afs.fsins ;
    <Pl, Nom|Voc, Masc _>        => afs.mpnom ;
-    <Pl, Gen,_>                  => afs.pgen
+    <Pl, Gen,_>                  => afs.mpgen
    }
    } ;
@@ -395,7 +395,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      msins   = velk + "im" ;
      fsins   = velk + "om" ;
      mpnom   = velk + "i" ;
-      pgen    = velk + "ih" ;
+      mpgen   = velk + "ih" ;
      } ;
  regComparAForms : AdjForms -> AdjForms
--- a/src/croatian/wiktionary/MorphoDictHrv.gf
+++ b/src/croatian/wiktionary/MorphoDictHrv.gf
--- a/src/croatian/wiktionary/MorphoDictHrvAbs.gf
+++ b/src/croatian/wiktionary/MorphoDictHrvAbs.gf
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -40,13 +40,11 @@ ADJ_FORMS = {
         'singular': {
            'nominative': 'msnom',
            'genitive': 'msgen',
-            'dative': 'msdat',
+            'dative': 'msdat'
            'locative': 'msloc',
            'instrumental': 'msins'
            },
        'plural': {
            'nominative': 'mpnom',
-            'genitive': 'pgen'
+            'genitive': 'mpgen'
            }
        },
    'feminine': {
@@ -54,7 +52,8 @@ ADJ_FORMS = {
            'nominative': 'fsnom',
            'genitive': 'fsgen',
            'dative': 'fsdat',
-            'accusative': 'fsacc'
+            'accusative': 'fsacc',
            'instrumental': 'fsins'
            }
        },
    'neuter': {
@@ -122,7 +121,7 @@ def unaccent(word):
 cyrillic = 'ЀЈЉЊЋЍЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшыѐђјљњћѝџӣӯ'
-def get_forms(pos, forms):
+def get_forms(pos, forms, word):
    dict = {}
    if pos == 'noun':
        for f in forms:
@@ -157,6 +156,9 @@ def get_forms(pos, forms):
                                for c in ADJ_FORMS[g][n]:
                                    if c in tags:
                                        dict[ADJ_FORMS[g][n][c]] = unaccent(f['form'])
            elif all([t in tags for t in [
                  'comparative', 'masculine', 'singular', 'nominative']]):
                dict['cmsnom'] = unaccent(f['form'])
    elif pos == 'verb':
        for f in forms:
            tags = f.get('tags', [])
@@ -167,6 +169,8 @@ def get_forms(pos, forms):
                            for g in VERB_FORMS[t][n]:
                               if g in tags:
                                   dict[VERB_FORMS[t][n][g]] = unaccent(f['form'])
        if dict:
            dict['infin'] = unaccent(word)
    else:
        dict['forms'] = forms[:10] ####
@@ -178,7 +182,7 @@ def get_forms(pos, forms):
 def lexinfo(data):
    return data['word'], {
-        'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'])}
+        'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'], data['word'])}
 # write morphology of mylang in m.json
@@ -219,7 +223,7 @@ def print_gf_code(data, i):
    cats = {
        'name': ('PN', 7),
        'noun': ('N', 11),
-        'adj': ('A', 12),
+        'adj': ('A', 13),
        'verb': ('V', 12)
        }    
    pos = data[lemma]['pos']
@@ -230,7 +234,10 @@ def print_gf_code(data, i):
        else:
            s = '{'
            for f in fs:
-                s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
+                if f == 'gender':
                    s += f + ' = P.' + str(fs[f]) + ' ; '
                else:
                    s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
            return s[:-3] + '}'  # removing last ;
    if pos in cats: