started Hrv verbs and their Wiktionary extraction

2022-09-25 10:06:36 +02:00
parent 3eac1b9d0c
commit 7a0b1eed34
4 changed files with 118 additions and 44 deletions
--- a/src/croatian/ResHrv.gf
+++ b/src/croatian/ResHrv.gf
@@ -17,6 +17,13 @@ param
  Person = P1 | P2 | P3 ;
  VForm =
      VInf
    | VPres Number Person
    | VPastPart Gender Number
    ;
    ---- TODO aorist, imperfect
  Agr = Ag Gender Number Person ;
  CTense = CTPres | CTPast ; ----- TODO complete the tense system to match BCS verb morphology
@@ -414,30 +421,20 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      pgen    = velk + "ih" ;
      } ;
 {- 
 ---------------------
 -- Verbs
-- https://en.wikipedia.org/wiki/Slovak_language#Verbs
+-- Wiki
-  VerbForms : Type = {          ---- TODO more forms to add ?
+  VerbForms : Type = VForm => Str ;
    inf,
    pressg1, pressg2, pressg3,
    prespl1, prespl2, prespl3,
    pastpmasc, pastpfem, pastpneutr : Str
    } ;
  ComplementCase : Type = {s : Str ; c : Case ; hasPrep : Bool} ;
-  verbAgr : VerbForms -> Agr -> Bool -> Str   ---- TODO tenses
+  verbAgr : VerbForms -> Agr -> CTense -> Str   ---- TODO tenses
-    = \vf,a,b -> case a of {
+    = \vf,a,b -> case <a,b> of {
-      Ag _ Sg P1 => vf.pressg1 ;
+      <Ag _ n p, CTPres> => vf ! VPres n p ;
-      Ag _ Sg P2 => vf.pressg2 ;
+      <Ag g n _, CTPast> => vf ! VPastPart g n
      Ag _ Sg P3 => vf.pressg3 ;
      Ag _ Pl P1 => vf.prespl1 ;
      Ag _ Pl P2 => vf.prespl2 ;
      Ag _ Pl P3 => vf.prespl3
      } ;
-
+{-
  copulaVerbForms : VerbForms = {
    inf = "byť" ;
    pressg1 = "som" ;
@@ -463,29 +460,36 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
    pastpfem = "mala" ;
    pastpneutr = "malo" ;
    } ;
 -}
 -- just an example of a traditional paradigm
 ---- TODO other traditional paradigms
-  iii_kupovatVerbForms : Str -> VerbForms = \kupovat ->
+  aeiVerbForms : Str -> VerbForms = \citati ->  -- builds the whole VForm table from the infinitive
   let
-     kupo = Predef.tk 3 kupovat ;
+     cita = Predef.tk 2 citati ;  -- present stem: infinitive minus its last two characters ("ti" in čitati, raditi)
-     kupu = Predef.tk 1 kupo + "u"
+     u = case last cita of {  -- 3rd person plural ending, selected by the stem-final vowel
-   in
+        "a" => "aju" ;
-   {
+        "e" => "u" ;
-    inf = kupovat ;
+        "i" => "e"
-    pressg1 = kupu + "jem" ;
+        } ;  -- NOTE(review): stems ending in anything other than a/e/i have no branch here
-    pressg2 = kupu + "ješ" ;
+   in table {
-    pressg3 = kupu + "je" ;
+      VInf => citati ;
-    prespl1 = kupu + "jeme" ;
+      VPres Sg P1 => cita + "m" ;
-    prespl2 = kupu + "jete" ;
+      VPres Sg P2 => cita + "š" ;
-    prespl3 = kupu + "jú" ;
+      VPres Sg P3 => cita ;
-    pastpmasc = "kupoval" ;
+      VPres Pl P1 => cita + "mo" ;
-    pastpfem = "kupovala" ;
+      VPres Pl P2 => cita + "te" ;
-    pastpneutr = "kupovalo" ;    
+      VPres Pl P3 => init cita + u ;  -- drop the stem vowel, then attach the ending chosen above
      VPastPart (Masc _) Sg => cita + "o" ;
      VPastPart Fem Sg => cita + "la" ;
      VPastPart Neutr Sg => cita + "lo" ;
      VPastPart (Masc _) Pl => cita + "li" ;
      VPastPart Fem Pl => cita + "le" ;
      VPastPart Neutr Pl => cita + "la"
    } ;
-
+{-
 ---------------------------
 -- Pronouns
--- a/src/croatian/gold-test.txt
+++ b/src/croatian/gold-test.txt
@@ -552,3 +552,34 @@ s . Neutr => Pl => Acc => niska
 s . Neutr => Pl => Voc => niska
 s . Neutr => Pl => Loc => niskim
 s . Neutr => Pl => Ins => niskim
 VInf => čitati
 VPres Sg P1 => čitam
 VPres Sg P2 => čitaš
 VPres Sg P3 => čita
 VPres Pl P1 => čitamo
 VPres Pl P2 => čitate
 VPres Pl P3 => čitaju
 VPastPart (Masc Anim) Sg => čitao
 VPastPart (Masc Anim) Pl => čitali
 VPastPart (Masc Inanim) Sg => čitao
 VPastPart (Masc Inanim) Pl => čitali
 VPastPart Fem Sg => čitala
 VPastPart Fem Pl => čitale
 VPastPart Neutr Sg => čitalo
 VPastPart Neutr Pl => čitala
 VInf => raditi
 VPres Sg P1 => radim
 VPres Sg P2 => radiš
 VPres Sg P3 => radi
 VPres Pl P1 => radimo
 VPres Pl P2 => radite
 VPres Pl P3 => rade
 VPastPart (Masc Anim) Sg => radio
 VPastPart (Masc Anim) Pl => radili
 VPastPart (Masc Inanim) Sg => radio
 VPastPart (Masc Inanim) Pl => radili
 VPastPart Fem Sg => radila
 VPastPart Fem Pl => radile
 VPastPart Neutr Sg => radilo
 VPastPart Neutr Pl => radila
 aarnes-mbp-2:croatian aarne$ 
--- a/src/croatian/testHrv.gfs
+++ b/src/croatian/testHrv.gfs
@@ -28,3 +28,6 @@ cc -table -unqual adjFormsAdjective (velikA "mastan")
 cc -table -unqual adjFormsAdjective (velikA "gladan")
 cc -table -unqual adjFormsAdjective (velikA "nizak")
 cc -table -unqual aeiVerbForms ("čitati")
 cc -table -unqual aeiVerbForms ("raditi")
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -8,7 +8,7 @@ MYLANG = 'Serbo-Croatian'
 GENDERS = ['masculine', 'feminine', 'neuter']
-NOUN_CASES = {
+NOUN_FORMS = {
    'singular': {
        'nominative': 'snom',
        'genitive': 'sgen',
@@ -25,7 +25,7 @@ NOUN_CASES = {
        }
    }
-ADJ_CASES = {
+ADJ_FORMS = {
    'masculine': {
         'singular': {
            'nominative': 'msnom',
@@ -54,6 +54,33 @@ ADJ_CASES = {
        }
    }
 # Lookup table used by the 'verb' branch of get_forms.
 # It is nested three levels deep: first tag -> number tag -> person or gender
 # tag -> the key under which the matching form string is stored in the result.
 # A form is recorded only when its tag list contains one tag from each level.
 VERB_FORMS = {
    # Present tense: indexed by number, then by person.
    'present': {
        'singular': {
            'first-person': 'pres_sg_1',
            'second-person': 'pres_sg_2',
            'third-person': 'pres_sg_3'
            },
        'plural': {
            'first-person': 'pres_pl_1',
            'second-person': 'pres_pl_2',
            'third-person': 'pres_pl_3'
            }
        },
    # Participle: indexed by number, then by gender.
    # NOTE(review): the lookup only tests for the bare 'participle' tag, so if the
    # input carries more than one kind of participle with the same number and
    # gender tags, the one seen last overwrites the others -- confirm against the data.
    'participle': {
        'singular': {
            'masculine': 'ppart_masc_sg',
            'feminine': 'ppart_fem_sg',
            'neuter': 'ppart_neutr_sg'
            },
        'plural': {
            'masculine': 'ppart_masc_pl',
            'feminine': 'ppart_fem_pl',
            'neuter': 'ppart_neutr_pl'
            }
        }
    }
 def get_forms(pos, forms):
@@ -64,24 +91,33 @@ def get_forms(pos, forms):
                if g in f.get('tags', []):
                    dict['gender'] = g
            tags = f.get('tags', [])
-            for num in NOUN_CASES:
+            for num in NOUN_FORMS:
                if num in tags:
-                    for case in NOUN_CASES[num]:
+                    for case in NOUN_FORMS[num]:
                        if case in tags:
-                            dict[NOUN_CASES[num][case]] = f['form']
+                            dict[NOUN_FORMS[num][case]] = f['form']
    elif pos == 'adj':
        print(forms) 
        for f in forms:
            tags = f.get('tags', [])
            if 'positive' in tags and 'indefinite' in tags:
-                for g in ADJ_CASES:
+                for g in ADJ_FORMS:
                    if g in tags:
-                        for n in ADJ_CASES[g]:
+                        for n in ADJ_FORMS[g]:
                            if n in tags:
-                                for c in ADJ_CASES[g][n]:
+                                for c in ADJ_FORMS[g][n]:
                                    if c in tags:
-                                        dict[ADJ_CASES[g][n][c]] = f['form']
+                                        dict[ADJ_FORMS[g][n][c]] = f['form']
-        
+    elif pos == 'verb':
        for f in forms:
            tags = f.get('tags', [])
            for t in VERB_FORMS:
                if t in tags:
                    for n in VERB_FORMS[t]:
                        if n in tags:
                            for g in VERB_FORMS[t][n]:
                               if g in tags:
                                   dict[VERB_FORMS[t][n][g]] = f['form']
    else:
        dict['forms'] = forms[:10] ####
    return dict