started Hrv verbs and their Wiktionary extraction

2026-05-27 17:08:54 -06:00 · 2022-09-25 10:06:36 +02:00
parent 3eac1b9d0c
commit 7a0b1eed34
4 changed files with 118 additions and 44 deletions
--- a/src/croatian/ResHrv.gf
+++ b/src/croatian/ResHrv.gf
@@ -17,6 +17,13 @@ param

  Person = P1 | P2 | P3 ;

+  VForm =                        -- inflectional forms stored for each verb
+      VInf                       -- infinitive
+    | VPres Number Person        -- present tense, by number and person
+    | VPastPart Gender Number    -- past participle, by gender and number
+    ;
+    ---- TODO aorist, imperfect
+
  Agr = Ag Gender Number Person ;

  CTense = CTPres | CTPast ; ----- TODO complete the tense system to match BCS verb morphology
@@ -414,30 +421,20 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
      pgen    = velk + "ih" ;
      } ;

-{- 
 ---------------------
 -- Verbs
-- https://en.wikipedia.org/wiki/Slovak_language#Verbs
+-- Reference: Wiktionary / Wikipedia on Serbo-Croatian verbs (TODO: add the exact URL)

-  VerbForms : Type = {          ---- TODO more forms to add ?
-    inf,
-    pressg1, pressg2, pressg3,
-    prespl1, prespl2, prespl3,
-    pastpmasc, pastpfem, pastpneutr : Str
-    } ;
+  VerbForms : Type = VForm => Str ;   -- inflection table: one string for every VForm

  ComplementCase : Type = {s : Str ; c : Case ; hasPrep : Bool} ;

-  verbAgr : VerbForms -> Agr -> Bool -> Str   ---- TODO tenses
-    = \vf,a,b -> case a of {
-      Ag _ Sg P1 => vf.pressg1 ;
-      Ag _ Sg P2 => vf.pressg2 ;
-      Ag _ Sg P3 => vf.pressg3 ;
-      Ag _ Pl P1 => vf.prespl1 ;
-      Ag _ Pl P2 => vf.prespl2 ;
-      Ag _ Pl P3 => vf.prespl3
+  verbAgr : VerbForms -> Agr -> CTense -> Str   -- verb form selected by agreement and tense ---- TODO more tenses
+    = \forms,agr,tense -> case <tense,agr> of {
+      <CTPres, Ag _ num pers> => forms ! VPres num pers ;    -- present: number and person
+      <CTPast, Ag gen num _>  => forms ! VPastPart gen num   -- past: participle by gender and number
       } ;
-
+{-
  copulaVerbForms : VerbForms = {
    inf = "byť" ;
    pressg1 = "som" ;
@@ -463,29 +460,36 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
    pastpfem = "mala" ;
    pastpneutr = "malo" ;
    } ;
+-}

 -- just an example of a traditional paradigm
 ---- TODO other traditional paradigms

-  iii_kupovatVerbForms : Str -> VerbForms = \kupovat ->
+  aeiVerbForms : Str -> VerbForms = \citati ->   -- paradigm built from an infinitive in a/e/i + "ti", e.g. "čitati", "raditi"
   let
-     kupo = Predef.tk 3 kupovat ;
-     kupu = Predef.tk 1 kupo + "u"
-   in
-   {
-    inf = kupovat ;
-    pressg1 = kupu + "jem" ;
-    pressg2 = kupu + "ješ" ;
-    pressg3 = kupu + "je" ;
-    prespl1 = kupu + "jeme" ;
-    prespl2 = kupu + "jete" ;
-    prespl3 = kupu + "jú" ;
-    pastpmasc = "kupoval" ;
-    pastpfem = "kupovala" ;
-    pastpneutr = "kupovalo" ;    
+     cita = Predef.tk 2 citati ;   -- stem: the infinitive without its last two characters ("ti")
+     u = case last cita of {       -- 3rd person plural ending, chosen by the stem-final vowel
+        "a" => "aju" ;
+        "e" => "u" ;
+        "i" => "e"
+        } ;                        ---- NOTE: no fallback branch; any other stem-final letter is unmatched
+   in table {
+      VInf => citati ;
+      VPres Sg P1 => cita + "m" ;
+      VPres Sg P2 => cita + "š" ;
+      VPres Sg P3 => cita ;
+      VPres Pl P1 => cita + "mo" ;
+      VPres Pl P2 => cita + "te" ;
+      VPres Pl P3 => init cita + u ;   -- ending replaces the stem vowel: čita -> čitaju, radi -> rade
+      VPastPart (Masc _) Sg => cita + "o" ;
+      VPastPart Fem Sg => cita + "la" ;
+      VPastPart Neutr Sg => cita + "lo" ;
+      VPastPart (Masc _) Pl => cita + "li" ;
+      VPastPart Fem Pl => cita + "le" ;
+      VPastPart Neutr Pl => cita + "la"
    } ;

-
+{-
 ---------------------------
 -- Pronouns

--- a/src/croatian/gold-test.txt
+++ b/src/croatian/gold-test.txt
@@ -552,3 +552,34 @@ s . Neutr => Pl => Acc => niska
 s . Neutr => Pl => Voc => niska
 s . Neutr => Pl => Loc => niskim
 s . Neutr => Pl => Ins => niskim
+VInf => čitati
+VPres Sg P1 => čitam
+VPres Sg P2 => čitaš
+VPres Sg P3 => čita
+VPres Pl P1 => čitamo
+VPres Pl P2 => čitate
+VPres Pl P3 => čitaju
+VPastPart (Masc Anim) Sg => čitao
+VPastPart (Masc Anim) Pl => čitali
+VPastPart (Masc Inanim) Sg => čitao
+VPastPart (Masc Inanim) Pl => čitali
+VPastPart Fem Sg => čitala
+VPastPart Fem Pl => čitale
+VPastPart Neutr Sg => čitalo
+VPastPart Neutr Pl => čitala
+VInf => raditi
+VPres Sg P1 => radim
+VPres Sg P2 => radiš
+VPres Sg P3 => radi
+VPres Pl P1 => radimo
+VPres Pl P2 => radite
+VPres Pl P3 => rade
+VPastPart (Masc Anim) Sg => radio
+VPastPart (Masc Anim) Pl => radili
+VPastPart (Masc Inanim) Sg => radio
+VPastPart (Masc Inanim) Pl => radili
+VPastPart Fem Sg => radila
+VPastPart Fem Pl => radile
+VPastPart Neutr Sg => radilo
+VPastPart Neutr Pl => radila
+aarnes-mbp-2:croatian aarne$ 
--- a/src/croatian/testHrv.gfs
+++ b/src/croatian/testHrv.gfs
@@ -28,3 +28,6 @@ cc -table -unqual adjFormsAdjective (velikA "mastan")
 cc -table -unqual adjFormsAdjective (velikA "gladan")
 cc -table -unqual adjFormsAdjective (velikA "nizak")

+cc -table -unqual aeiVerbForms ("čitati")
+cc -table -unqual aeiVerbForms ("raditi")
+
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -8,7 +8,7 @@ MYLANG = 'Serbo-Croatian'

 GENDERS = ['masculine', 'feminine', 'neuter']

-NOUN_CASES = {
+NOUN_FORMS = {
    'singular': {
        'nominative': 'snom',
        'genitive': 'sgen',
@@ -25,7 +25,7 @@ NOUN_CASES = {
        }
    }

-ADJ_CASES = {
+ADJ_FORMS = {
    'masculine': {
         'singular': {
            'nominative': 'msnom',
@@ -54,6 +54,33 @@ ADJ_CASES = {
        }
    }

+VERB_FORMS = {  # form-type tag -> number tag -> person/gender tag -> output key written by get_forms
+    'present': {  # present tense: innermost level is keyed by person tag
+        'singular': {
+            'first-person': 'pres_sg_1',
+            'second-person': 'pres_sg_2',
+            'third-person': 'pres_sg_3'
+            },
+        'plural': {
+            'first-person': 'pres_pl_1',
+            'second-person': 'pres_pl_2',
+            'third-person': 'pres_pl_3'
+            }
+        },
+    'participle': {  # innermost level is keyed by gender tag. NOTE(review): any form tagged 'participle' matches; confirm different participle types cannot overwrite each other
+        'singular': {
+            'masculine': 'ppart_masc_sg',
+            'feminine': 'ppart_fem_sg',
+            'neuter': 'ppart_neutr_sg'
+            },
+        'plural': {
+            'masculine': 'ppart_masc_pl',
+            'feminine': 'ppart_fem_pl',
+            'neuter': 'ppart_neutr_pl'
+            }
+        }
+    }
+


 def get_forms(pos, forms):
@@ -64,23 +91,32 @@ def get_forms(pos, forms):
                if g in f.get('tags', []):
                    dict['gender'] = g
            tags = f.get('tags', [])
-            for num in NOUN_CASES:
+            for num in NOUN_FORMS:
                if num in tags:
-                    for case in NOUN_CASES[num]:
+                    for case in NOUN_FORMS[num]:
                        if case in tags:
-                            dict[NOUN_CASES[num][case]] = f['form']
+                            dict[NOUN_FORMS[num][case]] = f['form']
    elif pos == 'adj':
-        print(forms) 
        for f in forms:
            tags = f.get('tags', [])
            if 'positive' in tags and 'indefinite' in tags:
-                for g in ADJ_CASES:
+                for g in ADJ_FORMS:
                    if g in tags:
-                        for n in ADJ_CASES[g]:
+                        for n in ADJ_FORMS[g]:
                            if n in tags:
-                                for c in ADJ_CASES[g][n]:
+                                for c in ADJ_FORMS[g][n]:
                                    if c in tags:
-                                        dict[ADJ_CASES[g][n][c]] = f['form']
+                                        dict[ADJ_FORMS[g][n][c]] = f['form']
+    elif pos == 'verb':
+        for f in forms:
+            tags = f.get('tags', [])
+            for t in VERB_FORMS:
+                if t in tags:
+                    for n in VERB_FORMS[t]:
+                        if n in tags:
+                            for g in VERB_FORMS[t][n]:
+                               if g in tags:
+                                   dict[VERB_FORMS[t][n][g]] = f['form']

    else:
        dict['forms'] = forms[:10] ####