LangHrv compiles now, but with a partial Slovak lexicon

2026-05-27 08:58:55 -06:00 · 2022-09-28 11:29:03 +02:00
parent 7c2c519e50
commit 13fac41ce6
10 changed files with 81 additions and 146 deletions
--- a/src/croatian/CatHrv.gf
+++ b/src/croatian/CatHrv.gf
@@ -22,9 +22,9 @@ concrete CatHrv of Cat =

    VP = {verb : VerbForms ; clit,compl : Agr => Str} ; ---- more fields probably needed
    VPSlash = {verb : VerbForms ; clit,compl : Agr => Str ; c : ComplementCase} ; ----
-    V  = ResHrv.VerbForms ;
-    V2 = ResHrv.VerbForms ** {c : ComplementCase} ;
-    VS,VQ  = ResHrv.VerbForms ;
+    V  = {s : VerbForms} ;
+    V2 = {s : VerbForms ; c : ComplementCase} ;
+    VS,VQ  = {s : VerbForms} ;

    A  = ResHrv.AdjForms ;
    AP = ResHrv.Adjective ** {isPost : Bool} ; -- {s : Gender => Number => Case => Str}
@@ -32,7 +32,7 @@ concrete CatHrv of Cat =
    
    AdA = {s : Str} ;

-    N  = ResHrv.NounForms ;
+    N  = ResHrv.NounForms ** {g : Gender} ;
    CN = ResHrv.Noun ;      -- {s : Number => Case => Str ; g : Gender}
    NP = {s,clit,prep : Case => Str ; a : Agr ; hasClit : Bool} ; -- clit,prep differ for pronouns
    PN = {s : Case => Str ; g : Gender} ; 
@@ -40,7 +40,7 @@ concrete CatHrv of Cat =
    Quant = {s : Gender => Number => Case => Str} ; -- same as AP
    Num = Determiner ;
    Card = Determiner ; -- {s : Gender => Case => Str ; size : NumSize} ; 
-    Pron = PronForms ** {poss : DemPronForms} ;
+    Pron = PronForms ** {poss : AdjForms} ;

    Adv  = {s : Str} ;
    Prep = ResHrv.ComplementCase ; -- {s : Str ; c : Case ; hasPrep : Bool} ;
@@ -64,8 +64,7 @@ concrete CatHrv of Cat =
    A = \s -> s.msnom ;


-  lincat Numeral = Determiner ; ---- TODO: should contain Ord as well
-  lincat Digits = {s:Str ; size : NumSize} ;
-
+  lincat Numeral = {s : AdjForms ; size : NumSize} ;
+  lincat Digits = {s : Str ; size : NumSize} ;

 }
--- a/src/croatian/LexiconHrv.gf
+++ b/src/croatian/LexiconHrv.gf
@@ -57,8 +57,8 @@ concrete LexiconHrv of Lexicon =
    green_A = mkA "zelený" ;
    yellow_A = mkA "žltý" ;

-    buy_V2 = mkV2 (iii_kupovatVerbForms "kupovať") ;
-    love_V2 = mkV2 (iii_kupovatVerbForms "milovať") ;
+----    buy_V2 = mkV2 (iii_kupovatVerbForms "kupovať") ;
+----    love_V2 = mkV2 (iii_kupovatVerbForms "milovať") ;

 }

--- a/src/croatian/NounHrv.gf
+++ b/src/croatian/NounHrv.gf
@@ -24,39 +24,27 @@ lin

    DefArt = {s = \\_,_,_ => []} ;
    IndefArt = {s = \\_,_,_ => []} ;
-    NumPl = {s = \\_,_ => [] ; size = Num2_4} ; ---- size
-    NumSg = {s = \\_,_ => [] ; size = Num1} ;
+    NumPl = {s = \\_,_ => [] ; size = NS_20_} ; ---- size
+    NumSg = {s = \\_,_ => [] ; size = NS_1} ;

    UsePron pron = {
-      s = table {
-        Nom => pron.nom ;
-	Gen => pron.gen ;
-	Dat => pron.dat ;
-	Acc => pron.acc ;
-	Loc => pron.loc ;
+      s, prep = table {  ---- TODO check prep
+        Nom | Voc => pron.nom ;
+	Gen | Acc => pron.gen ;
+	Dat | Loc => pron.dat ;
 	Ins => pron.ins
        } ;
-      clit = table {
-        Nom => pron.cnom ;
-	Gen => pron.cgen ;
-	Dat => pron.cdat ;
-	Acc => pron.cacc ;
-	Loc => pron.loc ;
+      clit = table {  ---- TODO check prep
+        Nom | Voc => pron.nom ;
+	Gen | Acc => pron.cgen ;
+	Dat | Loc => pron.cdat ;
 	Ins => pron.ins
        } ;
-      prep = table {
-        Nom => pron.nom ;
-	Gen => pron.pgen ;
-	Dat => pron.pdat ;
-	Acc => pron.pacc ;
-	Loc => pron.loc ;
-	Ins => pron.pins
-        } ;
      a = pron.a ;
      hasClit = True ;
      } ;
      
-    PossPron pron = justDemPronFormsAdjective pron.poss ;
+    PossPron pron = adjFormsAdjective pron.poss ;

    UsePN pn = {
      s,clit,prep = \\c => pn.s ! c ;
@@ -86,7 +74,7 @@ lin
      hasClit = False ;
      } ;
      
-    UseN n = nounFormsNoun n ;
+    UseN n = nounFormsNoun n n.g ;

    ApposCN cn np = {
      s = \\n,c => cn.s ! n ! c ++ np.s ! c ; ---- TODO check apposition order
@@ -95,7 +83,10 @@ lin
      
    NumCard c = c ;
    NumDigits ds = ds ** {s = \\_,_ => ds.s} ;
-    NumNumeral nu = nu ;
+    NumNumeral nu = {
+      s = \\g,c => (adjFormsAdjective nu.s).s ! g ! Sg ! c ; ---- TODO Sg?
+      size = nu.size
+      } ;


 }
--- a/src/croatian/NumeralHrv.gf
+++ b/src/croatian/NumeralHrv.gf
@@ -1,6 +1,6 @@
 concrete NumeralHrv of Numeral =

----  CatHrv [Numeral, Digits] **
+  CatHrv [Numeral, Digits] **
  
  open
    ResHrv,
@@ -10,11 +10,6 @@ concrete NumeralHrv of Numeral =
 -- AR 2022-09-27
 ---- TODO ordinal forms

-lincat Numeral = LinNumeral ; ---- TODO move to Cat
-lincat Digits = {s : Str ; size : NumSize} ;
-
-param NumSize = NS_1 | NS_2_4 | NS_5_20 | NS_20_ ;
-
 oper LinNumeral = {s : AdjForms ; size : NumSize} ;
 oper LinDigit = {unit : AdjForms ; teen, ten, hundred : Str ; size : NumSize} ;

--- a/src/croatian/ParadigmsHrv.gf
+++ b/src/croatian/ParadigmsHrv.gf
@@ -26,6 +26,8 @@ oper
    = Dat ;
  accusative : Case
    = Acc ;
+  vocative : Case
+    = Voc ;
  locative : Case
    = Loc ;
  instrumental : Case
@@ -39,8 +41,8 @@ oper
  mkN = overload {
    mkN : (nom : Str) -> N
      = \nom -> lin N (guessNounForms nom) ;
-    mkN : (nom,gen : Str) -> Gender -> N
-      = \nom,gen,g -> lin N (declensionNounForms nom gen g) ;
+    mkN : (nom,gen : Str) -> Gender -> N ---- TODO: gen and g are ignored for now; only nom is used
+      = \nom,gen,g -> lin N (guessNounForms nom) ;
    } ;

 -- The following standard declensions can be used with good accuracy.
@@ -49,34 +51,7 @@ oper
 -- The default extensions are shown in comments; if the default is correct, no extension is needed.
 -- Notice that some paradigms take two arguments, some take one.

-  chlapN : Str -> N  
-    = \s -> lin N (R.chlapN s) ;
-  hrdinaN : Str -> N 
-    = \s -> lin N (R.hrdinaN s) ;
-  dubN : Str -> N  
-    = \s -> lin N (R.dubN s) ;
-  strojN : Str -> N
-    = \s -> lin N (R.strojN s) ;
-  ponyN : Str -> N
-    = \s -> lin N (R.ponyN s) ;
-  zenaN : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.zenaN s) ** {pgen = p} ;
-  ulicaN : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.ulicaN s) ** {pgen = p} ;
-  dlanN  : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.dlanN s p) ;
-  kostN  : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.kostN s p) ;
-  mestoN : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.mestoN s) ** {pgen = p} ;
-  srdceN : (snom, pgen : Str) -> N
-    = \s,p -> lin N (R.srdceN s) ** {pgen = p} ;
-  vysvedcenieN : Str -> N
-    = \s -> lin N (R.vysvedcenieN s) ;
-  dievcaN : Str -> N
-    = \s -> lin N (R.dievcaN s) ;
-  dievceniecN : Str -> N
-    = \s -> lin N (R.dievceniecN s) ;
+---- TODO: export the Croatian noun paradigms here (izvorN etc. from ResHrv)

 -- The full definition of the noun record is
 -- {
@@ -92,27 +67,9 @@ oper

  mkA = overload {
    mkA : Str -> A
-      = \s -> lin A (guessAdjForms s)
+      = \s -> lin A (velikA s)
    } ;

-  peknyA : Str -> A
-    = \s -> lin A (R.peknyA s) ;
-  krasnyA : Str -> A
-    = \s -> lin A (R.krasnyA s) ;
-  cudziA : Str -> A
-    = \s -> lin A (R.cudziA s) ;
-  rydziA : Str -> A
-    = \s -> lin A (R.rydziA s) ;
-  otcovA : Str -> A
-    = \s -> lin A (R.otcovA s) ;
-  paviA  : Str -> A
-    = \s -> lin A (R.paviA s) ;
-
-  invarA : Str -> A
-    = \s -> lin A (invarAdjForms s) ;
-
-  mkA2 : A -> Prep -> A2
-    = \a,p -> lin A2 (a ** {c = p}) ;

 -- the full definition of the adjective record is
 -- {
@@ -125,12 +82,12 @@ oper
 -- Verbs

  mkV2 = overload {
-    mkV2 : VerbForms -> VerbForms ** {c : ComplementCase}
-      = \vf -> vf ** {c = {s = [] ; c = Acc ; hasPrep = False}} ;
-    mkV2 : VerbForms -> Case -> VerbForms ** {c : ComplementCase}
-      = \vf,c -> vf ** {c = {s = [] ; c = c ; hasPrep = False}} ;
-    mkV2 : VerbForms -> ComplementCase -> VerbForms ** {c : ComplementCase}
-      = \vf,c -> vf ** {c = c} ;
+    mkV2 : VerbForms -> V2
+      = \vf -> lin V2 {s = vf ; c = {s = [] ; c = Acc ; hasPrep = False}} ;
+    mkV2 : VerbForms -> Case -> V2
+      = \vf,c -> lin V2 {s = vf ; c = {s = [] ; c = c ; hasPrep = False}} ;
+    mkV2 : VerbForms -> ComplementCase -> V2 
+      = \vf,c -> lin V2 {s = vf ; c = c} ;
    } ;

 ------------------------
--- a/src/croatian/ResHrv.gf
+++ b/src/croatian/ResHrv.gf
@@ -143,29 +143,16 @@ voicing : Str -> Str = \s -> case s of {

      _ => dubN (""+snom) ** {pgen = pgen} ---- Predef.error ("cannot infer declension type for" ++ snom ++ pgen)
      } ** {pgen = pgen ; g = g} ;
-
+-}
 -- the "smartest" one-argument mkN

-  guessNounForms : Str -> NounForms
+  guessNounForms : Str -> NounForms ** {g : Gender}
    = \snom -> case snom of {
-        _ + ("i"|"y"|"e")           => ponyN snom ;
-        _ + #softConsonant          => strojN snom ;
-        _ + #hardConsonant          => dubN snom ;
-        _ + #neutralConsonant       => dubN snom ;
-        _ + #hardConsonant + "a"    => zenaN snom ;
-        _ + #neutralConsonant + "a" => zenaN snom ;
-        _ + #softConsonant + "a"    => ulicaN snom ;
-        _ + ("ia"|"ya")             => ulicaN snom ;
-        _ + "o"                     => mestoN snom ;
-        _ + "ie"                    => vysvedcenieN snom ;
-        _ + "e"                     => srdceN snom ;
-        _ + "ä"                     => dievcaN snom ;

-        _ => dubN (""+snom) ---- Predef.error ("cannot guess declension type for" ++ snom)
+---- TODO: guess the declension from the ending; for now every noun falls back to izvorN
+        _ => izvorN snom ** {g = inanimate} 
      } ;

-}
-
 -- the traditional declensions, following Wiki
 -- they are also exported in ParadigmsHrv with names izvorN etc

@@ -666,12 +653,17 @@ oper
        _ => adjAdj.s ! g ! n ! c
        }
      } ;
+-}

+param NumSize = NS_1 | NS_2_4 | NS_5_20 | NS_20_ ;
+
+oper
  Determiner : Type = {
    s : Gender => Case => Str ;
    size : NumSize
    } ;

+{-
  mkDemPronForms : Str -> DemPronForms = \jedn -> {
      msnom   = jedn + "y" ; -- should be "jeden"
      fsnom   = jedn + "a" ;
@@ -810,19 +802,17 @@ oper
    regNumeral sto sto sto sto ;

  invarNumeral : Str -> Determiner = \s -> invarDeterminer s Num5 ;
+-}

 --------------------------------
 -- combining nouns with numerals

-param
-  NumSize = Num1 | Num2_4 | Num5 ; -- CEG 6.1
-
 oper
  numSizeForm : (Number => Case => Str) -> NumSize -> Case -> Str
    = \cns,n,c -> case n of {
-        Num1   => cns ! Sg ! c ;
-	Num2_4 => cns ! Pl ! c ;
-	Num5   => case c of {
+        NS_1   => cns ! Sg ! c ;
+	NS_2_4 => cns ! Pl ! c ;
+	_ => case c of {
 	  Nom | Acc => cns ! Pl ! Gen ;
 	  _ => cns ! Pl ! c
 	  }
@@ -830,14 +820,14 @@ oper

  numSizeAgr : Gender -> NumSize -> Person -> Agr
    = \g,ns,p -> case ns of {
-        Num5   => Ag Neutr Sg p ; -- essential grammar 6.1.4
-	Num2_4 => Ag g Pl p ;
-	Num1   => Ag g Sg p
+    	NS_1   => Ag g Sg p ;
+	NS_2_4 => Ag g Pl p ;
+	_   => Ag Neutr Sg p ---- TODO verify
 	} ;

  numSizeNumber : NumSize -> Number = \ns -> case ns of {
-    Num1 => Sg ;
+    NS_1 => Sg ;
    _ => Pl      ---- TO CHECK
    } ;
-}
+
 }
--- a/src/croatian/SentenceHrv.gf
+++ b/src/croatian/SentenceHrv.gf
@@ -14,19 +14,19 @@ lin
      } ;

    UseCl temp pol cl = {
-      s = temp.s ++ cl.subj ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a pol.p ++ cl.compl ;
-      } ;
+      s = temp.s ++ cl.subj ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a CTPres ++ cl.compl ;
+      } ; ---- TODO tense, negation

    --- TODO is inversion the standard? ; add indirect questions
    UseQCl temp pol cl = {
-      s = temp.s ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a pol.p ++ cl.subj ++ cl.compl ; 
-      } ;
+      s = temp.s ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a CTPres ++ cl.subj ++ cl.compl ; 
+      } ; ---- TODO tenses

    UseRCl temp pol rcl = {
      s = \\a => temp.s ++
                 rcl.subj ! a ++ rcl.clit ! a ++
-		 pol.s ++ verbAgr rcl.verb a pol.p ++
+		 pol.s ++ verbAgr rcl.verb a CTPres ++
 		 rcl.compl ! a ;
-      } ;
+      } ;  ---- TODO tenses
    
 }
--- a/src/croatian/StructuralHrv.gf
+++ b/src/croatian/StructuralHrv.gf
@@ -3,22 +3,22 @@ concrete StructuralHrv of Structural = CatHrv **

 lin
    and_Conj = mkConj "a" ;
-    by8agent_Prep = mkPrep "" Ins ; 
-    few_Det = invarNumeral "málo" ; -- see notes
+----    by8agent_Prep = mkPrep "" Ins ; 
+----    few_Det = invarNumeral "málo" ; -- see notes
    for_Prep = mkPrep "pre" accusative ;
-    from_Prep = mkPrep (pre {"z" => "zo" ; _ => "z"}) Gen ; ---- consonant clusters and syllable with the onset with the same place of articulation 
-    have_V2 = mkV2 haveVerbForms ;
-    in_Prep = mkPrep (pre {"v" => "vo" ; _ => "v"}) Loc ; ----
-    many_Det = regNumeral "mnoho" "mnohých" "mnohým" "mnohými" ; ---- alternative: invarNumeral "veľa" ;
+    from_Prep = mkPrep "iz" Gen ;
+    have_V2 = mkV2 imati_VerbForms ;
+    in_Prep = mkPrep "u" Loc ; 
+----    many_Det = regNumeral "mnoho" "mnohých" "mnohým" "mnohými" ; ---- alternative: invarNumeral "veľa" ;
    or_Conj = mkConj "alebo" ;
-    somePl_Det = invarDeterminer "niekoľko" Num5 ;
---    somePl_Det = {s = \\g,c => (demPronFormsAdjective (mkDemPronForms "niekoľko") "").s ! g ! Pl ! c ; size = Num5} ;
-    something_NP = {s,clit,prep = \\c => "nie" + coForms ! c ; a = Ag Neutr Sg P3 ; hasClit = False} ; -- CEG 5.6.3
+----    somePl_Det = invarDeterminer "niekoľko" Num5 ;
+----    somePl_Det = {s = \\g,c => (demPronFormsAdjective (mkDemPronForms "niekoľko") "").s ! g ! Pl ! c ; size = Num5} ;
+----    something_NP = {s,clit,prep = \\c => "nie" + coForms ! c ; a = Ag Neutr Sg P3 ; hasClit = False} ; -- CEG 5.6.3
    possess_Prep = mkPrep "" Gen ;
-    that_Quant = demPronFormsAdjective (tenDemPronForms "") "" ;
-    this_Quant = demPronFormsAdjective (tenDemPronForms "" ** {msgen = "toh"}) "to" ;
-    to_Prep = mkPrep "do" Gen ;
-    with_Prep = mkPrep (pre {"s" => "so" ; _ => "s"}) Ins ; 
+----    that_Quant = demPronFormsAdjective (tenDemPronForms "") "" ;
+----    this_Quant = demPronFormsAdjective (tenDemPronForms "" ** {msgen = "toh"}) "to" ;
+    to_Prep = mkPrep "u" Acc ;
+    with_Prep = mkPrep (pre {"s"|"z"|"š"|"ž"|"mnom" => "sa" ; _ => "s"}) Ins ; 

    i_Pron = mkPron (Ag (Masc Anim) Sg P1) ;   --- to add Fem pronouns in Extend
    youSg_Pron = mkPron (Ag (Masc Anim) Sg P2) ;
--- a/src/croatian/VerbHrv.gf
+++ b/src/croatian/VerbHrv.gf
@@ -2,7 +2,7 @@ concrete VerbHrv of Verb = CatHrv ** open ResHrv, Prelude in {

 lin
    UseV v = {
-      verb = v ;
+      verb = v.s ;
      clit,compl = \\_ => []
      } ;
    
@@ -16,13 +16,13 @@ lin
      } ;

    SlashV2a v = {
-      verb = v ;
+      verb = v.s ;
      clit,compl = \\_ => [] ;
      c = v.c
      } ;

    UseComp comp = {
-      verb  = copulaVerbForms ;
+      verb  = biti_VerbForms ; ---- TODO: jesam
      clit = \\_ => [] ;
      compl = comp.s
      } ;
--- a/src/croatian/wiktionary/extract.py
+++ b/src/croatian/wiktionary/extract.py
@@ -1,6 +1,9 @@
 import json

 # https://kaikki.org/dictionary/rawdata.html
+# Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
+# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC),
+# pp. 1317-1325, Marseille, 20-25 June 2022. 

 FILE = 'data/raw-wiktextract-data.json'