diff --git a/src/croatian/CatHrv.gf b/src/croatian/CatHrv.gf index 6425c336..adca880f 100644 --- a/src/croatian/CatHrv.gf +++ b/src/croatian/CatHrv.gf @@ -22,9 +22,9 @@ concrete CatHrv of Cat = VP = {verb : VerbForms ; clit,compl : Agr => Str} ; ---- more fields probably needed VPSlash = {verb : VerbForms ; clit,compl : Agr => Str ; c : ComplementCase} ; ---- - V = ResHrv.VerbForms ; - V2 = ResHrv.VerbForms ** {c : ComplementCase} ; - VS,VQ = ResHrv.VerbForms ; + V = {s : VerbForms} ; + V2 = {s : VerbForms ; c : ComplementCase} ; + VS,VQ = {s : VerbForms} ; A = ResHrv.AdjForms ; AP = ResHrv.Adjective ** {isPost : Bool} ; -- {s : Gender => Number => Case => Str} @@ -32,7 +32,7 @@ concrete CatHrv of Cat = AdA = {s : Str} ; - N = ResHrv.NounForms ; + N = ResHrv.NounForms ** {g : Gender} ; CN = ResHrv.Noun ; -- {s : Number => Case => Str ; g : Gender} NP = {s,clit,prep : Case => Str ; a : Agr ; hasClit : Bool} ; -- clit,prep differ for pronouns PN = {s : Case => Str ; g : Gender} ; @@ -40,7 +40,7 @@ concrete CatHrv of Cat = Quant = {s : Gender => Number => Case => Str} ; -- same as AP Num = Determiner ; Card = Determiner ; -- {s : Gender => Case => Str ; size : NumSize} ; - Pron = PronForms ** {poss : DemPronForms} ; + Pron = PronForms ** {poss : AdjForms} ; Adv = {s : Str} ; Prep = ResHrv.ComplementCase ; -- {s : Str ; c : Case ; hasPrep : Bool} ; @@ -64,8 +64,7 @@ concrete CatHrv of Cat = A = \s -> s.msnom ; - lincat Numeral = Determiner ; ---- TODO: should contain Ord as well - lincat Digits = {s:Str ; size : NumSize} ; - + lincat Numeral = {s : AdjForms ; size : NumSize} ; + lincat Digits = {s : Str ; size : NumSize} ; } diff --git a/src/croatian/LexiconHrv.gf b/src/croatian/LexiconHrv.gf index 593b3e94..cac7ad36 100644 --- a/src/croatian/LexiconHrv.gf +++ b/src/croatian/LexiconHrv.gf @@ -57,8 +57,8 @@ concrete LexiconHrv of Lexicon = green_A = mkA "zelený" ; yellow_A = mkA "žltý" ; - buy_V2 = mkV2 (iii_kupovatVerbForms "kupovať") ; - love_V2 = mkV2 (iii_kupovatVerbForms "milovať") ; +---- buy_V2 = mkV2 (iii_kupovatVerbForms "kupovať") ; +---- love_V2 = mkV2 (iii_kupovatVerbForms "milovať") ; } diff --git a/src/croatian/NounHrv.gf b/src/croatian/NounHrv.gf index 4481416f..7924c8f4 100644 --- a/src/croatian/NounHrv.gf +++ b/src/croatian/NounHrv.gf @@ -24,39 +24,27 @@ lin DefArt = {s = \\_,_,_ => []} ; IndefArt = {s = \\_,_,_ => []} ; - NumPl = {s = \\_,_ => [] ; size = Num2_4} ; ---- size - NumSg = {s = \\_,_ => [] ; size = Num1} ; + NumPl = {s = \\_,_ => [] ; size = NS_20_} ; ---- size + NumSg = {s = \\_,_ => [] ; size = NS_1} ; UsePron pron = { - s = table { - Nom => pron.nom ; - Gen => pron.gen ; - Dat => pron.dat ; - Acc => pron.acc ; - Loc => pron.loc ; + s, prep = table { ---- TODO check prep + Nom | Voc => pron.nom ; + Gen | Acc => pron.gen ; + Dat | Loc => pron.dat ; Ins => pron.ins } ; - clit = table { - Nom => pron.cnom ; - Gen => pron.cgen ; - Dat => pron.cdat ; - Acc => pron.cacc ; - Loc => pron.loc ; + clit = table { ---- TODO check prep + Nom | Voc => pron.nom ; + Gen | Acc => pron.cgen ; + Dat | Loc => pron.cdat ; Ins => pron.ins } ; - prep = table { - Nom => pron.nom ; - Gen => pron.pgen ; - Dat => pron.pdat ; - Acc => pron.pacc ; - Loc => pron.loc ; - Ins => pron.pins - } ; a = pron.a ; hasClit = True ; } ; - PossPron pron = justDemPronFormsAdjective pron.poss ; + PossPron pron = adjFormsAdjective pron.poss ; UsePN pn = { s,clit,prep = \\c => pn.s ! c ; @@ -86,7 +74,7 @@ lin hasClit = False ; } ; - UseN n = nounFormsNoun n ; + UseN n = nounFormsNoun n n.g ; ApposCN cn np = { s = \\n,c => cn.s ! n ! c ++ np.s ! c ; ---- TODO check apposition order @@ -95,7 +83,10 @@ lin NumCard c = c ; NumDigits ds = ds ** {s = \\_,_ => ds.s} ; - NumNumeral nu = nu ; + NumNumeral nu = { + s = \\g,c => (adjFormsAdjective nu.s).s ! g ! Sg ! c ; ---- TODO Sg? + size = nu.size + } ; } diff --git a/src/croatian/NumeralHrv.gf b/src/croatian/NumeralHrv.gf index 88b2f431..40756b3e 100644 --- a/src/croatian/NumeralHrv.gf +++ b/src/croatian/NumeralHrv.gf @@ -1,6 +1,6 @@ concrete NumeralHrv of Numeral = ----- CatHrv [Numeral, Digits] ** + CatHrv [Numeral, Digits] ** open ResHrv, @@ -10,11 +10,6 @@ concrete NumeralHrv of Numeral = -- AR 2022-09-27 ---- TODO ordinal forms -lincat Numeral = LinNumeral ; ---- TODO move to Cat -lincat Digits = {s : Str ; size : NumSize} ; - -param NumSize = NS_1 | NS_2_4 | NS_5_20 | NS_20_ ; - oper LinNumeral = {s : AdjForms ; size : NumSize} ; oper LinDigit = {unit : AdjForms ; teen, ten, hundred : Str ; size : NumSize} ; diff --git a/src/croatian/ParadigmsHrv.gf b/src/croatian/ParadigmsHrv.gf index 0300cf46..0824b14c 100644 --- a/src/croatian/ParadigmsHrv.gf +++ b/src/croatian/ParadigmsHrv.gf @@ -26,6 +26,8 @@ oper = Dat ; accusative : Case = Acc ; + vocative : Case + = Voc ; locative : Case = Loc ; instrumental : Case @@ -39,8 +41,8 @@ oper mkN = overload { mkN : (nom : Str) -> N = \nom -> lin N (guessNounForms nom) ; - mkN : (nom,gen : Str) -> Gender -> N - = \nom,gen,g -> lin N (declensionNounForms nom gen g) ; + mkN : (nom,gen : Str) -> Gender -> N ---- TODO + = \nom,gen,g -> lin N (guessNounForms nom) ; } ; -- The following standard declensions can be used with good accuracy. @@ -49,34 +51,7 @@ oper -- The default extensions are shown in comments; if the default is correct, no extension is needed. -- Notice that some paradigms take two arguments, some take one. - chlapN : Str -> N - = \s -> lin N (R.chlapN s) ; - hrdinaN : Str -> N - = \s -> lin N (R.hrdinaN s) ; - dubN : Str -> N - = \s -> lin N (R.dubN s) ; - strojN : Str -> N - = \s -> lin N (R.strojN s) ; - ponyN : Str -> N - = \s -> lin N (R.ponyN s) ; - zenaN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.zenaN s) ** {pgen = p} ; - ulicaN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.ulicaN s) ** {pgen = p} ; - dlanN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.dlanN s p) ; - kostN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.kostN s p) ; - mestoN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.mestoN s) ** {pgen = p} ; - srdceN : (snom, pgen : Str) -> N - = \s,p -> lin N (R.srdceN s) ** {pgen = p} ; - vysvedcenieN : Str -> N - = \s -> lin N (R.vysvedcenieN s) ; - dievcaN : Str -> N - = \s -> lin N (R.dievcaN s) ; - dievceniecN : Str -> N - = \s -> lin N (R.dievceniecN s) ; +---- TODO -- The full definition of the noun record is -- { @@ -92,27 +67,9 @@ oper mkA = overload { mkA : Str -> A - = \s -> lin A (guessAdjForms s) + = \s -> lin A (velikA s) } ; - peknyA : Str -> A - = \s -> lin A (R.peknyA s) ; - krasnyA : Str -> A - = \s -> lin A (R.krasnyA s) ; - cudziA : Str -> A - = \s -> lin A (R.cudziA s) ; - rydziA : Str -> A - = \s -> lin A (R.rydziA s) ; - otcovA : Str -> A - = \s -> lin A (R.otcovA s) ; - paviA : Str -> A - = \s -> lin A (R.paviA s) ; - - invarA : Str -> A - = \s -> lin A (invarAdjForms s) ; - - mkA2 : A -> Prep -> A2 - = \a,p -> lin A2 (a ** {c = p}) ; -- the full definition of the adjective record is -- { @@ -125,12 +82,12 @@ oper -- Verbs mkV2 = overload { - mkV2 : VerbForms -> VerbForms ** {c : ComplementCase} - = \vf -> vf ** {c = {s = [] ; c = Acc ; hasPrep = False}} ; - mkV2 : VerbForms -> Case -> VerbForms ** {c : ComplementCase} - = \vf,c -> vf ** {c = {s = [] ; c = c ; hasPrep = False}} ; - mkV2 : VerbForms -> ComplementCase -> VerbForms ** {c : ComplementCase} - = \vf,c -> vf ** {c = c} ; + mkV2 : VerbForms -> V2 + = \vf -> lin V2 {s = vf ; c = {s = [] ; c = Acc ; hasPrep = False}} ; + mkV2 : VerbForms -> Case -> V2 + = \vf,c -> lin V2 {s = vf ; c = {s = [] ; c = c ; hasPrep = False}} ; + mkV2 : VerbForms -> ComplementCase -> V2 + = \vf,c -> lin V2 {s = vf ; c = c} ; } ; ------------------------ diff --git a/src/croatian/ResHrv.gf b/src/croatian/ResHrv.gf index 77d1d2ff..0436322e 100644 --- a/src/croatian/ResHrv.gf +++ b/src/croatian/ResHrv.gf @@ -143,29 +143,16 @@ voicing : Str -> Str = \s -> case s of { _ => dubN (""+snom) ** {pgen = pgen} ---- Predef.error ("cannot infer declension type for" ++ snom ++ pgen) } ** {pgen = pgen ; g = g} ; - +-} -- the "smartest" one-argument mkN - guessNounForms : Str -> NounForms + guessNounForms : Str -> NounForms ** {g : Gender} = \snom -> case snom of { - _ + ("i"|"y"|"e") => ponyN snom ; - _ + #softConsonant => strojN snom ; - _ + #hardConsonant => dubN snom ; - _ + #neutralConsonant => dubN snom ; - _ + #hardConsonant + "a" => zenaN snom ; - _ + #neutralConsonant + "a" => zenaN snom ; - _ + #softConsonant + "a" => ulicaN snom ; - _ + ("ia"|"ya") => ulicaN snom ; - _ + "o" => mestoN snom ; - _ + "ie" => vysvedcenieN snom ; - _ + "e" => srdceN snom ; - _ + "ä" => dievcaN snom ; - _ => dubN (""+snom) ---- Predef.error ("cannot guess declension type for" ++ snom) +---- TODO + _ => izvorN snom ** {g = inanimate} } ; --} - -- the traditional declensions, following Wiki -- they are also exported in ParadigmsHrv with names izvorN etc @@ -666,12 +653,17 @@ oper _ => adjAdj.s ! g ! n ! c } } ; +-} +param NumSize = NS_1 | NS_2_4 | NS_5_20 | NS_20_ ; + +oper Determiner : Type = { s : Gender => Case => Str ; size : NumSize } ; +{- mkDemPronForms : Str -> DemPronForms = \jedn -> { msnom = jedn + "y" ; -- should be "jeden" fsnom = jedn + "a" ; @@ -810,19 +802,17 @@ oper regNumeral sto sto sto sto ; invarNumeral : Str -> Determiner = \s -> invarDeterminer s Num5 ; +-} -------------------------------- -- combining nouns with numerals -param - NumSize = Num1 | Num2_4 | Num5 ; -- CEG 6.1 - oper numSizeForm : (Number => Case => Str) -> NumSize -> Case -> Str = \cns,n,c -> case n of { - Num1 => cns ! Sg ! c ; - Num2_4 => cns ! Pl ! c ; - Num5 => case c of { + NS_1 => cns ! Sg ! c ; + NS_2_4 => cns ! Pl ! c ; + _ => case c of { Nom | Acc => cns ! Pl ! Gen ; _ => cns ! Pl ! c } @@ -830,14 +820,14 @@ oper numSizeAgr : Gender -> NumSize -> Person -> Agr = \g,ns,p -> case ns of { - Num5 => Ag Neutr Sg p ; -- essential grammar 6.1.4 - Num2_4 => Ag g Pl p ; - Num1 => Ag g Sg p + NS_1 => Ag g Sg p ; + NS_2_4 => Ag g Pl p ; + _ => Ag Neutr Sg p ---- TODO verify } ; numSizeNumber : NumSize -> Number = \ns -> case ns of { - Num1 => Sg ; + NS_1 => Sg ; _ => Pl ---- TO CHECK } ; --} + } diff --git a/src/croatian/SentenceHrv.gf b/src/croatian/SentenceHrv.gf index 17334d58..7b2e580d 100644 --- a/src/croatian/SentenceHrv.gf +++ b/src/croatian/SentenceHrv.gf @@ -14,19 +14,19 @@ lin } ; UseCl temp pol cl = { - s = temp.s ++ cl.subj ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a pol.p ++ cl.compl ; - } ; + s = temp.s ++ cl.subj ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a CTPres ++ cl.compl ; + } ; ---- TODO tense, negation --- TODO is inversion the standard? ; add indirect questions UseQCl temp pol cl = { - s = temp.s ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a pol.p ++ cl.subj ++ cl.compl ; - } ; + s = temp.s ++ cl.clit ++ pol.s ++ verbAgr cl.verb cl.a CTPres ++ cl.subj ++ cl.compl ; + } ; ---- TODO tenses UseRCl temp pol rcl = { s = \\a => temp.s ++ rcl.subj ! a ++ rcl.clit ! a ++ - pol.s ++ verbAgr rcl.verb a pol.p ++ + pol.s ++ verbAgr rcl.verb a CTPres ++ rcl.compl ! a ; - } ; + } ; ---- TODO tenses } diff --git a/src/croatian/StructuralHrv.gf b/src/croatian/StructuralHrv.gf index b40ea4b9..afe9c72d 100644 --- a/src/croatian/StructuralHrv.gf +++ b/src/croatian/StructuralHrv.gf @@ -3,22 +3,22 @@ concrete StructuralHrv of Structural = CatHrv ** lin and_Conj = mkConj "a" ; - by8agent_Prep = mkPrep "" Ins ; - few_Det = invarNumeral "málo" ; -- see notes +---- by8agent_Prep = mkPrep "" Ins ; +---- few_Det = invarNumeral "málo" ; -- see notes for_Prep = mkPrep "pre" accusative ; - from_Prep = mkPrep (pre {"z" => "zo" ; _ => "z"}) Gen ; ---- consonant clusters and syllable with the onset with the same place of articulation - have_V2 = mkV2 haveVerbForms ; - in_Prep = mkPrep (pre {"v" => "vo" ; _ => "v"}) Loc ; ---- - many_Det = regNumeral "mnoho" "mnohých" "mnohým" "mnohými" ; ---- alternative: invarNumeral "veľa" ; + from_Prep = mkPrep "iz" Gen ; + have_V2 = mkV2 imati_VerbForms ; + in_Prep = mkPrep "u" Loc ; +---- many_Det = regNumeral "mnoho" "mnohých" "mnohým" "mnohými" ; ---- alternative: invarNumeral "veľa" ; or_Conj = mkConj "alebo" ; - somePl_Det = invarDeterminer "niekoľko" Num5 ; ---- somePl_Det = {s = \\g,c => (demPronFormsAdjective (mkDemPronForms "niekoľko") "").s ! g ! Pl ! c ; size = Num5} ; - something_NP = {s,clit,prep = \\c => "nie" + coForms ! c ; a = Ag Neutr Sg P3 ; hasClit = False} ; -- CEG 5.6.3 +---- somePl_Det = invarDeterminer "niekoľko" Num5 ; +---- somePl_Det = {s = \\g,c => (demPronFormsAdjective (mkDemPronForms "niekoľko") "").s ! g ! Pl ! c ; size = Num5} ; +---- something_NP = {s,clit,prep = \\c => "nie" + coForms ! c ; a = Ag Neutr Sg P3 ; hasClit = False} ; -- CEG 5.6.3 possess_Prep = mkPrep "" Gen ; - that_Quant = demPronFormsAdjective (tenDemPronForms "") "" ; - this_Quant = demPronFormsAdjective (tenDemPronForms "" ** {msgen = "toh"}) "to" ; - to_Prep = mkPrep "do" Gen ; - with_Prep = mkPrep (pre {"s" => "so" ; _ => "s"}) Ins ; +---- that_Quant = demPronFormsAdjective (tenDemPronForms "") "" ; +---- this_Quant = demPronFormsAdjective (tenDemPronForms "" ** {msgen = "toh"}) "to" ; + to_Prep = mkPrep "u" Acc ; + with_Prep = mkPrep (pre {"s"|"z"|"š"|"ž"|"mnom" => "sa" ; _ => "s"}) Ins ; i_Pron = mkPron (Ag (Masc Anim) Sg P1) ; --- to add Fem pronouns in Extend youSg_Pron = mkPron (Ag (Masc Anim) Sg P2) ; diff --git a/src/croatian/VerbHrv.gf b/src/croatian/VerbHrv.gf index fe73d1c1..0738926c 100644 --- a/src/croatian/VerbHrv.gf +++ b/src/croatian/VerbHrv.gf @@ -2,7 +2,7 @@ concrete VerbHrv of Verb = CatHrv ** open ResHrv, Prelude in { lin UseV v = { - verb = v ; + verb = v.s ; clit,compl = \\_ => [] } ; @@ -16,13 +16,13 @@ lin } ; SlashV2a v = { - verb = v ; + verb = v.s ; clit,compl = \\_ => [] ; c = v.c } ; UseComp comp = { - verb = copulaVerbForms ; + verb = biti_VerbForms ; ---- TODO: jesam clit = \\_ => [] ; compl = comp.s } ; diff --git a/src/croatian/wiktionary/extract.py b/src/croatian/wiktionary/extract.py index 92e5f795..9c88589b 100644 --- a/src/croatian/wiktionary/extract.py +++ b/src/croatian/wiktionary/extract.py @@ -1,6 +1,9 @@ import json # https://kaikki.org/dictionary/rawdata.html +# Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, +# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), +# pp. 1317-1325, Marseille, 20-25 June 2022. FILE = 'data/raw-wiktextract-data.json'