started Hrv verbs and their Wiktionary extraction

This commit is contained in:
Aarne Ranta
2022-09-25 10:06:36 +02:00
parent 3eac1b9d0c
commit 7a0b1eed34
4 changed files with 118 additions and 44 deletions

View File

@@ -17,6 +17,13 @@ param
Person = P1 | P2 | P3 ;
-- Verb inflection forms; verb tables (VerbForms) are indexed by this parameter.
VForm =
VInf                       -- infinitive
| VPres Number Person      -- present tense, by number and person
| VPastPart Gender Number  -- past participle, by gender and number
;
---- TODO aorist, imperfect
Agr = Ag Gender Number Person ;
CTense = CTPres | CTPast ; ----- TODO complete the tense system to match BCS verb morphology
@@ -414,30 +421,20 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
pgen = velk + "ih" ;
} ;
{-
---------------------
-- Verbs
-- https://en.wikipedia.org/wiki/Slovak_language#Verbs
-- Wiki
VerbForms : Type = { ---- TODO more forms to add ?
inf,
pressg1, pressg2, pressg3,
prespl1, prespl2, prespl3,
pastpmasc, pastpfem, pastpneutr : Str
} ;
-- A verb's inflection table: one string for each VForm.
VerbForms : Type = VForm => Str ;
ComplementCase : Type = {s : Str ; c : Case ; hasPrep : Bool} ;
verbAgr : VerbForms -> Agr -> Bool -> Str ---- TODO tenses
= \vf,a,b -> case a of {
Ag _ Sg P1 => vf.pressg1 ;
Ag _ Sg P2 => vf.pressg2 ;
Ag _ Sg P3 => vf.pressg3 ;
Ag _ Pl P1 => vf.prespl1 ;
Ag _ Pl P2 => vf.prespl2 ;
Ag _ Pl P3 => vf.prespl3
-- Pick the verb form that matches the agreement features in the given tense:
-- the present uses number and person, the past uses the participle selected
-- by gender and number.
verbAgr : VerbForms -> Agr -> CTense -> Str ---- TODO tenses
  = \forms,agr,tense -> case tense of {
      CTPres => case agr of {Ag _ n p => forms ! VPres n p} ;
      CTPast => case agr of {Ag g n _ => forms ! VPastPart g n}
    } ;
{-
copulaVerbForms : VerbForms = {
inf = "byť" ;
pressg1 = "som" ;
@@ -463,29 +460,36 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
pastpfem = "mala" ;
pastpneutr = "malo" ;
} ;
-}
-- just an example of a traditional paradigm
---- TODO other traditional paradigms
iii_kupovatVerbForms : Str -> VerbForms = \kupovat ->
aeiVerbForms : Str -> VerbForms = \citati ->
let
kupo = Predef.tk 3 kupovat ;
kupu = Predef.tk 1 kupo + "u"
in
{
inf = kupovat ;
pressg1 = kupu + "jem" ;
pressg2 = kupu + "ješ" ;
pressg3 = kupu + "je" ;
prespl1 = kupu + "jeme" ;
prespl2 = kupu + "jete" ;
prespl3 = kupu + "jú" ;
pastpmasc = "kupoval" ;
pastpfem = "kupovala" ;
pastpneutr = "kupovalo" ;
-- Present stem: the infinitive minus its last two characters ("ti"),
-- e.g. "čitati" -> "čita", "raditi" -> "radi".
cita = Predef.tk 2 citati ;
-- Ending of the 3rd person plural present. It replaces the final stem vowel
-- (see init cita below): "čita" -> "čitaju", "radi" -> "rade".
-- NOTE(review): there is no fallback branch, so a stem ending in any other
-- letter is not handled by this paradigm - confirm this is intended.
u = case last cita of {
  "a" => "aju" ;
  "e" => "u" ;
  "i" => "e"
  } ;
in table {
  VInf => citati ;
  VPres Sg P1 => cita + "m" ;
  VPres Sg P2 => cita + "š" ;
  VPres Sg P3 => cita ;
  VPres Pl P1 => cita + "mo" ;
  VPres Pl P2 => cita + "te" ;
  -- Pl must be the constructor: a lowercase "pl" is a variable pattern that
  -- matches any Number and only worked because the Sg cases came first.
  VPres Pl P3 => init cita + u ;
  -- Masc _ covers every value of the masculine gender's argument.
  VPastPart (Masc _) Sg => cita + "o" ;
  VPastPart Fem Sg => cita + "la" ;
  VPastPart Neutr Sg => cita + "lo" ;
  VPastPart (Masc _) Pl => cita + "li" ;
  VPastPart Fem Pl => cita + "le" ;
  VPastPart Neutr Pl => cita + "la"
} ;
{-
---------------------------
-- Pronouns

View File

@@ -552,3 +552,34 @@ s . Neutr => Pl => Acc => niska
s . Neutr => Pl => Voc => niska
s . Neutr => Pl => Loc => niskim
s . Neutr => Pl => Ins => niskim
VInf => čitati
VPres Sg P1 => čitam
VPres Sg P2 => čitaš
VPres Sg P3 => čita
VPres Pl P1 => čitamo
VPres Pl P2 => čitate
VPres Pl P3 => čitaju
VPastPart (Masc Anim) Sg => čitao
VPastPart (Masc Anim) Pl => čitali
VPastPart (Masc Inanim) Sg => čitao
VPastPart (Masc Inanim) Pl => čitali
VPastPart Fem Sg => čitala
VPastPart Fem Pl => čitale
VPastPart Neutr Sg => čitalo
VPastPart Neutr Pl => čitala
VInf => raditi
VPres Sg P1 => radim
VPres Sg P2 => radiš
VPres Sg P3 => radi
VPres Pl P1 => radimo
VPres Pl P2 => radite
VPres Pl P3 => rade
VPastPart (Masc Anim) Sg => radio
VPastPart (Masc Anim) Pl => radili
VPastPart (Masc Inanim) Sg => radio
VPastPart (Masc Inanim) Pl => radili
VPastPart Fem Sg => radila
VPastPart Fem Pl => radile
VPastPart Neutr Sg => radilo
VPastPart Neutr Pl => radila
aarnes-mbp-2:croatian aarne$

View File

@@ -28,3 +28,6 @@ cc -table -unqual adjFormsAdjective (velikA "mastan")
cc -table -unqual adjFormsAdjective (velikA "gladan")
cc -table -unqual adjFormsAdjective (velikA "nizak")
cc -table -unqual aeiVerbForms ("čitati")
cc -table -unqual aeiVerbForms ("raditi")

View File

@@ -8,7 +8,7 @@ MYLANG = 'Serbo-Croatian'
GENDERS = ['masculine', 'feminine', 'neuter']
NOUN_CASES = {
NOUN_FORMS = {
'singular': {
'nominative': 'snom',
'genitive': 'sgen',
@@ -25,7 +25,7 @@ NOUN_CASES = {
}
}
ADJ_CASES = {
ADJ_FORMS = {
'masculine': {
'singular': {
'nominative': 'msnom',
@@ -54,6 +54,33 @@ ADJ_CASES = {
}
}
# Three-level lookup from form tags to flat output keys for verbs:
# tense/mood tag -> number tag -> person tag (present) or gender tag
# (participle). The leaf value is the key under which the form is stored.
VERB_FORMS = {
    'present': {
        'singular': {'first-person': 'pres_sg_1',
                     'second-person': 'pres_sg_2',
                     'third-person': 'pres_sg_3'},
        'plural': {'first-person': 'pres_pl_1',
                   'second-person': 'pres_pl_2',
                   'third-person': 'pres_pl_3'},
    },
    'participle': {
        'singular': {'masculine': 'ppart_masc_sg',
                     'feminine': 'ppart_fem_sg',
                     'neuter': 'ppart_neutr_sg'},
        'plural': {'masculine': 'ppart_masc_pl',
                   'feminine': 'ppart_fem_pl',
                   'neuter': 'ppart_neutr_pl'},
    },
}
def get_forms(pos, forms):
@@ -64,23 +91,32 @@ def get_forms(pos, forms):
if g in f.get('tags', []):
dict['gender'] = g
tags = f.get('tags', [])
for num in NOUN_CASES:
for num in NOUN_FORMS:
if num in tags:
for case in NOUN_CASES[num]:
for case in NOUN_FORMS[num]:
if case in tags:
dict[NOUN_CASES[num][case]] = f['form']
dict[NOUN_FORMS[num][case]] = f['form']
elif pos == 'adj':
print(forms)
for f in forms:
tags = f.get('tags', [])
if 'positive' in tags and 'indefinite' in tags:
for g in ADJ_CASES:
for g in ADJ_FORMS:
if g in tags:
for n in ADJ_CASES[g]:
for n in ADJ_FORMS[g]:
if n in tags:
for c in ADJ_CASES[g][n]:
for c in ADJ_FORMS[g][n]:
if c in tags:
dict[ADJ_CASES[g][n][c]] = f['form']
dict[ADJ_FORMS[g][n][c]] = f['form']
elif pos == 'verb':
for f in forms:
tags = f.get('tags', [])
for t in VERB_FORMS:
if t in tags:
for n in VERB_FORMS[t]:
if n in tags:
for g in VERB_FORMS[t][n]:
if g in tags:
dict[VERB_FORMS[t][n][g]] = f['form']
else:
dict['forms'] = forms[:10] ####