First version of MorphoDictHrv, extracted from Wiktionary; TODO: make better use of the PN and V forms

This commit is contained in:
Aarne Ranta
2022-10-12 10:11:12 +02:00
parent 8755f9da65
commit c20e9b6383
6 changed files with 21937 additions and 14 deletions

View File

@@ -30,7 +30,7 @@ lin
DefArt = {s = \\_,_,_ => []} ;
IndefArt = {s = \\_,_,_ => []} ;
NumPl = {s = \\_,_ => [] ; size = NS_20_} ; ---- size
NumPl = {s = \\_,_ => [] ; size = NS_2_4} ; ---- size
NumSg = {s = \\_,_ => [] ; size = NS_1} ;
UsePron pron = {

View File

@@ -13,6 +13,8 @@ oper
= Masc Anim ;
mascInanimate : Gender
= Masc Inanim ;
-- Plain masculine gender; has the same value as mascInanimate (Masc Inanim).
masculine : Gender
= Masc Inanim ;
feminine : Gender
= Fem ;
neuter : Gender
@@ -132,12 +134,27 @@ oper
compar = velikA comp ;
superl = superlAForms (velikA comp)
} ;
-- Adjective from an explicit positive form record plus a single comparative string.
-- The comparative forms are obtained by applying velikA to that string, and the
-- superlative forms by applying superlAForms to the same velikA result.
mkA : (posit : AForms) -> (compar : Str) -> A
= \posit,compar -> lin A {
posit = posit ;
compar = velikA compar ; -- expand the one comparative string into a form record
superl = superlAForms (velikA compar)
} ;
-- Adjective from explicit positive and comparative form records.
-- Both records are used as given; the superlative is derived from the
-- comparative record with superlAForms.
mkA : (posit, compar : AForms) -> A
= \posit,compar -> lin A {
posit = posit ;
compar = compar ;
superl = superlAForms compar
} ;
-- Adjective from the positive form record alone.
-- The comparative record is computed from the positive one with regComparAForms,
-- and the superlative is derived from that comparative with superlAForms.
mkA : (posit : AForms) -> A
= \posit ->
let
compar = regComparAForms posit -- comparative derived from the positive forms
in lin A {
posit = posit ;
compar = compar ;
superl = superlAForms compar
} ;
} ;
invarA : Str -> A

View File

@@ -338,12 +338,12 @@ voicing : Str -> Str = \s -> case s of {
msins : Str ; -- nsins, pdat, ploc, pins = msins
fsins : Str ; -- no o/e variation like in msdat
mpnom : Str ; -- mpvoc = mpnom
pgen : Str ; --
mpgen : Str ; --
} ;
invarAdjForms : Str -> AdjForms = \s -> {
msnom, fsnom, nsnom, msgen, fsgen, msdat,
fsdat, fsacc, msins, fsins, mpnom, pgen = s ;
fsdat, fsacc, msins, fsins, mpnom, mpgen = s ;
} ;
-- used in PositA but will also work in Compar and Superl by calling their record fields
@@ -368,7 +368,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
| <Pl,Dat|Loc|Ins, _> => afs.msins ;
<Sg, Ins, Fem> => afs.fsins ;
<Pl, Nom|Voc, Masc _> => afs.mpnom ;
<Pl, Gen,_> => afs.pgen
<Pl, Gen,_> => afs.mpgen
}
} ;
@@ -395,7 +395,7 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> {
msins = velk + "im" ;
fsins = velk + "om" ;
mpnom = velk + "i" ;
pgen = velk + "ih" ;
mpgen = velk + "ih" ;
} ;
regComparAForms : AdjForms -> AdjForms

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -40,13 +40,11 @@ ADJ_FORMS = {
'singular': {
'nominative': 'msnom',
'genitive': 'msgen',
'dative': 'msdat',
'locative': 'msloc',
'instrumental': 'msins'
'dative': 'msdat'
},
'plural': {
'nominative': 'mpnom',
'genitive': 'pgen'
'genitive': 'mpgen'
}
},
'feminine': {
@@ -54,7 +52,8 @@ ADJ_FORMS = {
'nominative': 'fsnom',
'genitive': 'fsgen',
'dative': 'fsdat',
'accusative': 'fsacc'
'accusative': 'fsacc',
'instrumental': 'fsins'
}
},
'neuter': {
@@ -122,7 +121,7 @@ def unaccent(word):
cyrillic = 'ЀЈЉЊЋЍЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшыѐђјљњћѝџӣӯ'
def get_forms(pos, forms):
def get_forms(pos, forms, word):
dict = {}
if pos == 'noun':
for f in forms:
@@ -157,6 +156,9 @@ def get_forms(pos, forms):
for c in ADJ_FORMS[g][n]:
if c in tags:
dict[ADJ_FORMS[g][n][c]] = unaccent(f['form'])
elif all([t in tags for t in [
'comparative', 'masculine', 'singular', 'nominative']]):
dict['cmsnom'] = unaccent(f['form'])
elif pos == 'verb':
for f in forms:
tags = f.get('tags', [])
@@ -167,6 +169,8 @@ def get_forms(pos, forms):
for g in VERB_FORMS[t][n]:
if g in tags:
dict[VERB_FORMS[t][n][g]] = unaccent(f['form'])
if dict:
dict['infin'] = unaccent(word)
else:
dict['forms'] = forms[:10] ####
@@ -178,7 +182,7 @@ def get_forms(pos, forms):
def lexinfo(data):
return data['word'], {
'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'])}
'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'], data['word'])}
# write morphology of mylang in m.json
@@ -219,7 +223,7 @@ def print_gf_code(data, i):
cats = {
'name': ('PN', 7),
'noun': ('N', 11),
'adj': ('A', 12),
'adj': ('A', 13),
'verb': ('V', 12)
}
pos = data[lemma]['pos']
@@ -230,7 +234,10 @@ def print_gf_code(data, i):
else:
s = '{'
for f in fs:
s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
if f == 'gender':
s += f + ' = P.' + str(fs[f]) + ' ; '
else:
s += f + ' = ' + '"' + str(fs[f]) + '"' + ' ; '
return s[:-3] + '}' # removing last ;
if pos in cats: