From 3eac1b9d0c685c80c75c0dfba10ee61f6d4aab18 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Fri, 23 Sep 2022 15:54:43 +0200 Subject: [PATCH] adjective sound changes and extraction from wiktionary --- src/croatian/ResHrv.gf | 140 ++++++------------------ src/croatian/gold-test.txt | 168 +++++++++++++++++++++++++++++ src/croatian/testHrv.gfs | 3 + src/croatian/wiktionary/extract.py | 46 +++++++- 4 files changed, 248 insertions(+), 109 deletions(-) diff --git a/src/croatian/ResHrv.gf b/src/croatian/ResHrv.gf index f534cab1..132350c2 100644 --- a/src/croatian/ResHrv.gf +++ b/src/croatian/ResHrv.gf @@ -49,6 +49,15 @@ palatalize : Str -> Str = \s -> case s of { _ => s } ; +voicing : Str -> Str = \s -> case s of { + x + "b" => x + "p" ; + x + "d" => x + "t" ; + x + "đ" => x + "ć" ; + x + "z" => x + "s" ; + x + "dž" => x + "č" ; + x + "ž" => x + "š" ; + _ => s + } ; --------------- -- Nouns --------------- @@ -380,117 +389,32 @@ adjFormsAdjective : AdjForms -> Adjective = \afs -> { -} velikA : Str -> AdjForms = \velik -> - { + let + velk : Str = case velik of { + vel + "stan" => vel + "sn" ; + vel + "ao" => vel + "l" ; + vel + "ak" => voicing vel + "k" ; + vel + "a" + k@? => vel + k ; + _ => velik + } + in { msnom = velik ; - fsnom = velik + "a" ; - nsnom = velik + "o" ; - msgen = velik + "og" ; - fsgen = velik + "e" ; - msdat = velik + "omu" ; - fsdat = velik + "oj" ; - fsacc = velik + "u" ; - msloc = velik + "om" ; - msins = velik + "im" ; - mpnom = velik + "i" ; - pgen = velik + "ih" ; + fsnom = velk + "a" ; + nsnom = ifSoft velik + (velk + "e") + (velk + "o") ; + msgen = velk + "og" ; + fsgen = velk + "e" ; + msdat = velk + "omu" ; + fsdat = velk + "oj" ; + fsacc = velk + "u" ; + msloc = velk + "om" ; + msins = velk + "im" ; + mpnom = velk + "i" ; + pgen = velk + "ih" ; } ; -{- --- if the penultimate has accent, e.g. krásny, the last accent disappears - krasnyA : Str -> AdjForms = \krasny -> - let - krasn = init krasny ; - in peknyA krasny ** { - msnom = krasn + "y" ; - fsnom = krasn + "a" ; - nsnom = krasn + "e" ; - msgen = krasn + "eho" ; - msdat = krasn + "emu" ; - fsacc = krasn + "u" ; - msins = krasn + "ym" ; - ampnom = krasn + "i" ; - pgen = krasn + "ych" ; - pins = krasn + "ymi" ; - } ; - --- soft consonant + i - - cudziA : Str -> AdjForms = \cudzi -> - let - cudz = init cudzi ; - pcudz = palatal cudz ; - in { - msnom = pcudz + "í" ; - fsnom = pcudz + "ia" ; - nsnom = pcudz + "ie" ; - msgen = pcudz + "ieho" ; - fsgen = pcudz + "ej" ; - msdat = pcudz + "iemu" ; - fsacc = pcudz + "iu" ; - msloc = cudz + "om" ; - msins = pcudz + "ím" ; - fsins = cudz + "ou" ; - ampnom = pcudz + "í" ; - pgen = pcudz + "ích" ; - pins = pcudz + "ími" ; - } ; - --- accented vowel + soft consonant + i - rydziA : Str -> AdjForms = \rydzi -> - let - rydz = init rydzi ; - prydz = palatal rydz ; - in peknyA rydzi ** { - msnom = prydz + "i" ; - fsnom = rydz + "a" ; - nsnom = prydz + "e" ; - msgen = prydz + "eho" ; - msdat = prydz + "emu" ; - fsacc = rydz + "u" ; - msins = prydz + "im" ; - ampnom = prydz + "i" ; - pgen = prydz + "ich" ; - pins = prydz + "imi" ; - } ; - --- masculine possession: the same endings as in feminine - - otcovA : Str -> AdjForms = \otcov -> - { - msnom = otcov ; - fsnom = otcov + "a" ; - nsnom = otcov + "o" ; - msgen = otcov + "ho" ; - fsgen = otcov + "ej" ; - msdat = otcov + "mu" ; - fsacc = otcov + "u" ; - msloc = otcov + "om" ; - msins = otcov + "ým" ; - fsins = otcov + "ou" ; - ampnom = otcov + "i" ; - pgen = otcov + "ých" ; - pins = otcov + "ými" ; - } ; - - paviA : Str -> AdjForms = \pavi -> - let - pav = init pavi ; - in { - msnom = pav + "í" ; - fsnom = pav + "ia" ; - nsnom = pav + "ie" ; - msgen = pav + "ieho" ; - fsgen = pav + "ej" ; - msdat = pav + "iemu" ; - fsacc = pav + "iu" ; - msloc = pav + "om" ; - msins = pav + "ím" ; - fsins = pav + "ou" ; - ampnom = pav + "í" ; ---- - pgen = pav + "ich" ; ---- - pins = pav + "imi" ; ---- - } ; - +{- --------------------- -- Verbs -- https://en.wikipedia.org/wiki/Slovak_language#Verbs diff --git a/src/croatian/gold-test.txt b/src/croatian/gold-test.txt index de5e795c..e3293e86 100644 --- a/src/croatian/gold-test.txt +++ b/src/croatian/gold-test.txt @@ -384,3 +384,171 @@ s . Neutr => Pl => Acc => velika s . Neutr => Pl => Voc => velika s . Neutr => Pl => Loc => velikim s . Neutr => Pl => Ins => velikim +s . Masc Anim => Sg => Nom => mastan +s . Masc Anim => Sg => Gen => masnog +s . Masc Anim => Sg => Dat => masnomu +s . Masc Anim => Sg => Acc => masnog +s . Masc Anim => Sg => Voc => mastan +s . Masc Anim => Sg => Loc => masnom +s . Masc Anim => Sg => Ins => masnim +s . Masc Anim => Pl => Nom => masni +s . Masc Anim => Pl => Gen => masnih +s . Masc Anim => Pl => Dat => masnim +s . Masc Anim => Pl => Acc => masne +s . Masc Anim => Pl => Voc => masni +s . Masc Anim => Pl => Loc => masnim +s . Masc Anim => Pl => Ins => masnim +s . Masc Inanim => Sg => Nom => mastan +s . Masc Inanim => Sg => Gen => masnog +s . Masc Inanim => Sg => Dat => masnomu +s . Masc Inanim => Sg => Acc => mastan +s . Masc Inanim => Sg => Voc => mastan +s . Masc Inanim => Sg => Loc => masnom +s . Masc Inanim => Sg => Ins => masnim +s . Masc Inanim => Pl => Nom => masni +s . Masc Inanim => Pl => Gen => masnih +s . Masc Inanim => Pl => Dat => masnim +s . Masc Inanim => Pl => Acc => masne +s . Masc Inanim => Pl => Voc => masni +s . Masc Inanim => Pl => Loc => masnim +s . Masc Inanim => Pl => Ins => masnim +s . Fem => Sg => Nom => masna +s . Fem => Sg => Gen => masne +s . Fem => Sg => Dat => masnoj +s . Fem => Sg => Acc => masnu +s . Fem => Sg => Voc => masna +s . Fem => Sg => Loc => masnoj +s . Fem => Sg => Ins => masnom +s . Fem => Pl => Nom => masne +s . Fem => Pl => Gen => masnih +s . Fem => Pl => Dat => masnim +s . Fem => Pl => Acc => masne +s . Fem => Pl => Voc => masne +s . Fem => Pl => Loc => masnim +s . Fem => Pl => Ins => masnim +s . Neutr => Sg => Nom => masno +s . Neutr => Sg => Gen => masnog +s . Neutr => Sg => Dat => masnomu +s . Neutr => Sg => Acc => masno +s . Neutr => Sg => Voc => masno +s . Neutr => Sg => Loc => masnom +s . Neutr => Sg => Ins => masnim +s . Neutr => Pl => Nom => masna +s . Neutr => Pl => Gen => masnih +s . Neutr => Pl => Dat => masnim +s . Neutr => Pl => Acc => masna +s . Neutr => Pl => Voc => masna +s . Neutr => Pl => Loc => masnim +s . Neutr => Pl => Ins => masnim +s . Masc Anim => Sg => Nom => gladan +s . Masc Anim => Sg => Gen => gladnog +s . Masc Anim => Sg => Dat => gladnomu +s . Masc Anim => Sg => Acc => gladnog +s . Masc Anim => Sg => Voc => gladan +s . Masc Anim => Sg => Loc => gladnom +s . Masc Anim => Sg => Ins => gladnim +s . Masc Anim => Pl => Nom => gladni +s . Masc Anim => Pl => Gen => gladnih +s . Masc Anim => Pl => Dat => gladnim +s . Masc Anim => Pl => Acc => gladne +s . Masc Anim => Pl => Voc => gladni +s . Masc Anim => Pl => Loc => gladnim +s . Masc Anim => Pl => Ins => gladnim +s . Masc Inanim => Sg => Nom => gladan +s . Masc Inanim => Sg => Gen => gladnog +s . Masc Inanim => Sg => Dat => gladnomu +s . Masc Inanim => Sg => Acc => gladan +s . Masc Inanim => Sg => Voc => gladan +s . Masc Inanim => Sg => Loc => gladnom +s . Masc Inanim => Sg => Ins => gladnim +s . Masc Inanim => Pl => Nom => gladni +s . Masc Inanim => Pl => Gen => gladnih +s . Masc Inanim => Pl => Dat => gladnim +s . Masc Inanim => Pl => Acc => gladne +s . Masc Inanim => Pl => Voc => gladni +s . Masc Inanim => Pl => Loc => gladnim +s . Masc Inanim => Pl => Ins => gladnim +s . Fem => Sg => Nom => gladna +s . Fem => Sg => Gen => gladne +s . Fem => Sg => Dat => gladnoj +s . Fem => Sg => Acc => gladnu +s . Fem => Sg => Voc => gladna +s . Fem => Sg => Loc => gladnoj +s . Fem => Sg => Ins => gladnom +s . Fem => Pl => Nom => gladne +s . Fem => Pl => Gen => gladnih +s . Fem => Pl => Dat => gladnim +s . Fem => Pl => Acc => gladne +s . Fem => Pl => Voc => gladne +s . Fem => Pl => Loc => gladnim +s . Fem => Pl => Ins => gladnim +s . Neutr => Sg => Nom => gladno +s . Neutr => Sg => Gen => gladnog +s . Neutr => Sg => Dat => gladnomu +s . Neutr => Sg => Acc => gladno +s . Neutr => Sg => Voc => gladno +s . Neutr => Sg => Loc => gladnom +s . Neutr => Sg => Ins => gladnim +s . Neutr => Pl => Nom => gladna +s . Neutr => Pl => Gen => gladnih +s . Neutr => Pl => Dat => gladnim +s . Neutr => Pl => Acc => gladna +s . Neutr => Pl => Voc => gladna +s . Neutr => Pl => Loc => gladnim +s . Neutr => Pl => Ins => gladnim +s . Masc Anim => Sg => Nom => nizak +s . Masc Anim => Sg => Gen => niskog +s . Masc Anim => Sg => Dat => niskomu +s . Masc Anim => Sg => Acc => niskog +s . Masc Anim => Sg => Voc => nizak +s . Masc Anim => Sg => Loc => niskom +s . Masc Anim => Sg => Ins => niskim +s . Masc Anim => Pl => Nom => niski +s . Masc Anim => Pl => Gen => niskih +s . Masc Anim => Pl => Dat => niskim +s . Masc Anim => Pl => Acc => niske +s . Masc Anim => Pl => Voc => niski +s . Masc Anim => Pl => Loc => niskim +s . Masc Anim => Pl => Ins => niskim +s . Masc Inanim => Sg => Nom => nizak +s . Masc Inanim => Sg => Gen => niskog +s . Masc Inanim => Sg => Dat => niskomu +s . Masc Inanim => Sg => Acc => nizak +s . Masc Inanim => Sg => Voc => nizak +s . Masc Inanim => Sg => Loc => niskom +s . Masc Inanim => Sg => Ins => niskim +s . Masc Inanim => Pl => Nom => niski +s . Masc Inanim => Pl => Gen => niskih +s . Masc Inanim => Pl => Dat => niskim +s . Masc Inanim => Pl => Acc => niske +s . Masc Inanim => Pl => Voc => niski +s . Masc Inanim => Pl => Loc => niskim +s . Masc Inanim => Pl => Ins => niskim +s . Fem => Sg => Nom => niska +s . Fem => Sg => Gen => niske +s . Fem => Sg => Dat => niskoj +s . Fem => Sg => Acc => nisku +s . Fem => Sg => Voc => niska +s . Fem => Sg => Loc => niskoj +s . Fem => Sg => Ins => niskom +s . Fem => Pl => Nom => niske +s . Fem => Pl => Gen => niskih +s . Fem => Pl => Dat => niskim +s . Fem => Pl => Acc => niske +s . Fem => Pl => Voc => niske +s . Fem => Pl => Loc => niskim +s . Fem => Pl => Ins => niskim +s . Neutr => Sg => Nom => nisko +s . Neutr => Sg => Gen => niskog +s . Neutr => Sg => Dat => niskomu +s . Neutr => Sg => Acc => nisko +s . Neutr => Sg => Voc => nisko +s . Neutr => Sg => Loc => niskom +s . Neutr => Sg => Ins => niskim +s . Neutr => Pl => Nom => niska +s . Neutr => Pl => Gen => niskih +s . Neutr => Pl => Dat => niskim +s . Neutr => Pl => Acc => niska +s . Neutr => Pl => Voc => niska +s . Neutr => Pl => Loc => niskim +s . Neutr => Pl => Ins => niskim diff --git a/src/croatian/testHrv.gfs b/src/croatian/testHrv.gfs index de43042f..50ed8272 100644 --- a/src/croatian/testHrv.gfs +++ b/src/croatian/testHrv.gfs @@ -24,4 +24,7 @@ cc -table -unqual nounFormsNoun (poljeN "polje") neuter cc -table -unqual nounFormsNoun (zenaN "žena") feminine cc -table -unqual adjFormsAdjective (velikA "velik") +cc -table -unqual adjFormsAdjective (velikA "mastan") +cc -table -unqual adjFormsAdjective (velikA "gladan") +cc -table -unqual adjFormsAdjective (velikA "nizak") diff --git a/src/croatian/wiktionary/extract.py b/src/croatian/wiktionary/extract.py index db0b334a..0abe4cec 100644 --- a/src/croatian/wiktionary/extract.py +++ b/src/croatian/wiktionary/extract.py @@ -2,7 +2,7 @@ import json # https://kaikki.org/dictionary/rawdata.html -FILE = 'raw-wiktextract-data.json' +FILE = 'data/raw-wiktextract-data.json' MYLANG = 'Serbo-Croatian' @@ -25,6 +25,35 @@ NOUN_CASES = { } } +ADJ_CASES = { + 'masculine': { + 'singular': { + 'nominative': 'msnom', + 'genitive': 'msgen', + 'dative': 'msdat', + 'locative': 'msloc', + 'instrumental': 'msins' + }, + 'plural': { + 'nominative': 'mpnom', + 'genitive': 'pgen' + } + }, + 'feminine': { + 'singular': { + 'nominative': 'fsnom', + 'genitive': 'fsgen', + 'dative': 'fsdat', + 'accusative': 'fsacc' + } + }, + 'neuter': { + 'singular': { + 'nominative': 'nsnom' + } + } + } + def get_forms(pos, forms): @@ -40,6 +69,21 @@ def get_forms(pos, forms): for case in NOUN_CASES[num]: if case in tags: dict[NOUN_CASES[num][case]] = f['form'] + elif pos == 'adj': + print(forms) + for f in forms: + tags = f.get('tags', []) + if 'positive' in tags and 'indefinite' in tags: + for g in ADJ_CASES: + if g in tags: + for n in ADJ_CASES[g]: + if n in tags: + for c in ADJ_CASES[g][n]: + if c in tags: + dict[ADJ_CASES[g][n][c]] = f['form'] + + else: + dict['forms'] = forms[:10] #### return dict