Ara: improving Adj inflection by identifying fcl patterns from concrete forms

This commit is contained in:
aarneranta
2023-09-20 16:05:46 +02:00
parent 2419931105
commit fdd7c9641e
4 changed files with 65 additions and 8 deletions

View File

@@ -153,7 +153,8 @@ oper
w + "ف" + x + "ع" + y + "ل" + z
=> { h = w ; m1 = x; m2 = y; t = z} ;
w + "ف" + x + ("ع"|"ل") + y
=> { h = w ; m1 = x; m2 = ""; t = y}
=> { h = w ; m1 = x; m2 = ""; t = y} ;
_ => Predef.error("cannot get FCL pattern from" ++ pat)
} ;
--opers to interdigitize (make words out of roots and patterns):
@@ -204,7 +205,8 @@ oper
=> mkAssimilated pat (mkRoot3 rS) ;
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
_=> error rS ---- AR error "expected 3--6"
}
} ;
_ => Predef.error("cannot get FCL pattern from" ++ pS)
};
-----------------------------------------------------------------------------

View File

@@ -898,12 +898,30 @@ oper
= \r -> mkA r.root ;
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
= \r -> mkA r.root r.sg_patt r.pl_patt ;
mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
= \r -> mkA r.root r.sg_patt r.pl_patt ;
mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
mkA : {masc_sg, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
= \r -> mkA r.root ; ----
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
= \r -> mkA r.root ; ----
mkA : {masc_sg, fem_sg, root : Str} -> A
= \r -> mkA r.root ; ----
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
mkA : {masc_sg : Str; fem_sg : Str} -> A
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
@@ -914,8 +932,14 @@ oper
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
-- root comes first, then the singular and plural patterns, as in the
-- other overloads that carry both sg_patt and pl_patt
= \r -> mkA r.root r.sg_patt r.pl_patt ;
mkA : {masc_sg : Str; masc_pl : Str} -> A
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
= \r -> mkA r.masc_sg ; ----
mkA : {masc_sg : Str; root : Str} -> A
= \r -> mkA r.root ;
mkA : {masc_sg : Str} -> A

View File

@@ -0,0 +1,7 @@
# Default target: runs read_wiktionary.py in its gf-abs, gf-cnc and gf-map
# modes, redirecting each output to a file, then builds MorphoDictAra.gf with
# `gf -make`, runs the eval-funs mode into 1-eval.txt, and finally runs
# to_wordnet.py.
# NOTE(review): the last output file is spelled "wornet-arabic.jsonl";
# possibly a typo for "wordnet" -- confirm before renaming, other tools may
# already read this name.
all:
python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
gf -make MorphoDictAra.gf
python3 read_wiktionary.py eval-funs >1-eval.txt
python3 to_wordnet.py >wornet-arabic.jsonl

View File

@@ -122,7 +122,7 @@ buckwalter_dict = {
0x638: 'Z', # ظ
0x639: 'E', # ع
0x63a: 'g', # غ
0x641: 'f', # ف
0x641: 'f', # ف
0x642: 'q', # ق
0x643: 'k', # ك
0x644: 'l', # ل
@@ -144,6 +144,7 @@ buckwalter_dict = {
0x671: '{' # ٱ
}
# Inverse of buckwalter_dict: maps each Buckwalter symbol back to the Arabic
# character whose code point is the corresponding key of buckwalter_dict.
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
# Characters U+064B..U+0650: the three tanween marks (fathatan, dammatan,
# kasratan) followed by the short-vowel marks fatha, damma and kasra.
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
@@ -184,6 +185,24 @@ def get_sound_trigram_root(s):
else:
return None
# reverse engineer fcl pattern from a given form, with a sound trigram root
# one more condition: each of the root letters occurs exactly once
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Derive an fcl pattern from a concrete form with a sound trigram root.

    The root is obtained from ``get_sound_trigram_root(s)``.  Each of its
    three letters is located in ``s`` (left to right) and replaced by the
    placeholder letters U+0641, U+0639 and U+0644 respectively; all other
    characters of ``s`` are kept unchanged.

    Returns the pattern string, or ``None`` when no root is found, when the
    root does not have exactly three letters, when any root letter does not
    occur exactly once in ``s``, or when the letters do not occur in root
    order.
    """
    root = get_sound_trigram_root(s)
    if not root or len(root) != 3:
        return None
    # Each root letter must occur exactly once; otherwise it is ambiguous
    # which occurrence is the radical and which belongs to the pattern.
    if any(s.count(radical) != 1 for radical in root):
        return None
    placeholders = (chr(0x641), chr(0x639), chr(0x644))
    pattern = list(s)
    start = 0
    for radical, placeholder in zip(root, placeholders):
        pos = s.find(radical, start)
        if pos < 0:
            # Radicals are not in root order: no reliable pattern.
            return None
        pattern[pos] = placeholder
        start = pos + 1
    return ''.join(pattern)
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
@@ -324,7 +343,12 @@ def forms_for_pos(obj):
'fem_pl': [form for form, descr in forms
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
}
}
}
for patt in ['masc_sg', 'masc_pl']:
if patt in gf_entry['args']:
if form := gf_entry['args'][patt]:
if spatt := get_sound_fcl_pattern(form[0]):
gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt
else:
gf_entry = {f: d for f, d in forms}