forked from GitHub/gf-rgl
Ara: improving Adj inflection by identifying fcl patterns from concrete forms
This commit is contained in:
@@ -153,7 +153,8 @@ oper
|
|||||||
w + "ف" + x + "ع" + y + "ل" + z
|
w + "ف" + x + "ع" + y + "ل" + z
|
||||||
=> { h = w ; m1 = x; m2 = y; t = z} ;
|
=> { h = w ; m1 = x; m2 = y; t = z} ;
|
||||||
w + "ف" + x + ("ع"|"ل") + y
|
w + "ف" + x + ("ع"|"ل") + y
|
||||||
=> { h = w ; m1 = x; m2 = ""; t = y}
|
=> { h = w ; m1 = x; m2 = ""; t = y} ;
|
||||||
|
_ => Predef.error("cannot get FCL pattern from" ++ pat)
|
||||||
} ;
|
} ;
|
||||||
|
|
||||||
--opers to interdigitize (make words out of roots and patterns:
|
--opers to interdigitize (make words out of roots and patterns:
|
||||||
@@ -204,7 +205,8 @@ oper
|
|||||||
=> mkAssimilated pat (mkRoot3 rS) ;
|
=> mkAssimilated pat (mkRoot3 rS) ;
|
||||||
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
|
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
|
||||||
_=> error rS ---- AR error "expected 3--6"
|
_=> error rS ---- AR error "expected 3--6"
|
||||||
}
|
} ;
|
||||||
|
_ => Predef.error("cannot get FCL pattern from" ++ pS)
|
||||||
};
|
};
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -898,12 +898,30 @@ oper
|
|||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root ;
|
||||||
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
|
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
|
||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root ;
|
||||||
mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
|
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
mkA : {masc_sg, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
mkA : {masc_sg, fem_sg, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
|
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
= \r -> mkA r.masc_sg ; ----
|
= \r -> mkA r.masc_sg ; ----
|
||||||
mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
|
mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
|
||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
mkA : {masc_sg : Str; fem_sg : Str} -> A
|
mkA : {masc_sg : Str; fem_sg : Str} -> A
|
||||||
= \r -> mkA r.masc_sg ; ----
|
= \r -> mkA r.masc_sg ; ----
|
||||||
mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
|
mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
@@ -914,8 +932,14 @@ oper
|
|||||||
= \r -> mkA r.masc_sg ; ----
|
= \r -> mkA r.masc_sg ; ----
|
||||||
mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
|
mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
|
||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root ;
|
||||||
|
mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.sg_patt r.pl_patt ;
|
||||||
mkA : {masc_sg : Str; masc_pl : Str} -> A
|
mkA : {masc_sg : Str; masc_pl : Str} -> A
|
||||||
= \r -> mkA r.masc_sg ; ----
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
mkA : {masc_sg : Str; root : Str} -> A
|
mkA : {masc_sg : Str; root : Str} -> A
|
||||||
= \r -> mkA r.root ;
|
= \r -> mkA r.root ;
|
||||||
mkA : {masc_sg : Str} -> A
|
mkA : {masc_sg : Str} -> A
|
||||||
|
|||||||
7
src/arabic/wiktionary/Makefile
Normal file
7
src/arabic/wiktionary/Makefile
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
all:
|
||||||
|
python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
|
||||||
|
python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
|
||||||
|
python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
|
||||||
|
gf -make MorphoDictAra.gf
|
||||||
|
python3 read_wiktionary.py eval-funs >1-eval.txt
|
||||||
|
python3 to_wordnet.py >wornet-arabic.jsonl
|
||||||
@@ -144,6 +144,7 @@ buckwalter_dict = {
|
|||||||
0x671: '{' # ٱ
|
0x671: '{' # ٱ
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||||
|
|
||||||
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||||
@@ -185,6 +186,24 @@ def get_sound_trigram_root(s):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
||||||
|
# one more condition: each of the root letters occurs exactly once
|
||||||
|
# TODO: better use the given root of the lex entry
|
||||||
|
def get_sound_fcl_pattern(s):
    """Reverse-engineer the fcl pattern of the word form `s`.

    The root of `s` is obtained from `get_sound_trigram_root`; its three
    letters are then replaced, in order of occurrence, by the placeholder
    letters U+0641 (fa), U+0639 (ayn) and U+0644 (lam). All other characters
    of `s` are kept unchanged.

    Returns the pattern string, or None when no root is found, when the root
    does not have exactly three letters, when some root letter does not occur
    exactly once in `s`, or when the root letters do not appear in `s` in
    root order.

    NOTE(review): assumes `get_sound_trigram_root` returns a sequence of
    single-character strings or a falsy value -- confirm against its
    definition.
    """
    root = get_sound_trigram_root(s)
    if not root or len(root) != 3:
        return None

    # Each root letter must occur exactly once in the form; otherwise the
    # positions to substitute are ambiguous. (The earlier check only measured
    # the length of a list of booleans, so it never rejected anything.)
    if any(s.count(c) != 1 for c in root):
        return None

    placeholders = (chr(0x641), chr(0x639), chr(0x644))
    p = list(s)
    start = 0
    for letter, placeholder in zip(root, placeholders):
        r = s.find(letter, start)
        if r < 0:
            # The letters are present but not in root order: give up rather
            # than overwrite a placeholder that was already written.
            return None
        p[r] = placeholder
        start = r + 1

    return ''.join(p)
|
||||||
|
|
||||||
|
|
||||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||||
# GF uses shadda+vowel which is linguistically correct
|
# GF uses shadda+vowel which is linguistically correct
|
||||||
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
||||||
@@ -325,6 +344,11 @@ def forms_for_pos(obj):
|
|||||||
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
|
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for patt in ['masc_sg', 'masc_pl']:
|
||||||
|
if patt in gf_entry['args']:
|
||||||
|
if form := gf_entry['args'][patt]:
|
||||||
|
if spatt := get_sound_fcl_pattern(form[0]):
|
||||||
|
gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt
|
||||||
|
|
||||||
else:
|
else:
|
||||||
gf_entry = {f: d for f, d in forms}
|
gf_entry = {f: d for f, d in forms}
|
||||||
|
|||||||
Reference in New Issue
Block a user