mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
Ara: improving Adj inflection by identifying fcl patterns from concrete forms
This commit is contained in:
@@ -153,7 +153,8 @@ oper
|
||||
w + "ف" + x + "ع" + y + "ل" + z
|
||||
=> { h = w ; m1 = x; m2 = y; t = z} ;
|
||||
w + "ف" + x + ("ع"|"ل") + y
|
||||
=> { h = w ; m1 = x; m2 = ""; t = y}
|
||||
=> { h = w ; m1 = x; m2 = ""; t = y} ;
|
||||
_ => Predef.error("cannot get FCL pattern from" ++ pat)
|
||||
} ;
|
||||
|
||||
--opers to interdigitize (make words out of roots and patterns):
|
||||
@@ -204,7 +205,8 @@ oper
|
||||
=> mkAssimilated pat (mkRoot3 rS) ;
|
||||
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
|
||||
_=> error rS ---- AR error "expected 3--6"
|
||||
}
|
||||
} ;
|
||||
_ => Predef.error("cannot get FCL pattern from" ++ pS)
|
||||
};
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
@@ -898,12 +898,30 @@ oper
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||
mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||
mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt ;
|
||||
mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt ;
|
||||
mkA : {masc_sg, root, sg_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt ;
|
||||
mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt ;
|
||||
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
|
||||
= \r -> mkA r.root ; ----
|
||||
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
|
||||
= \r -> mkA r.root ; ----
|
||||
mkA : {masc_sg, fem_sg, root : Str} -> A
|
||||
= \r -> mkA r.root ; ----
|
||||
mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
|
||||
= \r -> mkA r.root r.sg_patt ;
|
||||
mkA : {masc_sg : Str; fem_sg : Str} -> A
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||
@@ -914,8 +932,14 @@ oper
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
|
||||
= \r -> mkA r.sg_patt r.pl_patt ;
|
||||
mkA : {masc_sg : Str; masc_pl : Str} -> A
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
|
||||
= \r -> mkA r.masc_sg ; ----
|
||||
mkA : {masc_sg : Str; root : Str} -> A
|
||||
= \r -> mkA r.root ;
|
||||
mkA : {masc_sg : Str} -> A
|
||||
|
||||
7
src/arabic/wiktionary/Makefile
Normal file
7
src/arabic/wiktionary/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
all:
|
||||
python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
|
||||
python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
|
||||
python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
|
||||
gf -make MorphoDictAra.gf
|
||||
python3 read_wiktionary.py eval-funs >1-eval.txt
|
||||
python3 to_wordnet.py >wornet-arabic.jsonl
|
||||
@@ -122,7 +122,7 @@ buckwalter_dict = {
|
||||
0x638: 'Z', # ظ
|
||||
0x639: 'E', # ع
|
||||
0x63a: 'g', # غ
|
||||
0x641: 'f', # ف
|
||||
0x641: 'f', # ف
|
||||
0x642: 'q', # ق
|
||||
0x643: 'k', # ك
|
||||
0x644: 'l', # ل
|
||||
@@ -144,6 +144,7 @@ buckwalter_dict = {
|
||||
0x671: '{' # ٱ
|
||||
}
|
||||
|
||||
|
||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||
|
||||
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||
@@ -184,6 +185,24 @@ def get_sound_trigram_root(s):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
||||
# one more condition: each of the root letters occurs exactly once
|
||||
# TODO: better use the given root of the lex entry
|
||||
def get_sound_fcl_pattern(s):
    """Return the fcl pattern of the form ``s``, or None if none can be derived.

    The root is taken from ``get_sound_trigram_root(s)``.  A pattern is only
    produced when that root has three letters, each of them occurs exactly
    once in ``s``, and they appear in ``s`` in root order.  The three root
    letters are then replaced, in order, by U+0641 (feh), U+0639 (ain) and
    U+0644 (lam); every other character of ``s`` is kept unchanged.
    """
    root = get_sound_trigram_root(s)
    if not root or len(root) != 3:
        return None

    # Each root letter must occur exactly once; otherwise it is ambiguous
    # which occurrence is the radical.  (The previous test,
    # len([c in s for c in root]) == 3, only counted the root letters and
    # never looked at the membership results.)
    if any(s.count(c) != 1 for c in root):
        return None

    chars = list(s)
    start = 0
    for radical, placeholder in zip(root, (chr(0x641), chr(0x639), chr(0x644))):
        pos = s.find(radical, start)
        if pos < 0:
            # Radical is missing after the previous one (out of order):
            # bail out instead of overwriting an unrelated character.
            return None
        chars[pos] = placeholder
        start = pos + 1
    return ''.join(chars)
|
||||
|
||||
|
||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||
# GF uses shadda+vowel which is linguistically correct
|
||||
@@ -324,7 +343,12 @@ def forms_for_pos(obj):
|
||||
'fem_pl': [form for form, descr in forms
|
||||
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
|
||||
}
|
||||
}
|
||||
}
|
||||
for patt in ['masc_sg', 'masc_pl']:
|
||||
if patt in gf_entry['args']:
|
||||
if form := gf_entry['args'][patt]:
|
||||
if spatt := get_sound_fcl_pattern(form[0]):
|
||||
gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt
|
||||
|
||||
else:
|
||||
gf_entry = {f: d for f, d in forms}
|
||||
|
||||
Reference in New Issue
Block a user