mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
Arabic Wiktionary: started comparing evaluation
This commit is contained in:
@@ -108,14 +108,76 @@ def gf_fun(s, pos, disamb=0):
|
|||||||
return ''.join(["'", s, discrim, "_", pos, "'"])
|
return ''.join(["'", s, discrim, "_", pos, "'"])
|
||||||
|
|
||||||
|
|
||||||
def forms_for_pos(obj):
|
# Maps GF RGL morphological feature names to the Wiktionary tag strings they
# are compared against. Features that are commented out have no mapping yet.
rgl_features = {
    # V
    'VPerf': 'perfective',
    'Act': 'active',
    'Pas': 'passive',
    'Per3': 'third-person',
    'Per2': 'second-person',
    'Masc': 'masculine',
    'Fem': 'feminine',
    'Sg': 'singular',
    'Pl': 'plural',
    'Dl': 'dual',
    'VImpf': 'imperfective',
    'Ind': 'indicative',
    'Cnj': 'subjunctive',
    'Jus': 'jussive',
    'VImp': 'imperative',
    # N: also Sg, Pl, Dl
    'Def': 'definite',
    'Indef': 'indefinite',
    'Nom': 'nominative',
    'Acc': 'accusative',
    'Gen': 'genitive',
    # 'Bare':
    # 'Dat':
    'Const': 'construct',
    # 'Poss':
    # A: also N features
    'APosit': 'positive',
    'AComp': 'comparative',
}
|
||||||
|
|
||||||
|
|
||||||
|
# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
def compare_tables(gf, wikt):
    """Compare forms from a GF inflection table with Wiktionary forms.

    Args:
        gf: iterable of GF table lines in the format shown above.
        wikt: re-iterable collection of ``(form, tags)`` pairs; it is
            scanned once per GF line.

    Returns:
        A dict keyed by the tuple of RGL feature names found in each line.
        Each value holds ``'gf_form'`` and ``'wikt_form'`` (``None`` when no
        Wiktionary form carries all the required tags). When a non-empty
        Wiktionary form was found it also holds the 0/1 flags ``'voc_match'``
        (exact equality) and ``'unvoc_match'`` (equality after
        ``unvocalize``). Lines with the same feature tuple overwrite each
        other, so the last one wins.
    """
    report = {}
    for line in gf:
        # The inflected form follows the last ':' of the line. Comparing the
        # whole line with a Wiktionary form could never succeed. A line
        # without ':' is used unchanged (apart from surrounding whitespace).
        gf_form = line.rpartition(':')[2].strip()
        gf_tags = tuple(
            word
            for word in line.replace('(', ' ').replace(')', ' ').split()
            if word in rgl_features
        )
        wikt_tags = {rgl_features[tag] for tag in gf_tags}
        # First Wiktionary form whose description contains every required
        # tag. NOTE: a line with no recognized feature gives an empty tag
        # set, which every description satisfies.
        wikt_form = next(
            (form for form, descr in wikt
             if all(tag in descr for tag in wikt_tags)),
            None,
        )
        entry = {
            'gf_form': gf_form,
            'wikt_form': wikt_form,
        }
        if wikt_form:
            entry['voc_match'] = int(gf_form == wikt_form)
            entry['unvoc_match'] = int(
                unvocalize(gf_form) == unvocalize(wikt_form))
        report[gf_tags] = entry
    return report
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def wikt_forms_for_pos(obj):
    """Return the ``(form, tags)`` pairs of a Wiktionary entry's forms.

    Args:
        obj: a Wiktionary entry dict. Its optional ``'forms'`` list holds
            dicts with a ``'form'`` string and an optional ``'tags'`` list.

    Returns:
        The ``items()`` view of a dict mapping each kept form string to its
        tag list (``[]`` when the form has no tags). Forms tagged
        ``'romanization'`` and forms rejected by ``is_arabic`` are dropped.
        Because the pairs are built through a dict, a form string that occurs
        more than once keeps only the tags of its last occurrence.
    """
    return {
        form['form']: form.get('tags', [])
        for form in obj.get('forms', [])
        if 'romanization' not in form.get('tags', [])
        and is_arabic(form['form'])
    }.items()
|
||||||
|
|
||||||
|
|
||||||
|
def forms_for_pos(obj):
|
||||||
|
forms = wikt_forms_for_pos(obj)
|
||||||
if obj['pos'] == 'noun':
|
if obj['pos'] == 'noun':
|
||||||
lemma = [form[:-1] for form, descr in forms
|
lemma = [form[:-1] for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
||||||
@@ -136,7 +198,8 @@ def forms_for_pos(obj):
|
|||||||
elif obj['pos'] == 'verb':
|
elif obj['pos'] == 'verb':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
if all([w in descr for
|
if all([w in descr for
|
||||||
w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
|
w in ["active", "indicative", "masculine", "past",
|
||||||
|
"perfective", "singular", "third-person"]])][:1]
|
||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'V',
|
'cat': 'V',
|
||||||
'lemma': lemma,
|
'lemma': lemma,
|
||||||
@@ -193,14 +256,16 @@ if MODE == 'gf-abs':
|
|||||||
if MODE == 'gf-cnc':
|
if MODE == 'gf-cnc':
|
||||||
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
||||||
|
|
||||||
if MODE != 'raw':
|
if MODE not in ['raw', 'eval']:
|
||||||
with open(FILTERED_WIKT) as file:
|
with open(FILTERED_WIKT) as file:
|
||||||
seen_gf_funs = {}
|
seen_gf_funs = {}
|
||||||
|
number = 1
|
||||||
for line in file:
|
for line in file:
|
||||||
try:
|
try:
|
||||||
obj = json.loads(line)
|
obj = json.loads(line)
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
|
number += 1
|
||||||
root = [find_root(t['expansion']) for
|
root = [find_root(t['expansion']) for
|
||||||
t in obj.get('etymology_templates', []) if
|
t in obj.get('etymology_templates', []) if
|
||||||
t.get('name', None) =='ar-root'][:1]
|
t.get('name', None) =='ar-root'][:1]
|
||||||
@@ -227,7 +292,7 @@ if MODE != 'raw':
|
|||||||
fun = gf_fun(lemma, cat, discrim)
|
fun = gf_fun(lemma, cat, discrim)
|
||||||
|
|
||||||
if MODE == 'gf-abs':
|
if MODE == 'gf-abs':
|
||||||
print('fun', fun, ':', cat, ';', '--', entry['senses'])
|
print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
|
||||||
if MODE == 'gf-cnc':
|
if MODE == 'gf-cnc':
|
||||||
print('lin', fun, '=', lin, ';')
|
print('lin', fun, '=', lin, ';')
|
||||||
|
|
||||||
@@ -237,3 +302,13 @@ if MODE != 'raw':
|
|||||||
|
|
||||||
if MODE.startswith('gf'):
|
if MODE.startswith('gf'):
|
||||||
print('}')
|
print('}')
|
||||||
|
|
||||||
|
|
||||||
|
if MODE == 'eval':
    # Evaluation mode: compare the GF table in 'pot.gftbl' with the
    # Wiktionary entry in 'pot.json' and print one report item per line.
    # Both files hold Arabic text, so the encoding is given explicitly
    # instead of relying on the platform default.
    with open('pot.gftbl', encoding='utf-8') as file:
        gf = [line.strip() for line in file]
    with open('pot.json', encoding='utf-8') as file:
        wikt = wikt_forms_for_pos(json.load(file))
    for line in compare_tables(gf, wikt).items():
        print(line)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user