mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-28 01:18:57 -06:00
improving evaluation of wiktionary generated lexicon
This commit is contained in:
@@ -69,6 +69,8 @@ PGF_FILE = 'MorphoDictAraAbs.pgf'
|
|||||||
# module to linearize with
|
# module to linearize with
|
||||||
CONCRETE_MODULE = 'MorphoDictAra'
|
CONCRETE_MODULE = 'MorphoDictAra'
|
||||||
|
|
||||||
|
# concrete syntax file, to debug sources of linearizations
|
||||||
|
CONCRETE_FILE = CONCRETE_MODULE + '.gf'
|
||||||
|
|
||||||
# read a gzipped jsonl file (one object per line),
|
# read a gzipped jsonl file (one object per line),
|
||||||
# showing lines where one of a list of languages is present
|
# showing lines where one of a list of languages is present
|
||||||
@@ -144,6 +146,9 @@ buckwalter_dict = {
|
|||||||
|
|
||||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||||
|
|
||||||
|
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||||
|
|
||||||
|
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
|
||||||
|
|
||||||
def to_buckwalter(s):
    """Transliterate ``s`` one character at a time through ``buckwalter_dict``.

    Each character is looked up by its code point; a character with no
    entry in the table is copied through unchanged.
    """
    pieces = []
    for ch in s:
        pieces.append(buckwalter_dict.get(ord(ch), ch))
    return ''.join(pieces)
|
||||||
@@ -157,12 +162,28 @@ def unvocalize(s):
|
|||||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
||||||
|
|
||||||
|
|
||||||
|
def drop_final_vowel(s):
    """Return ``s`` without its last character if that character is a vowel mark.

    The last character is dropped only when it is a member of
    ``arabic_vowels``; otherwise ``s`` is returned unchanged. An empty
    string is returned as-is rather than raising ``IndexError``.
    """
    # test emptiness first: indexing s[-1] on '' would raise IndexError
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
|
||||||
|
|
||||||
|
|
||||||
def is_arabic(s):
    """Tell whether ``s`` contains at least one character in U+0626..U+0650.

    Returns a real ``bool``: ``False`` for empty or ``None`` input (the
    previous ``s and any(...)`` form leaked the falsy argument itself),
    ``True`` as soon as any character's code point lies in the range.
    """
    # 0x626..0x650 are the same bounds as the former decimals 1574..1616
    return bool(s) and any(0x626 <= ord(c) <= 0x650 for c in s)
|
||||||
|
|
||||||
def normal(s):
    """Return ``s`` converted to Unicode normalization form NFD."""
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed
|
||||||
|
|
||||||
|
# heuristic for finding the three radicals from certain forms
|
||||||
|
# works only for sound (strong) 3-radical roots, otherwise None
|
||||||
|
def get_sound_trigram_root(s):
    """Return the three sound consonants of ``s`` as one string, else ``None``.

    Characters of ``s`` that are members of ``sound_consonants`` are kept
    in their original order. The result is returned only when exactly
    three such characters are present.
    """
    radicals = ''.join(ch for ch in s if ch in sound_consonants)
    return radicals if len(radicals) == 3 else None
|
||||||
|
|
||||||
|
|
||||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||||
# GF uses shadda+vowel which is linguistically correct
|
# GF uses shadda+vowel which is linguistically correct
|
||||||
@@ -216,18 +237,18 @@ arabic_rgl_features = {
|
|||||||
'Gen': 'genitive',
|
'Gen': 'genitive',
|
||||||
# 'Bare':
|
# 'Bare':
|
||||||
# 'Dat':
|
# 'Dat':
|
||||||
'Const': 'construct',
|
'Const': 'construct'
|
||||||
# 'Poss':
|
# 'Poss':
|
||||||
#A: also N features
|
#A: also N features; degree features cannot be found
|
||||||
'APosit': 'positive',
|
# 'APosit': 'positive',
|
||||||
'AComp': 'comparative'
|
# 'AComp': 'comparative'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# the inflection forms in a wiktionary entry
|
# the inflection forms in a wiktionary entry
|
||||||
def wikt_forms_from_obj(obj):
|
def wikt_forms_from_obj(obj):
|
||||||
forms = {
|
forms = {
|
||||||
form['form']:
|
reorder_shadda(form['form']):
|
||||||
form.get('tags', []) for
|
form.get('tags', []) for
|
||||||
form in obj.get('forms', []) if
|
form in obj.get('forms', []) if
|
||||||
'romanization' not in form.get('tags', []) and
|
'romanization' not in form.get('tags', []) and
|
||||||
@@ -249,9 +270,9 @@ def forms_for_pos(obj):
|
|||||||
dforms = wikt_forms_from_obj(obj)
|
dforms = wikt_forms_from_obj(obj)
|
||||||
forms = dforms.items()
|
forms = dforms.items()
|
||||||
if obj['pos'] == 'noun':
|
if obj['pos'] == 'noun':
|
||||||
lemma = [form[:-1] for form, descr in forms
|
lemma = [drop_final_vowel(form) for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
||||||
plural = [form[:-1] for form, descr in forms
|
plural = [drop_final_vowel(form) for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
||||||
gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
|
gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
|
||||||
else (['masc'] if 'Arabic masculine nouns' in obj['categories']
|
else (['masc'] if 'Arabic masculine nouns' in obj['categories']
|
||||||
@@ -312,8 +333,11 @@ def forms_for_pos(obj):
|
|||||||
gf_entry['lemma'] = gf_entry['lemma'][0]
|
gf_entry['lemma'] = gf_entry['lemma'][0]
|
||||||
if 'root' in dforms:
|
if 'root' in dforms:
|
||||||
gf_entry['args']['root'] = [dforms['root']]
|
gf_entry['args']['root'] = [dforms['root']]
|
||||||
args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
|
elif root := get_sound_trigram_root(gf_entry['lemma']):
|
||||||
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(sorted(args)) + '}'
|
gf_entry['args']['root'] = [root]
|
||||||
|
args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
|
||||||
|
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
|
||||||
|
gf_entry['labels'] = ','.join([r for r, v in args])
|
||||||
|
|
||||||
return gf_entry
|
return gf_entry
|
||||||
|
|
||||||
@@ -367,6 +391,7 @@ if MODE.startswith('gf') or MODE=='json':
|
|||||||
if lemma:
|
if lemma:
|
||||||
cat = entry['forms']['cat']
|
cat = entry['forms']['cat']
|
||||||
lin = entry['forms']['lin']
|
lin = entry['forms']['lin']
|
||||||
|
labels = entry['forms']['labels']
|
||||||
discrim = seen_gf_funs.get((lemma, cat), 0)
|
discrim = seen_gf_funs.get((lemma, cat), 0)
|
||||||
fun = gf_fun(lemma, cat, discrim)
|
fun = gf_fun(lemma, cat, discrim)
|
||||||
|
|
||||||
@@ -380,7 +405,9 @@ if MODE.startswith('gf') or MODE=='json':
|
|||||||
|
|
||||||
# function-source map, save in source_of_MorphoDictAra.jsonl
|
# function-source map, save in source_of_MorphoDictAra.jsonl
|
||||||
elif MODE == 'gf-map':
|
elif MODE == 'gf-map':
|
||||||
mapitem = {'fun': fun, 'source': wikt_forms_from_obj(obj)}
|
source = wikt_forms_from_obj(obj)
|
||||||
|
source['gf_labels'] = labels
|
||||||
|
mapitem = {'fun': fun, 'source': source}
|
||||||
print(json.dumps(mapitem, ensure_ascii=False))
|
print(json.dumps(mapitem, ensure_ascii=False))
|
||||||
|
|
||||||
seen_gf_funs[(lemma, cat)] = discrim + 1 # next word_d_C will get a new number
|
seen_gf_funs[(lemma, cat)] = discrim + 1 # next word_d_C will get a new number
|
||||||
@@ -399,6 +426,7 @@ if MODE in ['gf-abs', 'gf-cnc']:
|
|||||||
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
|
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
|
||||||
# coming from pgf tabularLinearize
|
# coming from pgf tabularLinearize
|
||||||
|
|
||||||
|
# compare the table for one function, returning a report as a dict
|
||||||
def compare_tables(gf, wikt, fun, show_buckwalter=True):
|
def compare_tables(gf, wikt, fun, show_buckwalter=True):
|
||||||
report = {}
|
report = {}
|
||||||
for pair in gf.items():
|
for pair in gf.items():
|
||||||
@@ -412,7 +440,7 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
|
|||||||
wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
|
wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
|
||||||
wikt_form = None
|
wikt_form = None
|
||||||
wikt_descr = None
|
wikt_descr = None
|
||||||
for form, descr in wikt:
|
for form, descr in wikt.items():
|
||||||
if all([tag in descr for tag in wikt_tags]):
|
if all([tag in descr for tag in wikt_tags]):
|
||||||
wikt_form = reorder_shadda(form)
|
wikt_form = reorder_shadda(form)
|
||||||
wikt_descr = descr
|
wikt_descr = descr
|
||||||
@@ -424,84 +452,90 @@ def compare_tables(gf, wikt, fun, show_buckwalter=True):
|
|||||||
'wikt_descr': wikt_descr
|
'wikt_descr': wikt_descr
|
||||||
}
|
}
|
||||||
if show_buckwalter:
|
if show_buckwalter:
|
||||||
report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None,
|
report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
|
||||||
report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None,
|
report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
|
||||||
if wikt_form:
|
if wikt_form:
|
||||||
report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
|
report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
|
||||||
report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
|
report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
|
||||||
ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items
|
ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items
|
||||||
report['fun'] = fun
|
report['fun'] = fun
|
||||||
|
report['labels'] = wikt['gf_labels']
|
||||||
report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ])
|
report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ])
|
||||||
report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
|
report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
|
||||||
report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
|
report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
|
||||||
def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
|
# with a given grammar and function, prepare input for compare_tables
|
||||||
lang = gr.languages[CONCRETE_MODULE]
|
# and produce a report, possibly summarizing it
|
||||||
funs = gr.functions
|
def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
|
||||||
reports = []
|
if fun not in gr.functions:
|
||||||
for fun in funs:
|
print(fun, 'not found in grammar')
|
||||||
funn = "'" + fun + "'"
|
return
|
||||||
if funn not in funmap:
|
gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items()
|
||||||
print(funn, 'not found')
|
if p.startswith('s ')} # require the s field, exclude s2
|
||||||
continue
|
report = compare_tables(gf, wikt, fun)
|
||||||
wikt = funmap[funn].items()
|
if verbose:
|
||||||
gf = lang.tabularLinearize(pgf.Expr(fun, []))
|
return report
|
||||||
report = compare_tables(gf, wikt, fun)
|
else:
|
||||||
reports.append(report)
|
if report['total_found'] == 0:
|
||||||
return reports
|
verdict = 'NOT_FOUND'
|
||||||
|
flaws = False
|
||||||
|
elif report['total_found'] == report['total_voc']:
|
||||||
|
verdict = 'PERFECT'
|
||||||
|
flaws = False
|
||||||
|
elif report['total_found'] == report['total_unvoc']:
|
||||||
|
verdict = 'PERFECT_UNVOC'
|
||||||
|
flaws = True
|
||||||
|
elif report['total_voc'] == 0:
|
||||||
|
verdict = 'TOTALLY_WRONG'
|
||||||
|
flaws = True
|
||||||
|
else:
|
||||||
|
verdict = 'PARTIAL'
|
||||||
|
flaws = True
|
||||||
|
summary = {
|
||||||
|
'fun': report['fun'],
|
||||||
|
'forms': report['total_found'],
|
||||||
|
'voc': report['total_voc'],
|
||||||
|
'unvoc': report['total_unvoc'],
|
||||||
|
'verdict': verdict,
|
||||||
|
'labels': report['labels']
|
||||||
|
}
|
||||||
|
|
||||||
|
if flaws:
|
||||||
|
for f, v in report.items():
|
||||||
|
if v.get('voc_match', 1) == 0:
|
||||||
|
summary['first_error'] = v
|
||||||
|
break
|
||||||
|
return summary
|
||||||
|
|
||||||
|
|
||||||
# in the summary report: print the first error if anything gets wrong
|
def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
|
||||||
def first_error(report):
|
gr = pgf.readPGF(pgffile)
|
||||||
for f, v in report.items():
|
concrete = gr.languages[concretename]
|
||||||
if 'voc_match' in v:
|
|
||||||
if v['voc_match'] == 0:
|
|
||||||
return f, v
|
|
||||||
|
|
||||||
|
totals = {'A': {}, 'N': {}, 'V': {}}
|
||||||
|
|
||||||
# having stored the Wiktionary object for each GF function
|
with open(mapfile) as file:
|
||||||
# read it back from a file
|
|
||||||
def read_function_source_map():
|
|
||||||
with open(FUNCTION_SOURCE_MAP) as file:
|
|
||||||
sourcemap = {}
|
|
||||||
for line in file:
|
for line in file:
|
||||||
try:
|
obj = json.loads(line)
|
||||||
obj = json.loads(line)
|
fun = obj['fun'][1:-1]
|
||||||
sourcemap[obj['fun']] = obj['source']
|
report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
|
||||||
except:
|
|
||||||
continue
|
cat = fun[-1]
|
||||||
return sourcemap
|
if 'verdict' in report:
|
||||||
|
rep = report['verdict']
|
||||||
|
totals[cat][rep] = totals[cat].get(rep, 0) + 1
|
||||||
|
|
||||||
|
if show:
|
||||||
|
print(report)
|
||||||
|
|
||||||
|
print(totals)
|
||||||
|
|
||||||
|
|
||||||
if MODE.startswith('eval'):
|
if MODE.startswith('eval'):
|
||||||
gr = pgf.readPGF(PGF_FILE)
|
verbose = MODE=='eval-verbose'
|
||||||
print('using', PGF_FILE)
|
show = verbose or MODE=='eval-funs'
|
||||||
funmap = read_function_source_map()
|
eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show, verbose)
|
||||||
print(len(funmap), 'functions')
|
|
||||||
for report in eval_all(gr, funmap):
|
|
||||||
|
|
||||||
if MODE == 'eval-verbose':
|
|
||||||
for line in report.items():
|
|
||||||
print(line)
|
|
||||||
if MODE == 'eval-tables':
|
|
||||||
for gftags, value in report.items():
|
|
||||||
if v := value['wikt_form']:
|
|
||||||
print(' ', value['gf_params'][2:], '=>', '"' + v + '" ;')
|
|
||||||
else:
|
|
||||||
if report['total_found'] == 0:
|
|
||||||
verdict = 'NOT_FOUND'
|
|
||||||
elif report['total_found'] == report['total_voc']:
|
|
||||||
verdict = 'PERFECT'
|
|
||||||
elif report['total_found'] == report['total_unvoc']:
|
|
||||||
verdict = 'PERFECT_UNVOC ' + str(first_error(report))
|
|
||||||
elif report['total_voc'] == 0:
|
|
||||||
verdict = 'TOTALLY_WRONG ' + str(first_error(report))
|
|
||||||
else:
|
|
||||||
verdict = 'PARTIAL ' + str(first_error(report))
|
|
||||||
print(report['fun'], 'forms', report['total_found'],
|
|
||||||
'voc', report['total_voc'], 'unvoc', report['total_unvoc'],
|
|
||||||
verdict
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user