mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-28 01:18:57 -06:00
evaluation of generated lexicon
This commit is contained in:
@@ -2,10 +2,47 @@ import gzip
|
|||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import pgf
|
||||||
|
|
||||||
|
|
||||||
# data from https://kaikki.org/dictionary/rawdata.html
|
# data from https://kaikki.org/dictionary/rawdata.html
|
||||||
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||||||
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
|
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This file converts Wiktionary data to GF morphological dictionary files.
|
||||||
|
It words for Arabic but some functionalities could be modified to other languges.
|
||||||
|
|
||||||
|
The steps to take are the following:
|
||||||
|
|
||||||
|
fetch data:
|
||||||
|
|
||||||
|
raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
|
||||||
|
|
||||||
|
filter Arabic entries:
|
||||||
|
|
||||||
|
$ python3 read_wiktionary.py raw >wikt_arabic.jsonl
|
||||||
|
|
||||||
|
create GF files:
|
||||||
|
|
||||||
|
$ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
|
||||||
|
$ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
|
||||||
|
|
||||||
|
automatic evaluation:
|
||||||
|
|
||||||
|
$ gf -make MorphoDictAra.gf
|
||||||
|
$ python3 read_wiktionary.py gf-map >function_sources_arabic.jsonl
|
||||||
|
$ python3 read_wiktionary.py eval
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
- better generation of GF
|
||||||
|
- better paradigms to use Wiktionary data
|
||||||
|
- refactor the code so that it can be used for other languages
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
MODE = ''
|
MODE = ''
|
||||||
|
|
||||||
@@ -27,13 +64,20 @@ FILTERED_WIKT = 'wikt_arabic.jsonl'
|
|||||||
# created with option gf-map
|
# created with option gf-map
|
||||||
FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
|
FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
|
||||||
|
|
||||||
|
PGF_FILE = 'MorphoDictAraAbs.pgf'
|
||||||
|
CONCRETE_MODULE = 'MorphoDictAra'
|
||||||
|
|
||||||
|
|
||||||
def read_function_source_map():
|
def read_function_source_map():
|
||||||
with open(FUNCTION_SOURCE_MAP) as file:
|
with open(FUNCTION_SOURCE_MAP) as file:
|
||||||
sourcemap = {}
|
sourcemap = {}
|
||||||
for line in file:
|
for line in file:
|
||||||
obj = json.loads(line)
|
try:
|
||||||
sourcemap[obj['fun']] = obj['source']
|
obj = json.loads(line)
|
||||||
|
sourcemap[obj['fun']] = obj['source']
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
return sourcemap
|
||||||
|
|
||||||
|
|
||||||
def get_gzip_json(file, sample=100000, langs=[]):
|
def get_gzip_json(file, sample=100000, langs=[]):
|
||||||
@@ -134,9 +178,9 @@ def reorder_shadda(s):
|
|||||||
|
|
||||||
|
|
||||||
# quote forms but not parameters
|
# quote forms but not parameters
|
||||||
def quote_if(s, cond=is_arabic):
|
def quote_if(s, cond=is_arabic, change=reorder_shadda):
|
||||||
if cond(s):
|
if cond(s):
|
||||||
return '"' + s + '"'
|
return '"' + change(s) + '"'
|
||||||
else:
|
else:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
@@ -181,14 +225,19 @@ rgl_features = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# obsolote:
|
||||||
# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
|
# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
|
||||||
# coming from 'l -treebank -table'
|
# coming from 'l -treebank -table'
|
||||||
def compare_tables(gf, wikt):
|
# now used:
|
||||||
|
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
|
||||||
|
# coming from tabularLinearize
|
||||||
|
|
||||||
|
def compare_tables(gf, wikt, fun):
|
||||||
report = {}
|
report = {}
|
||||||
for line in gf:
|
for pair in gf.items():
|
||||||
gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616])
|
gf_form = pair[1]
|
||||||
gf_tags = tuple(word for word in
|
gf_tags = tuple(word for word in
|
||||||
line.replace('(', ' ').replace(')', ' ').split()
|
pair[0].replace('(', ' ').replace(')', ' ').split()
|
||||||
if word in rgl_features)
|
if word in rgl_features)
|
||||||
if not gf_tags:
|
if not gf_tags:
|
||||||
continue
|
continue
|
||||||
@@ -211,7 +260,7 @@ def compare_tables(gf, wikt):
|
|||||||
report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
|
report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
|
||||||
report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
|
report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
|
||||||
ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items
|
ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items
|
||||||
report['fun'] = gf[0].split()[-1]
|
report['fun'] = fun
|
||||||
report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ])
|
report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ])
|
||||||
report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
|
report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
|
||||||
report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
|
report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
|
||||||
@@ -293,7 +342,7 @@ def forms_for_pos(obj):
|
|||||||
|
|
||||||
if 'lemma' in gf_entry and gf_entry['lemma']:
|
if 'lemma' in gf_entry and gf_entry['lemma']:
|
||||||
gf_entry['lemma'] = gf_entry['lemma'][0]
|
gf_entry['lemma'] = gf_entry['lemma'][0]
|
||||||
if obj['root']:
|
if obj['root'] and obj['root'][0].strip():
|
||||||
gf_entry['args']['root'] = obj['root']
|
gf_entry['args']['root'] = obj['root']
|
||||||
args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
|
args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
|
||||||
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
|
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
|
||||||
@@ -309,7 +358,8 @@ if MODE == 'gf-abs':
|
|||||||
if MODE == 'gf-cnc':
|
if MODE == 'gf-cnc':
|
||||||
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
||||||
|
|
||||||
if MODE not in ['raw', 'eval']:
|
|
||||||
|
if MODE.startswith('gf') or MODE=='json':
|
||||||
with open(FILTERED_WIKT) as file:
|
with open(FILTERED_WIKT) as file:
|
||||||
seen_gf_funs = {}
|
seen_gf_funs = {}
|
||||||
number = 1
|
number = 1
|
||||||
@@ -360,17 +410,52 @@ if MODE.startswith('gf'):
|
|||||||
print('}')
|
print('}')
|
||||||
|
|
||||||
|
|
||||||
if MODE.startswith('eval'):
|
def eval_all(gr, funmap, concrete=CONCRETE_MODULE):
|
||||||
with open('pot.gftbl') as file:
|
lang = gr.languages[CONCRETE_MODULE]
|
||||||
gf = [line.strip() for line in file]
|
funs = gr.functions
|
||||||
with open('pot.json') as file:
|
reports = []
|
||||||
wikt = wikt_forms_for_pos(json.loads(file.read()))
|
for fun in funs:
|
||||||
report = compare_tables(gf, wikt)
|
funn = "'" + fun + "'"
|
||||||
|
if funn not in funmap:
|
||||||
if MODE == 'eval-verbose':
|
print(funn, 'not found')
|
||||||
for line in report.items():
|
continue
|
||||||
print(line)
|
wikt = wikt_forms_for_pos(funmap[funn])
|
||||||
else:
|
gf = lang.tabularLinearize(pgf.Expr(fun, []))
|
||||||
print(report['fun'], 'forms', report['total_found'],
|
report = compare_tables(gf, wikt, fun)
|
||||||
'voc', report['total_voc'], 'unvoc', report['total_unvoc'])
|
reports.append(report)
|
||||||
|
return reports
|
||||||
|
|
||||||
|
|
||||||
|
def first_error(report):
|
||||||
|
for f, v in report.items():
|
||||||
|
if 'voc_match' in v:
|
||||||
|
if v['voc_match'] == 0:
|
||||||
|
return f, v
|
||||||
|
|
||||||
|
|
||||||
|
if MODE.startswith('eval'):
|
||||||
|
gr = pgf.readPGF(PGF_FILE)
|
||||||
|
print('using', PGF_FILE)
|
||||||
|
funmap = read_function_source_map()
|
||||||
|
print(len(funmap), 'functions')
|
||||||
|
for report in eval_all(gr, funmap):
|
||||||
|
|
||||||
|
if MODE == 'eval-verbose':
|
||||||
|
for line in report.items():
|
||||||
|
print(line)
|
||||||
|
else:
|
||||||
|
if report['total_found'] == 0:
|
||||||
|
verdict = 'NOT_FOUND'
|
||||||
|
elif report['total_found'] == report['total_voc']:
|
||||||
|
verdict = 'PERFECT'
|
||||||
|
elif report['total_found'] == report['total_unvoc']:
|
||||||
|
verdict = 'PERFECT_UNVOC ' + str(first_error(report))
|
||||||
|
elif report['total_voc'] == 0:
|
||||||
|
verdict = 'TOTALLY_WRONG ' + str(first_error(report))
|
||||||
|
else:
|
||||||
|
verdict = 'PARTIAL ' + str(first_error(report))
|
||||||
|
print(report['fun'], 'forms', report['total_found'],
|
||||||
|
'voc', report['total_voc'], 'unvoc', report['total_unvoc'],
|
||||||
|
verdict
|
||||||
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user