Files
gf-rgl/src/arabic/wiktionary/read_wiktionary.py
T
2023-09-21 17:29:38 +02:00

582 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import gzip
import json
import sys
import unicodedata
import pgf
# data from https://kaikki.org/dictionary/rawdata.html
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
"""
This file converts Wiktionary data to GF morphological dictionary files.
It works for Arabic, but some of the functionality could be adapted to other languages.
The steps to take are the following:
fetch data:
raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
filter Arabic entries:
$ python3 read_wiktionary.py raw >wikt_arabic.jsonl
create GF files:
$ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
$ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
automatic evaluation:
$ gf -make MorphoDictAra.gf
$ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
$ python3 read_wiktionary.py eval
TODO:
- better generation of GF
- better paradigms to use Wiktionary data
- refactor the code so that it can be used for other languages
"""
# Operating mode, taken from the first command-line argument.
# It stays '' when the file is imported as a module, so none of the
# mode-guarded script sections further down run on import.
MODE = ''
if __name__ == '__main__':
    if not sys.argv[1:]:
        # list every mode that the script actually dispatches on
        print('usage: read_wiktionary '
              '(raw | json | gf-abs | gf-cnc | gf-map | '
              'eval | eval-funs | eval-verbose | error-analysis)')
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is absent when Python runs with -S
        sys.exit()
    MODE = sys.argv[1]
# step 1: extract Arabic data from this file using the raw option
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
# value of the 'lang' field that the raw option keeps
EXTRACTED_LANGUAGE = 'Arabic'
# the following file is generated (redirect the output of the raw option into it).
# in the sequel, use this file with gf-abs or gf-cnc option
# (the gf-map and json options read it as well)
FILTERED_WIKT = 'wikt_arabic.jsonl'
# map each successfully extracted GF function to its source record in Wiktionary
# created with option gf-map
FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
# created with $ gf -make MorphoDictAra.gf
PGF_FILE = 'MorphoDictAraAbs.pgf'
# module to linearize with
CONCRETE_MODULE = 'MorphoDictAra'
# concrete syntax file, to debug sources of linearizations
CONCRETE_FILE = CONCRETE_MODULE + '.gf'
# evaluation result file, created with mode eval-funs
# (redirect its output here); read back by mode error-analysis
EVAL_FILE = 'eval.jsonl'
# read a gzipped jsonl file (one object per line),
# showing lines where one of a list of languages is present
# This can be sampled to one of 100k lines by default, 1 for total recall.
def get_gzip_json(file, sample=100000, langs=()):
    """Print the sampled lines of a gzipped JSON-lines file for some languages.

    file:   path of the gzip-compressed file, one JSON object per line
    sample: only every sample-th line is parsed and considered
            (1 means every line)
    langs:  collection of accepted values of the object's 'lang' field;
            with the empty default nothing is printed

    Matching lines are written to stdout undecoded from JSON, i.e. as they
    appear in the file.  Each line keeps its own trailing newline and print
    adds another one, so records are separated by an empty line.
    """
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                print(line.decode("utf-8"))
# to perform the first step of data extraction, pipe this into a file:
# python3 read_wiktionary.py raw >wikt_arabic.jsonl
if MODE == 'raw':
    # sample=1: look at every line of the dump
    get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE])
    # sys.exit rather than the site-provided exit(), which is meant for
    # interactive sessions and is absent when Python runs with -S
    sys.exit()
# error analysis: count how often each (category, labels, verdict)
# combination occurs in the evaluation file and print the counts sorted
if MODE == 'error-analysis':
    verdict_counts = {}
    with open(EVAL_FILE) as eval_rows:
        for raw_row in eval_rows:
            record = json.loads(raw_row)
            labels = record.get('labels', None)
            if not labels:
                # rows without labels are not counted
                continue
            # the category is the last character of the function name
            key = (record['fun'][-1], labels, record['verdict'])
            verdict_counts[key] = verdict_counts.get(key, 0) + 1
    for key in sorted(verdict_counts):
        print(key, verdict_counts[key])
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# maps the code point of an Arabic character to its one-character
# Buckwalter symbol; used by to_buckwalter and, reversed, by from_buckwalter
buckwalter_dict = {
0x621: "'", # ء
0x622: '|', # آ
0x623: '>', # أ
0x624: '&', # ؤ
0x625: '<', # إ
0x626: '}', # ئ
0x627: 'A', # ا
0x628: 'b', # ب
0x629: 'p', # ة
0x62a: 't', # ت
0x62b: 'v', # ث
0x62c: 'j', # ج
0x62d: 'H', # ح
0x62e: 'x', # خ
0x62f: 'd', # د
0x630: '*', # ذ
0x631: 'r', # ر
0x632: 'z', # ز
0x633: 's', # س
0x634: '$', # ش
0x635: 'S', # ص
0x636: 'D', # ض
0x637: 'T', # ط
0x638: 'Z', # ظ
0x639: 'E', # ع
0x63a: 'g', # غ
0x641: 'f', # ف
0x642: 'q', # ق
0x643: 'k', # ك
0x644: 'l', # ل
0x645: 'm', # م
0x646: 'n', # ن
0x647: 'h', # ه
0x648: 'w', # و
0x649: 'Y', # ى
0x64a: 'y', # ي
0x64b: 'F', # ً
0x64c: 'N', # ٌ
0x64d: 'K', # ٍ
0x64e: 'a', # َ
0x64f: 'u', # ُ
0x650: 'i', # ِ
0x651: '~', # ّ
0x652: 'o', # ْ
0x670: '`', # superscript alef
0x671: '{' # ٱ
}
# reverse table: Buckwalter symbol -> Arabic character
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
# the marks transliterated F, N, K, a, u, i above (no shadda, no sukun)
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
# NOTE(review): the range is 0x628..0x647 inclusive, so besides the ordinary
# consonants it also contains 0x629 (ta marbuta) and the code points
# 0x63b..0x640 that have no entry in buckwalter_dict - confirm this is intended
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
def to_buckwalter(s):
    """Transliterate the string s to Buckwalter symbols.

    Characters without an entry in buckwalter_dict are kept as they are.
    """
    # buckwalter_dict is keyed by code point, which is exactly the kind of
    # table str.translate expects; unmapped characters pass through
    return s.translate(buckwalter_dict)
def from_buckwalter(s):
    """Turn Buckwalter symbols in s back into Arabic characters.

    Characters that are not Buckwalter symbols are kept as they are.
    """
    arabic = []
    for symbol in s:
        arabic.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(arabic)
def unvocalize(s):
    """Keep only the characters of s in the range U+0621..U+064A.

    This removes the vowel marks, shadda and sukun (all above U+064A), and
    also every character below U+0621, such as spaces and Latin letters.
    """
    kept = (c for c in s if not (ord(c) < 0x621 or ord(c) > 0x64a))
    return ''.join(kept)
def drop_final_vowel(s):
    """Return s without its last character if that is one of arabic_vowels.

    Any other string, including the empty string, is returned unchanged.
    """
    # s[-1:] is '' for an empty string (never a member of arabic_vowels),
    # so empty input no longer raises IndexError as s[-1] did
    return s[:-1] if s[-1:] in arabic_vowels else s
def is_arabic(s):
    """Return True if s contains at least one Arabic character.

    Arabic here means a code point from U+0621 (hamza) to U+0650 (kasra).
    The lower bound used to be U+0626, which left out the hamza letters
    U+0621..U+0625 although the Buckwalter table and unvocalize both start
    at U+0621; find_root, which tests single characters, therefore dropped
    hamza radicals from roots.

    Empty or None input gives False.
    """
    return bool(s) and any(0x621 <= ord(c) <= 0x650 for c in s)
def normal(s):
    """Return the canonical decomposition (Unicode form NFD) of s."""
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed
# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the members of sound_consonants found in s, if exactly three.

    The characters are returned as one string, in the order in which they
    occur in s.  Any other count gives None.
    """
    radicals = list(filter(lambda ch: ch in sound_consonants, s))
    if len(radicals) != 3:
        return None
    return ''.join(radicals)
# reverse engineer fcl pattern from a given form, with a sound trigram root
# one more condition: each of the root letters occurs exactly once
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Return s with its three radicals replaced by the letters f, c, l.

    The radicals are those found by get_sound_trigram_root.  The first one
    is replaced by U+0641 (fa), the second by U+0639 (ain) and the third by
    U+0644 (lam); all other characters of s are kept.

    Returns None when s has no sound three-letter root or when a root
    letter occurs more than once in s.

    The uniqueness condition used to be written as
    len([c in s for c in root]) == 3, which is true for every
    three-letter root, so it was never enforced.
    """
    root = get_sound_trigram_root(s)
    if not root:
        return None
    if any(s.count(radical) != 1 for radical in root):
        return None
    pattern = list(s)
    placeholders = (chr(0x641), chr(0x639), chr(0x644))
    for radical, placeholder in zip(root, placeholders):
        # every radical occurs exactly once, so index finds its one position
        pattern[s.index(radical)] = placeholder
    return ''.join(pattern)
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Return s with every short vowel + shadda pair swapped to shadda + vowel.

    The vowels handled are fatha (U+064E), damma (U+064F) and kasra
    (U+0650), in that order; shadda is U+0651.

    The swap is done directly on the Arabic characters.  The previous round
    trip through Buckwalter transliteration also turned any literal
    Buckwalter symbol in s (for instance a Latin letter, an apostrophe or
    '|') into an Arabic character on the way back.
    """
    shadda = chr(0x651)
    for vowel in (chr(0x64e), chr(0x64f), chr(0x650)):
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return change(s) in double quotes if cond(s) holds, else s itself."""
    if not cond(s):
        return s
    return ''.join(['"', change(s), '"'])
# generate word_d_C functions starting with d=0, but show d only when >= 1
def gf_fun(s, pos, disamb=0):
    """Build the single-quoted GF function name 'word_C' or 'word_d_C'.

    s is the word, pos the category, and disamb the discriminator d, which
    is left out of the name when it is 0.
    """
    suffix = f'_{disamb}' if disamb else ''
    return f"'{s}{suffix}_{pos}'"
# mapping from GF to Wikt features
# keys are parameter names as they occur in the labels of a GF
# linearization table, values are the corresponding Wiktionary tags;
# compare_tables ignores every GF parameter that is not a key here
arabic_rgl_features = {
# V
'VPerf': 'perfective',
'Act': 'active',
'Pas': 'passive',
'Per3': 'third-person',
'Per2': 'second-person',
'Per1': 'first-person',
'Masc': 'masculine',
'Fem': 'feminine',
'Sing': 'singular',
'Plur': 'plural',
'Sg': 'singular',
'Pl': 'plural',
'Dl': 'dual',
'VImpf': 'imperfective',
'Ind': 'indicative',
'Cnj': 'subjunctive',
'Jus': 'jussive',
'VImp': 'imperative',
# N: also Sg, Pl, Dl
'Def': 'definite',
'Indef': 'indefinite',
'Nom': 'nominative',
'Acc': 'accusative',
'Gen': 'genitive',
# 'Bare':
# 'Dat':
'Const': 'construct'
# 'Poss':
#A: also N features; degree features cannot be found
# 'APosit': 'positive',
# 'AComp': 'comparative'
}
# the inflection forms in a wiktionary entry
def wikt_forms_from_obj(obj):
    """Collect the Arabic inflection forms of the Wiktionary entry obj.

    Returns a dict from each form (after reorder_shadda) to its list of
    tags.  Forms tagged 'romanization' and forms for which is_arabic fails
    are left out.  If an 'ar-root' etymology template yields a non-blank
    root, it is added under the key 'root' as a plain string.
    """
    forms = {}
    for form in obj.get('forms', []):
        tags = form.get('tags', [])
        if 'romanization' in tags:
            continue
        if not is_arabic(form['form']):
            continue
        # the same form seen again replaces the tags stored earlier
        forms[reorder_shadda(form['form'])] = tags
    # the root (three radicals) is found in this place if at all
    roots = [find_root(template['expansion'])
             for template in obj.get('etymology_templates', [])
             if template.get('name', None) == 'ar-root']
    if roots:
        # only the first ar-root template is used
        root = roots[0].strip()
        if root:
            forms['root'] = root
    return forms
# selection of forms for a given POS from Wikt: noun, adj, or verb
# return a linearization function
def forms_for_pos(obj):
    """Build the GF entry for the Wiktionary entry obj.

    For the parts of speech 'noun', 'verb' and 'adj' the result is a dict
    with the keys 'cat' (N, V or A), 'lemma' and 'args'.  Each argument is
    a list holding at most one form, namely the first form whose tags
    include all the required ones.  When a lemma was found, 'lemma' becomes
    a plain string and the keys 'lin' (the GF linearization, a call of
    wmkN / wmkV / wmkA on a record of the non-empty arguments) and 'labels'
    (the comma-separated names of those arguments) are added.  When no
    lemma was found, 'lemma' stays an empty list and 'lin' and 'labels'
    are absent.

    For any other part of speech the result is just the form -> tags
    mapping of wikt_forms_from_obj.

    Reads obj['pos'] and, for nouns and verbs, obj['categories'].
    """
    dforms = wikt_forms_from_obj(obj)
    # note: dforms may contain the extra key 'root' whose value is a string,
    # so for that item "w in descr" below is a substring test
    forms = dforms.items()
    if obj['pos'] == 'noun':
        # nouns are taken in their construct nominative forms,
        # with a final vowel mark removed
        lemma = [drop_final_vowel(form) for form, descr in forms
                 if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
        plural = [drop_final_vowel(form) for form, descr in forms
                  if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
        # gender comes from the entry's categories; empty if neither is present
        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                  else (['masc'] if 'Arabic masculine nouns' in obj['categories']
                  else []))
        gf_entry = {
            'cat': 'N',
            'lemma': lemma,
            'args': {
                'sg': lemma,
                'pl': plural,
                'g': gender
                }
            }
    elif obj['pos'] == 'verb':
        # the lemma of a verb is its third person masculine singular perfect
        lemma = [form for form, descr in forms
                 if all([w in descr for
                         w in ["active", "indicative", "masculine", "past",
                               "perfective", "singular", "third-person"]])][:1]
        gf_entry = {
            'cat': 'V',
            'lemma': lemma,
            'args': {
                'perfect': lemma,
                'imperfect': [form for form, descr in forms
                              if all([w in descr for
                                      w in [
                                          "active", "indicative", "masculine", "non-past",
                                          "imperfective", "singular", "third-person"]])][:1],
                # verb class: join the categories that end in 'verbs' and
                # contain one of the letters I, V, X, then take the longest
                # listed numeral that is a substring of the joined text.
                # The empty string always qualifies, so with no numeral
                # found the value is just 'Form'.
                'cls': ['Form' + max([n for n in [
                    'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
                        if n in ' '.join([c for c in obj['categories']
                                          if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
                                     key=len)] # max in RGL is XI, in Wikt XIII
                }
            }
    elif obj['pos'] == 'adj':
        # adjective forms are the ones tagged indefinite and informal
        lemma = [form for form, descr in forms
                 if all([w in descr for w in [
                     'indefinite', 'masculine', 'singular', 'informal']])][:1]
        gf_entry = {
            'cat': 'A',
            'lemma': lemma,
            'args': {
                'masc_sg': lemma,
                'masc_pl': [form for form, descr in forms
                            if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
                'fem_sg': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
                'fem_pl': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                }
            }
        # add the fcl patterns of the masculine forms when they can be derived
        for patt in ['masc_sg', 'masc_pl']:
            if patt in gf_entry['args']:
                if form := gf_entry['args'][patt]:
                    if spatt := get_sound_fcl_pattern(form[0]):
                        gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt
    else:
        # other parts of speech: no GF entry, only the raw forms
        gf_entry = {f: d for f, d in forms}
    # NOTE(review): the source was available without indentation; everything
    # down to 'labels' is taken to belong to this condition, because the
    # else branch above produces no 'args' or 'cat' key - confirm
    if 'lemma' in gf_entry and gf_entry['lemma']:
        # from here on the lemma is a string, no longer a one-element list
        gf_entry['lemma'] = gf_entry['lemma'][0]
        # prefer the root given by Wiktionary, else guess it from the lemma
        if 'root' in dforms:
            gf_entry['args']['root'] = [dforms['root']]
        elif root := get_sound_trigram_root(gf_entry['lemma']):
            gf_entry['args']['root'] = [root]
        # only non-empty arguments, sorted by name; Arabic values get quoted
        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
        gf_entry['labels'] = ','.join([r for r, v in args])
    return gf_entry
# "root": ["ش ر ح (š-r-ḥ)"]
def find_root(s):
    """Return only those characters of s that is_arabic accepts, in order."""
    return ''.join(filter(is_arabic, s))
# GF code generation
# start with the header of the desired GF module
if MODE == 'gf-abs':
    print('abstract MorphoDictAraAbs = Cat ** {')
if MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {')

# go through the Arabic Wiktionary entries
# generate functions with unique names
if MODE.startswith('gf') or MODE == 'json':
    with open(FILTERED_WIKT) as file:
        seen_gf_funs = {}  # to disambiguate names if needed
        number = 1  # running number of the parsed records, shown in gf-abs comments
        for line in file:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # skip lines that are not JSON, such as the empty lines
                # between the records written by the raw option; a bare
                # except here would also have swallowed KeyboardInterrupt
                continue
            number += 1
            # only take entries that are marked as lemmas
            if 'Arabic lemmas' in obj.get('categories', []):
                entry = {
                    'pos': obj['pos'],
                    'forms': forms_for_pos(obj),
                    'all_forms': wikt_forms_from_obj(obj),
                    'senses': [sense['glosses'] for sense in obj.get('senses', [])
                               if 'glosses' in sense]
                    }
                # if you only want to see the Wikt information used GF generation
                if MODE == 'json':
                    print(json.dumps(entry, ensure_ascii=False))
                # if you want to proceed to GF generation
                if MODE.startswith('gf'):
                    lemma = entry['forms'].get('lemma', None)
                    if lemma:
                        cat = entry['forms']['cat']
                        lin = entry['forms']['lin']
                        labels = entry['forms']['labels']
                        # if you find the same word_C again, mark it word_1_C
                        discrim = seen_gf_funs.get((lemma, cat), 0)
                        fun = gf_fun(lemma, cat, discrim)
                        # abstract syntax, save in MorphoDictAraAbs.gf
                        if MODE == 'gf-abs':
                            print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                        # concrete syntax, save in MorphoDictAra.gf
                        elif MODE == 'gf-cnc':
                            print('lin', fun, '=', lin, ';')
                        # function-source map, save in source_of_MorphoDictAra.jsonl
                        elif MODE == 'gf-map':
                            # a copy of the forms already collected for this
                            # entry, instead of extracting them once more
                            source = dict(entry['all_forms'])
                            source['gf_labels'] = labels
                            mapitem = {'fun': fun, 'source': source}
                            print(json.dumps(mapitem, ensure_ascii=False))
                        seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number

# terminate the GF file with a closing brace
if MODE in ['gf-abs', 'gf-cnc']:
    print('}')
# evaluation:
# linearize all words to tables
# compare them to the forms found in Wiktionary
# report on matches
# format of GF table:
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
# coming from pgf tabularLinearize
# compare the table for one function, returning a report as a dict
def compare_tables(gf, wikt, fun, show_buckwalter=True):
    """Compare the GF linearization table gf with the Wiktionary forms wikt.

    gf maps parameter descriptions to word forms.  wikt maps forms to their
    tags and must also contain the key 'gf_labels'.  fun is the function
    name, copied into the report.

    The report has one row per tuple of GF parameters that are known to
    arabic_rgl_features, and the string keys 'fun', 'labels', 'total_found',
    'total_voc' and 'total_unvoc'.  With show_buckwalter, each row also
    carries Buckwalter transliterations of both forms.
    """
    report = {}
    for gf_params, gf_form in gf.items():
        words = gf_params.replace('(', ' ').replace(')', ' ').split()
        gf_tags = tuple(w for w in words if w in arabic_rgl_features)
        if not gf_tags:
            continue  # if gf_tags match no Wikt tags, do not include this form
        wikt_tags = {arabic_rgl_features[t] for t in gf_tags}
        # the first Wiktionary form whose description has all the tags
        wikt_form, wikt_descr = None, None
        for form, descr in wikt.items():
            if all(tag in descr for tag in wikt_tags):
                wikt_form, wikt_descr = reorder_shadda(form), descr
                break
        row = {
            'gf_params': gf_params,  # full param description
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'wikt_descr': wikt_descr
            }
        if show_buckwalter:
            row['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
            row['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
        if wikt_form:
            row['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            row['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
        # keyed by the flat param description with only Wikt-relevant tags
        report[gf_tags] = row
    # snapshot of the rows, taken before the summary keys are added
    rows = tuple(report.values())
    report['fun'] = fun
    report['labels'] = wikt['gf_labels']
    report['total_found'] = sum(1 for v in rows if v['wikt_form'] is not None)
    report['total_voc'] = sum(v.get('voc_match', 0) for v in rows)
    report['total_unvoc'] = sum(v.get('unvoc_match', 0) for v in rows)
    return report
# with a given grammar and function, prepare input for compare_tables
# and produce a report, possibly summarizing it
def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
    """Evaluate the linearizations of one GF function against Wiktionary.

    gr is the grammar, lang the concrete syntax to linearize with, fun the
    function name and wikt its Wiktionary forms (the input of
    compare_tables).

    Returns None, after printing a message, if fun is not in the grammar.
    With verbose, returns the full report of compare_tables.  Otherwise
    returns a summary dict with the keys 'fun', 'forms', 'voc', 'unvoc',
    'verdict' and 'labels', plus 'first_error' (the first row whose
    vocalized forms differ) when the verdict is not flawless.
    """
    if fun not in gr.functions:
        print(fun, 'not found in grammar')
        return None
    gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items()
          if p.startswith('s ')}  # require the s field, exclude s2
    report = compare_tables(gf, wikt, fun)
    if verbose:
        return report

    found = report['total_found']
    voc = report['total_voc']
    unvoc = report['total_unvoc']
    if found == 0:
        verdict = 'NOT_FOUND'
    elif found == voc:
        verdict = 'PERFECT'
    elif found == unvoc:
        verdict = 'PERFECT_UNVOC'
    elif voc == 0:
        verdict = 'TOTALLY_WRONG'
    else:
        verdict = 'PARTIAL'
    flaws = verdict not in ('NOT_FOUND', 'PERFECT')

    summary = {
        'fun': report['fun'],
        'forms': found,
        'voc': voc,
        'unvoc': unvoc,
        'verdict': verdict,
        'labels': report['labels']
        }
    if flaws:
        for v in report.values():
            # besides the rows, the report holds plain strings and ints
            # ('fun', 'labels', totals); calling .get on those would raise
            # if no mismatching row came first
            if isinstance(v, dict) and v.get('voc_match', 1) == 0:
                summary['first_error'] = v
                break
    return summary
def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
    """Evaluate every function listed in mapfile and print the results.

    pgffile is the compiled grammar, concretename the concrete syntax to
    linearize with, and mapfile a JSON-lines file whose records have the
    keys 'fun' (a quoted function name) and 'source' (its Wiktionary
    forms).

    With show, one JSON line is printed per evaluated function.  A final
    JSON line always gives the number of functions per category and
    verdict; with verbose the reports carry no verdict, so those counts
    stay empty.
    """
    gr = pgf.readPGF(pgffile)
    concrete = gr.languages[concretename]
    totals = {'A': {}, 'N': {}, 'V': {}}
    with open(mapfile) as file:
        for line in file:
            obj = json.loads(line)
            fun = obj['fun'][1:-1]  # strip the surrounding quotes
            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
            if report is None:
                # function missing from the grammar (already reported);
                # "'verdict' in None" below would raise TypeError
                continue
            cat = fun[-1]  # the category is the last character of the name
            if 'verdict' in report:
                rep = report['verdict']
                by_cat = totals.setdefault(cat, {})
                by_cat[rep] = by_cat.get(rep, 0) + 1
            if show:
                # verbose reports are keyed by tuples of tags, which
                # json.dumps rejects; print those keys space-separated
                printable = {(' '.join(k) if isinstance(k, tuple) else k): v
                             for k, v in report.items()}
                print(json.dumps(printable, ensure_ascii=False))
    print(json.dumps(totals, ensure_ascii=False))
# run the evaluation for every mode that starts with 'eval':
# eval-verbose prints the full report of each function,
# eval-funs prints a summary of each function,
# any other such mode prints only the totals
if MODE.startswith('eval'):
    verbose = (MODE == 'eval-verbose')
    show = (MODE == 'eval-funs') or verbose
    eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show=show, verbose=verbose)