1
0
forked from GitHub/gf-rgl
Files
gf-rgl/src/arabic/wiktionary/read_wiktionary.py
T

377 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import gzip
import json
import sys
import unicodedata
# data from https://kaikki.org/dictionary/rawdata.html
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
# Operating mode: the first command-line argument when the file is run as a
# script; it stays '' when the file is imported as a module.
MODE = ''
if __name__ == '__main__':
    if not sys.argv[1:]:
        print('usage: read_wiktionary (raw | json | gf-cnc | gf-abs | gf-map | eval | eval-verbose)')
        # sys.exit() instead of the bare exit(): the latter is installed by the
        # site module and is not available when Python runs without it.
        sys.exit()
    MODE = sys.argv[1]
# Step 1: extract the Arabic records from this dump using the raw option.
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
# The following file is generated (by redirecting the output of the raw option).
# In the sequel, use this file with the gf-abs or gf-cnc option.
FILTERED_WIKT = 'wikt_arabic.jsonl'
# Maps each successfully extracted GF function to its source record in Wiktionary;
# created with the gf-map option.
FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
def read_function_source_map():
    """Load the map from GF function names to Wiktionary records.

    Reads FUNCTION_SOURCE_MAP, a file with one JSON object per line, each
    holding the keys 'fun' and 'source'.

    Returns:
        dict mapping every 'fun' value to its 'source' value; when a function
        name occurs more than once, the last occurrence wins.
    """
    sourcemap = {}
    with open(FUNCTION_SOURCE_MAP, encoding='utf-8') as file:
        for line in file:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Files produced by the gf-map option may end with a bare '}'
                # line, so lines that do not parse as JSON are skipped, as
                # the reader of FILTERED_WIKT does.
                continue
            sourcemap[obj['fun']] = obj['source']
    return sourcemap
def get_gzip_json(file, sample=100000, langs=()):
    """Print sampled records of a gzipped JSON-lines file, filtered by language.

    Args:
        file: path of a gzip-compressed file with one JSON object per line.
        sample: only every sample-th line (counting from 1) is examined.
        langs: collection of accepted values of a record's 'lang' field;
            with the empty default nothing is printed.
    """
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                # The raw line keeps its trailing newline, so print() leaves
                # a blank line after every record.
                print(line.decode("utf-8"))
# Typical call: python3 read_wiktionary.py raw >wikt_arabic.jsonl
if MODE == 'raw':
    # sample=1 examines every line of the dump; only Arabic records are printed
    get_gzip_json(WIKTIONARY_DUMP, sample=1, langs=['Arabic'])
# Buckwalter transliteration, see
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# Keys are Unicode code points, values the ASCII symbols standing for them.
buckwalter_dict = {
    # letters
    0x621: "'",  # ء hamza
    0x622: '|',  # آ alef with madda
    0x623: '>',  # أ alef with hamza above
    0x624: '&',  # ؤ waw with hamza
    0x625: '<',  # إ alef with hamza below
    0x626: '}',  # ئ yeh with hamza
    0x627: 'A',  # ا alef
    0x628: 'b',  # ب beh
    0x629: 'p',  # ة teh marbuta
    0x62a: 't',  # ت teh
    0x62b: 'v',  # ث theh
    0x62c: 'j',  # ج jeem
    0x62d: 'H',  # ح hah
    0x62e: 'x',  # خ khah
    0x62f: 'd',  # د dal
    0x630: '*',  # ذ thal
    0x631: 'r',  # ر reh
    0x632: 'z',  # ز zain
    0x633: 's',  # س seen
    0x634: '$',  # ش sheen
    0x635: 'S',  # ص sad
    0x636: 'D',  # ض dad
    0x637: 'T',  # ط tah
    0x638: 'Z',  # ظ zah
    0x639: 'E',  # ع ain
    0x63a: 'g',  # غ ghain
    0x641: 'f',  # ف feh
    0x642: 'q',  # ق qaf
    0x643: 'k',  # ك kaf
    0x644: 'l',  # ل lam
    0x645: 'm',  # م meem
    0x646: 'n',  # ن noon
    0x647: 'h',  # ه heh
    0x648: 'w',  # و waw
    0x649: 'Y',  # ى alef maksura
    0x64a: 'y',  # ي yeh
    # diacritics
    0x64b: 'F',  # ً fathatan
    0x64c: 'N',  # ٌ dammatan
    0x64d: 'K',  # ٍ kasratan
    0x64e: 'a',  # َ fatha
    0x64f: 'u',  # ُ damma
    0x650: 'i',  # ِ kasra
    0x651: '~',  # ّ shadda
    0x652: 'o',  # ْ sukun
    0x670: '`',  # ٰ superscript alef
    0x671: '{',  # ٱ alef wasla
}
# Inverse table: Buckwalter symbol -> Arabic character.
buckwalter_dict_rev = {
    symbol: chr(codepoint) for codepoint, symbol in buckwalter_dict.items()
}
def to_buckwalter(s):
    """Transliterate *s* into Buckwalter symbols.

    Characters without an entry in buckwalter_dict are copied unchanged.
    """
    romanized = []
    for char in s:
        romanized.append(buckwalter_dict.get(ord(char), char))
    return ''.join(romanized)
def from_buckwalter(s):
    """Turn Buckwalter symbols in *s* back into Arabic characters.

    Characters without an entry in buckwalter_dict_rev are copied unchanged.
    """
    arabic = []
    for symbol in s:
        arabic.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(arabic)
def unvocalize(s):
    """Keep only the characters of *s* in the range U+0621..U+064A.

    This drops the diacritics from U+064B upwards, and also every character
    below U+0621 (Latin letters, digits, spaces, ...).
    """
    def in_letter_range(char):
        return 0x621 <= ord(char) <= 0x64a

    return ''.join(filter(in_letter_range, s))
def is_arabic(s):
    """Tell whether *s* contains at least one Arabic letter or diacritic.

    A character counts as Arabic when it lies in U+0621..U+0652, i.e. from
    hamza up to sukun. The range starts at U+0621 so that the hamza letters
    U+0621..U+0625 are recognized as well.

    Args:
        s: string to inspect; may be empty or None.

    Returns:
        True if any character of *s* is in the range, otherwise False.
    """
    return bool(s) and any(0x621 <= ord(c) <= 0x652 for c in s)
def normal(s):
    """Return *s* in Unicode normalization form NFD (canonical decomposition)."""
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed
# Wikt uses vowel+shadda, which is the order Unicode normalization produces.
# GF uses shadda+vowel, which is linguistically correct.
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every vowel+shadda pair in *s* as shadda+vowel.

    Only the short vowels fatha (U+064E), damma (U+064F) and kasra (U+0650)
    are handled. The replacement works on the Arabic characters directly, so
    all other characters, including ASCII ones, are left untouched.

    Args:
        s: string to rewrite.

    Returns:
        The rewritten string.
    """
    shadda = '\u0651'
    # The three replacements run one after the other, in this order.
    for vowel in ('\u064e', '\u064f', '\u0650'):
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
# Quote forms but not parameters.
def quote_if(s, cond=is_arabic):
    """Return *s* wrapped in double quotes when cond(s) is truthy, else *s* itself."""
    return '"' + s + '"' if cond(s) else s
def gf_fun(s, pos, disamb=0):
    """Build a single-quoted GF function name of the shape 's_pos'.

    A truthy *disamb* is inserted as an extra '_<disamb>' part between *s*
    and the category suffix.
    """
    suffix = ''
    if disamb:
        suffix = '_' + str(disamb)
    return "'" + s + suffix + "_" + pos + "'"
# Maps the RGL parameter names that occur in GF inflection tables to the
# tags used in the form descriptions of the Wiktionary data.
rgl_features = {
    # verb features
    'VPerf': 'perfective',
    'Act': 'active',
    'Pas': 'passive',
    'Per3': 'third-person',
    'Per2': 'second-person',
    'Per1': 'first-person',
    'Masc': 'masculine',
    'Fem': 'feminine',
    'Sing': 'singular',
    'Plur': 'plural',
    'Sg': 'singular',
    'Pl': 'plural',
    'Dl': 'dual',
    'VImpf': 'imperfective',
    'Ind': 'indicative',
    'Cnj': 'subjunctive',
    'Jus': 'jussive',
    'VImp': 'imperative',
    # noun features (nouns also use Sg, Pl, Dl)
    'Def': 'definite',
    'Indef': 'indefinite',
    'Nom': 'nominative',
    'Acc': 'accusative',
    'Gen': 'genitive',
    'Const': 'construct',
    # not mapped: Bare, Dat, Poss
    # adjective features (adjectives also use the noun features)
    'APosit': 'positive',
    'AComp': 'comparative',
}
# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
# coming from 'l -treebank -table'
def compare_tables(gf, wikt):
    """Compare the forms of a GF inflection table with Wiktionary forms.

    Args:
        gf: list of the lines of a GF table. On each line the last
            whitespace-separated token is taken as the form, and the tokens
            that are keys of rgl_features (parentheses ignored) as its tags.
            Lines without any such tag, including blank lines, are skipped.
        wikt: (form, tags) pairs such as wikt_forms_for_pos returns; it is
            iterated once per table line, so it must be re-iterable.

    Returns:
        dict with one entry per tag tuple, holding the GF form, the first
        Wiktionary form whose tags include all mapped tags (or None), their
        Buckwalter romanizations, the Wiktionary tags and, when a Wiktionary
        form was found, the 0/1 flags 'voc_match' and 'unvoc_match'. It also
        has the keys 'fun' (last token of the first line, None if the table
        is empty or its first line is blank), 'total_found', 'total_voc'
        and 'total_unvoc'.
    """
    report = {}
    for line in gf:
        gf_tags = tuple(word for word in
                        line.replace('(', ' ').replace(')', ' ').split()
                        if word in rgl_features)
        if not gf_tags:
            continue
        # A line with tags has at least one token, so this cannot fail.
        gf_form = line.split()[-1]
        wikt_tags = {rgl_features[tag] for tag in gf_tags}
        wikt_form = None
        wikt_descr = None
        for form, descr in wikt:
            if all(tag in descr for tag in wikt_tags):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
                break
        row = {
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'gf_form_rom': to_buckwalter(gf_form) if gf_form else None,
            'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None,
            'wikt_descr': wikt_descr,
        }
        if wikt_form:
            row['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            row['unvoc_match'] = int(
                normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
        report[gf_tags] = row
    # Snapshot the per-form rows before the summary keys are added below.
    rows = tuple(report.values())
    first_line = gf[0].split() if gf else []
    report['fun'] = first_line[-1] if first_line else None
    report['total_found'] = sum(1 for row in rows if row['wikt_form'] is not None)
    report['total_voc'] = sum(row.get('voc_match', 0) for row in rows)
    report['total_unvoc'] = sum(row.get('unvoc_match', 0) for row in rows)
    return report
def wikt_forms_for_pos(obj):
    """Collect the Arabic-script forms of a Wiktionary record.

    Entries of obj['forms'] tagged 'romanization' and entries whose form is
    not accepted by is_arabic are left out. When a form occurs several
    times, the tags of its last occurrence are kept.

    Returns:
        Items view of (form, tags) pairs.
    """
    table = {}
    for entry in obj.get('forms', []):
        tags = entry.get('tags', [])
        if 'romanization' in tags:
            continue
        if is_arabic(entry['form']):
            table[entry['form']] = tags
    return table.items()
def _first_form(forms, required, trim_last=False):
    """Return a one-element list with the first form carrying all *required* tags.

    The list is empty when no form matches. With *trim_last* the final
    character of the form is cut off.
    """
    for form, descr in forms:
        if all(tag in descr for tag in required):
            return [form[:-1] if trim_last else form]
    return []


def _verb_class(categories):
    """Return 'Form' plus the longest Roman numeral found in the verb categories.

    Only categories that end in 'verbs' and contain one of the letters I, V, X
    are searched; among numerals of equal length the one listed first wins.
    """
    numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', '']
    verb_categories = ' '.join(
        category for category in categories
        if category.endswith('verbs') and any(letter in category for letter in 'IVX'))
    # '' is a substring of every string, so the candidate list is never empty
    # and the result is plain 'Form' when no numeral occurs.
    candidates = [numeral for numeral in numerals if numeral in verb_categories]
    return 'Form' + max(candidates, key=len)  # max in RGL is XI, in Wikt XIII


def forms_for_pos(obj):
    """Build the GF lexicon entry for a Wiktionary record.

    Args:
        obj: Wiktionary record with the keys 'pos' and 'categories', plus
            'root' (a possibly empty list), which is read when a lemma form
            was found.

    Returns:
        For nouns, verbs and adjectives a dict with 'cat' (N, V or A),
        'lemma' and 'args'. When a lemma form was found, 'lemma' is that
        form as a string and 'lin' holds the linearization
        'wmk<cat> {...}'; otherwise 'lemma' is an empty list and there is
        no 'lin'. For every other part of speech the dict maps each form
        to its tags.
    """
    forms = wikt_forms_for_pos(obj)
    pos = obj['pos']
    if pos == 'noun':
        # the last character of the construct nominative forms is cut off
        singular = _first_form(forms, ['construct', 'nominative', 'singular'], trim_last=True)
        plural = _first_form(forms, ['construct', 'nominative', 'plural'], trim_last=True)
        categories = obj['categories']
        if 'Arabic feminine nouns' in categories:
            gender = ['fem']
        elif 'Arabic masculine nouns' in categories:
            gender = ['masc']
        else:
            gender = []
        gf_entry = {
            'cat': 'N',
            'lemma': singular,
            'args': {'sg': singular, 'pl': plural, 'g': gender},
        }
    elif pos == 'verb':
        perfect = _first_form(forms, [
            "active", "indicative", "masculine", "past",
            "perfective", "singular", "third-person"])
        imperfect = _first_form(forms, [
            "active", "indicative", "masculine", "non-past",
            "imperfective", "singular", "third-person"])
        gf_entry = {
            'cat': 'V',
            'lemma': perfect,
            'args': {
                'perfect': perfect,
                'imperfect': imperfect,
                'cls': [_verb_class(obj['categories'])],
            },
        }
    elif pos == 'adj':
        cells = {}
        for slot, gender_tag, number_tag in [
                ('masc_sg', 'masculine', 'singular'),
                ('masc_pl', 'masculine', 'plural'),
                ('fem_sg', 'feminine', 'singular'),
                ('fem_pl', 'feminine', 'plural')]:
            cells[slot] = _first_form(
                forms, ['indefinite', gender_tag, number_tag, 'informal'])
        gf_entry = {'cat': 'A', 'lemma': cells['masc_sg'], 'args': cells}
    else:
        gf_entry = dict(forms)
    lemma_forms = gf_entry.get('lemma')
    if lemma_forms:
        # Only entries with a lemma get a linearization; entries of other
        # parts of speech have neither 'args' nor 'cat'.
        gf_entry['lemma'] = lemma_forms[0]
        if obj['root']:
            gf_entry['args']['root'] = obj['root']
        # arguments with an empty value list are left out
        args = [name + ' = ' + quote_if(values[0])
                for name, values in gf_entry['args'].items() if values]
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
    return gf_entry
# "root": ["ش ر ح (š-r-ḥ)"]
def find_root(s):
    """Return the characters of *s* accepted by is_arabic, joined together.

    For the expansion "ش ر ح (š-r-ḥ)" this yields the three Arabic letters
    without the spaces and without the romanization.
    """
    return ''.join(filter(is_arabic, s))
# Header of the generated GF module (abstract or concrete syntax).
if MODE == 'gf-abs':
    print('abstract MorphoDictAraAbs = Cat ** {')
if MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
# Only the json and gf-* modes produce output from the filtered dump, so the
# file is not opened for any other mode, nor when this file is imported.
if MODE == 'json' or MODE.startswith('gf'):
    with open(FILTERED_WIKT, encoding='utf-8') as file:
        # (lemma, cat) -> how many functions with that name exist so far
        seen_gf_funs = {}
        number = 1
        for line in file:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # lines that are not JSON (e.g. blank lines) are skipped
                continue
            number += 1
            # the first ar-root etymology template, if any, gives the root
            root = [find_root(t['expansion']) for
                    t in obj.get('etymology_templates', []) if
                    t.get('name', None) == 'ar-root'][:1]
            obj['root'] = root
            if 'Arabic lemmas' not in obj.get('categories', []):
                continue
            entry = {
                'pos': obj['pos'],
                'forms': forms_for_pos(obj),
                'senses': [sense['glosses'] for sense in obj.get('senses', [])
                           if 'glosses' in sense]
            }
            if MODE == 'json':
                print(json.dumps(entry, ensure_ascii=False))
            if MODE.startswith('gf'):
                lemma = entry['forms'].get('lemma', None)
                if not lemma:
                    continue
                cat = entry['forms']['cat']
                lin = entry['forms']['lin']
                # repeated (lemma, cat) pairs get a numeric discriminator
                discrim = seen_gf_funs.get((lemma, cat), 0)
                fun = gf_fun(lemma, cat, discrim)
                if MODE == 'gf-abs':
                    print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                if MODE == 'gf-cnc':
                    print('lin', fun, '=', lin, ';')
                if MODE == 'gf-map':
                    mapitem = {'fun': fun, 'source': obj}
                    print(json.dumps(mapitem, ensure_ascii=False))
                seen_gf_funs[(lemma, cat)] = discrim + 1
# to do: rename duplicate function names: of 13762 names, 12946 are unique
# The closing brace belongs to the GF modules only; the gf-map output consists
# of JSON lines, where a trailing '}' line would not be valid JSON.
if MODE in ('gf-abs', 'gf-cnc'):
    print('}')
# Evaluation: compare the GF table in pot.gftbl with the Wiktionary record in
# pot.json. Both files hold Arabic text, so they are read as UTF-8 regardless
# of the locale's default encoding.
if MODE.startswith('eval'):
    with open('pot.gftbl', encoding='utf-8') as file:
        gf = [line.strip() for line in file]
    with open('pot.json', encoding='utf-8') as file:
        wikt = wikt_forms_for_pos(json.load(file))
    report = compare_tables(gf, wikt)
    if MODE == 'eval-verbose':
        # one line per report item: the per-form rows, then the summary keys
        for item in report.items():
            print(item)
    else:
        print(report['fun'], 'forms', report['total_found'],
              'voc', report['total_voc'], 'unvoc', report['total_unvoc'])