forked from GitHub/gf-rgl
377 lines
12 KiB
Python
377 lines
12 KiB
Python
import gzip
|
||
import json
|
||
import sys
|
||
import unicodedata
|
||
|
||
# data from https://kaikki.org/dictionary/rawdata.html
|
||
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
|
||
|
||
# Operating mode, taken from the first command-line argument when this file is
# run as a script. It stays '' when the module is only imported.
MODE = ''

if __name__ == '__main__':
    if not sys.argv[1:]:
        # Usage goes to stderr: stdout is typically redirected into a data file
        # (e.g. `python3 read_wiktionary.py raw >wikt_arabic.jsonl`).
        print('usage: read_wiktionary (raw | json | gf-cnc | gf-abs | gf-map | eval | eval-verbose)',
              file=sys.stderr)
        # sys.exit instead of the interactive-only `exit()` helper from `site`.
        sys.exit(1)
    MODE = sys.argv[1]
|
||
|
||
|
||
# Step 1: the compressed raw dump; run with the `raw` option to extract the
# Arabic records from it.
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'

# Generated by step 1. Later steps (options gf-abs and gf-cnc) read this file.
FILTERED_WIKT = 'wikt_arabic.jsonl'

# Created with option gf-map: one record per successfully extracted GF
# function, pointing back to its source record in Wiktionary.
FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl'
|
||
|
||
|
||
def read_function_source_map(path=None):
    """Load the map from GF function names to their Wiktionary source records.

    Every usable line of the file is a JSON object with a ``fun`` key (the GF
    function name) and a ``source`` key (the record it was extracted from).

    Args:
        path: JSONL file to read; defaults to ``FUNCTION_SOURCE_MAP``.

    Returns:
        A dict mapping each ``fun`` value to its ``source`` value.
    """
    if path is None:
        path = FUNCTION_SOURCE_MAP
    sourcemap = {}
    with open(path, encoding='utf-8') as file:
        for line in file:
            try:
                obj = json.loads(line)
            except ValueError:
                # Not a JSON record (blank line, stray closing-brace line):
                # skip it, as the reader of the filtered dump does.
                continue
            sourcemap[obj['fun']] = obj['source']
    return sourcemap
|
||
|
||
|
||
def get_gzip_json(file, sample=100000, langs=()):
    """Print selected lines of a gzip-compressed JSONL file to stdout.

    Args:
        file: path of the gzip file holding one JSON object per line.
        sample: only lines whose 1-based number is a multiple of this value
            are examined; pass 1 to examine every line.
        langs: collection of accepted values for the record's ``lang`` field.
            The default is empty, so nothing is printed unless it is given.

    A selected line is printed as decoded text. It still carries its own
    newline, so ``print`` leaves an empty line after each record.
    """
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                print(line.decode("utf-8"))
|
||
|
||
# Mode `raw`: examine every line of the dump and print the Arabic records, e.g.
#   python3 read_wiktionary.py raw >wikt_arabic.jsonl
if MODE == 'raw':
    get_gzip_json(WIKTIONARY_DUMP, sample=1, langs=['Arabic'])
|
||
|
||
# Buckwalter transliteration table: Arabic code point -> ASCII symbol.
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
buckwalter_dict = {
    # hamza and its carriers
    0x621: "'",  # ء
    0x622: '|',  # آ
    0x623: '>',  # أ
    0x624: '&',  # ؤ
    0x625: '<',  # إ
    0x626: '}',  # ئ
    # letters
    0x627: 'A',  # ا
    0x628: 'b',  # ب
    0x629: 'p',  # ة
    0x62a: 't',  # ت
    0x62b: 'v',  # ث
    0x62c: 'j',  # ج
    0x62d: 'H',  # ح
    0x62e: 'x',  # خ
    0x62f: 'd',  # د
    0x630: '*',  # ذ
    0x631: 'r',  # ر
    0x632: 'z',  # ز
    0x633: 's',  # س
    0x634: '$',  # ش
    0x635: 'S',  # ص
    0x636: 'D',  # ض
    0x637: 'T',  # ط
    0x638: 'Z',  # ظ
    0x639: 'E',  # ع
    0x63a: 'g',  # غ
    0x641: 'f',  # ف
    0x642: 'q',  # ق
    0x643: 'k',  # ك
    0x644: 'l',  # ل
    0x645: 'm',  # م
    0x646: 'n',  # ن
    0x647: 'h',  # ه
    0x648: 'w',  # و
    0x649: 'Y',  # ى
    0x64a: 'y',  # ي
    # diacritics
    0x64b: 'F',  # fathatan
    0x64c: 'N',  # dammatan
    0x64d: 'K',  # kasratan
    0x64e: 'a',  # fatha
    0x64f: 'u',  # damma
    0x650: 'i',  # kasra
    0x651: '~',  # shadda
    0x652: 'o',  # sukun
    0x670: '`',  # superscript alef
    0x671: '{',  # ٱ
}

# Inverse table: ASCII symbol -> Arabic character.
buckwalter_dict_rev = {
    symbol: chr(codepoint) for codepoint, symbol in buckwalter_dict.items()
}
|
||
|
||
|
||
def to_buckwalter(s):
    """Transliterate ``s`` to Buckwalter symbols.

    Characters without an entry in ``buckwalter_dict`` are copied unchanged.
    """
    pieces = []
    for ch in s:
        pieces.append(buckwalter_dict.get(ord(ch), ch))
    return ''.join(pieces)
|
||
|
||
|
||
def from_buckwalter(s):
    """Map Buckwalter symbols in ``s`` back to Arabic characters.

    Characters without an entry in ``buckwalter_dict_rev`` are copied unchanged.
    """
    pieces = []
    for symbol in s:
        pieces.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(pieces)
|
||
|
||
|
||
def unvocalize(s):
    """Keep only the characters of ``s`` in the range U+0621..U+064A.

    Everything outside that range is removed, including the diacritics that
    follow it in the code chart, spaces and Latin text.
    """
    def in_letter_range(ch):
        return 0x621 <= ord(ch) <= 0x64a

    return ''.join(filter(in_letter_range, s))
|
||
|
||
|
||
def is_arabic(s):
    """Tell whether ``s`` contains at least one Arabic letter or diacritic.

    Accepted code points run from U+0621 (hamza) to U+0652 (sukun). The
    bounds start at hamza so that the hamza letters (ء آ أ ؤ إ) count as
    Arabic; with a narrower range they are dropped when a string is filtered
    character by character.

    Args:
        s: a string, or a falsy value such as ``None`` or ``''``.

    Returns:
        ``True`` or ``False``; falsy input gives ``False``.
    """
    return bool(s) and any(0x621 <= ord(c) <= 0x652 for c in s)
|
||
|
||
def normal(s):
    """Return ``s`` in Unicode canonical decomposition form (NFD)."""
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed
|
||
|
||
|
||
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every vowel+shadda pair in ``s`` as shadda+vowel.

    The swap is done directly on the Arabic code points, so any other
    character (Latin letters, punctuation, digits) passes through untouched.
    The three short vowels are handled first, in the order fatha, damma,
    kasra, followed by the three tanwin marks.

    Args:
        s: the string to rewrite.

    Returns:
        A copy of ``s`` with each listed mark moved after a directly
        following shadda.
    """
    shadda = '\u0651'
    marks = (
        '\u064e',  # fatha
        '\u064f',  # damma
        '\u0650',  # kasra
        '\u064b',  # fathatan
        '\u064c',  # dammatan
        '\u064d',  # kasratan
    )
    for mark in marks:
        s = s.replace(mark + shadda, shadda + mark)
    return s
|
||
|
||
|
||
# Word forms get double quotes; parameter values are left bare.
def quote_if(s, cond=is_arabic):
    """Return ``s`` wrapped in double quotes when ``cond(s)`` is truthy, else ``s`` unchanged."""
    return '"' + s + '"' if cond(s) else s
|
||
|
||
def gf_fun(s, pos, disamb=0):
    """Build a single-quoted GF function name from a lemma and a category.

    The result is ``'<s>_<pos>'``. A truthy ``disamb`` is inserted as an
    extra ``_<disamb>`` part between the lemma and the category.
    """
    if disamb:
        suffix = '_' + str(disamb)
    else:
        suffix = ''
    return f"'{s}{suffix}_{pos}'"
|
||
|
||
|
||
# Maps a parameter name found in GF tables to the matching Wiktionary tag.
rgl_features = {
    # V: aspect and voice
    'VPerf': 'perfective',
    'Act': 'active',
    'Pas': 'passive',
    # V: person
    'Per3': 'third-person',
    'Per2': 'second-person',
    'Per1': 'first-person',
    # V: gender and number
    'Masc': 'masculine',
    'Fem': 'feminine',
    'Sing': 'singular',
    'Plur': 'plural',
    'Sg': 'singular',
    'Pl': 'plural',
    'Dl': 'dual',
    # V: imperfective moods and imperative
    'VImpf': 'imperfective',
    'Ind': 'indicative',
    'Cnj': 'subjunctive',
    'Jus': 'jussive',
    'VImp': 'imperative',
    # N: also Sg, Pl, Dl
    'Def': 'definite',
    'Indef': 'indefinite',
    'Nom': 'nominative',
    'Acc': 'accusative',
    'Gen': 'genitive',
    # 'Bare':
    # 'Dat':
    'Const': 'construct',
    # 'Poss':
    # A: also N features
    'APosit': 'positive',
    'AComp': 'comparative',
}
|
||
|
||
|
||
# format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ
# coming from 'l -treebank -table'
def compare_tables(gf, wikt):
    """Compare the forms of a GF inflection table with Wiktionary forms.

    Args:
        gf: lines of the GF table. The last word of a line is taken as the
            form; the words listed in ``rgl_features`` are its tags. Blank
            lines and lines without any known tag are skipped.
        wikt: iterable of ``(form, tags)`` pairs from Wiktionary.

    Returns:
        A dict with one entry per tagged GF line, keyed by the tuple of GF
        tags. Each entry holds both forms, their Buckwalter romanizations,
        the matching Wiktionary tag list and, when a Wiktionary form was
        found, the 0/1 flags ``voc_match`` and ``unvoc_match``. The dict also
        has the summary keys ``fun`` (last word of the first non-blank line,
        or ``None`` if there is none), ``total_found``, ``total_voc`` and
        ``total_unvoc``.
    """
    # Materialize once: the pairs are scanned again for every GF line.
    wikt = list(wikt)
    report = {}
    fun = None
    for line in gf:
        words = line.split()
        if not words:
            # A blank line has no form to compare.
            continue
        if fun is None:
            fun = words[-1]
        gf_form = words[-1]
        gf_tags = tuple(
            word for word in line.replace('(', ' ').replace(')', ' ').split()
            if word in rgl_features)
        if not gf_tags:
            continue
        wikt_tags = {rgl_features[tag] for tag in gf_tags}
        wikt_form = None
        wikt_descr = None
        # Take the first Wiktionary form that carries all the wanted tags.
        for form, descr in wikt:
            if all(tag in descr for tag in wikt_tags):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
                break
        row = {
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'gf_form_rom': to_buckwalter(gf_form) if gf_form else None,
            'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None,
            'wikt_descr': wikt_descr
        }
        if wikt_form:
            row['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            row['unvoc_match'] = int(
                normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
        report[gf_tags] = row

    # Snapshot the per-form rows before the summary keys join the same dict.
    rows = tuple(report.values())
    report['fun'] = fun
    report['total_found'] = len([v for v in rows if v['wikt_form'] is not None])
    report['total_voc'] = sum(v.get('voc_match', 0) for v in rows)
    report['total_unvoc'] = sum(v.get('unvoc_match', 0) for v in rows)
    return report
|
||
|
||
|
||
|
||
def wikt_forms_for_pos(obj):
    """Collect the Arabic-script forms of a Wiktionary record with their tags.

    Entries of ``obj['forms']`` tagged ``romanization`` and entries whose form
    is rejected by ``is_arabic`` are left out. If a form occurs more than
    once, the tags of its last occurrence are kept.

    Returns:
        The ``items()`` view of a dict mapping form to tag list.
    """
    tags_by_form = {}
    for entry in obj.get('forms', []):
        tags = entry.get('tags', [])
        if 'romanization' in tags:
            continue
        word = entry['form']
        if is_arabic(word):
            tags_by_form[word] = tags
    return tags_by_form.items()
|
||
|
||
|
||
def _first_matching_form(forms, wanted, trim=False):
    """Return a list with the first form tagged with all of ``wanted``, or [].

    With ``trim`` set, the last character of the form is cut off.
    """
    for form, descr in forms:
        if all(tag in descr for tag in wanted):
            return [form[:-1]] if trim else [form]
    return []


def _verb_class(categories):
    """Derive the verb class name ('Form' + numeral) from category names."""
    numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', '']
    relevant = ' '.join(
        c for c in categories
        if c.endswith('verbs') and any(digit in c for digit in 'IVX'))
    # The longest numeral occurring in the category text wins; '' always
    # occurs, so the result is plain 'Form' when no numeral is found.
    # max in RGL is XI, in Wikt XIII
    # NOTE(review): matching is by substring, so a category naming form XII
    # or XIII resolves to II / III here -- confirm whether such verbs occur.
    found = [n for n in numerals if n in relevant]
    return 'Form' + max(found, key=len)


def forms_for_pos(obj):
    """Build the GF lexicon entry for one Wiktionary record.

    For nouns, verbs and adjectives the result has the keys ``cat``,
    ``lemma`` and ``args``. When a lemma form was found, ``lemma`` becomes a
    string and a ``lin`` key holds the GF linearization. For any other part
    of speech the result is the plain form-to-tags dict.
    """
    forms = wikt_forms_for_pos(obj)
    pos = obj['pos']

    if pos == 'noun':
        singular = _first_matching_form(
            forms, ['construct', 'nominative', 'singular'], trim=True)
        plural = _first_matching_form(
            forms, ['construct', 'nominative', 'plural'], trim=True)
        categories = obj['categories']
        if 'Arabic feminine nouns' in categories:
            gender = ['fem']
        elif 'Arabic masculine nouns' in categories:
            gender = ['masc']
        else:
            gender = []
        gf_entry = {
            'cat': 'N',
            'lemma': singular,
            'args': {'sg': singular, 'pl': plural, 'g': gender},
        }
    elif pos == 'verb':
        perfect = _first_matching_form(
            forms, ["active", "indicative", "masculine", "past",
                    "perfective", "singular", "third-person"])
        imperfect = _first_matching_form(
            forms, ["active", "indicative", "masculine", "non-past",
                    "imperfective", "singular", "third-person"])
        gf_entry = {
            'cat': 'V',
            'lemma': perfect,
            'args': {
                'perfect': perfect,
                'imperfect': imperfect,
                'cls': [_verb_class(obj['categories'])],
            },
        }
    elif pos == 'adj':
        masc_sg = _first_matching_form(
            forms, ['indefinite', 'masculine', 'singular', 'informal'])
        gf_entry = {
            'cat': 'A',
            'lemma': masc_sg,
            'args': {
                'masc_sg': masc_sg,
                'masc_pl': _first_matching_form(
                    forms, ['indefinite', 'masculine', 'plural', 'informal']),
                'fem_sg': _first_matching_form(
                    forms, ['indefinite', 'feminine', 'singular', 'informal']),
                'fem_pl': _first_matching_form(
                    forms, ['indefinite', 'feminine', 'plural', 'informal']),
            },
        }
    else:
        gf_entry = dict(forms)

    if gf_entry.get('lemma'):
        # Unwrap the one-element list now that a lemma is known to exist.
        gf_entry['lemma'] = gf_entry['lemma'][0]
        if obj['root']:
            gf_entry['args']['root'] = obj['root']
        # Empty argument lists are omitted from the record.
        fields = [
            name + ' = ' + quote_if(values[0])
            for name, values in gf_entry['args'].items() if values
        ]
        gf_entry['lin'] = f"wmk{gf_entry['cat']} {{{' ; '.join(fields)}}}"

    return gf_entry
|
||
|
||
# "root": ["ش ر ح (š-r-ḥ)"]
|
||
def find_root(s):
    """Return the characters of ``s`` that ``is_arabic`` accepts, joined together.

    Everything else (spaces, the parenthesized romanization) is dropped.
    """
    return ''.join(filter(is_arabic, s))
|
||
|
||
# Opening line of the generated GF module: abstract or concrete syntax.
if MODE == 'gf-abs':
    print('abstract MorphoDictAraAbs = Cat ** {')
elif MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
||
|
||
# Read the filtered records and print JSON entries or GF source lines.
# Only the `json` and `gf-*` modes consume the records; restricting the test
# to them keeps an import of this module, or an `eval-verbose` run, from
# parsing the whole file.
if MODE == 'json' or MODE.startswith('gf'):
    with open(FILTERED_WIKT, encoding='utf-8') as file:
        # (lemma, cat) -> how many functions with that name were seen so far
        seen_gf_funs = {}
        number = 1
        for line in file:
            try:
                obj = json.loads(line)
            except ValueError:
                # Blank or otherwise non-JSON line: skip it.
                continue
            number += 1
            # First `ar-root` etymology template, reduced to its Arabic letters.
            root = [find_root(t['expansion']) for
                    t in obj.get('etymology_templates', []) if
                    t.get('name', None) == 'ar-root'][:1]
            obj['root'] = root
            if 'Arabic lemmas' not in obj.get('categories', []):
                continue

            entry = {
                'pos': obj['pos'],
                'forms': forms_for_pos(obj),
                'senses': [sense['glosses'] for sense in obj.get('senses', [])
                           if 'glosses' in sense]
            }

            if MODE == 'json':
                print(json.dumps(entry, ensure_ascii=False))
                continue

            # The remaining modes are gf-*; they need an entry with a lemma.
            lemma = entry['forms'].get('lemma', None)
            if not lemma:
                continue
            cat = entry['forms']['cat']
            lin = entry['forms']['lin']
            # Repeated (lemma, cat) pairs get a running number in the name.
            discrim = seen_gf_funs.get((lemma, cat), 0)
            fun = gf_fun(lemma, cat, discrim)

            if MODE == 'gf-abs':
                print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
            if MODE == 'gf-cnc':
                print('lin', fun, '=', lin, ';')
            if MODE == 'gf-map':
                mapitem = {'fun': fun, 'source': obj}
                print(json.dumps(mapitem, ensure_ascii=False))

            seen_gf_funs[(lemma, cat)] = discrim + 1
|
||
|
||
# to do: rename duplicate function names: of 13762 names, 12946 are unique

# Close the GF module. Only gf-abs and gf-cnc print a module header; gf-map
# prints JSON lines, which a trailing brace would corrupt.
if MODE in ('gf-abs', 'gf-cnc'):
    print('}')
|
||
|
||
|
||
# Modes `eval` and `eval-verbose`: compare the GF table in pot.gftbl with the
# Wiktionary record in pot.json.
if MODE.startswith('eval'):
    # Both files hold Arabic text; read them as UTF-8 whatever the locale is.
    with open('pot.gftbl', encoding='utf-8') as file:
        gf = [line.strip() for line in file]
    with open('pot.json', encoding='utf-8') as file:
        wikt = wikt_forms_for_pos(json.load(file))
    report = compare_tables(gf, wikt)

    if MODE == 'eval-verbose':
        # One line per report item: the per-form rows, then the totals.
        for line in report.items():
            print(line)
    else:
        print(report['fun'], 'forms', report['total_found'],
              'voc', report['total_voc'], 'unvoc', report['total_unvoc'])
|
||
|