From d5e6e7e38987ab98da7fa33b90428e446a730414 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Thu, 14 Sep 2023 12:21:48 +0200 Subject: [PATCH] Arabic Wiktionary: functions for normalization and evaluation --- src/arabic/wiktionary/read_wiktionary.py | 88 ++++++++++++++++++++---- 1 file changed, 75 insertions(+), 13 deletions(-) diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 574233dd..bcf902b7 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -1,17 +1,21 @@ import gzip import json import sys +import unicodedata # data from https://kaikki.org/dictionary/rawdata.html # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, # Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. -if not sys.argv[1:]: - print('usage: read_wiktionary (raw | gf-cnc | gf-abs)') - exit() +MODE = '' -MODE = sys.argv[1] # +if __name__ == '__main__': + if not sys.argv[1:]: + print('usage: read_wiktionary (raw | gf-cnc | gf-abs | gf-map | eval | eval-verbose)') + exit() + MODE = sys.argv[1] # + # step 1: extract data from this file using the raw option WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' @@ -19,6 +23,18 @@ WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' # in the sequel, use this file with gf-abs or gf-cnc option FILTERED_WIKT = 'wikt_arabic.jsonl' +# map each successfully extracted GF function to its source record in Wiktionary +# created with option gf-map +FUNCTION_SOURCE_MAP = 'function_sources_arabic.jsonl' + + +def read_function_source_map(): + with open(FUNCTION_SOURCE_MAP) as file: + sourcemap = {} + for line in file: + obj = json.loads(line) + sourcemap[obj['fun']] = obj['source'] + def get_gzip_json(file, sample=100000, langs=[]): with gzip.open(file) as decompressed: @@ -86,16 +102,37 @@ buckwalter_dict = { 0x671: '{' # ٱ } +buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()} + + def to_buckwalter(s): - return ''.join([buckwalter_dict.get(ord(c), '?') for c in s]) + return ''.join([buckwalter_dict.get(ord(c), c) for c in s]) + + +def from_buckwalter(s): + return ''.join([buckwalter_dict_rev.get(c, c) for c in s]) def unvocalize(s): return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a]) + def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) +def normal(s): + return unicodedata.normalize('NFD', s) + + +# Wikt uses vowel+shadda which is a Unicode normalization +# GF uses shadda+vowel which is linguistically correct +# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra +# unicodedata.normalize does this wrong, as noted by Ariel Gutman +## todo: more direct implementation +def reorder_shadda(s): + return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i')) + + # quote forms but not parameters def quote_if(s, cond=is_arabic): if cond(s): @@ -115,8 +152,11 @@ rgl_features = { 'Pas': 'passive', 'Per3': 'third-person', 'Per2': 'second-person', + 'Per1': 'first-person', 'Masc': 'masculine', 'Fem': 'feminine', + 'Sing': 'singular', + 'Plur': 'plural', 'Sg': 'singular', 'Pl': 'plural', 'Dl': 'dual', @@ -142,26 +182,39 @@ rgl_features = { # format of GF table: MorphoDictAra: s (VPerf Act (Per3 Masc Sg)) : أَجْرََ +# coming from 'l -treebank -table' def compare_tables(gf, wikt): report = {} for line in gf: - gf_form = line #''.join([c for c in line if 1574 <= ord(c) <= 1616]) + gf_form = line.split()[-1] # ''.join([c for c in line if 1574 <= ord(c) <= 1616]) gf_tags = tuple(word for word in line.replace('(', ' ').replace(')', ' ').split() if word in rgl_features) + if not gf_tags: + continue wikt_tags = {rgl_features[tag] for tag in gf_tags} wikt_form = None + wikt_descr = None for form, descr in wikt: if all([tag in descr for tag in wikt_tags]): - wikt_form = form + wikt_form = reorder_shadda(form) + wikt_descr = descr break report[gf_tags] = { 'gf_form': gf_form, - 'wikt_form': wikt_form + 'wikt_form': wikt_form, + 'gf_form_rom': to_buckwalter(gf_form) if gf_form else None, + 'wikt_form_rom': to_buckwalter(wikt_form) if wikt_form else None, + 'wikt_descr': wikt_descr } if wikt_form: - report[gf_tags]['voc_match'] = int(gf_form == wikt_form) - report[gf_tags]['unvoc_match'] = int(unvocalize(gf_form) == unvocalize(wikt_form)) + report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form)) + report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form))) + ritems = tuple(report.items()) # need an unmutable structure, because otherwise ints are added to items + report['fun'] = gf[0].split()[-1] + report['total_found'] = len([f for f, v in ritems if v['wikt_form'] is not None ]) + report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems]) + report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems]) return report @@ -295,6 +348,9 @@ if MODE not in ['raw', 'eval']: print('fun', fun, ':', cat, ';', '--', number, entry['senses']) if MODE == 'gf-cnc': print('lin', fun, '=', lin, ';') + if MODE == 'gf-map': + mapitem = {'fun': fun, 'source': obj} + print(json.dumps(mapitem, ensure_ascii=False)) seen_gf_funs[(lemma, cat)] = discrim + 1 @@ -304,11 +360,17 @@ if MODE.startswith('gf'): print('}') -if MODE == 'eval': +if MODE.startswith('eval'): with open('pot.gftbl') as file: gf = [line.strip() for line in file] with open('pot.json') as file: wikt = wikt_forms_for_pos(json.loads(file.read())) - for line in compare_tables(gf, wikt).items(): - print(line) + report = compare_tables(gf, wikt) + + if MODE == 'eval-verbose': + for line in report.items(): + print(line) + else: + print(report['fun'], 'forms', report['total_found'], + 'voc', report['total_voc'], 'unvoc', report['total_unvoc'])