From 3c0adada11f9c3055dedb5a26ef4d483585f8a15 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Wed, 13 Sep 2023 15:29:28 +0200 Subject: [PATCH] new function in ParadigmsAra to deal with Wiktionary data; lots of untested guesses --- src/arabic/ParadigmsAra.gf | 67 ++++++++++++++++++++++++ src/arabic/wiktionary/read_wiktionary.py | 64 +++++++++++++++------- 2 files changed, 113 insertions(+), 18 deletions(-) diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index ce479f94..20892fed 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -868,4 +868,71 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of { param VerbForm = FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ; +-- paradigms for Wiktionary extraction +---- TODO: better usage of information in Wiktionary + +oper + wmkN = overload { + wmkN : {sg, pl : Str ; g : Gender} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str} -> N + = \r -> smartN r.sg ; + wmkN : {sg : Str ; g : Gender ; root : Str} -> N + = \r -> smartN r.sg ** {g = r.g} ; ---- + wmkN : {sg : Str; g : Gender} -> N + = \r -> smartN r.sg ** {g = r.g} ; + wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N + = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt + wmkN : {sg : Str; pl : Str} -> N + = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ; + wmkN : {sg : Str; root : Str} -> N + = \r -> smartN r.sg ; + } ; + + wmkA = overload { + wmkA : {root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str; masc_pl : Str} -> A + = \r -> mkA r.masc_sg ; ---- + mkA : {masc_sg : Str; root : Str} -> A + = \r -> mkA r.root ; + mkA : {masc_sg : Str} -> A + = \r -> mkA r.masc_sg ; ---- + } ; + + wmkV = overload { + wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; cls : VerbForm} -> V + = \r -> mkV r.perfect r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V + = \r -> mkV r.root r.cls ; ---- + wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V + = \r -> variants {} ; ---- mkV r.imperfect ; ---- + wmkV : {root : Str ; cls : VerbForm} -> V + = \r -> mkV r.root r.cls ; + wmkV : {imperfect : Str} -> V + = \r -> variants {} ; ---- mkV r.imperfect ; + } ; + } ; diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 49d3a3c1..ea8d805f 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -1,7 +1,22 @@ import gzip import json +import sys +# data from https://kaikki.org/dictionary/rawdata.html +# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, +# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. + +if not sys.argv[1:]: + print('usage: read_wiktionary (raw | gf-cnc | gf-abs)') + exit() + +MODE = sys.argv[1] # + +# step 1: extract data from this file using the raw option WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz' + +# the following file is generated. +# in the sequel, use this file with gf-abs or gf-cnc option FILTERED_WIKT = 'wikt_arabic.jsonl' @@ -14,11 +29,12 @@ def get_gzip_json(file, sample=100000, langs=[]): obj = json.loads(line) if obj.get('lang', None) in langs: print(line.decode("utf-8")) - print(n) +# print(n) +if MODE == 'raw': + get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) -# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic']) -# python3 read_wiktionary.py >wikt_arabic.jsonl +# python3 read_wiktionary.py raw >wikt_arabic.jsonl # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -80,6 +96,12 @@ def unvocalize(s): def is_arabic(s): return s and any(1574 <= ord(c) <= 1616 for c in s) +# quote forms but not parameters +def quote_if(s, cond=is_arabic): + if cond(s): + return '"' + s + '"' + else: + return s def gf_fun(s, pos, disamb=0): discrim = '_' + str(disamb) if disamb else '' @@ -99,8 +121,8 @@ def forms_for_pos(obj): if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] plural = [form[:-1] for form, descr in forms if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1] - gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories'] - else (['Masc'] if 'Arabic masculine nouns' in obj['categories'] + gender = (['fem'] if 'Arabic feminine nouns' in obj['categories'] + else (['masc'] if 'Arabic masculine nouns' in obj['categories'] else [])) gf_entry = { 'cat': 'N', @@ -122,15 +144,20 @@ def forms_for_pos(obj): 'perfect': lemma, 'imperfect': [form for form, descr in forms if all([w in descr for - w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1], - 'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', ''] - if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])], - key=len)] + w in [ + "active", "indicative", "masculine", "non-past", + "imperfective", "singular", "third-person"]])][:1], + 'cls': ['Form' + max([n for n in [ + 'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI',''] + if n in ' '.join([c for c in obj['categories'] + if c.endswith('verbs') and any([n in c for n in 'IVX'])])], + key=len)] # max in RGL is XI, in Wikt XIII } } elif obj['pos'] == 'adj': lemma = [form for form, descr in forms - if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] + if all([w in descr for w in [ + 'indefinite', 'masculine', 'singular', 'informal']])][:1] gf_entry = { 'cat': 'A', 'lemma': lemma, @@ -150,8 +177,9 @@ def forms_for_pos(obj): if 'lemma' in gf_entry and gf_entry['lemma']: gf_entry['lemma'] = gf_entry['lemma'][0] - gf_entry['args']['root'] = obj['root'] - args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x] + if obj['root']: + gf_entry['args']['root'] = obj['root'] + args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x] gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}' return gf_entry @@ -160,19 +188,19 @@ def forms_for_pos(obj): def find_root(s): return ''.join([c for c in s if is_arabic(c)]) -import sys -MODE = sys.argv[1] - if MODE == 'gf-abs': print('abstract MorphoDictAraAbs = Cat ** {') if MODE == 'gf-cnc': print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') - -with open(FILTERED_WIKT) as file: +if MODE != 'raw': + with open(FILTERED_WIKT) as file: seen_gf_funs = {} for line in file: - obj = json.loads(line) + try: + obj = json.loads(line) + except: + continue root = [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1]