From 7e383b746e81544dfeee9ae776fa84ac07e3c4f9 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Thu, 21 Sep 2023 15:46:41 +0200 Subject: [PATCH] moved wikt-specific paradigms to a separate file (for the moment) --- src/arabic/ParadigmsAra.gf | 54 ++++++++++++------------ src/arabic/wiktionary/Makefile | 2 +- src/arabic/wiktionary/read_wiktionary.py | 21 +++++++-- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/arabic/ParadigmsAra.gf b/src/arabic/ParadigmsAra.gf index 3d1623e14..80506ebbb 100644 --- a/src/arabic/ParadigmsAra.gf +++ b/src/arabic/ParadigmsAra.gf @@ -868,6 +868,8 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of { param VerbForm = FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ; + +{- temporarily moved to wiktionary/MoreAra.gf -- paradigms for Wiktionary extraction ---- TODO: better usage of information in Wiktionary @@ -894,55 +896,55 @@ oper wmkA = overload { wmkA : {root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A = \r -> mkA r.root r.sg_patt r.pl_patt ; - mkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A = \r -> mkA r.root r.sg_patt r.pl_patt ; - mkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A + wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, root, sg_patt : Str} -> A + wmkA : {masc_sg, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A + wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, root : Str} -> A + wmkA : {masc_sg, fem_sg, root : Str} -> A = \r -> mkA r.root ; ---- - mkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A + wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A = \r -> mkA r.root r.sg_patt ; - mkA : {masc_sg : Str; fem_sg : Str} -> A + wmkA : {masc_sg : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A = \r -> mkA r.sg_patt r.pl_patt ; - mkA : {masc_sg : Str; masc_pl : Str} -> A + wmkA : {masc_sg : Str; masc_pl : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A + wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A = \r -> mkA r.masc_sg ; ---- - mkA : {masc_sg : Str; root : Str} -> A + wmkA : {masc_sg : Str; root : Str} -> A = \r -> mkA r.root ; - mkA : {masc_sg : Str} -> A + wmkA : {masc_sg : Str} -> A = \r -> mkA r.masc_sg ; ---- } ; @@ -960,5 +962,5 @@ oper wmkV : {imperfect : Str} -> V = \r -> variants {} ; ---- mkV r.imperfect ; } ; - +-} } ; diff --git a/src/arabic/wiktionary/Makefile b/src/arabic/wiktionary/Makefile index 80e1da791..58fcf2b6d 100644 --- a/src/arabic/wiktionary/Makefile +++ b/src/arabic/wiktionary/Makefile @@ -4,4 +4,4 @@ all: python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl gf -make MorphoDictAra.gf python3 read_wiktionary.py eval-funs >1-eval.txt - python3 to_wordnet.py >wornet-arabic.jsonl + python3 to_wordnet.py >wordnet-arabic.jsonl diff --git a/src/arabic/wiktionary/read_wiktionary.py b/src/arabic/wiktionary/read_wiktionary.py index 960a592d3..69099294e 100644 --- a/src/arabic/wiktionary/read_wiktionary.py +++ b/src/arabic/wiktionary/read_wiktionary.py @@ -71,6 +71,10 @@ CONCRETE_MODULE = 'MorphoDictAra' # concrete syntax file, to debug sources of linearizations CONCRETE_FILE = CONCRETE_MODULE + '.gf' + +# evaluation result file, created with mode eval-funs +EVAL_FILE = 'eval.jsonl' + # read a gzipped jsonl file (one object per line), # showing lines where one of a list of languages is present @@ -93,6 +97,17 @@ if MODE == 'raw': get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE]) exit() + +if MODE == 'error-analysis': + evals = {} + with open(EVAL_FILE) as file: + for line in file: + row = json.loads(line) + if labels := row.get('labels', None): + verdict = row['verdict'] + evals[(labels, verdict)] = evals.get((labels, verdict), 0) + 1 + for labverdict, n in sorted(list(evals.items())): + print(labverdict, n) # https://en.wikipedia.org/wiki/Buckwalter_transliteration buckwalter_dict = { @@ -378,7 +393,7 @@ def find_root(s): if MODE == 'gf-abs': print('abstract MorphoDictAraAbs = Cat ** {') if MODE == 'gf-cnc': - print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {') + print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {') # go through the Arabic Wiktionary entries # generate functions with unique names @@ -552,9 +567,9 @@ def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False): totals[cat][rep] = totals[cat].get(rep, 0) + 1 if show: - print(report) + print(json.dumps(report, ensure_ascii=False)) - print(totals) + print(json.dumps(totals, ensure_ascii=False)) if MODE.startswith('eval'):