Merge branch 'master' of github.com:GrammaticalFramework/gf-rgl

2026-05-27 08:58:55 -06:00 · 2023-09-26 10:47:39 +02:00
parent fdd9b98601 1c355ce9dd
commit 79643d8604
7 changed files with 873 additions and 2 deletions
--- a/src/arabic/MorphoAra.gf
+++ b/src/arabic/MorphoAra.gf
@@ -153,7 +153,8 @@ oper
        w + "ف" + x + "ع" + y + "ل" + z
          => { h = w ; m1 = x; m2 = y; t = z} ;
        w + "ف" + x + ("ع"|"ل") + y
-          => { h = w ; m1 = x; m2 = ""; t = y}
+          => { h = w ; m1 = x; m2 = ""; t = y} ;
 	_ => Predef.error("cannot get FCL pattern from" ++ pat)
      } ;
  --opers to interdigitize (make words out of roots and patterns:
@@ -204,7 +205,8 @@ oper
                    => mkAssimilated pat (mkRoot3 rS) ;
            ? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
            _=> error rS ---- AR error "expected 3--6"
-        }
+        } ;
     _ => Predef.error("cannot get FCL pattern from" ++ pS)
    };
 -----------------------------------------------------------------------------
--- a/src/arabic/ParadigmsAra.gf
+++ b/src/arabic/ParadigmsAra.gf
@@ -868,4 +868,99 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
 param VerbForm =
  FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
 {- temporarily moved to wiktionary/MoreAra.gf
 -- paradigms for Wiktionary extraction
 ---- TODO: better usage of information in Wiktionary
 oper
  wmkN = overload {
    wmkN : {sg, pl : Str ; g : Gender} -> N
      = \r -> mkN r.sg r.pl r.g nohum ;  --- hum/nohum not in Wikt
    wmkN : {sg : Str} -> N
      = \r -> smartN r.sg ; 
    wmkN : {sg : Str ; g : Gender ; root : Str} -> N
      = \r -> smartN r.sg ** {g = r.g} ; ----
    wmkN : {sg : Str; g : Gender} -> N
      = \r -> smartN r.sg ** {g = r.g} ;
    wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
      = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
    wmkN : {sg : Str; pl : Str} -> N
      = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
    wmkN : {sg, pl : Str ; root : Str} -> N
      = \r -> mkN r.sg r.pl masc nohum ;  ---- 
    wmkN : {sg : Str; root : Str} -> N 
      = \r -> smartN r.sg ;
    } ;
  wmkA = overload {
    wmkA : {root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt r.pl_patt ;
    wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt r.pl_patt ;
    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, root : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg : Str; fem_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
      = \r -> mkA r.sg_patt r.pl_patt ;
    wmkA : {masc_sg : Str; masc_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    } ;
  wmkV = overload {
    wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
      = \r -> mkV r.root r.cls ; ----
    wmkV : {perfect : Str; cls : VerbForm} -> V
      = \r -> mkV r.perfect r.cls ; ----
    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
      = \r -> mkV r.root r.cls ; ----
    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
      = \r -> mkV r.perfect r.cls ; ----
    wmkV : {root : Str ; cls : VerbForm} -> V
      = \r -> mkV r.root r.cls ;
    wmkV : {imperfect : Str} -> V
      = \r -> variants {} ; ---- mkV r.imperfect ;
    } ;
 -}
 } ;
--- a/src/arabic/wiktionary/Makefile
+++ b/src/arabic/wiktionary/Makefile
@@ -0,0 +1,8 @@
 # Regenerate the GF morphological dictionary from wikt_arabic.jsonl, compile
 # it, evaluate it against Wiktionary and link it to the WordNet function list.
 # 'all' names a pipeline, not a file: declare it phony so that a stray file
 # called 'all' can never make the target look up to date.
 .PHONY: all
 all:
 	python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
 	python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
 	python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
 	gf -make MorphoDictAra.gf
 	python3 read_wiktionary.py eval-funs >eval.jsonl
 	python3 to_wordnet.py >wordnet-arabic.jsonl
 	python3 read_wiktionary.py error-analysis
--- a/src/arabic/wiktionary/MoreAra.gf
+++ b/src/arabic/wiktionary/MoreAra.gf
@@ -0,0 +1,98 @@
 resource MoreAra = CatAra ** open ParadigmsAra in {
 -- temporarily moved from ParadigmsAra
 -- paradigms for Wiktionary extraction
 ---- TODO: better usage of information in Wiktionary
 oper
  -- wmkN: build a noun from a record of fields extracted from Wiktionary.
  -- There is one overload per combination of fields found in the data.
  -- With both sg and pl the noun is built with mkN, always as nohum;
  -- with sg only it is built with smartN. A given gender g is used as is,
  -- otherwise masc is passed to mkN. The root field is accepted but unused.
  wmkN = overload {
    wmkN : {sg, pl : Str ; g : Gender} -> N
      = \r -> mkN r.sg r.pl r.g nohum ;  --- hum/nohum not in Wikt
    wmkN : {sg : Str} -> N
      = \r -> smartN r.sg ; 
    wmkN : {sg : Str ; g : Gender ; root : Str} -> N
      = \r -> smartN r.sg ** {g = r.g} ; ----
    wmkN : {sg : Str; g : Gender} -> N
      = \r -> smartN r.sg ** {g = r.g} ;
    wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
      = \r -> mkN r.sg r.pl r.g nohum ;   --- hum/nohum not in Wikt
    wmkN : {sg : Str; pl : Str} -> N
      = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
    wmkN : {sg, pl : Str ; root : Str} -> N
      = \r -> mkN r.sg r.pl masc nohum ;  ---- 
    wmkN : {sg : Str; root : Str} -> N 
      = \r -> smartN r.sg ;
    } ;
  -- wmkA: build an adjective from a record of fields extracted from Wiktionary.
  -- There is one overload per combination of fields found in the data.
  -- Whenever a root is present it is the first argument of mkA, followed by
  -- the singular pattern and the plural pattern when both are available.
  -- Without a root the masculine singular form is passed to mkA instead.
  -- The remaining inflected forms are accepted but not used yet (marked ----).
  wmkA = overload {
    wmkA : {root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt r.pl_patt ;
    wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt r.pl_patt ;
    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, root : Str} -> A
      = \r -> mkA r.root ; ----
    wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
      = \r -> mkA r.root r.sg_patt ;
    wmkA : {masc_sg : Str; fem_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
      = \r -> mkA r.root ;
    -- root first, as in the other overloads that have both patterns;
    -- previously the singular pattern was passed in the root position
    wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
      = \r -> mkA r.root r.sg_patt r.pl_patt ;
    wmkA : {masc_sg : Str; masc_pl : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    wmkA : {masc_sg : Str; root : Str} -> A
      = \r -> mkA r.root ;
    wmkA : {masc_sg : Str} -> A
      = \r -> mkA r.masc_sg ; ----
    } ;
  -- wmkV: build a verb from a record of fields extracted from Wiktionary.
  -- The verb class cls is always passed to mkV; its first argument is the
  -- root when the record has one, otherwise the perfect form.
  -- The imperfect form is accepted but unused; a record with only an
  -- imperfect form yields the empty variants {}.
  wmkV = overload {
    wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
      = \r -> mkV r.root r.cls ; ----
    wmkV : {perfect : Str; cls : VerbForm} -> V
      = \r -> mkV r.perfect r.cls ; ----
    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
      = \r -> mkV r.root r.cls ; ----
    wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
      = \r -> mkV r.perfect r.cls ; ----
    wmkV : {root : Str ; cls : VerbForm} -> V
      = \r -> mkV r.root r.cls ;
    wmkV : {imperfect : Str} -> V
      = \r -> variants {} ; ---- mkV r.imperfect ;
    } ;
 }
--- a/src/arabic/wiktionary/arabic_utilities.py
+++ b/src/arabic/wiktionary/arabic_utilities.py
@@ -0,0 +1,169 @@
 # utilities for Arabic script
 # in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
 # as specified by the command line argument:
 #
 #   % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
 #   % diff MorphoDictAra.gf b.tmp 
 #   % 
 def is_arabic(s):
    """Return True if s contains at least one Arabic letter or diacritic.

    The accepted range is U+0621 (hamza) to U+0652 (sukun). The previous
    range started at U+0626, so the hamza letters U+0621..U+0625 were not
    recognised, and character-wise filters built on this predicate
    silently dropped them. Empty or None input gives False.
    """
    return any(0x621 <= ord(c) <= 0x652 for c in (s or ''))
 def get_arabic(s):
    """Return the characters of s that is_arabic accepts, in their original order."""
    return ''.join(filter(is_arabic, s))
 def unvocalize(s):
    """Keep only the characters of s in U+0621..U+064A; drop everything else.

    The diacritics handled in this file start at U+064B, so they are removed,
    and so are all non-Arabic characters such as quotes, '_' and Latin letters.
    """
    kept = []
    for ch in s:
        # single characters compare by code point
        if '\u0621' <= ch <= '\u064a':
            kept.append(ch)
    return ''.join(kept)
 # Buckwalter transliteration table: Unicode code point -> one ASCII character
 # https://en.wikipedia.org/wiki/Buckwalter_transliteration
 buckwalter_dict = {
  0x621: "'",  # ء
  0x622: '|',  # آ
  0x623: '>',  # أ
  0x624: '&',  # ؤ
  0x625: '<',  # إ
  0x626: '}',  # ئ
  0x627: 'A',  # ا
  0x628: 'b',  # ب
  0x629: 'p',  # ة
  0x62a: 't',  # ت
  0x62b: 'v',  # ث
  0x62c: 'j',  # ج
  0x62d: 'H',  # ح
  0x62e: 'x',  # خ
  0x62f: 'd',  # د
  0x630: '*',  # ذ
  0x631: 'r',  # ر
  0x632: 'z',  # ز
  0x633: 's',  # س
  0x634: '$',  # ش
  0x635: 'S',  # ص
  0x636: 'D',  # ض
  0x637: 'T',  # ط
  0x638: 'Z',  # ظ
  0x639: 'E',  # ع
  0x63a: 'g',  # غ
  0x641: 'f',  # ف
  0x642: 'q',  # ق
  0x643: 'k',  # ك
  0x644: 'l',  # ل
  0x645: 'm',  # م
  0x646: 'n',  # ن
  0x647: 'h',  # ه
  0x648: 'w',  # و
  0x649: 'Y',  # ى
  0x64a: 'y',  # ي
  0x64b: 'F',  # fathatan
  0x64c: 'N',  # dammatan
  0x64d: 'K',  # kasratan
  0x64e: 'a',  # fatha
  0x64f: 'u',  # damma
  0x650: 'i',  # kasra
  0x651: '~',  # shadda
  0x652: 'o',  # sukun
  0x670: '`',  # superscript (dagger) alif
  0x671: '{'   # ٱ
  }
 # inverse table: Buckwalter ASCII character -> Arabic character
 buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
 # the three tanwin marks and the three short vowels; shadda and sukun are not included
 arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
 # code points U+0628..U+0647: starts after alif and stops before waw and ya
 # NOTE(review): the range also spans U+063B..U+0640 (tatweel included) - confirm this is intended
 sound_consonants = {chr(c) for c in range(0x628, 0x648)}  # excluding alif, waw, ya
 def to_buckwalter(s):
    """Transliterate s to Buckwalter; characters not in buckwalter_dict are kept."""
    # buckwalter_dict is keyed by code point, which is what str.translate expects
    return s.translate(buckwalter_dict)
 def from_buckwalter(s):
    """Convert Buckwalter ASCII in s back to Arabic; unknown characters are kept."""
    # the reverse table is keyed by one-character strings, so let maketrans
    # turn it into the code-point table that str.translate needs
    return s.translate(str.maketrans(buckwalter_dict_rev))
 def drop_final_vowel(s):
    """Return s without its last character if that character is in arabic_vowels.

    An empty string is returned unchanged; it used to raise IndexError.
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
 def normal(s):
    """Return the canonical decomposition (NFD) of s.

    NFD puts combining marks in canonical order, so two strings that differ
    only in the order of shadda and a vowel normalize to the same string.
    """
    # imported here because this module has no top-level import of unicodedata;
    # an import in the importing script does not reach this module's globals
    import unicodedata
    return unicodedata.normalize('NFD', s)
 # heuristic for finding the three radicals of a word form:
 # collect the characters that are in sound_consonants, in order of occurrence;
 # the result is that string if it has exactly three characters, otherwise None
 def get_sound_trigram_root(s):
    radicals = ''.join(filter(sound_consonants.__contains__, s))
    return radicals if len(radicals) == 3 else None
 # reverse engineer the fcl pattern of a given form with a sound trigram root
 # TODO: better use the given root of the lex entry
 def get_sound_fcl_pattern(s):
    """Return s with its three radicals replaced by fa, ain and lam.

    Returns None when get_sound_trigram_root finds no three-letter root.
    A radical may occur more than once: each one is searched for after the
    position of the previous one, so every slot gets its own position.
    (The former test len([c in s for c in root]) == 3 was always true and
    has been removed; results are unchanged.)
    """
    root = get_sound_trigram_root(s)
    if root is None:
        return None
    pattern = list(s)
    pos = -1
    # fa, ain, lam stand for the first, second and third radical
    for radical, slot in zip(root, '\u0641\u0639\u0644'):
        # the root was read off s in order, so the search always succeeds
        pos = s.find(radical, pos + 1)
        pattern[pos] = slot
    return ''.join(pattern)
 # Wikt uses vowel+shadda which is a Unicode normalization
 # GF uses shadda+vowel which is linguistically correct
 # see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
 # unicodedata.normalize does this wrong, as noted by Ariel Gutman
 def reorder_shadda(s):
    """Rewrite every fatha/damma/kasra + shadda pair in s as shadda + vowel.

    Works directly on the code points. The earlier round trip through
    Buckwalter also converted any Latin letter or punctuation mark that
    happens to be a Buckwalter code into an Arabic character.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):  # fatha, damma, kasra
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
 # quote word forms but not parameters:
 # if cond(s) holds, return change(s) in double quotes, otherwise s itself
 def quote_if(s, cond=is_arabic, change=reorder_shadda):
    return '"' + change(s) + '"' if cond(s) else s
 # for a string, change each string literal in "..." with a change function
 # leaving other characters as they are; print the string to stdout as you go
 def change_literals(s, change):
    """Print s with every double-quoted literal replaced by change(literal).

    Text outside literals is printed unchanged and no newline is added.
    If s ends inside an unterminated literal, the opening quote and the rest
    of the text are printed unchanged; they used to be dropped silently,
    together with the line's trailing newline.
    """
    segments = s.split('"')
    last = len(segments) - 1
    for index, segment in enumerate(segments):
        if index % 2 == 0:
            # even segments lie outside any literal
            print(segment, end='')
        elif index < last:
            # odd segment followed by a closing quote: a complete literal
            print('"' + change(segment) + '"', end='')
        else:
            # odd final segment: the literal was never closed
            print('"' + segment, end='')
 # convert literals in stdin 'to' or 'from' Buckwalter
 if __name__ == '__main__':
    import sys
    converters = {'to': to_buckwalter, 'from': from_buckwalter}
    # a missing argument used to raise IndexError, and an unknown mode
    # silently discarded all input; report both as usage errors instead
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        sys.exit('usage: python3 arabic_utilities.py (to | from) <input >output')
    convert = converters[sys.argv[1]]
    for line in sys.stdin:
        change_literals(line, convert)
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -0,0 +1,455 @@
 import gzip
 import json
 import sys
 import unicodedata
 import pgf
 from arabic_utilities import *
 # data from https://kaikki.org/dictionary/rawdata.html
 # thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
 # Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
 """
 This file converts Wiktionary data to GF morphological dictionary files.
 It works for Arabic, but some functionalities could be adapted to other languages.
 The steps to take are the following:
 fetch data:
  raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
 filter Arabic entries:
  $ python3 read_wiktionary.py raw >wikt_arabic.jsonl
 create GF files:
  $ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
  $ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
 automatic evaluation:
  $ gf -make MorphoDictAra.gf
  $ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
  $ python3 read_wiktionary.py eval
 TODO:
 - better generation of GF
 - better paradigms to use Wiktionary data
 - refactor the code so that it can be used for other languages
 """
 # MODE selects what the script does; it stays empty when the file is imported
 MODE = ''
 if __name__ == '__main__':
    if not sys.argv[1:]:
        # stdout is normally redirected into a generated file, so the usage
        # message goes to stderr and the exit status signals the failure
        print('usage: read_wiktionary.py (raw | json | gf-abs | gf-cnc | gf-map | '
              'eval | eval-funs | eval-verbose | error-analysis)', file=sys.stderr)
        sys.exit(1)
    MODE = sys.argv[1]
 # step 1: extract the Arabic entries from this dump, using the raw option
 WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
 EXTRACTED_LANGUAGE = 'Arabic'
 # the following file is generated by the raw option;
 # it is the input of the json and gf-* options
 FILTERED_WIKT = 'wikt_arabic.jsonl'
 # map each successfully extracted GF function to its source record in Wiktionary
 # created with option gf-map
 FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
 # created with $ gf -make MorphoDictAra.gf
 PGF_FILE = 'MorphoDictAraAbs.pgf'
 # module to linearize with
 CONCRETE_MODULE = 'MorphoDictAra'
 # concrete syntax file, to debug sources of linearizations
 CONCRETE_FILE = CONCRETE_MODULE + '.gf'
 # evaluation result file, created with mode eval-funs and read by error-analysis
 EVAL_FILE = 'eval.jsonl'
 # read a gzipped jsonl file (one object per line),
 # showing lines where one of a list of languages is present
 # This can be sampled to one of 100k lines by default, 1 for total recall.
 def get_gzip_json(file, sample=100000, langs=None):
    """Print the sampled lines of a gzipped JSONL file whose 'lang' is in langs.

    file   -- path of the gzip-compressed file, one JSON object per line
    sample -- only every sample-th line is parsed and considered
    langs  -- accepted values of the 'lang' field; None or empty prints nothing

    Each record is printed on exactly one line. The trailing newline of the
    input line is stripped first, because print adds its own; otherwise the
    output has an empty line after every record.
    """
    wanted = tuple(langs) if langs else ()
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in wanted:
                print(line.decode('utf-8').rstrip('\r\n'))
 # to perform the first step of data extraction, pipe this into a file:
 # python3 read_wiktionary.py raw >wikt_arabic.jsonl
 if MODE == 'raw':
    get_gzip_json(WIKTIONARY_DUMP, sample=1, langs=[EXTRACTED_LANGUAGE])
    exit()
 # count the rows of the evaluation file per (category, labels, verdict);
 # rows without labels are skipped
 if MODE == 'error-analysis':
    evals = {}
    with open(EVAL_FILE) as file:
        for row in map(json.loads, file):
            labels = row.get('labels', None)
            if not labels:
                continue
            # the category is the last character of the function name
            key = (row['fun'][-1], labels, row['verdict'])
            evals[key] = evals.get(key, 0) + 1
    for labverdict in sorted(evals):
        print(labverdict, evals[labverdict])
 # build the quoted GF function name 'word_C' for a word and a category C;
 # a non-zero disamb d is inserted as 'word_d_C'
 def gf_fun(s, pos, disamb=0):
    suffix = f'_{disamb}' if disamb else ''
    return f"'{s}{suffix}_{pos}'"
 # mapping from GF to Wikt features:
 # GF parameter constructor name -> the tag used for it in Wiktionary form descriptions.
 # compare_tables ignores GF parameter names that are not keys of this table.
 arabic_rgl_features = {
    # V
    'VPerf': 'perfective',
    'Act': 'active',
    'Pas': 'passive',
    'Per3': 'third-person',
    'Per2': 'second-person',
    'Per1': 'first-person',
    'Masc': 'masculine',
    'Fem': 'feminine',
    'Sing': 'singular',
    'Plur': 'plural',
    'Sg': 'singular',
    'Pl': 'plural',
    'Dl': 'dual',
    'VImpf': 'imperfective',
    'Ind': 'indicative',
    'Cnj': 'subjunctive',
    'Jus': 'jussive',
    'VImp': 'imperative',
    # N: also Sg, Pl, Dl
    'Def': 'definite',
    'Indef': 'indefinite',
    'Nom': 'nominative',
    'Acc': 'accusative',
    'Gen': 'genitive',
 #    'Bare':
 #    'Dat':
    'Const': 'construct'
 #    'Poss':
    #A: also N features; degree features cannot be found
 #    'APosit': 'positive',
 #    'AComp': 'comparative'
    }
 # the inflection forms in a wiktionary entry:
 # a dict from each form (shadda reordered) to its list of tags,
 # leaving out romanizations and forms without Arabic characters
 def wikt_forms_from_obj(obj):
    forms = {}
    for item in obj.get('forms', []):
        tags = item.get('tags', [])
        if 'romanization' in tags:
            continue
        if not is_arabic(item['form']):
            continue
        forms[reorder_shadda(item['form'])] = tags
    # the root (three radicals) is found in the first ar-root template, if at all
    roots = [find_root(template['expansion'])
             for template in obj.get('etymology_templates', [])
             if template.get('name', None) == 'ar-root']
    if roots:
        radicals = roots[0].strip()
        if radicals:
            forms['root'] = radicals
    return forms
 # selection of forms for a given POS from Wikt: noun, adj, or verb
 # return a linearization function
 def forms_for_pos(obj):
    """Select the Wiktionary forms needed to build a GF lexicon entry for obj.

    For the parts of speech noun, verb and adj the result is a dict with
    'cat' (N, V or A), 'lemma' and 'args' (field name -> list holding at most
    one value). For any other part of speech it is the plain form -> tags
    dict. When a lemma was found, 'lemma' becomes a string, a 'root' argument
    is added if one is available, and two more keys are set: 'lin', the GF
    expression 'wmkC {field = value ; ...}' over the non-empty arguments in
    sorted order, and 'labels', the comma-separated names of those fields.
    """
    dforms = wikt_forms_from_obj(obj)
    forms = dforms.items()
    if obj['pos'] == 'noun':
        # lemma and plural: first form tagged construct + nominative + number,
        # without its final vowel
        lemma = [drop_final_vowel(form) for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
        plural = [drop_final_vowel(form) for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
        # the gender is read from the entry's categories; empty if neither is present
        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                            else (['masc'] if  'Arabic masculine nouns' in obj['categories']
                                  else []))
        gf_entry = {
            'cat': 'N',
            'lemma': lemma,
            'args': {
                'sg': lemma,  
                'pl': plural,
                'g': gender
                }
            } 
    elif obj['pos'] == 'verb':
        # lemma: the third person masculine singular active perfective form
        lemma = [form for form, descr in forms
                      if all([w in descr for
                              w in ["active", "indicative", "masculine", "past",
                                        "perfective", "singular", "third-person"]])][:1]
        gf_entry = {
          'cat': 'V',
          'lemma': lemma,
          'args': {
              'perfect': lemma, 
              'imperfect': [form for form, descr in forms
                      if all([w in descr for
                              w in [
                                  "active", "indicative", "masculine", "non-past",
                                  "imperfective", "singular", "third-person"]])][:1],
              # cls: 'Form' + the longest listed Roman numeral that occurs as a
              # substring of the categories ending in 'verbs' that contain I, V or X;
              # '' always matches, so the value is a bare 'Form' if no numeral is found
              # NOTE(review): 'IX' is a candidate, but the VerbForm param shown in
              # ParadigmsAra.gf lists no FormIX - confirm how such verbs should be handled
              'cls': ['Form' + max([n for n in [
                  'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
                            if n in ' '.join([c for c in obj['categories']
                                if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
                           key=len)]  # max in RGL is XI, in Wikt XIII
              }
          }
    elif obj['pos'] == 'adj':
        # all four adjective forms are the indefinite informal ones
        lemma = [form for form, descr in forms
                    if all([w in descr for w in [
                        'indefinite', 'masculine', 'singular', 'informal']])][:1]
        gf_entry = {
            'cat': 'A',
            'lemma': lemma,
            'args': {
                'masc_sg': lemma,   
                'masc_pl': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
                'fem_sg': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],  
                'fem_pl': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                }
            }
        # derive the fcl patterns of the masculine forms where the heuristic applies
        for patt in ['masc_sg', 'masc_pl']:
            if patt in gf_entry['args']:
                if form := gf_entry['args'][patt]:
                    if spatt := get_sound_fcl_pattern(form[0]):
                        gf_entry['args'][patt[5:]+'_patt'] = [spatt]  # sg_patt, pl_patt
    else:
        # other parts of speech: no GF entry is built, just the form -> tags dict
        gf_entry = {f: d for f, d in forms}
    if 'lemma' in gf_entry and gf_entry['lemma']:
        gf_entry['lemma'] = gf_entry['lemma'][0]
        # prefer the root given by Wiktionary, otherwise guess it from the lemma
        if 'root' in dforms:
            gf_entry['args']['root'] = [dforms['root']]
        elif root := get_sound_trigram_root(gf_entry['lemma']):
            gf_entry['args']['root'] = [root]
        # only non-empty arguments are passed on; word forms are quoted, parameters are not
        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
        gf_entry['labels'] = ','.join([r for r, v in args])
    return gf_entry
 # "root": ["ش ر ح (š-r-ḥ)"]
 # keep only the characters of the template expansion that is_arabic accepts
 def find_root(s):
    return ''.join(filter(is_arabic, s))
 # GF code generation
 # start with the header of the desired GF module
 if MODE == 'gf-abs':
    print('abstract MorphoDictAraAbs = Cat ** {')
 if MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {')
 # go through the Arabic Wiktionary entries
 # generate functions with unique names
 if MODE.startswith('gf') or MODE=='json':
  with open(FILTERED_WIKT) as file:
    seen_gf_funs = {}  # to disambiguate names if needed
    number = 1
    for line in file:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # blank or malformed lines are skipped; any other exception
            # (including KeyboardInterrupt) is no longer swallowed
            continue
        number += 1   # running number of the record, shown in the gf-abs comment
        # only take entries that are marked as lemmas
        if 'Arabic lemmas' in obj.get('categories', []):
            entry = {
                'pos': obj['pos'],
                'forms': forms_for_pos(obj),
                'all_forms': wikt_forms_from_obj(obj),
                'senses': [sense['glosses'] for sense in obj.get('senses', [])
                           if 'glosses' in sense]
                }
            # if you only want to see the Wikt information used in GF generation
            if MODE == 'json':
                print(json.dumps(entry, ensure_ascii=False))
            # if you want to proceed to GF generation
            if MODE.startswith('gf'):
                lemma = entry['forms'].get('lemma', None)
                if lemma:
                    cat = entry['forms']['cat']
                    lin = entry['forms']['lin']
                    labels = entry['forms']['labels']
                    # if you find the same word_C again, mark it word_1_C
                    discrim = seen_gf_funs.get((lemma, cat), 0)
                    fun = gf_fun(lemma, cat, discrim)
                    # abstract syntax, save in MorphoDictAraAbs.gf
                    if MODE == 'gf-abs':
                        print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                    # concrete syntax, save in MorphoDictAra.gf
                    elif MODE == 'gf-cnc':
                        print('lin', fun, '=', lin, ';')
                    # function-source map, save in source_of_MorphoDictAra.jsonl
                    elif MODE == 'gf-map':
                        source = wikt_forms_from_obj(obj)
                        source['gf_labels'] = labels
                        mapitem = {'fun': fun, 'source': source}
                        print(json.dumps(mapitem, ensure_ascii=False))
                    seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
 # terminate the GF file with a closing brace
 if MODE in ['gf-abs', 'gf-cnc']:
    print('}')
 # evaluation:
 # linearize all words to tables
 # compare them to the forms found in Wiktionary
 # report on matches
 # format of GF table:
 #  {'s (AComp Def Bare)': '<word form>'}
 # coming from pgf tabularLinearize
 # compare the table for one function, returning a report as a dict
 def compare_tables(gf, wikt, fun, show_buckwalter=True):
    """Compare the GF forms of one function with the Wiktionary forms.

    gf   -- dict from GF parameter description to word form
    wikt -- dict from Wiktionary form to its tags; must have the key 'gf_labels'
    fun  -- name of the GF function, copied into the report
    show_buckwalter -- also include Buckwalter transliterations of both forms

    The report has one entry per tuple of Wiktionary-relevant GF tags, plus
    the keys 'fun', 'labels', 'total_found', 'total_voc' and 'total_unvoc'.
    """
    report = {}    
    for pair in gf.items():
        gf_form = pair[1]
        gf_params = pair[0]
        # keep only the parameter names that have a Wiktionary counterpart
        gf_tags = tuple(word for word in
                    pair[0].replace('(', ' ').replace(')', ' ').split()
                      if word in arabic_rgl_features)
        if not gf_tags:
            continue  # if gf_tags match no Wikt tags, do not include this form
        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
        wikt_form = None
        wikt_descr = None
        # take the first Wiktionary form whose description contains all the tags
        for form, descr in wikt.items():
            if all([tag in descr for tag in wikt_tags]):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
                break
        report[gf_tags] = {          # flat param description with only Wikt-relevant tags
            'gf_params': gf_params,  # full param description
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'wikt_descr': wikt_descr
            }
        if show_buckwalter:
            report[gf_tags]['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
            report[gf_tags]['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
        if wikt_form:
            # 1 if the forms agree with vowels, and 1 if they agree without them
            report[gf_tags]['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            report[gf_tags]['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
    # snapshot of the per-form entries, taken before the summary keys are added
    ritems = tuple(report.items())  # need an unmutable structure, because otherwise ints are added to items
    report['fun'] = fun
    report['labels'] = wikt['gf_labels']
    report['total_found'] = len([f for f, v  in ritems if v['wikt_form'] is not None ])
    report['total_voc'] = sum([v.get('voc_match', 0) for f, v in ritems])
    report['total_unvoc'] = sum([v.get('unvoc_match', 0) for f, v in ritems])
    return report
 # with a given grammar and function, prepare input for compare_tables
 # and produce a report, possibly summarizing it
 def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
    """Evaluate one GF function against its Wiktionary source.

    Returns None, after printing a message, if fun is not in the grammar.
    With verbose the full report of compare_tables is returned; otherwise a
    summary with the counts, a verdict and, for flawed entries, the first
    form whose vocalized comparison failed.
    """
    if fun not in gr.functions:
        print(fun, 'not found in grammar')
        return
    table = lang.tabularLinearize(pgf.Expr(fun, []))
    # require the s field, exclude s2
    gf = dict((param, form) for param, form in table.items()
              if param.startswith('s '))
    report = compare_tables(gf, wikt, fun)
    if verbose:
        return report
    found = report['total_found']
    voc = report['total_voc']
    unvoc = report['total_unvoc']
    if found == 0:
        verdict = 'NOT_FOUND'
    elif found == voc:
        verdict = 'PERFECT'
    elif found == unvoc:
        verdict = 'PERFECT_UNVOC'
    elif voc == 0:
        verdict = 'TOTALLY_WRONG'
    else:
        verdict = 'PARTIAL'
    summary = {
        'fun': report['fun'],
        'forms': found,
        'voc': voc,
        'unvoc': unvoc,
        'verdict': verdict,
        'labels': report['labels'],
        }
    # every verdict except the first two means that some form did not match
    if verdict not in ('NOT_FOUND', 'PERFECT'):
        for value in report.values():
            if value.get('voc_match', 1) == 0:
                summary['first_error'] = value
                break
    return summary
 def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
    """Evaluate every function listed in mapfile and print the results as JSON.

    pgffile      -- compiled grammar, read with pgf.readPGF
    concretename -- name of the concrete syntax to linearize with
    mapfile      -- JSONL file with one {'fun': ..., 'source': ...} object per line
    show         -- print the report of each function
    verbose      -- ask eval_with_wikt for full reports instead of summaries

    The last line printed is the number of verdicts per category (A, N, V).
    """
    gr = pgf.readPGF(pgffile)
    concrete = gr.languages[concretename]
    totals = {'A': {}, 'N': {}, 'V': {}}
    with open(mapfile) as file:
        for line in file:
            obj = json.loads(line)
            fun = obj['fun'][1:-1]  # strip the quotes around the GF identifier
            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
            if report is None:
                # function missing from the grammar: eval_with_wikt has already
                # reported it, and there is nothing to count or print
                continue
            cat = fun[-1]
            if 'verdict' in report:
                rep = report['verdict']
                totals[cat][rep] = totals[cat].get(rep, 0) + 1
            if show:
                # verbose reports are keyed by tuples of tags, which json.dumps
                # rejects as keys; join them into space-separated strings
                printable = {(' '.join(key) if isinstance(key, tuple) else key): value
                             for key, value in report.items()}
                print(json.dumps(printable, ensure_ascii=False))
        print(json.dumps(totals, ensure_ascii=False))
 # eval prints only the totals, eval-funs adds one summary per function,
 # eval-verbose prints the full report of every function
 if MODE.startswith('eval'):
    verbose = (MODE == 'eval-verbose')
    show = MODE in ('eval-verbose', 'eval-funs')
    eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show, verbose)
--- a/src/arabic/wiktionary/to_wordnet.py
+++ b/src/arabic/wiktionary/to_wordnet.py
@@ -0,0 +1,44 @@
 import csv
 import json
 from arabic_utilities import *
 # to run: python3 to_wordnet.py >wordnet-arabic.jsonl
 # the following input files are assumed to exist
 # alternative WordNet table: https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
 # WN_TSV = 'arabic.tsv'  # Krasimir
 WN_TSV = 'ar2en_words_gf.csv'  # Zarzoura
 # abstract syntax of the morphological dictionary,
 # built as explained in ./read_wiktionary.py
 MORPHO_GF = 'MorphoDictAraAbs.gf'
 # index the functions of the morphological dictionary by
 # (unvocalized word, category); the lines to read look like this:
 # fun 'دُبُ_N' : N ; -- 10 [['bear']]
 funmap = {}
 with open(MORPHO_GF) as gffile:
    for line in gffile:
        # cut at the fixed separators rather than at whitespace, so that
        # function names containing spaces stay in one piece
        head, marker, comment = line.partition(' ; -- ')
        decl, colon, cat = head.rpartition(' : ')
        if not (marker and colon and decl.startswith('fun ')):
            continue  # module header, closing brace, or anything unexpected
        fun = decl[len('fun '):]
        key = unvocalize(fun)
        # record number and glosses, with whitespace normalized
        sense = ' '.join(comment.split())
        funmap.setdefault((key, cat), []).append({'fun': fun, 'sense': sense})
 # look up every row of the WordNet table in funmap and print the result as JSON
 # row format in Krasimir's table:
 # abandon_1_V2    ParseAra        ترك     (1,1,1,3,322,3)
 with open(WN_TSV) as wnfile:
    for row in wnfile:
        fields = row.split()
        if not fields:
            continue  # blank line: nothing to look up (used to raise IndexError)
        # the word is whatever Arabic the row contains, without vowels
        word = unvocalize(get_arabic(row))
        wnfun = fields[-1]  # 0 in Krasimir
        letters = [c for c in wnfun if c.isalpha()]
        if not letters:
            continue  # no category letter in this field (used to raise IndexError)
        cat = letters[-1]  # the last letter; the dict only contains N, A, V
        funs = funmap.get((word, cat), [])
        result = {'wnfun': wnfun, 'sought': word, 'found': funs}
        print(json.dumps(result, ensure_ascii=False))