Merge branch 'master' of github.com:GrammaticalFramework/gf-rgl

This commit is contained in:
Krasimir Angelov
2023-09-26 10:47:39 +02:00
7 changed files with 873 additions and 2 deletions

View File

@@ -153,7 +153,8 @@ oper
w + "ف" + x + "ع" + y + "ل" + z
=> { h = w ; m1 = x; m2 = y; t = z} ;
w + "ف" + x + ("ع"|"ل") + y
=> { h = w ; m1 = x; m2 = ""; t = y}
=> { h = w ; m1 = x; m2 = ""; t = y} ;
_ => Predef.error("cannot get FCL pattern from" ++ pat)
} ;
--opers to interdigitize (make words out of roots and patterns:
@@ -204,7 +205,8 @@ oper
=> mkAssimilated pat (mkRoot3 rS) ;
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
_=> error rS ---- AR error "expected 3--6"
}
} ;
_ => Predef.error("cannot get FCL pattern from" ++ pS)
};
-----------------------------------------------------------------------------

View File

@@ -868,4 +868,99 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
param VerbForm =
FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
{- temporarily moved to wiktionary/MoreAra.gf
-- paradigms for Wiktionary extraction
---- TODO: better usage of information in Wiktionary
oper
wmkN = overload {
wmkN : {sg, pl : Str ; g : Gender} -> N
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
wmkN : {sg : Str} -> N
= \r -> smartN r.sg ;
wmkN : {sg : Str ; g : Gender ; root : Str} -> N
= \r -> smartN r.sg ** {g = r.g} ; ----
wmkN : {sg : Str; g : Gender} -> N
= \r -> smartN r.sg ** {g = r.g} ;
wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
wmkN : {sg : Str; pl : Str} -> N
= \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
wmkN : {sg, pl : Str ; root : Str} -> N
= \r -> mkN r.sg r.pl masc nohum ; ----
wmkN : {sg : Str; root : Str} -> N
= \r -> smartN r.sg ;
} ;
wmkA = overload {
wmkA : {root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
= \r -> mkA r.root r.sg_patt r.pl_patt ;
wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
= \r -> mkA r.root r.sg_patt r.pl_patt ;
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
wmkA : {masc_sg, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
= \r -> mkA r.root ; ----
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
= \r -> mkA r.root ; ----
wmkA : {masc_sg, fem_sg, root : Str} -> A
= \r -> mkA r.root ; ----
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
= \r -> mkA r.root r.sg_patt ;
wmkA : {masc_sg : Str; fem_sg : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
= \r -> mkA r.sg_patt r.pl_patt ;
wmkA : {masc_sg : Str; masc_pl : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
= \r -> mkA r.masc_sg ; ----
wmkA : {masc_sg : Str; root : Str} -> A
= \r -> mkA r.root ;
wmkA : {masc_sg : Str} -> A
= \r -> mkA r.masc_sg ; ----
} ;
wmkV = overload {
wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
= \r -> mkV r.root r.cls ; ----
wmkV : {perfect : Str; cls : VerbForm} -> V
= \r -> mkV r.perfect r.cls ; ----
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
= \r -> mkV r.root r.cls ; ----
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
= \r -> mkV r.perfect r.cls ; ----
wmkV : {root : Str ; cls : VerbForm} -> V
= \r -> mkV r.root r.cls ;
wmkV : {imperfect : Str} -> V
= \r -> variants {} ; ---- mkV r.imperfect ;
} ;
-}
} ;

View File

@@ -0,0 +1,8 @@
# Build the Arabic morphological dictionary from Wiktionary data and evaluate it.
# Prerequisite not built here: wikt_arabic.jsonl, which the gf-* steps read;
# create it first with `python3 read_wiktionary.py raw >wikt_arabic.jsonl`.
all:
# 1. generate the abstract and concrete GF modules and the function-to-source map
	python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
	python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
	python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
# 2. compile the grammar; the evaluation step loads the resulting PGF
	gf -make MorphoDictAra.gf
# 3. evaluate every function against its Wiktionary source (one JSON report per line)
	python3 read_wiktionary.py eval-funs >eval.jsonl
# 4. look up WordNet entries in the generated abstract syntax
	python3 to_wordnet.py >wordnet-arabic.jsonl
# 5. summarise eval.jsonl by (category, labels, verdict)
	python3 read_wiktionary.py error-analysis

View File

@@ -0,0 +1,98 @@
resource MoreAra = CatAra ** open ParadigmsAra in {
-- temporarily moved from ParadigmsAra
-- paradigms for Wiktionary extraction
---- TODO: better usage of information in Wiktionary
oper
-- Noun paradigm for Wiktionary records: one overload per combination of
-- extracted fields (sg, pl, g, root).
-- The root field is accepted but not used by any overload.
-- Overloads without g and with a plural fall back to masc.
wmkN = overload {
  wmkN : {sg, pl : Str ; g : Gender} -> N
    = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
  wmkN : {sg : Str} -> N
    = \r -> smartN r.sg ;
  wmkN : {sg : Str ; g : Gender ; root : Str} -> N
    = \r -> smartN r.sg ** {g = r.g} ; ----
  wmkN : {sg : Str; g : Gender} -> N
    = \r -> smartN r.sg ** {g = r.g} ;
  wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
    = \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
  wmkN : {sg : Str; pl : Str} -> N
    = \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
  wmkN : {sg, pl : Str ; root : Str} -> N
    = \r -> mkN r.sg r.pl masc nohum ; ----
  wmkN : {sg : Str; root : Str} -> N
    = \r -> smartN r.sg ;
  } ;
-- Adjective paradigm for Wiktionary records: one overload per combination of
-- extracted fields (masc_sg, masc_pl, fem_sg, fem_pl, root, sg_patt, pl_patt).
-- When a root is present it is preferred over the surface form masc_sg;
-- sg_patt / pl_patt are passed on only together with a root.
-- Overloads marked ---- ignore part of the available information.
wmkA = overload {
  wmkA : {root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt r.pl_patt ;
  wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt r.pl_patt ;
  wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt ;
  wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt ;
  wmkA : {masc_sg, root, sg_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt ;
  wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt ;
  wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
    = \r -> mkA r.root ; ----
  wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
    = \r -> mkA r.root ; ----
  wmkA : {masc_sg, fem_sg, root : Str} -> A
    = \r -> mkA r.root ; ----
  wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
    = \r -> mkA r.root r.sg_patt ;
  wmkA : {masc_sg : Str; fem_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
    = \r -> mkA r.root ;
  -- root + both patterns: same three-argument call as the overloads above
  -- (previously the root was left out and sg_patt was passed in its place)
  wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
    = \r -> mkA r.root r.sg_patt r.pl_patt ;
  wmkA : {masc_sg : Str; masc_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  } ;
-- Verb paradigm for Wiktionary records: one overload per combination of
-- extracted fields (perfect, imperfect, cls, root).
-- With a root, the verb is built from root + class; otherwise the perfect
-- form is passed in the root position. The imperfect form is never used.
wmkV = overload {
  wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
    = \r -> mkV r.root r.cls ; ----
  wmkV : {perfect : Str; cls : VerbForm} -> V
    = \r -> mkV r.perfect r.cls ; ----
  wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
    = \r -> mkV r.root r.cls ; ----
  wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
    = \r -> mkV r.perfect r.cls ; ----
  wmkV : {root : Str ; cls : VerbForm} -> V
    = \r -> mkV r.root r.cls ;
  -- only an imperfect form is known: yields `variants {}` rather than a verb;
  -- the intended call is still commented out
  wmkV : {imperfect : Str} -> V
    = \r -> variants {} ; ---- mkV r.imperfect ;
  } ;
}

View File

@@ -0,0 +1,169 @@
# utilities for Arabic script
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
# as specified by the command line argument:
#
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
# % diff MorphoDictAra.gf b.tmp
# %
def is_arabic(s):
    """Return True if *s* contains at least one Arabic letter or diacritic.

    The accepted range is U+0621 (hamza) to U+0652 (sukun). The previous
    range started at U+0626 and ended at U+0650, so the hamza letters
    U+0621..U+0625 as well as shadda and sukun were not recognised, and
    callers that filter character by character silently dropped them.

    Empty or None input yields False.
    """
    if not s:
        return False
    return any(0x621 <= ord(c) <= 0x652 for c in s)
def get_arabic(s):
    """Return *s* reduced to the characters that is_arabic() accepts."""
    return ''.join(filter(is_arabic, s))
def unvocalize(s):
    """Keep only the characters of *s* in U+0621..U+064A, dropping everything else.

    Diacritics (U+064B and above) fall outside that range, and so do all
    non-Arabic characters.
    """
    kept = []
    for ch in s:
        # single characters compare by code point
        if '\u0621' <= ch <= '\u064a':
            kept.append(ch)
    return ''.join(kept)
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
# Unicode code point -> Buckwalter ASCII symbol.
# U+0640 (tatweel) and the unassigned gap U+063B..U+063F have no entry here,
# so to_buckwalter leaves such characters unchanged.
buckwalter_dict = {
    0x621: "'",  # ء hamza
    0x622: '|',  # آ alif madda
    0x623: '>',  # أ alif with hamza above
    0x624: '&',  # ؤ waw with hamza
    0x625: '<',  # إ alif with hamza below
    0x626: '}',  # ئ ya with hamza
    0x627: 'A',  # ا alif
    0x628: 'b',  # ب
    0x629: 'p',  # ة ta marbuta
    0x62a: 't',  # ت
    0x62b: 'v',  # ث
    0x62c: 'j',  # ج
    0x62d: 'H',  # ح
    0x62e: 'x',  # خ
    0x62f: 'd',  # د
    0x630: '*',  # ذ
    0x631: 'r',  # ر
    0x632: 'z',  # ز
    0x633: 's',  # س
    0x634: '$',  # ش
    0x635: 'S',  # ص
    0x636: 'D',  # ض
    0x637: 'T',  # ط
    0x638: 'Z',  # ظ
    0x639: 'E',  # ع
    0x63a: 'g',  # غ
    0x641: 'f',  # ف
    0x642: 'q',  # ق
    0x643: 'k',  # ك
    0x644: 'l',  # ل
    0x645: 'm',  # م
    0x646: 'n',  # ن
    0x647: 'h',  # ه
    0x648: 'w',  # و
    0x649: 'Y',  # ى alif maqsura
    0x64a: 'y',  # ي
    0x64b: 'F',  # fathatan
    0x64c: 'N',  # dammatan
    0x64d: 'K',  # kasratan
    0x64e: 'a',  # fatha
    0x64f: 'u',  # damma
    0x650: 'i',  # kasra
    0x651: '~',  # shadda
    0x652: 'o',  # sukun
    0x670: '`',  # superscript (dagger) alif
    0x671: '{'  # ٱ alif wasla
    }
# Buckwalter symbol -> Arabic character (inverse of the table above)
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
# tanwin (U+064B..U+064D) and short vowels (U+064E..U+0650); shadda and sukun are not included
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
# U+0628..U+0647: excludes alif, waw, ya and also the hamza letters below U+0628.
# NOTE(review): the range does include ta marbuta (U+0629) and tatweel (U+0640) -
# confirm that counting them as radicals is intended.
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
def to_buckwalter(s):
    """Transliterate *s* to Buckwalter; characters without a mapping are kept as they are."""
    pieces = []
    for ch in s:
        pieces.append(buckwalter_dict.get(ord(ch), ch))
    return ''.join(pieces)
def from_buckwalter(s):
    """Convert Buckwalter symbols in *s* to Arabic script; other characters are kept as they are."""
    pieces = []
    for symbol in s:
        pieces.append(buckwalter_dict_rev.get(symbol, symbol))
    return ''.join(pieces)
def drop_final_vowel(s):
    """Return *s* without its last character if that character is in arabic_vowels.

    An empty string is returned unchanged (it used to raise IndexError).
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
def normal(s):
    """Return the Unicode NFD (canonical decomposition) form of *s*."""
    # Imported here because this module has no top-level `import unicodedata`;
    # without it every call raised NameError, even when the caller's own module
    # had imported unicodedata.
    import unicodedata
    return unicodedata.normalize('NFD', s)
# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the three characters of *s* that are in sound_consonants, else None.

    The result is a string only when *s* contains exactly three such
    characters (in order of occurrence); any other count gives None.
    """
    radicals = ''.join(ch for ch in s if ch in sound_consonants)
    return radicals if len(radicals) == 3 else None
# reverse engineer fcl pattern from a given form, with a sound trigram root
# one more condition: each of the root letters occurs exactly once
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Reverse-engineer the fa-ayn-lam pattern of the word form *s*.

    The three radicals found by get_sound_trigram_root are replaced, at their
    positions in *s*, by U+0641 (fa), U+0639 (ayn) and U+0644 (lam); all other
    characters are kept.

    Returns None when no sound three-letter root is found, or when a root
    letter occurs more than once in *s*. The earlier check
    `len([c in s for c in root]) == 3` counted three booleans and was
    therefore always true, so that second condition was never enforced.
    """
    root = get_sound_trigram_root(s)
    if not root or any(s.count(radical) != 1 for radical in root):
        return None
    pattern = list(s)
    position = -1
    # the radicals appear in s in root order, so each search starts after the previous hit
    for radical, slot in zip(root, '\u0641\u0639\u0644'):
        position = s.index(radical, position + 1)
        pattern[position] = slot
    return ''.join(pattern)
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
## todo: more direct implementation
def reorder_shadda(s):
    """Rewrite every vowel+shadda sequence in *s* as shadda+vowel.

    Handles fatha, damma and kasra, in that order, exactly as the previous
    Buckwalter-based version did ('a~', 'u~', 'i~').

    This works directly on the Arabic code points. The previous version went
    through to_buckwalter/from_buckwalter, which also converted any ASCII
    character that happens to be a Buckwalter symbol (letters, "'", '|', ...)
    into Arabic script; such characters are now left untouched.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):  # fatha, damma, kasra
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return change(s) wrapped in double quotes when cond(s) holds, else *s* unchanged.

    With the defaults, word forms in Arabic script become quoted string
    literals while other values (parameters) pass through as they are.
    """
    return '"' + change(s) + '"' if cond(s) else s
# for a string, change each string literal in "..." with a change function
# leaving other characters as they are; print the string to stdout as you go
def change_literals(s, change):
    """Print *s* to stdout with every "..." literal replaced by "change(literal)".

    Characters outside literals are printed as they are, with no newline
    added. A literal that is still open at the end of *s* is not printed.
    """
    pending = None  # None outside a literal, otherwise the characters collected so far
    for ch in s:
        if ch != '"':
            if pending is None:
                print(ch, end='')
            else:
                pending.append(ch)
        elif pending is None:
            # opening quote
            pending = []
        else:
            # closing quote: emit the converted literal
            print('"' + change(''.join(pending)) + '"', end='')
            pending = None
# convert literals in stdin 'to' or 'from' Buckwalter
if __name__ == '__main__':
    import sys
    # command-line mode -> conversion applied to every string literal on stdin
    converters = {'from': from_buckwalter, 'to': to_buckwalter}
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        # a missing argument used to raise IndexError, and an unknown mode
        # silently consumed stdin without producing any output
        sys.exit('usage: python3 arabic_utilities.py (to | from) <input >output')
    convert = converters[sys.argv[1]]
    for line in sys.stdin:
        change_literals(line, convert)

View File

@@ -0,0 +1,455 @@
import gzip
import json
import sys
import unicodedata
import pgf
from arabic_utilities import *
# data from https://kaikki.org/dictionary/rawdata.html
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
"""
This file converts Wiktionary data to GF morphological dictionary files.
It works for Arabic, but some functionality could be adapted to other languages.
The steps to take are the following:
fetch data:
raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
filter Arabic entries:
$ python3 read_wiktionary.py raw >wikt_arabic.jsonl
create GF files:
$ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
$ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
automatic evaluation:
$ gf -make MorphoDictAra.gf
$ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
$ python3 read_wiktionary.py eval
TODO:
- better generation of GF
- better paradigms to use Wiktionary data
- refactor the code so that it can be used for other languages
"""
# Processing mode taken from the first command-line argument.
# It stays '' when the file is imported as a module, so none of the
# mode-specific sections below run.
MODE = ''
if __name__ == '__main__':
    if not sys.argv[1:]:
        # list every mode the script actually implements
        print('usage: read_wiktionary (raw | json | gf-abs | gf-cnc | gf-map | '
              'eval | eval-funs | eval-verbose | error-analysis)')
        sys.exit()
    MODE = sys.argv[1]
# step 1: extract Arabic data from this file using the raw option
# (gzipped JSONL dump, one Wiktionary entry per line)
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
# value of the 'lang' field that selects the entries to keep in raw mode
EXTRACTED_LANGUAGE = 'Arabic'
# the following file is generated (stdout of the raw mode, redirected).
# in the sequel, use this file with gf-abs or gf-cnc option
FILTERED_WIKT = 'wikt_arabic.jsonl'
# map each successfully extracted GF function to its source record in Wiktionary
# created with option gf-map; read back by the eval modes
FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
# created with $ gf -make MorphoDictAra.gf
PGF_FILE = 'MorphoDictAraAbs.pgf'
# module to linearize with
CONCRETE_MODULE = 'MorphoDictAra'
# concrete syntax file, to debug sources of linearizations
CONCRETE_FILE = CONCRETE_MODULE + '.gf'
# evaluation result file, created with mode eval-funs; read by mode error-analysis
EVAL_FILE = 'eval.jsonl'
# read a gzipped jsonl file (one object per line),
# showing lines where one of a list of languages is present
# This can be sampled to one of 100k lines by default, 1 for total recall.
def get_gzip_json(file, sample=100000, langs=()):
    """Print the records of a gzipped JSONL file whose 'lang' is in *langs*.

    file:   path of the gzip-compressed file, one JSON object per line
    sample: only every *sample*-th line is examined; use 1 to examine all
    langs:  collection of accepted values of the 'lang' field

    Each selected record is printed on one line. The line's own newline is
    stripped first, so the output no longer has an empty line after every
    record.
    """
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                print(line.decode("utf-8").rstrip('\r\n'))
# to perform the first step of data extraction, pipe this into a file:
# python3 read_wiktionary.py raw >wikt_arabic.jsonl
# mode raw: dump every entry of the extracted language from the full Wiktionary file
if MODE == 'raw':
    get_gzip_json(WIKTIONARY_DUMP, sample=1, langs=[EXTRACTED_LANGUAGE])
    exit()

# mode error-analysis: count evaluation rows per (category, labels, verdict)
if MODE == 'error-analysis':
    evals = {}
    with open(EVAL_FILE) as file:
        for line in file:
            row = json.loads(line)
            labels = row.get('labels', None)
            if not labels:
                # rows without labels are not counted
                continue
            cat = row['fun'][-1]  # last character of the function name
            verdict = row['verdict']
            key = (cat, labels, verdict)
            if key in evals:
                evals[key] += 1
            else:
                evals[key] = 1
    for labverdict, n in sorted(evals.items()):
        print(labverdict, n)
# generate word_d_C functions starting with d=0, but show d only when >= 1
def gf_fun(s, pos, disamb=0):
    """Build the single-quoted GF function name 's_pos', or 's_d_pos' when disamb is non-zero."""
    if disamb:
        stem = s + '_' + str(disamb)
    else:
        stem = s + ''
    return "'" + stem + "_" + pos + "'"
# mapping from GF to Wikt features
# GF (RGL) parameter constructor -> Wiktionary tag.
# compare_tables uses this both to pick out the relevant words of a GF
# parameter description and to translate them into the tags to look for.
# Several constructors may map to the same tag (Sing/Sg, Plur/Pl).
arabic_rgl_features = {
    # V
    'VPerf': 'perfective',
    'Act': 'active',
    'Pas': 'passive',
    'Per3': 'third-person',
    'Per2': 'second-person',
    'Per1': 'first-person',
    'Masc': 'masculine',
    'Fem': 'feminine',
    'Sing': 'singular',
    'Plur': 'plural',
    'Sg': 'singular',
    'Pl': 'plural',
    'Dl': 'dual',
    'VImpf': 'imperfective',
    'Ind': 'indicative',
    'Cnj': 'subjunctive',
    'Jus': 'jussive',
    'VImp': 'imperative',
    # N: also Sg, Pl, Dl
    'Def': 'definite',
    'Indef': 'indefinite',
    'Nom': 'nominative',
    'Acc': 'accusative',
    'Gen': 'genitive',
    # no Wiktionary tag is mapped for the following:
    # 'Bare':
    # 'Dat':
    'Const': 'construct'
    # 'Poss':
    #A: also N features; degree features cannot be found
    # 'APosit': 'positive',
    # 'AComp': 'comparative'
    }
# the inflection forms in a wiktionary entry
def wikt_forms_from_obj(obj):
    """Collect the inflection forms of a Wiktionary entry as {form: tags}.

    Forms tagged 'romanization' and forms that is_arabic() rejects are
    skipped; each kept form is passed through reorder_shadda. When two forms
    give the same key, the later one wins.

    If an 'ar-root' etymology template is present, the root found in its
    expansion is stored under the extra key 'root' (as a string, not a list).
    """
    forms = {}
    for form in obj.get('forms', []):
        if 'romanization' in form.get('tags', []):
            continue
        if not is_arabic(form['form']):
            continue
        forms[reorder_shadda(form['form'])] = form.get('tags', [])
    # the root (three radicals) is found in this place if at all
    roots = [find_root(template['expansion'])
             for template in obj.get('etymology_templates', [])
             if template.get('name', None) == 'ar-root']
    if roots:
        first_root = roots[0].strip()
        if first_root:
            forms['root'] = first_root
    return forms
# selection of forms for a given POS from Wikt: noun, adj, or verb
# return a linearization function
def forms_for_pos(obj):
    """Select the forms needed for a GF entry from a Wiktionary noun, adj or verb.

    For 'noun', 'verb' and 'adj' the result is a dict with keys 'cat' (N/V/A),
    'lemma' and 'args' (field name -> list with at most one value). When a
    lemma was found, 'lemma' is turned into a string and the keys 'lin'
    (the wmkN/wmkA/wmkV call as GF source text) and 'labels' (comma-separated
    names of the non-empty args) are added.

    For any other part of speech the plain {form: tags} dict is returned,
    which normally has no 'lemma' key.
    """
    dforms = wikt_forms_from_obj(obj)
    forms = dforms.items()
    if obj['pos'] == 'noun':
        # lemma and plural are the construct nominative forms without their final vowel
        lemma = [drop_final_vowel(form) for form, descr in forms
                 if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
        plural = [drop_final_vowel(form) for form, descr in forms
                  if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
        # gender is read from the entry's categories; empty if neither is present
        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                  else (['masc'] if 'Arabic masculine nouns' in obj['categories']
                        else []))
        gf_entry = {
            'cat': 'N',
            'lemma': lemma,
            'args': {
                'sg': lemma,
                'pl': plural,
                'g': gender
                }
            }
    elif obj['pos'] == 'verb':
        # lemma: third person masculine singular, active, perfective
        lemma = [form for form, descr in forms
                 if all([w in descr for
                         w in ["active", "indicative", "masculine", "past",
                               "perfective", "singular", "third-person"]])][:1]
        gf_entry = {
            'cat': 'V',
            'lemma': lemma,
            'args': {
                'perfect': lemma,
                'imperfect': [form for form, descr in forms
                              if all([w in descr for
                                      w in [
                                          "active", "indicative", "masculine", "non-past",
                                          "imperfective", "singular", "third-person"]])][:1],
                # verb class: the longest numeral that occurs as a substring of the
                # joined '...verbs' categories containing I, V or X.
                # NOTE(review): '' always matches, so the value can be the bare
                # string 'Form'; 'IX' is accepted here although the VerbForm
                # parameter may not provide FormIX - confirm how such entries
                # should be handled.
                'cls': ['Form' + max([n for n in [
                    'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
                                      if n in ' '.join([c for c in obj['categories']
                                                        if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
                                     key=len)] # max in RGL is XI, in Wikt XIII
                }
            }
    elif obj['pos'] == 'adj':
        # all four adjective forms are the indefinite 'informal' ones
        lemma = [form for form, descr in forms
                 if all([w in descr for w in [
                     'indefinite', 'masculine', 'singular', 'informal']])][:1]
        gf_entry = {
            'cat': 'A',
            'lemma': lemma,
            'args': {
                'masc_sg': lemma,
                'masc_pl': [form for form, descr in forms
                            if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
                'fem_sg': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
                'fem_pl': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                }
            }
        # derive pattern arguments from the masculine forms where possible
        for patt in ['masc_sg', 'masc_pl']:
            if patt in gf_entry['args']:
                if form := gf_entry['args'][patt]:
                    if spatt := get_sound_fcl_pattern(form[0]):
                        gf_entry['args'][patt[5:]+'_patt'] = [spatt] # sg_patt, pl_patt
    else:
        # other parts of speech: no GF entry, just the forms
        gf_entry = {f: d for f, d in forms}
    # only entries with a non-empty lemma get a root, a lin and labels
    if 'lemma' in gf_entry and gf_entry['lemma']:
        gf_entry['lemma'] = gf_entry['lemma'][0]
        # prefer the root given by Wiktionary; otherwise guess it from the lemma
        if 'root' in dforms:
            gf_entry['args']['root'] = [dforms['root']]
        elif root := get_sound_trigram_root(gf_entry['lemma']):
            gf_entry['args']['root'] = [root]
        # keep only the args that have a value, sorted by field name
        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
        gf_entry['labels'] = ','.join([r for r, v in args])
    return gf_entry
# "root": ["ش ر ح (š-r-ḥ)"]
def find_root(s):
    """Return the characters of *s* that is_arabic() accepts, e.g. the radicals
    of a root expansion such as "ش ر ح (š-r-ḥ)"."""
    # same filtering as the shared helper in arabic_utilities
    return get_arabic(s)
# GF code generation
# start with the header of the desired GF module
# start with the header of the desired GF module
if MODE == 'gf-abs':
    print('abstract MorphoDictAraAbs = Cat ** {')
if MODE == 'gf-cnc':
    print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {')

# go through the Arabic Wiktionary entries
# generate functions with unique names
if MODE.startswith('gf') or MODE == 'json':
    with open(FILTERED_WIKT) as file:
        seen_gf_funs = {}  # (lemma, cat) -> next disambiguation index
        number = 1
        for line in file:
            try:
                obj = json.loads(line)
            except ValueError:
                # Lines that are not valid JSON (for instance empty lines) are
                # skipped. Only JSON errors are caught, so Ctrl-C and real
                # programming errors are no longer swallowed as they were by
                # the bare except.
                continue
            number += 1  # running number of parsed records, shown in the abstract syntax comment
            # only take entries that are marked as lemmas
            if 'Arabic lemmas' in obj.get('categories', []):
                entry = {
                    'pos': obj['pos'],
                    'forms': forms_for_pos(obj),
                    'all_forms': wikt_forms_from_obj(obj),
                    'senses': [sense['glosses'] for sense in obj.get('senses', [])
                               if 'glosses' in sense]
                    }
                # if you only want to see the Wikt information used in GF generation
                if MODE == 'json':
                    print(json.dumps(entry, ensure_ascii=False))
                # if you want to proceed to GF generation
                if MODE.startswith('gf'):
                    lemma = entry['forms'].get('lemma', None)
                    if lemma:
                        cat = entry['forms']['cat']
                        lin = entry['forms']['lin']
                        labels = entry['forms']['labels']
                        # if you find the same word_C again, mark it word_1_C
                        discrim = seen_gf_funs.get((lemma, cat), 0)
                        fun = gf_fun(lemma, cat, discrim)
                        # abstract syntax, save in MorphoDictAraAbs.gf
                        if MODE == 'gf-abs':
                            print('fun', fun, ':', cat, ';', '--', number, entry['senses'])
                        # concrete syntax, save in MorphoDictAra.gf
                        elif MODE == 'gf-cnc':
                            print('lin', fun, '=', lin, ';')
                        # function-source map, save in source_of_MorphoDictAra.jsonl
                        elif MODE == 'gf-map':
                            source = wikt_forms_from_obj(obj)
                            source['gf_labels'] = labels
                            mapitem = {'fun': fun, 'source': source}
                            print(json.dumps(mapitem, ensure_ascii=False))
                        seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number

# terminate the GF file with a closing brace
if MODE in ['gf-abs', 'gf-cnc']:
    print('}')
# evaluation:
# linearize all words to tables
# compare them to the forms found in Wiktionary
# report on matches
# format of GF table:
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
# coming from pgf tabularLinearize
# compare the table for one function, returning a report as a dict
def compare_tables(gf, wikt, fun, show_buckwalter=True):
    """Compare the GF inflection table of one function with its Wiktionary forms.

    gf:   {parameter description: form}, as produced by tabularLinearize
    wikt: {form: tags} from Wiktionary; must also contain the key 'gf_labels'
    fun:  name of the GF function, copied into the report

    Returns a dict with one entry per tuple of Wiktionary-relevant GF tags,
    plus the keys 'fun', 'labels', 'total_found', 'total_voc' and
    'total_unvoc'. GF forms whose parameters contain no known tag are left out.
    """
    report = {}
    for gf_params, gf_form in gf.items():
        words = gf_params.replace('(', ' ').replace(')', ' ').split()
        gf_tags = tuple(word for word in words if word in arabic_rgl_features)
        if not gf_tags:
            continue  # if gf_tags match no Wikt tags, do not include this form
        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}
        # the first Wiktionary form whose description has all the tags wins
        wikt_form = wikt_descr = None
        for form, descr in wikt.items():
            if all(tag in descr for tag in wikt_tags):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
                break
        row = {
            'gf_params': gf_params,  # full param description
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'wikt_descr': wikt_descr,
        }
        if show_buckwalter:
            row['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
            row['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
        if wikt_form:
            row['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            row['unvoc_match'] = int(normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
        report[gf_tags] = row  # flat param description with only Wikt-relevant tags
    # snapshot of the per-form rows, taken before the summary keys are added
    rows = tuple(report.values())
    report['fun'] = fun
    report['labels'] = wikt['gf_labels']
    report['total_found'] = sum(1 for row in rows if row['wikt_form'] is not None)
    report['total_voc'] = sum(row.get('voc_match', 0) for row in rows)
    report['total_unvoc'] = sum(row.get('unvoc_match', 0) for row in rows)
    return report
# with a given grammar and function, prepare input for compare_tables
# and produce a report, possibly summarizing it
def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
    """Linearize *fun* and compare its table with the Wiktionary forms in *wikt*.

    Returns None (after printing a message) when *fun* is not in the grammar,
    the full compare_tables report when *verbose* is set, and otherwise a
    summary dict with the counts, a verdict and, for flawed entries, the first
    mismatching form under 'first_error'.
    """
    if fun not in gr.functions:
        print(fun, 'not found in grammar')
        return None
    # require the s field, exclude s2
    gf = {}
    for params, form in lang.tabularLinearize(pgf.Expr(fun, [])).items():
        if params.startswith('s '):
            gf[params] = form
    report = compare_tables(gf, wikt, fun)
    if verbose:
        return report

    found = report['total_found']
    voc = report['total_voc']
    unvoc = report['total_unvoc']
    if found == 0:
        verdict = 'NOT_FOUND'
    elif found == voc:
        verdict = 'PERFECT'
    elif found == unvoc:
        verdict = 'PERFECT_UNVOC'
    elif voc == 0:
        verdict = 'TOTALLY_WRONG'
    else:
        verdict = 'PARTIAL'
    summary = {
        'fun': report['fun'],
        'forms': found,
        'voc': voc,
        'unvoc': unvoc,
        'verdict': verdict,
        'labels': report['labels'],
    }
    if verdict not in ('NOT_FOUND', 'PERFECT'):
        # report the first form that was found but does not match with vowels
        for value in report.values():
            if value.get('voc_match', 1) == 0:
                summary['first_error'] = value
                break
    return summary
def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
    """Evaluate every function listed in *mapfile* against its Wiktionary source.

    pgffile:      compiled grammar to load
    concretename: name of the concrete syntax used for linearization
    mapfile:      JSONL file with one {'fun': ..., 'source': ...} object per line
    show:         print the report of each function as one JSON line
    verbose:      request full reports instead of summaries

    Prints the verdict counts per category (last character of the function
    name) as a final JSON line.
    """
    gr = pgf.readPGF(pgffile)
    concrete = gr.languages[concretename]
    totals = {'A': {}, 'N': {}, 'V': {}}
    with open(mapfile) as file:
        for line in file:
            obj = json.loads(line)
            fun = obj['fun'][1:-1]  # strip the surrounding quotes
            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
            if report is None:
                # function missing from the grammar: already reported by
                # eval_with_wikt; testing `'verdict' in None` would raise TypeError
                continue
            cat = fun[-1]
            if 'verdict' in report:
                rep = report['verdict']
                per_cat = totals.setdefault(cat, {})  # tolerate unexpected categories
                per_cat[rep] = per_cat.get(rep, 0) + 1
            if show:
                # full reports are keyed by tuples of tags, which json.dumps
                # rejects as keys; join them into strings for output
                printable = {' '.join(key) if isinstance(key, tuple) else key: value
                             for key, value in report.items()}
                print(json.dumps(printable, ensure_ascii=False))
    print(json.dumps(totals, ensure_ascii=False))
# every mode starting with 'eval' runs the evaluation:
# 'eval-verbose' prints full reports, 'eval-funs' prints summaries,
# other eval modes print only the final totals
if MODE.startswith('eval'):
    verbose = (MODE == 'eval-verbose')
    show = True if verbose else (MODE == 'eval-funs')
    eval_grammar(PGF_FILE, CONCRETE_MODULE, FUNCTION_SOURCE_MAP, show=show, verbose=verbose)

View File

@@ -0,0 +1,44 @@
import csv
import json
from arabic_utilities import *
# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl

# the following are assumed

# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
# WN_TSV = 'arabic.tsv' # Krasimir
WN_TSV = 'ar2en_words_gf.csv' # Zarzoura

# built as explained in ./read_wiktionary.py
MORPHO_GF = 'MorphoDictAraAbs.gf'

# Index the generated abstract syntax by (unvocalized word, category).
# Lines look like:
# fun 'دُبُ_N' : N ; -- 10 [['bear']]
funmap = {}

with open(MORPHO_GF) as gffile:
    for line in gffile:
        line = line.strip()
        if not line.startswith('fun '):
            continue
        # Split on the separators instead of on whitespace: a quoted function
        # name may contain spaces, which used to shift all later fields.
        fun, colon, rest = line[4:].partition(' : ')
        if not colon:
            continue
        cat, _, tail = rest.partition(' ;')
        cat = cat.strip()
        # everything after '--' (record number and senses), whitespace-normalized
        sense = ' '.join(tail.partition('--')[2].split())
        key = unvocalize(fun)
        funmap.setdefault((key, cat), []).append({'fun': fun, 'sense': sense})

# Look up every WordNet row in the index and print one JSON object per row.
# Rows look like:
# abandon_1_V2 ParseAra ترك (1,1,1,3,322,3)
with open(WN_TSV) as wnfile:
    for row in wnfile:
        fields = row.split()
        if not fields:
            continue  # blank rows used to raise IndexError
        wnfun = fields[-1] # 0 in Krasimir
        letters = [c for c in wnfun if c.isalpha()]
        if not letters:
            continue  # no letter to use as category; used to raise IndexError
        # the Arabic word is whatever Arabic script the row contains
        word = unvocalize(get_arabic(row))
        cat = letters[-1] # the last letter; the dict only contains N, A, V
        funs = funmap.get((word, cat), [])
        result = {'wnfun': wnfun, 'sought': word, 'found': funs}
        print(json.dumps(result, ensure_ascii=False))