mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
Merge branch 'master' of github.com:GrammaticalFramework/gf-rgl
This commit is contained in:
@@ -153,7 +153,8 @@ oper
|
|||||||
w + "ف" + x + "ع" + y + "ل" + z
|
w + "ف" + x + "ع" + y + "ل" + z
|
||||||
=> { h = w ; m1 = x; m2 = y; t = z} ;
|
=> { h = w ; m1 = x; m2 = y; t = z} ;
|
||||||
w + "ف" + x + ("ع"|"ل") + y
|
w + "ف" + x + ("ع"|"ل") + y
|
||||||
=> { h = w ; m1 = x; m2 = ""; t = y}
|
=> { h = w ; m1 = x; m2 = ""; t = y} ;
|
||||||
|
_ => Predef.error("cannot get FCL pattern from" ++ pat)
|
||||||
} ;
|
} ;
|
||||||
|
|
||||||
--opers to interdigitize (make words out of roots and patterns:
|
--opers to interdigitize (make words out of roots and patterns:
|
||||||
@@ -204,7 +205,8 @@ oper
|
|||||||
=> mkAssimilated pat (mkRoot3 rS) ;
|
=> mkAssimilated pat (mkRoot3 rS) ;
|
||||||
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
|
? + ? + _ => mkBilit pat (mkRoot2 rS) ; --2=>
|
||||||
_=> error rS ---- AR error "expected 3--6"
|
_=> error rS ---- AR error "expected 3--6"
|
||||||
}
|
} ;
|
||||||
|
_ => Predef.error("cannot get FCL pattern from" ++ pS)
|
||||||
};
|
};
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -868,4 +868,99 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
|
|||||||
param VerbForm =
|
param VerbForm =
|
||||||
FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
|
FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
|
||||||
|
|
||||||
|
|
||||||
|
{- temporarily moved to wiktionary/MoreAra.gf
|
||||||
|
-- paradigms for Wiktionary extraction
|
||||||
|
---- TODO: better usage of information in Wiktionary
|
||||||
|
|
||||||
|
oper
|
||||||
|
wmkN = overload {
|
||||||
|
wmkN : {sg, pl : Str ; g : Gender} -> N
|
||||||
|
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
|
||||||
|
wmkN : {sg : Str} -> N
|
||||||
|
= \r -> smartN r.sg ;
|
||||||
|
wmkN : {sg : Str ; g : Gender ; root : Str} -> N
|
||||||
|
= \r -> smartN r.sg ** {g = r.g} ; ----
|
||||||
|
wmkN : {sg : Str; g : Gender} -> N
|
||||||
|
= \r -> smartN r.sg ** {g = r.g} ;
|
||||||
|
wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
|
||||||
|
wmkN : {sg : Str; pl : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
|
||||||
|
wmkN : {sg, pl : Str ; root : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl masc nohum ; ----
|
||||||
|
wmkN : {sg : Str; root : Str} -> N
|
||||||
|
= \r -> smartN r.sg ;
|
||||||
|
} ;
|
||||||
|
|
||||||
|
wmkA = overload {
|
||||||
|
wmkA : {root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
} ;
|
||||||
|
|
||||||
|
wmkV = overload {
|
||||||
|
wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
|
||||||
|
= \r -> mkV r.root r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.perfect r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
|
||||||
|
= \r -> mkV r.root r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.perfect r.cls ; ----
|
||||||
|
wmkV : {root : Str ; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.root r.cls ;
|
||||||
|
wmkV : {imperfect : Str} -> V
|
||||||
|
= \r -> variants {} ; ---- mkV r.imperfect ;
|
||||||
|
} ;
|
||||||
|
-}
|
||||||
} ;
|
} ;
|
||||||
|
|||||||
8
src/arabic/wiktionary/Makefile
Normal file
8
src/arabic/wiktionary/Makefile
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
all:
|
||||||
|
python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
|
||||||
|
python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
|
||||||
|
python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
|
||||||
|
gf -make MorphoDictAra.gf
|
||||||
|
python3 read_wiktionary.py eval-funs >eval.jsonl
|
||||||
|
python3 to_wordnet.py >wordnet-arabic.jsonl
|
||||||
|
python3 read_wiktionary.py error-analysis
|
||||||
98
src/arabic/wiktionary/MoreAra.gf
Normal file
98
src/arabic/wiktionary/MoreAra.gf
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
resource MoreAra = CatAra ** open ParadigmsAra in {
|
||||||
|
|
||||||
|
|
||||||
|
-- temporarily moved from ParadigmsAra
|
||||||
|
-- paradigms for Wiktionary extraction
|
||||||
|
---- TODO: better usage of information in Wiktionary
|
||||||
|
|
||||||
|
oper
|
||||||
|
wmkN = overload {
|
||||||
|
wmkN : {sg, pl : Str ; g : Gender} -> N
|
||||||
|
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
|
||||||
|
wmkN : {sg : Str} -> N
|
||||||
|
= \r -> smartN r.sg ;
|
||||||
|
wmkN : {sg : Str ; g : Gender ; root : Str} -> N
|
||||||
|
= \r -> smartN r.sg ** {g = r.g} ; ----
|
||||||
|
wmkN : {sg : Str; g : Gender} -> N
|
||||||
|
= \r -> smartN r.sg ** {g = r.g} ;
|
||||||
|
wmkN : {sg : Str; pl : Str; g : Gender; root : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl r.g nohum ; --- hum/nohum not in Wikt
|
||||||
|
wmkN : {sg : Str; pl : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl masc nohum ; ---- ** {g = (smartN r.sg).g} ;
|
||||||
|
wmkN : {sg, pl : Str ; root : Str} -> N
|
||||||
|
= \r -> mkN r.sg r.pl masc nohum ; ----
|
||||||
|
wmkN : {sg : Str; root : Str} -> N
|
||||||
|
= \r -> smartN r.sg ;
|
||||||
|
} ;
|
||||||
|
|
||||||
|
wmkA = overload {
|
||||||
|
wmkA : {root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, root, sg_patt, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg : Str; root : Str; sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {fem_pl : Str; fem_sg : Str; masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, masc_pl, root, sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, root : Str} -> A
|
||||||
|
= \r -> mkA r.root ; ----
|
||||||
|
wmkA : {masc_sg, fem_sg, masc_pl, fem_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str; root : Str ; sg_patt : Str} -> A
|
||||||
|
= \r -> mkA r.root r.sg_patt ;
|
||||||
|
wmkA : {masc_sg : Str; fem_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
-- root plus both patterns are known: pass the root first, as in the other
-- branches that have sg_patt and pl_patt (previously the root was dropped
-- and sg_patt was passed in the root position)
wmkA : {masc_sg : Str; masc_pl, pl_patt, sg_patt : Str; root : Str} -> A
    = \r -> mkA r.root r.sg_patt r.pl_patt ;
|
||||||
|
wmkA : {masc_sg : Str; masc_pl : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; masc_pl, pl_patt : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
wmkA : {masc_sg : Str; root : Str} -> A
|
||||||
|
= \r -> mkA r.root ;
|
||||||
|
wmkA : {masc_sg : Str} -> A
|
||||||
|
= \r -> mkA r.masc_sg ; ----
|
||||||
|
} ;
|
||||||
|
|
||||||
|
wmkV = overload {
|
||||||
|
wmkV : {perfect : Str; cls : VerbForm; root : Str} -> V
|
||||||
|
= \r -> mkV r.root r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.perfect r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm; root : Str} -> V
|
||||||
|
= \r -> mkV r.root r.cls ; ----
|
||||||
|
wmkV : {perfect : Str; imperfect : Str; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.perfect r.cls ; ----
|
||||||
|
wmkV : {root : Str ; cls : VerbForm} -> V
|
||||||
|
= \r -> mkV r.root r.cls ;
|
||||||
|
wmkV : {imperfect : Str} -> V
|
||||||
|
= \r -> variants {} ; ---- mkV r.imperfect ;
|
||||||
|
} ;
|
||||||
|
|
||||||
|
}
|
||||||
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# utilities for Arabic script
|
||||||
|
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
|
||||||
|
# as specified by the command line argument:
|
||||||
|
#
|
||||||
|
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
|
||||||
|
# % diff MorphoDictAra.gf b.tmp
|
||||||
|
# %
|
||||||
|
|
||||||
|
def is_arabic(s):
    """Tell whether *s* contains a character in the range U+0626..U+0650.

    A falsy argument (empty string, ``None``) is returned unchanged;
    otherwise the result is a ``bool``.

    NOTE(review): the range leaves out U+0621..U+0625 (hamza carriers) as
    well as shadda and sukun -- confirm that this is intended, since the
    function is also used character by character to filter strings.
    """
    if not s:
        return s
    for ch in s:
        if 0x626 <= ord(ch) <= 0x650:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_arabic(s):
    """Return *s* reduced to the characters that ``is_arabic`` accepts."""
    kept = filter(is_arabic, s)
    return ''.join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def unvocalize(s):
    """Keep only the characters of *s* in the range U+0621..U+064A.

    Everything outside that range is dropped: the vowel and shadda/sukun
    marks (U+064B and above), but also any non-Arabic characters.
    """
    first, last = 0x621, 0x64a
    letters = []
    for ch in s:
        if first <= ord(ch) <= last:
            letters.append(ch)
    return ''.join(letters)
|
||||||
|
|
||||||
|
|
||||||
|
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||||||
|
buckwalter_dict = {
|
||||||
|
0x621: "'", # ء
|
||||||
|
0x622: '|', # آ
|
||||||
|
0x623: '>', # أ
|
||||||
|
0x624: '&', # ؤ
|
||||||
|
0x625: '<', # إ
|
||||||
|
0x626: '}', # ئ
|
||||||
|
0x627: 'A', # ا
|
||||||
|
0x628: 'b', # ب
|
||||||
|
0x629: 'p', # ة
|
||||||
|
0x62a: 't', # ت
|
||||||
|
0x62b: 'v', # ث
|
||||||
|
0x62c: 'j', # ج
|
||||||
|
0x62d: 'H', # ح
|
||||||
|
0x62e: 'x', # خ
|
||||||
|
0x62f: 'd', # د
|
||||||
|
0x630: '*', # ذ
|
||||||
|
0x631: 'r', # ر
|
||||||
|
0x632: 'z', # ز
|
||||||
|
0x633: 's', # س
|
||||||
|
0x634: '$', # ش
|
||||||
|
0x635: 'S', # ص
|
||||||
|
0x636: 'D', # ض
|
||||||
|
0x637: 'T', # ط
|
||||||
|
0x638: 'Z', # ظ
|
||||||
|
0x639: 'E', # ع
|
||||||
|
0x63a: 'g', # غ
|
||||||
|
0x641: 'f', # ف
|
||||||
|
0x642: 'q', # ق
|
||||||
|
0x643: 'k', # ك
|
||||||
|
0x644: 'l', # ل
|
||||||
|
0x645: 'm', # م
|
||||||
|
0x646: 'n', # ن
|
||||||
|
0x647: 'h', # ه
|
||||||
|
0x648: 'w', # و
|
||||||
|
0x649: 'Y', # ى
|
||||||
|
0x64a: 'y', # ي
|
||||||
|
0x64b: 'F', # ً
|
||||||
|
0x64c: 'N', # ٌ
|
||||||
|
0x64d: 'K', # ٍ
|
||||||
|
0x64e: 'a', # َ
|
||||||
|
0x64f: 'u', # ُ
|
||||||
|
0x650: 'i', # ِ
|
||||||
|
0x651: '~', # ّ
|
||||||
|
0x652: 'o', # ْ
|
||||||
|
0x670: '`', # '
|
||||||
|
0x671: '{' # ٱ
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
||||||
|
|
||||||
|
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
||||||
|
|
||||||
|
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
|
||||||
|
|
||||||
|
def to_buckwalter(s):
    """Transliterate *s* to Buckwalter, character by character.

    Characters whose code point is not a key of ``buckwalter_dict`` are
    copied through unchanged.
    """
    out = []
    for ch in s:
        out.append(buckwalter_dict.get(ord(ch), ch))
    return ''.join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def from_buckwalter(s):
    """Map each Buckwalter symbol in *s* back to its Arabic character.

    Symbols that are not keys of ``buckwalter_dict_rev`` are copied through
    unchanged.
    """
    return ''.join(map(lambda sym: buckwalter_dict_rev.get(sym, sym), s))
|
||||||
|
|
||||||
|
|
||||||
|
def drop_final_vowel(s):
    """Return *s* without its last character when that is in ``arabic_vowels``.

    ``arabic_vowels`` holds the marks U+064B..U+0650.  Any other string,
    including the empty string, is returned unchanged.
    """
    # the emptiness check guards s[-1], which used to raise IndexError on ''
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
|
||||||
|
|
||||||
|
|
||||||
|
def normal(s):
    """Return *s* in Unicode canonical decomposition form (NFD)."""
    # Imported locally: this module has no top-level ``import unicodedata``,
    # and an import in the calling script does not make the name visible
    # here, so the function used to fail with NameError.
    import unicodedata
    return unicodedata.normalize('NFD', s)
|
||||||
|
|
||||||
|
# heuristic for finding the three radicals from certain forms
|
||||||
|
# works only for sound (strong) 3-radical roots, otherwise None
|
||||||
|
def get_sound_trigram_root(s):
    """Return the characters of *s* found in ``sound_consonants``, if exactly three.

    The three characters are returned as one string in their order of
    occurrence; for any other count the result is ``None``.
    """
    radicals = ''.join(filter(sound_consonants.__contains__, s))
    return radicals if len(radicals) == 3 else None
|
||||||
|
|
||||||
|
|
||||||
|
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
||||||
|
# one more condition: each of the root letters occurs exactly once
|
||||||
|
# TODO: better use the given root of the lex entry
|
||||||
|
def get_sound_fcl_pattern(s):
    """Turn the word form *s* into an f-c-l pattern.

    The three radicals found by ``get_sound_trigram_root`` are replaced, in
    order of occurrence, by U+0641, U+0639 and U+0644; all other characters
    are kept.  Returns ``None`` when *s* has no sound three-radical root.

    NOTE(review): the former guard ``len([c in s for c in root]) == 3`` was
    always true, so "each root letter occurs exactly once" has never been
    enforced.  Forms with a repeated radical are therefore still converted,
    as before -- confirm whether they should be rejected instead.
    """
    root = get_sound_trigram_root(s)
    if not root:
        return None
    pattern = list(s)
    pos = -1
    for radical, slot in zip(root, (chr(0x641), chr(0x639), chr(0x644))):
        # each radical is searched strictly after the previous one
        pos = s.index(radical, pos + 1)
        pattern[pos] = slot
    return ''.join(pattern)
|
||||||
|
|
||||||
|
|
||||||
|
# Wikt uses vowel+shadda which is a Unicode normalization
|
||||||
|
# GF uses shadda+vowel which is linguistically correct
|
||||||
|
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
||||||
|
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
|
||||||
|
## todo: more direct implementation
|
||||||
|
def reorder_shadda(s):
    """Rewrite vowel+shadda sequences in *s* as shadda+vowel.

    Handles fatha (U+064E), damma (U+064F) and kasra (U+0650) followed by
    shadda (U+0651), in that order, exactly as the earlier Buckwalter-based
    version did for 'a~', 'u~' and 'i~'.

    The replacement now works directly on the Arabic code points.  The
    round trip through Buckwalter converted every Latin letter or
    punctuation mark that happens to be a Buckwalter symbol into an Arabic
    character, corrupting mixed-script strings.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
|
||||||
|
|
||||||
|
|
||||||
|
# quote word forms but not parameters
|
||||||
|
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return ``change(s)`` in double quotes if ``cond(s)`` holds, else *s* itself.

    With the defaults, strings recognised by ``is_arabic`` are passed
    through ``reorder_shadda`` and quoted; other values are returned as
    they are.
    """
    if not cond(s):
        return s
    return ''.join(('"', change(s), '"'))
|
||||||
|
|
||||||
|
|
||||||
|
# for a string, change each string literal in "..." with a change function
|
||||||
|
# leaving other characters as they are; print the string to stdout as you go
|
||||||
|
def change_literals(s, change):
    """Print *s* to stdout with every "..." literal passed through *change*.

    Characters outside literals are printed as they are.  Each complete
    literal is printed as '"' + change(content) + '"'.  No newline is added.
    Backslash-escaped quotes are not recognised: every '"' toggles the
    literal state.

    A literal that is still open at the end of *s* is printed verbatim
    (opening quote plus content, without applying *change*); it used to be
    dropped from the output.
    """
    pending = None  # characters of the literal being read; None outside a literal
    for c in s:
        if c != '"':
            if pending is None:
                print(c, end='')
            else:
                pending.append(c)
        elif pending is None:
            pending = []
        else:
            print('"' + change(''.join(pending)) + '"', end='')
            pending = None
    if pending is not None:
        print('"' + ''.join(pending), end='')
|
||||||
|
|
||||||
|
|
||||||
|
# convert literals in stdin 'to' or 'from' Buckwalter
|
||||||
|
if __name__ == '__main__':
    import sys

    # command line mode -> conversion applied to every string literal
    converters = {'from': from_buckwalter, 'to': to_buckwalter}
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        # previously: IndexError without an argument, and an unknown mode
        # consumed stdin while printing nothing
        sys.exit('usage: python3 arabic_utilities.py (to | from) <input >output')
    convert = converters[sys.argv[1]]
    for line in sys.stdin:
        change_literals(line, convert)
|
||||||
|
|
||||||
|
|
||||||
455
src/arabic/wiktionary/read_wiktionary.py
Normal file
455
src/arabic/wiktionary/read_wiktionary.py
Normal file
@@ -0,0 +1,455 @@
|
|||||||
|
import gzip
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import unicodedata
|
||||||
|
import pgf
|
||||||
|
from arabic_utilities import *
|
||||||
|
|
||||||
|
# data from https://kaikki.org/dictionary/rawdata.html
|
||||||
|
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||||||
|
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This file converts Wiktionary data to GF morphological dictionary files.
|
||||||
|
It works for Arabic, but some of the functionality could be adapted to other languages.
|
||||||
|
|
||||||
|
The steps to take are the following:
|
||||||
|
|
||||||
|
fetch data:
|
||||||
|
|
||||||
|
raw-wiktextract-data.json.gz from https://kaikki.org/dictionary/rawdata.html
|
||||||
|
|
||||||
|
filter Arabic entries:
|
||||||
|
|
||||||
|
$ python3 read_wiktionary.py raw >wikt_arabic.jsonl
|
||||||
|
|
||||||
|
create GF files:
|
||||||
|
|
||||||
|
$ python3 read_wiktionary.py gf-abs >MorphoDictAraAbs.gf
|
||||||
|
$ python3 read_wiktionary.py gf-cnc >MorphoDictAra.gf
|
||||||
|
|
||||||
|
automatic evaluation:
|
||||||
|
|
||||||
|
$ gf -make MorphoDictAra.gf
|
||||||
|
$ python3 read_wiktionary.py gf-map >source_of_MorphoDictAra.jsonl
|
||||||
|
$ python3 read_wiktionary.py eval
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
- better generation of GF
|
||||||
|
- better paradigms to use Wiktionary data
|
||||||
|
- refactor the code so that it can be used for other languages
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
MODE = ''
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
if not sys.argv[1:]:
|
||||||
|
print('usage: read_wiktionary (raw | gf-cnc | gf-abs | gf-map | eval | eval-verbose)')
|
||||||
|
exit()
|
||||||
|
MODE = sys.argv[1] #
|
||||||
|
|
||||||
|
|
||||||
|
# step 1: extract Arabic data from this file using the raw option
|
||||||
|
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
|
||||||
|
EXTRACTED_LANGUAGE = 'Arabic'
|
||||||
|
|
||||||
|
# the following file is generated.
|
||||||
|
# in the sequel, use this file with gf-abs or gf-cnc option
|
||||||
|
FILTERED_WIKT = 'wikt_arabic.jsonl'
|
||||||
|
|
||||||
|
# map each successfully extracted GF function to its source record in Wiktionary
|
||||||
|
# created with option gf-map
|
||||||
|
FUNCTION_SOURCE_MAP = 'source_of_MorphoDictAra.jsonl'
|
||||||
|
|
||||||
|
# created with $ gf -make MorphoDictAra.gf
|
||||||
|
PGF_FILE = 'MorphoDictAraAbs.pgf'
|
||||||
|
|
||||||
|
# module to linearize with
|
||||||
|
CONCRETE_MODULE = 'MorphoDictAra'
|
||||||
|
|
||||||
|
# concrete syntax file, to debug sources of linearizations
|
||||||
|
CONCRETE_FILE = CONCRETE_MODULE + '.gf'
|
||||||
|
|
||||||
|
# evaluation result file, created with mode eval-funs
|
||||||
|
EVAL_FILE = 'eval.jsonl'
|
||||||
|
|
||||||
|
|
||||||
|
# read a gzipped jsonl file (one object per line),
|
||||||
|
# showing lines where one of a list of languages is present
|
||||||
|
# This can be sampled to one of 100k lines by default, 1 for total recall.
|
||||||
|
def get_gzip_json(file, sample=100000, langs=()):
    """Print selected records of a gzipped JSONL file to stdout.

    file:   path of a gzip-compressed file with one JSON object per line
    sample: only every sample-th line is looked at (1 = every line)
    langs:  collection of accepted values of the record's 'lang' field

    A selected line is printed as it is in the file, one record per output
    line.  The line's own newline is stripped before printing; it used to be
    kept, which put an empty line after every record in the output.
    """
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                print(line.decode("utf-8").rstrip('\r\n'))
|
||||||
|
|
||||||
|
|
||||||
|
# to perform the first step of data extraction, pipe this into a file:
|
||||||
|
# python3 read_wiktionary.py raw >wikt_arabic.jsonl
|
||||||
|
if MODE == 'raw':
|
||||||
|
get_gzip_json(WIKTIONARY_DUMP, 1, [EXTRACTED_LANGUAGE])
|
||||||
|
exit()
|
||||||
|
|
||||||
|
|
||||||
|
if MODE == 'error-analysis':
    # count the rows of EVAL_FILE per (category, labels, verdict) and print
    # the counts in sorted key order; rows without labels are ignored
    # NOTE(review): the category is taken as the last character of 'fun';
    # if 'fun' carries the quotes added by gf_fun this is the quote
    # character rather than N/A/V -- confirm against the eval-funs output
    evals = {}
    with open(EVAL_FILE) as file:
        for row in map(json.loads, file):
            labels = row.get('labels', None)
            if not labels:
                continue
            key = (row['fun'][-1], labels, row['verdict'])
            evals[key] = evals.get(key, 0) + 1
    for key in sorted(evals):
        print(key, evals[key])
|
||||||
|
|
||||||
|
|
||||||
|
# generate word_d_C functions starting with d=0, but show d only when >= 1
|
||||||
|
def gf_fun(s, pos, disamb=0):
    """Build the quoted GF function name 'word_C' or 'word_d_C'.

    s:      the word
    pos:    the category suffix
    disamb: discriminator d; left out of the name when it is 0 (or falsy)
    """
    suffix = ''
    if disamb:
        suffix = '_' + str(disamb)
    return "'" + s + suffix + "_" + pos + "'"
|
||||||
|
|
||||||
|
|
||||||
|
# mapping from GF to Wikt features
|
||||||
|
arabic_rgl_features = {
|
||||||
|
# V
|
||||||
|
'VPerf': 'perfective',
|
||||||
|
'Act': 'active',
|
||||||
|
'Pas': 'passive',
|
||||||
|
'Per3': 'third-person',
|
||||||
|
'Per2': 'second-person',
|
||||||
|
'Per1': 'first-person',
|
||||||
|
'Masc': 'masculine',
|
||||||
|
'Fem': 'feminine',
|
||||||
|
'Sing': 'singular',
|
||||||
|
'Plur': 'plural',
|
||||||
|
'Sg': 'singular',
|
||||||
|
'Pl': 'plural',
|
||||||
|
'Dl': 'dual',
|
||||||
|
'VImpf': 'imperfective',
|
||||||
|
'Ind': 'indicative',
|
||||||
|
'Cnj': 'subjunctive',
|
||||||
|
'Jus': 'jussive',
|
||||||
|
'VImp': 'imperative',
|
||||||
|
# N: also Sg, Pl, Dl
|
||||||
|
'Def': 'definite',
|
||||||
|
'Indef': 'indefinite',
|
||||||
|
'Nom': 'nominative',
|
||||||
|
'Acc': 'accusative',
|
||||||
|
'Gen': 'genitive',
|
||||||
|
# 'Bare':
|
||||||
|
# 'Dat':
|
||||||
|
'Const': 'construct'
|
||||||
|
# 'Poss':
|
||||||
|
#A: also N features; degree features cannot be found
|
||||||
|
# 'APosit': 'positive',
|
||||||
|
# 'AComp': 'comparative'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# the inflection forms in a wiktionary entry
|
||||||
|
def wikt_forms_from_obj(obj):
    """Collect the inflection forms of one Wiktionary entry.

    Returns a dict from form (after ``reorder_shadda``) to its list of tags.
    Forms tagged 'romanization' and forms rejected by ``is_arabic`` are left
    out; when two forms normalise to the same string the later one wins.

    If the entry has an 'ar-root' etymology template whose expansion
    contains Arabic characters, the extra key 'root' maps to that root as a
    plain string (not a tag list).
    """
    forms = {}
    for form in obj.get('forms', []):
        tags = form.get('tags', [])
        if 'romanization' in tags:
            continue
        if not is_arabic(form['form']):
            continue
        forms[reorder_shadda(form['form'])] = tags

    # the root (three radicals) is found in this place if at all;
    # only the first ar-root template is used
    roots = [find_root(template['expansion'])
             for template in obj.get('etymology_templates', [])
             if template.get('name', None) == 'ar-root']
    if roots:
        root = roots[0].strip()
        if root:
            forms['root'] = root

    return forms
|
||||||
|
|
||||||
|
|
||||||
|
# selection of forms for a given POS from Wikt: noun, adj, or verb
|
||||||
|
# return a linearization function
|
||||||
|
def forms_for_pos(obj):
    """Select the forms needed for a GF lexicon entry from a Wiktionary entry.

    For obj['pos'] in 'noun', 'verb', 'adj' the result is a dict with
      'cat':   'N', 'V' or 'A'
      'lemma': the lemma string, or an empty list if no lemma form was found
      'args':  record label -> list holding at most one value
    and, only when a lemma was found,
      'lin':    the GF expression 'wmkC {label = value ; ...}'
      'labels': comma-separated sorted labels of the non-empty args.
    For any other part of speech the result is just the form -> tags dict.
    """
    dforms = wikt_forms_from_obj(obj)
    # note: this view also contains the ('root', <str>) item when a root was found
    forms = dforms.items()
    if obj['pos'] == 'noun':
        # lemma and plural: first construct-state nominative form of each number,
        # with a final vowel mark removed
        lemma = [drop_final_vowel(form) for form, descr in forms
                 if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
        plural = [drop_final_vowel(form) for form, descr in forms
                  if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
        # gender comes from the entry's categories; empty if neither is present
        gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
                  else (['masc'] if 'Arabic masculine nouns' in obj['categories']
                        else []))
        gf_entry = {
            'cat': 'N',
            'lemma': lemma,
            'args': {
                'sg': lemma,
                'pl': plural,
                'g': gender
                }
            }
    elif obj['pos'] == 'verb':
        # lemma: third person masculine singular active perfective
        lemma = [form for form, descr in forms
                 if all([w in descr for
                         w in ["active", "indicative", "masculine", "past",
                               "perfective", "singular", "third-person"]])][:1]
        gf_entry = {
            'cat': 'V',
            'lemma': lemma,
            'args': {
                'perfect': lemma,
                'imperfect': [form for form, descr in forms
                              if all([w in descr for
                                      w in [
                                          "active", "indicative", "masculine", "non-past",
                                          "imperfective", "singular", "third-person"]])][:1],
                # verb class: 'Form' + the longest listed numeral occurring as a
                # substring of the entry's "... verbs" categories
                # NOTE(review): this can produce 'FormIX' or a bare 'Form', neither
                # of which is a value of the VerbForm param as declared in
                # ParadigmsAra; numerals above XI match a shorter substring -- confirm
                'cls': ['Form' + max([n for n in [
                    'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
                                      if n in ' '.join([c for c in obj['categories']
                                                        if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
                                     key=len)]  # max in RGL is XI, in Wikt XIII
                }
            }
    elif obj['pos'] == 'adj':
        # all four adjective forms are taken from the indefinite 'informal' rows
        lemma = [form for form, descr in forms
                 if all([w in descr for w in [
                     'indefinite', 'masculine', 'singular', 'informal']])][:1]
        gf_entry = {
            'cat': 'A',
            'lemma': lemma,
            'args': {
                'masc_sg': lemma,
                'masc_pl': [form for form, descr in forms
                            if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
                'fem_sg': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
                'fem_pl': [form for form, descr in forms
                           if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
                }
            }
        # add an f-c-l pattern for the masculine forms where one can be derived
        for patt in ['masc_sg', 'masc_pl']:
            if patt in gf_entry['args']:
                if form := gf_entry['args'][patt]:
                    if spatt := get_sound_fcl_pattern(form[0]):
                        gf_entry['args'][patt[5:]+'_patt'] = [spatt]  # sg_patt, pl_patt

    else:
        # other parts of speech: no GF entry, only the raw forms
        gf_entry = {f: d for f, d in forms}

    if 'lemma' in gf_entry and gf_entry['lemma']:
        # unwrap the one-element list
        gf_entry['lemma'] = gf_entry['lemma'][0]
        # prefer the root given by Wiktionary, else try to derive it from the lemma
        if 'root' in dforms:
            gf_entry['args']['root'] = [dforms['root']]
        elif root := get_sound_trigram_root(gf_entry['lemma']):
            gf_entry['args']['root'] = [root]
        # only non-empty arguments become fields of the GF record
        args = sorted([(r, quote_if(x[0])) for r, x in gf_entry['args'].items() if x])
        gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join([r + ' = ' + v for (r, v) in args]) + '}'
        gf_entry['labels'] = ','.join([r for r, v in args])

    return gf_entry
|
||||||
|
|
||||||
|
|
||||||
|
# "root": ["ش ر ح (š-r-ḥ)"]
|
||||||
|
def find_root(s):
    """Return the characters of *s* accepted by ``is_arabic``, joined together.

    Applied to a root template expansion such as "ش ر ح (š-r-ḥ)", this drops
    the spaces and the romanization.
    """
    radicals = []
    for ch in s:
        if is_arabic(ch):
            radicals.append(ch)
    return ''.join(radicals)
|
||||||
|
|
||||||
|
|
||||||
|
# GF code generation
|
||||||
|
|
||||||
|
# start with the header of the desired GF module
|
||||||
|
|
||||||
|
if MODE == 'gf-abs':
|
||||||
|
print('abstract MorphoDictAraAbs = Cat ** {')
|
||||||
|
if MODE == 'gf-cnc':
|
||||||
|
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {')
|
||||||
|
|
||||||
|
# go through the Arabic Wiktionary entries
|
||||||
|
# generate functions with unique names
|
||||||
|
|
||||||
|
if MODE.startswith('gf') or MODE=='json':
    with open(FILTERED_WIKT) as file:
        # (lemma, cat) -> how many functions were already generated for it;
        # used to disambiguate function names
        seen_gf_funs = {}
        number = 1
        for line in file:
            try:
                obj = json.loads(line)
            except ValueError:
                # skip lines that are not valid JSON (e.g. empty lines);
                # json.JSONDecodeError is a ValueError.  Anything else,
                # such as KeyboardInterrupt, is no longer swallowed.
                continue
            number += 1  # running number of the parsed record, shown in gf-abs comments

            # only take entries that are marked as lemmas
            if 'Arabic lemmas' in obj.get('categories', []):
                entry = {
                    'pos': obj['pos'],
                    'forms': forms_for_pos(obj),
                    'all_forms': wikt_forms_from_obj(obj),
                    'senses': [sense['glosses'] for sense in obj.get('senses', [])
                               if 'glosses' in sense]
                    }

                # if you only want to see the Wikt information used for GF generation
                if MODE == 'json':
                    print(json.dumps(entry, ensure_ascii=False))

                # if you want to proceed to GF generation
                if MODE.startswith('gf'):

                    lemma = entry['forms'].get('lemma', None)
                    if lemma:
                        cat = entry['forms']['cat']
                        lin = entry['forms']['lin']
                        labels = entry['forms']['labels']
                        # if the same word_C was generated before, this one is word_d_C
                        discrim = seen_gf_funs.get((lemma, cat), 0)
                        fun = gf_fun(lemma, cat, discrim)

                        # abstract syntax, save in MorphoDictAraAbs.gf
                        if MODE == 'gf-abs':
                            print('fun', fun, ':', cat, ';', '--', number, entry['senses'])

                        # concrete syntax, save in MorphoDictAra.gf
                        elif MODE == 'gf-cnc':
                            print('lin', fun, '=', lin, ';')

                        # function-source map, save in source_of_MorphoDictAra.jsonl
                        elif MODE == 'gf-map':
                            source = wikt_forms_from_obj(obj)
                            source['gf_labels'] = labels
                            mapitem = {'fun': fun, 'source': source}
                            print(json.dumps(mapitem, ensure_ascii=False))

                        seen_gf_funs[(lemma, cat)] = discrim + 1  # next word_d_C will get a new number
|
||||||
|
|
||||||
|
# terminate the GF file with a closing brace
|
||||||
|
if MODE in ['gf-abs', 'gf-cnc']:
|
||||||
|
print('}')
|
||||||
|
|
||||||
|
|
||||||
|
# evaluation:
|
||||||
|
# linearize all words to tables
|
||||||
|
# compare them to the forms found in Wiktionary
|
||||||
|
# report on matches
|
||||||
|
|
||||||
|
# format of GF table:
|
||||||
|
# {'s (AComp Def Bare)': 'الأَيَُونَانِ'}
|
||||||
|
# coming from pgf tabularLinearize
|
||||||
|
|
||||||
|
# compare the table for one function, returning a report as a dict
|
||||||
|
def compare_tables(gf, wikt, fun, show_buckwalter=True):
    """Compare the GF inflection table of one function with Wiktionary forms.

    Parameters:
        gf: dict from GF parameter descriptions, such as
            's (AComp Def Bare)', to the linearized word forms.
        wikt: dict from Wiktionary word forms to their tag descriptions
            (each description is tested with ``tag in descr``). It must also
            contain the key 'gf_labels', whose value is copied to the report.
        fun: name of the GF function; stored in the report under 'fun'.
        show_buckwalter: if true, add Buckwalter romanizations of both forms.

    Returns:
        A dict with one row per tuple of Wiktionary-relevant GF tags, plus the
        string keys 'fun', 'labels', 'total_found', 'total_voc' and
        'total_unvoc'.

    Raises:
        KeyError: if ``wikt`` has no 'gf_labels' entry.
    """
    report = {}
    for gf_params, gf_form in gf.items():
        # keep only the GF parameter values that have a Wiktionary counterpart
        gf_tags = tuple(
            word
            for word in gf_params.replace('(', ' ').replace(')', ' ').split()
            if word in arabic_rgl_features
        )
        if not gf_tags:
            continue  # if gf_tags match no Wikt tags, do not include this form
        wikt_tags = {arabic_rgl_features[tag] for tag in gf_tags}

        # take the first Wiktionary form whose description has all the tags
        wikt_form = None
        wikt_descr = None
        for form, descr in wikt.items():
            if form == 'gf_labels':
                continue  # metadata stored next to the forms, not a word form
            if all(tag in descr for tag in wikt_tags):
                wikt_form = reorder_shadda(form)
                wikt_descr = descr
                break

        row = {  # flat param description with only Wikt-relevant tags
            'gf_params': gf_params,  # full param description
            'gf_form': gf_form,
            'wikt_form': wikt_form,
            'wikt_descr': wikt_descr
        }
        if show_buckwalter:
            row['gf_form_rom'] = to_buckwalter(gf_form) if gf_form else None
            row['wikt_form_rom'] = to_buckwalter(wikt_form) if wikt_form else None
        if wikt_form:
            row['voc_match'] = int(normal(gf_form) == normal(wikt_form))
            row['unvoc_match'] = int(
                normal(unvocalize(gf_form)) == normal(unvocalize(wikt_form)))
        report[gf_tags] = row

    # snapshot the rows before the summary keys are added, so that the totals
    # are computed over the per-form rows only
    rows = tuple(report.values())
    report['fun'] = fun
    report['labels'] = wikt['gf_labels']
    report['total_found'] = sum(1 for v in rows if v['wikt_form'] is not None)
    report['total_voc'] = sum(v.get('voc_match', 0) for v in rows)
    report['total_unvoc'] = sum(v.get('unvoc_match', 0) for v in rows)
    return report
|
||||||
|
|
||||||
|
|
||||||
|
# with a given grammar and function, prepare input for compare_tables
|
||||||
|
# and produce a report, possibly summarizing it
|
||||||
|
def eval_with_wikt(gr, lang, fun, wikt, verbose=False):
    """Evaluate one GF function against its Wiktionary forms.

    Linearizes ``fun`` to a table, keeps the entries of the ``s`` field and
    passes them to compare_tables.

    Parameters:
        gr: the grammar; only its ``functions`` attribute is used here.
        lang: the concrete syntax; must provide ``tabularLinearize``.
        fun: name of the GF function to evaluate.
        wikt: Wiktionary forms for the function, as expected by compare_tables.
        verbose: if true, return the full report of compare_tables.

    Returns:
        None if ``fun`` is not in the grammar (a message is printed);
        the full report if ``verbose``; otherwise a summary dict with the keys
        'fun', 'forms', 'voc', 'unvoc', 'verdict', 'labels' and, when there is
        a vocalization mismatch, 'first_error'.
    """
    if fun not in gr.functions:
        print(fun, 'not found in grammar')
        return None

    gf = {p: s for (p, s) in lang.tabularLinearize(pgf.Expr(fun, [])).items()
          if p.startswith('s ')}  # require the s field, exclude s2
    report = compare_tables(gf, wikt, fun)
    if verbose:
        return report

    found = report['total_found']
    if found == 0:
        verdict, flaws = 'NOT_FOUND', False
    elif found == report['total_voc']:
        verdict, flaws = 'PERFECT', False
    elif found == report['total_unvoc']:
        verdict, flaws = 'PERFECT_UNVOC', True
    elif report['total_voc'] == 0:
        verdict, flaws = 'TOTALLY_WRONG', True
    else:
        verdict, flaws = 'PARTIAL', True

    summary = {
        'fun': report['fun'],
        'forms': found,
        'voc': report['total_voc'],
        'unvoc': report['total_unvoc'],
        'verdict': verdict,
        'labels': report['labels']
    }

    if flaws:
        for v in report.values():
            # the report also holds summary values (strings, ints) next to
            # the per-form rows; only the rows can carry a 'voc_match'
            if isinstance(v, dict) and v.get('voc_match', 1) == 0:
                summary['first_error'] = v
                break
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def eval_grammar(pgffile, concretename, mapfile, show=True, verbose=False):
    """Evaluate every function listed in a function-source map file.

    Parameters:
        pgffile: path of the PGF file to read.
        concretename: name of the concrete syntax to linearize with.
        mapfile: path of a JSON-lines file; every line is an object with the
            keys 'fun' and 'source'.
        show: if true, print the report of every function as a JSON line.
        verbose: passed on to eval_with_wikt; if true, the reports are the
            full comparison tables and no verdicts are counted.

    Prints, as the last line, a JSON object that counts the verdicts per
    category (the last character of the function name).
    """
    gr = pgf.readPGF(pgffile)
    concrete = gr.languages[concretename]

    totals = {'A': {}, 'N': {}, 'V': {}}

    # the map file contains Arabic text: do not depend on the locale encoding
    with open(mapfile, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines in the map file
            obj = json.loads(line)
            fun = obj['fun'][1:-1]  # drop the first and the last character
            report = eval_with_wikt(gr, concrete, fun, obj['source'], verbose)
            if report is None:
                continue  # not in the grammar; eval_with_wikt has reported it

            if 'verdict' in report:
                cat = fun[-1]
                verdict = report['verdict']
                # setdefault: do not fail on categories other than A, N, V
                by_verdict = totals.setdefault(cat, {})
                by_verdict[verdict] = by_verdict.get(verdict, 0) + 1

            if show:
                # full reports are keyed by tuples of tags, which JSON cannot
                # use as object keys: join them into strings
                printable = {
                    ' '.join(key) if isinstance(key, tuple) else key: value
                    for key, value in report.items()
                }
                print(json.dumps(printable, ensure_ascii=False))

    print(json.dumps(totals, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
||||||
|
# evaluation modes: 'eval-verbose' prints the full comparison tables,
# 'eval-funs' prints a summary per function, other 'eval' modes only the totals
if MODE.startswith('eval'):
    verbose = (MODE == 'eval-verbose')
    show = MODE in ('eval-verbose', 'eval-funs')
    eval_grammar(
        PGF_FILE,
        CONCRETE_MODULE,
        FUNCTION_SOURCE_MAP,
        show,
        verbose,
    )
|
||||||
|
|
||||||
|
|
||||||
44
src/arabic/wiktionary/to_wordnet.py
Normal file
44
src/arabic/wiktionary/to_wordnet.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import csv
|
||||||
|
import json
|
||||||
|
|
||||||
|
from arabic_utilities import *
|
||||||
|
|
||||||
|
# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
|
||||||
|
# the following are assumed
|
||||||
|
|
||||||
|
|
||||||
|
# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
|
||||||
|
# WN_TSV = 'arabic.tsv' # Krasimir
|
||||||
|
WN_TSV = 'ar2en_words_gf.csv' # Zarzoura
|
||||||
|
|
||||||
|
# built as explained in ./read_wiktionary.py
|
||||||
|
MORPHO_GF = 'MorphoDictAraAbs.gf'
|
||||||
|
|
||||||
|
|
||||||
|
# fun 'دُبُ_N' : N ; -- 10 [['bear']]
|
||||||
|
# map (unvocalized function name, category) to the list of functions with
# their senses, read from the abstract syntax file
funmap = {}
# the GF file contains Arabic text: do not depend on the locale encoding
with open(MORPHO_GF, encoding='utf-8') as gffile:
    for line in gffile:
        tokens = line.split()
        # a usable line has at least four tokens: fun <name> : <cat>
        if len(tokens) >= 4 and tokens[0] == 'fun':
            fun = tokens[1]
            key = unvocalize(fun)
            cat = tokens[3]
            sense = ' '.join(tokens[6:])  # what follows the '--' comment sign
            funmap.setdefault((key, cat), []).append({'fun': fun, 'sense': sense})
|
||||||
|
|
||||||
|
|
||||||
|
# abandon_1_V2 ParseAra ترك (1,1,1,3,322,3)
|
||||||
|
# look up every WordNet function in funmap and print the result as a JSON line
# the word list contains Arabic text: do not depend on the locale encoding
with open(WN_TSV, encoding='utf-8') as wnfile:
    ## wnreader = csv.reader(wnfile, delimiter='\t')
    for row in wnfile:
        fields = row.split()
        if not fields:
            continue  # blank line: nothing to look up
        ## word = row[-1].strip() # does not show the Arabic, but the second-last word
        word = unvocalize(get_arabic(row))
        wnfun = fields[-1]  # 0 in Krasimir
        letters = [c for c in wnfun if c.isalpha()]
        if not letters:
            continue  # no category letter in the function name
        cat = letters[-1]  # the last letter; the dict only contains N, A, V
        funs = funmap.get((word, cat), [])
        result = {'wnfun': wnfun, 'sought': word, 'found': funs}
        print(json.dumps(result, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user