forked from GitHub/gf-rgl
new function in ParadigmsAra to deal with Wiktionary data; lots of untested guesses
This commit is contained in:
@@ -868,4 +868,71 @@ formV : (root : Str) -> VerbForm -> V = \s,f -> case f of {
|
|||||||
param VerbForm =
|
param VerbForm =
|
||||||
FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
|
FormI | FormII | FormIII | FormIV | FormV | FormVI | FormVII | FormVIII | FormX | FormXI ;
|
||||||
|
|
||||||
|
-- paradigms for Wiktionary extraction
|
||||||
|
---- TODO: better usage of information in Wiktionary
|
||||||
|
|
||||||
|
oper
|
||||||
|
wmkN = overload {
  -- Noun paradigms for Wiktionary entries; the branch is selected by the
  -- fields present in the argument record.

  -- singular, plural and gender given
  wmkN : {sg : Str ; pl : Str ; g : Gender} -> N
    = \w -> mkN w.sg w.pl w.g nohum ; --- hum/nohum not in Wikt

  -- singular only: everything else is left to smartN
  wmkN : {sg : Str} -> N
    = \w -> smartN w.sg ;

  -- singular, gender and root: the root is not used, the gender overrides smartN's
  wmkN : {sg : Str ; g : Gender ; root : Str} -> N
    = \w -> smartN w.sg ** {g = w.g} ; ----

  -- singular and gender: the gender overrides smartN's
  wmkN : {sg : Str ; g : Gender} -> N
    = \w -> smartN w.sg ** {g = w.g} ;

  -- singular, plural, gender and root: the root is not used
  wmkN : {sg : Str ; pl : Str ; g : Gender ; root : Str} -> N
    = \w -> mkN w.sg w.pl w.g nohum ; --- hum/nohum not in Wikt

  -- singular and plural without gender: masc is used as the gender
  wmkN : {sg : Str ; pl : Str} -> N
    = \w -> mkN w.sg w.pl masc nohum ; ---- ** {g = (smartN w.sg).g} ;

  -- singular and root: the root is not used
  wmkN : {sg : Str ; root : Str} -> N
    = \w -> smartN w.sg ;
  } ;
|
||||||
|
|
||||||
|
wmkA = overload {
  -- Adjective paradigms for Wiktionary entries; the branch is selected by the
  -- fields present in the argument record.
  -- Every branch is labelled wmkA, matching the name of the overload group
  -- (the original labelled all but the first branch mkA).
  -- When a root is present the adjective is built from the root alone;
  -- otherwise it is built from masc_sg alone (branches marked ----).
  -- The remaining forms are not used by any branch.
  wmkA : {root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_sg: Str ; masc_pl : Str; fem_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_sg : Str; fem_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; fem_sg : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; fem_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; fem_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; masc_pl : Str; fem_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; masc_pl : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str; masc_pl : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  wmkA : {masc_sg : Str; root : Str} -> A
    = \r -> mkA r.root ;
  wmkA : {masc_sg : Str} -> A
    = \r -> mkA r.masc_sg ; ----
  } ;
|
||||||
|
|
||||||
|
wmkV = overload {
  -- Verb paradigms for Wiktionary entries; the branch is selected by the
  -- fields present in the argument record.

  -- root and verb form class available: build from those, other forms unused
  wmkV : {perfect : Str ; cls : VerbForm ; root : Str} -> V
    = \w -> mkV w.root w.cls ; ----

  -- no root: the perfect form is passed to mkV in place of the root
  wmkV : {perfect : Str ; cls : VerbForm} -> V
    = \w -> mkV w.perfect w.cls ; ----

  wmkV : {perfect, imperfect : Str ; cls : VerbForm ; root : Str} -> V
    = \w -> mkV w.root w.cls ; ----

  -- perfect and imperfect without a root: currently the empty variants
  wmkV : {perfect, imperfect : Str ; cls : VerbForm} -> V
    = \w -> variants {} ; ---- mkV w.imperfect ; ----

  wmkV : {root : Str ; cls : VerbForm} -> V
    = \w -> mkV w.root w.cls ;

  -- imperfect only: currently the empty variants
  wmkV : {imperfect : Str} -> V
    = \w -> variants {} ; ---- mkV w.imperfect ;
  } ;
|
||||||
|
|
||||||
} ;
|
} ;
|
||||||
|
|||||||
@@ -1,7 +1,22 @@
|
|||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# data from https://kaikki.org/dictionary/rawdata.html
|
||||||
|
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||||||
|
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022.
|
||||||
|
|
||||||
|
# A mode argument is required. The usage text is passed to sys.exit so that it
# is written to stderr and the process ends with a non-zero status: stdout is
# meant to be redirected into a data file (e.g. `raw >wikt_arabic.jsonl`), and
# printing the message there would put it into that file.
if not sys.argv[1:]:
    sys.exit('usage: read_wiktionary (raw | gf-cnc | gf-abs)')

# Selected mode; compared against 'raw', 'gf-abs' and 'gf-cnc' further down.
MODE = sys.argv[1]
|
||||||
|
|
||||||
|
# step 1: extract data from this file using the raw option
|
||||||
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
|
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
|
||||||
|
|
||||||
|
# the following file is generated.
|
||||||
|
# in the sequel, use this file with gf-abs or gf-cnc option
|
||||||
FILTERED_WIKT = 'wikt_arabic.jsonl'
|
FILTERED_WIKT = 'wikt_arabic.jsonl'
|
||||||
|
|
||||||
|
|
||||||
@@ -14,11 +29,12 @@ def get_gzip_json(file, sample=100000, langs=[]):
|
|||||||
obj = json.loads(line)
|
obj = json.loads(line)
|
||||||
if obj.get('lang', None) in langs:
|
if obj.get('lang', None) in langs:
|
||||||
print(line.decode("utf-8"))
|
print(line.decode("utf-8"))
|
||||||
print(n)
|
# print(n)
|
||||||
|
|
||||||
|
if MODE == 'raw':
|
||||||
|
get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
|
||||||
|
|
||||||
# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
|
# python3 read_wiktionary.py raw >wikt_arabic.jsonl
|
||||||
# python3 read_wiktionary.py >wikt_arabic.jsonl
|
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||||||
buckwalter_dict = {
|
buckwalter_dict = {
|
||||||
@@ -80,6 +96,12 @@ def unvocalize(s):
|
|||||||
def is_arabic(s):
    """Return True if *s* has at least one character in the Arabic range.

    The range tested is code points 1574-1616 (U+0626 to U+0650), inclusive.
    Empty or None input gives False; the result is always a bool, so callers
    never receive the falsy input itself back.
    """
    # NOTE(review): U+0621-U+0625 (hamza and hamza/madda carriers) and
    # U+0651/U+0652 (shadda, sukun) lie outside this range, so a per-character
    # filter built on this test discards them - confirm that is intended.
    return bool(s) and any(0x0626 <= ord(c) <= 0x0650 for c in s)
|
||||||
|
|
||||||
|
# quote forms but not parameters
|
||||||
|
def quote_if(s, cond=is_arabic):
    """Wrap *s* in double quotes when ``cond(s)`` is truthy, else return it as is.

    With the default condition, strings containing Arabic characters are
    quoted and anything else is passed through unquoted.
    """
    return '"' + s + '"' if cond(s) else s
|
||||||
|
|
||||||
def gf_fun(s, pos, disamb=0):
|
def gf_fun(s, pos, disamb=0):
|
||||||
discrim = '_' + str(disamb) if disamb else ''
|
discrim = '_' + str(disamb) if disamb else ''
|
||||||
@@ -99,8 +121,8 @@ def forms_for_pos(obj):
|
|||||||
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
||||||
plural = [form[:-1] for form, descr in forms
|
plural = [form[:-1] for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
||||||
gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories']
|
gender = (['fem'] if 'Arabic feminine nouns' in obj['categories']
|
||||||
else (['Masc'] if 'Arabic masculine nouns' in obj['categories']
|
else (['masc'] if 'Arabic masculine nouns' in obj['categories']
|
||||||
else []))
|
else []))
|
||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'N',
|
'cat': 'N',
|
||||||
@@ -122,15 +144,20 @@ def forms_for_pos(obj):
|
|||||||
'perfect': lemma,
|
'perfect': lemma,
|
||||||
'imperfect': [form for form, descr in forms
|
'imperfect': [form for form, descr in forms
|
||||||
if all([w in descr for
|
if all([w in descr for
|
||||||
w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
|
w in [
|
||||||
'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', '']
|
"active", "indicative", "masculine", "non-past",
|
||||||
if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
|
"imperfective", "singular", "third-person"]])][:1],
|
||||||
key=len)]
|
'cls': ['Form' + max([n for n in [
|
||||||
|
'I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','']
|
||||||
|
if n in ' '.join([c for c in obj['categories']
|
||||||
|
if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
|
||||||
|
key=len)] # max in RGL is XI, in Wikt XIII
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
elif obj['pos'] == 'adj':
|
elif obj['pos'] == 'adj':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
|
if all([w in descr for w in [
|
||||||
|
'indefinite', 'masculine', 'singular', 'informal']])][:1]
|
||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'A',
|
'cat': 'A',
|
||||||
'lemma': lemma,
|
'lemma': lemma,
|
||||||
@@ -150,8 +177,9 @@ def forms_for_pos(obj):
|
|||||||
|
|
||||||
if 'lemma' in gf_entry and gf_entry['lemma']:
|
if 'lemma' in gf_entry and gf_entry['lemma']:
|
||||||
gf_entry['lemma'] = gf_entry['lemma'][0]
|
gf_entry['lemma'] = gf_entry['lemma'][0]
|
||||||
|
if obj['root']:
|
||||||
gf_entry['args']['root'] = obj['root']
|
gf_entry['args']['root'] = obj['root']
|
||||||
args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x]
|
args = [r + ' = ' + quote_if(x[0]) for r, x in gf_entry['args'].items() if x]
|
||||||
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
|
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
|
||||||
|
|
||||||
return gf_entry
|
return gf_entry
|
||||||
@@ -160,19 +188,19 @@ def forms_for_pos(obj):
|
|||||||
def find_root(s):
    """Return the characters of *s* accepted by ``is_arabic``, joined in order."""
    return ''.join(filter(is_arabic, s))
|
||||||
|
|
||||||
import sys
|
|
||||||
MODE = sys.argv[1]
|
|
||||||
|
|
||||||
if MODE == 'gf-abs':
|
if MODE == 'gf-abs':
|
||||||
print('abstract MorphoDictAraAbs = Cat ** {')
|
print('abstract MorphoDictAraAbs = Cat ** {')
|
||||||
if MODE == 'gf-cnc':
|
if MODE == 'gf-cnc':
|
||||||
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
||||||
|
|
||||||
|
if MODE != 'raw':
|
||||||
with open(FILTERED_WIKT) as file:
|
with open(FILTERED_WIKT) as file:
|
||||||
seen_gf_funs = {}
|
seen_gf_funs = {}
|
||||||
for line in file:
|
for line in file:
|
||||||
|
try:
|
||||||
obj = json.loads(line)
|
obj = json.loads(line)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
root = [find_root(t['expansion']) for
|
root = [find_root(t['expansion']) for
|
||||||
t in obj.get('etymology_templates', []) if
|
t in obj.get('etymology_templates', []) if
|
||||||
t.get('name', None) =='ar-root'][:1]
|
t.get('name', None) =='ar-root'][:1]
|
||||||
|
|||||||
Reference in New Issue
Block a user