mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-06-22 01:36:20 -06:00
182 lines
6.0 KiB
Python
182 lines
6.0 KiB
Python
import gzip
|
||
import json
|
||
|
||
# Gzip-compressed dump file name; in this file it is only referenced by the
# commented-out get_gzip_json(...) call further down.
WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
|
||
# JSON-lines file (one JSON object per line) that the conversion code at the
# bottom of this file opens and reads.
FILTERED_WIKT = 'wikt_arabic.jsonl'
|
||
|
||
|
||
def get_gzip_json(file, sample=100000, langs=()):
    """Print sampled records of a gzipped JSON-lines file that match a language.

    Every ``sample``-th line of ``file`` is parsed as JSON. When its ``lang``
    value is one of ``langs``, the raw line is written to stdout, one record
    per output line. The number of lines read is reported on stderr so that
    redirecting stdout into a file yields clean JSON lines.

    Args:
        file: Path of the gzip-compressed input file.
        sample: Sampling interval; 1 inspects every line.
        langs: Collection of accepted ``lang`` values (nothing matches when
            it is empty).
    """
    # Local import: the module-level ``import sys`` sits further down the
    # file and may not have executed yet when this function is called.
    import sys

    n = 0
    with gzip.open(file) as decompressed:
        for n, line in enumerate(decompressed, start=1):
            if n % sample != 0:
                continue
            obj = json.loads(line)
            if obj.get('lang', None) in langs:
                # The raw line still carries its newline; strip it so that
                # print() does not leave a blank line after every record.
                print(line.decode("utf-8").rstrip("\r\n"))
    # Progress/diagnostic output only, so keep it out of the data stream.
    print(n, file=sys.stderr)
|
||
|
||
|
||
# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
# ^ To (re)build FILTERED_WIKT, uncomment the call above and run:
#   python3 read_wiktionary.py >wikt_arabic.jsonl
|
||
# The transliteration table below covers Arabic code points U+0621 to U+0671.
|
||
|
||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||
# Buckwalter transliteration table: Unicode code point -> ASCII symbol.
# U+0640 (tatweel) and U+063B..U+063F have no entry here.
buckwalter_dict = {
    # Hamza and its carriers
    0x0621: "'",  # ء  HAMZA
    0x0622: "|",  # آ  ALEF WITH MADDA ABOVE
    0x0623: ">",  # أ  ALEF WITH HAMZA ABOVE
    0x0624: "&",  # ؤ  WAW WITH HAMZA ABOVE
    0x0625: "<",  # إ  ALEF WITH HAMZA BELOW
    0x0626: "}",  # ئ  YEH WITH HAMZA ABOVE
    # Letters
    0x0627: "A",  # ا  ALEF
    0x0628: "b",  # ب  BEH
    0x0629: "p",  # ة  TEH MARBUTA
    0x062A: "t",  # ت  TEH
    0x062B: "v",  # ث  THEH
    0x062C: "j",  # ج  JEEM
    0x062D: "H",  # ح  HAH
    0x062E: "x",  # خ  KHAH
    0x062F: "d",  # د  DAL
    0x0630: "*",  # ذ  THAL
    0x0631: "r",  # ر  REH
    0x0632: "z",  # ز  ZAIN
    0x0633: "s",  # س  SEEN
    0x0634: "$",  # ش  SHEEN
    0x0635: "S",  # ص  SAD
    0x0636: "D",  # ض  DAD
    0x0637: "T",  # ط  TAH
    0x0638: "Z",  # ظ  ZAH
    0x0639: "E",  # ع  AIN
    0x063A: "g",  # غ  GHAIN
    0x0641: "f",  # ف  FEH
    0x0642: "q",  # ق  QAF
    0x0643: "k",  # ك  KAF
    0x0644: "l",  # ل  LAM
    0x0645: "m",  # م  MEEM
    0x0646: "n",  # ن  NOON
    0x0647: "h",  # ه  HEH
    0x0648: "w",  # و  WAW
    0x0649: "Y",  # ى  ALEF MAKSURA
    0x064A: "y",  # ي  YEH
    # Vowel marks and other diacritics
    0x064B: "F",  # FATHATAN
    0x064C: "N",  # DAMMATAN
    0x064D: "K",  # KASRATAN
    0x064E: "a",  # FATHA
    0x064F: "u",  # DAMMA
    0x0650: "i",  # KASRA
    0x0651: "~",  # SHADDA
    0x0652: "o",  # SUKUN
    0x0670: "`",  # SUPERSCRIPT ALEF
    0x0671: "{",  # ٱ  ALEF WASLA
}
|
||
|
||
def to_buckwalter(s):
    """Transliterate ``s`` character by character using ``buckwalter_dict``.

    Characters without an entry in the table are rendered as ``'?'``.
    """
    codepoints = map(ord, s)
    return ''.join(buckwalter_dict.get(cp, '?') for cp in codepoints)
|
||
|
||
|
||
def unvocalize(s):
    """Return ``s`` reduced to the characters in the range U+0621..U+064A.

    Everything outside that range is dropped, which includes the vowel marks
    that start at U+064B.
    """
    kept = []
    for ch in s:
        if 0x621 <= ord(ch) <= 0x64A:
            kept.append(ch)
    return ''.join(kept)
|
||
|
||
def is_arabic(s):
    """Return True if ``s`` contains at least one Arabic letter or vowel mark.

    The accepted range is U+0621 (hamza) through U+0652 (sukun): the letter
    range that ``unvocalize`` keeps plus the vowel marks listed in
    ``buckwalter_dict``. Starting at U+0621 matters for callers that test
    single characters, because the hamza letters U+0621..U+0625 are ordinary
    root consonants.

    Empty strings and ``None`` yield False.
    """
    if not s:
        return False
    return any(0x621 <= ord(c) <= 0x652 for c in s)
|
||
|
||
|
||
def gf_fun(s, pos):
    """Build a single-quoted identifier of the shape ``'<s>_<pos>'``."""
    quote = "'"
    return quote + s + "_" + pos + quote
|
||
|
||
|
||
# Roman-numeral labels looked for in verb category names. The trailing empty
# string always matches, so the lookup has a result even without any label.
_VERB_CLASS_LABELS = ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX',
                      'X', 'XI', 'XII', '')


def _first_form(forms, required_tags, drop_last_char=False):
    """Return ``[form]`` for the first form carrying all ``required_tags``.

    ``forms`` is an iterable of ``(form, tags)`` pairs. An empty list is
    returned when nothing matches. With ``drop_last_char`` the final character
    of the form is removed before it is returned.
    """
    for form, tags in forms:
        if all(tag in tags for tag in required_tags):
            return [form[:-1] if drop_last_char else form]
    return []


def _verb_class(categories):
    """Return the longest Roman-numeral label found in the verb categories.

    Only categories that end in ``'verbs'`` and contain one of the characters
    ``I``, ``V`` or ``X`` are searched; ``''`` is returned when none is found.
    """
    verb_categories = ' '.join(
        c for c in categories
        if c.endswith('verbs') and any(ch in c for ch in 'IVX'))
    return max((label for label in _VERB_CLASS_LABELS
                if label in verb_categories), key=len)


def forms_for_pos(obj):
    """Extract the inflected forms needed for one dictionary record.

    ``obj`` is one parsed JSON record. Its ``forms`` list is reduced to a
    mapping from form string to tag list, skipping forms tagged
    ``'romanization'`` and forms that ``is_arabic`` rejects.

    NOTE(review): when the same form string occurs several times, the tags of
    the last occurrence win, so earlier tag sets are not searched.

    Returns:
        For ``pos`` values ``'noun'``, ``'verb'`` and ``'adj'``, a dict with
        ``gf_fun`` (``None`` when no lemma form was found), ``gf_cat`` and the
        selected forms, each as a list of zero or one string. For any other
        ``pos``, the form-to-tags mapping itself.

    Raises:
        KeyError: if ``obj`` has no ``'pos'`` key.
    """
    tagged_forms = {}
    for form in obj.get('forms', []):
        tags = form.get('tags', [])
        text = form.get('form', '')
        if 'romanization' in tags or not is_arabic(text):
            continue
        tagged_forms[text] = tags
    forms = tagged_forms.items()
    categories = obj.get('categories', [])
    pos = obj['pos']

    if pos == 'noun':
        # The final character of the construct-state nominative form is
        # dropped, for the singular and the plural alike.
        lemma = _first_form(forms, ('construct', 'nominative', 'singular'),
                            drop_last_char=True)
        if 'Arabic feminine nouns' in categories:
            gender = 'Fem'
        elif 'Arabic masculine nouns' in categories:
            gender = 'Masc'
        else:
            gender = None
        return {
            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None,
            'gf_cat': 'N',
            'singular': lemma,
            'plural': _first_form(forms, ('construct', 'nominative', 'plural'),
                                  drop_last_char=True),
            'gender': gender,
        }

    if pos == 'verb':
        third_masc_sg = ('active', 'indicative', 'masculine', 'singular',
                         'third-person')
        lemma = _first_form(forms, third_masc_sg + ('past', 'perfective'))
        return {
            'gf_fun': gf_fun(lemma[0], 'V') if lemma else None,
            'gf_cat': 'V',
            'perfect': lemma,
            'imperfect': _first_form(
                forms, third_masc_sg + ('non-past', 'imperfective')),
            'verbclass': _verb_class(categories),
        }

    if pos == 'adj':
        lemma = _first_form(
            forms, ('indefinite', 'masculine', 'singular', 'informal'))
        return {
            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None,
            'gf_cat': 'A',
            'masc_singular': lemma,
            'masc_plural': _first_form(
                forms, ('indefinite', 'masculine', 'plural', 'informal')),
            'fem_singular': _first_form(
                forms, ('indefinite', 'feminine', 'singular', 'informal')),
            'fem_plural': _first_form(
                forms, ('indefinite', 'feminine', 'plural', 'informal')),
        }

    # Any other part of speech: hand back every retained form with its tags.
    return dict(tagged_forms)
|
||
|
||
|
||
# "root": ["ش ر ح (š-r-ḥ)"]
|
||
def find_root(s):
    """Return the characters of ``s`` that ``is_arabic`` accepts, in order.

    Intended for root strings such as ``"ش ر ح (š-r-ḥ)"``, where spaces and
    the Latin transliteration are discarded.
    """
    kept = []
    for ch in s:
        if is_arabic(ch):
            kept.append(ch)
    return ''.join(kept)
|
||
|
||
import sys


def main(argv=None):
    """Convert the filtered JSON-lines file into JSON entries or GF code.

    The single command-line argument selects the output written to stdout:

    * ``json`` - one JSON object per lemma record;
    * ``gf``   - a GF abstract syntax with one ``fun`` line per distinct
      function name.

    Only records whose ``categories`` contain ``'Arabic lemmas'`` are used.
    Blank lines and lines that do not hold a JSON object are skipped.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: with a usage message if the mode is missing or unknown.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] not in ('json', 'gf'):
        sys.exit('usage: {} (json|gf)'.format(sys.argv[0]))
    mode = args[0]

    if mode == 'gf':
        print('abstract MorphoDictAraAbs = Cat ** {')

    seen_gf_funs = set()
    # The data is Arabic text, so read it as UTF-8 rather than relying on
    # the platform's default encoding.
    with open(FILTERED_WIKT, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                # e.g. a bare number left in the file by a diagnostic print
                continue
            if 'Arabic lemmas' not in obj.get('categories', []):
                continue

            entry = {
                'pos': obj['pos'],
                'root': [find_root(t['expansion'])
                         for t in obj.get('etymology_templates', [])
                         if t.get('name', None) == 'ar-root'][:1],
                'forms': forms_for_pos(obj),
                'senses': [sense['glosses'] for sense in obj.get('senses', [])
                           if 'glosses' in sense],
            }

            if mode == 'json':
                print(json.dumps(entry, ensure_ascii=False))
            elif mode == 'gf':
                forms = entry['forms']
                gf_name = forms.get('gf_fun')
                # Emit each function name once; later duplicates are dropped.
                if gf_name and gf_name not in seen_gf_funs:
                    print('fun', gf_name, ':', forms['gf_cat'], ';', '--',
                          entry['senses'])
                    seen_gf_funs.add(gf_name)

    # to do: rename duplicate function names: of 13762 names, 12946 are unique

    if mode == 'gf':
        print('}')


if __name__ == '__main__':
    main()
|