mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 17:08:54 -06:00
arabic/wiktionary using paradigms with records as arguments to cope with heterogeneous information
This commit is contained in:
@@ -19,7 +19,6 @@ def get_gzip_json(file, sample=100000, langs=[]):
|
|||||||
|
|
||||||
# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
|
# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
|
||||||
# python3 read_wiktionary.py >wikt_arabic.jsonl
|
# python3 read_wiktionary.py >wikt_arabic.jsonl
|
||||||
# 621-671
|
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
||||||
buckwalter_dict = {
|
buckwalter_dict = {
|
||||||
@@ -100,15 +99,17 @@ def forms_for_pos(obj):
|
|||||||
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
||||||
plural = [form[:-1] for form, descr in forms
|
plural = [form[:-1] for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
||||||
gender = ('Fem' if 'Arabic feminine nouns' in obj['categories']
|
gender = (['Fem'] if 'Arabic feminine nouns' in obj['categories']
|
||||||
else ('Masc' if 'Arabic masculine nouns' in obj['categories']
|
else (['Masc'] if 'Arabic masculine nouns' in obj['categories']
|
||||||
else None))
|
else []))
|
||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'N',
|
'cat': 'N',
|
||||||
'lemma': lemma,
|
'lemma': lemma,
|
||||||
'singular': lemma,
|
'args': {
|
||||||
'plural': plural,
|
'sg': lemma,
|
||||||
'gender': gender
|
'pl': plural,
|
||||||
|
'g': gender
|
||||||
|
}
|
||||||
}
|
}
|
||||||
elif obj['pos'] == 'verb':
|
elif obj['pos'] == 'verb':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
@@ -117,13 +118,15 @@ def forms_for_pos(obj):
|
|||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'V',
|
'cat': 'V',
|
||||||
'lemma': lemma,
|
'lemma': lemma,
|
||||||
'perfect': lemma,
|
'args': {
|
||||||
'imperfect': [form for form, descr in forms
|
'perfect': lemma,
|
||||||
|
'imperfect': [form for form, descr in forms
|
||||||
if all([w in descr for
|
if all([w in descr for
|
||||||
w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
|
w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
|
||||||
'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','']
|
'cls': [max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII', 'XIV', 'XV', '']
|
||||||
if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
|
if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
|
||||||
key=len)
|
key=len)]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
elif obj['pos'] == 'adj':
|
elif obj['pos'] == 'adj':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
@@ -131,13 +134,15 @@ def forms_for_pos(obj):
|
|||||||
gf_entry = {
|
gf_entry = {
|
||||||
'cat': 'A',
|
'cat': 'A',
|
||||||
'lemma': lemma,
|
'lemma': lemma,
|
||||||
'masc_singular': lemma,
|
'args': {
|
||||||
'masc_plural': [form for form, descr in forms
|
'masc_sg': lemma,
|
||||||
|
'masc_pl': [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
|
if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
|
||||||
'fem_singular': [form for form, descr in forms
|
'fem_sg': [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
|
if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
|
||||||
'fem_plural': [form for form, descr in forms
|
'fem_pl': [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
|
if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -145,11 +150,11 @@ def forms_for_pos(obj):
|
|||||||
|
|
||||||
if 'lemma' in gf_entry and gf_entry['lemma']:
|
if 'lemma' in gf_entry and gf_entry['lemma']:
|
||||||
gf_entry['lemma'] = gf_entry['lemma'][0]
|
gf_entry['lemma'] = gf_entry['lemma'][0]
|
||||||
form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma']
|
gf_entry['args']['root'] = obj['root']
|
||||||
gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"'])
|
args = [r + ' = ' + '"' + x[0] + '"' for r, x in gf_entry['args'].items() if x]
|
||||||
|
gf_entry['lin'] = 'wmk' + gf_entry['cat'] + ' {' + ' ; '.join(args) + '}'
|
||||||
|
|
||||||
return gf_entry
|
return gf_entry
|
||||||
|
|
||||||
|
|
||||||
# "root": ["ش ر ح (š-r-ḥ)"]
|
# "root": ["ش ر ح (š-r-ḥ)"]
|
||||||
def find_root(s):
|
def find_root(s):
|
||||||
@@ -171,10 +176,10 @@ with open(FILTERED_WIKT) as file:
|
|||||||
root = [find_root(t['expansion']) for
|
root = [find_root(t['expansion']) for
|
||||||
t in obj.get('etymology_templates', []) if
|
t in obj.get('etymology_templates', []) if
|
||||||
t.get('name', None) =='ar-root'][:1]
|
t.get('name', None) =='ar-root'][:1]
|
||||||
|
obj['root'] = root
|
||||||
if 'Arabic lemmas' in obj.get('categories', []):
|
if 'Arabic lemmas' in obj.get('categories', []):
|
||||||
entry = {
|
entry = {
|
||||||
'pos': obj['pos'],
|
'pos': obj['pos'],
|
||||||
'root': root,
|
|
||||||
'forms': forms_for_pos(obj),
|
'forms': forms_for_pos(obj),
|
||||||
'senses': [sense['glosses'] for sense in obj.get('senses', [])
|
'senses': [sense['glosses'] for sense in obj.get('senses', [])
|
||||||
if 'glosses' in sense]
|
if 'glosses' in sense]
|
||||||
|
|||||||
Reference in New Issue
Block a user