Generate a compilable MorphoDictAra, except for V entries; not all forms are used yet

This commit is contained in:
Aarne Ranta
2023-09-12 19:38:14 +02:00
parent 714d8abac0
commit 8eceb53643

View File

@@ -82,8 +82,9 @@ def is_arabic(s):
return s and any(1574 <= ord(c) <= 1616 for c in s) return s and any(1574 <= ord(c) <= 1616 for c in s)
def gf_fun(s, pos): def gf_fun(s, pos, disamb=0):
return ''.join(["'", s, "_", pos, "'"]) discrim = '_' + str(disamb) if disamb else ''
return ''.join(["'", s, discrim, "_", pos, "'"])
def forms_for_pos(obj): def forms_for_pos(obj):
@@ -97,23 +98,25 @@ def forms_for_pos(obj):
if obj['pos'] == 'noun': if obj['pos'] == 'noun':
lemma = [form[:-1] for form, descr in forms lemma = [form[:-1] for form, descr in forms
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1] if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
return { plural = [form[:-1] for form, descr in forms
'gf_fun': gf_fun(lemma[0], 'N') if lemma else None, if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
'gf_cat': 'N', gender = ('Fem' if 'Arabic feminine nouns' in obj['categories']
'singular': lemma,
'plural': [form[:-1] for form, descr in forms
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories']
else ('Masc' if 'Arabic masculine nouns' in obj['categories'] else ('Masc' if 'Arabic masculine nouns' in obj['categories']
else None) else None))
gf_entry = {
'cat': 'N',
'lemma': lemma,
'singular': lemma,
'plural': plural,
'gender': gender
} }
elif obj['pos'] == 'verb': elif obj['pos'] == 'verb':
lemma = [form for form, descr in forms lemma = [form for form, descr in forms
if all([w in descr for if all([w in descr for
w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1] w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
return { gf_entry = {
'gf_fun': gf_fun(lemma[0], 'V') if lemma else None, 'cat': 'V',
'gf_cat': 'V', 'lemma': lemma,
'perfect': lemma, 'perfect': lemma,
'imperfect': [form for form, descr in forms 'imperfect': [form for form, descr in forms
if all([w in descr for if all([w in descr for
@@ -125,9 +128,9 @@ def forms_for_pos(obj):
elif obj['pos'] == 'adj': elif obj['pos'] == 'adj':
lemma = [form for form, descr in forms lemma = [form for form, descr in forms
if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1] if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
return { gf_entry = {
'gf_fun': gf_fun(lemma[0], 'A') if lemma else None, 'cat': 'A',
'gf_cat': 'A', 'lemma': lemma,
'masc_singular': lemma, 'masc_singular': lemma,
'masc_plural': [form for form, descr in forms 'masc_plural': [form for form, descr in forms
if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1], if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
@@ -138,7 +141,14 @@ def forms_for_pos(obj):
} }
else: else:
return {f: d for f, d in forms} gf_entry = {f: d for f, d in forms}
if 'lemma' in gf_entry and gf_entry['lemma']:
gf_entry['lemma'] = gf_entry['lemma'][0]
form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma']
gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"'])
return gf_entry
# "root": ["ش ر ح (š-r-ḥ)"] # "root": ["ش ر ح (š-r-ḥ)"]
@@ -148,17 +158,23 @@ def find_root(s):
import sys import sys
MODE = sys.argv[1] MODE = sys.argv[1]
if MODE == 'gf': if MODE == 'gf-abs':
print('abstract MorphoDictAraAbs = Cat ** {') print('abstract MorphoDictAraAbs = Cat ** {')
if MODE == 'gf-cnc':
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
with open(FILTERED_WIKT) as file: with open(FILTERED_WIKT) as file:
seen_gf_funs = set() seen_gf_funs = {}
for line in file: for line in file:
obj = json.loads(line) obj = json.loads(line)
root = [find_root(t['expansion']) for
t in obj.get('etymology_templates', []) if
t.get('name', None) =='ar-root'][:1]
if 'Arabic lemmas' in obj.get('categories', []): if 'Arabic lemmas' in obj.get('categories', []):
entry = { entry = {
'pos': obj['pos'], 'pos': obj['pos'],
'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1], 'root': root,
'forms': forms_for_pos(obj), 'forms': forms_for_pos(obj),
'senses': [sense['glosses'] for sense in obj.get('senses', []) 'senses': [sense['glosses'] for sense in obj.get('senses', [])
if 'glosses' in sense] if 'glosses' in sense]
@@ -168,14 +184,23 @@ with open(FILTERED_WIKT) as file:
if MODE == 'json': if MODE == 'json':
print(json.dumps(entry, ensure_ascii=False)) print(json.dumps(entry, ensure_ascii=False))
if MODE == 'gf': if MODE.startswith('gf'):
if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']: lemma = entry['forms'].get('lemma', None)
if entry['forms']['gf_fun'] not in seen_gf_funs: if lemma:
print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses']) cat = entry['forms']['cat']
seen_gf_funs.add(entry['forms']['gf_fun']) lin = entry['forms']['lin']
discrim = seen_gf_funs.get((lemma, cat), 0)
fun = gf_fun(lemma, cat, discrim)
if MODE == 'gf-abs':
print('fun', fun, ':', cat, ';', '--', entry['senses'])
if MODE == 'gf-cnc':
print('lin', fun, '=', lin, ';')
seen_gf_funs[(lemma, cat)] = discrim + 1
# to do: rename duplicate function names: of 13762 names, 12946 are unique # to do: rename duplicate function names: of 13762 names, 12946 are unique
if MODE == 'gf': if MODE.startswith('gf'):
print('}') print('}')