mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
compilable MorphoDictAra generation except for V, not yet using all forms
This commit is contained in:
@@ -82,8 +82,9 @@ def is_arabic(s):
|
|||||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
||||||
|
|
||||||
|
|
||||||
def gf_fun(s, pos):
|
def gf_fun(s, pos, disamb=0):
|
||||||
return ''.join(["'", s, "_", pos, "'"])
|
discrim = '_' + str(disamb) if disamb else ''
|
||||||
|
return ''.join(["'", s, discrim, "_", pos, "'"])
|
||||||
|
|
||||||
|
|
||||||
def forms_for_pos(obj):
|
def forms_for_pos(obj):
|
||||||
@@ -97,23 +98,25 @@ def forms_for_pos(obj):
|
|||||||
if obj['pos'] == 'noun':
|
if obj['pos'] == 'noun':
|
||||||
lemma = [form[:-1] for form, descr in forms
|
lemma = [form[:-1] for form, descr in forms
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
|
||||||
return {
|
plural = [form[:-1] for form, descr in forms
|
||||||
'gf_fun': gf_fun(lemma[0], 'N') if lemma else None,
|
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1]
|
||||||
'gf_cat': 'N',
|
gender = ('Fem' if 'Arabic feminine nouns' in obj['categories']
|
||||||
'singular': lemma,
|
|
||||||
'plural': [form[:-1] for form, descr in forms
|
|
||||||
if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
|
|
||||||
'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories']
|
|
||||||
else ('Masc' if 'Arabic masculine nouns' in obj['categories']
|
else ('Masc' if 'Arabic masculine nouns' in obj['categories']
|
||||||
else None)
|
else None))
|
||||||
|
gf_entry = {
|
||||||
|
'cat': 'N',
|
||||||
|
'lemma': lemma,
|
||||||
|
'singular': lemma,
|
||||||
|
'plural': plural,
|
||||||
|
'gender': gender
|
||||||
}
|
}
|
||||||
elif obj['pos'] == 'verb':
|
elif obj['pos'] == 'verb':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
if all([w in descr for
|
if all([w in descr for
|
||||||
w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
|
w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
|
||||||
return {
|
gf_entry = {
|
||||||
'gf_fun': gf_fun(lemma[0], 'V') if lemma else None,
|
'cat': 'V',
|
||||||
'gf_cat': 'V',
|
'lemma': lemma,
|
||||||
'perfect': lemma,
|
'perfect': lemma,
|
||||||
'imperfect': [form for form, descr in forms
|
'imperfect': [form for form, descr in forms
|
||||||
if all([w in descr for
|
if all([w in descr for
|
||||||
@@ -125,9 +128,9 @@ def forms_for_pos(obj):
|
|||||||
elif obj['pos'] == 'adj':
|
elif obj['pos'] == 'adj':
|
||||||
lemma = [form for form, descr in forms
|
lemma = [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
|
if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
|
||||||
return {
|
gf_entry = {
|
||||||
'gf_fun': gf_fun(lemma[0], 'A') if lemma else None,
|
'cat': 'A',
|
||||||
'gf_cat': 'A',
|
'lemma': lemma,
|
||||||
'masc_singular': lemma,
|
'masc_singular': lemma,
|
||||||
'masc_plural': [form for form, descr in forms
|
'masc_plural': [form for form, descr in forms
|
||||||
if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
|
if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
|
||||||
@@ -138,9 +141,16 @@ def forms_for_pos(obj):
|
|||||||
}
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return {f: d for f, d in forms}
|
gf_entry = {f: d for f, d in forms}
|
||||||
|
|
||||||
|
if 'lemma' in gf_entry and gf_entry['lemma']:
|
||||||
|
gf_entry['lemma'] = gf_entry['lemma'][0]
|
||||||
|
form = gf_entry['imperfect'][0] if gf_entry['cat'] == 'V' and gf_entry['imperfect'] else gf_entry['lemma']
|
||||||
|
gf_entry['lin'] = ''.join(['mk', gf_entry['cat'], ' "' + form + '"'])
|
||||||
|
|
||||||
|
return gf_entry
|
||||||
|
|
||||||
|
|
||||||
# "root": ["ش ر ح (š-r-ḥ)"]
|
# "root": ["ش ر ح (š-r-ḥ)"]
|
||||||
def find_root(s):
|
def find_root(s):
|
||||||
return ''.join([c for c in s if is_arabic(c)])
|
return ''.join([c for c in s if is_arabic(c)])
|
||||||
@@ -148,17 +158,23 @@ def find_root(s):
|
|||||||
import sys
|
import sys
|
||||||
MODE = sys.argv[1]
|
MODE = sys.argv[1]
|
||||||
|
|
||||||
if MODE == 'gf':
|
if MODE == 'gf-abs':
|
||||||
print('abstract MorphoDictAraAbs = Cat ** {')
|
print('abstract MorphoDictAraAbs = Cat ** {')
|
||||||
|
if MODE == 'gf-cnc':
|
||||||
|
print('concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra in {')
|
||||||
|
|
||||||
|
|
||||||
with open(FILTERED_WIKT) as file:
|
with open(FILTERED_WIKT) as file:
|
||||||
seen_gf_funs = set()
|
seen_gf_funs = {}
|
||||||
for line in file:
|
for line in file:
|
||||||
obj = json.loads(line)
|
obj = json.loads(line)
|
||||||
|
root = [find_root(t['expansion']) for
|
||||||
|
t in obj.get('etymology_templates', []) if
|
||||||
|
t.get('name', None) =='ar-root'][:1]
|
||||||
if 'Arabic lemmas' in obj.get('categories', []):
|
if 'Arabic lemmas' in obj.get('categories', []):
|
||||||
entry = {
|
entry = {
|
||||||
'pos': obj['pos'],
|
'pos': obj['pos'],
|
||||||
'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1],
|
'root': root,
|
||||||
'forms': forms_for_pos(obj),
|
'forms': forms_for_pos(obj),
|
||||||
'senses': [sense['glosses'] for sense in obj.get('senses', [])
|
'senses': [sense['glosses'] for sense in obj.get('senses', [])
|
||||||
if 'glosses' in sense]
|
if 'glosses' in sense]
|
||||||
@@ -168,14 +184,23 @@ with open(FILTERED_WIKT) as file:
|
|||||||
if MODE == 'json':
|
if MODE == 'json':
|
||||||
print(json.dumps(entry, ensure_ascii=False))
|
print(json.dumps(entry, ensure_ascii=False))
|
||||||
|
|
||||||
if MODE == 'gf':
|
if MODE.startswith('gf'):
|
||||||
|
|
||||||
if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']:
|
lemma = entry['forms'].get('lemma', None)
|
||||||
if entry['forms']['gf_fun'] not in seen_gf_funs:
|
if lemma:
|
||||||
print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses'])
|
cat = entry['forms']['cat']
|
||||||
seen_gf_funs.add(entry['forms']['gf_fun'])
|
lin = entry['forms']['lin']
|
||||||
|
discrim = seen_gf_funs.get((lemma, cat), 0)
|
||||||
|
fun = gf_fun(lemma, cat, discrim)
|
||||||
|
|
||||||
|
if MODE == 'gf-abs':
|
||||||
|
print('fun', fun, ':', cat, ';', '--', entry['senses'])
|
||||||
|
if MODE == 'gf-cnc':
|
||||||
|
print('lin', fun, '=', lin, ';')
|
||||||
|
|
||||||
|
seen_gf_funs[(lemma, cat)] = discrim + 1
|
||||||
|
|
||||||
# to do: rename duplicate function names: of 13762 names, 12946 are unique
|
# to do: rename duplicate function names: of 13762 names, 12946 are unique
|
||||||
|
|
||||||
if MODE == 'gf':
|
if MODE.startswith('gf'):
|
||||||
print('}')
|
print('}')
|
||||||
|
|||||||
Reference in New Issue
Block a user