to_wordnet applied to a new format of data

This commit is contained in:
Aarne Ranta
2023-09-25 08:22:47 +02:00
parent aa1dff6702
commit 561a8c130d

View File

@@ -6,7 +6,8 @@ import json
# from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz # from https://www.grammaticalframework.org/~krasimir/arabic.tsv.gz
WN_TSV = 'arabic.tsv' # WN_TSV = 'arabic.tsv' # Krasimir
WN_TSV = 'ar2en_words_gf.csv' # Zarzoura
# built as explained in ./read_wiktionary.py # built as explained in ./read_wiktionary.py
MORPHO_GF = 'MorphoDictAraAbs.gf' MORPHO_GF = 'MorphoDictAraAbs.gf'
@@ -41,7 +42,7 @@ with open(WN_TSV) as wnfile:
for row in wnfile: for row in wnfile:
## word = row[-1].strip() # does not show the Arabic, but the second-last word ## word = row[-1].strip() # does not show the Arabic, but the second-last word
word = unvocalize(get_arabic(row)) word = unvocalize(get_arabic(row))
wnfun = row.split()[0] wnfun = row.split()[-1] # 0 in Krasimir
cat = [c for c in wnfun if c.isalpha()][-1] # the last letter; the dict only contains N, A, V cat = [c for c in wnfun if c.isalpha()][-1] # the last letter; the dict only contains N, A, V
funs = funmap.get((word, cat), []) funs = funmap.get((word, cat), [])
result = {'wnfun': wnfun, 'sought': word, 'found': funs} result = {'wnfun': wnfun, 'sought': word, 'found': funs}