1
0
forked from GitHub/gf-rgl

Hrv: dumping essential Wiktionary information to smaller json files

This commit is contained in:
Aarne Ranta
2022-09-28 19:09:04 +02:00
parent b1a1961454
commit d8e313c75c

View File

@@ -6,11 +6,15 @@ import sys
# Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), # Proceedings of the 13th Conference on Language Resources and Evaluation (LREC),
# pp. 1317-1325, Marseille, 20-25 June 2022. # pp. 1317-1325, Marseille, 20-25 June 2022.
FILE = 'data/raw-wiktextract-data.json' WIKTIONARY_FILE = 'data/raw-wiktextract-data.json'
MYLANG = 'Serbo-Croatian' MYLANG = 'Serbo-Croatian'
REFLANG = 'English' REFLANG = 'English'
MORPHO_OUTPUT_FILE = 'm.json'
TRANS_OUTPUT_FILE = 't.json'
GENDERS = ['masculine', 'feminine', 'neuter'] GENDERS = ['masculine', 'feminine', 'neuter']
NOUN_FORMS = { NOUN_FORMS = {
@@ -129,23 +133,36 @@ def get_forms(pos, forms):
def lexinfo(data): def lexinfo(data):
return {'pos': data['pos'], return data['word'], {
'word': data['word'], 'pos': data['pos'], 'forms': get_forms(data['pos'], data['forms'])}
'forms': get_forms(data['pos'], data['forms'])
}
def morpho(mylang, data):
    """Print the lexical info of one entry if it belongs to mylang.

    data is one parsed entry (a dict). Nothing is printed unless its
    'lang' value equals mylang and it has 'pos', 'word' and 'forms' keys.
    """
    if data.get('lang', '') != mylang:
        return
    required_keys = ('pos', 'word', 'forms')
    if any(key not in data for key in required_keys):
        return
    print(lexinfo(data))
def translations(mylang, reflang, data): # write morphology of mylang in m.json
if data.get('lang', '') == reflang and ( def morpho(mylang, lines):
all([x in data for x in ['pos', 'word']])): with open(MORPHO_OUTPUT_FILE, 'w', encoding="utf-8") as file:
for t in [t for t in data.get('translations', []) for line in lines:
if t['lang'] == mylang]: data = json.loads(line)
print(data['word'], data['pos'], t.get('word')) if data.get('lang', '') == mylang and (
all([x in data for x in ['pos', 'word', 'forms']])):
word, info = lexinfo(data)
file.write(json.dumps({word: info})+'\n')
def translations(mylang, reflang, lines):
    """Write translations from reflang to mylang in TRANS_OUTPUT_FILE.

    Each item of ``lines`` is parsed as one JSON object. For every entry
    whose 'lang' equals ``reflang`` and that has both 'pos' and 'word',
    one JSON line is written per translation whose 'lang' equals
    ``mylang``, shaped as::

        {word: {'pos': ..., 'trans': ..., 'sense': ...}}

    'trans' and 'sense' are null when the translation has no 'word' or
    'sense' key.

    Args:
        mylang: language name that a translation's 'lang' must match.
        reflang: language name that the entry's own 'lang' must match.
        lines: iterable of JSON strings, one entry each (presumably the
            open Wiktionary dump file -- confirm against the caller).
    """
    with open(TRANS_OUTPUT_FILE, 'w', encoding="utf-8") as file:
        for line in lines:
            # A blank line (e.g. a trailing newline) would make json.loads
            # raise and abort the whole dump; skip it instead.
            if not line.strip():
                continue
            data = json.loads(line)
            if data.get('lang', '') != reflang:
                continue
            if 'pos' not in data or 'word' not in data:
                continue
            for t in data.get('translations', []):
                # .get() rather than t['lang']: a translation without a
                # 'lang' key is skipped instead of raising KeyError.
                if t.get('lang') != mylang:
                    continue
                file.write(json.dumps(
                    {data['word']:
                        {'pos': data['pos'],
                         'trans': t.get('word'),
                         'sense': t.get('sense')}
                     }) + '\n')
def main(): def main():
@@ -156,18 +173,13 @@ def main():
mylang, reflang = MYLANG, REFLANG mylang, reflang = MYLANG, REFLANG
if sys.argv[3:]: if sys.argv[3:]:
mylang, reflang = sys.argv[2:] mylang, reflang = sys.argv[2:]
with open(FILE, "r", encoding="utf-8") as f: with open(WIKTIONARY_FILE, "r", encoding="utf-8") as lines:
for line in f:
data = json.loads(line)
if mode == 'trans': if mode == 'trans':
translations(mylang, reflang, data) translations(mylang, reflang, lines)
else: elif mode == 'morpho':
morpho(mylang, data) morpho(mylang, lines)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
# noun plata [{'form': 'pláta', 'tags': ['canonical', 'feminine']}, {'form': 'пла́та', 'tags': ['Cyrillic']}, {'form': '', 'source': 'Declension', 'tags': ['table-tags']}, {'form': 'plata', 'tags': ['nominative', 'singular'], 'source': 'Declension'}, {'form': 'plate', 'tags': ['nominative', 'plural'], 'source': 'Declension'}, {'form': 'plate', 'tags': ['genitive', 'singular'], 'source': 'Declension'}, {'form': 'plata', 'tags': ['genitive', 'plural'], 'source': 'Declension'}, {'form': 'plati', 'tags': ['dative', 'singular'], 'source': 'Declension'}, {'form': 'platama', 'tags': ['dative', 'plural'], 'source': 'Declension'}, {'form': 'platu', 'tags': ['accusative', 'singular'], 'source': 'Declension'}, {'form': 'plate', 'tags': ['accusative', 'plural'], 'source': 'Declension'}, {'form': 'plato', 'tags': ['singular', 'vocative'], 'source': 'Declension'}, {'form': 'plate', 'tags': ['plural', 'vocative'], 'source': 'Declension'}, {'form': 'plati', 'tags': ['locative', 'singular'], 'source': 'Declension'}, {'form': 'platama', 'tags': ['locative', 'plural'], 'source': 'Declension'}, {'form': 'platom', 'tags': ['instrumental', 'singular'], 'source': 'Declension'}, {'form': 'platama', 'tags': ['instrumental', 'plural'], 'source': 'Declension'}]