Files
comp-syntax-gu-mlt/lab2/scripts/describe_nobel.py
2025-05-04 12:14:37 +02:00

96 lines
2.3 KiB
Python

import sys
import json
import pgf
# Input and grammar locations. All paths are relative, so they resolve against
# the current working directory of the process.
# query: https://w.wiki/3tEM
# (presumably DATA_FILE holds the saved JSON result of that query -- confirm)
DATA_FILE = '../data/query.json'
# Prepended by funs() to the identifiers read from FUN_FILE to form lookup keys.
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
# Grammar name: used in both file paths below and, followed by the first
# command-line argument, as the key into the grammar's languages.
GRAMMAR_PREFIX = 'Nobel'
# Compiled grammar read with pgf.readPGF.
GRAMMAR_FILE = f'../grammars/{GRAMMAR_PREFIX}.pgf'
# JSON Lines file read by funs(); each line supplies an identifier and the
# text that is spliced into grammar expressions for it.
FUN_FILE = f'../data/{GRAMMAR_PREFIX}-funs.jsonl'
# Load all records once at import time; every description below is generated
# from this list. The file is decoded explicitly as UTF-8 so that non-ASCII
# labels are read the same way regardless of the platform's default encoding.
with open(DATA_FILE, encoding='utf-8') as file:
    data = json.load(file)
#print(data[0])
# Distinct (identifier, label) pairs occurring in the data; handy for
# inspecting which awards and countries the grammar has to cover.
awards = {(d['award'], d['awardLabel']) for d in data}
#print(awards)
#print(len(awards))
countries = {(d['country'], d['countryLabel']) for d in data}
# template-based generation in English
def pronoun(d):
    """Return the lowercase English subject pronoun for the record *d*.

    The choice depends on ``d['sexLabel']``: ``'female'`` gives ``'she'``,
    ``'male'`` gives ``'he'``, and any other value (or a missing key) gives
    ``'they'``.
    """
    sex = d.get('sexLabel', 'other')
    if sex == 'female':
        return 'she'
    if sex == 'male':
        return 'he'
    # Fallback for every other or absent label.
    return 'they'
def year(date):
    """Return the first four characters of *date*.

    Callers pass the date strings stored in the records; this assumes they
    start with a four-digit year (TODO confirm against the data). Shorter
    input is returned unchanged.
    """
    leading_four = date[0:4]
    return leading_four
# template-based generation in English
def template_description(d):
    """Return a short English description of one record, built from templates.

    The text always contains a birth sentence and an award sentence; a death
    sentence is added only when the record has a ``'deathDate'`` key. The
    sentences are separated by single spaces and each ends with a period.

    Reads ``d['personLabel']``, ``d['countryLabel']``, ``d['birthDate']``,
    ``d['awardLabel']`` and ``d['date']``; a missing key raises ``KeyError``.
    """
    person = d['personLabel']
    sentences = [
        f"{person} was born in {d['countryLabel']} in {year(d['birthDate'])}.",
        # The pronoun opens a sentence, so capitalize it here. pronoun() itself
        # stays lowercase because the grammar-based generator uses its result
        # to build a function name.
        f"{pronoun(d).capitalize()} got the {d['awardLabel']} in {year(d['date'])}.",
    ]
    if 'deathDate' in d:
        sentences.append(f"{person} died in {year(d['deathDate'])}.")
    return ' '.join(sentences)
# grammar-based generation in a given language
def name(d):
    """Return the expression text ``StringName "<label>"`` for the record *d*.

    The label is ``d['personLabel']``. Because the result is later parsed as
    part of a larger expression, backslashes and double quotes inside the label
    are backslash-escaped so they cannot terminate the string literal early.

    Raises ``KeyError`` if the record has no ``'personLabel'``.
    """
    person = d['personLabel']
    # Escape backslashes first so the ones added for quotes are not doubled.
    escaped = person.replace('\\', '\\\\').replace('"', '\\"')
    return f'StringName "{escaped}"'
def funs(funfile):
    """Load the identifier-to-function mapping from a JSON Lines file.

    Each non-blank line of *funfile* is parsed as JSON; item 0 of the parsed
    value is prefixed with ``WIKIDATA_PREFIX`` to form the key and item 1
    becomes the value. Blank lines are skipped, and the file is read as UTF-8.

    Returns the resulting ``dict``. Raises ``json.JSONDecodeError`` for a
    malformed line and ``OSError`` if the file cannot be opened.
    """
    mapping = {}
    with open(funfile, encoding='utf-8') as file:
        for line in file:
            # Tolerate empty or whitespace-only lines, e.g. at the end of the file.
            if not line.strip():
                continue
            entry = json.loads(line)
            mapping[WIKIDATA_PREFIX + entry[0]] = entry[1]
    return mapping
def country(fundata, d):
    """Return the entry of *fundata* stored under the record's ``'country'``.

    Raises ``KeyError`` if the record has no ``'country'`` or *fundata* has no
    entry for it.
    """
    country_key = d['country']
    return fundata[country_key]
def award(fundata, d):
    """Return the entry of *fundata* stored under the record's ``'award'``.

    Raises ``KeyError`` if the record has no ``'award'`` or *fundata* has no
    entry for it.
    """
    award_key = d['award']
    return fundata[award_key]
def grammar_description(grammar, fundata, d, lang):
    """Return a description of the record *d* linearized with *lang*.

    A birth sentence and an award sentence are always produced; a death
    sentence is added when the record has a ``'deathDate'`` key. Each sentence
    is written as expression text, parsed with ``pgf.readExpr``, linearized
    with ``lang.linearize`` and terminated with a period; the sentences are
    joined with single spaces.

    *fundata* supplies the expression text for the record's country and award.
    *grammar* is accepted but not used in this function.
    """
    person = name(d)
    # All expressions are parsed before any of them is linearized.
    trees = [
        pgf.readExpr(
            f"BornSentence ({person}) {country(fundata, d)} (YearDate {year(d['birthDate'])})"),
        pgf.readExpr(
            f"AwardSentence {pronoun(d)}_Name {award(fundata, d)} (YearDate {year(d['date'])})"),
    ]
    if 'deathDate' in d:
        trees.append(pgf.readExpr(
            f"DiedSentence ({person}) (YearDate {year(d['deathDate'])})"))
    return ' '.join(lang.linearize(tree) + '.' for tree in trees)
# Script entry. With a command-line argument, the argument is appended to
# GRAMMAR_PREFIX to select a language of the compiled grammar and every record
# is described through the grammar; without one, the English templates are used.
if len(sys.argv) > 1:
    grammar = pgf.readPGF(GRAMMAR_FILE)
    fundata = funs(FUN_FILE)
    language_name = GRAMMAR_PREFIX + sys.argv[1]
    lang = grammar.languages[language_name]
    # Lazy, so each description is generated right before it is printed.
    descriptions = (grammar_description(grammar, fundata, d, lang) for d in data)
else:
    descriptions = map(template_description, data)
for text in descriptions:
    print(text)