everything in place for Lab 2

This commit is contained in:
Aarne Ranta
2025-05-04 12:14:37 +02:00
parent cc604093d0
commit 6ea7d5d838
10 changed files with 403 additions and 72 deletions

View File

@@ -1,49 +0,0 @@
import json
# query: https://w.wiki/3tEM
DATA_FILE = 'query.json'

# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(DATA_FILE, encoding='utf-8') as file:
    data = json.load(file)

# Show a sample record; skipped when the query result is empty.
if data:
    print(data[0])

# Distinct (identifier, label) pairs occurring in the query result.
awards = {(d['award'], d['awardLabel']) for d in data}
countries = {(d['country'], d['countryLabel']) for d in data}
def pronoun(d):
    """Return the English subject pronoun for the record *d*.

    The optional 'sexLabel' entry decides: 'female' gives 'she', 'male'
    gives 'he', and any other value (or a missing entry) gives 'they'.
    """
    label = d.get('sexLabel', 'other')
    if label == 'female':
        return 'she'
    if label == 'male':
        return 'he'
    return 'they'
def year(date):
    """Return the first four characters of *date*.

    NOTE(review): presumably *date* is an ISO-style string that starts
    with a four-digit year; confirm against the query result.
    """
    first_four = date[0:4]
    return first_four
def person_descr(d):
    """Describe one record of the query result in English.

    Reads 'personLabel', 'countryLabel', 'birthDate', 'awardLabel' and
    'date' from *d* (a missing key raises KeyError) and, when present,
    'deathDate'. Returns the sentences joined by single spaces, each one
    ending in a period.
    """
    person = d['personLabel']
    sentences = [
        f"{person} from {d['countryLabel']} was born in {year(d['birthDate'])}.",
        # The pronoun opens a sentence, so it needs an initial capital.
        f"{pronoun(d).capitalize()} got {d['awardLabel']} in {year(d['date'])}.",
    ]
    if 'deathDate' in d:
        sentences.append(f"{person} died in {year(d['deathDate'])}.")
    return ' '.join(sentences)
# Emit one description line per record of the query result.
for d in data:
    description = person_descr(d)
    print(description)

View File

@@ -0,0 +1,95 @@
import sys
import json
import pgf
# query: https://w.wiki/3tEM
DATA_FILE = '../data/query.json'
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
GRAMMAR_PREFIX = 'Nobel'
GRAMMAR_FILE = f'../grammars/{GRAMMAR_PREFIX}.pgf'
FUN_FILE = f'../data/{GRAMMAR_PREFIX}-funs.jsonl'

# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(DATA_FILE, encoding='utf-8') as file:
    data = json.load(file)

# Distinct (identifier, label) pairs occurring in the query result.
awards = {(d['award'], d['awardLabel']) for d in data}
countries = {(d['country'], d['countryLabel']) for d in data}
# helpers shared by the template-based and grammar-based generators
def pronoun(d):
    """Return 'she', 'he' or 'they' for the record *d*.

    The choice follows the optional 'sexLabel' entry: 'female' and 'male'
    map to 'she' and 'he'; every other value, and a missing entry, maps to
    'they'. The result stays lower-case because grammar_description also
    uses it to build a function name of the form '<pronoun>_Name'.
    """
    sex = d.get('sexLabel', 'other')
    return 'she' if sex == 'female' else ('he' if sex == 'male' else 'they')
def year(date):
    """Return the leading four characters of the string *date*.

    NOTE(review): presumably the input is an ISO-style date whose first
    four characters are the year; confirm against the query result.
    """
    leading = date[0:4]
    return leading
# template-based generation in English
def template_description(d):
    """Describe one record of the query result in English using templates.

    Reads 'personLabel', 'countryLabel', 'birthDate', 'awardLabel' and
    'date' from *d* (a missing key raises KeyError) and, when present,
    'deathDate'. Returns the sentences joined by single spaces, each one
    ending in a period.
    """
    person = d['personLabel']
    sentences = [
        f"{person} was born in {d['countryLabel']} in {year(d['birthDate'])}.",
        # The pronoun opens a sentence, so it needs an initial capital.
        f"{pronoun(d).capitalize()} got the {d['awardLabel']} in {year(d['date'])}.",
    ]
    if 'deathDate' in d:
        sentences.append(f"{person} died in {year(d['deathDate'])}.")
    return ' '.join(sentences)
# grammar-based generation in a given language
def name(d):
    """Return the source text of a 'StringName' expression for the person in *d*.

    The value of d['personLabel'] is embedded as a double-quoted string
    literal. Backslashes and double quotes inside the label are
    backslash-escaped so that a label containing them cannot end the
    literal early and break expression parsing.
    """
    # Escape backslashes first so the ones added for quotes are not doubled.
    person = d['personLabel'].replace('\\', '\\\\').replace('"', '\\"')
    return f'StringName "{person}"'
def funs(funfile):
    """Load the entity-to-function table from the JSON Lines file *funfile*.

    Every non-blank line is parsed as JSON; item 0 is prefixed with
    WIKIDATA_PREFIX to form the key and item 1 becomes the value.
    Returns the resulting dict.
    """
    table = {}
    # Read as UTF-8 explicitly: the entries may contain non-ASCII characters.
    with open(funfile, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            entry = json.loads(line)
            table[WIKIDATA_PREFIX + entry[0]] = entry[1]
    return table
def country(fundata, d):
    """Return the *fundata* entry stored under the 'country' value of *d*.

    Raises KeyError when *d* has no 'country' key or *fundata* has no
    entry for its value.
    """
    country_key = d['country']
    return fundata[country_key]
def award(fundata, d):
    """Return the *fundata* entry stored under the 'award' value of *d*.

    Raises KeyError when *d* has no 'award' key or *fundata* has no entry
    for its value.
    """
    award_key = d['award']
    return fundata[award_key]
def grammar_description(grammar, fundata, d, lang):
    """Describe the record *d* by linearizing grammar expressions with *lang*.

    Builds a BornSentence and an AwardSentence expression, plus a
    DiedSentence when *d* has a 'deathDate'. Each is parsed with
    pgf.readExpr and linearized with lang.linearize, then given a final
    period. The sentences are returned joined by single spaces.

    *fundata* supplies the country and award function names. *grammar* is
    accepted but not used in the body.
    """
    trees = [
        pgf.readExpr(
            f"BornSentence ({name(d)}) {country(fundata, d)} (YearDate {year(d['birthDate'])})"),
        pgf.readExpr(
            f"AwardSentence {pronoun(d)}_Name {award(fundata, d)} (YearDate {year(d['date'])})"),
    ]
    if 'deathDate' in d:
        trees.append(pgf.readExpr(
            f"DiedSentence ({name(d)}) (YearDate {year(d['deathDate'])})"))
    # All expressions are parsed before the first one is linearized.
    return ' '.join(lang.linearize(tree) + '.' for tree in trees)
# Entry point. With a command-line argument, the grammar is loaded and the
# concrete syntax named GRAMMAR_PREFIX + argument is used for generation;
# without one, the English templates are used.
if sys.argv[1:]:
    grammar = pgf.readPGF(GRAMMAR_FILE)
    fundata = funs(FUN_FILE)
    lang_name = GRAMMAR_PREFIX + sys.argv[1]
    try:
        lang = grammar.languages[lang_name]
    except KeyError:
        # Report the valid choices instead of a bare KeyError traceback.
        available = ', '.join(sorted(grammar.languages))
        sys.exit(f'unknown language {lang_name}; available: {available}')
    for d in data:
        print(grammar_description(grammar, fundata, d, lang))
else:
    for d in data:
        print(template_description(d))

View File

@@ -8,9 +8,10 @@ from gf_utils import *
"""
To collect labels from query.json (Wikidata query result) and extract grammars:
python3 find_labels.py init >labels.jsonl
python3 find_labels.py abstract >Labels.gf
python3 find_labels.py en >LabelsEng.gf
python3 find_labels.py init >../data/labels.jsonl
python3 find_labels.py funs >../data/funs.jsonl
python3 find_labels.py abstract >../data/Labels.gf
python3 find_labels.py en >../data/LabelsEng.gf
"""
WIKIDATA_FILE = '../data/query.json'
@@ -19,7 +20,7 @@ WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/'
NOBEL_FIELDS = ['award', 'country']
LABEL_FILE = '../data/labels.jsonl'
USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)'
USAGE = 'usage: find_labels.py (init | funs | abstract | en | sv | fi | ...)'
if sys.argv[1:]:
MODE = sys.argv[1]
@@ -78,7 +79,9 @@ def extract_labels(labeldata, mode):
eng = labels.get('en', 'X')
cat = labels['field'].capitalize()
fun = mk_fun_from_strs([qid, eng, cat])
if mode == 'abstract':
if mode == 'funs':
print(json.dumps([qid, fun], ensure_ascii=False))
elif mode == 'abstract':
print(mk_fun_rule(fun, cat))
else:
lin = labels.get(mode, labels.get('en', 'X'))