start files for lab2

This commit is contained in:
Aarne Ranta
2025-05-04 09:38:01 +02:00
parent 31239a3d63
commit 0671eee0ba
26 changed files with 636 additions and 0 deletions

View File

@@ -0,0 +1,49 @@
import json

# query: https://w.wiki/3tEM
DATA_FILE = 'query.json'

# Load the Wikidata query result: a JSON list of dicts, one per result row.
with open(DATA_FILE) as file:
    data = json.load(file)

# Quick sanity check of the first record's shape.
print(data[0])

# Distinct (entity URI, label) pairs occurring in the data.
awards = {(d['award'], d['awardLabel']) for d in data}
#print(awards)
#print(len(awards))
countries = {(d['country'], d['countryLabel']) for d in data}
#print(countries)
#print(len(countries))
#print(data[0].keys())
def pronoun(d):
    """Return the English subject pronoun for record d, chosen by 'sexLabel'.

    'female' -> 'she', 'male' -> 'he', anything else (or missing) -> 'they'.
    """
    by_sex = {'female': 'she', 'male': 'he'}
    return by_sex.get(d.get('sexLabel', 'other'), 'they')
def year(date):
    """Return the first four characters of an ISO-8601 date string (the year)."""
    yyyy = date[0:4]
    return yyyy
def person_descr(d):
    """Render an English description of one laureate record d.

    Expects keys personLabel, countryLabel, birthDate, awardLabel, date;
    deathDate and sexLabel are optional.
    """
    # Fix: the death sentence used to be concatenated without a separating
    # space and had no final period ("...in 1903.Marie Curie died 1934").
    died = f" {d['personLabel']} died {year(d['deathDate'])}." if 'deathDate' in d else ''
    return (
        f"{d['personLabel']} from {d['countryLabel']} was born in {year(d['birthDate'])}. " +
        # Fix: capitalize the pronoun — it starts a new sentence.
        f"{pronoun(d).capitalize()} got {d['awardLabel']} in {year(d['date'])}." +
        died
    )
# Print one description line per result row.
for d in data:
    print(person_descr(d))

101
lab2/scripts/find_labels.py Normal file
View File

@@ -0,0 +1,101 @@
# https://www.wikidata.org/wiki/Special:EntityData/Q18644475.json
import urllib.request
import json
import sys
import ssl
from gf_utils import *
"""
To collect labels from query.json (Wikidata query result) and extract grammars:
python3 find_labels.py init >labels.jsonl
python3 find_labels.py abstract >Labels.gf
python3 find_labels.py en >LabelsEng.gf
"""
WIKIDATA_FILE = '../data/query.json'
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/'
NOBEL_FIELDS = ['award', 'country']  # fields whose QIDs get labels collected
LABEL_FILE = '../data/labels.jsonl'
USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)'

# Pick the mode from the command line; the script cannot proceed without one.
if sys.argv[1:]:
    MODE = sys.argv[1]
else:
    print(USAGE)
    # Fix: execution previously fell through with MODE undefined, causing a
    # NameError at the dispatch at the bottom of the file; exit explicitly.
    sys.exit(1)
# qids given in the data file
def get_wikidata_qids(jsonfile, fields):
    """Collect the set of (field, QID) pairs mentioned in a query-result file.

    jsonfile -- path to a JSON list of dicts (Wikidata query result)
    fields   -- which keys of each dict to inspect; values are entity URIs,
                from which the leading entity prefix is stripped
    """
    with open(jsonfile) as file:
        records = json.load(file)
    prefix_len = len(WIKIDATA_PREFIX)
    return {(field, record[field][prefix_len:])
            for record in records
            for field in fields
            if field in record}
qids = get_wikidata_qids(WIKIDATA_FILE, NOBEL_FIELDS)

# NOTE(review): '__mainz__' can never equal __name__, so this debug dump is
# effectively disabled — presumably on purpose; confirm before "fixing" it.
if __name__ == '__mainz__':
    for qid in qids:
        print(qid)
# get all wikidata for each qid
# use this only once, because it is slow
def get_wikidata_json(qids):
    """Yield (field, qid, parsed JSON) for each (field, qid) pair, fetched live.

    Failures are non-fatal: a failing entity is reported on stderr and
    skipped, so one bad QID does not abort the whole (slow) download.
    """
    # NOTE(review): certificate verification is disabled — acceptable for a
    # one-off course script, not for production use.
    context = ssl._create_unverified_context()
    for field, qid in qids:
        try:
            with urllib.request.urlopen(WIKIDATA_URL_PREFIX + qid + '.json', context=context) as url:
                data = json.load(url)
                yield (field, qid, data)
        except Exception as error:
            # Fix: was a silent `pass` with `error` unused; keep the
            # best-effort behavior but say what was skipped and why.
            print(f'skipping {qid}: {error}', file=sys.stderr)
# extract the labels, redirect to LABEL_FILE (only once)
def get_wikidata_labels(data, languages=None):
    """Print one JSON line per entity: {qid: {lang: label, ..., 'field': field}}.

    data      -- iterable of (field, qid, entity_json) triples
    languages -- optional collection of language codes to keep; None keeps all
    """
    for field, qid, entity_json in data:
        # Fix: the loop variable used to be named `dict`, shadowing the builtin.
        entities = entity_json.get('entities', {'foo': {}})
        entity = list(entities.values())[0]
        raw_labels = entity.get('labels', {})
        labels = {val['language']: val['value']
                  for val in raw_labels.values()
                  if (languages is None) or
                     (val['language'] in languages)}
        labels['field'] = field
        print(json.dumps({qid: labels}, ensure_ascii=False))
# {"Q800": {"tg": "Коста Рика", "sk": "Kostarika", ... "field": <field>}}
def extract_labels(labeldata, mode):
    """Print one GF rule per label entry.

    labeldata -- list of {qid: labels} dicts as produced by get_wikidata_labels
    mode      -- 'abstract' for fun rules, otherwise a language code for lin rules
    """
    # Fix: the loop iterated the module-level `data` instead of the parameter,
    # leaving `labeldata` dead and the function dependent on global state.
    for entry in labeldata:
        qid, labels = list(entry.items())[0]
        eng = labels.get('en', 'X')
        cat = labels['field'].capitalize()
        fun = mk_fun_from_strs([qid, eng, cat])
        if mode == 'abstract':
            print(mk_fun_rule(fun, cat))
        else:
            # Fall back to English, then to 'X', when the language is missing.
            lin = labels.get(mode, labels.get('en', 'X'))
            oper = 'mk' + cat
            print(mk_lin_rule(fun, mk_lin(oper, [lin], [])))
# Dispatch on the command-line mode.
if MODE == 'init':
    # do this only once, redirect to labels.jsonl
    data = get_wikidata_json(list(qids))
    get_wikidata_labels(data)
else:
    # do this once for abs and for every language you want
    with open(LABEL_FILE) as file:
        data = [json.loads(line) for line in file]
    extract_labels(data, MODE)

View File

@@ -0,0 +1,101 @@
# https://www.wikidata.org/wiki/Special:EntityData/Q18644475.json
import urllib.request
import json
import sys
import ssl
from gf_utils import *
"""
To collect labels from query.json (Wikidata query result) and extract grammars:
python3 find_labels.py init >labels.jsonl
python3 find_labels.py abstract >Labels.gf
python3 find_labels.py en >LabelsEng.gf
"""
WIKIDATA_FILE = 'query.json'
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/'
NOBEL_FIELDS = ['award', 'country']  # fields whose QIDs get labels collected
LABEL_FILE = 'labels.jsonl'
USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)'

# Pick the mode from the command line; the script cannot proceed without one.
if sys.argv[1:]:
    MODE = sys.argv[1]
else:
    print(USAGE)
    # Fix: execution previously fell through with MODE undefined, causing a
    # NameError at the dispatch at the bottom of the file; exit explicitly.
    sys.exit(1)
# qids given in the data file
def get_wikidata_qids(jsonfile, fields):
    """Collect the set of (field, QID) pairs mentioned in a query-result file.

    jsonfile -- path to a JSON list of dicts (Wikidata query result)
    fields   -- which keys of each dict to inspect; values are entity URIs,
                from which the leading entity prefix is stripped
    """
    with open(jsonfile) as file:
        records = json.load(file)
    prefix_len = len(WIKIDATA_PREFIX)
    return {(field, record[field][prefix_len:])
            for record in records
            for field in fields
            if field in record}
qids = get_wikidata_qids(WIKIDATA_FILE, NOBEL_FIELDS)

# NOTE(review): '__mainz__' can never equal __name__, so this debug dump is
# effectively disabled — presumably on purpose; confirm before "fixing" it.
if __name__ == '__mainz__':
    for qid in qids:
        print(qid)
# get all wikidata for each qid
# use this only once, because it is slow
def get_wikidata_json(qids):
    """Yield (field, qid, parsed JSON) for each (field, qid) pair, fetched live.

    Failures are non-fatal: a failing entity is reported on stderr and
    skipped, so one bad QID does not abort the whole (slow) download.
    """
    # NOTE(review): certificate verification is disabled — acceptable for a
    # one-off course script, not for production use.
    context = ssl._create_unverified_context()
    for field, qid in qids:
        try:
            with urllib.request.urlopen(WIKIDATA_URL_PREFIX + qid + '.json', context=context) as url:
                data = json.load(url)
                yield (field, qid, data)
        except Exception as error:
            # Fix: was a silent `pass` with `error` unused; keep the
            # best-effort behavior but say what was skipped and why.
            print(f'skipping {qid}: {error}', file=sys.stderr)
# extract the labels, redirect to LABEL_FILE (only once)
def get_wikidata_labels(data, languages=None):
    """Print one JSON line per entity: {qid: {lang: label, ..., 'field': field}}.

    data      -- iterable of (field, qid, entity_json) triples
    languages -- optional collection of language codes to keep; None keeps all
    """
    for field, qid, entity_json in data:
        # Fix: the loop variable used to be named `dict`, shadowing the builtin.
        entities = entity_json.get('entities', {'foo': {}})
        entity = list(entities.values())[0]
        raw_labels = entity.get('labels', {})
        labels = {val['language']: val['value']
                  for val in raw_labels.values()
                  if (languages is None) or
                     (val['language'] in languages)}
        labels['field'] = field
        print(json.dumps({qid: labels}, ensure_ascii=False))
# {"Q800": {"tg": "Коста Рика", "sk": "Kostarika", ... "field": <field>}}
def extract_labels(labeldata, mode):
    """Print one GF rule per label entry.

    labeldata -- list of {qid: labels} dicts as produced by get_wikidata_labels
    mode      -- 'abstract' for fun rules, otherwise a language code for lin rules
    """
    # Fix: the loop iterated the module-level `data` instead of the parameter,
    # leaving `labeldata` dead and the function dependent on global state.
    for entry in labeldata:
        qid, labels = list(entry.items())[0]
        eng = labels.get('en', 'X')
        cat = labels['field'].capitalize()
        fun = mk_fun_from_strs([qid, eng, cat])
        if mode == 'abstract':
            print(mk_fun_rule(fun, cat))
        else:
            # Fall back to English, then to 'X', when the language is missing.
            lin = labels.get(mode, labels.get('en', 'X'))
            oper = 'mk' + cat
            print(mk_lin_rule(fun, mk_lin(oper, [lin], [])))
# Dispatch on the command-line mode.
if MODE == 'init':
    # do this only once, redirect to labels.jsonl
    data = get_wikidata_json(list(qids))
    get_wikidata_labels(data)
else:
    # do this once for abs and for every language you want
    with open(LABEL_FILE) as file:
        data = [json.loads(line) for line in file]
    extract_labels(data, MODE)

48
lab2/scripts/gf_utils.py Normal file
View File

@@ -0,0 +1,48 @@
# import pgf
def mk_fun(s):
    """Turn s into a legal GF identifier, quoting it if necessary.

    Spaces become underscores; if the result is not a plain identifier
    (leading letter, then Latin-1 letters/digits/underscore/apostrophe) it is
    wrapped in single quotes with internal quotes escaped.
    """
    s = '_'.join(s.split())  # spaces replaced by underscores
    # Fix: guard against the empty string, which used to raise IndexError on s[0].
    if (s and s[0].isalpha() and
        all(ord(c) < 256 and (c.isdigit() or c.isalpha() or c in "_'")
            for c in s)):  # test if legal GF identifier
        return s
    else:
        return "'" + s.replace("'", "\\'") + "'"  # single quotes make it legal
def mk_fun_from_strs(ss):
    """Build a legal GF identifier from the strings ss, joined by underscores."""
    joined = '_'.join(ss)
    return mk_fun(joined)
def quote(s):
    """Wrap s in double quotes (no escaping of embedded quotes)."""
    return f'"{s}"'
def app(fun, args):
    """Render a GF application: fun, the first argument quoted, the rest verbatim."""
    parts = [fun, quote(args[0])]
    parts.extend(args[1:])
    return ' '.join(parts)
def empty_variants():
    """Return the GF expression denoting an empty variants list."""
    return 'variants {}'
def mk_lin(oper, words, params):
    """Apply oper to the quoted words followed by the raw params."""
    quoted = [quote(w) for w in words]
    return ' '.join([oper, *quoted, *params])
def mk_cat_rule(cat):
    """GF category declaration: 'cat <cat> ;' with a trailing newline."""
    return f'cat {cat} ;\n'
def mk_fun_rule(fun, cat, comment=None):
    """GF abstract rule: 'fun <fun> : <cat> ;' plus an optional '--' comment."""
    trailer = '--' + comment if comment else ''
    return f'fun {fun} : {cat} ; {trailer}'
def mk_lin_rule(fun, lin, comment=None):
    """GF concrete rule: 'lin <fun> = <lin> ;' plus an optional '--' comment."""
    trailer = '--' + comment if comment else ''
    return f'lin {fun} = {lin} ; {trailer}'
def mk_lincat_rule(cat, lin):
    """GF lincat rule: 'lincat <cat> = <lin> ;'."""
    return f'lincat {cat} = {lin} ;'