forked from GitHub/comp-syntax-gu-mlt
start files for lab2
This commit is contained in:
112
lab2/data/labels.jsonl
Normal file
112
lab2/data/labels.jsonl
Normal file
File diff suppressed because one or more lines are too long
1
lab2/data/query.json
Normal file
1
lab2/data/query.json
Normal file
File diff suppressed because one or more lines are too long
112
lab2/grammars/Labels.gf
Normal file
112
lab2/grammars/Labels.gf
Normal file
@@ -0,0 +1,112 @@
|
||||
fun Q800_Costa_Rica_Country : Country ;
|
||||
fun Q219060_State_of_Palestine_Country : Country ;
|
||||
fun Q37_Lithuania_Country : Country ;
|
||||
fun Q137816_Taiwan_under_Japanese_rule_Country : Country ;
|
||||
fun Q1028_Morocco_Country : Country ;
|
||||
fun Q796_Iraq_Country : Country ;
|
||||
fun Q184_Belarus_Country : Country ;
|
||||
fun Q225_Bosnia_and_Herzegovina_Country : Country ;
|
||||
fun Q20_Norway_Country : Country ;
|
||||
fun Q211_Latvia_Country : Country ;
|
||||
fun Q117_Ghana_Country : Country ;
|
||||
fun Q39_Switzerland_Country : Country ;
|
||||
fun Q159631_Kingdom_of_Württemberg_Country : Country ;
|
||||
fun Q17_Japan_Country : Country ;
|
||||
fun Q189_Iceland_Country : Country ;
|
||||
fun Q221_North_Macedonia_Country : Country ;
|
||||
fun Q9683_Tang_dynasty_Country : Country ;
|
||||
fun Q79_Egypt_Country : Country ;
|
||||
fun Q408_Australia_Country : Country ;
|
||||
fun Q4628_Faroe_Islands_Country : Country ;
|
||||
fun Q145_United_Kingdom_Country : Country ;
|
||||
fun Q214_Slovakia_Country : Country ;
|
||||
fun Q16_Canada_Country : Country ;
|
||||
fun Q924_Tanzania_Country : Country ;
|
||||
fun Q55502_Kingdom_of_Jerusalem_Country : Country ;
|
||||
fun Q183_Germany_Country : Country ;
|
||||
fun Q754_Trinidad_and_Tobago_Country : Country ;
|
||||
fun Q298_Chile_Country : Country ;
|
||||
fun Q41_Greece_Country : Country ;
|
||||
fun Q30623_Manchukuo_Country : Country ;
|
||||
fun Q774_Guatemala_Country : Country ;
|
||||
fun Q836_Myanmar_Country : Country ;
|
||||
fun Q902_Bangladesh_Country : Country ;
|
||||
fun Q215_Slovenia_Country : Country ;
|
||||
fun Q7313_Yuan_dynasty_Country : Country ;
|
||||
fun Q822_Lebanon_Country : Country ;
|
||||
fun Q12548_Holy_Roman_Empire_Country : Country ;
|
||||
fun Q12407080_early_Islamic_period_in_Palestine_Country : Country ;
|
||||
fun Q717_Venezuela_Country : Country ;
|
||||
fun Q31_Belgium_Country : Country ;
|
||||
fun Q794_Iran_Country : Country ;
|
||||
fun Q43_Turkey_Country : Country ;
|
||||
fun Q948_Tunisia_Country : Country ;
|
||||
fun Q258_South_Africa_Country : Country ;
|
||||
fun Q28_Hungary_Country : Country ;
|
||||
fun Q80061_Nobel_Prize_in_Physiology_or_Medicine_Award : Award ;
|
||||
fun Q142_France_Country : Country ;
|
||||
fun Q805_Yemen_Country : Country ;
|
||||
fun Q881_Vietnam_Country : Country ;
|
||||
fun Q7462_Song_dynasty_Country : Country ;
|
||||
fun Q12544_Byzantine_Empire_Country : Country ;
|
||||
fun Q664_New_Zealand_Country : Country ;
|
||||
fun Q33_Finland_Country : Country ;
|
||||
fun Q282428_Mamluk_Sultanate_Country : Country ;
|
||||
fun Q38104_Nobel_Prize_in_Physics_Award : Award ;
|
||||
fun Q9903_Ming_dynasty_Country : Country ;
|
||||
fun Q739_Colombia_Country : Country ;
|
||||
fun Q13426199_Republic_of_China_Country : Country ;
|
||||
fun Q55_Netherlands_Country : Country ;
|
||||
fun Q159_Russia_Country : Country ;
|
||||
fun Q27_Ireland_Country : Country ;
|
||||
fun Q48685_Kingdom_of_Judah_Country : Country ;
|
||||
fun Q810_Jordan_Country : Country ;
|
||||
fun Q36_Poland_Country : Country ;
|
||||
fun Q1014_Liberia_Country : Country ;
|
||||
fun Q38872_Prussia_Country : Country ;
|
||||
fun 'Q574_Timor-Leste_Country' : Country ;
|
||||
fun Q974_Democratic_Republic_of_the_Congo_Country : Country ;
|
||||
fun Q15843470_Roman_Palestine_Country : Country ;
|
||||
fun Q40_Austria_Country : Country ;
|
||||
fun Q928_Philippines_Country : Country ;
|
||||
fun Q148_People's_Republic_of_China_Country : Country ;
|
||||
fun Q35_Denmark_Country : Country ;
|
||||
fun Q954_Zimbabwe_Country : Country ;
|
||||
fun Q216173_Free_City_of_Danzig_Country : Country ;
|
||||
fun Q227_Azerbaijan_Country : Country ;
|
||||
fun Q252_Indonesia_Country : Country ;
|
||||
fun Q801_Israel_Country : Country ;
|
||||
fun Q155_Brazil_Country : Country ;
|
||||
fun Q29_Spain_Country : Country ;
|
||||
fun Q7075820_Occupied_Enemy_Territory_Administration_Country : Country ;
|
||||
fun Q2685298_Romanian_People's_Republic_Country : Country ;
|
||||
fun Q45_Portugal_Country : Country ;
|
||||
fun Q32_Luxembourg_Country : Country ;
|
||||
fun Q115_Ethiopia_Country : Country ;
|
||||
fun Q193714_Mandatory_Palestine_Country : Country ;
|
||||
fun Q34_Sweden_Country : Country ;
|
||||
fun Q262_Algeria_Country : Country ;
|
||||
fun Q37922_Nobel_Prize_in_Literature_Award : Award ;
|
||||
fun Q843_Pakistan_Country : Country ;
|
||||
fun Q35637_Nobel_Peace_Prize_Award : Award ;
|
||||
fun Q1033_Nigeria_Country : Country ;
|
||||
fun Q38_Italy_Country : Country ;
|
||||
fun Q668_India_Country : Country ;
|
||||
fun Q496922_Hasmonean_dynasty_Country : Country ;
|
||||
fun Q212_Ukraine_Country : Country ;
|
||||
fun Q44585_Nobel_Prize_in_Chemistry_Award : Award ;
|
||||
fun Q760_Saint_Lucia_Country : Country ;
|
||||
fun Q414_Argentina_Country : Country ;
|
||||
fun Q218_Romania_Country : Country ;
|
||||
fun Q213_Czech_Republic_Country : Country ;
|
||||
fun Q219_Bulgaria_Country : Country ;
|
||||
fun Q12560_Ottoman_Empire_Country : Country ;
|
||||
fun Q224_Croatia_Country : Country ;
|
||||
fun Q419_Peru_Country : Country ;
|
||||
fun Q1019_Madagascar_Country : Country ;
|
||||
fun Q30_United_States_Country : Country ;
|
||||
fun Q180114_Ayyubid_dynasty_Country : Country ;
|
||||
fun Q8733_Qing_dynasty_Country : Country ;
|
||||
fun Q96_Mexico_Country : Country ;
|
||||
fun Q884_South_Korea_Country : Country ;
|
||||
fun Q114_Kenya_Country : Country ;
|
||||
112
lab2/grammars/LabelsEng.gf
Normal file
112
lab2/grammars/LabelsEng.gf
Normal file
@@ -0,0 +1,112 @@
|
||||
lin Q800_Costa_Rica_Country = mkCountry "Costa Rica" ;
|
||||
lin Q219060_State_of_Palestine_Country = mkCountry "State of Palestine" ;
|
||||
lin Q37_Lithuania_Country = mkCountry "Lithuania" ;
|
||||
lin Q137816_Taiwan_under_Japanese_rule_Country = mkCountry "Taiwan under Japanese rule" ;
|
||||
lin Q1028_Morocco_Country = mkCountry "Morocco" ;
|
||||
lin Q796_Iraq_Country = mkCountry "Iraq" ;
|
||||
lin Q184_Belarus_Country = mkCountry "Belarus" ;
|
||||
lin Q225_Bosnia_and_Herzegovina_Country = mkCountry "Bosnia and Herzegovina" ;
|
||||
lin Q20_Norway_Country = mkCountry "Norway" ;
|
||||
lin Q211_Latvia_Country = mkCountry "Latvia" ;
|
||||
lin Q117_Ghana_Country = mkCountry "Ghana" ;
|
||||
lin Q39_Switzerland_Country = mkCountry "Switzerland" ;
|
||||
lin Q159631_Kingdom_of_Württemberg_Country = mkCountry "Kingdom of Württemberg" ;
|
||||
lin Q17_Japan_Country = mkCountry "Japan" ;
|
||||
lin Q189_Iceland_Country = mkCountry "Iceland" ;
|
||||
lin Q221_North_Macedonia_Country = mkCountry "North Macedonia" ;
|
||||
lin Q9683_Tang_dynasty_Country = mkCountry "Tang dynasty" ;
|
||||
lin Q79_Egypt_Country = mkCountry "Egypt" ;
|
||||
lin Q408_Australia_Country = mkCountry "Australia" ;
|
||||
lin Q4628_Faroe_Islands_Country = mkCountry "Faroe Islands" ;
|
||||
lin Q145_United_Kingdom_Country = mkCountry "United Kingdom" ;
|
||||
lin Q214_Slovakia_Country = mkCountry "Slovakia" ;
|
||||
lin Q16_Canada_Country = mkCountry "Canada" ;
|
||||
lin Q924_Tanzania_Country = mkCountry "Tanzania" ;
|
||||
lin Q55502_Kingdom_of_Jerusalem_Country = mkCountry "Kingdom of Jerusalem" ;
|
||||
lin Q183_Germany_Country = mkCountry "Germany" ;
|
||||
lin Q754_Trinidad_and_Tobago_Country = mkCountry "Trinidad and Tobago" ;
|
||||
lin Q298_Chile_Country = mkCountry "Chile" ;
|
||||
lin Q41_Greece_Country = mkCountry "Greece" ;
|
||||
lin Q30623_Manchukuo_Country = mkCountry "Manchukuo" ;
|
||||
lin Q774_Guatemala_Country = mkCountry "Guatemala" ;
|
||||
lin Q836_Myanmar_Country = mkCountry "Myanmar" ;
|
||||
lin Q902_Bangladesh_Country = mkCountry "Bangladesh" ;
|
||||
lin Q215_Slovenia_Country = mkCountry "Slovenia" ;
|
||||
lin Q7313_Yuan_dynasty_Country = mkCountry "Yuan dynasty" ;
|
||||
lin Q822_Lebanon_Country = mkCountry "Lebanon" ;
|
||||
lin Q12548_Holy_Roman_Empire_Country = mkCountry "Holy Roman Empire" ;
|
||||
lin Q12407080_early_Islamic_period_in_Palestine_Country = mkCountry "early Islamic period in Palestine" ;
|
||||
lin Q717_Venezuela_Country = mkCountry "Venezuela" ;
|
||||
lin Q31_Belgium_Country = mkCountry "Belgium" ;
|
||||
lin Q794_Iran_Country = mkCountry "Iran" ;
|
||||
lin Q43_Turkey_Country = mkCountry "Turkey" ;
|
||||
lin Q948_Tunisia_Country = mkCountry "Tunisia" ;
|
||||
lin Q258_South_Africa_Country = mkCountry "South Africa" ;
|
||||
lin Q28_Hungary_Country = mkCountry "Hungary" ;
|
||||
lin Q80061_Nobel_Prize_in_Physiology_or_Medicine_Award = mkAward "Nobel Prize in Physiology or Medicine" ;
|
||||
lin Q142_France_Country = mkCountry "France" ;
|
||||
lin Q805_Yemen_Country = mkCountry "Yemen" ;
|
||||
lin Q881_Vietnam_Country = mkCountry "Vietnam" ;
|
||||
lin Q7462_Song_dynasty_Country = mkCountry "Song dynasty" ;
|
||||
lin Q12544_Byzantine_Empire_Country = mkCountry "Byzantine Empire" ;
|
||||
lin Q664_New_Zealand_Country = mkCountry "New Zealand" ;
|
||||
lin Q33_Finland_Country = mkCountry "Finland" ;
|
||||
lin Q282428_Mamluk_Sultanate_Country = mkCountry "Mamluk Sultanate" ;
|
||||
lin Q38104_Nobel_Prize_in_Physics_Award = mkAward "Nobel Prize in Physics" ;
|
||||
lin Q9903_Ming_dynasty_Country = mkCountry "Ming dynasty" ;
|
||||
lin Q739_Colombia_Country = mkCountry "Colombia" ;
|
||||
lin Q13426199_Republic_of_China_Country = mkCountry "Republic of China" ;
|
||||
lin Q55_Netherlands_Country = mkCountry "Netherlands" ;
|
||||
lin Q159_Russia_Country = mkCountry "Russia" ;
|
||||
lin Q27_Ireland_Country = mkCountry "Ireland" ;
|
||||
lin Q48685_Kingdom_of_Judah_Country = mkCountry "Kingdom of Judah" ;
|
||||
lin Q810_Jordan_Country = mkCountry "Jordan" ;
|
||||
lin Q36_Poland_Country = mkCountry "Poland" ;
|
||||
lin Q1014_Liberia_Country = mkCountry "Liberia" ;
|
||||
lin Q38872_Prussia_Country = mkCountry "Prussia" ;
|
||||
lin 'Q574_Timor-Leste_Country' = mkCountry "Timor-Leste" ;
|
||||
lin Q974_Democratic_Republic_of_the_Congo_Country = mkCountry "Democratic Republic of the Congo" ;
|
||||
lin Q15843470_Roman_Palestine_Country = mkCountry "Roman Palestine" ;
|
||||
lin Q40_Austria_Country = mkCountry "Austria" ;
|
||||
lin Q928_Philippines_Country = mkCountry "Philippines" ;
|
||||
lin Q148_People's_Republic_of_China_Country = mkCountry "People's Republic of China" ;
|
||||
lin Q35_Denmark_Country = mkCountry "Denmark" ;
|
||||
lin Q954_Zimbabwe_Country = mkCountry "Zimbabwe" ;
|
||||
lin Q216173_Free_City_of_Danzig_Country = mkCountry "Free City of Danzig" ;
|
||||
lin Q227_Azerbaijan_Country = mkCountry "Azerbaijan" ;
|
||||
lin Q252_Indonesia_Country = mkCountry "Indonesia" ;
|
||||
lin Q801_Israel_Country = mkCountry "Israel" ;
|
||||
lin Q155_Brazil_Country = mkCountry "Brazil" ;
|
||||
lin Q29_Spain_Country = mkCountry "Spain" ;
|
||||
lin Q7075820_Occupied_Enemy_Territory_Administration_Country = mkCountry "Occupied Enemy Territory Administration" ;
|
||||
lin Q2685298_Romanian_People's_Republic_Country = mkCountry "Romanian People's Republic" ;
|
||||
lin Q45_Portugal_Country = mkCountry "Portugal" ;
|
||||
lin Q32_Luxembourg_Country = mkCountry "Luxembourg" ;
|
||||
lin Q115_Ethiopia_Country = mkCountry "Ethiopia" ;
|
||||
lin Q193714_Mandatory_Palestine_Country = mkCountry "Mandatory Palestine" ;
|
||||
lin Q34_Sweden_Country = mkCountry "Sweden" ;
|
||||
lin Q262_Algeria_Country = mkCountry "Algeria" ;
|
||||
lin Q37922_Nobel_Prize_in_Literature_Award = mkAward "Nobel Prize in Literature" ;
|
||||
lin Q843_Pakistan_Country = mkCountry "Pakistan" ;
|
||||
lin Q35637_Nobel_Peace_Prize_Award = mkAward "Nobel Peace Prize" ;
|
||||
lin Q1033_Nigeria_Country = mkCountry "Nigeria" ;
|
||||
lin Q38_Italy_Country = mkCountry "Italy" ;
|
||||
lin Q668_India_Country = mkCountry "India" ;
|
||||
lin Q496922_Hasmonean_dynasty_Country = mkCountry "Hasmonean dynasty" ;
|
||||
lin Q212_Ukraine_Country = mkCountry "Ukraine" ;
|
||||
lin Q44585_Nobel_Prize_in_Chemistry_Award = mkAward "Nobel Prize in Chemistry" ;
|
||||
lin Q760_Saint_Lucia_Country = mkCountry "Saint Lucia" ;
|
||||
lin Q414_Argentina_Country = mkCountry "Argentina" ;
|
||||
lin Q218_Romania_Country = mkCountry "Romania" ;
|
||||
lin Q213_Czech_Republic_Country = mkCountry "Czech Republic" ;
|
||||
lin Q219_Bulgaria_Country = mkCountry "Bulgaria" ;
|
||||
lin Q12560_Ottoman_Empire_Country = mkCountry "Ottoman Empire" ;
|
||||
lin Q224_Croatia_Country = mkCountry "Croatia" ;
|
||||
lin Q419_Peru_Country = mkCountry "Peru" ;
|
||||
lin Q1019_Madagascar_Country = mkCountry "Madagascar" ;
|
||||
lin Q30_United_States_Country = mkCountry "United States" ;
|
||||
lin Q180114_Ayyubid_dynasty_Country = mkCountry "Ayyubid dynasty" ;
|
||||
lin Q8733_Qing_dynasty_Country = mkCountry "Qing dynasty" ;
|
||||
lin Q96_Mexico_Country = mkCountry "Mexico" ;
|
||||
lin Q884_South_Korea_Country = mkCountry "South Korea" ;
|
||||
lin Q114_Kenya_Country = mkCountry "Kenya" ;
|
||||
49
lab2/scripts/analyse_nobel.py
Normal file
49
lab2/scripts/analyse_nobel.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import json
|
||||
|
||||
# query: https://w.wiki/3tEM
|
||||
|
||||
# Location of the Wikidata query result, relative to lab2/scripts/.
# find_labels.py reads the same file through '../data/query.json'.
DATA_FILE = '../data/query.json'

# JSON text is UTF-8; state it explicitly so the platform default encoding
# cannot garble non-ASCII labels.
with open(DATA_FILE, encoding='utf-8') as file:
    data = json.load(file)

# Show one record so the available keys are visible.
print(data[0])

# Distinct (entity URI, English label) pairs for awards and for countries.
awards = {(d['award'], d['awardLabel']) for d in data}

#print(awards)
#print(len(awards))

countries = {(d['country'], d['countryLabel']) for d in data}

#print(countries)
#print(len(countries))

#print(data[0].keys())
|
||||
|
||||
def pronoun(d):
    """Return the subject pronoun for the record *d*.

    Reads the optional 'sexLabel' key: 'female' gives 'she', 'male' gives
    'he', and anything else (including a missing key) gives 'they'.
    """
    label = d.get('sexLabel', 'other')
    if label == 'male':
        return 'he'
    if label == 'female':
        return 'she'
    return 'they'
|
||||
|
||||
|
||||
def year(date):
    """Return the first four characters of *date* (the year part of the
    date strings used in this script)."""
    year_part = slice(0, 4)
    return date[year_part]
|
||||
|
||||
|
||||
def person_descr(d):
    """Return a short English description of one query record.

    Reads the keys 'personLabel', 'countryLabel', 'birthDate', 'awardLabel'
    and 'date' from *d*; 'deathDate' is optional and adds a closing
    sentence. The pronoun comes from pronoun(d).

    The sentences are joined with single spaces and each one ends with a
    period. Previously the death sentence was glued to the award sentence
    and left unterminated, and the pronoun opened a sentence in lowercase.
    """
    name = d['personLabel']
    sentences = [
        f"{name} from {d['countryLabel']} was born in {year(d['birthDate'])}.",
        # The pronoun opens the sentence, so capitalise it.
        f"{pronoun(d).capitalize()} got {d['awardLabel']} in {year(d['date'])}.",
    ]
    if 'deathDate' in d:
        sentences.append(f"{name} died {year(d['deathDate'])}.")
    return ' '.join(sentences)
|
||||
|
||||
# Print one description per record of the query result.
for record in data:
    description = person_descr(record)
    print(description)
|
||||
|
||||
|
||||
101
lab2/scripts/find_labels.py
Normal file
101
lab2/scripts/find_labels.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# https://www.wikidata.org/wiki/Special:EntityData/Q18644475.json
|
||||
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
import ssl
|
||||
from gf_utils import *
|
||||
|
||||
"""
|
||||
To collect labels from query.json (Wikidata query result) and extract grammars:
|
||||
python3 find_labels.py init >labels.jsonl
|
||||
python3 find_labels.py abstract >Labels.gf
|
||||
python3 find_labels.py en >LabelsEng.gf
|
||||
"""
|
||||
|
||||
# Wikidata query result that lists the entities of interest.
WIKIDATA_FILE = '../data/query.json'
# Entity URIs in the query result start with this prefix; the Q-id follows it.
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
# Base URL of the per-entity JSON documents (qid + '.json' is appended).
WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/'
# Record keys whose values are entity URIs to collect labels for.
NOBEL_FIELDS = ['award', 'country']
# One JSON object per line, as printed by get_wikidata_labels.
LABEL_FILE = '../data/labels.jsonl'

USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)'

if sys.argv[1:]:
    MODE = sys.argv[1]
else:
    # Without a mode there is nothing to do; previously MODE stayed
    # undefined and the script died later with a NameError. sys.exit with
    # a string writes it to stderr (stdout is normally redirected into a
    # generated file) and exits with status 1.
    sys.exit(USAGE)
|
||||
|
||||
|
||||
# qids given in the data file
|
||||
def get_wikidata_qids(jsonfile, fields):
    """Collect the entity ids mentioned in a JSON query result.

    jsonfile: path of a JSON file holding a sequence of records (dicts).
    fields:   record keys to look at; records lacking a key are skipped
              for that key.

    Returns a set of (field, qid) pairs, where qid is the record value
    with its first len(WIKIDATA_PREFIX) characters cut off (the prefix
    itself is not checked).
    """
    with open(jsonfile) as file:
        records = json.load(file)
    prefix_length = len(WIKIDATA_PREFIX)
    return {
        (field, record[field][prefix_length:])
        for record in records
        for field in fields
        if field in record
    }
|
||||
|
||||
|
||||
# All (field, qid) pairs found in the query result.
qids = get_wikidata_qids(WIKIDATA_FILE, NOBEL_FIELDS)

# Debug listing of the collected pairs. The guard compares against
# '__mainz__' rather than '__main__', so it stays switched off when the
# file is run as a script.
if __name__ == '__mainz__':
    for field_and_qid in qids:
        print(field_and_qid)
|
||||
|
||||
# get all wikidata for each qid
|
||||
# use this only once, because it is slow
|
||||
def get_wikidata_json(qids):
    """Download the Wikidata entity document for every (field, qid) pair.

    This is a generator: it yields (field, qid, data) where data is the
    parsed JSON fetched from WIKIDATA_URL_PREFIX + qid + '.json'.
    Entities that cannot be fetched or parsed are skipped, with a warning
    on stderr. It is slow (one HTTP request per entity), so run it once
    and keep the result.
    """
    # NOTE(review): this private ssl helper switches certificate
    # verification off. It is kept as in the original; prefer
    # ssl.create_default_context() where the local certificate store works.
    context = ssl._create_unverified_context()
    for field, qid in qids:
        address = WIKIDATA_URL_PREFIX + qid + '.json'
        try:
            with urllib.request.urlopen(address, context=context) as response:
                data = json.load(response)
        except Exception as error:
            # Best effort: one failing entity must not abort the whole
            # batch, but do not lose it silently either. stdout carries
            # the generated data, so the warning goes to stderr.
            print('warning: skipping', field, qid, '-', error, file=sys.stderr)
            continue
        yield (field, qid, data)
|
||||
|
||||
# extract the labels, redirect to LABEL_FILE (only once)
|
||||
def get_wikidata_labels(data, languages=None):
    """Print one JSON line of labels per downloaded entity.

    data:      iterable of (field, qid, document) triples, where document
               is a dict with an 'entities' mapping.
    languages: optional container of language codes to keep; None keeps
               every language.

    Each printed line has the form {qid: {language: label, ..., 'field': field}}.
    An entity without labels, or a document without entities, yields only
    the 'field' entry.
    """
    for field, qid, document in data:  # 'document' avoids shadowing builtin dict
        entities = document.get('entities', {})
        # Use the first entity of the document; an empty mapping no longer
        # raises IndexError but is treated as "no labels".
        entity = next(iter(entities.values()), {})
        labels = {
            val['language']: val['value']
            for val in entity.get('labels', {}).values()
            if languages is None or val['language'] in languages
        }
        labels['field'] = field
        print(json.dumps({qid: labels}, ensure_ascii=False))
|
||||
|
||||
|
||||
# {"Q800": {"tg": "Коста Рика", "sk": "Kostarika", ... "field": <field>}}
|
||||
def extract_labels(labeldata, mode):
    """Print one GF rule per entry of *labeldata*.

    labeldata: iterable of single-item dicts {qid: labels}, where labels
               maps language codes to strings and 'field' to the field name.
    mode:      'abstract' prints a fun rule; any other value is taken as a
               language code and prints a lin rule, falling back to the
               English label (or 'X') when that language is missing.

    The function name is built from the qid, the English label and the
    capitalised field name; the latter is also used as the category.
    """
    # Iterate the argument. The original looped over the module-level
    # 'data' and only worked because the caller passed that same object.
    for entry in labeldata:
        qid, labels = list(entry.items())[0]
        eng = labels.get('en', 'X')
        cat = labels['field'].capitalize()
        fun = mk_fun_from_strs([qid, eng, cat])
        if mode == 'abstract':
            print(mk_fun_rule(fun, cat))
        else:
            lin = labels.get(mode, eng)
            oper = 'mk' + cat
            print(mk_lin_rule(fun, mk_lin(oper, [lin], [])))
|
||||
|
||||
|
||||
if MODE == 'init':
    # Slow network pass: do this only once and redirect stdout to the
    # label file.
    data = get_wikidata_json(qids)
    get_wikidata_labels(data)
else:
    # Do this once for the abstract syntax and once per language.
    # The label lines are written with ensure_ascii=False, so read them as
    # UTF-8 rather than the platform default; blank lines are skipped
    # because json.loads would fail on them.
    with open(LABEL_FILE, encoding='utf-8') as file:
        data = [json.loads(line) for line in file if line.strip()]
    extract_labels(data, MODE)
|
||||
|
||||
|
||||
|
||||
|
||||
101
lab2/scripts/find_labels.py~
Normal file
101
lab2/scripts/find_labels.py~
Normal file
@@ -0,0 +1,101 @@
|
||||
# https://www.wikidata.org/wiki/Special:EntityData/Q18644475.json
|
||||
|
||||
import urllib.request
|
||||
import json
|
||||
import sys
|
||||
import ssl
|
||||
from gf_utils import *
|
||||
|
||||
"""
|
||||
To collect labels from query.json (Wikidata query result) and extract grammars:
|
||||
python3 find_labels.py init >labels.jsonl
|
||||
python3 find_labels.py abstract >Labels.gf
|
||||
python3 find_labels.py en >LabelsEng.gf
|
||||
"""
|
||||
|
||||
# Wikidata query result that lists the entities of interest.
WIKIDATA_FILE = 'query.json'
# Entity URIs in the query result start with this prefix; the Q-id follows it.
WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/'
# Base URL of the per-entity JSON documents (qid + '.json' is appended).
WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/'
# Record keys whose values are entity URIs to collect labels for.
NOBEL_FIELDS = ['award', 'country']
# One JSON object per line, as printed by get_wikidata_labels.
LABEL_FILE = 'labels.jsonl'

USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)'

if sys.argv[1:]:
    MODE = sys.argv[1]
else:
    # Without a mode there is nothing to do; previously MODE stayed
    # undefined and the script died later with a NameError. sys.exit with
    # a string writes it to stderr (stdout is normally redirected into a
    # generated file) and exits with status 1.
    sys.exit(USAGE)
|
||||
|
||||
|
||||
# qids given in the data file
|
||||
def get_wikidata_qids(jsonfile, fields):
    """Collect the entity ids mentioned in a JSON query result.

    jsonfile: path of a JSON file holding a sequence of records (dicts).
    fields:   record keys to look at; records lacking a key are skipped
              for that key.

    Returns a set of (field, qid) pairs, where qid is the record value
    with its first len(WIKIDATA_PREFIX) characters cut off (the prefix
    itself is not checked).
    """
    with open(jsonfile) as file:
        records = json.load(file)
    prefix_length = len(WIKIDATA_PREFIX)
    return {
        (field, record[field][prefix_length:])
        for record in records
        for field in fields
        if field in record
    }
|
||||
|
||||
|
||||
# All (field, qid) pairs found in the query result.
qids = get_wikidata_qids(WIKIDATA_FILE, NOBEL_FIELDS)

# Debug listing of the collected pairs. The guard compares against
# '__mainz__' rather than '__main__', so it stays switched off when the
# file is run as a script.
if __name__ == '__mainz__':
    for field_and_qid in qids:
        print(field_and_qid)
|
||||
|
||||
# get all wikidata for each qid
|
||||
# use this only once, because it is slow
|
||||
def get_wikidata_json(qids):
    """Download the Wikidata entity document for every (field, qid) pair.

    This is a generator: it yields (field, qid, data) where data is the
    parsed JSON fetched from WIKIDATA_URL_PREFIX + qid + '.json'.
    Entities that cannot be fetched or parsed are skipped, with a warning
    on stderr. It is slow (one HTTP request per entity), so run it once
    and keep the result.
    """
    # NOTE(review): this private ssl helper switches certificate
    # verification off. It is kept as in the original; prefer
    # ssl.create_default_context() where the local certificate store works.
    context = ssl._create_unverified_context()
    for field, qid in qids:
        address = WIKIDATA_URL_PREFIX + qid + '.json'
        try:
            with urllib.request.urlopen(address, context=context) as response:
                data = json.load(response)
        except Exception as error:
            # Best effort: one failing entity must not abort the whole
            # batch, but do not lose it silently either. stdout carries
            # the generated data, so the warning goes to stderr.
            print('warning: skipping', field, qid, '-', error, file=sys.stderr)
            continue
        yield (field, qid, data)
|
||||
|
||||
# extract the labels, redirect to LABEL_FILE (only once)
|
||||
def get_wikidata_labels(data, languages=None):
    """Print one JSON line of labels per downloaded entity.

    data:      iterable of (field, qid, document) triples, where document
               is a dict with an 'entities' mapping.
    languages: optional container of language codes to keep; None keeps
               every language.

    Each printed line has the form {qid: {language: label, ..., 'field': field}}.
    An entity without labels, or a document without entities, yields only
    the 'field' entry.
    """
    for field, qid, document in data:  # 'document' avoids shadowing builtin dict
        entities = document.get('entities', {})
        # Use the first entity of the document; an empty mapping no longer
        # raises IndexError but is treated as "no labels".
        entity = next(iter(entities.values()), {})
        labels = {
            val['language']: val['value']
            for val in entity.get('labels', {}).values()
            if languages is None or val['language'] in languages
        }
        labels['field'] = field
        print(json.dumps({qid: labels}, ensure_ascii=False))
|
||||
|
||||
|
||||
# {"Q800": {"tg": "Коста Рика", "sk": "Kostarika", ... "field": <field>}}
|
||||
def extract_labels(labeldata, mode):
    """Print one GF rule per entry of *labeldata*.

    labeldata: iterable of single-item dicts {qid: labels}, where labels
               maps language codes to strings and 'field' to the field name.
    mode:      'abstract' prints a fun rule; any other value is taken as a
               language code and prints a lin rule, falling back to the
               English label (or 'X') when that language is missing.

    The function name is built from the qid, the English label and the
    capitalised field name; the latter is also used as the category.
    """
    # Iterate the argument. The original looped over the module-level
    # 'data' and only worked because the caller passed that same object.
    for entry in labeldata:
        qid, labels = list(entry.items())[0]
        eng = labels.get('en', 'X')
        cat = labels['field'].capitalize()
        fun = mk_fun_from_strs([qid, eng, cat])
        if mode == 'abstract':
            print(mk_fun_rule(fun, cat))
        else:
            lin = labels.get(mode, eng)
            oper = 'mk' + cat
            print(mk_lin_rule(fun, mk_lin(oper, [lin], [])))
|
||||
|
||||
|
||||
if MODE == 'init':
    # Slow network pass: do this only once and redirect stdout to the
    # label file.
    data = get_wikidata_json(qids)
    get_wikidata_labels(data)
else:
    # Do this once for the abstract syntax and once per language.
    # The label lines are written with ensure_ascii=False, so read them as
    # UTF-8 rather than the platform default; blank lines are skipped
    # because json.loads would fail on them.
    with open(LABEL_FILE, encoding='utf-8') as file:
        data = [json.loads(line) for line in file if line.strip()]
    extract_labels(data, MODE)
|
||||
|
||||
|
||||
|
||||
|
||||
48
lab2/scripts/gf_utils.py
Normal file
48
lab2/scripts/gf_utils.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# import pgf
|
||||
|
||||
def _is_gf_letter(c):
    """True for ASCII letters and the Latin-1 letters U+00C0..U+00FF
    (excluding the multiplication and division signs)."""
    return ('a' <= c <= 'z' or 'A' <= c <= 'Z'
            or ('\u00c0' <= c <= '\u00ff' and c not in '\u00d7\u00f7'))


def mk_fun(s):
    """Turn the string *s* into a GF identifier.

    Runs of whitespace become single underscores. If the result starts
    with a letter and contains only letters, ASCII digits, underscores and
    apostrophes it is returned as is; otherwise it is wrapped in single
    quotes, with backslashes and single quotes backslash-escaped.

    Raises ValueError when *s* is empty or only whitespace, since no
    identifier can be built from it (this used to surface as IndexError).
    """
    s = '_'.join(s.split())  # spaces replaced by underscores
    if not s:
        raise ValueError('cannot build a GF identifier from an empty string')

    # Explicit character classes instead of str.isalpha/isdigit: those
    # also accept Latin-1 characters such as 'ª', 'µ' or '²', which are
    # not GF identifier characters. Quoting is always the safe fallback.
    if _is_gf_letter(s[0]) and all(
            _is_gf_letter(c) or c in "0123456789_'" for c in s):
        return s

    # Escape the escape character first, then the quote delimiter.
    escaped = s.replace('\\', '\\\\').replace("'", "\\'")
    return "'" + escaped + "'"
|
||||
|
||||
|
||||
def mk_fun_from_strs(ss):
    """Join the strings in *ss* with underscores and pass the result to
    mk_fun to obtain a GF identifier."""
    joined = '_'.join(ss)
    return mk_fun(joined)
|
||||
|
||||
|
||||
def quote(s):
    """Return *s* as a double-quoted GF string literal.

    Backslashes and double quotes inside *s* are backslash-escaped so that
    a label containing them cannot end the literal early.
    """
    # Escape the escape character first, then the delimiter.
    escaped = s.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'
|
||||
|
||||
|
||||
def app(fun, args):
    """Build the application of *fun* to *args* as one space-separated
    string; only the first argument is wrapped by quote(), the rest are
    used verbatim."""
    first_arg, remaining = args[0], args[1:]
    pieces = [fun, quote(first_arg)] + remaining
    return ' '.join(pieces)
|
||||
|
||||
def empty_variants():
    """Return the GF source text for an empty variants list."""
    text = 'variants {}'
    return text
|
||||
|
||||
def mk_lin(oper, words, params):
    """Build a linearization expression: *oper*, then every word of
    *words* passed through quote(), then the strings of *params* verbatim,
    all separated by single spaces."""
    quoted_words = list(map(quote, words))
    pieces = [oper] + quoted_words + params
    return ' '.join(pieces)
|
||||
|
||||
|
||||
def mk_cat_rule(cat):
    """Return the text 'cat <cat> ;' followed by a newline."""
    return 'cat ' + cat + ' ;\n'
|
||||
|
||||
|
||||
def mk_fun_rule(fun, cat, comment=None):
    """Return the text 'fun <fun> : <cat> ; ' followed by '--<comment>'
    when a non-empty comment is given (the trailing space is always
    present)."""
    if comment:
        trailer = '--' + comment
    else:
        trailer = ''
    return 'fun ' + fun + ' : ' + cat + ' ; ' + trailer
|
||||
|
||||
|
||||
def mk_lin_rule(fun, lin, comment=None):
    """Return the text 'lin <fun> = <lin> ; ' followed by '--<comment>'
    when a non-empty comment is given (the trailing space is always
    present)."""
    if comment:
        trailer = '--' + comment
    else:
        trailer = ''
    return 'lin ' + fun + ' = ' + lin + ' ; ' + trailer
|
||||
|
||||
|
||||
def mk_lincat_rule(cat, lin):
    """Return the text 'lincat <cat> = <lin> ;'."""
    return 'lincat ' + cat + ' = ' + lin + ' ;'
|
||||
|
||||
Reference in New Issue
Block a user