From 6ea7d5d838a29cfd2517e540be74207ef6b76879 Mon Sep 17 00:00:00 2001 From: Aarne Ranta Date: Sun, 4 May 2025 12:14:37 +0200 Subject: [PATCH] everything in place for Lab 2 --- lab2/README.md | 14 ++-- lab2/data/Nobel-funs.jsonl | 112 +++++++++++++++++++++++++++++ lab2/grammars/LabelsEng.gf | 2 +- lab2/grammars/LabelsFin.gf | 128 +++++++++++++++++++++++++++++++++ lab2/grammars/Nobel.gf | 10 +-- lab2/grammars/NobelEng.gf | 16 +++-- lab2/grammars/NobelFin.gf | 36 ++++++++++ lab2/scripts/analyse_nobel.py | 49 ------------- lab2/scripts/describe_nobel.py | 95 ++++++++++++++++++++++++ lab2/scripts/find_labels.py | 13 ++-- 10 files changed, 403 insertions(+), 72 deletions(-) create mode 100644 lab2/data/Nobel-funs.jsonl create mode 100644 lab2/grammars/LabelsFin.gf create mode 100644 lab2/grammars/NobelFin.gf delete mode 100644 lab2/scripts/analyse_nobel.py create mode 100644 lab2/scripts/describe_nobel.py diff --git a/lab2/README.md b/lab2/README.md index 30314e4..5704632 100644 --- a/lab2/README.md +++ b/lab2/README.md @@ -1,14 +1,14 @@ # Lab 2: Multilingual text generation from Wikidata This uses GF to generate texts from facts in the Wikidata fact database. -You will be given +You are given -- an abstract syntax, -- an English concrete syntax, -- a json dump from Wikidata -- a Python file that connects Wikidata with GF +- an abstract syntax and an English concrete syntax, in the subdirectory grammars/ +- a json dump from Wikidata, in the subdirectory data/ +- a Python file that connects Wikidata with GF, in the subdirectory scripts/ -Your task will be to create a concrete syntax for some other language by using the +Your task is to create a concrete syntax for some other language by using the GF RGL and evaluate the text generated by this. -The listed files will be provided before the lab starts. +More instructions will be given in the lectures of the week 5-9 May 2025. 
+ diff --git a/lab2/data/Nobel-funs.jsonl b/lab2/data/Nobel-funs.jsonl new file mode 100644 index 0000000..9f1b826 --- /dev/null +++ b/lab2/data/Nobel-funs.jsonl @@ -0,0 +1,112 @@ +["Q800", "Q800_Costa_Rica_Country"] +["Q219060", "Q219060_State_of_Palestine_Country"] +["Q37", "Q37_Lithuania_Country"] +["Q137816", "Q137816_Taiwan_under_Japanese_rule_Country"] +["Q1028", "Q1028_Morocco_Country"] +["Q796", "Q796_Iraq_Country"] +["Q184", "Q184_Belarus_Country"] +["Q225", "Q225_Bosnia_and_Herzegovina_Country"] +["Q20", "Q20_Norway_Country"] +["Q211", "Q211_Latvia_Country"] +["Q117", "Q117_Ghana_Country"] +["Q39", "Q39_Switzerland_Country"] +["Q159631", "Q159631_Kingdom_of_Württemberg_Country"] +["Q17", "Q17_Japan_Country"] +["Q189", "Q189_Iceland_Country"] +["Q221", "Q221_North_Macedonia_Country"] +["Q9683", "Q9683_Tang_dynasty_Country"] +["Q79", "Q79_Egypt_Country"] +["Q408", "Q408_Australia_Country"] +["Q4628", "Q4628_Faroe_Islands_Country"] +["Q145", "Q145_United_Kingdom_Country"] +["Q214", "Q214_Slovakia_Country"] +["Q16", "Q16_Canada_Country"] +["Q924", "Q924_Tanzania_Country"] +["Q55502", "Q55502_Kingdom_of_Jerusalem_Country"] +["Q183", "Q183_Germany_Country"] +["Q754", "Q754_Trinidad_and_Tobago_Country"] +["Q298", "Q298_Chile_Country"] +["Q41", "Q41_Greece_Country"] +["Q30623", "Q30623_Manchukuo_Country"] +["Q774", "Q774_Guatemala_Country"] +["Q836", "Q836_Myanmar_Country"] +["Q902", "Q902_Bangladesh_Country"] +["Q215", "Q215_Slovenia_Country"] +["Q7313", "Q7313_Yuan_dynasty_Country"] +["Q822", "Q822_Lebanon_Country"] +["Q12548", "Q12548_Holy_Roman_Empire_Country"] +["Q12407080", "Q12407080_early_Islamic_period_in_Palestine_Country"] +["Q717", "Q717_Venezuela_Country"] +["Q31", "Q31_Belgium_Country"] +["Q794", "Q794_Iran_Country"] +["Q43", "Q43_Turkey_Country"] +["Q948", "Q948_Tunisia_Country"] +["Q258", "Q258_South_Africa_Country"] +["Q28", "Q28_Hungary_Country"] +["Q80061", "Q80061_Nobel_Prize_in_Physiology_or_Medicine_Award"] +["Q142", "Q142_France_Country"] 
+["Q805", "Q805_Yemen_Country"] +["Q881", "Q881_Vietnam_Country"] +["Q7462", "Q7462_Song_dynasty_Country"] +["Q12544", "Q12544_Byzantine_Empire_Country"] +["Q664", "Q664_New_Zealand_Country"] +["Q33", "Q33_Finland_Country"] +["Q282428", "Q282428_Mamluk_Sultanate_Country"] +["Q38104", "Q38104_Nobel_Prize_in_Physics_Award"] +["Q9903", "Q9903_Ming_dynasty_Country"] +["Q739", "Q739_Colombia_Country"] +["Q13426199", "Q13426199_Republic_of_China_Country"] +["Q55", "Q55_Netherlands_Country"] +["Q159", "Q159_Russia_Country"] +["Q27", "Q27_Ireland_Country"] +["Q48685", "Q48685_Kingdom_of_Judah_Country"] +["Q810", "Q810_Jordan_Country"] +["Q36", "Q36_Poland_Country"] +["Q1014", "Q1014_Liberia_Country"] +["Q38872", "Q38872_Prussia_Country"] +["Q574", "'Q574_Timor-Leste_Country'"] +["Q974", "Q974_Democratic_Republic_of_the_Congo_Country"] +["Q15843470", "Q15843470_Roman_Palestine_Country"] +["Q40", "Q40_Austria_Country"] +["Q928", "Q928_Philippines_Country"] +["Q148", "Q148_People's_Republic_of_China_Country"] +["Q35", "Q35_Denmark_Country"] +["Q954", "Q954_Zimbabwe_Country"] +["Q216173", "Q216173_Free_City_of_Danzig_Country"] +["Q227", "Q227_Azerbaijan_Country"] +["Q252", "Q252_Indonesia_Country"] +["Q801", "Q801_Israel_Country"] +["Q155", "Q155_Brazil_Country"] +["Q29", "Q29_Spain_Country"] +["Q7075820", "Q7075820_Occupied_Enemy_Territory_Administration_Country"] +["Q2685298", "Q2685298_Romanian_People's_Republic_Country"] +["Q45", "Q45_Portugal_Country"] +["Q32", "Q32_Luxembourg_Country"] +["Q115", "Q115_Ethiopia_Country"] +["Q193714", "Q193714_Mandatory_Palestine_Country"] +["Q34", "Q34_Sweden_Country"] +["Q262", "Q262_Algeria_Country"] +["Q37922", "Q37922_Nobel_Prize_in_Literature_Award"] +["Q843", "Q843_Pakistan_Country"] +["Q35637", "Q35637_Nobel_Peace_Prize_Award"] +["Q1033", "Q1033_Nigeria_Country"] +["Q38", "Q38_Italy_Country"] +["Q668", "Q668_India_Country"] +["Q496922", "Q496922_Hasmonean_dynasty_Country"] +["Q212", "Q212_Ukraine_Country"] +["Q44585", 
"Q44585_Nobel_Prize_in_Chemistry_Award"] +["Q760", "Q760_Saint_Lucia_Country"] +["Q414", "Q414_Argentina_Country"] +["Q218", "Q218_Romania_Country"] +["Q213", "Q213_Czech_Republic_Country"] +["Q219", "Q219_Bulgaria_Country"] +["Q12560", "Q12560_Ottoman_Empire_Country"] +["Q224", "Q224_Croatia_Country"] +["Q419", "Q419_Peru_Country"] +["Q1019", "Q1019_Madagascar_Country"] +["Q30", "Q30_United_States_Country"] +["Q180114", "Q180114_Ayyubid_dynasty_Country"] +["Q8733", "Q8733_Qing_dynasty_Country"] +["Q96", "Q96_Mexico_Country"] +["Q884", "Q884_South_Korea_Country"] +["Q114", "Q114_Kenya_Country"] diff --git a/lab2/grammars/LabelsEng.gf b/lab2/grammars/LabelsEng.gf index fb685b4..3b70bd0 100644 --- a/lab2/grammars/LabelsEng.gf +++ b/lab2/grammars/LabelsEng.gf @@ -9,7 +9,7 @@ oper mkCountry = overload { } ; oper mkAward = overload { - mkAward : Str -> NP = \s -> mkNP (mkPN s) ; + mkAward : Str -> NP = \s -> mkNP the_Det (mkN s) ; mkAward : NP -> NP = \np -> np ; } ; diff --git a/lab2/grammars/LabelsFin.gf b/lab2/grammars/LabelsFin.gf new file mode 100644 index 0000000..cd1b411 --- /dev/null +++ b/lab2/grammars/LabelsFin.gf @@ -0,0 +1,128 @@ +concrete LabelsFin of Labels = open SyntaxFin, ParadigmsFin in { + +lincat Country = NP ; +lincat Award = NP ; + +oper mkCountry = overload { + mkCountry : Str -> NP = \s -> mkNP (mkPN s) ; + mkCountry : NP -> NP = \np -> np ; + } ; + +oper mkAward = overload { + mkAward : Str -> NP = \s -> mkNP the_Det (mkN s) ; + mkAward : NP -> NP = \np -> np ; + } ; + +lin Q800_Costa_Rica_Country = mkCountry "Costa Rica" ; +lin Q219060_State_of_Palestine_Country = mkCountry "Palestiina" ; +lin Q37_Lithuania_Country = mkCountry "Liettua" ; +lin Q137816_Taiwan_under_Japanese_rule_Country = mkCountry "Taiwan Japanin alaisuudessa" ; +lin Q1028_Morocco_Country = mkCountry "Marokko" ; +lin Q796_Iraq_Country = mkCountry "Irak" ; +lin Q184_Belarus_Country = mkCountry "Valko-Venäjä" ; +lin Q225_Bosnia_and_Herzegovina_Country = mkCountry "Bosnia ja 
Hertsegovina" ; +lin Q20_Norway_Country = mkCountry "Norja" ; +lin Q211_Latvia_Country = mkCountry "Latvia" ; +lin Q117_Ghana_Country = mkCountry "Ghana" ; +lin Q39_Switzerland_Country = mkCountry "Sveitsi" ; +lin Q159631_Kingdom_of_Württemberg_Country = mkCountry "Württembergin kuningaskunta" ; +lin Q17_Japan_Country = mkCountry "Japani" ; +lin Q189_Iceland_Country = mkCountry "Islanti" ; +lin Q221_North_Macedonia_Country = mkCountry "Pohjois-Makedonia" ; +lin Q9683_Tang_dynasty_Country = mkCountry "Tang-dynastia" ; +lin Q79_Egypt_Country = mkCountry "Egypti" ; +lin Q408_Australia_Country = mkCountry "Australia" ; +lin Q4628_Faroe_Islands_Country = mkCountry "Färsaaret" ; +lin Q145_United_Kingdom_Country = mkCountry "Yhdistynyt kuningaskunta" ; +lin Q214_Slovakia_Country = mkCountry "Slovakia" ; +lin Q16_Canada_Country = mkCountry "Kanada" ; +lin Q924_Tanzania_Country = mkCountry "Tansania" ; +lin Q55502_Kingdom_of_Jerusalem_Country = mkCountry "Jerusalemin kuningaskunta" ; +lin Q183_Germany_Country = mkCountry "Saksa" ; +lin Q754_Trinidad_and_Tobago_Country = mkCountry "Trinidad ja Tobago" ; +lin Q298_Chile_Country = mkCountry "Chile" ; +lin Q41_Greece_Country = mkCountry "Kreikka" ; +lin Q30623_Manchukuo_Country = mkCountry "Mantšukuo" ; +lin Q774_Guatemala_Country = mkCountry "Guatemala" ; +lin Q836_Myanmar_Country = mkCountry "Myanmar" ; +lin Q902_Bangladesh_Country = mkCountry "Bangladesh" ; +lin Q215_Slovenia_Country = mkCountry "Slovenia" ; +lin Q7313_Yuan_dynasty_Country = mkCountry "Yuan" ; +lin Q822_Lebanon_Country = mkCountry "Libanon" ; +lin Q12548_Holy_Roman_Empire_Country = mkCountry "Pyhä saksalais-roomalainen keisarikunta" ; +lin Q12407080_early_Islamic_period_in_Palestine_Country = mkCountry "early Islamic period in Palestine" ; +lin Q717_Venezuela_Country = mkCountry "Venezuela" ; +lin Q31_Belgium_Country = mkCountry "Belgia" ; +lin Q794_Iran_Country = mkCountry "Iran" ; +lin Q43_Turkey_Country = mkCountry "Turkki" ; +lin Q948_Tunisia_Country = 
mkCountry "Tunisia" ; +lin Q258_South_Africa_Country = mkCountry "Etelä-Afrikka" ; +lin Q28_Hungary_Country = mkCountry "Unkari" ; +lin Q80061_Nobel_Prize_in_Physiology_or_Medicine_Award = mkAward "Nobelin fysiologian tai lääketieteen palkinto" ; +lin Q142_France_Country = mkCountry "Ranska" ; +lin Q805_Yemen_Country = mkCountry "Jemen" ; +lin Q881_Vietnam_Country = mkCountry "Vietnam" ; +lin Q7462_Song_dynasty_Country = mkCountry "Song-dynastia" ; +lin Q12544_Byzantine_Empire_Country = mkCountry "Bysantin valtakunta" ; +lin Q664_New_Zealand_Country = mkCountry "Uusi-Seelanti" ; +lin Q33_Finland_Country = mkCountry "Suomi" ; +lin Q282428_Mamluk_Sultanate_Country = mkCountry "Mamlukin sulttaanikunta" ; +lin Q38104_Nobel_Prize_in_Physics_Award = mkAward "Nobelin fysiikanpalkinto" ; +lin Q9903_Ming_dynasty_Country = mkCountry "Ming-dynastia" ; +lin Q739_Colombia_Country = mkCountry "Kolumbia" ; +lin Q13426199_Republic_of_China_Country = mkCountry "Kiinan tasavalta" ; +lin Q55_Netherlands_Country = mkCountry "Alankomaat" ; +lin Q159_Russia_Country = mkCountry "Venäjä" ; +lin Q27_Ireland_Country = mkCountry "Irlanti" ; +lin Q48685_Kingdom_of_Judah_Country = mkCountry "Juudan kuningaskunta" ; +lin Q810_Jordan_Country = mkCountry "Jordania" ; +lin Q36_Poland_Country = mkCountry "Puola" ; +lin Q1014_Liberia_Country = mkCountry "Liberia" ; +lin Q38872_Prussia_Country = mkCountry "Preussi" ; +lin 'Q574_Timor-Leste_Country' = mkCountry "Itä-Timor" ; +lin Q974_Democratic_Republic_of_the_Congo_Country = mkCountry "Kongon demokraattinen tasavalta" ; +lin Q15843470_Roman_Palestine_Country = mkCountry "Roman Palestine" ; +lin Q40_Austria_Country = mkCountry "Itävalta" ; +lin Q928_Philippines_Country = mkCountry "Filippiinit" ; +lin Q148_People's_Republic_of_China_Country = mkCountry "Kiinan kansantasavalta" ; +lin Q35_Denmark_Country = mkCountry "Tanska" ; +lin Q954_Zimbabwe_Country = mkCountry "Zimbabwe" ; +lin Q216173_Free_City_of_Danzig_Country = mkCountry "Danzigin 
vapaakaupunki" ; +lin Q227_Azerbaijan_Country = mkCountry "Azerbaidžan" ; +lin Q252_Indonesia_Country = mkCountry "Indonesia" ; +lin Q801_Israel_Country = mkCountry "Israel" ; +lin Q155_Brazil_Country = mkCountry "Brasilia" ; +lin Q29_Spain_Country = mkCountry "Espanja" ; +lin Q7075820_Occupied_Enemy_Territory_Administration_Country = mkCountry "Occupied Enemy Territory Administration" ; +lin Q2685298_Romanian_People's_Republic_Country = mkCountry "Romanian kansantasavalta" ; +lin Q45_Portugal_Country = mkCountry "Portugali" ; +lin Q32_Luxembourg_Country = mkCountry "Luxemburg" ; +lin Q115_Ethiopia_Country = mkCountry "Etiopia" ; +lin Q193714_Mandatory_Palestine_Country = mkCountry "Palestiinan brittiläinen mandaatti" ; +lin Q34_Sweden_Country = mkCountry "Ruotsi" ; +lin Q262_Algeria_Country = mkCountry "Algeria" ; +lin Q37922_Nobel_Prize_in_Literature_Award = mkAward "Nobelin kirjallisuuspalkinto" ; +lin Q843_Pakistan_Country = mkCountry "Pakistan" ; +lin Q35637_Nobel_Peace_Prize_Award = mkAward "Nobelin rauhanpalkinto" ; +lin Q1033_Nigeria_Country = mkCountry "Nigeria" ; +lin Q38_Italy_Country = mkCountry "Italia" ; +lin Q668_India_Country = mkCountry "Intia" ; +lin Q496922_Hasmonean_dynasty_Country = mkCountry "Israelin toinen kuningaskunta" ; +lin Q212_Ukraine_Country = mkCountry "Ukraina" ; +lin Q44585_Nobel_Prize_in_Chemistry_Award = mkAward "Nobelin kemianpalkinto" ; +lin Q760_Saint_Lucia_Country = mkCountry "Saint Lucia" ; +lin Q414_Argentina_Country = mkCountry "Argentiina" ; +lin Q218_Romania_Country = mkCountry "Romania" ; +lin Q213_Czech_Republic_Country = mkCountry "Tšekki" ; +lin Q219_Bulgaria_Country = mkCountry "Bulgaria" ; +lin Q12560_Ottoman_Empire_Country = mkCountry "Osmanien valtakunta" ; +lin Q224_Croatia_Country = mkCountry "Kroatia" ; +lin Q419_Peru_Country = mkCountry "Peru" ; +lin Q1019_Madagascar_Country = mkCountry "Madagaskar" ; +lin Q30_United_States_Country = mkCountry "Yhdysvallat" ; +lin Q180114_Ayyubid_dynasty_Country = mkCountry 
"Aijubidit" ; +lin Q8733_Qing_dynasty_Country = mkCountry "Qing-dynastia" ; +lin Q96_Mexico_Country = mkCountry "Meksiko" ; +lin Q884_South_Korea_Country = mkCountry "Korean tasavalta" ; +lin Q114_Kenya_Country = mkCountry "Kenia" ; +} \ No newline at end of file diff --git a/lab2/grammars/Nobel.gf b/lab2/grammars/Nobel.gf index 160abcb..b717da9 100644 --- a/lab2/grammars/Nobel.gf +++ b/lab2/grammars/Nobel.gf @@ -1,15 +1,17 @@ abstract Nobel = Labels ** { -flags startcat = Description ; +flags startcat = Sentence ; cat - Description ; + Sentence ; Name ; Date ; fun - LivingDescription : Name -> Name -> Country -> Date -> Date -> Award -> Description ; - PastDescription : Name -> Name -> Country -> Date -> Date -> Date -> Award -> Description ; + BornSentence : Name -> Country -> Date -> Sentence ; + AwardSentence : Name -> Award -> Date -> Sentence ; + DiedSentence : Name -> Date -> Sentence ; + StringName : String -> Name ; YearDate : Int -> Date ; he_Name, she_Name, they_Name : Name ; diff --git a/lab2/grammars/NobelEng.gf b/lab2/grammars/NobelEng.gf index 311f312..dc5bf72 100644 --- a/lab2/grammars/NobelEng.gf +++ b/lab2/grammars/NobelEng.gf @@ -6,17 +6,20 @@ concrete NobelEng of Nobel = LabelsEng ** open in { lincat - Description = Text ; + Sentence = S ; Name = NP ; Date = Adv ; lin - LivingDescription name pron country birthdate awarddate award = - mkText - (mkPhr (mkS pastTense (mkCl name (mkVP (mkVP born_VP (inAdv country)) birthdate)))) - (mkText (mkS pastTense (mkCl pron (mkVP (mkVP (mkV2 get_V) award) awarddate)))) ; + BornSentence name country date = + mkS pastTense (mkCl name (mkVP (mkVP born_VP (inAdv country)) date)) ; + + AwardSentence name award date = + mkS pastTense (mkCl name (mkVP (mkVP (mkV2 get_V) award) date)) ; + + DiedSentence name date = + mkS pastTense (mkCl name (mkVP die_VP date)) ; --- PastDescription : Name -> Country -> Date -> Date -> Award -> Description ; StringName s = symb s ; YearDate i = inAdv <symb i : NP> ; @@ -28,5 +31,6 @@ lin oper 
inAdv : NP -> Adv = \np -> SyntaxEng.mkAdv in_Prep np ; born_VP = mkVP (mkA "born") ; + die_VP = mkVP (mkV "die") ; } diff --git a/lab2/grammars/NobelFin.gf b/lab2/grammars/NobelFin.gf new file mode 100644 index 0000000..8749f29 --- /dev/null +++ b/lab2/grammars/NobelFin.gf @@ -0,0 +1,36 @@ +concrete NobelFin of Nobel = LabelsFin ** open + SyntaxFin, + ParadigmsFin, + SymbolicFin +in { + +lincat + Sentence = S ; + Name = NP ; + Date = Adv ; + +lin + BornSentence name country date = + mkS pastTense (mkCl name (mkVP (mkVP born_VP (inAdv country)) date)) ; + + AwardSentence name award date = + mkS pastTense (mkCl name (mkVP (mkVP get_V2 award) date)) ; + + DiedSentence name date = + mkS pastTense (mkCl name (mkVP die_VP date)) ; + + StringName s = symb s ; + + YearDate i = SyntaxFin.mkAdv (mkPrep "vuonna" nominative) <symb i : NP> ; + + he_Name = he_NP ; + she_Name = she_NP ; + they_Name = he_NP ; + +oper + inAdv : NP -> Adv = \np -> SyntaxFin.mkAdv in_Prep np ; + born_VP = mkVP (mkV "syntyä") ; + die_VP = mkVP (mkV "kuolla") ; + get_V2 = mkV2 (mkV "saada") ; + +} diff --git a/lab2/scripts/analyse_nobel.py b/lab2/scripts/analyse_nobel.py deleted file mode 100644 index 318ed6c..0000000 --- a/lab2/scripts/analyse_nobel.py +++ /dev/null @@ -1,49 +0,0 @@ -import json - -# query: https://w.wiki/3tEM - -DATA_FILE = 'query.json' - -with open(DATA_FILE) as file: - data = json.load(file) - -print(data[0]) - -awards = {(d['award'], d['awardLabel']) for d in data} - -#print(awards) -#print(len(awards)) - -countries = {(d['country'], d['countryLabel']) for d in data} - -#print(countries) -#print(len(countries)) - -#print(data[0].keys()) - -def pronoun(d): - sex = d.get('sexLabel', 'other') - if sex == 'female': - return 'she' - elif sex == 'male': - return 'he' - else: - return 'they' - - -def year(date): - return date[:4] - - -def person_descr(d): - died = f"{d['personLabel']} died {year(d['deathDate'])}" if 'deathDate' in d else '' - return ( - f"{d['personLabel']} from {d['countryLabel']}
was born in {year(d['birthDate'])}. " + - f"{pronoun(d)} got {d['awardLabel']} in {year(d['date'])}." + - died - ) - -for d in data: - print(person_descr(d)) - - diff --git a/lab2/scripts/describe_nobel.py b/lab2/scripts/describe_nobel.py new file mode 100644 index 0000000..becb851 --- /dev/null +++ b/lab2/scripts/describe_nobel.py @@ -0,0 +1,95 @@ +import sys +import json +import pgf + +# query: https://w.wiki/3tEM + +DATA_FILE = '../data/query.json' +WIKIDATA_PREFIX = 'http://www.wikidata.org/entity/' +GRAMMAR_PREFIX = 'Nobel' +GRAMMAR_FILE = f'../grammars/{GRAMMAR_PREFIX}.pgf' +FUN_FILE = f'../data/{GRAMMAR_PREFIX}-funs.jsonl' + + +with open(DATA_FILE, encoding='utf-8') as file: + data = json.load(file) + +#print(data[0]) + +awards = {(d['award'], d['awardLabel']) for d in data} + +#print(awards) +#print(len(awards)) + +countries = {(d['country'], d['countryLabel']) for d in data} + +# helpers shared by template-based and grammar-based generation + +def pronoun(d): + sex = d.get('sexLabel', 'other') + if sex == 'female': + return 'she' + elif sex == 'male': + return 'he' + else: + return 'they' + +def year(date): + return date[:4] + +# template-based generation in English + +def template_description(d): + died = f" {d['personLabel']} died in {year(d['deathDate'])}." if 'deathDate' in d else '' + return ( + f"{d['personLabel']} was born in {d['countryLabel']} in {year(d['birthDate'])}. " + + f"{pronoun(d).capitalize()} got the {d['awardLabel']} in {year(d['date'])}." 
+ + died + ) + +# grammar-based generation in a given language + +def name(d): + person = d['personLabel'].replace('\\', '\\\\').replace('"', '\\"') + return f'StringName "{person}"' + + +def funs(funfile): + with open(funfile, encoding='utf-8') as file: + data = {WIKIDATA_PREFIX + qf[0]: qf[1] for line in file for qf in [json.loads(line)]} + return data + + +def country(fundata, d): + return fundata[d['country']] + + +def award(fundata, d): + return fundata[d['award']] + + +def grammar_description(grammar, fundata, d, lang): + born = pgf.readExpr( + f"BornSentence ({name(d)}) {country(fundata, d)} (YearDate {year(d['birthDate'])})") + awarded = pgf.readExpr( + f"AwardSentence {pronoun(d)}_Name {award(fundata, d)} (YearDate {year(d['date'])})") + sentences = [born, awarded] + if 'deathDate' in d: + died = pgf.readExpr( + f"DiedSentence ({name(d)}) (YearDate {year(d['deathDate'])})") + sentences.append(died) + return ' '.join([lang.linearize(s) + '.' for s in sentences]) + + +if sys.argv[1:]: + grammar = pgf.readPGF(GRAMMAR_FILE) + fundata = funs(FUN_FILE) + lang = grammar.languages[GRAMMAR_PREFIX + sys.argv[1]] + for d in data: + print(grammar_description(grammar, fundata, d, lang)) +else: + for d in data: + print(template_description(d)) + + + diff --git a/lab2/scripts/find_labels.py b/lab2/scripts/find_labels.py index 61d05b4..0afc204 100644 --- a/lab2/scripts/find_labels.py +++ b/lab2/scripts/find_labels.py @@ -8,9 +8,10 @@ from gf_utils import * """ To collect labels from query.json (Wikidata query result) and extract grammars: -python3 find_labels.py init >labels.jsonl -python3 find_labels.py abstract >Labels.gf -python3 find_labels.py en >LabelsEng.gf +python3 find_labels.py init >../data/labels.jsonl +python3 find_labels.py funs >../data/Nobel-funs.jsonl +python3 find_labels.py abstract >../grammars/Labels.gf +python3 find_labels.py en >../grammars/LabelsEng.gf """ WIKIDATA_FILE = '../data/query.json' @@ -19,7 +20,7 @@ WIKIDATA_URL_PREFIX = 'http://www.wikidata.org/wiki/Special:EntityData/' NOBEL_FIELDS = ['award', 'country'] 
LABEL_FILE = '../data/labels.jsonl' -USAGE = 'usage: find_labels.py (init | abstract | en | sv | fi | ...)' +USAGE = 'usage: find_labels.py (init | funs | abstract | en | sv | fi | ...)' if sys.argv[1:]: MODE = sys.argv[1] @@ -78,7 +79,9 @@ def extract_labels(labeldata, mode): eng = labels.get('en', 'X') cat = labels['field'].capitalize() fun = mk_fun_from_strs([qid, eng, cat]) - if mode == 'abstract': + if mode == 'funs': + print(json.dumps([qid, fun], ensure_ascii=False)) + elif mode == 'abstract': print(mk_fun_rule(fun, cat)) else: lin = labels.get(mode, labels.get('en', 'X'))