forked from GitHub/comp-syntax-gu-mlt
preparing for 2023
This commit is contained in:
72
lab2/wikipedia-2022/data_facts.py
Normal file
72
lab2/wikipedia-2022/data_facts.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import pgf
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
class FactSystem:
|
||||
def __init__(self,fnames,gr,lang1):
|
||||
self.fieldnames = fnames
|
||||
self.grammar = gr
|
||||
self.language1 = lang1 # the language in which entities are parsed to trees
|
||||
|
||||
def get_data(self,filename):
|
||||
data = []
|
||||
Data = namedtuple('Data', self.fieldnames)
|
||||
file = open(filename)
|
||||
for line in file:
|
||||
fields = Data(*line.split('\t'))
|
||||
data.append(fields)
|
||||
return data
|
||||
|
||||
# can raise ParseError
|
||||
def str2exp(self,cat,s):
|
||||
eng = self.grammar.languages[self.language1]
|
||||
pp = eng.parse(s,cat=pgf.readType(cat))
|
||||
_,e = pp.__next__()
|
||||
return e
|
||||
|
||||
def exp2str(self,exp):
|
||||
eng = self.grammar.languages[self.language1]
|
||||
return eng.linearize(exp)
|
||||
|
||||
def data2lin(self,cat,s):
|
||||
return self.exp2str(self.str2exp(cat,s))
|
||||
|
||||
def run(self,datafile,fact_generator):
|
||||
gr = self.grammar
|
||||
data = sorted(list(self.get_data(datafile)))
|
||||
langs = list(gr.languages.values())
|
||||
for lang in langs:
|
||||
text = []
|
||||
for tree in fact_generator(self,data):
|
||||
lin = lang.linearize(tree)
|
||||
if lin: text.append(lin[0].upper() + lin[1:])
|
||||
print('\n'.join(text))
|
||||
|
||||
|
||||
def simple_facts(factsys,data):
|
||||
"for each tuple in data, generate an attribute fact for each field"
|
||||
fields = factsys.fieldnames.split()
|
||||
facts = []
|
||||
for tuple in data:
|
||||
object = factsys.str2exp("Object",tuple[0])
|
||||
for (attr,val) in [(fields[i],tuple[i]) for i in range(1,len(fields))]:
|
||||
fact = pgf.Expr("AttributeFact", [
|
||||
factsys.str2exp("Attribute",attr),
|
||||
object,
|
||||
factsys.str2exp("Value",val)])
|
||||
facts.append(fact)
|
||||
return facts
|
||||
|
||||
|
||||
def example_run():
|
||||
gr = pgf.readPGF('Countries.pgf')
|
||||
factsys = FactSystem(
|
||||
'country capital area population continent currency',
|
||||
gr,
|
||||
'CountriesEng'
|
||||
)
|
||||
|
||||
factsys.run('../data/countries.tsv',simple_facts)
|
||||
|
||||
if __name__ == "__main__":
|
||||
example_run()
|
||||
Reference in New Issue
Block a user