forked from GitHub/gf-core

python examples compatible with both Python 2 and 3

Author: prasanth.kolachina
Date:   2016-09-19 08:32:08 +00:00
Parent: 8729339d26
Commit: ef33f1ab35
2 changed files with 703 additions and 595 deletions
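
The port follows one small recipe, applied uniformly in both files below: print() via from __future__ import print_function, a try/except shim that aliases the lazy itertools.imap/ifilter over the builtin names on Python 2, next(stream) in place of the Python-2-only stream.next(), and "except E as err" in place of "except E, err". A minimal sketch of the pattern; the function names read_header and parse_count are illustrative, not part of this repository:

    from __future__ import print_function    # print() syntax on Python 2
    import sys

    try:
        # Python 2: alias the lazy iterator variants over the builtins
        from itertools import imap as map
        from itertools import ifilter as filter
    except ImportError:
        # Python 3: the builtin map/filter are already lazy
        pass

    def read_header(stream):
        # next(stream, '') works on both; the Python-2-only spelling was stream.next()
        return next(stream, '')

    def parse_count(token):
        try:
            return int(token)
        except ValueError as err:    # replaces the Python-2-only 'except ValueError, err'
            print('bad count: %s' % err, file=sys.stderr)
            return 0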

gf_utils.py

#!/usr/bin/env python
# Python 2 and 3 compatible
from __future__ import print_function
import argparse, codecs, re, string, sys, time;
try:
    # Python 2: alias the lazy iterator variant over the builtin
    from itertools import imap as map;
except ImportError:
    # Python 3: the builtin map is already lazy
    pass;
from itertools import count;
from operator import itemgetter;

import pgf;

class Lexer(object):
    def __init__(self, lang='None', grammar=None, gflang=None):
        import translation_pipeline;
        lexers = {'None': self.lexerI, \
                'Eng': self.lexerI, \
                'Chi': self.lexerChi, \
                'Translator': translation_pipeline.pipeline_lexer, \
                'Web': self.lexerWeb
                };
        if grammar:
            self._pgf = grammar;
            self._lang = gflang;
        self.tokenize = lexers[lang];
        return;

    def lexerI(self, sentence):
        return sentence.rstrip(string.whitespace+string.punctuation);

    def lexerChi(self, sentence):
        # group consecutive ASCII characters into tokens; every non-ASCII
        # (CJK) character becomes a token of its own
        tokens, idx, n = [], 0, len(sentence);
        prev = True;
        while idx < n:
            if sentence[idx] in string.whitespace:
                prev = True;
                idx += 1;
                continue;
            if 0 < ord(sentence[idx]) < 128:
                if sentence[idx] in string.punctuation:
                    prev = True;
                if prev:
                    tokens.append( sentence[idx] );
                    prev = False;
                else:
                    tokens[-1] = tokens[-1]+sentence[idx];
            else:
                prev = True;
                tokens.append( sentence[idx] );
            idx += 1;
        return ' '.join(tokens);

    def lexerWeb(self, sentence):
        tokensList = re.split(r'\s+?', sentence.strip());
        # lowercase capitalized tokens that the grammar knows in lowercase
        for idx, token in enumerate(tokensList):
            if not token[0].isupper():
                continue;
            lowertoken = tokensList[idx].lower();
            count = 0;
            for analysis in self._pgf.languages[self._lang].lookupMorpho(lowertoken):
                count += 1;
            tokensList[idx] = lowertoken if count else token;
        # try de-hyphenated variants of unknown hyphenated tokens
        for idx, token in enumerate(tokensList):
            if token.find('-') == -1:
                continue;
            count = 0;
            for analysis in self._pgf.languages[self._lang].lookupMorpho(token):
                count += 1;
            if count:
                continue;
            token = tokensList[idx].replace('-', '');
            for analysis in self._pgf.languages[self._lang].lookupMorpho(token):
                count += 1;
            if count:
                tokensList[idx] = token;
                continue;
            token = tokensList[idx].replace('-', ' ');
        return ' '.join(tokensList);

def postprocessor(sentence):
    if sentence == None:
        return '';
    if sentence.startswith('* ') or sentence.startswith('% '):
        sentence = sentence[2:];
    sentence = sentence.replace(' &+ ', '');
    sentence = sentence.replace('<+>', ' ');
    return sentence;

def readJohnsonRerankerTrees(inputStream):
    endOfParse = False;
    while True:
        # next(stream, '') works on both Python 2 and 3 and returns ''
        # at end of input instead of raising StopIteration
        sentheader = next(inputStream, '');
        if sentheader == '':
            break;
        parsescount, sentidx = map(int, sentheader.strip().split());
        parsesBlock = [];
        for i in range(parsescount):
            parseprob = next(inputStream, '');
            if parseprob.strip() == '':
                endOfParse = True;
                break;
            parse = next(inputStream, '');
            parsesBlock.append((float(parseprob.strip()), pgf.readExpr(parse.strip())));
        yield sentidx, parsesBlock;
        if not endOfParse:
            _ = next(inputStream, '');
        endOfParse = False;

def readMosesNbestFormat(inputStream):
    transBlock = [];
    currentHypothesisId = 0;
    while True:
        line = next(inputStream, '');
        if line == '':
            break;
        fields = line.strip().split('|||');
        if str(fields[0].strip()) != str(currentHypothesisId):
            yield currentHypothesisId, transBlock;
            transBlock = [];
            currentHypothesisId = int(fields[0]);
        transBlock.append( (tuple(map(float, \
                [val.strip() for val in fields[3].split()])), \
                fields[1].strip()) );
    if transBlock:
        # flush the hypotheses of the last block at end of input
        yield currentHypothesisId, transBlock;

def printJohnsonRerankerFormat(gfparsesList, sentids=count(1)):
    johnsonRepr = [];
    parseHash = {};
    for parse in sorted(gfparsesList, key=itemgetter(0)):
        if parse[1] not in parseHash:
            johnsonRepr.append( str(-1*parse[0]) );
            johnsonRepr.append( str(parse[1]) );
        parseHash.setdefault(parse[1], []).append(parse[0]);
    curid = next(sentids);
    if len(gfparsesList):
        johnsonRepr.insert(0, '%d %d' %(len(parseHash), curid));
    # diagnostic count of duplicate parses in K-best parsing (not reported)
    duplicateInstances = len(list(filter(lambda X: len(parseHash[X]) > 1, \
            parseHash.keys())));
    return '\n'.join(johnsonRepr)+'\n';

def printMosesNbestFormat(hypothesisList, sentids=count(1)):
    mosesRepr = [];
    sid = next(sentids);
    for hypScores, hypStr in hypothesisList:
        if not hasattr(hypScores, '__iter__'):
            hypScores = (hypScores, );
        mosesRepr.append("%d ||| %s ||| NULL ||| %s" \
                %(sid, hypStr, ' '.join('%.6f'%score for score in hypScores)) );
    return '\n'.join(mosesRepr);

def getKLinearizations(grammar, tgtlanguage, abstractParsesList, K=10):
    generator = grammar.languages[tgtlanguage].linearizeAll;
    for parsesBlock in abstractParsesList:
        kBestTrans = [];
        for parseprob, parse in parsesBlock:
            for linstring in generator(parse, n=K):
                kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
        yield kBestTrans;

def getKBestParses(grammar, language, K, callbacks=[], \
        serializable=False, sentids=count(1), max_length=50):
    parser = grammar.languages[language].parse;
    import translation_pipeline;
    callbacks_PN = translation_pipeline.parseNames;
    callbacks_Symb = translation_pipeline.parseUnknown;
    def worker(sentence):
        sentence = sentence.strip();
        curid = next(sentids);
        tstart = time.time();
        kBestParses = [];
        parseScores = {};
        if len(sentence.split()) > max_length:
            # temporary hack to make sure the parser does not get
            # killed for very long sentences
            tend, err = time.time(), \
                    "Sentence too long (%d tokens). Might potentially run out of memory" \
                    %(len(sentence.split()));
            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
            return tend-tstart, kBestParses;

        # with the modified API for callbacks, each callback function has
        # to be freshly created for each sentence; otherwise they do not
        # work
        try:
            callbacks = [('PN', callbacks_PN(grammar, language, sentence)), \
                    ('Symb', callbacks_Symb(grammar, language, sentence))];
            for parseidx, parse in enumerate(parser(sentence, \
                    heuristics=0, callbacks=callbacks)):
                parseScores[parse[0]] = True;
                kBestParses.append((parse[0], str(parse[1]) if serializable \
                        else parse[1]));
                if parseidx == K-1:
                    break;
            tend = time.time();
            print('%d\t%.4f' %(curid, tend-tstart), file=sys.stderr);
            return tend-tstart, kBestParses;
        except pgf.ParseError as err:
            tend = time.time();
            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
            return tend-tstart, kBestParses;
        except UnicodeEncodeError as err:
            tend = time.time();
            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
            return tend-tstart, kBestParses;
    return worker;

def pgf_parse(args):
    grammar = pgf.readPGF(args.pgfgrammar);
    preprocessor = Lexer().tokenize;
    inputSet = map(preprocessor, args.inputstream);
    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
    inputSet = map(web_preprocessor, inputSet);
    outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1]));
    parser = getKBestParses(grammar, args.srclang, 1);

    sentidx = 0;
    for ptime, parsesBlock in map(parser, inputSet):
        sentidx += 1;
        print("%d\t%f\t%s" %(sentidx, ptime, \
                str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''), \
                file=args.outputstream);
    return;

def pgf_kparse(args):
    grammar = pgf.readPGF(args.pgfgrammar);
    preprocessor = Lexer().tokenize;
    inputSet = map(preprocessor, args.inputstream);
    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
    inputSet = map(web_preprocessor, inputSet);
    outputPrinter = printJohnsonRerankerFormat;
    parser = getKBestParses(grammar, args.srclang, args.K);

    sentidx = 0;
    for ptime, parsesBlock in map(parser, inputSet):
        sentidx += 1;
        strParses = str(outputPrinter(parsesBlock));
        if not (strParses == '\n'):
            print(strParses, file=args.outputstream);
    return;

def pgf_linearize(args):
    grammar = pgf.readPGF(args.pgfgrammar);
    def parse_line(line):
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
        except ValueError:
            print("Line not in proper format: %s" %(line), file=sys.stderr);
            return None;
        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
                else (0, '');
        return (int(sentid), float(parsetime), float(parseprob), \
                pgf.readExpr(abstree) if abstree else None);

    inputSet = map(parse_line, args.inputstream);
    outputPrinter = postprocessor;
    linearizer = grammar.languages[args.tgtlang].linearize;
    for fields in inputSet:
        if fields is None:
            # skip malformed lines instead of crashing on them
            continue;
        sentid, _, _, abstree = fields;
        if abstree:
            print(str(outputPrinter(linearizer(abstree))), \
                    file=args.outputstream);
        else:
            print("", file=args.outputstream);
    return;

def pgf_klinearize(args):
    grammar = pgf.readPGF(args.pgfgrammar);
    inputSet = [(sentid, parsesBlock) \
            for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
    outputPrinter = printMosesNbestFormat;
    sentIdsList = map(itemgetter(0), inputSet);
    parsesBlocks = map(itemgetter(1), inputSet);

    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
        strTrans = str(outputPrinter(transBlock, sentIdsList));
        if strTrans:
            print(strTrans, file=args.outputstream);
    return;

def cmdLineParser():
    argparser = argparse.ArgumentParser(prog='gf_utils.py', \
            description='Examples for carrying out (K-best) parsing, '
                    'translation and linearization using GF C runtime.');

    subparsers = argparser.add_subparsers();
    parser = subparsers.add_parser('parse', help='GF parsing of sentences');
    kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
    linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax trees');
    klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');

    parser.set_defaults(func=pgf_parse);
    parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
            help='PGF Grammar file');
    parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
            help='Start symbol in the grammar');
    parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
            help='Source language');
    parser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
            type=argparse.FileType(mode='r'), default=sys.stdin, \
            help='Input file');
    parser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
            type=argparse.FileType(mode='w'), default=sys.stdout, \
            help='Output file');

    kparser.set_defaults(func=pgf_kparse);
    kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
            help='PGF Grammar file');
    kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
            help='Start symbol in the grammar');
    kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
            help='Source language');
    kparser.add_argument('-K', dest='K', required=True, type=int, \
            help='K value for multiple parses');
    kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
            type=argparse.FileType(mode='r'), default=sys.stdin, \
            help='Input file');
    kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
            type=argparse.FileType(mode='w'), default=sys.stdout, \
            help='Output file');

    linearizer.set_defaults(func=pgf_linearize);
    linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
            help='PGF Grammar file');
    linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
            help='Target language');
    linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
            type=argparse.FileType(mode='r'), default=sys.stdin, \
            help='Input file');
    linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
            type=argparse.FileType(mode='w'), default=sys.stdout, \
            help='Output file');

    klinearizer.set_defaults(func=pgf_klinearize);
    klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
            help='PGF Grammar file');
    klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
            help='Target language');
    klinearizer.add_argument('-K', '--kbest', dest='K', required=True, type=int, \
            help='K value for multiple linearizations');
    klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
            type=argparse.FileType(mode='r'), default=sys.stdin, \
            help='Input file');
    klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
            type=argparse.FileType(mode='w'), default=sys.stdout, \
            help='Output file');
    return argparser;

if __name__ == '__main__':
    args = cmdLineParser().parse_args(sys.argv[1:]);
    args.func(args);
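
The subcommands above can also be driven programmatically. A hypothetical driver sketch, assuming a grammar file Foods.pgf with a concrete syntax FoodsEng (placeholder names, not files in this repository):

    from __future__ import print_function
    import pgf
    import gf_utils

    grammar = pgf.readPGF('Foods.pgf')                        # placeholder grammar
    tokenize = gf_utils.Lexer('Web', grammar, 'FoodsEng').tokenize
    parse = gf_utils.getKBestParses(grammar, 'FoodsEng', 5)   # 5-best parse worker
    elapsed, parses = parse(tokenize('this fish is delicious'))
    for prob, tree in parses:
        print('%f\t%s' % (prob, tree))

This is the same flow that "python gf_utils.py kparse -g Foods.pgf -s FoodsEng -K 5" runs end to end, with timing reported on stderr.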

translation_pipeline.py

#!/usr/bin/env python
# Python 2 and 3 compatible
from __future__ import print_function
import argparse, codecs, copy, itertools, logging, math, operator, os, os.path, re, string, sys, time;
try:
    # Python 2: alias the lazy iterator variants over the builtins
    from itertools import imap as map;
    from itertools import ifilter as filter;
except ImportError:
    # Python 3: the builtin map/filter are already lazy
    pass;
import xml.etree.ElementTree as etree;

import pgf;
import gf_utils;

# http://snipplr.com/view/25657/indent-xml-using-elementtree/
def indentXMLNodes(elem, level=0):
    i = "\n" + level*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indentXMLNodes(elem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i

def readTranslationPipelineOptions(propsfile, default_namespace):
    with codecs.open(propsfile, 'r', 'utf-8') as infile:
        for line in infile:
            if not line.strip():
                continue;
            key, value = line.strip().split('=', 1);
            key, value = key.strip(), value.strip();
            if key == 'srclang':
                default_namespace.srclang = value;
            elif key == 'tgtlangs':
                # comma-separated list of target languages
                default_namespace.tgtlangs = [val.strip() for val in value.split(',')];
            elif key == 'input':
                default_namespace.input = value;
            elif key == 'format':
                default_namespace.format = value;
            elif key == 'exp_directory':
                default_namespace.exp_directory = value;
            else:
                logging.warning("Unknown option %s found in props file. Ignoring and proceeding." %(key));
                continue;
    return default_namespace;

def sgmReader(sgmDoc):
    root = sgmDoc.getroot();
    for element in root.iter():
        if element.text is not None and element.text.strip():
            # yield text, not bytes, so the downstream lexers work on
            # both Python 2 and 3
            yield element.text.strip();

def addToSgm(sgmDoc, strItem):
    for node in sgmDoc.findall('.//seg'):
        if not node.text.strip():
            # strItem is already text (unicode) on both Python 2 and 3
            node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
            return;
    logging.error("No more nodes available for adding content");
    return;

def sgmWriter(sgmDoc):
    indentXMLNodes( sgmDoc.getroot() );
    # decode so the caller always receives text, not bytes
    return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml').decode('utf-8');

def getXMLSkeleton(sgmDoc, tgtlang):
    skeletonDoc = copy.deepcopy(sgmDoc);
    root = skeletonDoc.getroot();
    root.tag = 'tstset';
    root.attrib['trlang'] = tgtlang[-3:];
    root.find('doc').attrib['sysid'] = tgtlang[:-3];
    for node in root.findall('.//seg'):
        node.text = '';
    return skeletonDoc;

def pipeline_lexer(sentence):
    tokens = sentence.strip().split();
    # strip sentence-final punctuation tokens ...
    idx = len(tokens)-1;
    while idx >= 0:
        if tokens[idx] in ".?!)":
            idx -= 1;
        else:
            break;
    tokens = tokens[:idx+1];
    # ... and sentence-initial quotes and brackets
    idx = 0;
    while idx < len(tokens):
        if tokens[idx] in "'\"(":
            idx += 1;
        else:
            break;
    tokens = tokens[idx:];
    return ' '.join(tokens);

def clean_gfstrings(sentence):
    absFuncName = re.compile(r'\[[^]]+?\]');
    untranslatedEntries = {};
    for entry in re.findall(absFuncName, sentence):
        untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1;
    for entry in untranslatedEntries:
        while untranslatedEntries[entry] > 1:
            sentence = sentence.replace(entry, '', 1);
            untranslatedEntries[entry] -= 1;
        sentence = sentence.replace(entry, \
                ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 \
                else '');
    return ' '.join( sentence.split() );

def parseNames(grammar, language, sentence):
    def callback(lin_idx, start):
        moving_start, end, eot = start, len(sentence), True;
        if moving_start < end and (not sentence[moving_start].isupper()):
            return None;
        while moving_start < end:
            if sentence[moving_start] in string.whitespace:
                eot = True;
            elif eot and sentence[moving_start].isupper():
                eot = False;
            elif eot and (not sentence[moving_start].isupper()):
                end = moving_start-1;
                break;
            moving_start += 1;
        possible_name = sentence[start:end].strip();
        if possible_name:
            if language.endswith('Eng') and \
                    (possible_name == "I" or possible_name == "I'm"):
                return None;
            elif language.endswith('Eng') and possible_name.endswith("'s"):
                end_idx = possible_name.rfind("'s");
                if end_idx != -1:
                    possible_name = possible_name[:end_idx].strip();
                    end -= 2;
                if not possible_name:
                    return None;
            expr, prob = None, None;
            for analysis in grammar.languages[language].lookupMorpho(possible_name):
                category = grammar.functionType(analysis[0]).cat;
                # the explicit None check matters on Python 3, where
                # None < float raises a TypeError
                if prob is None or prob < analysis[-1]:
                    if category == "PN":
                        expr, prob = pgf.Expr(analysis[0], []), analysis[-1];
                    elif category == "Weekday":
                        expr, prob = pgf.Expr("weekdayPN", \
                                [pgf.Expr(analysis[0], [])]), analysis[-1];
                    elif category == "Month":
                        expr, prob = pgf.Expr("monthPN", \
                                [pgf.Expr(analysis[0], [])]), analysis[-1];
                    elif category == "Language":
                        return None;
            # generic named entity
            if expr == None:
                expr = pgf.Expr(possible_name);
                expr = pgf.Expr("MkSymb", [expr]);
                expr = pgf.Expr("SymbPN", [expr]);
            return (expr, 0, end);
        return None;
    return callback;

def parseUnknown(grammar, language, sentence):
    def callback(lin_idx, start):
        moving_start, end, eot = start, len(sentence), True;
        # added to deal with segmentation errors like may => ma_N + Symb y
        isNewToken = (moving_start == 0) or \
                (moving_start > 1 and sentence[moving_start-1].isspace());
        if moving_start < end and (not sentence[moving_start].isupper()):
            while moving_start < end:
                if sentence[moving_start] in string.whitespace:
                    end = moving_start;
                    break;
                moving_start += 1;
            unknown_word = sentence[start:end].strip();
            if unknown_word and isNewToken:
                count = 0;
                for analysis in grammar.languages[language].lookupMorpho(unknown_word):
                    count += 1;
                if not count:
                    expr = pgf.Expr("MkSymb", [pgf.Expr(unknown_word)]);
                    return (expr, 0, end);
        return None;
    return callback;

def parseTester(grammar, language, sentence):
    def callback(lin_idx, start):
        if start < len(sentence):
            return (pgf.Expr(sentence[start]), 0, start+1);
        return None;
    return callback;

def translateWordsAsChunks(grammar, language, tgtlanguages, word):
    parser = grammar.languages[language].parse;
    linearizersList = dict((lang, grammar.languages[lang].linearize) \
            for lang in tgtlanguages);
    translations = [];
    try:
        for parseidx, parse in enumerate( parser(word) ):
            for lang in tgtlanguages:
                trans = linearizersList[lang](parse[1]);
                translations.append((lang, gf_utils.postprocessor(\
                        trans.strip() if trans else '')));
            break;
    except pgf.ParseError:
        return [];
    return translations;

def translateWord(grammar, language, tgtlanguages, word):
    possible_translations = translateWordsAsChunks(grammar, language, \
            tgtlanguages, word);
    if len(possible_translations):
        return possible_translations;

    lowerword = word.lower();
    try:
        partialExprList = grammar.languages[language].parse(word, cat='Chunk');
        for expr in partialExprList:
            return [(lang, gf_utils.postprocessor(\
                    grammar.languages[lang].linearize(expr[1]))) \
                    for lang in tgtlanguages];
    except pgf.ParseError:
        morphAnalysis = grammar.languages[language].lookupMorpho(word) + \
                grammar.languages[language].lookupMorpho(lowerword);
        for morph in morphAnalysis:
            countPositiveLanguages = list(filter(None, \
                    [grammar.languages[lang].hasLinearization(morph[0]) \
                    for lang in tgtlanguages]));
            if len(countPositiveLanguages) > 0.5*len(tgtlanguages):
                return [(lang, \
                        gf_utils.postprocessor(grammar.languages[lang].linearize(pgf.readExpr(morph[0])))) \
                        for lang in tgtlanguages];
    return [(lang, word) for lang in tgtlanguages];

def translationByLookup(grammar, language, tgtlanguages, sentence):
    parser = grammar.languages[language].parse;
    linearizersList = dict([(lang, grammar.languages[lang].linearize) \
            for lang in tgtlanguages]);
    queue = [sentence.strip().split()];
    transChunks = {};
    while len(queue):
        head = queue[0];
        if not len(head):
            pass;
        elif len(head) == 1 and head[0].strip():
            for lang, wordchoice in translateWord(grammar, language, \
                    tgtlanguages, head[0]):
                transChunks.setdefault(lang, []).append(\
                        gf_utils.postprocessor(wordchoice));
        else:
            try:
                for parseidx, parse in enumerate(parser(' '.join(head))):
                    for lang in tgtlanguages:
                        if linearizersList[lang](parse[1]) == None:
                            transChunks.setdefault(lang, []).append(' ');
                        else:
                            transChunks.setdefault(lang, []).append(\
                                    gf_utils.postprocessor(linearizersList[lang](parse[1]).strip()));
                    break;
            except pgf.ParseError as err:
                # str(err) works on both Python 2 and 3; err.message is
                # Python 2 only
                unseenToken = str(err).strip().split()[-1][1:-1];
                idx = head.index(unseenToken);
                # split the chunk around the unseen token and retry
                queue.insert(1, head[:idx] );
                queue.insert(2, [head[idx]] );
                queue.insert(3, head[idx+1:] );
        del queue[0];
    for lang in tgtlanguages:
        yield (lang, ' '.join(transChunks[lang]));

def pipelineParsing(grammar, language, sentences, K=20):
    buf, sentences = itertools.tee(sentences, 2);
    parser = gf_utils.getKBestParses(grammar, language, K);
    for sent, (ptime, parsesBlock) in zip(buf, map(parser, sentences)):
        yield (sent, parsesBlock);

def translation_pipeline(props):
    if props.propsfile:
        props = readTranslationPipelineOptions(props.propsfile, props);

    # ugly hack for K-best translation: the K-best output format is txt only
    if props.bestK != 1:
        props.format = 'txt';
    if not os.path.isdir( props.exp_directory ):
        logging.info("Creating output directory: %s" %(props.exp_directory));
        os.makedirs(props.exp_directory);

    if not props.srclang:
        logging.critical("Mandatory option source-lang missing. Cannot determine source language.");
        sys.exit(1);
    grammar = pgf.readPGF(props.pgffile);

    sourceLanguage = list(filter(None, [lang if lang[-3:] == props.srclang \
            else '' for lang in grammar.languages.keys()]))[0];
    logging.info("Translating from %s" %(sourceLanguage));

    if len(props.tgtlangs):
        target_langs = props.tgtlangs;
    else:
        target_langs = list(filter(None, [lang[-3:] if lang != sourceLanguage \
                else '' for lang in grammar.languages.keys()]));
    targetLanguages = list(filter(None, [lang if lang[-3:] in target_langs \
            else '' for lang in grammar.languages.keys()]));
    logging.info("Translating into the following languages: %s" %(','.join(targetLanguages)));

    K = props.bestK if props.bestK != 1 else 20;  # by default we look for the 20 best parses
    bestK = props.bestK;

    if not props.input:
        logging.info( "Input file name missing. Reading input from stdin." );
        inputStream = sys.stdin;
        outputPrefix = str(os.getpid());
    else:
        inputStream = codecs.open(props.input, 'r');
        outputPrefix = os.path.splitext( os.path.split(props.input)[1] )[0];

    if props.format == 'sgm':
        inputDoc = etree.parse(inputStream);
        reader = sgmReader;
        skeletonDoc = getXMLSkeleton;
        addItem = addToSgm;
        writer = sgmWriter;
    elif props.format == 'txt':
        logging.info("Input format is txt. Assuming one-sentence-per-line format.");
        inputDoc = inputStream;
        reader = lambda X: X;
        skeletonDoc = lambda X, lang: list();
        addItem = lambda X, y: list.append(X, y);
        writer = lambda X: ('\n'.join(X) if bestK == 1 else \
                '\n'.join(map(gf_utils.printMosesNbestFormat, X)));

    translationBlocks = {};
    for tgtlang in targetLanguages+['abstract']:
        translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang);
    preprocessor = pipeline_lexer;
    postprocessor = clean_gfstrings;

    logging.info( "Parsing text in %s" %(sourceLanguage) );
    # 1. Get abstract trees for sentences in the source language.
    tokenized_sentences = map(preprocessor, reader(inputDoc));
    web_lexer = gf_utils.Lexer('Web', grammar, sourceLanguage).tokenize;
    absParses = [parsesBlock for parsesBlock in \
            pipelineParsing(grammar, sourceLanguage, \
            map(web_lexer, tokenized_sentences), K)];

    logging.info( "Linearizing into %s" %(','.join(targetLanguages)) );
    # 2. Linearize in all target languages.
    for idx, parsesBlock in enumerate( map(operator.itemgetter(1), absParses) ):
        translationBuffer = {};
        if not len(parsesBlock):
            # failed to parse; translate using lookup
            for tgtlang, translation in translationByLookup(grammar, sourceLanguage, \
                    targetLanguages, absParses[idx][0]):
                if bestK == 1:
                    addItem(translationBlocks[tgtlang], postprocessor(translation));
                else:
                    addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))]);
            addItem(translationBlocks['abstract'], '');
        else:
            bestTranslationIdx = 0;
            for tgtlang in targetLanguages:
                translationBuffer[tgtlang] = next(gf_utils.getKLinearizations(grammar, \
                        tgtlang, [parsesBlock], K=bestK));
                if bestK == 1:
                    for tidx, translation in enumerate(translationBuffer[tgtlang]):
                        if postprocessor(translation[1]).strip():
                            if tidx > bestTranslationIdx:
                                bestTranslationIdx = tidx;
                            break;
            for tgtlang in targetLanguages:
                if bestK == 1:
                    translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) \
                            if len(translationBuffer[tgtlang]) > bestTranslationIdx \
                            else ((None,), '');
                    abstract = str(parsesBlock[bestTranslationIdx][1]);
                else:
                    translation = translationBuffer[tgtlang] \
                            if len(translationBuffer[tgtlang]) \
                            else [];
                    abstract = parsesBlock;
                addItem(translationBlocks[tgtlang], translation);
            addItem(translationBlocks['abstract'], abstract);

    for tgtlang in targetLanguages+['abstract']:
        outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, \
                tgtlang[-3:] if tgtlang != 'abstract' else 'abstract', props.format) );
        logging.info( "Writing translations for %s to %s" %(tgtlang, outputFile) );
        with codecs.open(outputFile, 'w', encoding='utf-8') as outputStream:
            print(writer(translationBlocks[tgtlang]), file=outputStream);
    return;

def cmdLineParser():
    argparser = argparse.ArgumentParser(prog='translation_pipeline.py', description='Run the GF translation pipeline on standard test-sets');
    argparser.add_argument('-g', '--pgf', dest='pgffile', required=True, help='PGF grammar file to run the pipeline');
    argparser.add_argument('-s', '--source', dest='srclang', default='', help='Source language of input sentences');
    argparser.add_argument('-t', '--target', dest='tgtlangs', nargs='*', default=[], help='Target languages to linearize (default is all other languages)');
    argparser.add_argument('-i', '--input', dest='input', default='', help='input file (default will accept STDIN)');
    argparser.add_argument('-e', '--exp', dest='exp_directory', default=os.getcwd(), help='experiment directory to write translation files');
    argparser.add_argument('-f', '--format', dest='format', default='txt', choices=['txt', 'sgm'], help='input file format (output files will be written in the same format)');
    argparser.add_argument('-p', '--props', dest='propsfile', default='', help='properties file for the translation pipeline (specify the above arguments in a file)');
    argparser.add_argument('-K', dest='bestK', type=int, default=1, help='K value for K-best translation');
    return argparser;

if __name__ == '__main__':
    logging.basicConfig(level='INFO');
    pipelineEnv = cmdLineParser().parse_args(sys.argv[1:]);
    translation_pipeline(pipelineEnv);
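
For reference, the properties file accepted via -p/--props is plain "key = value" lines with the keys read by readTranslationPipelineOptions; the values and file names below are placeholders, not files in this repository:

    srclang = Eng
    tgtlangs = Fre, Ita
    input = testset.txt
    format = txt
    exp_directory = exp/run1

This would be run as "python translation_pipeline.py -g <grammar>.pgf -p pipeline.props"; note that -g/--pgf must still be given on the command line, since the props file defines no key for the grammar.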