forked from GitHub/gf-core
python examples compatible with both Python 2 and 3
@@ -1,283 +1,369 @@
 #!/usr/bin/env python
 
+# Python 2 and 3 compatible
+from __future__ import print_function
+
 """
 """
 
-import argparse, re, string, sys, time;
-from itertools import imap, count;
+import argparse, codecs, re, string, sys, time;
+try:
+    from itertools import imap as map;
+    from itertools import count;
+except ImportError:
+    from itertools import count;
+    pass;
 from operator import itemgetter;
 
 import pgf;
 
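The try/except import shim above is the crux of the 2-and-3 compatibility: on Python 2 the lazy itertools.imap is bound to the name map, while on Python 3 the ImportError is swallowed and the built-in (already lazy) map is kept. A minimal, self-contained sketch of the same idiom:

    try:
        from itertools import imap as map  # Python 2: make map lazy
    except ImportError:
        pass                               # Python 3: built-in map is already lazy

    lazy = map(int, ['1', '2', '3'])       # an iterator under both interpreters
    print(list(lazy))                      # -> [1, 2, 3]
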
-def lexerI(sentence):
-    return sentence.rstrip(string.whitespace+string.punctuation);
-
-def lexerChi(sentence):
-    sentence = sentence.decode('utf-8');
-    tokens, idx, n = [], 0, len(sentence);
-    prev = True;
-    while idx < n:
-        if sentence[idx] in string.whitespace:
-            prev = True;
-            idx += 1;
-            continue;
-        if 0 < ord(sentence[idx]) < 128:
-            if sentence[idx] in string.punctuation:
-                prev = True;
-            if prev:
-                tokens.append( sentence[idx] );
-                prev = False;
-            else:
-                tokens[-1] = tokens[-1]+sentence[idx];
-        else:
-            prev = True;
-            tokens.append( sentence[idx] );
-        idx += 1;
-    return ' '.join(tokens).encode('utf-8');
-
-def lexer(lang='translator'):
-    if lang[-3:] == 'Eng':
-        return lexerI;
-    elif lang[-3:] == 'Chi':
-        return lexerChi;
-    elif lang == 'translator':
-        import translation_pipeline;
-        return translation_pipeline.pipeline_lexer;
-    else:
-        return lexerI;
+class Lexer(object):
+    def __init__(self, lang='None', grammar=None, gflang=None):
+        import translation_pipeline;
+        lexers = {'None': self.lexerI, \
+                  'Eng': self.lexerI, \
+                  'Chi': self.lexerChi, \
+                  'Translator': translation_pipeline.pipeline_lexer, \
+                  'Web': self.lexerWeb
+                 };
+        if grammar:
+            self._pgf = grammar;
+            self._lang = gflang;
+
+        self.tokenize = lexers[lang];
+        return;
+
+    def lexerI(self, sentence):
+        #return sentence.decode('utf-8').rstrip(string.whitespace+string.punctuation).encode('utf-8');
+        return sentence.rstrip(string.whitespace+string.punctuation);
+
+    def lexerChi(self, sentence):
+        #sentence = sentence.decode('utf-8');
+        tokens, idx, n = [], 0, len(sentence);
+        prev = True;
+        while idx < n:
+            if sentence[idx] in string.whitespace:
+                prev = True;
+                idx += 1;
+                continue;
+            if 0 < ord(sentence[idx]) < 128:
+                if sentence[idx] in string.punctuation:
+                    prev = True;
+                if prev:
+                    tokens.append( sentence[idx] );
+                    prev = False;
+                else:
+                    tokens[-1] = tokens[-1]+sentence[idx];
+            else:
+                prev = True;
+                tokens.append( sentence[idx] );
+            idx += 1;
+        return ' '.join(tokens);#.encode('utf-8');
+
+    def lexerWeb(self, sentence):
+        tokensList = re.split('\s+?', sentence.strip());
+        for idx, token in enumerate(tokensList):
+            if not token[0].isupper():
+                continue;
+            lowertoken = tokensList[idx].lower();
+            count = 0;
+            for analysis in self._pgf.languages[self._lang].lookupMorpho(lowertoken):
+                count += 1;
+            tokensList[idx] = lowertoken if count else token;
+        for idx, token in enumerate(tokensList):
+            if token.find('-') == -1:
+                continue;
+            count = 0;
+            for analysis in self._pgf.languages[self._lang].lookupMorpho(token):
+                count += 1;
+            if count:
+                continue;
+            token = tokensList[idx].replace('-', '');
+            for analysis in self._pgf.languages[self._lang].lookupMorpho(token):
+                count += 1;
+            if count:
+                tokensList[idx] = token;
+                continue;
+            token = tokensList[idx].replace('-', ' ');
+        return ' '.join(tokensList);
 
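The free functions lexerI/lexerChi/lexer are folded into a single Lexer class whose constructor selects the tokenizer by name; the 'Web' lexer additionally needs the grammar and a concrete-syntax name. A hedged usage sketch (the grammar file and language names below are placeholders, not part of the commit):

    import pgf
    from gf_utils import Lexer

    print(Lexer('Chi').tokenize(u'我喜欢GF。'))   # roughly: '我 喜 欢 GF 。'

    grammar = pgf.readPGF('App.pgf')             # hypothetical .pgf file
    web_tok = Lexer('Web', grammar, 'AppEng').tokenize
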
 def postprocessor(sentence):
     if sentence == None:
         return '';
     if sentence.startswith('* ') or sentence.startswith('% '):
         sentence = sentence[2:];
     sentence = sentence.replace(' &+ ', '');
     sentence = sentence.replace('<+>', ' ');
     return sentence;
 
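postprocessor strips artefacts from a GF linearization: a leading '* ' or '% ' flag, the ' &+ ' token-glue, and the '<+>' separator. For example (made-up strings):

    from gf_utils import postprocessor

    print(postprocessor('* der &+ selbe Mann'))  # -> 'derselbe Mann'
    print(postprocessor('10<+>km'))              # -> '10 km'
    print(postprocessor(None))                   # -> ''
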
 def readJohnsonRerankerTrees(inputStream):
     endOfParse = False;
     while True:
         sentheader = inputStream.next();
         if sentheader == '':
             break;
         parsescount, sentidx = map(int, sentheader.strip().split());
         parsesBlock = [];
         for i in xrange(parsescount):
             parseprob = inputStream.next();
             if parseprob.strip() == '':
                 endOfParse = True;
                 break;
             parse = inputStream.next();
-            parsesBlock.append( (float(parseprob.strip()), pgf.readExpr(parse.strip())) );
+            parsesBlock.append((float(parseprob.strip()), pgf.readExpr(parse.strip())));
         yield sentidx, parsesBlock;
         if not endOfParse:
             _ = inputStream.next();
         endOfParse = False;
 
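Note that inputStream.next() (and the xrange above) only exist on Python 2; the portable spelling is the built-in next(inputStream) and range. A sketch of the same read loop in portable form (not part of the commit):

    def read_nonempty_lines(stream):
        while True:
            line = next(stream)   # works on 2 and 3; stream.next() is 2-only
            if line == '':
                break
            yield line.rstrip('\n')
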
 def readMosesNbestFormat(inputStream):
     transBlock = [];
     currentHypothesisId = 0;
     while True:
         line = inputStream.next();
         if line == '':
             break;
         fields = line.strip().split('|||');
         if str(fields[0].strip()) != str(currentHypothesisId):
             yield currentHypothesisId, transBlock;
             transBlock = [];
             currentHypothesisId = int(fields[0]);
-        transBlock.append( (map(float, tuple([val.strip() for val in fields[3].split()])), fields[1].strip()) );
+        transBlock.append( (map(float, \
+                tuple([val.strip() for val in fields[3].split()])), \
+                fields[1].strip()) );
 
-def printJohnsonRerankerFormat(gfparsesList, sentid=count(1)):
+def printJohnsonRerankerFormat(gfparsesList, sentids=count(1)):
     johnsonRepr = [];
     parseHash = {};
     for parse in sorted(gfparsesList, key=itemgetter(0)):
-        if not parseHash.has_key(parse[1]):
+        if parse[1] not in parseHash:
             johnsonRepr.append( str(-1*parse[0]) );
             johnsonRepr.append( str(parse[1]) );
         parseHash.setdefault(parse[1], []).append(parse[0]);
-    curid = sentid.next();
+    curid = next(sentids);
     if len(gfparsesList):
         johnsonRepr.insert(0, '%d %d' %(len(parseHash.values()), curid));
-    duplicateInstances = len(filter(lambda X: len(parseHash[X]) > 1, parseHash.keys()));
-    #if duplicateInstances: print >>sys.stderr, "%d duplicate parses found in K-best parsing" %(duplicateInstances);
+    duplicateInstances = len(list(filter(lambda X: len(parseHash[X]) > 1, \
+            parseHash.keys())));
     return '\n'.join(johnsonRepr)+'\n';
 
-def printMosesNbestFormat(hypothesisList, sentid=count(1)):
+def printMosesNbestFormat(hypothesisList, sentids=count(1)):
     mosesRepr = [];
-    sid = sentid.next();
+    sid = next(sentids);
     for hypScores, hypStr in hypothesisList:
         if not hasattr(hypScores, '__iter__'):
             hypScores = (hypScores, );
-        mosesRepr.append("%d ||| %s ||| NULL ||| %s" %(sid, hypStr, ' '.join(['%.6f'%score for score in hypScores])));
+        mosesRepr.append("%d ||| %s ||| NULL ||| %s" \
+                %(sid, hypStr, ' '.join('%.6f'%score for score in hypScores)) );
     return '\n'.join(mosesRepr);
 
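The emitted lines follow the Moses n-best list convention, sentence-id ||| hypothesis ||| feature-name ||| scores. For instance:

    from itertools import count
    from gf_utils import printMosesNbestFormat

    hyps = [((-1.25,), 'a first hypothesis'), ((-2.5,), 'a second one')]
    print(printMosesNbestFormat(hyps, count(7)))
    # 7 ||| a first hypothesis ||| NULL ||| -1.250000
    # 7 ||| a second one ||| NULL ||| -2.500000
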
 def getKLinearizations(grammar, tgtlanguage, abstractParsesList, K=10):
     generator = grammar.languages[tgtlanguage].linearizeAll;
     for parsesBlock in abstractParsesList:
         kBestTrans = [];
         for parseprob, parse in parsesBlock:
             for linstring in generator(parse, n=K):
                 kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
         yield kBestTrans;
 
-def getKBestParses(grammar, language, K, serializable=False, sentid=count(1), max_length=50):
+def getKBestParses(grammar, language, K, callbacks=[], \
+        serializable=False, sentids=count(1), max_length=50):
     parser = grammar.languages[language].parse;
-    import translation_pipeline
+    import translation_pipeline;
+    callbacks_PN = translation_pipeline.parseNames;
+    callbacks_Symb = translation_pipeline.parseUnknown;
     def worker(sentence):
         sentence = sentence.strip();
-        curid = sentid.next();
+        curid = next(sentids);
         tstart = time.time();
         kBestParses = [];
         parseScores = {};
         if len(sentence.split()) > max_length:
-            tend, err = time.time(), "Sentence too long (%d tokens). Might potentially run out of memory" %(len(sentence.split()));
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
-            return tend-tstart, kBestParses; # temporary hack to make sure parser does not get killed for very long sentences;
+            # temporary hack to make sure parser does not get
+            # killed for very long sentences;
+            tend, err = time.time(), \
+                    "Sentence too long (%d tokens). Might potentially run out of memory" \
+                    %(len(sentence.split()));
+            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
+            return tend-tstart, kBestParses;
+
+        # with modified API for callbacks, each callback function has to
+        # be freshly created for each sentence; otherwise, they do not
+        # work.
         try:
-            callbacks = [('PN', translation_pipeline.parseNames(grammar, args.srclang, sentence)), ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang, sentence))]
-            for parseidx, parse in enumerate( parser(sentence, heuristics=0, callbacks=callbacks) ):
+            callbacks = [('PN', callbacks_PN(grammar, language, sentence)),\
+                    ('Symb', callbacks_Symb(grammar, language, sentence))];
+            for parseidx, parse in enumerate(parser(sentence, \
+                    heuristics=0, callbacks=callbacks)):
                 parseScores[parse[0]] = True;
-                kBestParses.append( (parse[0], str(parse[1]) if serializable else parse[1]) );
-                if parseidx == K-1: break;
-                #if len(parseScores) >= K: break;
+                kBestParses.append((parse[0], str(parse[1]) if serializable \
+                        else parse[1]));
+                if parseidx == K-1:
+                    break;
             tend = time.time();
-            print >>sys.stderr, '%d\t%.4f' %(curid, tend-tstart);
+            print('%d\t%.4f' %(curid, tend-tstart), file=sys.stderr);
             return tend-tstart, kBestParses;
-        except pgf.ParseError, err:
+        except pgf.ParseError as err:
             tend = time.time();
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
+            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
             return tend-tstart, kBestParses;
-        except UnicodeEncodeError, err:
+        except UnicodeEncodeError as err:
             tend = time.time();
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
+            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
             return tend-tstart, kBestParses;
     return worker;
 
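getKBestParses returns a per-sentence worker closure; each call logs timing to stderr and collects up to K (probability, tree) pairs. A hedged sketch of its use, with placeholder grammar and concrete-syntax names:

    import pgf
    import gf_utils

    grammar = pgf.readPGF('App.pgf')                    # hypothetical .pgf
    parse_one = gf_utils.getKBestParses(grammar, 'AppEng', K=5)
    elapsed, parses = parse_one('this is a sentence')
    for prob, tree in parses:
        print(prob, tree)
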
 def pgf_parse(args):
     grammar = pgf.readPGF(args.pgfgrammar);
-    import translation_pipeline;
-    preprocessor = lexer();
-    inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) );
-    outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1])); #operator.itemgetter(1);
+    preprocessor = Lexer().tokenize;
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = map(preprocessor, args.inputstream);
+    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
+    inputSet = map(web_preprocessor, inputSet);
+    outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1]));
     parser = getKBestParses(grammar, args.srclang, 1);
+
     sentidx = 0;
-    for time, parsesBlock in imap(parser, inputSet):
+    for time, parsesBlock in map(parser, inputSet):
         sentidx += 1;
-        print >>args.outputstream, "%d\t%f\t%s" %(sentidx, time, str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else '');
+        print("%d\t%f\t%s" %(sentidx, time, \
+                str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''), \
+                file=args.outputstream);
     return;
 
 def pgf_kparse(args):
     grammar = pgf.readPGF(args.pgfgrammar);
-    import translation_pipeline;
-    preprocessor = lexer();
-    inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) );
+    preprocessor = Lexer().tokenize;
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = map(preprocessor, args.inputstream);
+    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
+    inputSet = map(web_preprocessor, inputSet);
     outputPrinter = printJohnsonRerankerFormat;
     parser = getKBestParses(grammar, args.srclang, args.K);
+
     sentidx = 0;
-    for time, parsesBlock in imap(parser, inputSet):
+    for time, parsesBlock in map(parser, inputSet):
         sentidx += 1;
         strParses = str(outputPrinter(parsesBlock));
         if not (strParses == '\n'):
-            print >>args.outputstream, strParses;
+            print(strParses, file=args.outputstream);
     return;
 
 def pgf_linearize(args):
     grammar = pgf.readPGF(args.pgfgrammar);
-    outputPrinter = postprocessor;
-    inputSet = [];
-    for line in args.inputstream:
-        try:
-            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
-        except ValueError:
-            print line.strip();
-        parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, '');
-        inputSet.append( (int(sentid), float(parsetime), float(parseprob), pgf.readExpr(abstree) if abstree else None) );
+    def parse_line(line):
+        try:
+            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
+        except ValueError:
+            print("Line not in proper format: %s" %(line), file=stderr);
+        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
+                else (0, '');
+        return ((int(sentid), float(parsetime), float(parseprob), \
+                pgf.readExpr(abstree) if abstree else None));
+
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = map(parse_line, (line for line in args.inputstream));
+    outputPrinter = postprocessor;
     linearizer = grammar.languages[args.tgtlang].linearize;
     for sentid, _, _, abstree in inputSet:
         if abstree:
-            print >>args.outputstream, str(outputPrinter(linearizer(abstree)));
+            print(str(outputPrinter(linearizer(abstree))), \
+                    file=args.outputstream);
         else:
-            print >>args.outputstream, "";
+            print("", file=args.outputstream);
     return;
 
 def pgf_klinearize(args):
     grammar = pgf.readPGF(args.pgfgrammar);
-    outputPrinter = printMosesNbestFormat;
-    inputSet = [(sentid, parsesBlock) for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
-    sentIdsList = imap(itemgetter(0), inputSet);
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = [(sentid, parsesBlock) \
+            for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
+    outputPrinter = printMosesNbestFormat;
+    sentIdsList = map(itemgetter(0), inputSet);
     parsesBlocks = map(itemgetter(1), inputSet);
+
     for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
         strTrans = str(outputPrinter(transBlock, sentIdsList));
         if strTrans:
-            print >>args.outputstream, strTrans;
+            print(strTrans, file=args.outputstream);
     return;
 
 def cmdLineParser():
-    argparser = argparse.ArgumentParser(prog='gf_utils.py', description='Examples for carrying out (K-best) parsing, translation and linearization using GF C runtime.');
+    argparser = argparse.ArgumentParser(prog='gf_utils.py', \
+            description='Examples for carrying out (K-best) parsing, \
+                    translation and linearization using GF C runtime.');
 
     subparsers = argparser.add_subparsers();
     parser = subparsers.add_parser('parse', help='GF parsing of sentences');
     kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
     linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax trees');
     klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');
 
     parser.set_defaults(func=pgf_parse);
     parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
             help='PGF Grammar file');
     parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
             help='Start symbol in the grammar');
     parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
             help='Source language');
-    parser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
-            help='Input file');
-    parser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
-            help='Output file');
+    parser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    parser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
 
     kparser.set_defaults(func=pgf_kparse);
     kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
             help='PGF Grammar file');
     kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
             help='Start symbol in the grammar');
     kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
             help='Source language');
-    kparser.add_argument('-K', dest='K', required=True, type=int, \
-            help='K value for multiple parses');
-    kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
-            help='Input file');
-    kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
-            help='Output file');
+    kparser.add_argument('-K', dest='K', required=True, \
+            type=int, \
+            help='K value for multiple parses');
+    kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
 
     linearizer.set_defaults(func=pgf_linearize);
     linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
             help='PGF Grammar file');
     linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
             help='Target language');
-    linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
-            help='Input file');
-    linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
-            help='Output file');
+    linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
 
     klinearizer.set_defaults(func=pgf_klinearize);
     klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
             help='PGF Grammar file');
     klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
             help='Target language');
-    klinearizer.add_argument('-K', dest='K', required=True, type=int, \
-            help='K value for multiple linearizations');
-    klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
-            help='Input file');
-    klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
-            help='Output file');
+    klinearizer.add_argument('-K', '--kbest', dest='K', required=True, \
+            type=int, \
+            help='K value for multiple linearizations');
+    klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
 
     return argparser;
 
 if __name__ == '__main__':
     args = cmdLineParser().parse_args(sys.argv[1:]);
     args.func(args);
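Putting the four subcommands together, typical invocations look like this (the grammar file and concrete-syntax names are placeholders):

    python gf_utils.py parse      -g App.pgf -s AppEng -i sents.txt -o parses.txt
    python gf_utils.py kparse     -g App.pgf -s AppEng -K 10 -i sents.txt
    python gf_utils.py linearize  -g App.pgf -t AppGer -i parses.txt
    python gf_utils.py klinearize -g App.pgf -t AppGer -K 10 -i kparses.txt
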
@@ -1,6 +1,18 @@
 #!/usr/bin/env python
 
+# Python 2 and 3 compatible
+from __future__ import print_function
+
+"""
+"""
+
 import argparse, codecs, copy, itertools, logging, math, operator, os, os.path, re, string, sys, time;
+try:
+    from itertools import imap as map;
+    from itertools import ifilter as filter;
+except ImportError:
+    pass;
+
 import xml.etree.ElementTree as etree;
 
 import pgf;
@@ -8,392 +20,402 @@ import gf_utils;
 
 # http://snipplr.com/view/25657/indent-xml-using-elementtree/
 def indentXMLNodes(elem, level=0):
     i = "\n" + level*"  "
     if len(elem):
         if not elem.text or not elem.text.strip():
             elem.text = i + "  "
         if not elem.tail or not elem.tail.strip():
             elem.tail = i
         for elem in elem:
             indentXMLNodes(elem, level+1)
         if not elem.tail or not elem.tail.strip():
             elem.tail = i
     else:
         if level and (not elem.tail or not elem.tail.strip()):
             elem.tail = i
 
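indentXMLNodes mutates the text/tail attributes in place so that a later tostring produces indented output. A small self-contained demonstration:

    import xml.etree.ElementTree as etree
    from translation_pipeline import indentXMLNodes

    root = etree.Element('tstset')
    doc = etree.SubElement(root, 'doc')
    etree.SubElement(doc, 'seg').text = 'EMPTY'
    indentXMLNodes(root)
    print(etree.tostring(root).decode('utf-8'))   # nested tags now appear on indented lines
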
 def readTranslationPipelineOptions(propsfile, default_namespace):
     with codecs.open(propsfile, 'r', 'utf-8') as infile:
         for line in infile:
             if not line.strip():
                 continue;
             key, value = line.strip().split('=', 1);
             key, value = key.strip(), value.strip();
-            if key == 'srclang': default_namespace.srclang = value;
-            elif key == 'tgtlangs': default_namespace.tgtlangs = [val.strip() for val in ','.split(value)];
-            elif key == 'input': default_namespace.input = value;
-            elif key == 'format': default_namespace.format = value;
-            elif key == 'exp_directory': default_namespace.exp_directory = value;
-            else:
-                #print >>sys.stderr, "Unknown option-%s found in props file. Ignoring and proceeding." %(key);
-                logging.warning("Unknown option-%s found in props file. Ignoring and proceeding." %(key));
-            continue;
+            if key == 'srclang':
+                default_namespace.srclang = value;
+            elif key == 'tgtlangs':
+                default_namespace.tgtlangs = [val.strip() for val in ','.split(value)];
+            elif key == 'input':
+                default_namespace.input = value;
+            elif key == 'format':
+                default_namespace.format = value;
+            elif key == 'exp_directory':
+                default_namespace.exp_directory = value;
+            else:
+                logging.warning("Unknown option-%s found in props file. Ignoring and proceeding." %(key));
+            continue;
     return default_namespace;
 
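One caveat carried over unchanged by the commit: `','.split(value)` always returns `[',']`, so the tgtlangs branch presumably intends `value.split(',')`. A minimal stand-alone sketch of the same key=value format with that reading:

    def read_props(text):
        props = {}
        for line in text.splitlines():
            if not line.strip():
                continue
            key, value = line.split('=', 1)
            props[key.strip()] = value.strip()
        if 'tgtlangs' in props:
            props['tgtlangs'] = [v.strip() for v in props['tgtlangs'].split(',')]
        return props

    print(read_props("srclang = Eng\ntgtlangs = Ger, Fre\nformat = txt"))
    # {'srclang': 'Eng', 'tgtlangs': ['Ger', 'Fre'], 'format': 'txt'}
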
 def sgmReader(sgmDoc):
     root = sgmDoc.getroot();
     for element in root.iter():
         if element.text is not None and element.text.strip():
             yield element.text.strip().encode('utf-8');
 
 def addToSgm(sgmDoc, strItem):
     for node in sgmDoc.findall('.//seg'):
         if not node.text.strip():
             strItem = strItem.decode('utf-8');
             node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
             return;
     logging.error("No more nodes available for adding content");
     return;
 
 def sgmWriter(sgmDoc):
     indentXMLNodes( sgmDoc.getroot() );
     return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml');
 
 def getXMLSkeleton(sgmDoc, tgtlang):
     skeletonDoc = copy.deepcopy(sgmDoc);
     root = skeletonDoc.getroot();
     root.tag = 'tstset';
     root.attrib['trlang'] = tgtlang[-3:];
     root.find('doc').attrib['sysid'] = tgtlang[:-3];
     for node in root.findall('.//seg'):
         node.text = '';
     return skeletonDoc;
 
 def pipeline_lexer(sentence):
     tokens = sentence.strip().split();
     #tokens = filter(None, re.split('(\W+)', sentence.strip()));
     n = len(tokens);
     idx = len(tokens)-1;
     while idx >= 0:
         if tokens[idx] in ".?!)":
             idx -= 1;
         else:
             break;
     tokens = tokens[:idx+1];
     idx = 0;
     while idx < len(tokens):
         if tokens[idx] in "'\"(":
             idx += 1;
         else:
             break;
     tokens = tokens[idx:];
     return ' '.join(tokens);
 
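pipeline_lexer trims sentence-final punctuation tokens and sentence-initial quotes/brackets, assuming the input is already whitespace-tokenized:

    from translation_pipeline import pipeline_lexer

    print(pipeline_lexer('( " they left early . )'))   # -> 'they left early'
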
-def web_lexer(grammar, lang, sentences):
-    for instance in sentences:
-        tokensList = re.split('\s+?', instance.strip());
-        for idx, token in enumerate(tokensList):
-            if not token[0].isupper():
-                continue;
-            lowertoken = tokensList[idx].lower();
-            count = 0;
-            for analysis in grammar.languages[lang].lookupMorpho(lowertoken):
-                count += 1;
-            tokensList[idx] = lowertoken if count else token;
-        for idx, token in enumerate(tokensList):
-            if token.find('-') == -1:
-                continue;
-            count = 0;
-            for analysis in grammar.languages[lang].lookupMorpho(token):
-                count += 1;
-            if count:
-                continue;
-            token = tokensList[idx].replace('-', '');
-            for analysis in grammar.languages[lang].lookupMorpho(token):
-                count += 1;
-            if count:
-                tokensList[idx] = token;
-                continue;
-            token = tokensList[idx].replace('-', ' ');
-        yield ' '.join(tokensList);
-
 def clean_gfstrings(sentence):
     absFuncName = re.compile('\[[^]]+?\]');
     untranslatedEntries = {};
     for entry in re.findall(absFuncName, sentence):
         untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1;
     for entry in untranslatedEntries:
         while untranslatedEntries[entry] > 1:
             sentence = sentence.replace(entry, '', 1);
             untranslatedEntries[entry] -= 1;
-        sentence = sentence.replace(entry, ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 else '');
+        sentence = sentence.replace(entry, \
+                ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 \
+                else '');
     return ' '.join( sentence.split() );
 
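clean_gfstrings deduplicates bracketed abstract-function residues such as [apple_N] left by failed linearizations, keeping the lemma when the entry has an underscore-separated suffix and dropping it otherwise (input string below is made up):

    from translation_pipeline import clean_gfstrings

    print(clean_gfstrings('the [apple_N] [apple_N] is [foo]'))   # -> 'the apple is'
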
 def parseNames(grammar, language, sentence):
     def callback(lin_idx, start):
         moving_start, end, eot = start, len(sentence), True;
         if moving_start < end and (not sentence[moving_start].isupper()):
             return None;
         while moving_start < end:
             if sentence[moving_start] in string.whitespace:
                 eot = True;
             elif eot and sentence[moving_start].isupper():
                 eot = False;
             elif eot and (not sentence[moving_start].isupper()):
                 end = moving_start-1;
                 break;
             moving_start += 1;
         possible_name = sentence[start:end].strip();
         if possible_name:
-            if language.endswith('Eng') and (possible_name == "I" or possible_name == "I'm"):
+            if language.endswith('Eng') and \
+                    (possible_name == "I" or possible_name == "I'm"):
                 return None;
             elif language.endswith('Eng') and possible_name.endswith("'s"):
                 end_idx = possible_name.rfind("'s");
                 if end_idx != -1:
                     possible_name = possible_name[:end_idx].strip();
                     end -= 2;
                     if not possible_name:
                         return None;
             expr, prob = None, None;
             for analysis in grammar.languages[language].lookupMorpho(possible_name):
                 category = grammar.functionType(analysis[0]).cat;
                 if prob < analysis[-1]:
                     if category == "PN":
                         expr, prob = pgf.Expr(analysis[0], []), analysis[-1];
                     elif category == "Weekday":
-                        expr, prob = pgf.Expr("weekdayPN", [pgf.Expr(analysis[0], [])]), analysis[-1];
+                        expr, prob = pgf.Expr("weekdayPN", \
+                                [pgf.Expr(analysis[0], [])]), analysis[-1];
                     elif category == "Month":
-                        expr, prob = pgf.Expr("monthPN", [pgf.Expr(analysis[0], [])]), analysis[-1];
+                        expr, prob = pgf.Expr("monthPN", \
+                                [pgf.Expr(analysis[0], [])]), analysis[-1];
                     elif category == "Language":
                         return None;
             # generic named entity
             if expr == None:
                 expr = pgf.Expr(possible_name);
                 expr = pgf.Expr("MkSymb", [expr]);
                 expr = pgf.Expr("SymbPN", [expr]);
             return (expr, 0, end);
         return None;
     return callback;
 
 def parseUnknown(grammar, language, sentence):
     def callback(lin_idx, start):
         moving_start, end, eot = start, len(sentence), True;
-        isNewToken = (moving_start == 0) or (moving_start > 1 and sentence[moving_start-1].isspace()) # -- added to deal with segmentation errors like may => ma_N + Symb y
+        # -- added to deal with segmentation errors like may => ma_N + Symb y
+        isNewToken = (moving_start == 0) or \
+                (moving_start > 1 and sentence[moving_start-1].isspace())
         if moving_start < end and (not sentence[moving_start].isupper()):
             while moving_start < end:
                 if sentence[moving_start] in string.whitespace:
                     end = moving_start;
                     break;
                 moving_start += 1;
             unknown_word = sentence[start:end].strip();
             if unknown_word and isNewToken:
                 count = 0;
                 for analysis in grammar.languages[language].lookupMorpho(unknown_word):
                     count += 1;
                 if not count:
                     expr = pgf.Expr("MkSymb", [pgf.Expr(unknown_word)]);
                     return (expr, 0, end);
         return None;
     return callback;
 
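Both parseNames and parseUnknown are literal-callback factories: each closes over one concrete sentence and is registered per category, which is why getKBestParses in gf_utils.py rebuilds them for every sentence. A hedged wiring sketch, mirroring that call site (grammar and language names are placeholders):

    import pgf
    import translation_pipeline

    grammar = pgf.readPGF('App.pgf')                 # hypothetical grammar
    sentence = 'John bought a zyzzyva'
    callbacks = [('PN',   translation_pipeline.parseNames(grammar, 'AppEng', sentence)),
                 ('Symb', translation_pipeline.parseUnknown(grammar, 'AppEng', sentence))]
    for prob, tree in grammar.languages['AppEng'].parse(sentence, heuristics=0, callbacks=callbacks):
        print(prob, tree)
        break
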
-def parseTester(grammar, language):
-    def callback(lin_idx, sentence, start):
+def parseTester(grammar, language, sentence):
+    def callback(lin_idx, start):
         if start < len(sentence):
             return (pgf.Expr(sentence[start]), 0, start+1);
         return None;
     return callback;
 
 def translateWordsAsChunks(grammar, language, tgtlanguages, word):
     parser = grammar.languages[language].parse;
-    linearizersList = dict((lang, grammar.languages[lang].linearize) for lang in tgtlanguages);
+    linearizersList = dict((lang, grammar.languages[lang].linearize) \
+            for lang in tgtlanguages);
     translations = [];
     try:
         for parseidx, parse in enumerate( parser(word) ):
             for lang in tgtlanguages:
                 trans = linearizersList[lang](parse[1]);
-                translations.append(( lang, gf_utils.postprocessor(trans.strip() if trans else '') ) );
+                translations.append((lang, gf_utils.postprocessor(\
+                        trans.strip() if trans else '')));
             break;
-    except pgf.ParseError, err:
+    except pgf.ParseError as err:
         return [];
     return translations;
 
 def translateWord(grammar, language, tgtlanguages, word):
-    possible_translations = translateWordsAsChunks(grammar, language, tgtlanguages, word);
+    possible_translations = translateWordsAsChunks(grammar, language, \
+            tgtlanguages, word);
     if len(possible_translations):
         return possible_translations;
     lowerword = word.lower();
     try:
         partialExprList = grammar.languages[language].parse(word, cat='Chunk');
         for expr in partialExprList:
-            return [(lang, gf_utils.gf_postprocessor( grammar.languages[lang].linearize(expr[1]) )) for lang in tgtlanguages];
+            return [(lang, gf_utils.gf_postprocessor(\
+                    grammar.languages[lang].linearize(expr[1]))) \
+                    for lang in tgtlanguages];
     except pgf.ParseError:
-        morphAnalysis = grammar.languages[language].lookupMorpho(word) + grammar.languages[language].lookupMorpho(lowerword);
+        morphAnalysis = grammar.languages[language].lookupMorpho(word) +\
+                grammar.languages[language].lookupMorpho(lowerword);
         for morph in morphAnalysis:
-            countPositiveLanguages = filter(None, [grammar.languages[lang].hasLinearization(morph[0]) for lang in tgtlanguages]);
+            countPositiveLanguages = list(filter(None, \
+                    [grammar.languages[lang].hasLinearization(morph[0]) \
+                    for lang in tgtlanguages]));
             if len(countPositiveLanguages) > 0.5*len(tgtlanguages):
-                return [(lang, gf_utils.gf_postprocessor( grammar.languages[lang].linearize( pgf.readExpr(morph[0]) ) )) for lang in tgtlanguages];
+                return [(lang, \
+                        gf_utils.gf_postprocessor(grammar.languages[lang].linearize(pgf.readExpr(morph[0])))) \
+                        for lang in tgtlanguages];
     return [(lang, word) for lang in tgtlanguages];
 
 def translationByLookup(grammar, language, tgtlanguages, sentence):
     parser = grammar.languages[language].parse;
-    linearizersList = dict([(lang, grammar.languages[lang].linearize) for lang in tgtlanguages]);
+    linearizersList = dict([(lang, grammar.languages[lang].linearize) \
+            for lang in tgtlanguages]);
     queue = [sentence.strip().split()];
     transChunks = {};
     while len(queue):
         head = queue[0];
         if not len(head):
             pass;
         elif len(head) == 1 and head[0].strip():
-            for lang, wordchoice in translateWord(grammar, language, tgtlanguages, head[0]):
-                transChunks.setdefault(lang, []).append( gf_utils.postprocessor(wordchoice) );
+            for lang, wordchoice in translateWord(grammar, language, \
+                    tgtlanguages, head[0]):
+                transChunks.setdefault(lang, []).append(\
+                        gf_utils.postprocessor(wordchoice));
         else:
             try:
-                for parseidx, parse in enumerate( parser(' '.join(head)) ):
+                for parseidx, parse in enumerate(parser(' '.join(head))):
                     for lang in tgtlanguages:
                         if linearizersList[lang](parse[1]) == None:
-                            transChunks.setdefault(lang, []).append( ' ' );
+                            transChunks.setdefault(lang, []).append(' ');
                         else:
-                            transChunks.setdefault(lang, []).append( gf_utils.postprocessor( linearizersList[lang](parse[1]).strip() ) );
+                            transChunks.setdefault(lang, []).append(\
+                                    gf_utils.postprocessor(linearizersList[lang](parse[1]).strip()));
                     break;
-            except pgf.ParseError, err:
+            except pgf.ParseError as err:
                 #unseenToken = re.findall('"[^"]+?"', err.message)[0][1:-1];
                 unseenToken = err.message.strip().split()[-1][1:-1];
                 idx = head.index(unseenToken);
                 queue.insert(1, head[:idx] );
                 queue.insert(2, [head[idx]] );
                 queue.insert(3, head[idx+1:] );
         del queue[0];
     for lang in tgtlanguages:
         yield (lang, ' '.join(transChunks[lang]));
 
 def pipelineParsing(grammar, language, sentences, K=20):
     #buf = [sent for sent in sentences];
     buf, sentences = itertools.tee(sentences, 2);
-    sentences = itertools.imap(gf_utils.lexer(lang=language), sentences);
     parser = gf_utils.getKBestParses(grammar, language, K);
-    for sent, (time, parsesBlock) in itertools.izip(buf, itertools.imap(parser, sentences)):
+    for sent, (time, parsesBlock) in zip(buf, map(parser, sentences)):
         yield (sent, parsesBlock);
 
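pipelineParsing tees the sentence stream so it can pair each raw sentence with its K-best parse block. A sketch of its use (placeholder grammar and language names again):

    import pgf
    import translation_pipeline

    grammar = pgf.readPGF('App.pgf')                 # hypothetical grammar
    sents = iter(['this is fine', 'so is this'])
    for sent, parses in translation_pipeline.pipelineParsing(grammar, 'AppEng', sents, K=5):
        print(sent, '->', len(parses), 'parses')
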
 def translation_pipeline(props):
     if props.propsfile:
         props = readTranslationPipelineOptions(props.propsfile, props);
 
     # UGLY HACK FOR K-best translation: if K-best translation output format is only txt
     if props.bestK != 1:
         props.format = 'txt';
 
     if not os.path.isdir( props.exp_directory ):
         logging.info("Creating output directory: %s" %(props.exp_directory));
         os.makedirs(props.exp_directory);
 
     if not props.srclang:
         logging.critical("Mandatory option source-lang missing. Can not determine source language.");
         sys.exit(1);
 
     grammar = pgf.readPGF(props.pgffile);
 
-    sourceLanguage = filter(None, [lang if lang[-3:] == props.srclang else '' for lang in grammar.languages.keys()])[0];
+    sourceLanguage = filter(None, [lang if lang[-3:] == props.srclang else '' for lang in grammar.languages.keys()]);
+    sourceLanguage = list(sourceLanguage)[0];
     logging.info("Translating from %s" %(sourceLanguage));
 
     if len(props.tgtlangs):
         target_langs = props.tgtlangs;
     else:
-        target_langs = filter(None, [lang[-3:] if lang != sourceLanguage else '' for lang in grammar.languages.keys()]);
-    targetLanguages = filter(None, [lang if lang[-3:] in target_langs else '' for lang in grammar.languages.keys()]);
+        target_langs = filter(None, [lang[-3:] if lang != sourceLanguage \
+                else '' for lang in grammar.languages.keys()]);
+    targetLanguages = filter(None, [lang if lang[-3:] in target_langs \
+            else '' for lang in grammar.languages.keys()]);
+    targetLanguages = list(targetLanguages);
     logging.info("Translating into the following languages: %s" %(','.join(targetLanguages)));
 
     K = props.bestK if props.bestK != 1 else 20; # by default we look for 20 best parses
     bestK = props.bestK;
 
     if not props.input:
         logging.info( "Input file name missing. Reading input from stdin." );
         inputStream = sys.stdin;
         outputPrefix = os.getpid();
     else:
         inputStream = codecs.open(props.input, 'r');
         outputPrefix = os.path.splitext( os.path.split(props.input)[1] )[0];
 
     if props.format == 'sgm':
         inputDoc = etree.parse(inputStream);
         reader = sgmReader;
         skeletonDoc = getXMLSkeleton;
         addItem = addToSgm;
         writer = sgmWriter;
     elif props.format == 'txt':
         logging.info("Input format is txt. Assuming one-sentence-per-line format.");
         inputDoc = inputStream;
         reader = lambda X: X;
         skeletonDoc = lambda X, lang: list();
         addItem = lambda X, y: list.append(X, y);
-        writer = lambda X: ('\n'.join(X) if bestK == 1 else '\n'.join(map(gf_utils.printMosesNbestFormat, X)));
+        writer = lambda X: ('\n'.join(X) if bestK == 1 else \
+                '\n'.join(map(gf_utils.printMosesNbestFormat, X)));
 
     translationBlocks = {};
     for tgtlang in targetLanguages+['abstract']:
         translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang);
 
     preprocessor = pipeline_lexer;
     postprocessor = clean_gfstrings;
 
     logging.info( "Parsing text in %s" %(sourceLanguage) );
     # 1. Get Abstract Trees for sentences in source language.
-    tokenized_sentences = itertools.imap(preprocessor, reader(inputDoc));
-    absParses = [parsesBlock for parsesBlock in pipelineParsing(grammar, sourceLanguage, web_lexer(grammar, sourceLanguage, tokenized_sentences), K)];
+    tokenized_sentences = map(preprocessor, reader(inputDoc));
+    web_lexer = gf_utils.Lexer('Web', grammar, sourceLanguage).tokenize;
+    absParses = [parsesBlock for parsesBlock in \
+            pipelineParsing(grammar, sourceLanguage, \
+            map(web_lexer, tokenized_sentences), K)];
 
     logging.info( "Linearizing into %s" %(','.join(targetLanguages)) );
     # 2. Linearize in all target Languages
-    for idx, parsesBlock in enumerate( itertools.imap(operator.itemgetter(1), absParses) ):
+    for idx, parsesBlock in enumerate( map(operator.itemgetter(1), absParses) ):
         translationBuffer = {};
         if not len(parsesBlock):
             # failed to parse;
             # translate using lookup
-            for tgtlang, translation in translationByLookup(grammar, sourceLanguage, targetLanguages, absParses[idx][0]):
+            for tgtlang, translation in translationByLookup(grammar, sourceLanguage,\
+                    targetLanguages, absParses[idx][0]):
                 if bestK == 1:
                     addItem(translationBlocks[tgtlang], postprocessor(translation));
                 else:
                     addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))]);
             addItem(translationBlocks['abstract'], '');
         else:
             bestTranslationIdx = 0;
             for tgtlang in targetLanguages:
-                translationBuffer[tgtlang] = gf_utils.getKLinearizations(grammar, tgtlang, [parsesBlock], K=bestK).next();
+                translationBuffer[tgtlang] = next(gf_utils.getKLinearizations(grammar, \
+                        tgtlang, [parsesBlock], K=bestK));
                 if bestK == 1:
                     for tidx, translation in enumerate(translationBuffer[tgtlang]):
                         if postprocessor(translation[1]).strip():
                             if tidx > bestTranslationIdx:
                                 bestTranslationIdx = tidx;
                             break;
             for tgtlang in targetLanguages:
                 if bestK == 1:
-                    translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) if len(translationBuffer[tgtlang]) > bestTranslationIdx else ((None,), '');
+                    translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) \
+                            if len(translationBuffer[tgtlang]) > bestTranslationIdx \
+                            else ((None,), '');
                     abstract = str(parsesBlock[bestTranslationIdx][1]);
                 else:
-                    translation = translationBuffer[tgtlang] if len(translationBuffer[tgtlang]) else [];
+                    translation = translationBuffer[tgtlang] \
+                            if len(translationBuffer[tgtlang]) \
+                            else [];
                     abstract = parsesBlock;
                 addItem(translationBlocks[tgtlang], translation);
                 addItem(translationBlocks['abstract'], abstract);
 
     for tgtlang in targetLanguages+['abstract']:
-        outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, tgtlang[-3:] if tgtlang!='abstract' else 'abstract', props.format) );
+        outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, tgtlang[-3:] \
+                if tgtlang!='abstract' \
+                else 'abstract', props.format) );
         logging.info( "Writing translations for %s to %s" %(tgtlang, outputFile) );
-        with codecs.open(outputFile, 'w') as outputStream:
-            print >>outputStream, writer(translationBlocks[tgtlang]);
+        with codecs.open(outputFile, 'w', encoding='utf-8') as outputStream:
+            print(writer(translationBlocks[tgtlang]), file=outputStream);
     return;
 
 def cmdLineParser():
     argparser = argparse.ArgumentParser(prog='translation_pipeline.py', description='Run the GF translation pipeline on standard test-sets');
     argparser.add_argument('-g', '--pgf', dest='pgffile', required=True, help='PGF grammar file to run the pipeline');
     argparser.add_argument('-s', '--source', dest='srclang', default='', help='Source language of input sentences');
     argparser.add_argument('-t', '--target', dest='tgtlangs', nargs='*', default=[], help='Target languages to linearize (default is all other languages)');
     argparser.add_argument('-i', '--input', dest='input', default='', help='input file (default will accept STDIN)');
     argparser.add_argument('-e', '--exp', dest='exp_directory', default=os.getcwd(), help='experiment directory to write translation files');
     argparser.add_argument('-f', '--format', dest='format', default='txt', choices=['txt', 'sgm'], help='input file format (output files will be written in the same format)');
     argparser.add_argument('-p', '--props', dest='propsfile', default='', help='properties file for the translation pipeline (specify the above arguments in a file)');
     argparser.add_argument('-K', dest='bestK', type=int, default=1, help='K value for K-best translation');
     return argparser;
 
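Typical invocations of the pipeline driver (grammar name and paths are placeholders); a props file can supply the same options as the individual flags:

    python translation_pipeline.py -g App.pgf -s Eng -t Ger Fre -i test.txt -f txt -e /tmp/exp
    python translation_pipeline.py -g App.pgf -p pipeline.props
    python translation_pipeline.py -g App.pgf -s Eng -K 10 -i test.txt   # K-best, txt output only
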
 if __name__ == '__main__':
     logging.basicConfig(level='INFO');
     pipelineEnv = cmdLineParser().parse_args(sys.argv[1:]);
     translation_pipeline(pipelineEnv);