From ef33f1ab35443af8b182afc3524e76c8a50136a8 Mon Sep 17 00:00:00 2001 From: "prasanth.kolachina" Date: Mon, 19 Sep 2016 08:32:08 +0000 Subject: [PATCH] python examples compatible with both Python 2 and 3 --- src/runtime/python/examples/gf_utils.py | 576 ++++++++------ .../python/examples/translation_pipeline.py | 722 +++++++++--------- 2 files changed, 703 insertions(+), 595 deletions(-) diff --git a/src/runtime/python/examples/gf_utils.py b/src/runtime/python/examples/gf_utils.py index bb637cf04..2be326e0e 100644 --- a/src/runtime/python/examples/gf_utils.py +++ b/src/runtime/python/examples/gf_utils.py @@ -1,283 +1,369 @@ #!/usr/bin/env python +# Python 2 and 3 compatible +from __future__ import print_function + """ """ -import argparse, re, string, sys, time; -from itertools import imap, count; +import argparse, codecs, re, string, sys, time; +try: + from itertools import imap as map; + from itertools import count; +except ImportError: + from itertools import count; + pass; from operator import itemgetter; import pgf; -def lexerI(sentence): - return sentence.rstrip(string.whitespace+string.punctuation); +class Lexer(object): + def __init__(self, lang='None', grammar=None, gflang=None): + import translation_pipeline; + lexers = {'None': self.lexerI, \ + 'Eng': self.lexerI, \ + 'Chi': self.lexerChi, \ + 'Translator': translation_pipeline.pipeline_lexer, \ + 'Web': self.lexerWeb + }; + if grammar: + self._pgf = grammar; + self._lang = gflang; -def lexerChi(sentence): - sentence = sentence.decode('utf-8'); + self.tokenize = lexers[lang]; + return; + + def lexerI(self, sentence): + #return sentence.decode('utf-8').rstrip(string.whitespace+string.punctuation).encode('utf-8'); + return sentence.rstrip(string.whitespace+string.punctuation); + + def lexerChi(self, sentence): + #sentence = sentence.decode('utf-8'); tokens, idx, n = [], 0, len(sentence); prev = True; while idx < n: - if sentence[idx] in string.whitespace: - prev = True; - idx += 1; - continue; - if 0 < ord(sentence[idx]) < 128: - if sentence[idx] in string.punctuation: - prev = True; - if prev: - tokens.append( sentence[idx] ); - prev = False; - else: - tokens[-1] = tokens[-1]+sentence[idx]; - else: - prev = True; - tokens.append( sentence[idx] ); - idx += 1; - return ' '.join(tokens).encode('utf-8'); + if sentence[idx] in string.whitespace: + prev = True; + idx += 1; + continue; + if 0 < ord(sentence[idx]) < 128: + if sentence[idx] in string.punctuation: + prev = True; + if prev: + tokens.append( sentence[idx] ); + prev = False; + else: + tokens[-1] = tokens[-1]+sentence[idx]; + else: + prev = True; + tokens.append( sentence[idx] ); + idx += 1; + return ' '.join(tokens);#.encode('utf-8'); -def lexer(lang='translator'): - if lang[-3:] == 'Eng': - return lexerI; - elif lang[-3:] == 'Chi': - return lexerChi; - elif lang == 'translator': - import translation_pipeline; - return translation_pipeline.pipeline_lexer; - else: - return lexerI; + def lexerWeb(self, sentence): + tokensList = re.split('\s+?', sentence.strip()); + for idx, token in enumerate(tokensList): + if not token[0].isupper(): + continue; + lowertoken = tokensList[idx].lower(); + count = 0; + for analysis in self._pgf.languages[self._lang].lookupMorpho(lowertoken): + count += 1; + tokensList[idx] = lowertoken if count else token; + for idx, token in enumerate(tokensList): + if token.find('-') == -1: + continue; + count = 0; + for analysis in self._pgf.languages[self._lang].lookupMorpho(token): + count += 1; + if count: + continue; + token = tokensList[idx].replace('-', 
'');
+                for analysis in self._pgf.languages[self._lang].lookupMorpho(token):
+                    count += 1;
+                if count:
+                    tokensList[idx] = token;
+                    continue;
+                token = tokensList[idx].replace('-', ' ');
+        return ' '.join(tokensList);
 
 def postprocessor(sentence):
-    if sentence == None:
-        return '';
-    if sentence.startswith('* ') or sentence.startswith('% '):
-        sentence = sentence[2:];
-    sentence = sentence.replace(' &+ ', '');
-    sentence = sentence.replace('<+>', ' ');
-    return sentence;
+    if sentence is None:
+        return '';
+    if sentence.startswith('* ') or sentence.startswith('% '):
+        sentence = sentence[2:];
+    sentence = sentence.replace(' &+ ', '');
+    sentence = sentence.replace('<+>', ' ');
+    return sentence;
 
 def readJohnsonRerankerTrees(inputStream):
-    endOfParse = False;
-    while True:
-        sentheader = inputStream.next();
-        if sentheader == '':
-            break;
-        parsescount, sentidx = map(int, sentheader.strip().split());
-        parsesBlock = [];
-        for i in xrange(parsescount):
-            parseprob = inputStream.next();
-            if parseprob.strip() == '':
-                endOfParse = True;
-                break;
-            parse = inputStream.next();
-            parsesBlock.append( (float(parseprob.strip()), pgf.readExpr(parse.strip())) );
-        yield sentidx, parsesBlock;
-        if not endOfParse:
-            _ = inputStream.next();
-        endOfParse = False;
+    endOfParse = False;
+    while True:
+        # next(stream, '') works on both Python 2 and 3, and returns ''
+        # at end of input instead of raising StopIteration;
+        sentheader = next(inputStream, '');
+        if sentheader == '':
+            break;
+        parsescount, sentidx = map(int, sentheader.strip().split());
+        parsesBlock = [];
+        for i in range(parsescount):
+            parseprob = next(inputStream, '');
+            if parseprob.strip() == '':
+                endOfParse = True;
+                break;
+            parse = next(inputStream, '');
+            parsesBlock.append((float(parseprob.strip()), pgf.readExpr(parse.strip())));
+        yield sentidx, parsesBlock;
+        if not endOfParse:
+            _ = next(inputStream, '');
+        endOfParse = False;
 
 def readMosesNbestFormat(inputStream):
-    transBlock = [];
-    currentHypothesisId = 0;
-    while True:
-        line = inputStream.next();
-        if line == '':
-            break;
-        fields = line.strip().split('|||');
-        if str(fields[0].strip()) != str(currentHypothesisId):
-            yield currentHypothesisId, transBlock;
-            transBlock = [];
-            currentHypothesisId = int(fields[0]);
-        transBlock.append( (map(float, tuple([val.strip() for val in fields[3].split()])), fields[1].strip()) );
+    transBlock = [];
+    currentHypothesisId = 0;
+    while True:
+        line = next(inputStream, '');
+        if line == '':
+            break;
+        fields = line.strip().split('|||');
+        if str(fields[0].strip()) != str(currentHypothesisId):
+            yield currentHypothesisId, transBlock;
+            transBlock = [];
+            currentHypothesisId = int(fields[0]);
+        transBlock.append( (tuple(map(float, \
+                [val.strip() for val in fields[3].split()])), \
+                fields[1].strip()) );
+    if transBlock:
+        # emit the hypotheses of the last sentence as well;
+        yield currentHypothesisId, transBlock;
 
-def printJohnsonRerankerFormat(gfparsesList, sentid=count(1)):
-    johnsonRepr = [];
-    parseHash = {};
-    for parse in sorted(gfparsesList, key=itemgetter(0)):
-        if not parseHash.has_key(parse[1]):
-            johnsonRepr.append( str(-1*parse[0]) );
-            johnsonRepr.append( str(parse[1]) );
-        parseHash.setdefault(parse[1], []).append(parse[0]);
-    curid = sentid.next();
-    if len(gfparsesList):
-        johnsonRepr.insert(0, '%d %d' %(len(parseHash.values()), curid));
-    duplicateInstances = len(filter(lambda X: len(parseHash[X]) > 1, parseHash.keys()));
-    #if duplicateInstances: print >>sys.stderr, "%d duplicate parses found in K-best parsing" %(duplicateInstances);
-    return '\n'.join(johnsonRepr)+'\n';
+def printJohnsonRerankerFormat(gfparsesList, sentids=count(1)):
+    johnsonRepr = [];
+    parseHash = {};
+    for parse in sorted(gfparsesList, key=itemgetter(0)):
+        if parse[1] not in parseHash:
+            johnsonRepr.append( str(-1*parse[0]) );
+            johnsonRepr.append( str(parse[1]) );
+        parseHash.setdefault(parse[1], []).append(parse[0]);
+    curid = next(sentids);
+    if len(gfparsesList):
+        johnsonRepr.insert(0, '%d %d' %(len(parseHash), curid));
+    return '\n'.join(johnsonRepr)+'\n';
 
-def printMosesNbestFormat(hypothesisList, sentid=count(1)):
-    mosesRepr = [];
-    sid = sentid.next();
-    for hypScores, hypStr in hypothesisList:
-        if not hasattr(hypScores, '__iter__'):
-            hypScores = (hypScores, );
-        mosesRepr.append("%d ||| %s ||| NULL ||| %s" %(sid, hypStr, ' '.join(['%.6f'%score for score in hypScores])));
-    return '\n'.join(mosesRepr);
+def printMosesNbestFormat(hypothesisList, sentids=count(1)):
+    mosesRepr = [];
+    sid = next(sentids);
+    for hypScores, hypStr in hypothesisList:
+        if not hasattr(hypScores, '__iter__'):
+            hypScores = (hypScores, );
+        mosesRepr.append("%d ||| %s ||| NULL ||| %s" \
+                %(sid, hypStr, ' '.join('%.6f'%score for score in hypScores)) );
+    return '\n'.join(mosesRepr);
 
 def getKLinearizations(grammar, tgtlanguage, abstractParsesList, K=10):
-    generator = grammar.languages[tgtlanguage].linearizeAll;
-    for parsesBlock in abstractParsesList:
-        kBestTrans = [];
-        for parseprob, parse in parsesBlock:
-            for linstring in generator(parse, n=K):
-                kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
-        yield kBestTrans;
+    generator = grammar.languages[tgtlanguage].linearizeAll;
+    for parsesBlock in abstractParsesList:
+        kBestTrans = [];
+        for parseprob, parse in parsesBlock:
+            for linstring in generator(parse, n=K):
+                kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
+        yield kBestTrans;
 
-def getKBestParses(grammar, language, K, serializable=False, sentid=count(1), max_length=50):
-    parser = grammar.languages[language].parse;
-    import translation_pipeline
-    def worker(sentence):
-        sentence = sentence.strip();
-        curid = sentid.next();
-        tstart = time.time();
-        kBestParses = [];
-        parseScores = {};
-        if len(sentence.split()) > max_length:
-            tend, err = time.time(), "Sentence too long (%d tokens). Might potentially run out of memory" %(len(sentence.split()));
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
-            return tend-tstart, kBestParses; # temporary hack to make sure parser does not get killed for very long sentences;
-        try:
-            callbacks = [('PN', translation_pipeline.parseNames(grammar, args.srclang, sentence)), ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang, sentence))]
-            for parseidx, parse in enumerate( parser(sentence, heuristics=0, callbacks=callbacks) ):
-                parseScores[parse[0]] = True;
-                kBestParses.append( (parse[0], str(parse[1]) if serializable else parse[1]) );
-                if parseidx == K-1: break;
-                #if len(parseScores) >= K: break;
-            tend = time.time();
-            print >>sys.stderr, '%d\t%.4f' %(curid, tend-tstart);
-            return tend-tstart, kBestParses;
-        except pgf.ParseError, err:
-            tend = time.time();
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
-            return tend-tstart, kBestParses;
-        except UnicodeEncodeError, err:
-            tend = time.time();
-            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
-            return tend-tstart, kBestParses;
-    return worker;
+def getKBestParses(grammar, language, K, serializable=False, \
+        sentids=count(1), max_length=50):
+    parser = grammar.languages[language].parse;
+    import translation_pipeline;
+    callbacks_PN = translation_pipeline.parseNames;
+    callbacks_Symb = translation_pipeline.parseUnknown;
+    def worker(sentence):
+        sentence = sentence.strip();
+        curid = next(sentids);
+        tstart = time.time();
+        kBestParses = [];
+        parseScores = {};
+        if len(sentence.split()) > max_length:
+            # temporary hack to make sure the parser does not get
+            # killed for very long sentences;
+            tend, err = time.time(), \
+                "Sentence too long (%d tokens), might run out of memory" \
+                %(len(sentence.split()));
+            print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr);
+            return tend-tstart, kBestParses;
+
+        # with the modified API for callbacks, the callback closures have
+        # to be created afresh for each sentence; reusing them across
+        # sentences does not work.
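+        # each entry below pairs an abstract category name with a closure
+        # of the form callback(lin_idx, start) -> (expr, prob, end) or None;
+        # see parseNames/parseUnknown in translation_pipeline.py.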
+ try: + callbacks = [('PN', callbacks_PN(grammar, language, sentence)),\ + ('Symb', callbacks_Symb(grammar, language, sentence))]; + for parseidx, parse in enumerate(parser(sentence, \ + heuristics=0, callbacks=callbacks)): + parseScores[parse[0]] = True; + kBestParses.append((parse[0], str(parse[1]) if serializable \ + else parse[1])); + if parseidx == K-1: + break; + tend = time.time(); + print('%d\t%.4f' %(curid, tend-tstart), file=sys.stderr); + return tend-tstart, kBestParses; + except pgf.ParseError as err: + tend = time.time(); + print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr); + return tend-tstart, kBestParses; + except UnicodeEncodeError as err: + tend = time.time(); + print('%d\t%.4f\t%s' %(curid, tend-tstart, err), file=sys.stderr); + return tend-tstart, kBestParses; + return worker; def pgf_parse(args): - grammar = pgf.readPGF(args.pgfgrammar); - import translation_pipeline; - - preprocessor = lexer(); - inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) ); - outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1])); #operator.itemgetter(1); - parser = getKBestParses(grammar, args.srclang, 1); - - sentidx = 0; - for time, parsesBlock in imap(parser, inputSet): - sentidx += 1; - print >>args.outputstream, "%d\t%f\t%s" %(sentidx, time, str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''); - return; + grammar = pgf.readPGF(args.pgfgrammar); + preprocessor = Lexer().tokenize; + #if sys.version_info < (3, 0): + # args.inputstream = codecs.getreader('utf-8')(args.inputstream); + inputSet = map(preprocessor, args.inputstream); + web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize; + inputSet = map(web_preprocessor, inputSet); + outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1])); + parser = getKBestParses(grammar, args.srclang, 1); + + sentidx = 0; + for time, parsesBlock in map(parser, inputSet): + sentidx += 1; + print("%d\t%f\t%s" %(sentidx, time, \ + str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''), \ + file=args.outputstream); + return; def pgf_kparse(args): - grammar = pgf.readPGF(args.pgfgrammar); - import translation_pipeline; - - preprocessor = lexer(); - inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) ); - outputPrinter = printJohnsonRerankerFormat; - parser = getKBestParses(grammar, args.srclang, args.K); - - sentidx = 0; - for time, parsesBlock in imap(parser, inputSet): - sentidx += 1; - strParses = str(outputPrinter(parsesBlock)); - if not (strParses == '\n'): - print >>args.outputstream, strParses; - return; + grammar = pgf.readPGF(args.pgfgrammar); + preprocessor = Lexer().tokenize; + #if sys.version_info < (3, 0): + # args.inputstream = codecs.getreader('utf-8')(args.inputstream); + inputSet = map(preprocessor, args.inputstream); + web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize; + inputSet = map(web_preprocessor, inputSet); + outputPrinter = printJohnsonRerankerFormat; + parser = getKBestParses(grammar, args.srclang, args.K); + + sentidx = 0; + for time, parsesBlock in map(parser, inputSet): + sentidx += 1; + strParses = str(outputPrinter(parsesBlock)); + if not (strParses == '\n'): + print(strParses, file=args.outputstream); + return; def pgf_linearize(args): - grammar = pgf.readPGF(args.pgfgrammar); - outputPrinter = postprocessor; - inputSet = []; - for line in args.inputstream: - try: - sentid, parsetime, parserepr = line.strip('\n').split('\t', 2); - except ValueError: - print 
line.strip();
-        parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, '');
-        inputSet.append( (int(sentid), float(parsetime), float(parseprob), pgf.readExpr(abstree) if abstree else None) );
-    linearizer = grammar.languages[args.tgtlang].linearize;
-    for sentid, _, _, abstree in inputSet:
-        if abstree:
-            print >>args.outputstream, str(outputPrinter(linearizer(abstree)));
-        else:
-            print >>args.outputstream, "";
-    return;
+    grammar = pgf.readPGF(args.pgfgrammar);
+    def parse_line(line):
+        try:
+            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
+        except ValueError:
+            print("Line not in proper format: %s" %(line), file=sys.stderr);
+            # return an empty record so the linearization loop below
+            # prints an empty line instead of crashing;
+            return (0, 0.0, 0.0, None);
+        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
+                else (0, '');
+        return ((int(sentid), float(parsetime), float(parseprob), \
+                pgf.readExpr(abstree) if abstree else None));
+
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = map(parse_line, (line for line in args.inputstream));
+    outputPrinter = postprocessor;
+    linearizer = grammar.languages[args.tgtlang].linearize;
+    for sentid, _, _, abstree in inputSet:
+        if abstree:
+            print(str(outputPrinter(linearizer(abstree))), \
+                    file=args.outputstream);
+        else:
+            print("", file=args.outputstream);
+    return;
 
 def pgf_klinearize(args):
-    grammar = pgf.readPGF(args.pgfgrammar);
-    outputPrinter = printMosesNbestFormat;
-    inputSet = [(sentid, parsesBlock) for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
-    sentIdsList = imap(itemgetter(0), inputSet);
-    parsesBlocks = map(itemgetter(1), inputSet);
-
-    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
-        strTrans = str(outputPrinter(transBlock, sentIdsList));
-        if strTrans:
-            print >>args.outputstream, strTrans;
-    return;
+    grammar = pgf.readPGF(args.pgfgrammar);
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = [(sentid, parsesBlock) \
+            for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
+    outputPrinter = printMosesNbestFormat;
+    sentIdsList = map(itemgetter(0), inputSet);
+    parsesBlocks = map(itemgetter(1), inputSet);
+
+    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
+        strTrans = str(outputPrinter(transBlock, sentIdsList));
+        if strTrans:
+            print(strTrans, file=args.outputstream);
+    return;
 
 def cmdLineParser():
-    argparser = argparse.ArgumentParser(prog='gf_utils.py', description='Examples for carrying out (K-best) parsing, translation and linearization using GF C runtime.');
-
-    subparsers = argparser.add_subparsers();
-    parser = subparsers.add_parser('parse', help='GF parsing of sentences');
-    kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
-    linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax treess');
-    klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');
-
-    parser.set_defaults(func=pgf_parse);
-    parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
-                        help='PGF Grammar file');
-    parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
-                        help='Start symbol in the grammar');
-    parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
-                        help='Source language');
-    parser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
-                        help='Input file') ;
-    parser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
-                        help='Output file');
+    argparser = argparse.ArgumentParser(prog='gf_utils.py', \
+            description='Examples for carrying out (K-best) parsing, '
+                    'translation and linearization using GF C runtime.');
+
+    subparsers = argparser.add_subparsers();
+    parser = subparsers.add_parser('parse', help='GF parsing of sentences');
+    kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
+    linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax trees');
+    klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');
+
+    parser.set_defaults(func=pgf_parse);
+    parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+            help='PGF Grammar file');
+    parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
+            help='Start symbol in the grammar');
+    parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
+            help='Source language');
+    parser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    parser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
+
+    kparser.set_defaults(func=pgf_kparse);
+    kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+            help='PGF Grammar file');
+    kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
+            help='Start symbol in the grammar');
+    kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
+            help='Source language');
+    kparser.add_argument('-K', dest='K', required=True, \
+            type=int, \
+            help='K value for multiple parses');
+    kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
+
+    linearizer.set_defaults(func=pgf_linearize);
+    linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+            help='PGF Grammar file');
+    linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
+            help='Target language');
+    linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
+
+    klinearizer.set_defaults(func=pgf_klinearize);
+    klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+            help='PGF Grammar file');
+    klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
+            help='Target language');
+    klinearizer.add_argument('-K', '--kbest', dest='K', required=True, \
+            type=int, \
+            help='K value for multiple linearizations');
+    klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+            type=argparse.FileType(mode='r'), default=sys.stdin, \
+            help='Input file');
+    klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+            type=argparse.FileType(mode='w'), default=sys.stdout, \
+            help='Output file');
+
+    return argparser;
 
-    kparser.set_defaults(func=pgf_kparse);
-    
kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \ - help='PGF Grammar file'); - kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \ - help='Start symbol in the grammar'); - kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \ - help='Source language'); - kparser.add_argument('-K', dest='K', required=True, type=int, \ - help='K value for multiple parses'); - kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \ - help='Input file'); - kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \ - help='Output file'); - - linearizer.set_defaults(func=pgf_linearize); - linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \ - help='PGF Grammar file'); - linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \ - help='Target language'); - linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \ - help='Input file'); - linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \ - help='Output file'); - - klinearizer.set_defaults(func=pgf_klinearize); - klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \ - help='PGF Grammar file'); - klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \ - help='Target language'); - klinearizer.add_argument('-K', dest='K', required=True, type=int, \ - help='K value for multiple linearizations'); - klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \ - help='Input file'); - klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \ - help='Output file'); - - return argparser; if __name__ == '__main__': - args = cmdLineParser().parse_args(sys.argv[1:]); - args.func(args); + args = cmdLineParser().parse_args(sys.argv[1:]); + args.func(args); diff --git a/src/runtime/python/examples/translation_pipeline.py b/src/runtime/python/examples/translation_pipeline.py index bfd8b5c94..e8dc92583 100644 --- a/src/runtime/python/examples/translation_pipeline.py +++ b/src/runtime/python/examples/translation_pipeline.py @@ -1,6 +1,18 @@ #!/usr/bin/env python +# Python 2 and 3 compatible +from __future__ import print_function + +""" +""" + import argparse, codecs, copy, itertools, logging, math, operator, os, os.path, re, string, sys, time; +try: + from itertools import imap as map; + from itertools import ifilter as filter; +except ImportError: + pass; + import xml.etree.ElementTree as etree; import pgf; @@ -8,392 +20,402 @@ import gf_utils; # http://snipplr.com/view/25657/indent-xml-using-elementtree/ def indentXMLNodes(elem, level=0): - i = "\n" + level*" " - if len(elem): - if not elem.text or not elem.text.strip(): - elem.text = i + " " - if not elem.tail or not elem.tail.strip(): - elem.tail = i - for elem in elem: - indentXMLNodes(elem, level+1) - if not elem.tail or not elem.tail.strip(): - elem.tail = i + i = "\n" + level*" " + if len(elem): + if not elem.text or not elem.text.strip(): + elem.text = i + " " + if not elem.tail or not elem.tail.strip(): + elem.tail = i + for elem in elem: + indentXMLNodes(elem, level+1) + if not elem.tail or not elem.tail.strip(): + elem.tail = i else: - if level and (not 
elem.tail or not elem.tail.strip()):
-            elem.tail = i
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
 
 def readTranslationPipelineOptions(propsfile, default_namespace):
-    with codecs.open(propsfile, 'r', 'utf-8') as infile:
-        for line in infile:
-            if not line.strip():
-                continue;
-            key, value = line.strip().split('=', 1);
-            key, value = key.strip(), value.strip();
-            if key == 'srclang': default_namespace.srclang = value;
-            elif key == 'tgtlangs': default_namespace.tgtlangs = [val.strip() for val in ','.split(value)];
-            elif key == 'input': default_namespace.input = value;
-            elif key == 'format': default_namespace.format = value;
-            elif key == 'exp_directory': default_namespace.exp_directory = value;
-            else:
-                #print >>sys.stderr, "Unknown option-%s found in props file. Ignoring and proceeding." %(key);
-                logging.warning("Unknown option-%s found in props file. Ignoring and proceeding." %(key));
-                continue;
-    return default_namespace;
+    with codecs.open(propsfile, 'r', 'utf-8') as infile:
+        for line in infile:
+            if not line.strip():
+                continue;
+            key, value = line.strip().split('=', 1);
+            key, value = key.strip(), value.strip();
+            if key == 'srclang':
+                default_namespace.srclang = value;
+            elif key == 'tgtlangs':
+                default_namespace.tgtlangs = [val.strip() for val in value.split(',')];
+            elif key == 'input':
+                default_namespace.input = value;
+            elif key == 'format':
+                default_namespace.format = value;
+            elif key == 'exp_directory':
+                default_namespace.exp_directory = value;
+            else:
+                logging.warning("Unknown option %s found in props file. Ignoring and proceeding." %(key));
+                continue;
+    return default_namespace;
 
 def sgmReader(sgmDoc):
-    root = sgmDoc.getroot();
-    for element in root.iter():
-        if element.text is not None and element.text.strip():
-            yield element.text.strip().encode('utf-8');
+    root = sgmDoc.getroot();
+    for element in root.iter():
+        if element.text is not None and element.text.strip():
+            yield element.text.strip();#.encode('utf-8');
 
 def addToSgm(sgmDoc, strItem):
-    for node in sgmDoc.findall('.//seg'):
-        if not node.text.strip():
-            strItem = strItem.decode('utf-8');
-            node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
-            return;
-    logging.error("No more nodes available for adding content");
-    return;
+    for node in sgmDoc.findall('.//seg'):
+        if not node.text.strip():
+            #strItem = strItem.decode('utf-8');
+            node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
+            return;
+    logging.error("No more nodes available for adding content");
+    return;
 
 def sgmWriter(sgmDoc):
-    indentXMLNodes( sgmDoc.getroot() );
-    return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml');
+    indentXMLNodes( sgmDoc.getroot() );
+    # decode so the writer returns text on both Python 2 and 3;
+    return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml').decode('utf-8');
 
 def getXMLSkeleton(sgmDoc, tgtlang):
-    skeletonDoc = copy.deepcopy(sgmDoc);
-    root = skeletonDoc.getroot();
-    root.tag = 'tstset';
-    root.attrib['trlang'] = tgtlang[-3:];
-    root.find('doc').attrib['sysid'] = tgtlang[:-3];
-    for node in root.findall('.//seg'):
-        node.text = '';
-    return skeletonDoc;
+    skeletonDoc = copy.deepcopy(sgmDoc);
+    root = skeletonDoc.getroot();
+    root.tag = 'tstset';
+    root.attrib['trlang'] = tgtlang[-3:];
+    root.find('doc').attrib['sysid'] = tgtlang[:-3];
+    for node in root.findall('.//seg'):
+        node.text = '';
+    return skeletonDoc;
 
 def pipeline_lexer(sentence):
-    tokens = sentence.strip().split();
-    #tokens = filter(None, re.split('(\W+)', sentence.strip()));
-    n = len(tokens);
-    idx = len(tokens)-1;
-    while idx >= 0:
-        if 
tokens[idx] in ".?!)": - idx -= 1; - else: - break; - tokens = tokens[:idx+1]; - idx = 0; - while idx < len(tokens): - if tokens[idx] in "'\"(": - idx += 1; - else: - break; - tokens = tokens[idx:]; - return ' '.join(tokens); - -def web_lexer(grammar, lang, sentences): - for instance in sentences: - tokensList = re.split('\s+?', instance.strip()); - for idx, token in enumerate(tokensList): - if not token[0].isupper(): - continue; - lowertoken = tokensList[idx].lower(); - count = 0; - for analysis in grammar.languages[lang].lookupMorpho(lowertoken): - count += 1; - tokensList[idx] = lowertoken if count else token; - for idx, token in enumerate(tokensList): - if token.find('-') == -1: - continue; - count = 0; - for analysis in grammar.languages[lang].lookupMorpho(token): - count += 1; - if count: - continue; - token = tokensList[idx].replace('-', ''); - for analysis in grammar.languages[lang].lookupMorpho(token): - count += 1; - if count: - tokensList[idx] = token; - continue; - token = tokensList[idx].replace('-', ' '); - yield ' '.join(tokensList); + tokens = sentence.strip().split(); + #tokens = filter(None, re.split('(\W+)', sentence.strip())); + n = len(tokens); + idx = len(tokens)-1; + while idx >= 0: + if tokens[idx] in ".?!)": + idx -= 1; + else: + break; + tokens = tokens[:idx+1]; + idx = 0; + while idx < len(tokens): + if tokens[idx] in "'\"(": + idx += 1; + else: + break; + tokens = tokens[idx:]; + return ' '.join(tokens); def clean_gfstrings(sentence): - absFuncName = re.compile('\[[^]]+?\]'); - untranslatedEntries = {}; - for entry in re.findall(absFuncName, sentence): - untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1; - for entry in untranslatedEntries: - while untranslatedEntries[entry] > 1: - sentence = sentence.replace(entry, '', 1); - untranslatedEntries[entry] -= 1; - sentence = sentence.replace(entry, ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 else ''); - return ' '.join( sentence.split() ); + absFuncName = re.compile('\[[^]]+?\]'); + untranslatedEntries = {}; + for entry in re.findall(absFuncName, sentence): + untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1; + for entry in untranslatedEntries: + while untranslatedEntries[entry] > 1: + sentence = sentence.replace(entry, '', 1); + untranslatedEntries[entry] -= 1; + sentence = sentence.replace(entry, \ + ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 \ + else ''); + return ' '.join( sentence.split() ); def parseNames(grammar, language, sentence): - def callback(lin_idx, start): - moving_start, end, eot = start, len(sentence), True; - if moving_start < end and (not sentence[moving_start].isupper()): - return None; - while moving_start < end: - if sentence[moving_start] in string.whitespace: - eot = True; - elif eot and sentence[moving_start].isupper(): - eot = False; - elif eot and (not sentence[moving_start].isupper()): - end = moving_start-1; - break; - moving_start += 1; - possible_name = sentence[start:end].strip(); - if possible_name: - if language.endswith('Eng') and (possible_name == "I" or possible_name == "I'm"): - return None; - elif language.endswith('Eng') and possible_name.endswith("'s"): - end_idx = possible_name.rfind("'s"); - if end_idx != -1: - possible_name = possible_name[:end_idx].strip(); - end -= 2; - if not possible_name: - return None; - expr, prob = None, None; - for analysis in grammar.languages[language].lookupMorpho(possible_name): - category = grammar.functionType(analysis[0]).cat; - if prob < analysis[-1]: - if 
category == "PN": - expr, prob = pgf.Expr(analysis[0], []), analysis[-1]; - elif category == "Weekday": - expr, prob = pgf.Expr("weekdayPN", [pgf.Expr(analysis[0], [])]), analysis[-1]; - elif category == "Month": - expr, prob = pgf.Expr("monthPN", [pgf.Expr(analysis[0], [])]), analysis[-1]; - elif category == "Language": - return None; - # generic named entity - if expr == None: - expr = pgf.Expr(possible_name); - expr = pgf.Expr("MkSymb", [expr]); - expr = pgf.Expr("SymbPN", [expr]); - return (expr, 0, end); - return None; - return callback; + def callback(lin_idx, start): + moving_start, end, eot = start, len(sentence), True; + if moving_start < end and (not sentence[moving_start].isupper()): + return None; + while moving_start < end: + if sentence[moving_start] in string.whitespace: + eot = True; + elif eot and sentence[moving_start].isupper(): + eot = False; + elif eot and (not sentence[moving_start].isupper()): + end = moving_start-1; + break; + moving_start += 1; + possible_name = sentence[start:end].strip(); + if possible_name: + if language.endswith('Eng') and \ + (possible_name == "I" or possible_name == "I'm"): + return None; + elif language.endswith('Eng') and possible_name.endswith("'s"): + end_idx = possible_name.rfind("'s"); + if end_idx != -1: + possible_name = possible_name[:end_idx].strip(); + end -= 2; + if not possible_name: + return None; + expr, prob = None, None; + for analysis in grammar.languages[language].lookupMorpho(possible_name): + category = grammar.functionType(analysis[0]).cat; + if prob < analysis[-1]: + if category == "PN": + expr, prob = pgf.Expr(analysis[0], []), analysis[-1]; + elif category == "Weekday": + expr, prob = pgf.Expr("weekdayPN", \ + [pgf.Expr(analysis[0], [])]), analysis[-1]; + elif category == "Month": + expr, prob = pgf.Expr("monthPN", \ + [pgf.Expr(analysis[0], [])]), analysis[-1]; + elif category == "Language": + return None; + # generic named entity + if expr == None: + expr = pgf.Expr(possible_name); + expr = pgf.Expr("MkSymb", [expr]); + expr = pgf.Expr("SymbPN", [expr]); + return (expr, 0, end); + return None; + return callback; def parseUnknown(grammar, language, sentence): - def callback(lin_idx, start): - moving_start, end, eot = start, len(sentence), True; - isNewToken = (moving_start == 0) or (moving_start > 1 and sentence[moving_start-1].isspace()) # -- added to deal with segmentation errors like may => ma_N + Symb y - if moving_start < end and (not sentence[moving_start].isupper()): - while moving_start < end: - if sentence[moving_start] in string.whitespace: - end = moving_start; - break; - moving_start += 1; - unknown_word = sentence[start:end].strip(); - if unknown_word and isNewToken: - count = 0; - for analysis in grammar.languages[language].lookupMorpho(unknown_word): - count += 1; - if not count: - expr = pgf.Expr("MkSymb", [pgf.Expr(unknown_word)]); - return (expr, 0, end); - return None; - return callback; + def callback(lin_idx, start): + moving_start, end, eot = start, len(sentence), True; + # -- added to deal with segmentation errors like may => ma_N + Symb y + isNewToken = (moving_start == 0) or \ + (moving_start > 1 and sentence[moving_start-1].isspace()) + if moving_start < end and (not sentence[moving_start].isupper()): + while moving_start < end: + if sentence[moving_start] in string.whitespace: + end = moving_start; + break; + moving_start += 1; + unknown_word = sentence[start:end].strip(); + if unknown_word and isNewToken: + count = 0; + for analysis in 
grammar.languages[language].lookupMorpho(unknown_word):
+                count += 1;
+            if not count:
+                expr = pgf.Expr("MkSymb", [pgf.Expr(unknown_word)]);
+                return (expr, 0, end);
+        return None;
+    return callback;
 
-def parseTester(grammar, language):
-    def callback(lin_idx, sentence, start):
-        if start < len(sentence):
-            return (pgf.Expr(sentence[start]), 0, start+1);
-        return None;
-    return callback;
+def parseTester(grammar, language, sentence):
+    def callback(lin_idx, start):
+        if start < len(sentence):
+            return (pgf.Expr(sentence[start]), 0, start+1);
+        return None;
+    return callback;
 
 def translateWordsAsChunks(grammar, language, tgtlanguages, word):
-    parser = grammar.languages[language].parse;
-    linearizersList = dict((lang, grammar.languages[lang].linearize) for lang in tgtlanguages);
-    translations = [];
-    try:
-        for parseidx, parse in enumerate( parser(word) ):
-            for lang in tgtlanguages:
-                trans = linearizersList[lang](parse[1]);
-                translations.append(( lang, gf_utils.postprocessor(trans.strip() if trans else '') ) );
-            break;
-    except pgf.ParseError, err:
-        return [];
-    return translations;
+    parser = grammar.languages[language].parse;
+    linearizersList = dict((lang, grammar.languages[lang].linearize) \
+            for lang in tgtlanguages);
+    translations = [];
+    try:
+        for parseidx, parse in enumerate( parser(word) ):
+            for lang in tgtlanguages:
+                trans = linearizersList[lang](parse[1]);
+                translations.append((lang, gf_utils.postprocessor(\
+                        trans.strip() if trans else '')));
+            break;
+    except pgf.ParseError as err:
+        return [];
+    return translations;
 
 def translateWord(grammar, language, tgtlanguages, word):
-    possible_translations = translateWordsAsChunks(grammar, language, tgtlanguages, word);
-    if len(possible_translations):
-        return possible_translations;
-
-    lowerword = word.lower();
-    try:
-        partialExprList = grammar.languages[language].parse(word, cat='Chunk');
-        for expr in partialExprList:
-            return [(lang, gf_utils.gf_postprocessor( grammar.languages[lang].linearize(expr[1]) )) for lang in tgtlanguages];
-    except pgf.ParseError:
-        morphAnalysis = grammar.languages[language].lookupMorpho(word) + grammar.languages[language].lookupMorpho(lowerword);
-        for morph in morphAnalysis:
-            countPositiveLanguages = filter(None, [grammar.languages[lang].hasLinearization(morph[0]) for lang in tgtlanguages]);
-            if len(countPositiveLanguages) > 0.5*len(tgtlanguages):
-                return [(lang, gf_utils.gf_postprocessor( grammar.languages[lang].linearize( pgf.readExpr(morph[0]) ) )) for lang in tgtlanguages];
-    return [(lang, word) for lang in tgtlanguages];
+    possible_translations = translateWordsAsChunks(grammar, language, \
+            tgtlanguages, word);
+    if len(possible_translations):
+        return possible_translations;
+    lowerword = word.lower();
+    try:
+        partialExprList = grammar.languages[language].parse(word, cat='Chunk');
+        for expr in partialExprList:
+            # gf_utils defines postprocessor, not gf_postprocessor;
+            return [(lang, gf_utils.postprocessor(\
+                    grammar.languages[lang].linearize(expr[1]))) \
+                    for lang in tgtlanguages];
+    except pgf.ParseError:
+        morphAnalysis = grammar.languages[language].lookupMorpho(word) +\
+                grammar.languages[language].lookupMorpho(lowerword);
+        for morph in morphAnalysis:
+            countPositiveLanguages = list(filter(None, \
+                    [grammar.languages[lang].hasLinearization(morph[0]) \
+                    for lang in tgtlanguages]));
+            if len(countPositiveLanguages) > 0.5*len(tgtlanguages):
+                return [(lang, \
+                        gf_utils.postprocessor(grammar.languages[lang].linearize(pgf.readExpr(morph[0])))) \
+                        for lang in tgtlanguages];
+    return [(lang, word) for lang in tgtlanguages];
 
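+# Fallback used when full parsing fails: try to parse the longest possible
+# chunk; when pgf.ParseError reports an unseen token, split the token list
+# around that token and translate each piece separately, down to single
+# words (translateWord) when nothing larger parses.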
def translationByLookup(grammar, language, tgtlanguages, sentence): - parser = grammar.languages[language].parse; - linearizersList = dict([(lang, grammar.languages[lang].linearize) for lang in tgtlanguages]); - queue = [sentence.strip().split()]; - transChunks = {}; - while len(queue): - head = queue[0]; - if not len(head): - pass; - elif len(head) == 1 and head[0].strip(): - for lang, wordchoice in translateWord(grammar, language, tgtlanguages, head[0]): - transChunks.setdefault(lang, []).append( gf_utils.postprocessor(wordchoice) ); - else: - try: - for parseidx, parse in enumerate( parser(' '.join(head)) ): - for lang in tgtlanguages: - if linearizersList[lang](parse[1]) == None: - transChunks.setdefault(lang, []).append( ' ' ); - else: - transChunks.setdefault(lang, []).append( gf_utils.postprocessor( linearizersList[lang](parse[1]).strip() ) ); - break; - except pgf.ParseError, err: - #unseenToken = re.findall('"[^"]+?"', err.message)[0][1:-1]; - unseenToken = err.message.strip().split()[-1][1:-1]; - idx = head.index(unseenToken); - queue.insert(1, head[:idx] ); - queue.insert(2, [head[idx]] ); - queue.insert(3, head[idx+1:] ); - del queue[0]; - for lang in tgtlanguages: - yield (lang, ' '.join(transChunks[lang])); + parser = grammar.languages[language].parse; + linearizersList = dict([(lang, grammar.languages[lang].linearize) \ + for lang in tgtlanguages]); + queue = [sentence.strip().split()]; + transChunks = {}; + while len(queue): + head = queue[0]; + if not len(head): + pass; + elif len(head) == 1 and head[0].strip(): + for lang, wordchoice in translateWord(grammar, language, \ + tgtlanguages, head[0]): + transChunks.setdefault(lang, []).append(\ + gf_utils.postprocessor(wordchoice)); + else: + try: + for parseidx, parse in enumerate(parser(' '.join(head))): + for lang in tgtlanguages: + if linearizersList[lang](parse[1]) == None: + transChunks.setdefault(lang, []).append(' '); + else: + transChunks.setdefault(lang, []).append(\ + gf_utils.postprocessor(linearizersList[lang](parse[1]).strip())); + break; + except pgf.ParseError as err: + #unseenToken = re.findall('"[^"]+?"', err.message)[0][1:-1]; + unseenToken = err.message.strip().split()[-1][1:-1]; + idx = head.index(unseenToken); + queue.insert(1, head[:idx] ); + queue.insert(2, [head[idx]] ); + queue.insert(3, head[idx+1:] ); + del queue[0]; + for lang in tgtlanguages: + yield (lang, ' '.join(transChunks[lang])); def pipelineParsing(grammar, language, sentences, K=20): - #buf = [sent for sent in sentences]; - buf, sentences = itertools.tee(sentences, 2); - sentences = itertools.imap(gf_utils.lexer(lang=language), sentences); - parser = gf_utils.getKBestParses(grammar, language, K); - for sent, (time, parsesBlock) in itertools.izip(buf, itertools.imap(parser, sentences)): - yield (sent, parsesBlock); + #buf = [sent for sent in sentences]; + buf, sentences = itertools.tee(sentences, 2); + parser = gf_utils.getKBestParses(grammar, language, K); + for sent, (time, parsesBlock) in zip(buf, map(parser, sentences)): + yield (sent, parsesBlock); def translation_pipeline(props): - if props.propsfile: - props = readTranslationPipelineOptions(props.propsfile, props); - - # UGLY HACK FOR K-best translation: if K-best translation output format is only txt - if props.bestK != 1: - props.format = 'txt'; - - if not os.path.isdir( props.exp_directory ): - logging.info("Creating output directory: %s" %(props.exp_directory)); - os.makedirs(props.exp_directory); + if props.propsfile: + props = readTranslationPipelineOptions(props.propsfile, 
props); - if not props.srclang: - logging.critical("Mandatory option source-lang missing. Can not determine source language."); - sys.exit(1); + # UGLY HACK FOR K-best translation: if K-best translation output format is only txt + if props.bestK != 1: + props.format = 'txt'; - grammar = pgf.readPGF(props.pgffile); + if not os.path.isdir( props.exp_directory ): + logging.info("Creating output directory: %s" %(props.exp_directory)); + os.makedirs(props.exp_directory); - sourceLanguage = filter(None, [lang if lang[-3:] == props.srclang else '' for lang in grammar.languages.keys()])[0]; - logging.info("Translating from %s" %(sourceLanguage)); + if not props.srclang: + logging.critical("Mandatory option source-lang missing. Can not determine source language."); + sys.exit(1); - if len(props.tgtlangs): - target_langs = props.tgtlangs; + grammar = pgf.readPGF(props.pgffile); + + sourceLanguage = filter(None, [lang if lang[-3:] == props.srclang else '' for lang in grammar.languages.keys()]); + sourceLanguage = list(sourceLanguage)[0]; + logging.info("Translating from %s" %(sourceLanguage)); + + if len(props.tgtlangs): + target_langs = props.tgtlangs; + else: + target_langs = filter(None, [lang[-3:] if lang != sourceLanguage \ + else '' for lang in grammar.languages.keys()]); + targetLanguages = filter(None, [lang if lang[-3:] in target_langs \ + else '' for lang in grammar.languages.keys()]); + targetLanguages = list(targetLanguages); + logging.info("Translating into the following languages: %s" %(','.join(targetLanguages))); + + K = props.bestK if props.bestK != 1 else 20; # by default we look for 20 best parses + bestK = props.bestK; + + if not props.input: + logging.info( "Input file name missing. Reading input from stdin." ); + inputStream = sys.stdin; + outputPrefix = os.getpid(); + else: + inputStream = codecs.open(props.input, 'r'); + outputPrefix = os.path.splitext( os.path.split(props.input)[1] )[0]; + + if props.format == 'sgm': + inputDoc = etree.parse(inputStream); + reader = sgmReader; + skeletonDoc = getXMLSkeleton; + addItem = addToSgm; + writer = sgmWriter; + elif props.format == 'txt': + logging.info("Input format is txt. Assuming one-sentence-per-line format."); + inputDoc = inputStream; + reader = lambda X: X; + skeletonDoc = lambda X, lang: list(); + addItem = lambda X, y: list.append(X, y); + writer = lambda X: ('\n'.join(X) if bestK == 1 else \ + '\n'.join(map(gf_utils.printMosesNbestFormat, X))); + + translationBlocks = {}; + for tgtlang in targetLanguages+['abstract']: + translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang); + + preprocessor = pipeline_lexer; + postprocessor = clean_gfstrings; + + logging.info( "Parsing text in %s" %(sourceLanguage) ); + # 1. Get Abstract Trees for sentences in source language. + tokenized_sentences = map(preprocessor, reader(inputDoc)); + web_lexer = gf_utils.Lexer('Web', grammar, sourceLanguage).tokenize; + absParses = [parsesBlock for parsesBlock in \ + pipelineParsing(grammar, sourceLanguage, \ + map(web_lexer, tokenized_sentences), K)]; + + logging.info( "Linearizing into %s" %(','.join(targetLanguages)) ); + # 2. 
Linearize in all target Languages + for idx, parsesBlock in enumerate( map(operator.itemgetter(1), absParses) ): + translationBuffer = {}; + if not len(parsesBlock): + # failed to parse; + # translate using lookup + for tgtlang, translation in translationByLookup(grammar, sourceLanguage,\ + targetLanguages, absParses[idx][0]): + if bestK == 1: + addItem(translationBlocks[tgtlang], postprocessor(translation)); + else: + addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))]); + addItem(translationBlocks['abstract'], ''); else: - target_langs = filter(None, [lang[-3:] if lang != sourceLanguage else '' for lang in grammar.languages.keys()]); - targetLanguages = filter(None, [lang if lang[-3:] in target_langs else '' for lang in grammar.languages.keys()]); - logging.info("Translating into the following languages: %s" %(','.join(targetLanguages))); - - K = props.bestK if props.bestK != 1 else 20; # by default we look for 20 best parses - bestK = props.bestK; - - if not props.input: - logging.info( "Input file name missing. Reading input from stdin." ); - inputStream = sys.stdin; - outputPrefix = os.getpid(); - - else: - inputStream = codecs.open(props.input, 'r'); - outputPrefix = os.path.splitext( os.path.split(props.input)[1] )[0]; - - if props.format == 'sgm': - inputDoc = etree.parse(inputStream); - reader = sgmReader; - skeletonDoc = getXMLSkeleton; - addItem = addToSgm; - writer = sgmWriter; - elif props.format == 'txt': - logging.info("Input format is txt. Assuming one-sentence-per-line format."); - inputDoc = inputStream; - reader = lambda X: X; - skeletonDoc = lambda X, lang: list(); - addItem = lambda X, y: list.append(X, y); - writer = lambda X: ('\n'.join(X) if bestK == 1 else '\n'.join(map(gf_utils.printMosesNbestFormat, X))); - - translationBlocks = {}; - for tgtlang in targetLanguages+['abstract']: - translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang); - - preprocessor = pipeline_lexer; - postprocessor = clean_gfstrings; - - logging.info( "Parsing text in %s" %(sourceLanguage) ); - # 1. Get Abstract Trees for sentences in source language. - tokenized_sentences = itertools.imap(preprocessor, reader(inputDoc)); - absParses = [parsesBlock for parsesBlock in pipelineParsing(grammar, sourceLanguage, web_lexer(grammar, sourceLanguage, tokenized_sentences), K)]; - - logging.info( "Linearizing into %s" %(','.join(targetLanguages)) ); - # 2. 
Linearize in all target Languages - for idx, parsesBlock in enumerate( itertools.imap(operator.itemgetter(1), absParses) ): - translationBuffer = {}; - if not len(parsesBlock): - # failed to parse; - # translate using lookup - for tgtlang, translation in translationByLookup(grammar, sourceLanguage, targetLanguages, absParses[idx][0]): - if bestK == 1: - addItem(translationBlocks[tgtlang], postprocessor(translation)); - else: - addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))]); - addItem(translationBlocks['abstract'], ''); - else: - bestTranslationIdx = 0; - for tgtlang in targetLanguages: - translationBuffer[tgtlang] = gf_utils.getKLinearizations(grammar, tgtlang, [parsesBlock], K=bestK).next(); - if bestK == 1: - for tidx, translation in enumerate(translationBuffer[tgtlang]): - if postprocessor(translation[1]).strip(): - if tidx > bestTranslationIdx: - bestTranslationIdx = tidx; - break; - for tgtlang in targetLanguages: - if bestK == 1: - translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) if len(translationBuffer[tgtlang]) > bestTranslationIdx else ((None,), ''); - abstract = str(parsesBlock[bestTranslationIdx][1]); - else: - translation = translationBuffer[tgtlang] if len(translationBuffer[tgtlang]) else []; - abstract = parsesBlock; - addItem(translationBlocks[tgtlang], translation); - addItem(translationBlocks['abstract'], abstract); - - for tgtlang in targetLanguages+['abstract']: - outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, tgtlang[-3:] if tgtlang!='abstract' else 'abstract', props.format) ); - logging.info( "Writing translations for %s to %s" %(tgtlang, outputFile) ); - with codecs.open(outputFile, 'w') as outputStream: - print >>outputStream, writer(translationBlocks[tgtlang]); - return; + bestTranslationIdx = 0; + for tgtlang in targetLanguages: + translationBuffer[tgtlang] = next(gf_utils.getKLinearizations(grammar, \ + tgtlang, [parsesBlock], K=bestK)); + if bestK == 1: + for tidx, translation in enumerate(translationBuffer[tgtlang]): + if postprocessor(translation[1]).strip(): + if tidx > bestTranslationIdx: + bestTranslationIdx = tidx; + break; + for tgtlang in targetLanguages: + if bestK == 1: + translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) \ + if len(translationBuffer[tgtlang]) > bestTranslationIdx \ + else ((None,), ''); + abstract = str(parsesBlock[bestTranslationIdx][1]); + else: + translation = translationBuffer[tgtlang] \ + if len(translationBuffer[tgtlang]) \ + else []; + abstract = parsesBlock; + addItem(translationBlocks[tgtlang], translation); + addItem(translationBlocks['abstract'], abstract); + + for tgtlang in targetLanguages+['abstract']: + outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, tgtlang[-3:] \ + if tgtlang!='abstract' \ + else 'abstract', props.format) ); + logging.info( "Writing translations for %s to %s" %(tgtlang, outputFile) ); + with codecs.open(outputFile, 'w', encoding='utf-8') as outputStream: + print(writer(translationBlocks[tgtlang]), file=outputStream); + return; def cmdLineParser(): - argparser = argparse.ArgumentParser(prog='translation_pipeline.py', description='Run the GF translation pipeline on standard test-sets'); - argparser.add_argument('-g', '--pgf', dest='pgffile', required=True, help='PGF grammar file to run the pipeline'); - argparser.add_argument('-s', '--source', dest='srclang', default='', help='Source language of input sentences'); - argparser.add_argument('-t', '--target', 
dest='tgtlangs', nargs='*', default=[], help='Target languages to linearize (default is all other languages)'); - argparser.add_argument('-i', '--input', dest='input', default='', help='input file (default will accept STDIN)'); - argparser.add_argument('-e', '--exp', dest='exp_directory', default=os.getcwd(), help='experiement directory to write translation files'); - argparser.add_argument('-f', '--format', dest='format', default='txt', choices=['txt', 'sgm'], help='input file format (output files will be written in the same format)'); - argparser.add_argument('-p', '--props', dest='propsfile', default='', help='properties file for the translation pipeline (specify the above arguments in a file)'); - argparser.add_argument('-K', dest='bestK', type=int, default=1, help='K value for K-best translation'); - return argparser; + argparser = argparse.ArgumentParser(prog='translation_pipeline.py', description='Run the GF translation pipeline on standard test-sets'); + argparser.add_argument('-g', '--pgf', dest='pgffile', required=True, help='PGF grammar file to run the pipeline'); + argparser.add_argument('-s', '--source', dest='srclang', default='', help='Source language of input sentences'); + argparser.add_argument('-t', '--target', dest='tgtlangs', nargs='*', default=[], help='Target languages to linearize (default is all other languages)'); + argparser.add_argument('-i', '--input', dest='input', default='', help='input file (default will accept STDIN)'); + argparser.add_argument('-e', '--exp', dest='exp_directory', default=os.getcwd(), help='experiement directory to write translation files'); + argparser.add_argument('-f', '--format', dest='format', default='txt', choices=['txt', 'sgm'], help='input file format (output files will be written in the same format)'); + argparser.add_argument('-p', '--props', dest='propsfile', default='', help='properties file for the translation pipeline (specify the above arguments in a file)'); + argparser.add_argument('-K', dest='bestK', type=int, default=1, help='K value for K-best translation'); + return argparser; if __name__ == '__main__': - logging.basicConfig(level='INFO'); - pipelineEnv = cmdLineParser().parse_args(sys.argv[1:]); - translation_pipeline(pipelineEnv); + logging.basicConfig(level='INFO'); + pipelineEnv = cmdLineParser().parse_args(sys.argv[1:]); + translation_pipeline(pipelineEnv);
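
For reference, the Python 2/3 compatibility pattern used in both files, reduced
to a minimal, self-contained sketch. Foods.pgf and the FoodsEng/FoodsIta
concrete syntax names are illustrative assumptions, not files shipped with this
patch; the pgf calls (readPGF, parse, linearize) are the ones exercised above.

    #!/usr/bin/env python
    # Python 2 and 3 compatible
    from __future__ import print_function
    import sys;
    try:
        from itertools import imap as map;  # Python 2: use the lazy variant
    except ImportError:
        pass;                               # Python 3: builtin map is already lazy
    import pgf;

    grammar = pgf.readPGF('Foods.pgf');        # hypothetical grammar file
    source = grammar.languages['FoodsEng'];    # hypothetical concrete syntaxes
    target = grammar.languages['FoodsIta'];
    try:
        # the parser yields (probability, expression) pairs, best first
        prob, expr = next(iter(source.parse('this pizza is delicious')));
        print(target.linearize(expr));
    except pgf.ParseError as err:              # 'as' form works on 2 and 3
        print(err, file=sys.stderr);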