forked from GitHub/gf-core
skip parsing for long sentences, to stop the pipeline from crashing/hanging
@@ -122,7 +122,7 @@ def getKLinearizations(grammar, tgtlanguage, abstractParsesList):
         kBestTrans.append( ((parseprob,), postprocessor( generator(parse) )) );
     yield kBestTrans;
 
-def getKBestParses(grammar, language, K, callbacks=[], serializable=False, sentid=count(1)):
+def getKBestParses(grammar, language, K, callbacks=[], serializable=False, sentid=count(1), max_length=50):
     parser = grammar.languages[language].parse;
     def worker(sentence):
         sentence = sentence.strip();
@@ -130,6 +130,10 @@ def getKBestParses(grammar, language, K, callbacks=[], serializable=False, senti
         tstart = time.time();
         kBestParses = [];
         parseScores = {};
+        if len(sentence.split()) > max_length:
+            tend, err = time.time(), "Sentence too long (%d tokens). Might potentially run out of memory" %(len(sentence.split()));
+            print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
+            return tend-tstart, kBestParses; # temporary hack to make sure parser does not get killed for very long sentences;
         try:
             for parseidx, parse in enumerate( parser(sentence, heuristics=0, callbacks=callbacks) ):
                 parseScores[parse[0]] = True;
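
For reference, a minimal standalone sketch of the guard this commit introduces. It assumes Python 2 (the codebase uses the `print >>` statement form); `curid` here stands in for the per-sentence id that the real `worker` derives from `sentid`, and the k-best bookkeeping that follows the guard is omitted:

    import sys
    import time

    def worker(sentence, curid=0, max_length=50):
        # Sketch of the guarded worker: skip the parser entirely when the
        # token count exceeds max_length, since chart-parsing very long
        # sentences can exhaust memory or hang the pipeline.
        sentence = sentence.strip()
        tstart = time.time()
        kBestParses = []
        if len(sentence.split()) > max_length:
            tend = time.time()
            err = "Sentence too long (%d tokens). Might potentially run out of memory" % len(sentence.split())
            print >>sys.stderr, '%d\t%.4f\t%s' % (curid, tend - tstart, err)
            # Return the elapsed time and the empty parse list, so downstream
            # stages see a normal (if empty) result instead of a dead process.
            return tend - tstart, kBestParses
        # ... the real worker goes on to parse and collect the K best parses ...

Returning `(elapsed time, empty list)` keeps the output shape identical to that of a sentence with no parses, which is what lets the rest of the pipeline continue unchanged; the in-code comment labels this a temporary hack rather than a permanent fix.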