(unittest) Add option to only use cc, never parse

Usage is the same as before, except that `-only-cc` is passed as one of the arguments. For example:

`python3 unittest/unittest.py src/somali/unittest/vp.gftest -only-cc`

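For this mode to work, the test file must contain only test cases of the following form (a concrete-syntax line with the expected linearisation, followed by the abstract-syntax line with the tree):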

```
LangSom: isku BIND ma barto
Lang: PhrUtt NoPConj (UttS (UseCl (TTAnt TPres ASimul) PNeg (PredVP (UsePron youSg_Pron) (ReflVP (SlashV2a teach_V2))))) NoVoc
```

In such test files, the token `&+` must be written as `BIND`.
This commit is contained in:
Inari Listenmaa
2019-08-11 14:34:55 +02:00
committed by GitHub
parent 2d3d382a41
commit 4c02a6c6d1

View File

@@ -53,9 +53,45 @@ def importfile(linenr, lang):
def stripstrings(strings):
    """Strip every string in *strings* and drop the ones that become empty.

    Returns a new list that keeps the order of the input.
    """
    # Strip lazily first, then keep only the non-empty results.
    stripped = (s.strip() for s in strings)
    return [s for s in stripped if s]
def create_gf_input_cc_only(testlines):
    """Build the GF command line and shell script for a cc-only test run.

    ``testlines`` is an iterable of the lines of a test file.  Each line is
    handled as follows:

    * Lines starting with ``#`` or ``--`` are comments and are ignored.
    * A line containing ``:`` is split at the first colon into a language
      name and a sentence.  The first such line of a test case also emits a
      ``ps "### <linenr>"`` marker.  ``importfile`` maps the language to a
      grammar file:

      - if that path does not contain ``/abstract/``, the script prints a
        ``+++ <linenr> <lang>`` marker, imports the file with
        ``i -retain -no-pmcfg`` and echoes the sentence with ``ps`` as the
        gold standard to compare against;
      - otherwise the sentence is handed to ``cc -unqual -one``.
    * A blank line ends the current test case.
    * Any other line, or a ``:`` line without both a language and a
      sentence, is reported through ``error`` and terminates the program
      with exit status 1.

    Returns a ``(command, gfinput)`` tuple: the argument list used to start
    GF and the text to feed to its standard input.
    """
    # Collect the script piecewise and join once at the end.
    script = []
    testing = False
    for linenr, line in enumerate(testlines, 1):
        if line.startswith(('#', '--')):
            # A comment line: nothing to do.
            continue
        if ':' in line:
            if not testing:
                # First line of a new test case: emit the test marker.
                script.append('ps "### %d" \n' % (linenr,))
                testing = True
            parts = stripstrings(line.split(':', 1))
            if len(parts) != 2:
                # Language or sentence is missing (e.g. "Lang:"); report it
                # like any other malformed line instead of crashing on the
                # unpacking below.
                error(linenr, "Ill-formatted line in test file:", line)
                raise SystemExit(1)
            lang, sent = parts
            langfile = importfile(linenr, lang)
            if '/abstract/' not in langfile:
                script.append('ps "+++ %d %s" \n' % (linenr, lang))
                script.append('i -retain -no-pmcfg %s \n' % (langfile,))
                # Gold standard to compare against.
                # NOTE(review): sent is inserted unescaped; a double quote in
                # the sentence would presumably break the ps command - confirm.
                script.append('ps "%s" \n' % (sent,))
            else:
                script.append('cc -unqual -one %s \n' % (sent,))
        elif not line.strip():
            # An empty line: the next ':' line starts a new test.
            testing = False
        else:
            error(linenr, "Ill-formatted line in test file:", line)
            raise SystemExit(1)

    # In cc-only mode GF itself is started with the retain/no-pmcfg flags.
    command = [
        'gf',
        '-run',
        '-retain',
        '-no-pmcfg',
        '-gfo-dir=/tmp',
    ]
    return (command, ''.join(script))
def create_gf_input(testlines):
# building the input to the GF process out of the lines of test file
gfinput = '' gfinput = ''
testing = False testing = False
for linenr, line in enumerate(testlines, 1): for linenr, line in enumerate(testlines, 1):
@@ -81,14 +117,25 @@ def runtest(testlines):
error(linenr, "Ill-formatted line in test file:", line) error(linenr, "Ill-formatted line in test file:", line)
exit(1) exit(1)
# then we call GF with the script, catching stdout: # If we're parsing, then command is just `gf -run'
gf = Popen('gf -run'.split(), stdin=PIPE, stdout=PIPE) return ('gf -run'.split(), gfinput)
def runtest(testlines,is_cc_only):
# first we build the input to the GF process:
if is_cc_only:
command,gfinput = create_gf_input_cc_only(testlines)
else:
command,gfinput = create_gf_input(testlines)
# calling GF from a subprocess:
gf = Popen(command, stdin=PIPE, stdout=PIPE)
stdout, _stderr = gf.communicate(gfinput.encode(ENCODING)) stdout, _stderr = gf.communicate(gfinput.encode(ENCODING))
stdout = stdout.decode(ENCODING) stdout = stdout.decode(ENCODING)
# then we analyse the result from the GF process: # then we analyse the result from the GF process:
totalerrors = 0 totalerrors = 0
alltests = stripstrings(stdout.split('###')) alltests = stripstrings(stdout.split('###'))
for testnr, test in enumerate(alltests, 1): for testnr, test in enumerate(alltests, 1):
sents = stripstrings(test.split('+++')) sents = stripstrings(test.split('+++'))
startline = int(sents.pop(0)) startline = int(sents.pop(0))
@@ -103,16 +150,24 @@ def runtest(testlines):
error(linenr, theerror) error(linenr, theerror)
testerrors += 1 testerrors += 1
else: else:
allerrors = [(sum(tree not in oldtrees for _, _, oldtrees in oldresults), tree) if is_cc_only:
for tree in alltrees] # If is_cc_only, gfinput (and thus stdout) include gold standard
besterrors, besttree = min(allerrors) gold = alltrees.pop(0)
if besterrors > 0: lin = alltrees.pop(0)
for oldlinenr, oldlang, oldtrees in oldresults: if gold != lin:
if besttree not in oldtrees: testerrors += 1
error(linenr, "Line %s (%s) is not a translation of line %s (%s)" error(linenr,"\nExpected linearisation\n\t%s \n\nActual linearisation\n\t%s" % (gold, lin))
% (linenr, lang, oldlinenr, oldlang)) else:
testerrors += 1 allerrors = [(sum(tree not in oldtrees for _, _, oldtrees in oldresults), tree)
oldresults.append((linenr, lang, alltrees)) for tree in alltrees]
besterrors, besttree = min(allerrors)
if besterrors > 0:
for oldlinenr, oldlang, oldtrees in oldresults:
if besttree not in oldtrees:
error(linenr, "Line %s (%s) is not a translation of line %s (%s)"
% (linenr, lang, oldlinenr, oldlang))
testerrors += 1
oldresults.append((linenr, lang, alltrees))
if not testerrors: if not testerrors:
print("OK!") print("OK!")
print() print()
@@ -130,14 +185,19 @@ if __name__ == '__main__':
if len(sys.argv) <= 1: if len(sys.argv) <= 1:
usage() usage()
exit(1) exit(1)
if "-only-cc" in sys.argv:
is_cc_only = True
else:
is_cc_only = False
for filename in sys.argv[1:]: for filename in sys.argv[1:]:
try: if filename != "-only-cc":
print("# Testing file:", filename) try:
with io.open(filename, encoding=ENCODING) as F: print("# Testing file:", filename)
with io.open(filename, encoding=ENCODING) as F:
print()
runtest(F,is_cc_only)
except IOError as err:
print(err)
print() print()
runtest(F) usage()
except IOError as err: exit(1)
print(err)
print()
usage()
exit(1)