1
0
forked from GitHub/gf-core

the parser in the C runtime can now detect incomplete sentences just like the parser in the Haskell runtime. This is also reflected in all bindings.

This commit is contained in:
Krasimir Angelov
2017-09-06 12:38:42 +02:00
parent 18f2135785
commit 15d014abb8
13 changed files with 184 additions and 71 deletions

View File

@@ -2139,30 +2139,37 @@ pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
*(PgfExprProb**)to = pgf_parse_result_next(ps);
}
static GuString
pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
static PgfParseError*
pgf_parsing_new_exception(PgfParsing* ps, GuPool* pool)
{
if (ps->before == NULL)
return "";
const uint8_t* p = (uint8_t*) ps->sentence;
const uint8_t* end = p + (ps->before ? ps->before->end_offset : 0);
const uint8_t* start = (uint8_t*) ps->sentence;
const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset;
PgfParseError* err = gu_new(PgfParseError, pool);
err->incomplete= (*end == 0);
err->offset = 0;
err->token_ptr = (char*) p;
const uint8_t* p = start;
while (p < end) {
if (gu_ucs_is_space(gu_utf8_decode(&p))) {
start = p;
err->token_ptr = (char*) p;
}
err->offset++;
}
if (err->incomplete) {
err->token_ptr = NULL;
err->token_len = 0;
return err;
}
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
end = p;
}
char* tok = gu_malloc(pool, end-start+1);
memcpy(tok, start, (end-start));
tok[end-start] = 0;
return tok;
err->token_len = ((char*)end)-err->token_ptr;
return err;
}
PGF_API GuEnum*
@@ -2204,7 +2211,7 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence,
while (gu_buf_length(ps->expr_queue) == 0) {
if (!pgf_parsing_proceed(ps)) {
GuExnData* exn = gu_raise(err, PgfParseError);
exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool);
return NULL;
}
@@ -2249,7 +2256,7 @@ pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ,
while (gu_buf_length(ps->expr_queue) == 0) {
if (!pgf_parsing_proceed(ps)) {
GuExnData* exn = gu_raise(err, PgfParseError);
exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool);
return NULL;
}
@@ -2312,7 +2319,7 @@ pgf_complete(PgfConcr* concr, PgfType* type, GuString sentence,
while (ps->before->end_offset < len) {
if (!pgf_parsing_proceed(ps)) {
GuExnData* exn = gu_raise(err, PgfParseError);
exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool);
return NULL;
}

View File

@@ -122,6 +122,13 @@ PGF_API_DECL PgfExprEnum*
pgf_generate_all(PgfPGF* pgf, PgfType* ty,
GuExn* err, GuPool* pool, GuPool* out_pool);
typedef struct {
int incomplete; // equal to !=0 if the sentence is incomplete, 0 otherwise
size_t offset;
const char* token_ptr;
size_t token_len;
} PgfParseError;
PGF_API_DECL PgfExprEnum*
pgf_parse(PgfConcr* concr, PgfType* typ, GuString sentence,
GuExn* err, GuPool* pool, GuPool* out_pool);

View File

@@ -63,7 +63,7 @@ module PGF2 (-- * PGF
alignWords,
-- ** Parsing
parse, parseWithHeuristics,
ParseOutput(..), parse, parseWithHeuristics,
-- ** Sentence Lookup
lookupSentence,
-- ** Generation
@@ -480,7 +480,15 @@ getAnalysis ref self c_lemma c_anal prob exn = do
anal <- peekUtf8CString c_anal
writeIORef ref ((lemma, anal, prob):ans)
parse :: Concr -> Type -> String -> Either String [(Expr,Float)]
-- | This data type encodes the different outcomes which you could get from the parser.
data ParseOutput
= ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed.
-- The string is the token where the parser have failed.
| ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees.
-- The list should be non-empty.
| ParseIncomplete -- ^ The sentence is not complete.
parse :: Concr -> Type -> String -> ParseOutput
parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) []
parseWithHeuristics :: Concr -- ^ the language with which we parse
@@ -497,7 +505,7 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse
-- the input sentence; the current offset in the sentence.
-- If a literal has been recognized then the output should
-- be Just (expr,probability,end_offset)
-> Either String [(Expr,Float)]
-> ParseOutput
parseWithHeuristics lang (Type ctype _) sent heuristic callbacks =
unsafePerformIO $
do exprPl <- gu_new_pool
@@ -510,11 +518,19 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks =
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
then do c_err <- (#peek GuExn, data.data) exn
c_incomplete <- (#peek PgfParseError, incomplete) c_err
if (c_incomplete :: CInt) == 0
then do c_offset <- (#peek PgfParseError, offset) c_err
token_ptr <- (#peek PgfParseError, token_ptr) c_err
token_len <- (#peek PgfParseError, token_len) c_err
tok <- peekUtf8CStringLen token_ptr token_len
gu_pool_free parsePl
gu_pool_free exprPl
return (ParseFailed (fromIntegral (c_offset :: CInt)) tok)
else do gu_pool_free parsePl
gu_pool_free exprPl
return ParseIncomplete
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
@@ -528,7 +544,7 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks =
else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl
exprFPl <- newForeignPtr gu_pool_finalizer exprPl
exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl)
return (Right exprs)
return (ParseOk exprs)
mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap)
mkCallbacksMap concr callbacks pool = do
@@ -595,7 +611,7 @@ parseWithOracle :: Concr -- ^ the language with which we parse
-> Cat -- ^ the start category
-> String -- ^ the input sentence
-> Oracle
-> Either String [(Expr,Float)]
-> ParseOutput
parseWithOracle lang cat sent (predict,complete,literal) =
unsafePerformIO $
do parsePl <- gu_new_pool
@@ -612,11 +628,19 @@ parseWithOracle lang cat sent (predict,complete,literal) =
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
then do c_err <- (#peek GuExn, data.data) exn
c_incomplete <- (#peek PgfParseError, incomplete) c_err
if (c_incomplete :: CInt) == 0
then do c_offset <- (#peek PgfParseError, offset) c_err
token_ptr <- (#peek PgfParseError, token_ptr) c_err
token_len <- (#peek PgfParseError, token_len) c_err
tok <- peekUtf8CStringLen token_ptr token_len
gu_pool_free parsePl
gu_pool_free exprPl
return (ParseFailed (fromIntegral (c_offset :: CInt)) tok)
else do gu_pool_free parsePl
gu_pool_free exprPl
return ParseIncomplete
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
@@ -630,7 +654,7 @@ parseWithOracle lang cat sent (predict,complete,literal) =
else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl
exprFPl <- newForeignPtr gu_pool_finalizer exprPl
exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl)
return (Right exprs)
return (ParseOk exprs)
where
oracleWrapper oracle catPtr lblPtr offset = do
cat <- peekUtf8CString catPtr

View File

@@ -2,7 +2,7 @@
module PGF2.FFI where
import Foreign ( alloca, poke )
import Foreign ( alloca, peek, poke )
import Foreign.C
import Foreign.Ptr
import Foreign.ForeignPtr
@@ -116,6 +116,19 @@ peekUtf8CString ptr =
else do cs <- decode pptr
return (((toEnum . fromEnum) x) : cs)
peekUtf8CStringLen :: CString -> CInt -> IO String
peekUtf8CStringLen ptr len =
alloca $ \pptr ->
poke pptr ptr >> decode pptr (ptr `plusPtr` fromIntegral len)
where
decode pptr end = do
ptr <- peek pptr
if ptr >= end
then return []
else do x <- gu_utf8_decode pptr
cs <- decode pptr end
return (((toEnum . fromEnum) x) : cs)
newUtf8CString :: String -> Ptr GuPool -> IO CString
newUtf8CString s pool = do
-- An UTF8 character takes up to 6 bytes. We allocate enough

View File

@@ -37,18 +37,18 @@ execute cmd =
P lang s -> do pgf <- gets fst
c <- getConcr' pgf lang
case parse c (startCat pgf) s of
Left tok -> do put (pgf,[])
putln ("Parse error: "++tok)
Right ts -> do put (pgf,map show ts)
pop
ParseFailed _ tok -> do put (pgf,[])
putln ("Parse error: "++tok)
ParseOk ts -> do put (pgf,map show ts)
pop
T from to s -> do pgf <- gets fst
cfrom <- getConcr' pgf from
cto <- getConcr' pgf to
case parse cfrom (startCat pgf) s of
Left tok -> do put (pgf,[])
putln ("Parse error: "++tok)
Right ts -> do put (pgf,map (linearize cto.fst) ts)
pop
ParseFailed _ tok -> do put (pgf,[])
putln ("Parse error: "++tok)
ParseOk ts -> do put (pgf,map (linearize cto.fst) ts)
pop
I path -> do pgf <- liftIO (readPGF path)
putln . unwords . M.keys $ languages pgf
put (pgf,[])

View File

@@ -34,10 +34,8 @@ gu2j_string(JNIEnv *env, GuString s) {
}
JPGF_INTERNAL jstring
gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) {
const char* s = gu_string_buf_data(sbuf);
gu2j_string_len(JNIEnv *env, const char* s, size_t len) {
const char* utf8 = s;
size_t len = gu_string_buf_length(sbuf);
jchar* utf16 = alloca(len*sizeof(jchar));
jchar* dst = utf16;
@@ -56,6 +54,11 @@ gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) {
return (*env)->NewString(env, utf16, dst-utf16);
}
JPGF_INTERNAL jstring
gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) {
return gu2j_string_len(env, gu_string_buf_data(sbuf), gu_string_buf_length(sbuf));
}
JPGF_INTERNAL GuString
j2gu_string(JNIEnv *env, jstring s, GuPool* pool) {
GuString str = (*env)->GetStringUTFChars(env, s, 0);

View File

@@ -20,6 +20,9 @@
JPGF_INTERNAL_DECL jstring
gu2j_string(JNIEnv *env, GuString s);
JPGF_INTERNAL_DECL jstring
gu2j_string_len(JNIEnv *env, const char* s, size_t len);
JPGF_INTERNAL_DECL jstring
gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf);

View File

@@ -591,6 +591,30 @@ JNIEXPORT void JNICALL Java_org_grammaticalframework_pgf_Parser_addLiteralCallba
j2gu_string(env, jcat, pool), &callback->callback);
}
static void
throw_parse_error(JNIEnv *env, PgfParseError* err)
{
jstring jtoken;
if (err->incomplete)
jtoken = NULL;
else {
jtoken = gu2j_string_len(env, err->token_ptr, err->token_len);
if (!jtoken)
return;
}
jclass exception_class = (*env)->FindClass(env, "org/grammaticalframework/pgf/ParseError");
if (!exception_class)
return;
jmethodID constrId = (*env)->GetMethodID(env, exception_class, "<init>", "(Ljava/lang/String;IZ)V");
if (!constrId)
return;
jobject exception = (*env)->NewObject(env, exception_class, constrId, jtoken, err->offset, err->incomplete);
if (!exception)
return;
(*env)->Throw(env, exception);
}
JNIEXPORT jobject JNICALL
Java_org_grammaticalframework_pgf_Parser_parseWithHeuristics
(JNIEnv* env, jclass clazz, jobject jconcr, jstring jstartCat, jstring js, jdouble heuristics, jlong callbacksRef, jobject jpool)
@@ -615,8 +639,7 @@ Java_org_grammaticalframework_pgf_Parser_parseWithHeuristics
GuString msg = (GuString) gu_exn_caught_data(parse_err);
throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", msg);
} else if (gu_exn_caught(parse_err, PgfParseError)) {
GuString tok = (GuString) gu_exn_caught_data(parse_err);
throw_string_exception(env, "org/grammaticalframework/pgf/ParseError", tok);
throw_parse_error(env, (PgfParseError*) gu_exn_caught_data(parse_err));
}
gu_pool_free(out_pool);
@@ -656,8 +679,7 @@ Java_org_grammaticalframework_pgf_Completer_complete(JNIEnv* env, jclass clazz,
GuString msg = (GuString) gu_exn_caught_data(parse_err);
throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", msg);
} else if (gu_exn_caught(parse_err, PgfParseError)) {
GuString tok = (GuString) gu_exn_caught_data(parse_err);
throw_string_exception(env, "org/grammaticalframework/pgf/ParseError", tok);
throw_parse_error(env, (PgfParseError*) gu_exn_caught_data(parse_err));
}
gu_pool_free(pool);

View File

@@ -4,11 +4,26 @@ package org.grammaticalframework.pgf;
public class ParseError extends Exception {
private static final long serialVersionUID = -6086991674218306569L;
public ParseError(String token) {
super(token);
private String token;
private int offset;
private boolean incomplete;
public ParseError(String token, int offset, boolean incomplete) {
super(incomplete ? "The sentence is incomplete" : "Unexpected token: \""+token+"\"");
this.token = token;
this.offset = offset;
this.incomplete = incomplete;
}
public String getToken() {
return token;
}
public int getOffset() {
return offset;
}
public String getToken() {
return getMessage();
public boolean isIncomplete() {
return incomplete;
}
}

View File

@@ -1545,13 +1545,28 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
GuString msg = (GuString) gu_exn_caught_data(parse_err);
PyErr_SetString(PGFError, msg);
} else if (gu_exn_caught(parse_err, PgfParseError)) {
GuString tok = (GuString) gu_exn_caught_data(parse_err);
PyObject* py_tok = PyString_FromString(tok);
PyObject_SetAttrString(ParseError, "token", py_tok);
PyErr_Format(ParseError, "Unexpected token: \"%s\"", tok);
Py_DECREF(py_tok);
PgfParseError* err = (PgfParseError*) gu_exn_caught_data(parse_err);
PyObject* py_offset = PyInt_FromLong(err->offset);
if (err->incomplete) {
PyObject_SetAttrString(ParseError, "incomplete", Py_True);
PyObject_SetAttrString(ParseError, "offset", py_offset);
PyErr_Format(ParseError, "The sentence is incomplete");
} else {
PyObject* py_tok = PyString_FromStringAndSize(err->token_ptr,
err->token_len);
PyObject_SetAttrString(ParseError, "incomplete", Py_False);
PyObject_SetAttrString(ParseError, "offset", py_offset);
PyObject_SetAttrString(ParseError, "token", py_tok);
#if PY_MAJOR_VERSION >= 3
PyErr_Format(ParseError, "Unexpected token: \"%U\"", py_tok);
#else
PyErr_Format(ParseError, "Unexpected token: \"%s\"", PyString_AsString(py_tok));
#endif
Py_DECREF(py_tok);
}
Py_DECREF(py_offset);
}
Py_DECREF(pyres);
pyres = NULL;
}