diff --git a/doc/runtime-api.html b/doc/runtime-api.html index 966f5f15c..792dbed39 100644 --- a/doc/runtime-api.html +++ b/doc/runtime-api.html @@ -154,12 +154,12 @@ or by calling __next__ if you are using Python 3: -This gives you a result of type Either String [(Expr, Float)]. -If the result is Left then the parser has failed and you will -get the token where the parser got stuck. If the parsing was successful -then you get a potentially infinite list of parse results: +This gives you a result of type ParseOutput. +If the result is ParseFailed then the parser has failed and you will +get the offset and the token where the parser got stuck. If the parsing was successful +then you get ParseOk with a potentially infinite list of parse results:
-Prelude PGF2> let Right ((e,p):rest) = res
+Prelude PGF2> let ParseOk ((e,p):rest) = res
 
diff --git a/src/compiler/GF/Command/Commands2.hs b/src/compiler/GF/Command/Commands2.hs index c8e6fbff3..995d2d560 100644 --- a/src/compiler/GF/Command/Commands2.hs +++ b/src/compiler/GF/Command/Commands2.hs @@ -636,10 +636,12 @@ pgfCommands = Map.fromList [ cncs = optConcs env opts parsed rs = Piped (Exprs ts,unlines msgs) where - ts = [hsExpr t|Right ts<-rs,(t,p)<-takeOptNum opts ts] - msgs = concatMap (either err ok) rs - err msg = ["Parse failed: "++msg] - ok = map (PGF2.showExpr [] . fst).takeOptNum opts + ts = [hsExpr t|ParseOk ts<-rs,(t,p)<-takeOptNum opts ts] + msgs = concatMap mkMsg rs + + mkMsg (ParseOk ts) = (map (PGF2.showExpr [] . fst).takeOptNum opts) ts + mkMsg (ParseFailed _ tok) = ["Parse failed: "++tok] + mkMsg (ParseIncomplete) = ["The sentence is incomplete"] optLins env opts ts = case opts of _ | isOpt "groups" opts -> diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index ecfb7d2ea..2cae01db6 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -2139,30 +2139,37 @@ pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool) *(PgfExprProb**)to = pgf_parse_result_next(ps); } -static GuString -pgf_parsing_last_token(PgfParsing* ps, GuPool* pool) +static PgfParseError* +pgf_parsing_new_exception(PgfParsing* ps, GuPool* pool) { - if (ps->before == NULL) - return ""; + const uint8_t* p = (uint8_t*) ps->sentence; + const uint8_t* end = p + (ps->before ? ps->before->end_offset : 0); - const uint8_t* start = (uint8_t*) ps->sentence; - const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset; + PgfParseError* err = gu_new(PgfParseError, pool); + err->incomplete= (*end == 0); + err->offset = 0; + err->token_ptr = (char*) p; - const uint8_t* p = start; while (p < end) { if (gu_ucs_is_space(gu_utf8_decode(&p))) { - start = p; + err->token_ptr = (char*) p; } + err->offset++; + } + + if (err->incomplete) { + err->token_ptr = NULL; + err->token_len = 0; + return err; } while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { end = p; } - char* tok = gu_malloc(pool, end-start+1); - memcpy(tok, start, (end-start)); - tok[end-start] = 0; - return tok; + err->token_len = ((char*)end)-err->token_ptr; + + return err; } PGF_API GuEnum* @@ -2204,7 +2211,7 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence, while (gu_buf_length(ps->expr_queue) == 0) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } @@ -2249,7 +2256,7 @@ pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ, while (gu_buf_length(ps->expr_queue) == 0) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } @@ -2312,7 +2319,7 @@ pgf_complete(PgfConcr* concr, PgfType* type, GuString sentence, while (ps->before->end_offset < len) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 632a1d332..e6c1c70b8 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -122,6 +122,13 @@ PGF_API_DECL PgfExprEnum* pgf_generate_all(PgfPGF* pgf, PgfType* ty, GuExn* err, GuPool* pool, GuPool* out_pool); +typedef struct { + int incomplete; // equal to !=0 if the sentence is incomplete, 0 otherwise + size_t offset; + const char* token_ptr; + size_t token_len; +} PgfParseError; + PGF_API_DECL PgfExprEnum* pgf_parse(PgfConcr* concr, PgfType* typ, GuString sentence, GuExn* err, GuPool* pool, GuPool* out_pool); diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index 8fb6e6ccb..0d65822f9 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -63,7 +63,7 @@ module PGF2 (-- * PGF alignWords, -- ** Parsing - parse, parseWithHeuristics, + ParseOutput(..), parse, parseWithHeuristics, -- ** Sentence Lookup lookupSentence, -- ** Generation @@ -480,7 +480,15 @@ getAnalysis ref self c_lemma c_anal prob exn = do anal <- peekUtf8CString c_anal writeIORef ref ((lemma, anal, prob):ans) -parse :: Concr -> Type -> String -> Either String [(Expr,Float)] +-- | This data type encodes the different outcomes which you could get from the parser. +data ParseOutput + = ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed. + -- The string is the token where the parser have failed. + | ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees. + -- The list should be non-empty. + | ParseIncomplete -- ^ The sentence is not complete. + +parse :: Concr -> Type -> String -> ParseOutput parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] parseWithHeuristics :: Concr -- ^ the language with which we parse @@ -497,7 +505,7 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) - -> Either String [(Expr,Float)] + -> ParseOutput parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = unsafePerformIO $ do exprPl <- gu_new_pool @@ -510,11 +518,19 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error - then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekUtf8CString c_tok - gu_pool_free parsePl - gu_pool_free exprPl - return (Left tok) + then do c_err <- (#peek GuExn, data.data) exn + c_incomplete <- (#peek PgfParseError, incomplete) c_err + if (c_incomplete :: CInt) == 0 + then do c_offset <- (#peek PgfParseError, offset) c_err + token_ptr <- (#peek PgfParseError, token_ptr) c_err + token_len <- (#peek PgfParseError, token_len) c_err + tok <- peekUtf8CStringLen token_ptr token_len + gu_pool_free parsePl + gu_pool_free exprPl + return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) + else do gu_pool_free parsePl + gu_pool_free exprPl + return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn @@ -528,7 +544,7 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) - return (Right exprs) + return (ParseOk exprs) mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do @@ -595,7 +611,7 @@ parseWithOracle :: Concr -- ^ the language with which we parse -> Cat -- ^ the start category -> String -- ^ the input sentence -> Oracle - -> Either String [(Expr,Float)] + -> ParseOutput parseWithOracle lang cat sent (predict,complete,literal) = unsafePerformIO $ do parsePl <- gu_new_pool @@ -612,11 +628,19 @@ parseWithOracle lang cat sent (predict,complete,literal) = if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error - then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekUtf8CString c_tok - gu_pool_free parsePl - gu_pool_free exprPl - return (Left tok) + then do c_err <- (#peek GuExn, data.data) exn + c_incomplete <- (#peek PgfParseError, incomplete) c_err + if (c_incomplete :: CInt) == 0 + then do c_offset <- (#peek PgfParseError, offset) c_err + token_ptr <- (#peek PgfParseError, token_ptr) c_err + token_len <- (#peek PgfParseError, token_len) c_err + tok <- peekUtf8CStringLen token_ptr token_len + gu_pool_free parsePl + gu_pool_free exprPl + return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) + else do gu_pool_free parsePl + gu_pool_free exprPl + return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn @@ -630,7 +654,7 @@ parseWithOracle lang cat sent (predict,complete,literal) = else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) - return (Right exprs) + return (ParseOk exprs) where oracleWrapper oracle catPtr lblPtr offset = do cat <- peekUtf8CString catPtr diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs index a47655d8d..3f30631d8 100644 --- a/src/runtime/haskell-bind/PGF2/FFI.hs +++ b/src/runtime/haskell-bind/PGF2/FFI.hs @@ -2,7 +2,7 @@ module PGF2.FFI where -import Foreign ( alloca, poke ) +import Foreign ( alloca, peek, poke ) import Foreign.C import Foreign.Ptr import Foreign.ForeignPtr @@ -116,6 +116,19 @@ peekUtf8CString ptr = else do cs <- decode pptr return (((toEnum . fromEnum) x) : cs) +peekUtf8CStringLen :: CString -> CInt -> IO String +peekUtf8CStringLen ptr len = + alloca $ \pptr -> + poke pptr ptr >> decode pptr (ptr `plusPtr` fromIntegral len) + where + decode pptr end = do + ptr <- peek pptr + if ptr >= end + then return [] + else do x <- gu_utf8_decode pptr + cs <- decode pptr end + return (((toEnum . fromEnum) x) : cs) + newUtf8CString :: String -> Ptr GuPool -> IO CString newUtf8CString s pool = do -- An UTF8 character takes up to 6 bytes. We allocate enough diff --git a/src/runtime/haskell-bind/examples/pgf-shell.hs b/src/runtime/haskell-bind/examples/pgf-shell.hs index 722770822..05c991691 100644 --- a/src/runtime/haskell-bind/examples/pgf-shell.hs +++ b/src/runtime/haskell-bind/examples/pgf-shell.hs @@ -37,18 +37,18 @@ execute cmd = P lang s -> do pgf <- gets fst c <- getConcr' pgf lang case parse c (startCat pgf) s of - Left tok -> do put (pgf,[]) - putln ("Parse error: "++tok) - Right ts -> do put (pgf,map show ts) - pop + ParseFailed _ tok -> do put (pgf,[]) + putln ("Parse error: "++tok) + ParseOk ts -> do put (pgf,map show ts) + pop T from to s -> do pgf <- gets fst cfrom <- getConcr' pgf from cto <- getConcr' pgf to case parse cfrom (startCat pgf) s of - Left tok -> do put (pgf,[]) - putln ("Parse error: "++tok) - Right ts -> do put (pgf,map (linearize cto.fst) ts) - pop + ParseFailed _ tok -> do put (pgf,[]) + putln ("Parse error: "++tok) + ParseOk ts -> do put (pgf,map (linearize cto.fst) ts) + pop I path -> do pgf <- liftIO (readPGF path) putln . unwords . M.keys $ languages pgf put (pgf,[]) diff --git a/src/runtime/java/jni_utils.c b/src/runtime/java/jni_utils.c index 59c4a7e54..f897b8372 100644 --- a/src/runtime/java/jni_utils.c +++ b/src/runtime/java/jni_utils.c @@ -34,10 +34,8 @@ gu2j_string(JNIEnv *env, GuString s) { } JPGF_INTERNAL jstring -gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { - const char* s = gu_string_buf_data(sbuf); +gu2j_string_len(JNIEnv *env, const char* s, size_t len) { const char* utf8 = s; - size_t len = gu_string_buf_length(sbuf); jchar* utf16 = alloca(len*sizeof(jchar)); jchar* dst = utf16; @@ -56,6 +54,11 @@ gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { return (*env)->NewString(env, utf16, dst-utf16); } +JPGF_INTERNAL jstring +gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { + return gu2j_string_len(env, gu_string_buf_data(sbuf), gu_string_buf_length(sbuf)); +} + JPGF_INTERNAL GuString j2gu_string(JNIEnv *env, jstring s, GuPool* pool) { GuString str = (*env)->GetStringUTFChars(env, s, 0); diff --git a/src/runtime/java/jni_utils.h b/src/runtime/java/jni_utils.h index f2d050092..e45f5b7bd 100644 --- a/src/runtime/java/jni_utils.h +++ b/src/runtime/java/jni_utils.h @@ -20,6 +20,9 @@ JPGF_INTERNAL_DECL jstring gu2j_string(JNIEnv *env, GuString s); +JPGF_INTERNAL_DECL jstring +gu2j_string_len(JNIEnv *env, const char* s, size_t len); + JPGF_INTERNAL_DECL jstring gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf); diff --git a/src/runtime/java/jpgf.c b/src/runtime/java/jpgf.c index db662f5c2..9f5a1a417 100644 --- a/src/runtime/java/jpgf.c +++ b/src/runtime/java/jpgf.c @@ -591,6 +591,30 @@ JNIEXPORT void JNICALL Java_org_grammaticalframework_pgf_Parser_addLiteralCallba j2gu_string(env, jcat, pool), &callback->callback); } +static void +throw_parse_error(JNIEnv *env, PgfParseError* err) +{ + jstring jtoken; + if (err->incomplete) + jtoken = NULL; + else { + jtoken = gu2j_string_len(env, err->token_ptr, err->token_len); + if (!jtoken) + return; + } + + jclass exception_class = (*env)->FindClass(env, "org/grammaticalframework/pgf/ParseError"); + if (!exception_class) + return; + jmethodID constrId = (*env)->GetMethodID(env, exception_class, "", "(Ljava/lang/String;IZ)V"); + if (!constrId) + return; + jobject exception = (*env)->NewObject(env, exception_class, constrId, jtoken, err->offset, err->incomplete); + if (!exception) + return; + (*env)->Throw(env, exception); +} + JNIEXPORT jobject JNICALL Java_org_grammaticalframework_pgf_Parser_parseWithHeuristics (JNIEnv* env, jclass clazz, jobject jconcr, jstring jstartCat, jstring js, jdouble heuristics, jlong callbacksRef, jobject jpool) @@ -615,8 +639,7 @@ Java_org_grammaticalframework_pgf_Parser_parseWithHeuristics GuString msg = (GuString) gu_exn_caught_data(parse_err); throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", msg); } else if (gu_exn_caught(parse_err, PgfParseError)) { - GuString tok = (GuString) gu_exn_caught_data(parse_err); - throw_string_exception(env, "org/grammaticalframework/pgf/ParseError", tok); + throw_parse_error(env, (PgfParseError*) gu_exn_caught_data(parse_err)); } gu_pool_free(out_pool); @@ -656,8 +679,7 @@ Java_org_grammaticalframework_pgf_Completer_complete(JNIEnv* env, jclass clazz, GuString msg = (GuString) gu_exn_caught_data(parse_err); throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", msg); } else if (gu_exn_caught(parse_err, PgfParseError)) { - GuString tok = (GuString) gu_exn_caught_data(parse_err); - throw_string_exception(env, "org/grammaticalframework/pgf/ParseError", tok); + throw_parse_error(env, (PgfParseError*) gu_exn_caught_data(parse_err)); } gu_pool_free(pool); diff --git a/src/runtime/java/org/grammaticalframework/pgf/ParseError.java b/src/runtime/java/org/grammaticalframework/pgf/ParseError.java index 7fd332708..8b3f51ae2 100644 --- a/src/runtime/java/org/grammaticalframework/pgf/ParseError.java +++ b/src/runtime/java/org/grammaticalframework/pgf/ParseError.java @@ -4,11 +4,26 @@ package org.grammaticalframework.pgf; public class ParseError extends Exception { private static final long serialVersionUID = -6086991674218306569L; - public ParseError(String token) { - super(token); + private String token; + private int offset; + private boolean incomplete; + + public ParseError(String token, int offset, boolean incomplete) { + super(incomplete ? "The sentence is incomplete" : "Unexpected token: \""+token+"\""); + this.token = token; + this.offset = offset; + this.incomplete = incomplete; + } + + public String getToken() { + return token; + } + + public int getOffset() { + return offset; } - public String getToken() { - return getMessage(); + public boolean isIncomplete() { + return incomplete; } } diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index 7da62e453..97718103b 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -1545,13 +1545,28 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) GuString msg = (GuString) gu_exn_caught_data(parse_err); PyErr_SetString(PGFError, msg); } else if (gu_exn_caught(parse_err, PgfParseError)) { - GuString tok = (GuString) gu_exn_caught_data(parse_err); - PyObject* py_tok = PyString_FromString(tok); - PyObject_SetAttrString(ParseError, "token", py_tok); - PyErr_Format(ParseError, "Unexpected token: \"%s\"", tok); - Py_DECREF(py_tok); + PgfParseError* err = (PgfParseError*) gu_exn_caught_data(parse_err); + PyObject* py_offset = PyInt_FromLong(err->offset); + if (err->incomplete) { + PyObject_SetAttrString(ParseError, "incomplete", Py_True); + PyObject_SetAttrString(ParseError, "offset", py_offset); + PyErr_Format(ParseError, "The sentence is incomplete"); + } else { + PyObject* py_tok = PyString_FromStringAndSize(err->token_ptr, + err->token_len); + PyObject_SetAttrString(ParseError, "incomplete", Py_False); + PyObject_SetAttrString(ParseError, "offset", py_offset); + PyObject_SetAttrString(ParseError, "token", py_tok); +#if PY_MAJOR_VERSION >= 3 + PyErr_Format(ParseError, "Unexpected token: \"%U\"", py_tok); +#else + PyErr_Format(ParseError, "Unexpected token: \"%s\"", PyString_AsString(py_tok)); +#endif + Py_DECREF(py_tok); + } + Py_DECREF(py_offset); } - + Py_DECREF(pyres); pyres = NULL; } diff --git a/src/server/PGFService.hs b/src/server/PGFService.hs index b1020b4b8..244f2b96c 100644 --- a/src/server/PGFService.hs +++ b/src/server/PGFService.hs @@ -191,10 +191,11 @@ cpgfMain qsem command (t,(pgf,pc)) = -- Without caching parse results: parse' start mlimit ((from,concr),input) = - return $ maybe id take mlimit . drop start # cparse + case C.parseWithHeuristics concr cat input (-1) callbacks of + C.ParseOk ts -> return (Right (maybe id take mlimit (drop start ts))) + C.ParseFailed _ tok -> return (Left tok) + C.ParseIncomplete -> return (Left "") where - --cparse = C.parse concr cat input - cparse = C.parseWithHeuristics concr cat input (-1) callbacks callbacks = maybe [] cb $ lookup (C.abstractName pgf) C.literalCallbacks cb fs = [(cat,f pgf (from,concr) input)|(cat,f)<-fs] {- @@ -277,8 +278,9 @@ cpgfMain qsem command (t,(pgf,pc)) = | isUpper c -> toLower c : cs s -> s - parse1 = either (const Nothing) (fmap fst . listToMaybe) . - C.parse concr cat + parse1 s = case C.parse concr cat s of + C.ParseOk ((t,_):ts) -> Just t + _ -> Nothing morph w = listToMaybe [t | (f,a,p)<-C.lookupMorpho concr w, t<-maybeToList (C.readExpr f)]