-Prelude PGF2> let Right ((e,p):rest) = res +Prelude PGF2> let ParseOk ((e,p):rest) = resdiff --git a/src/compiler/GF/Command/Commands2.hs b/src/compiler/GF/Command/Commands2.hs index c8e6fbff3..995d2d560 100644 --- a/src/compiler/GF/Command/Commands2.hs +++ b/src/compiler/GF/Command/Commands2.hs @@ -636,10 +636,12 @@ pgfCommands = Map.fromList [ cncs = optConcs env opts parsed rs = Piped (Exprs ts,unlines msgs) where - ts = [hsExpr t|Right ts<-rs,(t,p)<-takeOptNum opts ts] - msgs = concatMap (either err ok) rs - err msg = ["Parse failed: "++msg] - ok = map (PGF2.showExpr [] . fst).takeOptNum opts + ts = [hsExpr t|ParseOk ts<-rs,(t,p)<-takeOptNum opts ts] + msgs = concatMap mkMsg rs + + mkMsg (ParseOk ts) = (map (PGF2.showExpr [] . fst).takeOptNum opts) ts + mkMsg (ParseFailed _ tok) = ["Parse failed: "++tok] + mkMsg (ParseIncomplete) = ["The sentence is incomplete"] optLins env opts ts = case opts of _ | isOpt "groups" opts -> diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index ecfb7d2ea..2cae01db6 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -2139,30 +2139,37 @@ pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool) *(PgfExprProb**)to = pgf_parse_result_next(ps); } -static GuString -pgf_parsing_last_token(PgfParsing* ps, GuPool* pool) +static PgfParseError* +pgf_parsing_new_exception(PgfParsing* ps, GuPool* pool) { - if (ps->before == NULL) - return ""; + const uint8_t* p = (uint8_t*) ps->sentence; + const uint8_t* end = p + (ps->before ? ps->before->end_offset : 0); - const uint8_t* start = (uint8_t*) ps->sentence; - const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset; + PgfParseError* err = gu_new(PgfParseError, pool); + err->incomplete= (*end == 0); + err->offset = 0; + err->token_ptr = (char*) p; - const uint8_t* p = start; while (p < end) { if (gu_ucs_is_space(gu_utf8_decode(&p))) { - start = p; + err->token_ptr = (char*) p; } + err->offset++; + } + + if (err->incomplete) { + err->token_ptr = NULL; + err->token_len = 0; + return err; } while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { end = p; } - char* tok = gu_malloc(pool, end-start+1); - memcpy(tok, start, (end-start)); - tok[end-start] = 0; - return tok; + err->token_len = ((char*)end)-err->token_ptr; + + return err; } PGF_API GuEnum* @@ -2204,7 +2211,7 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence, while (gu_buf_length(ps->expr_queue) == 0) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } @@ -2249,7 +2256,7 @@ pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ, while (gu_buf_length(ps->expr_queue) == 0) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } @@ -2312,7 +2319,7 @@ pgf_complete(PgfConcr* concr, PgfType* type, GuString sentence, while (ps->before->end_offset < len) { if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); - exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); return NULL; } diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 632a1d332..e6c1c70b8 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -122,6 +122,13 @@ PGF_API_DECL PgfExprEnum* pgf_generate_all(PgfPGF* pgf, PgfType* ty, GuExn* err, GuPool* pool, GuPool* out_pool); +typedef struct { + int incomplete; // equal to !=0 if the sentence is incomplete, 0 otherwise + size_t offset; + const char* token_ptr; + size_t token_len; +} PgfParseError; + PGF_API_DECL PgfExprEnum* pgf_parse(PgfConcr* concr, PgfType* typ, GuString sentence, GuExn* err, GuPool* pool, GuPool* out_pool); diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index 8fb6e6ccb..0d65822f9 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -63,7 +63,7 @@ module PGF2 (-- * PGF alignWords, -- ** Parsing - parse, parseWithHeuristics, + ParseOutput(..), parse, parseWithHeuristics, -- ** Sentence Lookup lookupSentence, -- ** Generation @@ -480,7 +480,15 @@ getAnalysis ref self c_lemma c_anal prob exn = do anal <- peekUtf8CString c_anal writeIORef ref ((lemma, anal, prob):ans) -parse :: Concr -> Type -> String -> Either String [(Expr,Float)] +-- | This data type encodes the different outcomes which you could get from the parser. +data ParseOutput + = ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed. + -- The string is the token where the parser have failed. + | ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees. + -- The list should be non-empty. + | ParseIncomplete -- ^ The sentence is not complete. + +parse :: Concr -> Type -> String -> ParseOutput parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] parseWithHeuristics :: Concr -- ^ the language with which we parse @@ -497,7 +505,7 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) - -> Either String [(Expr,Float)] + -> ParseOutput parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = unsafePerformIO $ do exprPl <- gu_new_pool @@ -510,11 +518,19 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error - then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekUtf8CString c_tok - gu_pool_free parsePl - gu_pool_free exprPl - return (Left tok) + then do c_err <- (#peek GuExn, data.data) exn + c_incomplete <- (#peek PgfParseError, incomplete) c_err + if (c_incomplete :: CInt) == 0 + then do c_offset <- (#peek PgfParseError, offset) c_err + token_ptr <- (#peek PgfParseError, token_ptr) c_err + token_len <- (#peek PgfParseError, token_len) c_err + tok <- peekUtf8CStringLen token_ptr token_len + gu_pool_free parsePl + gu_pool_free exprPl + return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) + else do gu_pool_free parsePl + gu_pool_free exprPl + return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn @@ -528,7 +544,7 @@ parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) - return (Right exprs) + return (ParseOk exprs) mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do @@ -595,7 +611,7 @@ parseWithOracle :: Concr -- ^ the language with which we parse -> Cat -- ^ the start category -> String -- ^ the input sentence -> Oracle - -> Either String [(Expr,Float)] + -> ParseOutput parseWithOracle lang cat sent (predict,complete,literal) = unsafePerformIO $ do parsePl <- gu_new_pool @@ -612,11 +628,19 @@ parseWithOracle lang cat sent (predict,complete,literal) = if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error - then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekUtf8CString c_tok - gu_pool_free parsePl - gu_pool_free exprPl - return (Left tok) + then do c_err <- (#peek GuExn, data.data) exn + c_incomplete <- (#peek PgfParseError, incomplete) c_err + if (c_incomplete :: CInt) == 0 + then do c_offset <- (#peek PgfParseError, offset) c_err + token_ptr <- (#peek PgfParseError, token_ptr) c_err + token_len <- (#peek PgfParseError, token_len) c_err + tok <- peekUtf8CStringLen token_ptr token_len + gu_pool_free parsePl + gu_pool_free exprPl + return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) + else do gu_pool_free parsePl + gu_pool_free exprPl + return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn @@ -630,7 +654,7 @@ parseWithOracle lang cat sent (predict,complete,literal) = else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) - return (Right exprs) + return (ParseOk exprs) where oracleWrapper oracle catPtr lblPtr offset = do cat <- peekUtf8CString catPtr diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs index a47655d8d..3f30631d8 100644 --- a/src/runtime/haskell-bind/PGF2/FFI.hs +++ b/src/runtime/haskell-bind/PGF2/FFI.hs @@ -2,7 +2,7 @@ module PGF2.FFI where -import Foreign ( alloca, poke ) +import Foreign ( alloca, peek, poke ) import Foreign.C import Foreign.Ptr import Foreign.ForeignPtr @@ -116,6 +116,19 @@ peekUtf8CString ptr = else do cs <- decode pptr return (((toEnum . fromEnum) x) : cs) +peekUtf8CStringLen :: CString -> CInt -> IO String +peekUtf8CStringLen ptr len = + alloca $ \pptr -> + poke pptr ptr >> decode pptr (ptr `plusPtr` fromIntegral len) + where + decode pptr end = do + ptr <- peek pptr + if ptr >= end + then return [] + else do x <- gu_utf8_decode pptr + cs <- decode pptr end + return (((toEnum . fromEnum) x) : cs) + newUtf8CString :: String -> Ptr GuPool -> IO CString newUtf8CString s pool = do -- An UTF8 character takes up to 6 bytes. We allocate enough diff --git a/src/runtime/haskell-bind/examples/pgf-shell.hs b/src/runtime/haskell-bind/examples/pgf-shell.hs index 722770822..05c991691 100644 --- a/src/runtime/haskell-bind/examples/pgf-shell.hs +++ b/src/runtime/haskell-bind/examples/pgf-shell.hs @@ -37,18 +37,18 @@ execute cmd = P lang s -> do pgf <- gets fst c <- getConcr' pgf lang case parse c (startCat pgf) s of - Left tok -> do put (pgf,[]) - putln ("Parse error: "++tok) - Right ts -> do put (pgf,map show ts) - pop + ParseFailed _ tok -> do put (pgf,[]) + putln ("Parse error: "++tok) + ParseOk ts -> do put (pgf,map show ts) + pop T from to s -> do pgf <- gets fst cfrom <- getConcr' pgf from cto <- getConcr' pgf to case parse cfrom (startCat pgf) s of - Left tok -> do put (pgf,[]) - putln ("Parse error: "++tok) - Right ts -> do put (pgf,map (linearize cto.fst) ts) - pop + ParseFailed _ tok -> do put (pgf,[]) + putln ("Parse error: "++tok) + ParseOk ts -> do put (pgf,map (linearize cto.fst) ts) + pop I path -> do pgf <- liftIO (readPGF path) putln . unwords . M.keys $ languages pgf put (pgf,[]) diff --git a/src/runtime/java/jni_utils.c b/src/runtime/java/jni_utils.c index 59c4a7e54..f897b8372 100644 --- a/src/runtime/java/jni_utils.c +++ b/src/runtime/java/jni_utils.c @@ -34,10 +34,8 @@ gu2j_string(JNIEnv *env, GuString s) { } JPGF_INTERNAL jstring -gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { - const char* s = gu_string_buf_data(sbuf); +gu2j_string_len(JNIEnv *env, const char* s, size_t len) { const char* utf8 = s; - size_t len = gu_string_buf_length(sbuf); jchar* utf16 = alloca(len*sizeof(jchar)); jchar* dst = utf16; @@ -56,6 +54,11 @@ gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { return (*env)->NewString(env, utf16, dst-utf16); } +JPGF_INTERNAL jstring +gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf) { + return gu2j_string_len(env, gu_string_buf_data(sbuf), gu_string_buf_length(sbuf)); +} + JPGF_INTERNAL GuString j2gu_string(JNIEnv *env, jstring s, GuPool* pool) { GuString str = (*env)->GetStringUTFChars(env, s, 0); diff --git a/src/runtime/java/jni_utils.h b/src/runtime/java/jni_utils.h index f2d050092..e45f5b7bd 100644 --- a/src/runtime/java/jni_utils.h +++ b/src/runtime/java/jni_utils.h @@ -20,6 +20,9 @@ JPGF_INTERNAL_DECL jstring gu2j_string(JNIEnv *env, GuString s); +JPGF_INTERNAL_DECL jstring +gu2j_string_len(JNIEnv *env, const char* s, size_t len); + JPGF_INTERNAL_DECL jstring gu2j_string_buf(JNIEnv *env, GuStringBuf* sbuf); diff --git a/src/runtime/java/jpgf.c b/src/runtime/java/jpgf.c index db662f5c2..9f5a1a417 100644 --- a/src/runtime/java/jpgf.c +++ b/src/runtime/java/jpgf.c @@ -591,6 +591,30 @@ JNIEXPORT void JNICALL Java_org_grammaticalframework_pgf_Parser_addLiteralCallba j2gu_string(env, jcat, pool), &callback->callback); } +static void +throw_parse_error(JNIEnv *env, PgfParseError* err) +{ + jstring jtoken; + if (err->incomplete) + jtoken = NULL; + else { + jtoken = gu2j_string_len(env, err->token_ptr, err->token_len); + if (!jtoken) + return; + } + + jclass exception_class = (*env)->FindClass(env, "org/grammaticalframework/pgf/ParseError"); + if (!exception_class) + return; + jmethodID constrId = (*env)->GetMethodID(env, exception_class, "