diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 7b564c069..ea5228bc1 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -2230,6 +2230,101 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence, return &ps->en; } +PGF_API PgfParsing* +pgf_parse_to_chart(PgfConcr* concr, PgfType* typ, GuString sentence, + double heuristics, + PgfCallbacksMap* callbacks, + size_t n_roots, + GuExn* err, + GuPool* pool, GuPool* out_pool) +{ + if (concr->sequences == NULL || + concr->cnccats == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + // Begin parsing a sentence with the specified category + PgfParsing* ps = + pgf_parsing_init(concr, typ->cid, sentence, heuristics, callbacks, NULL, err, pool, out_pool); + if (ps == NULL) { + return NULL; + } + +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif + + while (gu_buf_length(ps->expr_queue) < n_roots) { + if (!pgf_parsing_proceed(ps)) { + break; + } + +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif + } + + return ps; +} + +PGF_API PgfCCats* +pgf_get_parse_roots(PgfParsing* ps, GuPool* pool) +{ + size_t n_cats = 0; + size_t n_states = gu_buf_length(ps->expr_queue); + GuSeq* roots = gu_new_seq(PgfCCat*, n_states, pool); + for (size_t i = 0; i < n_states; i++) { + PgfCCat* ccat = gu_buf_get(ps->expr_queue, PgfExprState*, i)->answers->ccat; + + bool found = false; + for (size_t j = 0; j < n_cats; j++) { + if (gu_seq_get(roots, PgfCCat*, j) == ccat) { + found = true; + break; + } + } + + if (!found) { + gu_seq_set(roots, PgfCCat*, n_cats, ccat); + n_cats++; + } + } + roots->len = n_cats; + return roots; +} + +PGF_API GuSeq* +pgf_ccat_to_range(PgfParsing* ps, PgfCCat* ccat, GuPool* pool) +{ + PgfItemConts* conts = ccat->conts; + PgfParseState* state = ps->before; + GuBuf* buf = gu_new_buf(PgfParseRange, pool); + + while (conts != NULL) { + PgfParseRange* range = gu_buf_extend(buf); + range->start = conts->state->end_offset; + range->end = conts->state->end_offset; + range->field = conts->ccat->cnccat->labels[conts->lin_idx]; + + while (state != NULL) { + if (pgf_parsing_get_completed(state, conts) == ccat) { + if (state->start_offset >= range->start) + range->end = state->start_offset; + break; + } + state = state->next; + } + + conts = conts->ccat->conts; + } + + return gu_buf_data_seq(buf); +} + PGF_API PgfExprEnum* pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ, GuString sentence, diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 8fdc52b62..b40284a42 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -208,6 +208,12 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuExn* err, GuPool* pool, GuPool* out_pool); +typedef struct { + size_t start; + size_t end; + GuString field; +} PgfParseRange; + typedef struct PgfOracleCallback PgfOracleCallback; struct PgfOracleCallback { diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index a41c915f1..9ef325343 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -65,6 +66,7 @@ module PGF2 (-- * PGF alignWords, -- ** Parsing ParseOutput(..), parse, parseWithHeuristics, + parseToChart, PArg(..), -- ** Sentence Lookup lookupSentence, -- ** Generation @@ -86,6 +88,7 @@ import Prelude hiding (fromEnum,(<>)) -- GHC 8.4.1 clash with Text.PrettyPrint import Control.Exception(Exception,throwIO) import Control.Monad(forM_) import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO) +import System.IO(fixIO) import Text.PrettyPrint import PGF2.Expr import PGF2.Type @@ -99,7 +102,7 @@ import Data.IORef import Data.Char(isUpper,isSpace) import Data.List(isSuffixOf,maximumBy,nub) import Data.Function(on) - +import Data.Maybe(maybe) ----------------------------------------------------------------------- -- Functions that take a PGF. @@ -569,14 +572,14 @@ getAnalysis ref self c_lemma c_anal prob exn = do writeIORef ref ((lemma, anal, prob):ans) -- | This data type encodes the different outcomes which you could get from the parser. -data ParseOutput +data ParseOutput a = ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed. -- The string is the token where the parser have failed. - | ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees. - -- The list should be non-empty. + | ParseOk a -- ^ If the parsing and the type checking are successful + -- we get the abstract syntax trees as either a list or a chart. | ParseIncomplete -- ^ The sentence is not complete. -parse :: Concr -> Type -> String -> ParseOutput +parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)] parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] parseWithHeuristics :: Concr -- ^ the language with which we parse @@ -593,7 +596,7 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) - -> ParseOutput + -> ParseOutput [(Expr,Float)] parseWithHeuristics lang (Type ctype touchType) sent heuristic callbacks = unsafePerformIO $ do exprPl <- gu_new_pool @@ -635,6 +638,129 @@ parseWithHeuristics lang (Type ctype touchType) sent heuristic callbacks = exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return (ParseOk exprs) +parseToChart :: Concr -- ^ the language with which we parse + -> Type -- ^ the start category + -> String -- ^ the input sentence + -> Double -- ^ the heuristic factor. + -- A negative value tells the parser + -- to lookup up the default from + -- the grammar flags + -> [(Cat, Int -> Int -> Maybe (Expr,Float,Int))] + -- ^ a list of callbacks for literal categories. + -- The arguments of the callback are: + -- the index of the constituent for the literal category; + -- the input sentence; the current offset in the sentence. + -- If a literal has been recognized then the output should + -- be Just (expr,probability,end_offset) + -> Int -- ^ the maximal number of roots + -> ParseOutput ([FId],Map.Map FId ([(Int,Int,String)],[(Expr,[PArg],Float)])) +parseToChart lang (Type ctype touchType) sent heuristic callbacks roots = + unsafePerformIO $ + withGuPool $ \parsePl -> do + do exn <- gu_new_exn parsePl + sent <- newUtf8CString sent parsePl + callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl + ps <- pgf_parse_to_chart (concr lang) ctype sent heuristic callbacks_map (fromIntegral roots) exn parsePl parsePl + touchType + failed <- gu_exn_is_raised exn + if failed + then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError + if is_parse_error + then do c_err <- (#peek GuExn, data.data) exn + c_incomplete <- (#peek PgfParseError, incomplete) c_err + if (c_incomplete :: CInt) == 0 + then do c_offset <- (#peek PgfParseError, offset) c_err + token_ptr <- (#peek PgfParseError, token_ptr) c_err + token_len <- (#peek PgfParseError, token_len) c_err + tok <- peekUtf8CStringLen token_ptr token_len + touchConcr lang + return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) + else do touchConcr lang + return ParseIncomplete + else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn + if is_exn + then do c_msg <- (#peek GuExn, data.data) exn + msg <- peekUtf8CString c_msg + touchConcr lang + throwIO (PGFError msg) + else do touchConcr lang + throwIO (PGFError "Parsing failed") + else do c_roots <- pgf_get_parse_roots ps parsePl + let get_range c_ccat = pgf_ccat_to_range ps c_ccat parsePl + c_len <- (#peek GuSeq, len) c_roots + chart <- peekCCats get_range Map.empty (c_len :: CSizeT) (c_roots `plusPtr` (#offset GuSeq, data)) + touchConcr lang + return (ParseOk chart) + where + peekCCats get_range chart 0 ptr = return ([],chart) + peekCCats get_range chart len ptr = do + (root, chart) <- deRef (peekCCat get_range chart) ptr + (roots,chart) <- peekCCats get_range chart (len-1) (ptr `plusPtr` (#size PgfCCat*)) + return (root:roots,chart) + + peekCCat get_range chart c_ccat = do + fid <- peekFId c_ccat + c_total_cats <- (#peek PgfConcr, total_cats) (concr lang) + if Map.member fid chart || fid < c_total_cats + then return (fid,chart) + else do range <- get_range c_ccat >>= peekSequence peekRange (#size PgfParseRange) + c_prods <- (#peek PgfCCat, prods) c_ccat + if c_prods == nullPtr + then do return (fid,Map.insert fid (range,[]) chart) + else do c_len <- (#peek PgfCCat, n_synprods) c_ccat + (prods,chart) <- fixIO (\res -> peekProductions (Map.insert fid (range,fst res) chart) + (fromIntegral (c_len :: CSizeT)) + (c_prods `plusPtr` (#offset GuSeq, data))) + return (fid,chart) + where + peekProductions chart 0 ptr = return ([],chart) + peekProductions chart len ptr = do + (ps1, chart) <- deRef (peekProduction chart) ptr + (ps2,chart) <- peekProductions chart (len-1) (ptr `plusPtr` (#size GuVariant)) + return (ps1++ps2,chart) + + peekProduction chart p = do + tag <- gu_variant_tag p + dt <- gu_variant_data p + case tag of + (#const PGF_PRODUCTION_APPLY) -> do { c_cncfun <- (#peek PgfProductionApply, fun) dt ; + c_absfun <- (#peek PgfCncFun, absfun) c_cncfun ; + expr <- (#peek PgfAbsFun, ep.expr) c_absfun ; + p <- (#peek PgfAbsFun, ep.prob) c_absfun ; + c_args <- (#peek PgfProductionApply, args) dt ; + c_len <- (#peek GuSeq, len) c_args ; + (pargs,chart) <- peekPArgs chart (c_len :: CSizeT) (c_args `plusPtr` (#offset GuSeq, data)) ; + return ([(Expr expr (touchConcr lang), pargs, p)],chart) } + (#const PGF_PRODUCTION_COERCE) -> do { c_coerce <- (#peek PgfProductionCoerce, coerce) dt ; + (fid,chart) <- peekCCat get_range chart c_coerce ; + return (maybe [] snd (Map.lookup fid chart),chart) } + (#const PGF_PRODUCTION_EXTERN) -> do { c_ep <- (#peek PgfProductionExtern, ep) dt ; + expr <- (#peek PgfExprProb, expr) c_ep ; + p <- (#peek PgfExprProb, prob) c_ep ; + return ([(Expr expr (touchConcr lang), [], p)],chart) } + _ -> error ("Unknown production type "++show tag++" in the grammar") + + peekPArgs chart 0 ptr = return ([],chart) + peekPArgs chart len ptr = do + (a, chart) <- peekPArg chart ptr + (as,chart) <- peekPArgs chart (len-1) (ptr `plusPtr` (#size PgfPArg)) + return (a:as,chart) + + peekPArg chart ptr = do + c_hypos <- (#peek PgfPArg, hypos) ptr + hypos <- if c_hypos /= nullPtr + then peekSequence (deRef peekFId) (#size int) c_hypos + else return [] + c_ccat <- (#peek PgfPArg, ccat) ptr + (fid,chart) <- peekCCat get_range chart c_ccat + return (PArg hypos fid,chart) + + peekRange ptr = do + s <- (#peek PgfParseRange, start) ptr + e <- (#peek PgfParseRange, end) ptr + f <- (#peek PgfParseRange, field) ptr >>= peekCString + return ((fromIntegral :: CSizeT -> Int) s, (fromIntegral :: CSizeT -> Int) e, f) + mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do callbacks_map <- pgf_new_callbacks_map concr pool @@ -700,7 +826,7 @@ parseWithOracle :: Concr -- ^ the language with which we parse -> Cat -- ^ the start category -> String -- ^ the input sentence -> Oracle - -> ParseOutput + -> ParseOutput [(Expr,Float)] parseWithOracle lang cat sent (predict,complete,literal) = unsafePerformIO $ do parsePl <- gu_new_pool @@ -906,7 +1032,6 @@ tabularLinearizeAll lang e = unsafePerformIO $ throwIO (PGFError msg) else do throwIO (PGFError "The abstract tree cannot be linearized") -type FId = Int type LIndex = Int -- | BracketedString represents a sentence that is linearized diff --git a/src/runtime/haskell-bind/PGF2/FFI.hsc b/src/runtime/haskell-bind/PGF2/FFI.hsc index 713adcecc..673c5c877 100644 --- a/src/runtime/haskell-bind/PGF2/FFI.hsc +++ b/src/runtime/haskell-bind/PGF2/FFI.hsc @@ -6,6 +6,7 @@ module PGF2.FFI where #include #include #include +#include import Foreign ( alloca, peek, poke, peekByteOff ) import Foreign.C @@ -237,6 +238,16 @@ newSequence elem_size pokeElem values pool = do pokeElem ptr x pokeElems (ptr `plusPtr` (fromIntegral elem_size)) xs +type FId = Int +data PArg = PArg [FId] {-# UNPACK #-} !FId deriving (Eq,Ord,Show) + +peekFId :: Ptr a -> IO FId +peekFId c_ccat = do + c_fid <- (#peek PgfCCat, fid) c_ccat + return (fromIntegral (c_fid :: CInt)) + +deRef peekValue ptr = peek ptr >>= peekValue + ------------------------------------------------------------------ -- libpgf API @@ -261,6 +272,7 @@ data PgfAbsCat data PgfCCat data PgfCncFun data PgfProductionApply +data PgfParsing foreign import ccall "pgf/pgf.h pgf_read" pgf_read :: CString -> Ptr GuPool -> Ptr GuExn -> IO (Ptr PgfPGF) @@ -361,6 +373,15 @@ foreign import ccall "wrapper" foreign import ccall "pgf/pgf.h pgf_align_words" pgf_align_words :: Ptr PgfConcr -> PgfExpr -> Ptr GuExn -> Ptr GuPool -> IO (Ptr GuSeq) +foreign import ccall "pgf/pgf.h pgf_parse_to_chart" + pgf_parse_to_chart :: Ptr PgfConcr -> PgfType -> CString -> Double -> Ptr PgfCallbacksMap -> CSizeT -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr PgfParsing) + +foreign import ccall "pgf/pgf.h pgf_get_parse_roots" + pgf_get_parse_roots :: Ptr PgfParsing -> Ptr GuPool -> IO (Ptr GuSeq) + +foreign import ccall "pgf/pgf.h pgf_ccat_to_range" + pgf_ccat_to_range :: Ptr PgfParsing -> Ptr PgfCCat -> Ptr GuPool -> IO (Ptr GuSeq) + foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics" pgf_parse_with_heuristics :: Ptr PgfConcr -> PgfType -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum) diff --git a/src/runtime/haskell-bind/PGF2/Internal.hsc b/src/runtime/haskell-bind/PGF2/Internal.hsc index 3cb4199d0..ed894a361 100644 --- a/src/runtime/haskell-bind/PGF2/Internal.hsc +++ b/src/runtime/haskell-bind/PGF2/Internal.hsc @@ -53,7 +53,6 @@ data Production = PApply {-# UNPACK #-} !FunId [PArg] | PCoerce {-# UNPACK #-} !FId deriving (Eq,Ord,Show) -data PArg = PArg [FId] {-# UNPACK #-} !FId deriving (Eq,Ord,Show) type FunId = Int type SeqId = Int data Literal = @@ -186,10 +185,6 @@ concrProductions c fid = unsafePerformIO $ do fid <- peekFId c_ccat return (PArg hypos fid) -peekFId c_ccat = do - c_fid <- (#peek PgfCCat, fid) c_ccat - return (fromIntegral (c_fid :: CInt)) - concrTotalFuns :: Concr -> FunId concrTotalFuns c = unsafePerformIO $ do c_cncfuns <- (#peek PgfConcr, cncfuns) (concr c) @@ -271,8 +266,6 @@ concrSequence c seqid = unsafePerformIO $ do forms <- peekForms (len-1) (ptr `plusPtr` (#size PgfAlternative)) return ((form,prefixes):forms) -deRef peekValue ptr = peek ptr >>= peekValue - fidString, fidInt, fidFloat, fidVar, fidStart :: FId fidString = (-1) fidInt = (-2)