mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
a version of the parser which returns a chart rather than a list of expressions
This commit is contained in:
@@ -2230,6 +2230,101 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence,
|
||||
return &ps->en;
|
||||
}
|
||||
|
||||
PGF_API PgfParsing*
|
||||
pgf_parse_to_chart(PgfConcr* concr, PgfType* typ, GuString sentence,
|
||||
double heuristics,
|
||||
PgfCallbacksMap* callbacks,
|
||||
size_t n_roots,
|
||||
GuExn* err,
|
||||
GuPool* pool, GuPool* out_pool)
|
||||
{
|
||||
if (concr->sequences == NULL ||
|
||||
concr->cnccats == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Begin parsing a sentence with the specified category
|
||||
PgfParsing* ps =
|
||||
pgf_parsing_init(concr, typ->cid, sentence, heuristics, callbacks, NULL, err, pool, out_pool);
|
||||
if (ps == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef PGF_COUNTS_DEBUG
|
||||
pgf_parsing_print_counts(ps);
|
||||
#endif
|
||||
|
||||
while (gu_buf_length(ps->expr_queue) < n_roots) {
|
||||
if (!pgf_parsing_proceed(ps)) {
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef PGF_COUNTS_DEBUG
|
||||
pgf_parsing_print_counts(ps);
|
||||
#endif
|
||||
}
|
||||
|
||||
return ps;
|
||||
}
|
||||
|
||||
PGF_API PgfCCats*
|
||||
pgf_get_parse_roots(PgfParsing* ps, GuPool* pool)
|
||||
{
|
||||
size_t n_cats = 0;
|
||||
size_t n_states = gu_buf_length(ps->expr_queue);
|
||||
GuSeq* roots = gu_new_seq(PgfCCat*, n_states, pool);
|
||||
for (size_t i = 0; i < n_states; i++) {
|
||||
PgfCCat* ccat = gu_buf_get(ps->expr_queue, PgfExprState*, i)->answers->ccat;
|
||||
|
||||
bool found = false;
|
||||
for (size_t j = 0; j < n_cats; j++) {
|
||||
if (gu_seq_get(roots, PgfCCat*, j) == ccat) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
gu_seq_set(roots, PgfCCat*, n_cats, ccat);
|
||||
n_cats++;
|
||||
}
|
||||
}
|
||||
roots->len = n_cats;
|
||||
return roots;
|
||||
}
|
||||
|
||||
PGF_API GuSeq*
|
||||
pgf_ccat_to_range(PgfParsing* ps, PgfCCat* ccat, GuPool* pool)
|
||||
{
|
||||
PgfItemConts* conts = ccat->conts;
|
||||
PgfParseState* state = ps->before;
|
||||
GuBuf* buf = gu_new_buf(PgfParseRange, pool);
|
||||
|
||||
while (conts != NULL) {
|
||||
PgfParseRange* range = gu_buf_extend(buf);
|
||||
range->start = conts->state->end_offset;
|
||||
range->end = conts->state->end_offset;
|
||||
range->field = conts->ccat->cnccat->labels[conts->lin_idx];
|
||||
|
||||
while (state != NULL) {
|
||||
if (pgf_parsing_get_completed(state, conts) == ccat) {
|
||||
if (state->start_offset >= range->start)
|
||||
range->end = state->start_offset;
|
||||
break;
|
||||
}
|
||||
state = state->next;
|
||||
}
|
||||
|
||||
conts = conts->ccat->conts;
|
||||
}
|
||||
|
||||
return gu_buf_data_seq(buf);
|
||||
}
|
||||
|
||||
PGF_API PgfExprEnum*
|
||||
pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ,
|
||||
GuString sentence,
|
||||
|
||||
@@ -208,6 +208,12 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ,
|
||||
GuExn* err,
|
||||
GuPool* pool, GuPool* out_pool);
|
||||
|
||||
typedef struct {
|
||||
size_t start;
|
||||
size_t end;
|
||||
GuString field;
|
||||
} PgfParseRange;
|
||||
|
||||
typedef struct PgfOracleCallback PgfOracleCallback;
|
||||
|
||||
struct PgfOracleCallback {
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include <pgf/pgf.h>
|
||||
#include <pgf/linearizer.h>
|
||||
#include <pgf/data.h>
|
||||
#include <gu/enum.h>
|
||||
#include <gu/exn.h>
|
||||
|
||||
@@ -65,6 +66,7 @@ module PGF2 (-- * PGF
|
||||
alignWords,
|
||||
-- ** Parsing
|
||||
ParseOutput(..), parse, parseWithHeuristics,
|
||||
parseToChart, PArg(..),
|
||||
-- ** Sentence Lookup
|
||||
lookupSentence,
|
||||
-- ** Generation
|
||||
@@ -86,6 +88,7 @@ import Prelude hiding (fromEnum,(<>)) -- GHC 8.4.1 clash with Text.PrettyPrint
|
||||
import Control.Exception(Exception,throwIO)
|
||||
import Control.Monad(forM_)
|
||||
import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO)
|
||||
import System.IO(fixIO)
|
||||
import Text.PrettyPrint
|
||||
import PGF2.Expr
|
||||
import PGF2.Type
|
||||
@@ -99,7 +102,7 @@ import Data.IORef
|
||||
import Data.Char(isUpper,isSpace)
|
||||
import Data.List(isSuffixOf,maximumBy,nub)
|
||||
import Data.Function(on)
|
||||
|
||||
import Data.Maybe(maybe)
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
-- Functions that take a PGF.
|
||||
@@ -569,14 +572,14 @@ getAnalysis ref self c_lemma c_anal prob exn = do
|
||||
writeIORef ref ((lemma, anal, prob):ans)
|
||||
|
||||
-- | This data type encodes the different outcomes which you could get from the parser.
|
||||
data ParseOutput
|
||||
data ParseOutput a
|
||||
= ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed.
|
||||
-- The string is the token where the parser have failed.
|
||||
| ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees.
|
||||
-- The list should be non-empty.
|
||||
| ParseOk a -- ^ If the parsing and the type checking are successful
|
||||
-- we get the abstract syntax trees as either a list or a chart.
|
||||
| ParseIncomplete -- ^ The sentence is not complete.
|
||||
|
||||
parse :: Concr -> Type -> String -> ParseOutput
|
||||
parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)]
|
||||
parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) []
|
||||
|
||||
parseWithHeuristics :: Concr -- ^ the language with which we parse
|
||||
@@ -593,7 +596,7 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse
|
||||
-- the input sentence; the current offset in the sentence.
|
||||
-- If a literal has been recognized then the output should
|
||||
-- be Just (expr,probability,end_offset)
|
||||
-> ParseOutput
|
||||
-> ParseOutput [(Expr,Float)]
|
||||
parseWithHeuristics lang (Type ctype touchType) sent heuristic callbacks =
|
||||
unsafePerformIO $
|
||||
do exprPl <- gu_new_pool
|
||||
@@ -635,6 +638,129 @@ parseWithHeuristics lang (Type ctype touchType) sent heuristic callbacks =
|
||||
exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl)
|
||||
return (ParseOk exprs)
|
||||
|
||||
parseToChart :: Concr -- ^ the language with which we parse
|
||||
-> Type -- ^ the start category
|
||||
-> String -- ^ the input sentence
|
||||
-> Double -- ^ the heuristic factor.
|
||||
-- A negative value tells the parser
|
||||
-- to lookup up the default from
|
||||
-- the grammar flags
|
||||
-> [(Cat, Int -> Int -> Maybe (Expr,Float,Int))]
|
||||
-- ^ a list of callbacks for literal categories.
|
||||
-- The arguments of the callback are:
|
||||
-- the index of the constituent for the literal category;
|
||||
-- the input sentence; the current offset in the sentence.
|
||||
-- If a literal has been recognized then the output should
|
||||
-- be Just (expr,probability,end_offset)
|
||||
-> Int -- ^ the maximal number of roots
|
||||
-> ParseOutput ([FId],Map.Map FId ([(Int,Int,String)],[(Expr,[PArg],Float)]))
|
||||
parseToChart lang (Type ctype touchType) sent heuristic callbacks roots =
|
||||
unsafePerformIO $
|
||||
withGuPool $ \parsePl -> do
|
||||
do exn <- gu_new_exn parsePl
|
||||
sent <- newUtf8CString sent parsePl
|
||||
callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl
|
||||
ps <- pgf_parse_to_chart (concr lang) ctype sent heuristic callbacks_map (fromIntegral roots) exn parsePl parsePl
|
||||
touchType
|
||||
failed <- gu_exn_is_raised exn
|
||||
if failed
|
||||
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
|
||||
if is_parse_error
|
||||
then do c_err <- (#peek GuExn, data.data) exn
|
||||
c_incomplete <- (#peek PgfParseError, incomplete) c_err
|
||||
if (c_incomplete :: CInt) == 0
|
||||
then do c_offset <- (#peek PgfParseError, offset) c_err
|
||||
token_ptr <- (#peek PgfParseError, token_ptr) c_err
|
||||
token_len <- (#peek PgfParseError, token_len) c_err
|
||||
tok <- peekUtf8CStringLen token_ptr token_len
|
||||
touchConcr lang
|
||||
return (ParseFailed (fromIntegral (c_offset :: CInt)) tok)
|
||||
else do touchConcr lang
|
||||
return ParseIncomplete
|
||||
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
|
||||
if is_exn
|
||||
then do c_msg <- (#peek GuExn, data.data) exn
|
||||
msg <- peekUtf8CString c_msg
|
||||
touchConcr lang
|
||||
throwIO (PGFError msg)
|
||||
else do touchConcr lang
|
||||
throwIO (PGFError "Parsing failed")
|
||||
else do c_roots <- pgf_get_parse_roots ps parsePl
|
||||
let get_range c_ccat = pgf_ccat_to_range ps c_ccat parsePl
|
||||
c_len <- (#peek GuSeq, len) c_roots
|
||||
chart <- peekCCats get_range Map.empty (c_len :: CSizeT) (c_roots `plusPtr` (#offset GuSeq, data))
|
||||
touchConcr lang
|
||||
return (ParseOk chart)
|
||||
where
|
||||
peekCCats get_range chart 0 ptr = return ([],chart)
|
||||
peekCCats get_range chart len ptr = do
|
||||
(root, chart) <- deRef (peekCCat get_range chart) ptr
|
||||
(roots,chart) <- peekCCats get_range chart (len-1) (ptr `plusPtr` (#size PgfCCat*))
|
||||
return (root:roots,chart)
|
||||
|
||||
peekCCat get_range chart c_ccat = do
|
||||
fid <- peekFId c_ccat
|
||||
c_total_cats <- (#peek PgfConcr, total_cats) (concr lang)
|
||||
if Map.member fid chart || fid < c_total_cats
|
||||
then return (fid,chart)
|
||||
else do range <- get_range c_ccat >>= peekSequence peekRange (#size PgfParseRange)
|
||||
c_prods <- (#peek PgfCCat, prods) c_ccat
|
||||
if c_prods == nullPtr
|
||||
then do return (fid,Map.insert fid (range,[]) chart)
|
||||
else do c_len <- (#peek PgfCCat, n_synprods) c_ccat
|
||||
(prods,chart) <- fixIO (\res -> peekProductions (Map.insert fid (range,fst res) chart)
|
||||
(fromIntegral (c_len :: CSizeT))
|
||||
(c_prods `plusPtr` (#offset GuSeq, data)))
|
||||
return (fid,chart)
|
||||
where
|
||||
peekProductions chart 0 ptr = return ([],chart)
|
||||
peekProductions chart len ptr = do
|
||||
(ps1, chart) <- deRef (peekProduction chart) ptr
|
||||
(ps2,chart) <- peekProductions chart (len-1) (ptr `plusPtr` (#size GuVariant))
|
||||
return (ps1++ps2,chart)
|
||||
|
||||
peekProduction chart p = do
|
||||
tag <- gu_variant_tag p
|
||||
dt <- gu_variant_data p
|
||||
case tag of
|
||||
(#const PGF_PRODUCTION_APPLY) -> do { c_cncfun <- (#peek PgfProductionApply, fun) dt ;
|
||||
c_absfun <- (#peek PgfCncFun, absfun) c_cncfun ;
|
||||
expr <- (#peek PgfAbsFun, ep.expr) c_absfun ;
|
||||
p <- (#peek PgfAbsFun, ep.prob) c_absfun ;
|
||||
c_args <- (#peek PgfProductionApply, args) dt ;
|
||||
c_len <- (#peek GuSeq, len) c_args ;
|
||||
(pargs,chart) <- peekPArgs chart (c_len :: CSizeT) (c_args `plusPtr` (#offset GuSeq, data)) ;
|
||||
return ([(Expr expr (touchConcr lang), pargs, p)],chart) }
|
||||
(#const PGF_PRODUCTION_COERCE) -> do { c_coerce <- (#peek PgfProductionCoerce, coerce) dt ;
|
||||
(fid,chart) <- peekCCat get_range chart c_coerce ;
|
||||
return (maybe [] snd (Map.lookup fid chart),chart) }
|
||||
(#const PGF_PRODUCTION_EXTERN) -> do { c_ep <- (#peek PgfProductionExtern, ep) dt ;
|
||||
expr <- (#peek PgfExprProb, expr) c_ep ;
|
||||
p <- (#peek PgfExprProb, prob) c_ep ;
|
||||
return ([(Expr expr (touchConcr lang), [], p)],chart) }
|
||||
_ -> error ("Unknown production type "++show tag++" in the grammar")
|
||||
|
||||
peekPArgs chart 0 ptr = return ([],chart)
|
||||
peekPArgs chart len ptr = do
|
||||
(a, chart) <- peekPArg chart ptr
|
||||
(as,chart) <- peekPArgs chart (len-1) (ptr `plusPtr` (#size PgfPArg))
|
||||
return (a:as,chart)
|
||||
|
||||
peekPArg chart ptr = do
|
||||
c_hypos <- (#peek PgfPArg, hypos) ptr
|
||||
hypos <- if c_hypos /= nullPtr
|
||||
then peekSequence (deRef peekFId) (#size int) c_hypos
|
||||
else return []
|
||||
c_ccat <- (#peek PgfPArg, ccat) ptr
|
||||
(fid,chart) <- peekCCat get_range chart c_ccat
|
||||
return (PArg hypos fid,chart)
|
||||
|
||||
peekRange ptr = do
|
||||
s <- (#peek PgfParseRange, start) ptr
|
||||
e <- (#peek PgfParseRange, end) ptr
|
||||
f <- (#peek PgfParseRange, field) ptr >>= peekCString
|
||||
return ((fromIntegral :: CSizeT -> Int) s, (fromIntegral :: CSizeT -> Int) e, f)
|
||||
|
||||
mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap)
|
||||
mkCallbacksMap concr callbacks pool = do
|
||||
callbacks_map <- pgf_new_callbacks_map concr pool
|
||||
@@ -700,7 +826,7 @@ parseWithOracle :: Concr -- ^ the language with which we parse
|
||||
-> Cat -- ^ the start category
|
||||
-> String -- ^ the input sentence
|
||||
-> Oracle
|
||||
-> ParseOutput
|
||||
-> ParseOutput [(Expr,Float)]
|
||||
parseWithOracle lang cat sent (predict,complete,literal) =
|
||||
unsafePerformIO $
|
||||
do parsePl <- gu_new_pool
|
||||
@@ -906,7 +1032,6 @@ tabularLinearizeAll lang e = unsafePerformIO $
|
||||
throwIO (PGFError msg)
|
||||
else do throwIO (PGFError "The abstract tree cannot be linearized")
|
||||
|
||||
type FId = Int
|
||||
type LIndex = Int
|
||||
|
||||
-- | BracketedString represents a sentence that is linearized
|
||||
|
||||
@@ -6,6 +6,7 @@ module PGF2.FFI where
|
||||
#include <gu/hash.h>
|
||||
#include <gu/utf8.h>
|
||||
#include <pgf/pgf.h>
|
||||
#include <pgf/data.h>
|
||||
|
||||
import Foreign ( alloca, peek, poke, peekByteOff )
|
||||
import Foreign.C
|
||||
@@ -237,6 +238,16 @@ newSequence elem_size pokeElem values pool = do
|
||||
pokeElem ptr x
|
||||
pokeElems (ptr `plusPtr` (fromIntegral elem_size)) xs
|
||||
|
||||
type FId = Int
|
||||
data PArg = PArg [FId] {-# UNPACK #-} !FId deriving (Eq,Ord,Show)
|
||||
|
||||
peekFId :: Ptr a -> IO FId
|
||||
peekFId c_ccat = do
|
||||
c_fid <- (#peek PgfCCat, fid) c_ccat
|
||||
return (fromIntegral (c_fid :: CInt))
|
||||
|
||||
deRef peekValue ptr = peek ptr >>= peekValue
|
||||
|
||||
------------------------------------------------------------------
|
||||
-- libpgf API
|
||||
|
||||
@@ -261,6 +272,7 @@ data PgfAbsCat
|
||||
data PgfCCat
|
||||
data PgfCncFun
|
||||
data PgfProductionApply
|
||||
data PgfParsing
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_read"
|
||||
pgf_read :: CString -> Ptr GuPool -> Ptr GuExn -> IO (Ptr PgfPGF)
|
||||
@@ -361,6 +373,15 @@ foreign import ccall "wrapper"
|
||||
foreign import ccall "pgf/pgf.h pgf_align_words"
|
||||
pgf_align_words :: Ptr PgfConcr -> PgfExpr -> Ptr GuExn -> Ptr GuPool -> IO (Ptr GuSeq)
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_parse_to_chart"
|
||||
pgf_parse_to_chart :: Ptr PgfConcr -> PgfType -> CString -> Double -> Ptr PgfCallbacksMap -> CSizeT -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr PgfParsing)
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_get_parse_roots"
|
||||
pgf_get_parse_roots :: Ptr PgfParsing -> Ptr GuPool -> IO (Ptr GuSeq)
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_ccat_to_range"
|
||||
pgf_ccat_to_range :: Ptr PgfParsing -> Ptr PgfCCat -> Ptr GuPool -> IO (Ptr GuSeq)
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics"
|
||||
pgf_parse_with_heuristics :: Ptr PgfConcr -> PgfType -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum)
|
||||
|
||||
|
||||
@@ -53,7 +53,6 @@ data Production
|
||||
= PApply {-# UNPACK #-} !FunId [PArg]
|
||||
| PCoerce {-# UNPACK #-} !FId
|
||||
deriving (Eq,Ord,Show)
|
||||
data PArg = PArg [FId] {-# UNPACK #-} !FId deriving (Eq,Ord,Show)
|
||||
type FunId = Int
|
||||
type SeqId = Int
|
||||
data Literal =
|
||||
@@ -186,10 +185,6 @@ concrProductions c fid = unsafePerformIO $ do
|
||||
fid <- peekFId c_ccat
|
||||
return (PArg hypos fid)
|
||||
|
||||
peekFId c_ccat = do
|
||||
c_fid <- (#peek PgfCCat, fid) c_ccat
|
||||
return (fromIntegral (c_fid :: CInt))
|
||||
|
||||
concrTotalFuns :: Concr -> FunId
|
||||
concrTotalFuns c = unsafePerformIO $ do
|
||||
c_cncfuns <- (#peek PgfConcr, cncfuns) (concr c)
|
||||
@@ -271,8 +266,6 @@ concrSequence c seqid = unsafePerformIO $ do
|
||||
forms <- peekForms (len-1) (ptr `plusPtr` (#size PgfAlternative))
|
||||
return ((form,prefixes):forms)
|
||||
|
||||
deRef peekValue ptr = peek ptr >>= peekValue
|
||||
|
||||
fidString, fidInt, fidFloat, fidVar, fidStart :: FId
|
||||
fidString = (-1)
|
||||
fidInt = (-2)
|
||||
|
||||
Reference in New Issue
Block a user