mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
experimental robust parser
This commit is contained in:
@@ -389,12 +389,17 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
|||||||
"Shows all trees returned by parsing a string in the grammars in scope.",
|
"Shows all trees returned by parsing a string in the grammars in scope.",
|
||||||
"The -lang flag can be used to restrict this to fewer languages.",
|
"The -lang flag can be used to restrict this to fewer languages.",
|
||||||
"The default start category can be overridden by the -cat flag.",
|
"The default start category can be overridden by the -cat flag.",
|
||||||
"See also the ps command for lexing and character encoding."
|
"See also the ps command for lexing and character encoding.",
|
||||||
|
"",
|
||||||
|
"The -openclass flag is experimental and allows some robustness in ",
|
||||||
|
"the parser. For example if -openclass=\"A,N,V\" is given, the parser",
|
||||||
|
"will accept unknown adjectives, nouns and verbs with the resource grammar."
|
||||||
],
|
],
|
||||||
exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings,
|
exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings,
|
||||||
flags = [
|
flags = [
|
||||||
("cat","target category of parsing"),
|
("cat","target category of parsing"),
|
||||||
("lang","the languages of parsing (comma-separated, no spaces)")
|
("lang","the languages of parsing (comma-separated, no spaces)"),
|
||||||
|
("openclass","list of open-class categories for robust parsing")
|
||||||
]
|
]
|
||||||
}),
|
}),
|
||||||
("pg", emptyCommandInfo { -----
|
("pg", emptyCommandInfo { -----
|
||||||
@@ -742,7 +747,9 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
|||||||
]
|
]
|
||||||
where
|
where
|
||||||
enc = encodeUnicode cod
|
enc = encodeUnicode cod
|
||||||
par opts s = concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang]
|
par opts s = case optOpenTypes opts of
|
||||||
|
[] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang]
|
||||||
|
open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts, canParse pgf lang]
|
||||||
|
|
||||||
void = ([],[])
|
void = ([],[])
|
||||||
|
|
||||||
@@ -789,6 +796,11 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
|||||||
"" -> languages pgf
|
"" -> languages pgf
|
||||||
lang -> map mkCId (chunks ',' lang)
|
lang -> map mkCId (chunks ',' lang)
|
||||||
optLang opts = head $ optLangs opts ++ [wildCId]
|
optLang opts = head $ optLangs opts ++ [wildCId]
|
||||||
|
|
||||||
|
optOpenTypes opts = case valStrOpts "openclass" "" opts of
|
||||||
|
"" -> []
|
||||||
|
cats -> mapMaybe readType (chunks ',' cats)
|
||||||
|
|
||||||
optType opts =
|
optType opts =
|
||||||
let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts
|
let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts
|
||||||
in case readType str of
|
in case readType str of
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ module GF.Data.TrieMap
|
|||||||
, insertWith
|
, insertWith
|
||||||
|
|
||||||
, unionWith
|
, unionWith
|
||||||
|
, unionsWith
|
||||||
|
|
||||||
|
, elems
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Prelude hiding (lookup, null)
|
import Prelude hiding (lookup, null)
|
||||||
@@ -53,3 +56,11 @@ unionWith f (Tr mb_v1 m1) (Tr mb_v2 m2) =
|
|||||||
(Just v1,Just v2) -> Just (f v1 v2)
|
(Just v1,Just v2) -> Just (f v1 v2)
|
||||||
m = Map.unionWith (unionWith f) m1 m2
|
m = Map.unionWith (unionWith f) m1 m2
|
||||||
in Tr mb_v m
|
in Tr mb_v m
|
||||||
|
|
||||||
|
unionsWith :: Ord k => (v -> v -> v) -> [TrieMap k v] -> TrieMap k v
|
||||||
|
unionsWith f = foldl (unionWith f) empty
|
||||||
|
|
||||||
|
elems :: TrieMap k v -> [v]
|
||||||
|
elems tr = collect tr []
|
||||||
|
where
|
||||||
|
collect (Tr mb_v m) xs = maybe id (:) mb_v (Map.fold collect xs m)
|
||||||
|
|||||||
@@ -262,7 +262,7 @@ wordCompletion gfenv line0 prefix0 p =
|
|||||||
-> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts)))
|
-> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts)))
|
||||||
case mb_state0 of
|
case mb_state0 of
|
||||||
Right state0 -> let ws = words (take (length s - length prefix) s)
|
Right state0 -> let ws = words (take (length s - length prefix) s)
|
||||||
in case foldM nextState state0 ws of
|
in case loop state0 ws of
|
||||||
Nothing -> ret ' ' []
|
Nothing -> ret ' ' []
|
||||||
Just state -> let compls = getCompletions state prefix
|
Just state -> let compls = getCompletions state prefix
|
||||||
in ret ' ' (map (encode gfenv) (Map.keys compls))
|
in ret ' ' (map (encode gfenv) (Map.keys compls))
|
||||||
@@ -295,7 +295,11 @@ wordCompletion gfenv line0 prefix0 p =
|
|||||||
Just ty -> ty
|
Just ty -> ty
|
||||||
Nothing -> error ("Can't parse '"++str++"' as type")
|
Nothing -> error ("Can't parse '"++str++"' as type")
|
||||||
|
|
||||||
|
loop ps [] = Just ps
|
||||||
|
loop ps (t:ts) = case nextState ps t of
|
||||||
|
Left es -> Nothing
|
||||||
|
Right ps -> loop ps ts
|
||||||
|
|
||||||
ret c [x] = return [x++[c]]
|
ret c [x] = return [x++[c]]
|
||||||
ret _ xs = return xs
|
ret _ xs = return xs
|
||||||
|
|
||||||
|
|||||||
17
src/PGF.hs
17
src/PGF.hs
@@ -29,7 +29,7 @@ module PGF(
|
|||||||
-- * Types
|
-- * Types
|
||||||
Type, Hypo,
|
Type, Hypo,
|
||||||
showType, readType,
|
showType, readType,
|
||||||
mkType, mkHypo, mkDepHypo, mkImplHypo,
|
mkType, mkHypo, mkDepHypo, mkImplHypo,
|
||||||
categories, startCat,
|
categories, startCat,
|
||||||
|
|
||||||
-- * Functions
|
-- * Functions
|
||||||
@@ -54,7 +54,7 @@ module PGF(
|
|||||||
showPrintName,
|
showPrintName,
|
||||||
|
|
||||||
-- ** Parsing
|
-- ** Parsing
|
||||||
parse, canParse, parseAllLang, parseAll,
|
parse, parseWithRecovery, canParse, parseAllLang, parseAll,
|
||||||
|
|
||||||
-- ** Evaluation
|
-- ** Evaluation
|
||||||
PGF.compute, paraphrase,
|
PGF.compute, paraphrase,
|
||||||
@@ -75,7 +75,7 @@ module PGF(
|
|||||||
-- ** Word Completion (Incremental Parsing)
|
-- ** Word Completion (Incremental Parsing)
|
||||||
complete,
|
complete,
|
||||||
Incremental.ParseState,
|
Incremental.ParseState,
|
||||||
Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.extractTrees,
|
Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.recoveryStates, Incremental.extractTrees,
|
||||||
|
|
||||||
-- ** Generation
|
-- ** Generation
|
||||||
generateRandom, generateAll, generateAllDepth,
|
generateRandom, generateAll, generateAllDepth,
|
||||||
@@ -131,6 +131,8 @@ linearize :: PGF -> Language -> Tree -> String
|
|||||||
-- for parsing, see 'canParse'.
|
-- for parsing, see 'canParse'.
|
||||||
parse :: PGF -> Language -> Type -> String -> [Tree]
|
parse :: PGF -> Language -> Type -> String -> [Tree]
|
||||||
|
|
||||||
|
parseWithRecovery :: PGF -> Language -> Type -> [Type] -> String -> [Tree]
|
||||||
|
|
||||||
-- | Checks whether the given language can be used for parsing.
|
-- | Checks whether the given language can be used for parsing.
|
||||||
canParse :: PGF -> Language -> Bool
|
canParse :: PGF -> Language -> Bool
|
||||||
|
|
||||||
@@ -241,6 +243,8 @@ parse pgf lang typ s =
|
|||||||
Nothing -> error ("No parser built for language: " ++ showCId lang)
|
Nothing -> error ("No parser built for language: " ++ showCId lang)
|
||||||
Nothing -> error ("Unknown language: " ++ showCId lang)
|
Nothing -> error ("Unknown language: " ++ showCId lang)
|
||||||
|
|
||||||
|
parseWithRecovery pgf lang typ open_typs s = Incremental.parseWithRecovery pgf lang typ open_typs (words s)
|
||||||
|
|
||||||
canParse pgf cnc = isJust (lookParser pgf cnc)
|
canParse pgf cnc = isJust (lookParser pgf cnc)
|
||||||
|
|
||||||
linearizeAll mgr = map snd . linearizeAllLang mgr
|
linearizeAll mgr = map snd . linearizeAllLang mgr
|
||||||
@@ -282,7 +286,7 @@ functionType pgf fun =
|
|||||||
complete pgf from typ input =
|
complete pgf from typ input =
|
||||||
let (ws,prefix) = tokensAndPrefix input
|
let (ws,prefix) = tokensAndPrefix input
|
||||||
state0 = Incremental.initState pgf from typ
|
state0 = Incremental.initState pgf from typ
|
||||||
in case foldM Incremental.nextState state0 ws of
|
in case loop state0 ws of
|
||||||
Nothing -> []
|
Nothing -> []
|
||||||
Just state ->
|
Just state ->
|
||||||
(if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else [])
|
(if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else [])
|
||||||
@@ -294,6 +298,11 @@ complete pgf from typ input =
|
|||||||
| otherwise = (init ws, last ws)
|
| otherwise = (init ws, last ws)
|
||||||
where ws = words s
|
where ws = words s
|
||||||
|
|
||||||
|
loop ps [] = Just ps
|
||||||
|
loop ps (t:ts) = case Incremental.nextState ps t of
|
||||||
|
Left es -> Nothing
|
||||||
|
Right ps -> loop ps ts
|
||||||
|
|
||||||
-- | Converts an expression to normal form
|
-- | Converts an expression to normal form
|
||||||
compute :: PGF -> Expr -> Expr
|
compute :: PGF -> Expr -> Expr
|
||||||
compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 []
|
compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 []
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
{-# LANGUAGE BangPatterns #-}
|
{-# LANGUAGE BangPatterns #-}
|
||||||
module PGF.Parsing.FCFG.Incremental
|
module PGF.Parsing.FCFG.Incremental
|
||||||
( ParseState
|
( ParseState
|
||||||
|
, ErrorState
|
||||||
, initState
|
, initState
|
||||||
, nextState
|
, nextState
|
||||||
, getCompletions
|
, getCompletions
|
||||||
|
, recoveryStates
|
||||||
, extractTrees
|
, extractTrees
|
||||||
, parse
|
, parse
|
||||||
|
, parseWithRecovery
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.Array.IArray
|
import Data.Array.IArray
|
||||||
@@ -26,8 +29,28 @@ import PGF.Macros
|
|||||||
import PGF.TypeCheck
|
import PGF.TypeCheck
|
||||||
import Debug.Trace
|
import Debug.Trace
|
||||||
|
|
||||||
parse :: PGF -> Language -> Type -> [String] -> [Expr]
|
parse :: PGF -> Language -> Type -> [String] -> [Tree]
|
||||||
parse pgf lang typ toks = maybe [] (\ps -> extractTrees ps typ) (foldM nextState (initState pgf lang typ) toks)
|
parse pgf lang typ toks = loop (initState pgf lang typ) toks
|
||||||
|
where
|
||||||
|
loop ps [] = extractTrees ps typ
|
||||||
|
loop ps (t:ts) = case nextState ps t of
|
||||||
|
Left es -> []
|
||||||
|
Right ps -> loop ps ts
|
||||||
|
|
||||||
|
parseWithRecovery :: PGF -> Language -> Type -> [Type] -> [String] -> [Tree]
|
||||||
|
parseWithRecovery pgf lang typ open_typs toks = accept (initState pgf lang typ) toks
|
||||||
|
where
|
||||||
|
accept ps [] = extractTrees ps typ
|
||||||
|
accept ps (t:ts) =
|
||||||
|
case nextState ps t of
|
||||||
|
Right ps -> accept ps ts
|
||||||
|
Left es -> skip (recoveryStates open_typs es) ts
|
||||||
|
|
||||||
|
skip ps_map [] = extractTrees (fst ps_map) typ
|
||||||
|
skip ps_map (t:ts) =
|
||||||
|
case Map.lookup t (snd ps_map) of
|
||||||
|
Just ps -> accept ps ts
|
||||||
|
Nothing -> skip ps_map ts
|
||||||
|
|
||||||
-- | Creates an initial parsing state for a given language and
|
-- | Creates an initial parsing state for a given language and
|
||||||
-- startup category.
|
-- startup category.
|
||||||
@@ -46,16 +69,18 @@ initState pgf lang (DTyp _ start _) =
|
|||||||
Just pinfo -> pinfo
|
Just pinfo -> pinfo
|
||||||
_ -> error ("Unknown language: " ++ showCId lang)
|
_ -> error ("Unknown language: " ++ showCId lang)
|
||||||
|
|
||||||
in State pgf
|
in PState pgf
|
||||||
pinfo
|
pinfo
|
||||||
(Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0)
|
(Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0)
|
||||||
(TMap.singleton [] (Set.fromList items))
|
(TMap.singleton [] (Set.fromList items))
|
||||||
|
|
||||||
-- | From the current state and the next token
|
-- | From the current state and the next token
|
||||||
-- 'nextState' computes a new state where the token
|
-- 'nextState' computes a new state, where the token
|
||||||
-- is consumed and the current position shifted by one.
|
-- is consumed and the current position is shifted by one.
|
||||||
nextState :: ParseState -> String -> Maybe ParseState
|
-- If the new token cannot be accepted then an error state
|
||||||
nextState (State pgf pinfo chart items) t =
|
-- is returned.
|
||||||
|
nextState :: ParseState -> String -> Either ErrorState ParseState
|
||||||
|
nextState (PState pgf pinfo chart items) t =
|
||||||
let (mb_agenda,map_items) = TMap.decompose items
|
let (mb_agenda,map_items) = TMap.decompose items
|
||||||
agenda = maybe [] Set.toList mb_agenda
|
agenda = maybe [] Set.toList mb_agenda
|
||||||
acc = fromMaybe TMap.empty (Map.lookup t map_items)
|
acc = fromMaybe TMap.empty (Map.lookup t map_items)
|
||||||
@@ -66,8 +91,8 @@ nextState (State pgf pinfo chart items) t =
|
|||||||
, offset =offset chart1+1
|
, offset =offset chart1+1
|
||||||
}
|
}
|
||||||
in if TMap.null acc1
|
in if TMap.null acc1
|
||||||
then Nothing
|
then Left (EState pgf pinfo chart2)
|
||||||
else Just (State pgf pinfo chart2 acc1)
|
else Right (PState pgf pinfo chart2 acc1)
|
||||||
where
|
where
|
||||||
add (tok:toks) item acc
|
add (tok:toks) item acc
|
||||||
| tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc
|
| tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc
|
||||||
@@ -78,7 +103,7 @@ nextState (State pgf pinfo chart items) t =
|
|||||||
-- next words and the consequent states. This is used for word completions in
|
-- next words and the consequent states. This is used for word completions in
|
||||||
-- the GF interpreter.
|
-- the GF interpreter.
|
||||||
getCompletions :: ParseState -> String -> Map.Map String ParseState
|
getCompletions :: ParseState -> String -> Map.Map String ParseState
|
||||||
getCompletions (State pgf pinfo chart items) w =
|
getCompletions (PState pgf pinfo chart items) w =
|
||||||
let (mb_agenda,map_items) = TMap.decompose items
|
let (mb_agenda,map_items) = TMap.decompose items
|
||||||
agenda = maybe [] Set.toList mb_agenda
|
agenda = maybe [] Set.toList mb_agenda
|
||||||
acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items
|
acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items
|
||||||
@@ -88,12 +113,34 @@ getCompletions (State pgf pinfo chart items) w =
|
|||||||
, passive=emptyPC
|
, passive=emptyPC
|
||||||
, offset =offset chart1+1
|
, offset =offset chart1+1
|
||||||
}
|
}
|
||||||
in fmap (State pgf pinfo chart2) acc'
|
in fmap (PState pgf pinfo chart2) acc'
|
||||||
where
|
where
|
||||||
add (tok:toks) item acc
|
add (tok:toks) item acc
|
||||||
| isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
|
| isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
|
||||||
add _ item acc = acc
|
add _ item acc = acc
|
||||||
|
|
||||||
|
recoveryStates :: [Type] -> ErrorState -> (ParseState, Map.Map String ParseState)
|
||||||
|
recoveryStates open_types (EState pgf pinfo chart) =
|
||||||
|
let open_fcats = concatMap type2fcats open_types
|
||||||
|
agenda = foldl (complete open_fcats) [] (actives chart)
|
||||||
|
(acc,chart1) = process Nothing add (sequences pinfo) (functions pinfo) agenda Map.empty chart
|
||||||
|
chart2 = chart1{ active =emptyAC
|
||||||
|
, actives=active chart1 : actives chart1
|
||||||
|
, passive=emptyPC
|
||||||
|
, offset =offset chart1+1
|
||||||
|
}
|
||||||
|
in (PState pgf pinfo chart (TMap.singleton [] (Set.fromList agenda)), fmap (PState pgf pinfo chart2) acc)
|
||||||
|
where
|
||||||
|
type2fcats (DTyp _ cat _) = fromMaybe [] (Map.lookup cat (startCats pinfo))
|
||||||
|
|
||||||
|
complete open_fcats items ac =
|
||||||
|
foldl (Set.fold (\(Active j' ppos funid seqid args keyc) ->
|
||||||
|
(:) (Active j' (ppos+1) funid seqid args keyc)))
|
||||||
|
items
|
||||||
|
[set | fcat <- open_fcats, set <- lookupACByFCat fcat ac]
|
||||||
|
|
||||||
|
add (tok:toks) item acc = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
|
||||||
|
|
||||||
-- | This function extracts the list of all completed parse trees
|
-- | This function extracts the list of all completed parse trees
|
||||||
-- that spans the whole input consumed so far. The trees are also
|
-- that spans the whole input consumed so far. The trees are also
|
||||||
-- limited by the category specified, which is usually
|
-- limited by the category specified, which is usually
|
||||||
@@ -189,7 +236,7 @@ process mbt fn !seqs !funs (item@(Active j ppos funid seqid args key0):items) ac
|
|||||||
Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) ->
|
Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) ->
|
||||||
let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos
|
let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos
|
||||||
in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set
|
in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set
|
||||||
in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart)
|
in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart)
|
||||||
,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart)
|
,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart)
|
||||||
,nextId =nextId chart+1
|
,nextId =nextId chart+1
|
||||||
}
|
}
|
||||||
@@ -243,6 +290,12 @@ emptyAC = IntMap.empty
|
|||||||
lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active)
|
lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active)
|
||||||
lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l
|
lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l
|
||||||
|
|
||||||
|
lookupACByFCat :: FCat -> ActiveChart -> [Set.Set Active]
|
||||||
|
lookupACByFCat fcat chart =
|
||||||
|
case IntMap.lookup fcat chart of
|
||||||
|
Nothing -> []
|
||||||
|
Just map -> IntMap.elems map
|
||||||
|
|
||||||
labelsAC :: FCat -> ActiveChart -> [FIndex]
|
labelsAC :: FCat -> ActiveChart -> [FIndex]
|
||||||
labelsAC fcat chart =
|
labelsAC fcat chart =
|
||||||
case IntMap.lookup fcat chart of
|
case IntMap.lookup fcat chart of
|
||||||
@@ -296,7 +349,7 @@ foldForest f g b fcat forest =
|
|||||||
|
|
||||||
-- | An abstract data type whose values represent
|
-- | An abstract data type whose values represent
|
||||||
-- the current state in an incremental parser.
|
-- the current state in an incremental parser.
|
||||||
data ParseState = State PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active))
|
data ParseState = PState PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active))
|
||||||
|
|
||||||
data Chart
|
data Chart
|
||||||
= Chart
|
= Chart
|
||||||
@@ -308,3 +361,11 @@ data Chart
|
|||||||
, offset :: {-# UNPACK #-} !Int
|
, offset :: {-# UNPACK #-} !Int
|
||||||
}
|
}
|
||||||
deriving Show
|
deriving Show
|
||||||
|
|
||||||
|
----------------------------------------------------------------
|
||||||
|
-- Error State
|
||||||
|
----------------------------------------------------------------
|
||||||
|
|
||||||
|
-- | An abstract data type whose values represent
|
||||||
|
-- the state in an incremental parser after an error.
|
||||||
|
data ErrorState = EState PGF ParserInfo Chart
|
||||||
|
|||||||
Reference in New Issue
Block a user