experimental robust parser

This commit is contained in:
krasimir
2009-10-23 08:35:32 +00:00
parent 1141be0972
commit 4e7a5bb297
5 changed files with 122 additions and 25 deletions

View File

@@ -389,12 +389,17 @@ allCommands cod env@(pgf, mos) = Map.fromList [
"Shows all trees returned by parsing a string in the grammars in scope.", "Shows all trees returned by parsing a string in the grammars in scope.",
"The -lang flag can be used to restrict this to fewer languages.", "The -lang flag can be used to restrict this to fewer languages.",
"The default start category can be overridden by the -cat flag.", "The default start category can be overridden by the -cat flag.",
"See also the ps command for lexing and character encoding." "See also the ps command for lexing and character encoding.",
"",
"The -openclass flag is experimental and allows some robustness in ",
"the parser. For example if -openclass=\"A,N,V\" is given, the parser",
"will accept unknown adjectives, nouns and verbs with the resource grammar."
], ],
exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings, exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings,
flags = [ flags = [
("cat","target category of parsing"), ("cat","target category of parsing"),
("lang","the languages of parsing (comma-separated, no spaces)") ("lang","the languages of parsing (comma-separated, no spaces)"),
("openclass","list of open-class categories for robust parsing")
] ]
}), }),
("pg", emptyCommandInfo { ----- ("pg", emptyCommandInfo { -----
@@ -742,7 +747,9 @@ allCommands cod env@(pgf, mos) = Map.fromList [
] ]
where where
enc = encodeUnicode cod enc = encodeUnicode cod
par opts s = concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang] par opts s = case optOpenTypes opts of
[] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang]
open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts, canParse pgf lang]
void = ([],[]) void = ([],[])
@@ -789,6 +796,11 @@ allCommands cod env@(pgf, mos) = Map.fromList [
"" -> languages pgf "" -> languages pgf
lang -> map mkCId (chunks ',' lang) lang -> map mkCId (chunks ',' lang)
optLang opts = head $ optLangs opts ++ [wildCId] optLang opts = head $ optLangs opts ++ [wildCId]
optOpenTypes opts = case valStrOpts "openclass" "" opts of
"" -> []
cats -> mapMaybe readType (chunks ',' cats)
optType opts = optType opts =
let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts
in case readType str of in case readType str of

View File

@@ -12,6 +12,9 @@ module GF.Data.TrieMap
, insertWith , insertWith
, unionWith , unionWith
, unionsWith
, elems
) where ) where
import Prelude hiding (lookup, null) import Prelude hiding (lookup, null)
@@ -53,3 +56,11 @@ unionWith f (Tr mb_v1 m1) (Tr mb_v2 m2) =
(Just v1,Just v2) -> Just (f v1 v2) (Just v1,Just v2) -> Just (f v1 v2)
m = Map.unionWith (unionWith f) m1 m2 m = Map.unionWith (unionWith f) m1 m2
in Tr mb_v m in Tr mb_v m
unionsWith :: Ord k => (v -> v -> v) -> [TrieMap k v] -> TrieMap k v
unionsWith f = foldl (unionWith f) empty
elems :: TrieMap k v -> [v]
elems tr = collect tr []
where
collect (Tr mb_v m) xs = maybe id (:) mb_v (Map.fold collect xs m)

View File

@@ -262,7 +262,7 @@ wordCompletion gfenv line0 prefix0 p =
-> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts))) -> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts)))
case mb_state0 of case mb_state0 of
Right state0 -> let ws = words (take (length s - length prefix) s) Right state0 -> let ws = words (take (length s - length prefix) s)
in case foldM nextState state0 ws of in case loop state0 ws of
Nothing -> ret ' ' [] Nothing -> ret ' ' []
Just state -> let compls = getCompletions state prefix Just state -> let compls = getCompletions state prefix
in ret ' ' (map (encode gfenv) (Map.keys compls)) in ret ' ' (map (encode gfenv) (Map.keys compls))
@@ -295,7 +295,11 @@ wordCompletion gfenv line0 prefix0 p =
Just ty -> ty Just ty -> ty
Nothing -> error ("Can't parse '"++str++"' as type") Nothing -> error ("Can't parse '"++str++"' as type")
loop ps [] = Just ps
loop ps (t:ts) = case nextState ps t of
Left es -> Nothing
Right ps -> loop ps ts
ret c [x] = return [x++[c]] ret c [x] = return [x++[c]]
ret _ xs = return xs ret _ xs = return xs

View File

@@ -29,7 +29,7 @@ module PGF(
-- * Types -- * Types
Type, Hypo, Type, Hypo,
showType, readType, showType, readType,
mkType, mkHypo, mkDepHypo, mkImplHypo, mkType, mkHypo, mkDepHypo, mkImplHypo,
categories, startCat, categories, startCat,
-- * Functions -- * Functions
@@ -54,7 +54,7 @@ module PGF(
showPrintName, showPrintName,
-- ** Parsing -- ** Parsing
parse, canParse, parseAllLang, parseAll, parse, parseWithRecovery, canParse, parseAllLang, parseAll,
-- ** Evaluation -- ** Evaluation
PGF.compute, paraphrase, PGF.compute, paraphrase,
@@ -75,7 +75,7 @@ module PGF(
-- ** Word Completion (Incremental Parsing) -- ** Word Completion (Incremental Parsing)
complete, complete,
Incremental.ParseState, Incremental.ParseState,
Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.extractTrees, Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.recoveryStates, Incremental.extractTrees,
-- ** Generation -- ** Generation
generateRandom, generateAll, generateAllDepth, generateRandom, generateAll, generateAllDepth,
@@ -131,6 +131,8 @@ linearize :: PGF -> Language -> Tree -> String
-- for parsing, see 'canParse'. -- for parsing, see 'canParse'.
parse :: PGF -> Language -> Type -> String -> [Tree] parse :: PGF -> Language -> Type -> String -> [Tree]
parseWithRecovery :: PGF -> Language -> Type -> [Type] -> String -> [Tree]
-- | Checks whether the given language can be used for parsing. -- | Checks whether the given language can be used for parsing.
canParse :: PGF -> Language -> Bool canParse :: PGF -> Language -> Bool
@@ -241,6 +243,8 @@ parse pgf lang typ s =
Nothing -> error ("No parser built for language: " ++ showCId lang) Nothing -> error ("No parser built for language: " ++ showCId lang)
Nothing -> error ("Unknown language: " ++ showCId lang) Nothing -> error ("Unknown language: " ++ showCId lang)
parseWithRecovery pgf lang typ open_typs s = Incremental.parseWithRecovery pgf lang typ open_typs (words s)
canParse pgf cnc = isJust (lookParser pgf cnc) canParse pgf cnc = isJust (lookParser pgf cnc)
linearizeAll mgr = map snd . linearizeAllLang mgr linearizeAll mgr = map snd . linearizeAllLang mgr
@@ -282,7 +286,7 @@ functionType pgf fun =
complete pgf from typ input = complete pgf from typ input =
let (ws,prefix) = tokensAndPrefix input let (ws,prefix) = tokensAndPrefix input
state0 = Incremental.initState pgf from typ state0 = Incremental.initState pgf from typ
in case foldM Incremental.nextState state0 ws of in case loop state0 ws of
Nothing -> [] Nothing -> []
Just state -> Just state ->
(if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else []) (if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else [])
@@ -294,6 +298,11 @@ complete pgf from typ input =
| otherwise = (init ws, last ws) | otherwise = (init ws, last ws)
where ws = words s where ws = words s
loop ps [] = Just ps
loop ps (t:ts) = case Incremental.nextState ps t of
Left es -> Nothing
Right ps -> loop ps ts
-- | Converts an expression to normal form -- | Converts an expression to normal form
compute :: PGF -> Expr -> Expr compute :: PGF -> Expr -> Expr
compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 [] compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 []

View File

@@ -1,11 +1,14 @@
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
module PGF.Parsing.FCFG.Incremental module PGF.Parsing.FCFG.Incremental
( ParseState ( ParseState
, ErrorState
, initState , initState
, nextState , nextState
, getCompletions , getCompletions
, recoveryStates
, extractTrees , extractTrees
, parse , parse
, parseWithRecovery
) where ) where
import Data.Array.IArray import Data.Array.IArray
@@ -26,8 +29,28 @@ import PGF.Macros
import PGF.TypeCheck import PGF.TypeCheck
import Debug.Trace import Debug.Trace
parse :: PGF -> Language -> Type -> [String] -> [Expr] parse :: PGF -> Language -> Type -> [String] -> [Tree]
parse pgf lang typ toks = maybe [] (\ps -> extractTrees ps typ) (foldM nextState (initState pgf lang typ) toks) parse pgf lang typ toks = loop (initState pgf lang typ) toks
where
loop ps [] = extractTrees ps typ
loop ps (t:ts) = case nextState ps t of
Left es -> []
Right ps -> loop ps ts
parseWithRecovery :: PGF -> Language -> Type -> [Type] -> [String] -> [Tree]
parseWithRecovery pgf lang typ open_typs toks = accept (initState pgf lang typ) toks
where
accept ps [] = extractTrees ps typ
accept ps (t:ts) =
case nextState ps t of
Right ps -> accept ps ts
Left es -> skip (recoveryStates open_typs es) ts
skip ps_map [] = extractTrees (fst ps_map) typ
skip ps_map (t:ts) =
case Map.lookup t (snd ps_map) of
Just ps -> accept ps ts
Nothing -> skip ps_map ts
-- | Creates an initial parsing state for a given language and -- | Creates an initial parsing state for a given language and
-- startup category. -- startup category.
@@ -46,16 +69,18 @@ initState pgf lang (DTyp _ start _) =
Just pinfo -> pinfo Just pinfo -> pinfo
_ -> error ("Unknown language: " ++ showCId lang) _ -> error ("Unknown language: " ++ showCId lang)
in State pgf in PState pgf
pinfo pinfo
(Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0) (Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0)
(TMap.singleton [] (Set.fromList items)) (TMap.singleton [] (Set.fromList items))
-- | From the current state and the next token -- | From the current state and the next token
-- 'nextState' computes a new state where the token -- 'nextState' computes a new state, where the token
-- is consumed and the current position shifted by one. -- is consumed and the current position is shifted by one.
nextState :: ParseState -> String -> Maybe ParseState -- If the new token cannot be accepted then an error state
nextState (State pgf pinfo chart items) t = -- is returned.
nextState :: ParseState -> String -> Either ErrorState ParseState
nextState (PState pgf pinfo chart items) t =
let (mb_agenda,map_items) = TMap.decompose items let (mb_agenda,map_items) = TMap.decompose items
agenda = maybe [] Set.toList mb_agenda agenda = maybe [] Set.toList mb_agenda
acc = fromMaybe TMap.empty (Map.lookup t map_items) acc = fromMaybe TMap.empty (Map.lookup t map_items)
@@ -66,8 +91,8 @@ nextState (State pgf pinfo chart items) t =
, offset =offset chart1+1 , offset =offset chart1+1
} }
in if TMap.null acc1 in if TMap.null acc1
then Nothing then Left (EState pgf pinfo chart2)
else Just (State pgf pinfo chart2 acc1) else Right (PState pgf pinfo chart2 acc1)
where where
add (tok:toks) item acc add (tok:toks) item acc
| tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc | tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc
@@ -78,7 +103,7 @@ nextState (State pgf pinfo chart items) t =
-- next words and the consequent states. This is used for word completions in -- next words and the consequent states. This is used for word completions in
-- the GF interpreter. -- the GF interpreter.
getCompletions :: ParseState -> String -> Map.Map String ParseState getCompletions :: ParseState -> String -> Map.Map String ParseState
getCompletions (State pgf pinfo chart items) w = getCompletions (PState pgf pinfo chart items) w =
let (mb_agenda,map_items) = TMap.decompose items let (mb_agenda,map_items) = TMap.decompose items
agenda = maybe [] Set.toList mb_agenda agenda = maybe [] Set.toList mb_agenda
acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items
@@ -88,12 +113,34 @@ getCompletions (State pgf pinfo chart items) w =
, passive=emptyPC , passive=emptyPC
, offset =offset chart1+1 , offset =offset chart1+1
} }
in fmap (State pgf pinfo chart2) acc' in fmap (PState pgf pinfo chart2) acc'
where where
add (tok:toks) item acc add (tok:toks) item acc
| isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc | isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
add _ item acc = acc add _ item acc = acc
recoveryStates :: [Type] -> ErrorState -> (ParseState, Map.Map String ParseState)
recoveryStates open_types (EState pgf pinfo chart) =
let open_fcats = concatMap type2fcats open_types
agenda = foldl (complete open_fcats) [] (actives chart)
(acc,chart1) = process Nothing add (sequences pinfo) (functions pinfo) agenda Map.empty chart
chart2 = chart1{ active =emptyAC
, actives=active chart1 : actives chart1
, passive=emptyPC
, offset =offset chart1+1
}
in (PState pgf pinfo chart (TMap.singleton [] (Set.fromList agenda)), fmap (PState pgf pinfo chart2) acc)
where
type2fcats (DTyp _ cat _) = fromMaybe [] (Map.lookup cat (startCats pinfo))
complete open_fcats items ac =
foldl (Set.fold (\(Active j' ppos funid seqid args keyc) ->
(:) (Active j' (ppos+1) funid seqid args keyc)))
items
[set | fcat <- open_fcats, set <- lookupACByFCat fcat ac]
add (tok:toks) item acc = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
-- | This function extracts the list of all completed parse trees -- | This function extracts the list of all completed parse trees
-- that spans the whole input consumed so far. The trees are also -- that spans the whole input consumed so far. The trees are also
-- limited by the category specified, which is usually -- limited by the category specified, which is usually
@@ -189,7 +236,7 @@ process mbt fn !seqs !funs (item@(Active j ppos funid seqid args key0):items) ac
Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) -> Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) ->
let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos
in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set
in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart) in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart)
,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart) ,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart)
,nextId =nextId chart+1 ,nextId =nextId chart+1
} }
@@ -243,6 +290,12 @@ emptyAC = IntMap.empty
lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active) lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active)
lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l
lookupACByFCat :: FCat -> ActiveChart -> [Set.Set Active]
lookupACByFCat fcat chart =
case IntMap.lookup fcat chart of
Nothing -> []
Just map -> IntMap.elems map
labelsAC :: FCat -> ActiveChart -> [FIndex] labelsAC :: FCat -> ActiveChart -> [FIndex]
labelsAC fcat chart = labelsAC fcat chart =
case IntMap.lookup fcat chart of case IntMap.lookup fcat chart of
@@ -296,7 +349,7 @@ foldForest f g b fcat forest =
-- | An abstract data type whose values represent -- | An abstract data type whose values represent
-- the current state in an incremental parser. -- the current state in an incremental parser.
data ParseState = State PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active)) data ParseState = PState PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active))
data Chart data Chart
= Chart = Chart
@@ -308,3 +361,11 @@ data Chart
, offset :: {-# UNPACK #-} !Int , offset :: {-# UNPACK #-} !Int
} }
deriving Show deriving Show
----------------------------------------------------------------
-- Error State
----------------------------------------------------------------
-- | An abstract data type whose values represent
-- the state in an incremental parser after an error.
data ErrorState = EState PGF ParserInfo Chart