From 4e7a5bb2975690ca01791c354253a76ef672484d Mon Sep 17 00:00:00 2001 From: krasimir Date: Fri, 23 Oct 2009 08:35:32 +0000 Subject: [PATCH] experimental robust parser --- src/GF/Command/Commands.hs | 18 +++++- src/GF/Data/TrieMap.hs | 11 ++++ src/GFI.hs | 8 ++- src/PGF.hs | 17 ++++-- src/PGF/Parsing/FCFG/Incremental.hs | 93 ++++++++++++++++++++++++----- 5 files changed, 122 insertions(+), 25 deletions(-) diff --git a/src/GF/Command/Commands.hs b/src/GF/Command/Commands.hs index 1b12d82cc..d182b65ba 100644 --- a/src/GF/Command/Commands.hs +++ b/src/GF/Command/Commands.hs @@ -389,12 +389,17 @@ allCommands cod env@(pgf, mos) = Map.fromList [ "Shows all trees returned by parsing a string in the grammars in scope.", "The -lang flag can be used to restrict this to fewer languages.", "The default start category can be overridden by the -cat flag.", - "See also the ps command for lexing and character encoding." + "See also the ps command for lexing and character encoding.", + "", + "The -openclass flag is experimental and allows some robustness in ", + "the parser. For example if -openclass=\"A,N,V\" is given, the parser", + "will accept unknown adjectives, nouns and verbs with the resource grammar." ], exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings, flags = [ ("cat","target category of parsing"), - ("lang","the languages of parsing (comma-separated, no spaces)") + ("lang","the languages of parsing (comma-separated, no spaces)"), + ("openclass","list of open-class categories for robust parsing") ] }), ("pg", emptyCommandInfo { ----- @@ -742,7 +747,9 @@ allCommands cod env@(pgf, mos) = Map.fromList [ ] where enc = encodeUnicode cod - par opts s = concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang] + par opts s = case optOpenTypes opts of + [] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang] + open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts, canParse pgf lang] void = ([],[]) @@ -789,6 +796,11 @@ allCommands cod env@(pgf, mos) = Map.fromList [ "" -> languages pgf lang -> map mkCId (chunks ',' lang) optLang opts = head $ optLangs opts ++ [wildCId] + + optOpenTypes opts = case valStrOpts "openclass" "" opts of + "" -> [] + cats -> mapMaybe readType (chunks ',' cats) + optType opts = let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts in case readType str of diff --git a/src/GF/Data/TrieMap.hs b/src/GF/Data/TrieMap.hs index 37c56fc3a..a6749d641 100644 --- a/src/GF/Data/TrieMap.hs +++ b/src/GF/Data/TrieMap.hs @@ -12,6 +12,9 @@ module GF.Data.TrieMap , insertWith , unionWith + , unionsWith + + , elems ) where import Prelude hiding (lookup, null) @@ -53,3 +56,11 @@ unionWith f (Tr mb_v1 m1) (Tr mb_v2 m2) = (Just v1,Just v2) -> Just (f v1 v2) m = Map.unionWith (unionWith f) m1 m2 in Tr mb_v m + +unionsWith :: Ord k => (v -> v -> v) -> [TrieMap k v] -> TrieMap k v +unionsWith f = foldl (unionWith f) empty + +elems :: TrieMap k v -> [v] +elems tr = collect tr [] + where + collect (Tr mb_v m) xs = maybe id (:) mb_v (Map.fold collect xs m) diff --git a/src/GFI.hs b/src/GFI.hs index 7b98f0ff8..727e91209 100644 --- a/src/GFI.hs +++ b/src/GFI.hs @@ -262,7 +262,7 @@ wordCompletion gfenv line0 prefix0 p = -> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts))) case mb_state0 of Right state0 -> let ws = words (take (length s - length prefix) s) - in case foldM nextState state0 ws of + in case loop state0 ws of Nothing -> ret ' ' [] Just state -> let compls = getCompletions state prefix in ret ' ' (map (encode gfenv) (Map.keys compls)) @@ -295,7 +295,11 @@ wordCompletion gfenv line0 prefix0 p = Just ty -> ty Nothing -> error ("Can't parse '"++str++"' as type") - + loop ps [] = Just ps + loop ps (t:ts) = case nextState ps t of + Left es -> Nothing + Right ps -> loop ps ts + ret c [x] = return [x++[c]] ret _ xs = return xs diff --git a/src/PGF.hs b/src/PGF.hs index b9ad357c9..81e6d5024 100644 --- a/src/PGF.hs +++ b/src/PGF.hs @@ -29,7 +29,7 @@ module PGF( -- * Types Type, Hypo, showType, readType, - mkType, mkHypo, mkDepHypo, mkImplHypo, + mkType, mkHypo, mkDepHypo, mkImplHypo, categories, startCat, -- * Functions @@ -54,7 +54,7 @@ module PGF( showPrintName, -- ** Parsing - parse, canParse, parseAllLang, parseAll, + parse, parseWithRecovery, canParse, parseAllLang, parseAll, -- ** Evaluation PGF.compute, paraphrase, @@ -75,7 +75,7 @@ module PGF( -- ** Word Completion (Incremental Parsing) complete, Incremental.ParseState, - Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.extractTrees, + Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.recoveryStates, Incremental.extractTrees, -- ** Generation generateRandom, generateAll, generateAllDepth, @@ -131,6 +131,8 @@ linearize :: PGF -> Language -> Tree -> String -- for parsing, see 'canParse'. parse :: PGF -> Language -> Type -> String -> [Tree] +parseWithRecovery :: PGF -> Language -> Type -> [Type] -> String -> [Tree] + -- | Checks whether the given language can be used for parsing. canParse :: PGF -> Language -> Bool @@ -241,6 +243,8 @@ parse pgf lang typ s = Nothing -> error ("No parser built for language: " ++ showCId lang) Nothing -> error ("Unknown language: " ++ showCId lang) +parseWithRecovery pgf lang typ open_typs s = Incremental.parseWithRecovery pgf lang typ open_typs (words s) + canParse pgf cnc = isJust (lookParser pgf cnc) linearizeAll mgr = map snd . linearizeAllLang mgr @@ -282,7 +286,7 @@ functionType pgf fun = complete pgf from typ input = let (ws,prefix) = tokensAndPrefix input state0 = Incremental.initState pgf from typ - in case foldM Incremental.nextState state0 ws of + in case loop state0 ws of Nothing -> [] Just state -> (if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else []) @@ -294,6 +298,11 @@ complete pgf from typ input = | otherwise = (init ws, last ws) where ws = words s + loop ps [] = Just ps + loop ps (t:ts) = case Incremental.nextState ps t of + Left es -> Nothing + Right ps -> loop ps ts + -- | Converts an expression to normal form compute :: PGF -> Expr -> Expr compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 [] diff --git a/src/PGF/Parsing/FCFG/Incremental.hs b/src/PGF/Parsing/FCFG/Incremental.hs index 6ae18e3bf..dbb87556d 100644 --- a/src/PGF/Parsing/FCFG/Incremental.hs +++ b/src/PGF/Parsing/FCFG/Incremental.hs @@ -1,11 +1,14 @@ {-# LANGUAGE BangPatterns #-} module PGF.Parsing.FCFG.Incremental ( ParseState + , ErrorState , initState , nextState , getCompletions + , recoveryStates , extractTrees , parse + , parseWithRecovery ) where import Data.Array.IArray @@ -26,8 +29,28 @@ import PGF.Macros import PGF.TypeCheck import Debug.Trace -parse :: PGF -> Language -> Type -> [String] -> [Expr] -parse pgf lang typ toks = maybe [] (\ps -> extractTrees ps typ) (foldM nextState (initState pgf lang typ) toks) +parse :: PGF -> Language -> Type -> [String] -> [Tree] +parse pgf lang typ toks = loop (initState pgf lang typ) toks + where + loop ps [] = extractTrees ps typ + loop ps (t:ts) = case nextState ps t of + Left es -> [] + Right ps -> loop ps ts + +parseWithRecovery :: PGF -> Language -> Type -> [Type] -> [String] -> [Tree] +parseWithRecovery pgf lang typ open_typs toks = accept (initState pgf lang typ) toks + where + accept ps [] = extractTrees ps typ + accept ps (t:ts) = + case nextState ps t of + Right ps -> accept ps ts + Left es -> skip (recoveryStates open_typs es) ts + + skip ps_map [] = extractTrees (fst ps_map) typ + skip ps_map (t:ts) = + case Map.lookup t (snd ps_map) of + Just ps -> accept ps ts + Nothing -> skip ps_map ts -- | Creates an initial parsing state for a given language and -- startup category. @@ -46,16 +69,18 @@ initState pgf lang (DTyp _ start _) = Just pinfo -> pinfo _ -> error ("Unknown language: " ++ showCId lang) - in State pgf - pinfo - (Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0) - (TMap.singleton [] (Set.fromList items)) + in PState pgf + pinfo + (Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0) + (TMap.singleton [] (Set.fromList items)) -- | From the current state and the next token --- 'nextState' computes a new state where the token --- is consumed and the current position shifted by one. -nextState :: ParseState -> String -> Maybe ParseState -nextState (State pgf pinfo chart items) t = +-- 'nextState' computes a new state, where the token +-- is consumed and the current position is shifted by one. +-- If the new token cannot be accepted then an error state +-- is returned. +nextState :: ParseState -> String -> Either ErrorState ParseState +nextState (PState pgf pinfo chart items) t = let (mb_agenda,map_items) = TMap.decompose items agenda = maybe [] Set.toList mb_agenda acc = fromMaybe TMap.empty (Map.lookup t map_items) @@ -66,8 +91,8 @@ nextState (State pgf pinfo chart items) t = , offset =offset chart1+1 } in if TMap.null acc1 - then Nothing - else Just (State pgf pinfo chart2 acc1) + then Left (EState pgf pinfo chart2) + else Right (PState pgf pinfo chart2 acc1) where add (tok:toks) item acc | tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc @@ -78,7 +103,7 @@ nextState (State pgf pinfo chart items) t = -- next words and the consequent states. This is used for word completions in -- the GF interpreter. getCompletions :: ParseState -> String -> Map.Map String ParseState -getCompletions (State pgf pinfo chart items) w = +getCompletions (PState pgf pinfo chart items) w = let (mb_agenda,map_items) = TMap.decompose items agenda = maybe [] Set.toList mb_agenda acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items @@ -88,12 +113,34 @@ getCompletions (State pgf pinfo chart items) w = , passive=emptyPC , offset =offset chart1+1 } - in fmap (State pgf pinfo chart2) acc' + in fmap (PState pgf pinfo chart2) acc' where add (tok:toks) item acc | isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc add _ item acc = acc +recoveryStates :: [Type] -> ErrorState -> (ParseState, Map.Map String ParseState) +recoveryStates open_types (EState pgf pinfo chart) = + let open_fcats = concatMap type2fcats open_types + agenda = foldl (complete open_fcats) [] (actives chart) + (acc,chart1) = process Nothing add (sequences pinfo) (functions pinfo) agenda Map.empty chart + chart2 = chart1{ active =emptyAC + , actives=active chart1 : actives chart1 + , passive=emptyPC + , offset =offset chart1+1 + } + in (PState pgf pinfo chart (TMap.singleton [] (Set.fromList agenda)), fmap (PState pgf pinfo chart2) acc) + where + type2fcats (DTyp _ cat _) = fromMaybe [] (Map.lookup cat (startCats pinfo)) + + complete open_fcats items ac = + foldl (Set.fold (\(Active j' ppos funid seqid args keyc) -> + (:) (Active j' (ppos+1) funid seqid args keyc))) + items + [set | fcat <- open_fcats, set <- lookupACByFCat fcat ac] + + add (tok:toks) item acc = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc + -- | This function extracts the list of all completed parse trees -- that spans the whole input consumed so far. The trees are also -- limited by the category specified, which is usually @@ -189,7 +236,7 @@ process mbt fn !seqs !funs (item@(Active j ppos funid seqid args key0):items) ac Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) -> let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set - in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart) + in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart) ,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart) ,nextId =nextId chart+1 } @@ -243,6 +290,12 @@ emptyAC = IntMap.empty lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active) lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l +lookupACByFCat :: FCat -> ActiveChart -> [Set.Set Active] +lookupACByFCat fcat chart = + case IntMap.lookup fcat chart of + Nothing -> [] + Just map -> IntMap.elems map + labelsAC :: FCat -> ActiveChart -> [FIndex] labelsAC fcat chart = case IntMap.lookup fcat chart of @@ -296,7 +349,7 @@ foldForest f g b fcat forest = -- | An abstract data type whose values represent -- the current state in an incremental parser. -data ParseState = State PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active)) +data ParseState = PState PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active)) data Chart = Chart @@ -308,3 +361,11 @@ data Chart , offset :: {-# UNPACK #-} !Int } deriving Show + +---------------------------------------------------------------- +-- Error State +---------------------------------------------------------------- + +-- | An abstract data type whose values represent +-- the state in an incremental parser after an error. +data ErrorState = EState PGF ParserInfo Chart