diff --git a/doc/gf-history.html b/doc/gf-history.html index a6b3b970b..81ca8106a 100644 --- a/doc/gf-history.html +++ b/doc/gf-history.html @@ -13,6 +13,19 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2

+17/11 (AR) Made it possible for lexers to be nondeterministic. +Now with a simple-minded implementation in which the parser is sent +each lexing result in turn. The option -cut is used for +breaking after first lexing leading to successful parse. The only +nondeterministic lexer right now is -lexer=subseqs, which +first filters with -lexer=ignore (dropping words neither in +the grammar nor literals) and then starts ignoring other words from +longest to shortest subsequence. This is usable for parser tasks +of keyword spotting type, but expensive (2^n) in long input. +A smarter implementation is therefore desirable. + +

+ 14/11 (AR) Functions can be made unparsable (or "internal" as in BNFC). This is done by i -noparse=file, where the nonparsable functions are given in file using the diff --git a/src/GF/Data/Operations.hs b/src/GF/Data/Operations.hs index 339a053cf..f5434486f 100644 --- a/src/GF/Data/Operations.hs +++ b/src/GF/Data/Operations.hs @@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions updateAssoc, removeAssoc, -- * chop into separator-separated parts - chunks, readIntArg, + chunks, readIntArg, subSequences, -- * state monad with error; from Agda 6\/11\/2001 STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done, -- * error monad class - ErrorMonad(..), checkAgain, checks, allChecks - + ErrorMonad(..), checkAgain, checks, allChecks, doUntil + ) where import Data.Char (isSpace, toUpper, isSpace, isDigit) @@ -656,3 +656,16 @@ allChecks ms = case ms of (m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs _ -> return [] +doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a +doUntil cond ms = case ms of + a:as -> do + v <- a + if cond v then return v else doUntil cond as + _ -> raise "no result" + +-- subsequences sorted from longest to shortest ; their number is 2^n +subSequences :: [a] -> [[a]] +subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where + subs xs = case xs of + [] -> [[]] + x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index 61532737b..8ec2971b6 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/11/14 16:03:41 $ +-- > CVS $Date: 2005/05/12 10:03:34 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.20 $ +-- > CVS $Revision: 1.9 $ -- -- Help on shell commands. Generated from HelpFile by 'make help'. -- PLEASE DON'T EDIT THIS FILE. 
@@ -198,6 +198,7 @@ txtHelpFile = "\n -lines parse each line of input separately, ignoring empty lines" ++ "\n -all as -lines, but also parse empty lines" ++ "\n -prob rank results by probability" ++ + "\n -cut stop after first lexing result leading to parser success" ++ "\n options for selecting parsing method:" ++ "\n (default)parse using an overgenerating CFG" ++ "\n -cfg parse using a much less overgenerating CFG" ++ @@ -531,6 +532,8 @@ txtHelpFile = "\n -lexer=codelit like code, but treat unknown words as string literals" ++ "\n -lexer=textlit like text, but treat unknown words as string literals" ++ "\n -lexer=codeC use a C-like lexer" ++ + "\n -lexer=ignore like literals, but ignore unknown words" ++ + "\n -lexer=subseqs like ignore, but then try all subsequences from longest" ++ "\n" ++ "\n-number, the maximum number of generated items in a list. " ++ "\n The default is unlimited." ++ diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index 56cedc202..121d8cda6 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -173,7 +173,7 @@ optionsOfCommand co = case co of CTransformGrammar _ -> flags "printer" CConvertLatex _ -> none CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer" - CParse -> both "new newer cfg mcfg n ign raw v lines all prob" + CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" "cat lang lexer parser number rawtrees" CTranslate _ _ -> opts "cat lexer parser" CGenerateRandom -> both "cf prob" "cat lang number depth" diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 75294ff4b..26bad1ee9 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -161,7 +161,7 @@ customStringCommand :: CustomData (StateGrammar -> String -> String) customParser :: CustomData (StateGrammar -> CFCat -> CFParser) -- | useTokenizer, \"-lexer=x\" -customTokenizer :: CustomData (StateGrammar -> String -> [CFTok]) +customTokenizer 
:: CustomData (StateGrammar -> String -> [[CFTok]]) -- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string customUntokenizer :: CustomData (StateGrammar -> String -> String) @@ -416,22 +416,24 @@ customParser = -- add your own parsers here ] -customTokenizer = +customTokenizer = + let sg = singleton in customData "Tokenizers, selected by option -lexer=x" $ [ - (strCI "words", const $ tokWords) - ,(strCI "literals", const $ tokLits) - ,(strCI "vars", const $ tokVars) - ,(strCI "chars", const $ map (tS . singleton)) - ,(strCI "code", const $ lexHaskell) - ,(strCI "codevars", lexHaskellVar . stateIsWord) - ,(strCI "text", const $ lexText) - ,(strCI "unglue", \gr -> map tS . decomposeWords (stateMorpho gr)) - ,(strCI "codelit", lexHaskellLiteral . stateIsWord) - ,(strCI "textlit", lexTextLiteral . stateIsWord) - ,(strCI "codeC", const $ lexC2M) - ,(strCI "ignore", \gr -> lexIgnore (stateIsWord gr) . tokLits) - ,(strCI "codeCHigh", const $ lexC2M' True) + (strCI "words", const $ sg . tokWords) + ,(strCI "literals", const $ sg . tokLits) + ,(strCI "vars", const $ sg . tokVars) + ,(strCI "chars", const $ sg . map (tS . singleton)) + ,(strCI "code", const $ sg . lexHaskell) + ,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr)) + ,(strCI "text", const $ sg . lexText) + ,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr)) + ,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr)) + ,(strCI "textlit", \gr -> sg . (lexTextLiteral $ stateIsWord gr)) + ,(strCI "codeC", const $ sg . lexC2M) + ,(strCI "ignore", \gr -> sg . lexIgnore (stateIsWord gr) . tokLits) + ,(strCI "subseqs", \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits) + ,(strCI "codeCHigh", const $ sg . 
lexC2M' True) -- add your own tokenizers here ] diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs index a9da37df5..a4699bcab 100644 --- a/src/GF/UseGrammar/Parsing.hs +++ b/src/GF/UseGrammar/Parsing.hs @@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New import GF.Data.Operations -import Data.List (nub) +import Data.List (nub,sortBy) import Control.Monad (liftM) -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 @@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String) parseStringMsg os sg cat s = do (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s - return (ts,unlines ss) + return (ts, unlines $ reverse ss) parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree] @@ -73,7 +73,10 @@ parseStringC opts0 sg cat s | otherwise = "c" -- default algorithm strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown tokenizer = customOrDefault opts useTokenizer customTokenizer sg - ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s) + toks = case tokenizer s of + t:_ -> t + _ -> [] ---- no support for undet. tok. + ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts return $ optIntOrAll opts flagNumber ts' @@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do cf = stateCF sg gr = stateGrammarST sg cn = cncId sg - tok = customOrDefault opts useTokenizer customTokenizer sg + toks = customOrDefault opts useTokenizer customTokenizer sg s parser = customOrDefault opts useParser customParser sg cat - tokens2trms opts sg cn parser (tok s) - + if oElem (iOpt "cut") opts + then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks + else mapM (tokens2trms opts sg cn parser) toks >>= return . 
concat tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info @@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info info = snd result trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2)) -trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree] +trees2trms :: + Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree] trees2trms opts sg cn as ts0 info = do + let s = unwords $ map prCFTok as ts <- case () of - _ | null ts0 -> checkWarn "No success in cf parsing" >> return [] + _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return [] _ | raw -> do ts1 <- return (map cf2trm0 ts0) ----- should not need annot checks [ diff --git a/src/HelpFile b/src/HelpFile index 08cd01074..09c923c3f 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -169,6 +169,7 @@ p, parse: p String -lines parse each line of input separately, ignoring empty lines -all as -lines, but also parse empty lines -prob rank results by probability + -cut stop after first lexing result leading to parser success options for selecting parsing method: (default)parse using an overgenerating CFG -cfg parse using a much less overgenerating CFG @@ -502,6 +503,8 @@ q, quit: q -lexer=codelit like code, but treat unknown words as string literals -lexer=textlit like text, but treat unknown words as string literals -lexer=codeC use a C-like lexer + -lexer=ignore like literals, but ignore unknown words + -lexer=subseqs like ignore, but then try all subsequences from longest -number, the maximum number of generated items in a list. The default is unlimited.