nondeterministic lexer, e.g. subseqs

This commit is contained in:
aarne
2005-11-17 23:17:42 +00:00
parent e29a1430bf
commit 524c4829f9
7 changed files with 69 additions and 29 deletions

View File

@@ -13,6 +13,19 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
</center>
<p>
17/11 (AR) Made it possible for lexers to be nondeterministic.
Now with a simple-minded implementation that the parser is sent
each lexing result in turn. The option <tt>-cut</tt> is used for
breaking after first lexing leading to successful parse. The only
nondeterministic lexer right now is <tt>-lexer=subseqs</tt>, which
first filters with <tt>-lexer=ignore</tt> (dropping words neither in
the grammar nor literals) and then starts ignoring other words from
longest to shortest subsequence. This is usable for parser tasks
of keyword spotting type, but expensive (2<sup>n</sup>) in long input.
A smarter implementation is therefore desirable.
<p>
14/11 (AR) Functions can be made unparsable (or "internal" as
in BNFC). This is done by <tt>i -noparse=file</tt>, where
the nonparsable functions are given in <tt>file</tt> using the

View File

@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
updateAssoc, removeAssoc,
-- * chop into separator-separated parts
chunks, readIntArg,
chunks, readIntArg, subSequences,
-- * state monad with error; from Agda 6\/11\/2001
STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
-- * error monad class
ErrorMonad(..), checkAgain, checks, allChecks
ErrorMonad(..), checkAgain, checks, allChecks, doUntil
) where
import Data.Char (isSpace, toUpper, isSpace, isDigit)
@@ -656,3 +656,16 @@ allChecks ms = case ms of
(m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
_ -> return []
doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
doUntil cond ms = case ms of
a:as -> do
v <- a
if cond v then return v else doUntil cond as
_ -> raise "no result"
-- subsequences sorted from longest to shortest ; their number is 2^n
subSequences :: [a] -> [[a]]
subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
subs xs = case xs of
[] -> [[]]
x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/11/14 16:03:41 $
-- > CVS $Date: 2005/05/12 10:03:34 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.20 $
-- > CVS $Revision: 1.9 $
--
-- Help on shell commands. Generated from HelpFile by 'make help'.
-- PLEASE DON'T EDIT THIS FILE.
@@ -198,6 +198,7 @@ txtHelpFile =
"\n -lines parse each line of input separately, ignoring empty lines" ++
"\n -all as -lines, but also parse empty lines" ++
"\n -prob rank results by probability" ++
"\n -cut stop after first lexing result leading to parser success" ++
"\n options for selecting parsing method:" ++
"\n (default)parse using an overgenerating CFG" ++
"\n -cfg parse using a much less overgenerating CFG" ++
@@ -531,6 +532,8 @@ txtHelpFile =
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
"\n -lexer=codeC use a C-like lexer" ++
"\n -lexer=ignore like literals, but ignore unknown words" ++
"\n -lexer=subseqs like ignore, but then try all subsequences from longest" ++
"\n" ++
"\n-number, the maximum number of generated items in a list. " ++
"\n The default is unlimited." ++

View File

@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
CParse -> both "new newer cfg mcfg n ign raw v lines all prob"
CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> both "cf prob" "cat lang number depth"

View File

@@ -161,7 +161,7 @@ customStringCommand :: CustomData (StateGrammar -> String -> String)
customParser :: CustomData (StateGrammar -> CFCat -> CFParser)
-- | useTokenizer, \"-lexer=x\"
customTokenizer :: CustomData (StateGrammar -> String -> [CFTok])
customTokenizer :: CustomData (StateGrammar -> String -> [[CFTok]])
-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
customUntokenizer :: CustomData (StateGrammar -> String -> String)
@@ -416,22 +416,24 @@ customParser =
-- add your own parsers here
]
customTokenizer =
customTokenizer =
let sg = singleton in
customData "Tokenizers, selected by option -lexer=x" $
[
(strCI "words", const $ tokWords)
,(strCI "literals", const $ tokLits)
,(strCI "vars", const $ tokVars)
,(strCI "chars", const $ map (tS . singleton))
,(strCI "code", const $ lexHaskell)
,(strCI "codevars", lexHaskellVar . stateIsWord)
,(strCI "text", const $ lexText)
,(strCI "unglue", \gr -> map tS . decomposeWords (stateMorpho gr))
,(strCI "codelit", lexHaskellLiteral . stateIsWord)
,(strCI "textlit", lexTextLiteral . stateIsWord)
,(strCI "codeC", const $ lexC2M)
,(strCI "ignore", \gr -> lexIgnore (stateIsWord gr) . tokLits)
,(strCI "codeCHigh", const $ lexC2M' True)
(strCI "words", const $ sg . tokWords)
,(strCI "literals", const $ sg . tokLits)
,(strCI "vars", const $ sg . tokVars)
,(strCI "chars", const $ sg . map (tS . singleton))
,(strCI "code", const $ sg . lexHaskell)
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
,(strCI "text", const $ sg . lexText)
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
,(strCI "textlit", \gr -> sg . (lexTextLiteral $ stateIsWord gr))
,(strCI "codeC", const $ sg . lexC2M)
,(strCI "ignore", \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
,(strCI "subseqs", \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
,(strCI "codeCHigh", const $ sg . lexC2M' True)
-- add your own tokenizers here
]

View File

@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New
import GF.Data.Operations
import Data.List (nub)
import Data.List (nub,sortBy)
import Control.Monad (liftM)
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
parseStringMsg os sg cat s = do
(ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
return (ts,unlines ss)
return (ts, unlines $ reverse ss)
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
| otherwise = "c" -- default algorithm
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
toks = case tokenizer s of
t:_ -> t
_ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
cf = stateCF sg
gr = stateGrammarST sg
cn = cncId sg
tok = customOrDefault opts useTokenizer customTokenizer sg
toks = customOrDefault opts useTokenizer customTokenizer sg s
parser = customOrDefault opts useParser customParser sg cat
tokens2trms opts sg cn parser (tok s)
if oElem (iOpt "cut") opts
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
info = snd result
trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
trees2trms ::
Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
trees2trms opts sg cn as ts0 info = do
let s = unwords $ map prCFTok as
ts <- case () of
_ | null ts0 -> checkWarn "No success in cf parsing" >> return []
_ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
_ | raw -> do
ts1 <- return (map cf2trm0 ts0) ----- should not need annot
checks [

View File

@@ -169,6 +169,7 @@ p, parse: p String
-lines parse each line of input separately, ignoring empty lines
-all as -lines, but also parse empty lines
-prob rank results by probability
-cut stop after first lexing result leading to parser success
options for selecting parsing method:
(default)parse using an overgenerating CFG
-cfg parse using a much less overgenerating CFG
@@ -502,6 +503,8 @@ q, quit: q
-lexer=codelit like code, but treat unknown words as string literals
-lexer=textlit like text, but treat unknown words as string literals
-lexer=codeC use a C-like lexer
-lexer=ignore like literals, but ignore unknown words
-lexer=subseqs like ignore, but then try all subsequences from longest
-number, the maximum number of generated items in a list.
The default is unlimited.