nondeterministic lexer, e.g. subseqs

2026-06-28 04:16:28 -06:00 · 2005-11-17 23:17:42 +00:00
parent e29a1430bf
commit 524c4829f9
7 changed files with 69 additions and 29 deletions
@@ -13,6 +13,19 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
 </center>
 <p>
 17/11 (AR) Made it possible for lexers to be nondeterministic.
 Now with a simple-minded implementation that the parser is sent
 each lexing result in turn. The option <tt>-cut</tt> is used for
 breaking after first lexing leading to successful parse. The only
 nondeterministic lexer right now is <tt>-lexer=subseqs</tt>, which
 first filters with <tt>-lexer=ignore</tt> (dropping words neither in
 the grammar nor literals) and then starts ignoring other words from
 longest to shortest subsequence. This is usable for parser tasks
 of keyword spotting type, but expensive (2<sup>n</sup>) in long input.
 A smarter implementation is therefore desirable.
 <p>
 14/11 (AR) Functions can be made unparsable (or "internal" as
 in BNFC). This is done by <tt>i -noparse=file</tt>, where
 the nonparsable functions are given in <tt>file</tt> using the
@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
 		   updateAssoc, removeAssoc,
 		   -- * chop into separator-separated parts
-		   chunks, readIntArg,
+		   chunks, readIntArg, subSequences,
 		   -- * state monad with error; from Agda 6\/11\/2001
 		   STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
 		   -- * error monad class
-		   ErrorMonad(..), checkAgain, checks, allChecks
+		   ErrorMonad(..), checkAgain, checks, allChecks, doUntil
-
+                
 		  ) where
 import Data.Char (isSpace, toUpper, isSpace, isDigit)
@@ -656,3 +656,16 @@ allChecks ms = case ms of
  (m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
  _ -> return []
 doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
 doUntil cond ms = case ms of
  a:as -> do
    v <- a
    if cond v then return v else doUntil cond as
  _ -> raise "no result"
 -- subsequences sorted from longest to shortest ; their number is 2^n
 subSequences :: [a] -> [[a]]
 subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
  subs xs = case xs of
    [] -> [[]]
    x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
-- > CVS $Date: 2005/11/14 16:03:41 $
+-- > CVS $Date: 2005/05/12 10:03:34 $
 -- > CVS $Author: aarne $
-- > CVS $Revision: 1.20 $
+-- > CVS $Revision: 1.9 $
 --
 -- Help on shell commands. Generated from HelpFile by 'make help'.
 -- PLEASE DON'T EDIT THIS FILE.
@@ -198,6 +198,7 @@ txtHelpFile =
  "\n      -lines   parse each line of input separately, ignoring empty lines" ++
  "\n      -all     as -lines, but also parse empty lines" ++
  "\n      -prob    rank results by probability" ++
  "\n      -cut     stop after first lexing result leading to parser success" ++
  "\n  options for selecting parsing method:" ++
  "\n      (default)parse using an overgenerating CFG" ++
  "\n      -cfg     parse using a much less overgenerating CFG" ++
@@ -531,6 +532,8 @@ txtHelpFile =
  "\n    -lexer=codelit       like code, but treat unknown words as string literals" ++
  "\n    -lexer=textlit       like text, but treat unknown words as string literals" ++
  "\n    -lexer=codeC         use a C-like lexer" ++
  "\n    -lexer=ignore        like literals, but ignore unknown words" ++
  "\n    -lexer=subseqs       like ignore, but then try all subsequences from longest" ++
  "\n" ++
  "\n-number, the maximum number of generated items in a list. " ++
  "\n       The default is unlimited." ++
@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
  CTransformGrammar _ -> flags "printer"
  CConvertLatex _ -> none
  CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
-  CParse -> both "new newer cfg mcfg n ign raw v lines all prob" 
+  CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" 
                 "cat lang lexer parser number rawtrees"
  CTranslate _ _ -> opts "cat lexer parser"
  CGenerateRandom -> both "cf prob" "cat lang number depth"
@@ -161,7 +161,7 @@ customStringCommand  :: CustomData (StateGrammar -> String -> String)
 customParser         :: CustomData (StateGrammar -> CFCat -> CFParser)
 -- | useTokenizer, \"-lexer=x\"
-customTokenizer      :: CustomData (StateGrammar -> String -> [CFTok])  
+customTokenizer      :: CustomData (StateGrammar -> String -> [[CFTok]])  
 -- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
 customUntokenizer    :: CustomData (StateGrammar -> String -> String)  
@@ -416,22 +416,24 @@ customParser =
 -- add your own parsers here
  ]
-customTokenizer = 
+customTokenizer =
  let sg = singleton in 
  customData "Tokenizers, selected by option -lexer=x" $
  [
-   (strCI "words",     const $ tokWords)
+   (strCI "words",     const $ sg . tokWords)
-  ,(strCI "literals",  const $ tokLits)
+  ,(strCI "literals",  const $ sg . tokLits)
-  ,(strCI "vars",      const $ tokVars)
+  ,(strCI "vars",      const $ sg . tokVars)
-  ,(strCI "chars",     const $ map (tS . singleton))
+  ,(strCI "chars",     const $ sg . map (tS . singleton))
-  ,(strCI "code",      const $ lexHaskell)
+  ,(strCI "code",      const $ sg . lexHaskell)
-  ,(strCI "codevars",  lexHaskellVar . stateIsWord)
+  ,(strCI "codevars",  \gr -> sg . (lexHaskellVar $ stateIsWord gr))
-  ,(strCI "text",      const $ lexText)
+  ,(strCI "text",      const $ sg . lexText)
-  ,(strCI "unglue",    \gr -> map tS . decomposeWords (stateMorpho gr))
+  ,(strCI "unglue",    \gr -> sg . map tS . decomposeWords (stateMorpho gr))
-  ,(strCI "codelit",   lexHaskellLiteral . stateIsWord)
+  ,(strCI "codelit",   \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
-  ,(strCI "textlit",   lexTextLiteral . stateIsWord)
+  ,(strCI "textlit",   \gr -> sg . (lexTextLiteral $ stateIsWord gr))
-  ,(strCI "codeC",     const $ lexC2M)
+  ,(strCI "codeC",     const $ sg . lexC2M)
-  ,(strCI "ignore",    \gr -> lexIgnore (stateIsWord gr) . tokLits)
+  ,(strCI "ignore",    \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
-  ,(strCI "codeCHigh", const $ lexC2M' True)
+  ,(strCI "subseqs",   \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
  ,(strCI "codeCHigh", const $ sg . lexC2M' True)
 -- add your own tokenizers here
  ]
@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New
 import GF.Data.Operations
-import Data.List (nub)
+import Data.List (nub,sortBy)
 import Control.Monad (liftM)
 -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
 parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
 parseStringMsg os sg cat s = do
  (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
-  return (ts,unlines ss)
+  return (ts, unlines $ reverse ss)
 parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
 		| otherwise              = "c" -- default algorithm
      strategy  = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
      tokenizer = customOrDefault opts useTokenizer customTokenizer sg
-  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
+      toks = case tokenizer s of
               t:_ -> t
               _ -> [] ---- no support for undet. tok.
  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
  ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
  return $ optIntOrAll opts flagNumber ts'
@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
      cf  = stateCF sg
      gr  = stateGrammarST sg
      cn  = cncId sg
-      tok = customOrDefault opts useTokenizer customTokenizer sg
+      toks = customOrDefault opts useTokenizer customTokenizer sg s
      parser = customOrDefault opts useParser customParser sg cat
-  tokens2trms opts sg cn parser (tok s)
+  if oElem (iOpt "cut") opts 
-
+    then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
    else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
 tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
 tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
 	  info   = snd result
 	  trees  = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
-trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
+trees2trms :: 
  Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
 trees2trms opts sg cn as ts0 info = do
  let s = unwords $ map prCFTok as
  ts  <- case () of
-    _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
+    _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
    _ | raw      -> do
      ts1 <- return (map cf2trm0 ts0) ----- should not need annot
      checks [
@@ -169,6 +169,7 @@ p,  parse: p String
      -lines   parse each line of input separately, ignoring empty lines
      -all     as -lines, but also parse empty lines
      -prob    rank results by probability
      -cut     stop after first lexing result leading to parser success
  options for selecting parsing method:
      (default)parse using an overgenerating CFG
      -cfg     parse using a much less overgenerating CFG
@@ -502,6 +503,8 @@ q, quit: q
    -lexer=codelit       like code, but treat unknown words as string literals
    -lexer=textlit       like text, but treat unknown words as string literals
    -lexer=codeC         use a C-like lexer
    -lexer=ignore        like literals, but ignore unknown words
    -lexer=subseqs       like ignore, but then try all subsequences from longest
 -number, the maximum number of generated items in a list. 
       The default is unlimited.