nondeterministic lexer, e.g. subseqs

2026-07-08 14:42:46 -06:00 · 2005-11-17 23:17:42 +00:00
parent e29a1430bf
commit 524c4829f9
7 changed files with 69 additions and 29 deletions
@@ -13,6 +13,19 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
 </center>
 <p>

+17/11 (AR) Made it possible for lexers to be nondeterministic.
+Now with a simple-minded implementation that the parser is sent
+each lexing result in turn. The option <tt>-cut</tt> is used for
+breaking after first lexing leading to successful parse. The only
+nondeterministic lexer right now is <tt>-lexer=subseqs</tt>, which
+first filters with <tt>-lexer=ignore</tt> (dropping words neither in
+the grammar nor literals) and then starts ignoring other words from
+longest to shortest subsequence. This is usable for parser tasks
+of keyword spotting type, but expensive (2<sup>n</sup>) in long input.
+A smarter implementation is therefore desirable.
+
+<p>
+
 14/11 (AR) Functions can be made unparsable (or "internal" as
 in BNFC). This is done by <tt>i -noparse=file</tt>, where
 the nonparsable functions are given in <tt>file</tt> using the
@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
 		   updateAssoc, removeAssoc,

 		   -- * chop into separator-separated parts
-		   chunks, readIntArg,
+		   chunks, readIntArg, subSequences,

 		   -- * state monad with error; from Agda 6\/11\/2001
 		   STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,

 		   -- * error monad class
-		   ErrorMonad(..), checkAgain, checks, allChecks
-
+		   ErrorMonad(..), checkAgain, checks, allChecks, doUntil
+                
 		  ) where

 import Data.Char (isSpace, toUpper, isSpace, isDigit)
@@ -656,3 +656,16 @@ allChecks ms = case ms of
  (m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
  _ -> return []

+doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
+doUntil cond ms = case ms of
+  a:as -> do
+    v <- a
+    if cond v then return v else doUntil cond as
+  _ -> raise "no result"
+
+-- subsequences sorted from longest to shortest ; their number is 2^n
+subSequences :: [a] -> [[a]]
+subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
+  subs xs = case xs of
+    [] -> [[]]
+    x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
-- > CVS $Date: 2005/11/14 16:03:41 $
+-- > CVS $Date: 2005/05/12 10:03:34 $
 -- > CVS $Author: aarne $
-- > CVS $Revision: 1.20 $
+-- > CVS $Revision: 1.9 $
 --
 -- Help on shell commands. Generated from HelpFile by 'make help'.
 -- PLEASE DON'T EDIT THIS FILE.
@@ -198,6 +198,7 @@ txtHelpFile =
  "\n      -lines   parse each line of input separately, ignoring empty lines" ++
  "\n      -all     as -lines, but also parse empty lines" ++
  "\n      -prob    rank results by probability" ++
+  "\n      -cut     stop after first lexing result leading to parser success" ++
  "\n  options for selecting parsing method:" ++
  "\n      (default)parse using an overgenerating CFG" ++
  "\n      -cfg     parse using a much less overgenerating CFG" ++
@@ -531,6 +532,8 @@ txtHelpFile =
  "\n    -lexer=codelit       like code, but treat unknown words as string literals" ++
  "\n    -lexer=textlit       like text, but treat unknown words as string literals" ++
  "\n    -lexer=codeC         use a C-like lexer" ++
+  "\n    -lexer=ignore        like literals, but ignore unknown words" ++
+  "\n    -lexer=subseqs       like ignore, but then try all subsequences from longest" ++
  "\n" ++
  "\n-number, the maximum number of generated items in a list. " ++
  "\n       The default is unlimited." ++
@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
  CTransformGrammar _ -> flags "printer"
  CConvertLatex _ -> none
  CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
-  CParse -> both "new newer cfg mcfg n ign raw v lines all prob" 
+  CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" 
                 "cat lang lexer parser number rawtrees"
  CTranslate _ _ -> opts "cat lexer parser"
  CGenerateRandom -> both "cf prob" "cat lang number depth"
@@ -161,7 +161,7 @@ customStringCommand  :: CustomData (StateGrammar -> String -> String)
 customParser         :: CustomData (StateGrammar -> CFCat -> CFParser)

 -- | useTokenizer, \"-lexer=x\"
-customTokenizer      :: CustomData (StateGrammar -> String -> [CFTok])  
+customTokenizer      :: CustomData (StateGrammar -> String -> [[CFTok]])  

 -- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
 customUntokenizer    :: CustomData (StateGrammar -> String -> String)  
@@ -416,22 +416,24 @@ customParser =
 -- add your own parsers here
  ]

-customTokenizer = 
+customTokenizer =
+  let sg = singleton in 
  customData "Tokenizers, selected by option -lexer=x" $
  [
-   (strCI "words",     const $ tokWords)
-  ,(strCI "literals",  const $ tokLits)
-  ,(strCI "vars",      const $ tokVars)
-  ,(strCI "chars",     const $ map (tS . singleton))
-  ,(strCI "code",      const $ lexHaskell)
-  ,(strCI "codevars",  lexHaskellVar . stateIsWord)
-  ,(strCI "text",      const $ lexText)
-  ,(strCI "unglue",    \gr -> map tS . decomposeWords (stateMorpho gr))
-  ,(strCI "codelit",   lexHaskellLiteral . stateIsWord)
-  ,(strCI "textlit",   lexTextLiteral . stateIsWord)
-  ,(strCI "codeC",     const $ lexC2M)
-  ,(strCI "ignore",    \gr -> lexIgnore (stateIsWord gr) . tokLits)
-  ,(strCI "codeCHigh", const $ lexC2M' True)
+   (strCI "words",     const $ sg . tokWords)
+  ,(strCI "literals",  const $ sg . tokLits)
+  ,(strCI "vars",      const $ sg . tokVars)
+  ,(strCI "chars",     const $ sg . map (tS . singleton))
+  ,(strCI "code",      const $ sg . lexHaskell)
+  ,(strCI "codevars",  \gr -> sg . (lexHaskellVar $ stateIsWord gr))
+  ,(strCI "text",      const $ sg . lexText)
+  ,(strCI "unglue",    \gr -> sg . map tS . decomposeWords (stateMorpho gr))
+  ,(strCI "codelit",   \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
+  ,(strCI "textlit",   \gr -> sg . (lexTextLiteral $ stateIsWord gr))
+  ,(strCI "codeC",     const $ sg . lexC2M)
+  ,(strCI "ignore",    \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
+  ,(strCI "subseqs",   \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
+  ,(strCI "codeCHigh", const $ sg . lexC2M' True)
 -- add your own tokenizers here
  ]

@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New

 import GF.Data.Operations

-import Data.List (nub)
+import Data.List (nub,sortBy)
 import Control.Monad (liftM)

 -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
 parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
 parseStringMsg os sg cat s = do
  (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
-  return (ts,unlines ss)
+  return (ts, unlines $ reverse ss)

 parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]

@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
 		| otherwise              = "c" -- default algorithm
      strategy  = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
      tokenizer = customOrDefault opts useTokenizer customTokenizer sg
-  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
+      toks = case tokenizer s of
+               t:_ -> t
+               _ -> [] ---- no support for undet. tok.
+  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
  ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
  return $ optIntOrAll opts flagNumber ts'

@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
      cf  = stateCF sg
      gr  = stateGrammarST sg
      cn  = cncId sg
-      tok = customOrDefault opts useTokenizer customTokenizer sg
+      toks = customOrDefault opts useTokenizer customTokenizer sg s
      parser = customOrDefault opts useParser customParser sg cat
-  tokens2trms opts sg cn parser (tok s)
-
+  if oElem (iOpt "cut") opts 
+    then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
+    else mapM (tokens2trms opts sg cn parser) toks >>= return . concat

 tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
 tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
 	  info   = snd result
 	  trees  = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))

-trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
+trees2trms :: 
+  Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
 trees2trms opts sg cn as ts0 info = do
+  let s = unwords $ map prCFTok as
  ts  <- case () of
-    _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
+    _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
    _ | raw      -> do
      ts1 <- return (map cf2trm0 ts0) ----- should not need annot
      checks [
@@ -169,6 +169,7 @@ p,  parse: p String
      -lines   parse each line of input separately, ignoring empty lines
      -all     as -lines, but also parse empty lines
      -prob    rank results by probability
+      -cut     stop after first lexing result leading to parser success
  options for selecting parsing method:
      (default)parse using an overgenerating CFG
      -cfg     parse using a much less overgenerating CFG
@@ -502,6 +503,8 @@ q, quit: q
    -lexer=codelit       like code, but treat unknown words as string literals
    -lexer=textlit       like text, but treat unknown words as string literals
    -lexer=codeC         use a C-like lexer
+    -lexer=ignore        like literals, but ignore unknown words
+    -lexer=subseqs       like ignore, but then try all subsequences from longest

 -number, the maximum number of generated items in a list. 
       The default is unlimited.