mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 19:42:50 -06:00
nondeterministic lexer, e.g. subseqs
This commit is contained in:
@@ -13,6 +13,19 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
|
|||||||
</center>
|
</center>
|
||||||
<p>
|
<p>
|
||||||
|
|
||||||
|
17/11 (AR) Made it possible for lexers to be nondeterministic.
|
||||||
|
Now with a simple-minded implementation in which the parser is sent
|
||||||
|
each lexing result in turn. The option <tt>-cut</tt> is used for
|
||||||
|
breaking after the first lexing that leads to a successful parse. The only
|
||||||
|
nondeterministic lexer right now is <tt>-lexer=subseqs</tt>, which
|
||||||
|
first filters with <tt>-lexer=ignore</tt> (dropping words neither in
|
||||||
|
the grammar nor literals) and then starts ignoring other words from
|
||||||
|
longest to shortest subsequence. This is usable for parser tasks
|
||||||
|
of the keyword-spotting type, but expensive (2<sup>n</sup>) on long input.
|
||||||
|
A smarter implementation is therefore desirable.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
|
||||||
14/11 (AR) Functions can be made unparsable (or "internal" as
|
14/11 (AR) Functions can be made unparsable (or "internal" as
|
||||||
in BNFC). This is done by <tt>i -noparse=file</tt>, where
|
in BNFC). This is done by <tt>i -noparse=file</tt>, where
|
||||||
the nonparsable functions are given in <tt>file</tt> using the
|
the nonparsable functions are given in <tt>file</tt> using the
|
||||||
|
|||||||
@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
|
|||||||
updateAssoc, removeAssoc,
|
updateAssoc, removeAssoc,
|
||||||
|
|
||||||
-- * chop into separator-separated parts
|
-- * chop into separator-separated parts
|
||||||
chunks, readIntArg,
|
chunks, readIntArg, subSequences,
|
||||||
|
|
||||||
-- * state monad with error; from Agda 6\/11\/2001
|
-- * state monad with error; from Agda 6\/11\/2001
|
||||||
STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
|
STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
|
||||||
|
|
||||||
-- * error monad class
|
-- * error monad class
|
||||||
ErrorMonad(..), checkAgain, checks, allChecks
|
ErrorMonad(..), checkAgain, checks, allChecks, doUntil
|
||||||
|
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.Char (isSpace, toUpper, isSpace, isDigit)
|
import Data.Char (isSpace, toUpper, isSpace, isDigit)
|
||||||
@@ -656,3 +656,16 @@ allChecks ms = case ms of
|
|||||||
(m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
|
(m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
|
||||||
_ -> return []
|
_ -> return []
|
||||||
|
|
||||||
|
-- | Run the computations in @ms@ one at a time, in list order, and return
-- the first result for which @cond@ holds. Once a result is accepted the
-- remaining computations are not run. If the list is empty, or is exhausted
-- without any accepted result, the error \"no result\" is raised.
-- NOTE(review): a computation that itself fails presumably aborts the whole
-- search through the monad's bind -- confirm against the ErrorMonad instances.
doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
|
||||||
|
doUntil cond ms = case ms of
|
||||||
|
a:as -> do
|
||||||
|
v <- a
|
||||||
|
if cond v then return v else doUntil cond as
|
||||||
|
-- nothing left to try: every earlier result (if any) was rejected by cond
_ -> raise "no result"
|
||||||
|
|
||||||
|
-- | All subsequences of a list (each element either kept or dropped, with
-- the kept elements in their original order), sorted from longest to
-- shortest. For an n-element input there are 2^n of them, so this is only
-- practical for short lists.
|
||||||
|
subSequences :: [a] -> [[a]]
|
||||||
|
-- the comparator swaps its arguments (length y before length x), which
-- yields descending order of length
subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
|
||||||
|
subs xs = case xs of
|
||||||
|
[] -> [[]]
|
||||||
|
-- every subsequence of the tail, first with x in front and then without it;
-- the pattern variable xs shadows the outer xs here
x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss
|
||||||
|
|||||||
@@ -5,9 +5,9 @@
|
|||||||
-- Stability : (stable)
|
-- Stability : (stable)
|
||||||
-- Portability : (portable)
|
-- Portability : (portable)
|
||||||
--
|
--
|
||||||
-- > CVS $Date: 2005/11/14 16:03:41 $
|
-- > CVS $Date: 2005/05/12 10:03:34 $
|
||||||
-- > CVS $Author: aarne $
|
-- > CVS $Author: aarne $
|
||||||
-- > CVS $Revision: 1.20 $
|
-- > CVS $Revision: 1.9 $
|
||||||
--
|
--
|
||||||
-- Help on shell commands. Generated from HelpFile by 'make help'.
|
-- Help on shell commands. Generated from HelpFile by 'make help'.
|
||||||
-- PLEASE DON'T EDIT THIS FILE.
|
-- PLEASE DON'T EDIT THIS FILE.
|
||||||
@@ -198,6 +198,7 @@ txtHelpFile =
|
|||||||
"\n -lines parse each line of input separately, ignoring empty lines" ++
|
"\n -lines parse each line of input separately, ignoring empty lines" ++
|
||||||
"\n -all as -lines, but also parse empty lines" ++
|
"\n -all as -lines, but also parse empty lines" ++
|
||||||
"\n -prob rank results by probability" ++
|
"\n -prob rank results by probability" ++
|
||||||
|
"\n -cut stop after first lexing result leading to parser success" ++
|
||||||
"\n options for selecting parsing method:" ++
|
"\n options for selecting parsing method:" ++
|
||||||
"\n (default)parse using an overgenerating CFG" ++
|
"\n (default)parse using an overgenerating CFG" ++
|
||||||
"\n -cfg parse using a much less overgenerating CFG" ++
|
"\n -cfg parse using a much less overgenerating CFG" ++
|
||||||
@@ -531,6 +532,8 @@ txtHelpFile =
|
|||||||
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
|
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
|
||||||
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
|
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
|
||||||
"\n -lexer=codeC use a C-like lexer" ++
|
"\n -lexer=codeC use a C-like lexer" ++
|
||||||
|
"\n -lexer=ignore like literals, but ignore unknown words" ++
|
||||||
|
"\n -lexer=subseqs like ignore, but then try all subsequences from longest" ++
|
||||||
"\n" ++
|
"\n" ++
|
||||||
"\n-number, the maximum number of generated items in a list. " ++
|
"\n-number, the maximum number of generated items in a list. " ++
|
||||||
"\n The default is unlimited." ++
|
"\n The default is unlimited." ++
|
||||||
|
|||||||
@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
|
|||||||
CTransformGrammar _ -> flags "printer"
|
CTransformGrammar _ -> flags "printer"
|
||||||
CConvertLatex _ -> none
|
CConvertLatex _ -> none
|
||||||
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
|
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
|
||||||
CParse -> both "new newer cfg mcfg n ign raw v lines all prob"
|
CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob"
|
||||||
"cat lang lexer parser number rawtrees"
|
"cat lang lexer parser number rawtrees"
|
||||||
CTranslate _ _ -> opts "cat lexer parser"
|
CTranslate _ _ -> opts "cat lexer parser"
|
||||||
CGenerateRandom -> both "cf prob" "cat lang number depth"
|
CGenerateRandom -> both "cf prob" "cat lang number depth"
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ customStringCommand :: CustomData (StateGrammar -> String -> String)
|
|||||||
customParser :: CustomData (StateGrammar -> CFCat -> CFParser)
|
customParser :: CustomData (StateGrammar -> CFCat -> CFParser)
|
||||||
|
|
||||||
-- | useTokenizer, \"-lexer=x\"
|
-- | useTokenizer, \"-lexer=x\"
|
||||||
customTokenizer :: CustomData (StateGrammar -> String -> [CFTok])
|
customTokenizer :: CustomData (StateGrammar -> String -> [[CFTok]])
|
||||||
|
|
||||||
-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
|
-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
|
||||||
customUntokenizer :: CustomData (StateGrammar -> String -> String)
|
customUntokenizer :: CustomData (StateGrammar -> String -> String)
|
||||||
@@ -416,22 +416,24 @@ customParser =
|
|||||||
-- add your own parsers here
|
-- add your own parsers here
|
||||||
]
|
]
|
||||||
|
|
||||||
customTokenizer =
|
customTokenizer =
|
||||||
|
let sg = singleton in
|
||||||
customData "Tokenizers, selected by option -lexer=x" $
|
customData "Tokenizers, selected by option -lexer=x" $
|
||||||
[
|
[
|
||||||
(strCI "words", const $ tokWords)
|
(strCI "words", const $ sg . tokWords)
|
||||||
,(strCI "literals", const $ tokLits)
|
,(strCI "literals", const $ sg . tokLits)
|
||||||
,(strCI "vars", const $ tokVars)
|
,(strCI "vars", const $ sg . tokVars)
|
||||||
,(strCI "chars", const $ map (tS . singleton))
|
,(strCI "chars", const $ sg . map (tS . singleton))
|
||||||
,(strCI "code", const $ lexHaskell)
|
,(strCI "code", const $ sg . lexHaskell)
|
||||||
,(strCI "codevars", lexHaskellVar . stateIsWord)
|
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
|
||||||
,(strCI "text", const $ lexText)
|
,(strCI "text", const $ sg . lexText)
|
||||||
,(strCI "unglue", \gr -> map tS . decomposeWords (stateMorpho gr))
|
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
|
||||||
,(strCI "codelit", lexHaskellLiteral . stateIsWord)
|
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
|
||||||
,(strCI "textlit", lexTextLiteral . stateIsWord)
|
,(strCI "textlit", \gr -> sg . (lexTextLiteral $ stateIsWord gr))
|
||||||
,(strCI "codeC", const $ lexC2M)
|
,(strCI "codeC", const $ sg . lexC2M)
|
||||||
,(strCI "ignore", \gr -> lexIgnore (stateIsWord gr) . tokLits)
|
,(strCI "ignore", \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
|
||||||
,(strCI "codeCHigh", const $ lexC2M' True)
|
,(strCI "subseqs", \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
|
||||||
|
,(strCI "codeCHigh", const $ sg . lexC2M' True)
|
||||||
-- add your own tokenizers here
|
-- add your own tokenizers here
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New
|
|||||||
|
|
||||||
import GF.Data.Operations
|
import GF.Data.Operations
|
||||||
|
|
||||||
import Data.List (nub)
|
import Data.List (nub,sortBy)
|
||||||
import Control.Monad (liftM)
|
import Control.Monad (liftM)
|
||||||
|
|
||||||
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
|
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
|
||||||
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
|
|||||||
parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
|
parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
|
||||||
parseStringMsg os sg cat s = do
|
parseStringMsg os sg cat s = do
|
||||||
(ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
|
(ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
|
||||||
return (ts,unlines ss)
|
return (ts, unlines $ reverse ss)
|
||||||
|
|
||||||
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
|
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
|
||||||
|
|
||||||
@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
|
|||||||
| otherwise = "c" -- default algorithm
|
| otherwise = "c" -- default algorithm
|
||||||
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
|
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
|
||||||
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
|
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
|
||||||
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
|
toks = case tokenizer s of
|
||||||
|
t:_ -> t
|
||||||
|
_ -> [] ---- no support for undet. tok.
|
||||||
|
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
|
||||||
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
|
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
|
||||||
return $ optIntOrAll opts flagNumber ts'
|
return $ optIntOrAll opts flagNumber ts'
|
||||||
|
|
||||||
@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
|
|||||||
cf = stateCF sg
|
cf = stateCF sg
|
||||||
gr = stateGrammarST sg
|
gr = stateGrammarST sg
|
||||||
cn = cncId sg
|
cn = cncId sg
|
||||||
tok = customOrDefault opts useTokenizer customTokenizer sg
|
toks = customOrDefault opts useTokenizer customTokenizer sg s
|
||||||
parser = customOrDefault opts useParser customParser sg cat
|
parser = customOrDefault opts useParser customParser sg cat
|
||||||
tokens2trms opts sg cn parser (tok s)
|
if oElem (iOpt "cut") opts
|
||||||
|
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
|
||||||
|
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
|
||||||
|
|
||||||
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
|
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
|
||||||
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
|
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
|
||||||
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
|
|||||||
info = snd result
|
info = snd result
|
||||||
trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
|
trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
|
||||||
|
|
||||||
trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
|
trees2trms ::
|
||||||
|
Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
|
||||||
trees2trms opts sg cn as ts0 info = do
|
trees2trms opts sg cn as ts0 info = do
|
||||||
|
let s = unwords $ map prCFTok as
|
||||||
ts <- case () of
|
ts <- case () of
|
||||||
_ | null ts0 -> checkWarn "No success in cf parsing" >> return []
|
_ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
|
||||||
_ | raw -> do
|
_ | raw -> do
|
||||||
ts1 <- return (map cf2trm0 ts0) ----- should not need annot
|
ts1 <- return (map cf2trm0 ts0) ----- should not need annot
|
||||||
checks [
|
checks [
|
||||||
|
|||||||
@@ -169,6 +169,7 @@ p, parse: p String
|
|||||||
-lines parse each line of input separately, ignoring empty lines
|
-lines parse each line of input separately, ignoring empty lines
|
||||||
-all as -lines, but also parse empty lines
|
-all as -lines, but also parse empty lines
|
||||||
-prob rank results by probability
|
-prob rank results by probability
|
||||||
|
-cut stop after first lexing result leading to parser success
|
||||||
options for selecting parsing method:
|
options for selecting parsing method:
|
||||||
(default)parse using an overgenerating CFG
|
(default)parse using an overgenerating CFG
|
||||||
-cfg parse using a much less overgenerating CFG
|
-cfg parse using a much less overgenerating CFG
|
||||||
@@ -502,6 +503,8 @@ q, quit: q
|
|||||||
-lexer=codelit like code, but treat unknown words as string literals
|
-lexer=codelit like code, but treat unknown words as string literals
|
||||||
-lexer=textlit like text, but treat unknown words as string literals
|
-lexer=textlit like text, but treat unknown words as string literals
|
||||||
-lexer=codeC use a C-like lexer
|
-lexer=codeC use a C-like lexer
|
||||||
|
-lexer=ignore like literals, but ignore unknown words
|
||||||
|
-lexer=subseqs like ignore, but then try all subsequences from longest
|
||||||
|
|
||||||
-number, the maximum number of generated items in a list.
|
-number, the maximum number of generated items in a list.
|
||||||
The default is unlimited.
|
The default is unlimited.
|
||||||
|
|||||||
Reference in New Issue
Block a user