mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 19:42:50 -06:00
made -fcfg default parser; added lexer textvars
This commit is contained in:
@@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
|
|||||||
|
|
||||||
</center>
|
</center>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
|
||||||
|
20/6 (AR) The FCFG parser is now the default, as it even handles literals.
|
||||||
|
The old default can be selected by <tt>p -old</tt>. Since
|
||||||
|
FCFG does not support variable bindings, <tt>-old</tt> is automatically
|
||||||
|
selected if the grammar has bindings - and unless the <tt>-fcfg</tt> flag
|
||||||
|
is used.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
|
||||||
17/6 (AR) The FCFG parser is now the recommended method for parsing
|
17/6 (AR) The FCFG parser is now the recommended method for parsing
|
||||||
heavy grammars such as the resource grammars. It does not yet support
|
heavy grammars such as the resource grammars. It does not yet support
|
||||||
literals and variable bindings.
|
literals and variable bindings.
|
||||||
|
|||||||
@@ -188,6 +188,10 @@ stateOptions = loptions
|
|||||||
stateGrammarWords = allMorphoWords . stateMorpho
|
stateGrammarWords = allMorphoWords . stateMorpho
|
||||||
stateGrammarLang st = (grammar st, cncId st)
|
stateGrammarLang st = (grammar st, cncId st)
|
||||||
|
|
||||||
|
---- this should be computed at compile time and stored
|
||||||
|
stateHasHOAS :: StateGrammar -> Bool
|
||||||
|
stateHasHOAS = hasHOAS . stateGrammarST
|
||||||
|
|
||||||
cncModuleIdST :: StateGrammar -> CanonGrammar
|
cncModuleIdST :: StateGrammar -> CanonGrammar
|
||||||
cncModuleIdST = stateGrammarST
|
cncModuleIdST = stateGrammarST
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar,
|
|||||||
lookupRef,
|
lookupRef,
|
||||||
refsForType,
|
refsForType,
|
||||||
funRulesOf,
|
funRulesOf,
|
||||||
|
hasHOAS,
|
||||||
allCatsOf,
|
allCatsOf,
|
||||||
allBindCatsOf,
|
allBindCatsOf,
|
||||||
funsForType,
|
funsForType,
|
||||||
@@ -130,6 +131,10 @@ funRulesOf gr =
|
|||||||
mtype m == MTAbstract,
|
mtype m == MTAbstract,
|
||||||
(f, C.AbsFun typ _) <- tree2list (jments m)]
|
(f, C.AbsFun typ _) <- tree2list (jments m)]
|
||||||
|
|
||||||
|
-- testing for higher-order abstract syntax
|
||||||
|
hasHOAS :: GFCGrammar -> Bool
|
||||||
|
hasHOAS gr = any isHigherOrderType [t | (_,t) <- funRulesOf gr] where
|
||||||
|
|
||||||
allCatsOf :: GFCGrammar -> [(Cat,Context)]
|
allCatsOf :: GFCGrammar -> [(Cat,Context)]
|
||||||
allCatsOf gr =
|
allCatsOf gr =
|
||||||
[((i,c),cont) | (i, ModMod m) <- modules gr,
|
[((i,c),cont) | (i, ModMod m) <- modules gr,
|
||||||
|
|||||||
@@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do
|
|||||||
(cc,c) <- catSkeleton t -- thus recursivity on Cat level
|
(cc,c) <- catSkeleton t -- thus recursivity on Cat level
|
||||||
return $ any (== c) cc
|
return $ any (== c) cc
|
||||||
|
|
||||||
|
isHigherOrderType :: Type -> Bool
|
||||||
|
isHigherOrderType t = errVal True $ do -- pessimistic choice
|
||||||
|
co <- contextOfType t
|
||||||
|
return $ not $ null [x | (x,Prod _ _ _) <- co]
|
||||||
|
|
||||||
contextOfType :: Type -> Err Context
|
contextOfType :: Type -> Err Context
|
||||||
contextOfType typ = case typ of
|
contextOfType typ = case typ of
|
||||||
|
|||||||
@@ -212,12 +212,12 @@ txtHelpFile =
|
|||||||
"\n -fail show strings whose parse fails prefixed by #FAIL" ++
|
"\n -fail show strings whose parse fails prefixed by #FAIL" ++
|
||||||
"\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
|
"\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
|
||||||
"\n options for selecting parsing method:" ++
|
"\n options for selecting parsing method:" ++
|
||||||
"\n (default)parse using an overgenerating CFG" ++
|
"\n -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)" ++
|
||||||
|
"\n -old parse using an overgenerating CFG (default if HOAS in grammar)" ++
|
||||||
"\n -cfg parse using a much less overgenerating CFG" ++
|
"\n -cfg parse using a much less overgenerating CFG" ++
|
||||||
"\n -mcfg parse using an even less overgenerating MCFG" ++
|
"\n -mcfg parse using an even less overgenerating MCFG" ++
|
||||||
"\n -fcfg parse using a faster variant of MCFG" ++
|
"\n Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++
|
||||||
"\n Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++
|
"\n options that only work for the -old default parsing method:" ++
|
||||||
"\n options that only work for the default parsing method:" ++
|
|
||||||
"\n -n non-strict: tolerates morphological errors" ++
|
"\n -n non-strict: tolerates morphological errors" ++
|
||||||
"\n -ign ignore unknown words when parsing" ++
|
"\n -ign ignore unknown words when parsing" ++
|
||||||
"\n -raw return context-free terms in raw form" ++
|
"\n -raw return context-free terms in raw form" ++
|
||||||
@@ -594,6 +594,7 @@ txtHelpFile =
|
|||||||
"\n -lexer=chars each character is a token" ++
|
"\n -lexer=chars each character is a token" ++
|
||||||
"\n -lexer=code use Haskell's lex" ++
|
"\n -lexer=code use Haskell's lex" ++
|
||||||
"\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++
|
"\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++
|
||||||
|
"\n -lexer=textvars like text, but treat unknown words as variables, ?? as meta " ++
|
||||||
"\n -lexer=text with conventions on punctuation and capital letters" ++
|
"\n -lexer=text with conventions on punctuation and capital letters" ++
|
||||||
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
|
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
|
||||||
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
|
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
|
||||||
|
|||||||
@@ -184,8 +184,9 @@ optionsOfCommand co = case co of
|
|||||||
CTransformGrammar _ -> flags "printer"
|
CTransformGrammar _ -> flags "printer"
|
||||||
CConvertLatex _ -> none
|
CConvertLatex _ -> none
|
||||||
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
|
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
|
||||||
CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob"
|
CParse ->
|
||||||
"cat lang lexer parser number rawtrees"
|
both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob"
|
||||||
|
"cat lang lexer parser number rawtrees"
|
||||||
CTranslate _ _ -> opts "cat lexer parser"
|
CTranslate _ _ -> opts "cat lexer parser"
|
||||||
CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
|
CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
|
||||||
CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"
|
CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"
|
||||||
|
|||||||
@@ -450,6 +450,7 @@ customTokenizer =
|
|||||||
,(strCI "chars", const $ sg . map (tS . singleton))
|
,(strCI "chars", const $ sg . map (tS . singleton))
|
||||||
,(strCI "code", const $ sg . lexHaskell)
|
,(strCI "code", const $ sg . lexHaskell)
|
||||||
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
|
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
|
||||||
|
,(strCI "textvars", \gr -> sg . (lexTextVar $ stateIsWord gr))
|
||||||
,(strCI "text", const $ sg . lexText)
|
,(strCI "text", const $ sg . lexText)
|
||||||
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
|
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
|
||||||
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
|
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
|
||||||
|
|||||||
@@ -54,34 +54,9 @@ parseStringMsg os sg cat s = do
|
|||||||
return (ts, unlines $ reverse ss)
|
return (ts, unlines $ reverse ss)
|
||||||
|
|
||||||
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
|
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
|
||||||
|
parseStringC opts0 sg cat s
|
||||||
---- to test peb's new parser 6/10/2003
|
| oElem (iOpt "old") opts0 ||
|
||||||
---- (obsoleted by "newer" below)
|
(not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do
|
||||||
-- parseStringC opts0 sg cat s
|
|
||||||
-- | oElem newParser opts0 = do
|
|
||||||
-- let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm
|
|
||||||
-- ct = cfCat2Cat cat
|
|
||||||
-- ts <- checkErr $ NewOld.newParser pm sg ct s
|
|
||||||
-- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
|
|
||||||
|
|
||||||
-- to use peb's newer parser 7/4-05
|
|
||||||
parseStringC opts0 sg cat s
|
|
||||||
| oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do
|
|
||||||
let opts = unionOptions opts0 $ stateOptions sg
|
|
||||||
algorithm | oElem newCParser opts0 = "c"
|
|
||||||
| oElem newMParser opts0 = "m"
|
|
||||||
| oElem newFParser opts0 = "f"
|
|
||||||
| otherwise = "c" -- default algorithm
|
|
||||||
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
|
|
||||||
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
|
|
||||||
toks = case tokenizer s of
|
|
||||||
t:_ -> t
|
|
||||||
_ -> [] ---- no support for undet. tok.
|
|
||||||
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
|
|
||||||
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
|
|
||||||
return $ optIntOrAll opts flagNumber ts'
|
|
||||||
|
|
||||||
parseStringC opts0 sg cat s = do
|
|
||||||
let opts = unionOptions opts0 $ stateOptions sg
|
let opts = unionOptions opts0 $ stateOptions sg
|
||||||
cf = stateCF sg
|
cf = stateCF sg
|
||||||
gr = stateGrammarST sg
|
gr = stateGrammarST sg
|
||||||
@@ -92,6 +67,26 @@ parseStringC opts0 sg cat s = do
|
|||||||
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
|
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
|
||||||
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
|
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
|
||||||
|
|
||||||
|
---- | or [oElem p opts0 |
|
||||||
|
---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do
|
||||||
|
|
||||||
|
| otherwise = do
|
||||||
|
let opts = unionOptions opts0 $ stateOptions sg
|
||||||
|
algorithm | oElem newCParser opts0 = "c"
|
||||||
|
| oElem newMParser opts0 = "m"
|
||||||
|
| oElem newFParser opts0 = "f"
|
||||||
|
| otherwise = "f" -- default algorithm: FCFG
|
||||||
|
strategy = maybe "bottomup" id $ getOptVal opts useParser
|
||||||
|
-- -parser=bottomup/topdown
|
||||||
|
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
|
||||||
|
toks = case tokenizer s of
|
||||||
|
t:_ -> t
|
||||||
|
_ -> [] ---- no support for undet. tok.
|
||||||
|
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
|
||||||
|
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
|
||||||
|
return $ optIntOrAll opts flagNumber ts'
|
||||||
|
|
||||||
|
|
||||||
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
|
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
|
||||||
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
|
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
|
||||||
where result = parser toks
|
where result = parser toks
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords,
|
|||||||
lexHaskellLiteral,
|
lexHaskellLiteral,
|
||||||
lexHaskellVar,
|
lexHaskellVar,
|
||||||
lexText,
|
lexText,
|
||||||
|
lexTextVar,
|
||||||
lexC2M, lexC2M',
|
lexC2M, lexC2M',
|
||||||
lexTextLiteral,
|
lexTextLiteral,
|
||||||
lexIgnore,
|
lexIgnore,
|
||||||
@@ -58,6 +59,10 @@ isFloat s = case s of
|
|||||||
'.':cs@(_:_) -> all isDigit cs
|
'.':cs@(_:_) -> all isDigit cs
|
||||||
_ -> False
|
_ -> False
|
||||||
|
|
||||||
|
isString s = case s of
|
||||||
|
c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
|
||||||
|
_ -> False
|
||||||
|
|
||||||
|
|
||||||
mkCFTok :: String -> CFTok
|
mkCFTok :: String -> CFTok
|
||||||
mkCFTok s = case s of
|
mkCFTok s = case s of
|
||||||
@@ -86,6 +91,7 @@ mkLit s
|
|||||||
| all isDigit s = tI s
|
| all isDigit s = tI s
|
||||||
| otherwise = tL s
|
| otherwise = tL s
|
||||||
|
|
||||||
|
-- obsolete
|
||||||
mkTL :: String -> CFTok
|
mkTL :: String -> CFTok
|
||||||
mkTL s
|
mkTL s
|
||||||
| isFloat s = tF s
|
| isFloat s = tF s
|
||||||
@@ -104,6 +110,7 @@ lexText :: String -> [CFTok]
|
|||||||
lexText = uncap . lx where
|
lexText = uncap . lx where
|
||||||
|
|
||||||
lx s = case s of
|
lx s = case s of
|
||||||
|
'?':'?':cs -> tS "??" : lx cs
|
||||||
p : cs | isMPunct p -> tS [p] : uncap (lx cs)
|
p : cs | isMPunct p -> tS [p] : uncap (lx cs)
|
||||||
p : cs | isPunct p -> tS [p] : lx cs
|
p : cs | isPunct p -> tS [p] : lx cs
|
||||||
s : cs | isSpace s -> lx cs
|
s : cs | isSpace s -> lx cs
|
||||||
@@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where
|
|||||||
| isFloat s = tF s
|
| isFloat s = tF s
|
||||||
| all isDigit s = tI s
|
| all isDigit s = tI s
|
||||||
| otherwise = tL s
|
| otherwise = tL s
|
||||||
mkOne t@(TC s) = if isKnown s then t else mkTL s
|
mkOne t@(TC s) = if isKnown s then t else mkLit s
|
||||||
mkOne t = t
|
mkOne t = t
|
||||||
|
|
||||||
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
|
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
|
||||||
@@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where
|
|||||||
mkOne t@(TS s)
|
mkOne t@(TS s)
|
||||||
| isKnown s = t
|
| isKnown s = t
|
||||||
| isFloat s = tF s
|
| isFloat s = tF s
|
||||||
|
| isString s = tL (init (tail s))
|
||||||
| all isDigit s = tI s
|
| all isDigit s = tI s
|
||||||
| otherwise = tV s
|
| otherwise = tV s
|
||||||
mkOne t@(TC s) = if isKnown s then t else tV s
|
mkOne t@(TC s) = if isKnown s then t else tV s
|
||||||
@@ -197,6 +205,8 @@ lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
|
|||||||
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
|
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
|
||||||
|
|
||||||
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
|
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
|
||||||
|
lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
|
||||||
|
|
||||||
|
|
||||||
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
|
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
|
||||||
eitherUpper isKnown w = isKnown w
|
eitherUpper isKnown w = isKnown w
|
||||||
|
|||||||
@@ -183,12 +183,12 @@ p, parse: p String
|
|||||||
-fail show strings whose parse fails prefixed by #FAIL
|
-fail show strings whose parse fails prefixed by #FAIL
|
||||||
-ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
|
-ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
|
||||||
options for selecting parsing method:
|
options for selecting parsing method:
|
||||||
(default)parse using an overgenerating CFG
|
  -fcfg      parse using a fast variant of MCFG (default if no HOAS in grammar)
|
||||||
|
-old parse using an overgenerating CFG (default if HOAS in grammar)
|
||||||
-cfg parse using a much less overgenerating CFG
|
-cfg parse using a much less overgenerating CFG
|
||||||
-mcfg parse using an even less overgenerating MCFG
|
-mcfg parse using an even less overgenerating MCFG
|
||||||
-fcfg parse using a faster variant of MCFG
|
Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time
|
||||||
Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time
|
options that only work for the -old default parsing method:
|
||||||
options that only work for the default parsing method:
|
|
||||||
-n non-strict: tolerates morphological errors
|
-n non-strict: tolerates morphological errors
|
||||||
-ign ignore unknown words when parsing
|
-ign ignore unknown words when parsing
|
||||||
-raw return context-free terms in raw form
|
-raw return context-free terms in raw form
|
||||||
@@ -565,6 +565,7 @@ q, quit: q
|
|||||||
-lexer=chars each character is a token
|
-lexer=chars each character is a token
|
||||||
-lexer=code use Haskell's lex
|
-lexer=code use Haskell's lex
|
||||||
-lexer=codevars like code, but treat unknown words as variables, ?? as meta
|
-lexer=codevars like code, but treat unknown words as variables, ?? as meta
|
||||||
|
-lexer=textvars like text, but treat unknown words as variables, ?? as meta
|
||||||
-lexer=text with conventions on punctuation and capital letters
|
-lexer=text with conventions on punctuation and capital letters
|
||||||
-lexer=codelit like code, but treat unknown words as string literals
|
-lexer=codelit like code, but treat unknown words as string literals
|
||||||
-lexer=textlit like text, but treat unknown words as string literals
|
-lexer=textlit like text, but treat unknown words as string literals
|
||||||
|
|||||||
Reference in New Issue
Block a user