1
0
forked from GitHub/gf-core

made -fcfg default parser; added lexer textvars

This commit is contained in:
aarne
2006-06-20 08:38:44 +00:00
parent 658fd526b4
commit 0333ba894b
11 changed files with 71 additions and 40 deletions

View File

@@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
</center>
<p>
20/6 (AR) The FCFG parser is know the default, as it even handles literals.
The old default can be selected by <tt>p -old</tt>. Since
FCFG does not support variable bindings, <tt>-old</tt> is automatically
selected if the grammar has bindings - and unless the <tt>-fcfg</tt> flag
is used.
<p>
17/6 (AR) The FCFG parser is now the recommended method for parsing
heavy grammars such as the resource grammars. It does not yet support
literals and variable bindings.

View File

@@ -367,7 +367,6 @@ Russian
Spanish
- no list of irregular verbs
- multiple clitics (with V3) not always right

View File

@@ -188,6 +188,10 @@ stateOptions = loptions
stateGrammarWords = allMorphoWords . stateMorpho
stateGrammarLang st = (grammar st, cncId st)
---- this should be computed at compile time and stored
stateHasHOAS :: StateGrammar -> Bool
stateHasHOAS = hasHOAS . stateGrammarST
cncModuleIdST :: StateGrammar -> CanonGrammar
cncModuleIdST = stateGrammarST

View File

@@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar,
lookupRef,
refsForType,
funRulesOf,
hasHOAS,
allCatsOf,
allBindCatsOf,
funsForType,
@@ -130,6 +131,10 @@ funRulesOf gr =
mtype m == MTAbstract,
(f, C.AbsFun typ _) <- tree2list (jments m)]
-- testing for higher-order abstract syntax
hasHOAS :: GFCGrammar -> Bool
hasHOAS gr = any isHigherOrderType [t | (_,t) <- funRulesOf gr] where
allCatsOf :: GFCGrammar -> [(Cat,Context)]
allCatsOf gr =
[((i,c),cont) | (i, ModMod m) <- modules gr,

View File

@@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do
(cc,c) <- catSkeleton t -- thus recursivity on Cat level
return $ any (== c) cc
isHigherOrderType :: Type -> Bool
isHigherOrderType t = errVal True $ do -- pessimistic choice
co <- contextOfType t
return $ not $ null [x | (x,Prod _ _ _) <- co]
contextOfType :: Type -> Err Context
contextOfType typ = case typ of

View File

@@ -212,12 +212,12 @@ txtHelpFile =
"\n -fail show strings whose parse fails prefixed by #FAIL" ++
"\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
"\n options for selecting parsing method:" ++
"\n (default)parse using an overgenerating CFG" ++
"\n -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)" ++
"\n -old parse using an overgenerating CFG (default if HOAS in grammar)" ++
"\n -cfg parse using a much less overgenerating CFG" ++
"\n -mcfg parse using an even less overgenerating MCFG" ++
"\n -fcfg parse using a faster variant of MCFG" ++
"\n Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++
"\n options that only work for the default parsing method:" ++
"\n Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++
"\n options that only work for the -old default parsing method:" ++
"\n -n non-strict: tolerates morphological errors" ++
"\n -ign ignore unknown words when parsing" ++
"\n -raw return context-free terms in raw form" ++
@@ -594,6 +594,7 @@ txtHelpFile =
"\n -lexer=chars each character is a token" ++
"\n -lexer=code use Haskell's lex" ++
"\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++
"\n -lexer=textvars like text, but treat unknown words as variables, ?? as meta " ++
"\n -lexer=text with conventions on punctuation and capital letters" ++
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
"\n -lexer=textlit like text, but treat unknown words as string literals" ++

View File

@@ -184,8 +184,9 @@ optionsOfCommand co = case co of
CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CParse ->
both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"

View File

@@ -450,6 +450,7 @@ customTokenizer =
,(strCI "chars", const $ sg . map (tS . singleton))
,(strCI "code", const $ sg . lexHaskell)
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
,(strCI "textvars", \gr -> sg . (lexTextVar $ stateIsWord gr))
,(strCI "text", const $ sg . lexText)
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))

View File

@@ -54,34 +54,9 @@ parseStringMsg os sg cat s = do
return (ts, unlines $ reverse ss)
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
---- to test peb's new parser 6/10/2003
---- (obsoleted by "newer" below)
-- parseStringC opts0 sg cat s
-- | oElem newParser opts0 = do
-- let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm
-- ct = cfCat2Cat cat
-- ts <- checkErr $ NewOld.newParser pm sg ct s
-- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
-- to use peb's newer parser 7/4-05
parseStringC opts0 sg cat s
| oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do
let opts = unionOptions opts0 $ stateOptions sg
algorithm | oElem newCParser opts0 = "c"
| oElem newMParser opts0 = "m"
| oElem newFParser opts0 = "f"
| otherwise = "c" -- default algorithm
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = case tokenizer s of
t:_ -> t
_ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
parseStringC opts0 sg cat s = do
parseStringC opts0 sg cat s
| oElem (iOpt "old") opts0 ||
(not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do
let opts = unionOptions opts0 $ stateOptions sg
cf = stateCF sg
gr = stateGrammarST sg
@@ -92,6 +67,26 @@ parseStringC opts0 sg cat s = do
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
---- | or [oElem p opts0 |
---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do
| otherwise = do
let opts = unionOptions opts0 $ stateOptions sg
algorithm | oElem newCParser opts0 = "c"
| oElem newMParser opts0 = "m"
| oElem newFParser opts0 = "f"
| otherwise = "f" -- default algorithm: FCFG
strategy = maybe "bottomup" id $ getOptVal opts useParser
-- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = case tokenizer s of
t:_ -> t
_ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
where result = parser toks

View File

@@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords,
lexHaskellLiteral,
lexHaskellVar,
lexText,
lexTextVar,
lexC2M, lexC2M',
lexTextLiteral,
lexIgnore,
@@ -58,6 +59,10 @@ isFloat s = case s of
'.':cs@(_:_) -> all isDigit cs
_ -> False
isString s = case s of
c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
_ -> False
mkCFTok :: String -> CFTok
mkCFTok s = case s of
@@ -86,6 +91,7 @@ mkLit s
| all isDigit s = tI s
| otherwise = tL s
-- obsolete
mkTL :: String -> CFTok
mkTL s
| isFloat s = tF s
@@ -104,6 +110,7 @@ lexText :: String -> [CFTok]
lexText = uncap . lx where
lx s = case s of
'?':'?':cs -> tS "??" : lx cs
p : cs | isMPunct p -> tS [p] : uncap (lx cs)
p : cs | isPunct p -> tS [p] : lx cs
s : cs | isSpace s -> lx cs
@@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where
| isFloat s = tF s
| all isDigit s = tI s
| otherwise = tL s
mkOne t@(TC s) = if isKnown s then t else mkTL s
mkOne t@(TC s) = if isKnown s then t else mkLit s
mkOne t = t
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
@@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where
mkOne t@(TS s)
| isKnown s = t
| isFloat s = tF s
| isString s = tL (init (tail s))
| all isDigit s = tI s
| otherwise = tV s
mkOne t@(TC s) = if isKnown s then t else tV s
@@ -197,6 +205,8 @@ lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
eitherUpper isKnown w = isKnown w

View File

@@ -183,12 +183,12 @@ p, parse: p String
-fail show strings whose parse fails prefixed by #FAIL
-ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
options for selecting parsing method:
(default)parse using an overgenerating CFG
-fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)
-old parse using an overgenerating CFG (default if HOAS in grammar)
-cfg parse using a much less overgenerating CFG
-mcfg parse using an even less overgenerating MCFG
-fcfg parse using a faster variant of MCFG
Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time
options that only work for the default parsing method:
Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time
options that only work for the -old default parsing method:
-n non-strict: tolerates morphological errors
-ign ignore unknown words when parsing
-raw return context-free terms in raw form
@@ -565,6 +565,7 @@ q, quit: q
-lexer=chars each character is a token
-lexer=code use Haskell's lex
-lexer=codevars like code, but treat unknown words as variables, ?? as meta
-lexer=textvars like text, but treat unknown words as variables, ?? as meta
-lexer=text with conventions on punctuation and capital letters
-lexer=codelit like code, but treat unknown words as string literals
-lexer=textlit like text, but treat unknown words as string literals