made -fcfg default parser; added lexer textvars

This commit is contained in:
aarne
2006-06-20 08:38:44 +00:00
parent 658fd526b4
commit 0333ba894b
11 changed files with 71 additions and 40 deletions

View File

@@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
</center> </center>
<p>
20/6 (AR) The FCFG parser is know the default, as it even handles literals.
The old default can be selected by <tt>p -old</tt>. Since
FCFG does not support variable bindings, <tt>-old</tt> is automatically
selected if the grammar has bindings - and unless the <tt>-fcfg</tt> flag
is used.
<p>
17/6 (AR) The FCFG parser is now the recommended method for parsing 17/6 (AR) The FCFG parser is now the recommended method for parsing
heavy grammars such as the resource grammars. It does not yet support heavy grammars such as the resource grammars. It does not yet support
literals and variable bindings. literals and variable bindings.

View File

@@ -367,7 +367,6 @@ Russian
Spanish Spanish
- no list of irregular verbs
- multiple clitics (with V3) not always right - multiple clitics (with V3) not always right

View File

@@ -188,6 +188,10 @@ stateOptions = loptions
stateGrammarWords = allMorphoWords . stateMorpho stateGrammarWords = allMorphoWords . stateMorpho
stateGrammarLang st = (grammar st, cncId st) stateGrammarLang st = (grammar st, cncId st)
---- this should be computed at compile time and stored
stateHasHOAS :: StateGrammar -> Bool
stateHasHOAS = hasHOAS . stateGrammarST
cncModuleIdST :: StateGrammar -> CanonGrammar cncModuleIdST :: StateGrammar -> CanonGrammar
cncModuleIdST = stateGrammarST cncModuleIdST = stateGrammarST

View File

@@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar,
lookupRef, lookupRef,
refsForType, refsForType,
funRulesOf, funRulesOf,
hasHOAS,
allCatsOf, allCatsOf,
allBindCatsOf, allBindCatsOf,
funsForType, funsForType,
@@ -130,6 +131,10 @@ funRulesOf gr =
mtype m == MTAbstract, mtype m == MTAbstract,
(f, C.AbsFun typ _) <- tree2list (jments m)] (f, C.AbsFun typ _) <- tree2list (jments m)]
-- testing for higher-order abstract syntax
hasHOAS :: GFCGrammar -> Bool
hasHOAS gr = any isHigherOrderType [t | (_,t) <- funRulesOf gr] where
allCatsOf :: GFCGrammar -> [(Cat,Context)] allCatsOf :: GFCGrammar -> [(Cat,Context)]
allCatsOf gr = allCatsOf gr =
[((i,c),cont) | (i, ModMod m) <- modules gr, [((i,c),cont) | (i, ModMod m) <- modules gr,

View File

@@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do
(cc,c) <- catSkeleton t -- thus recursivity on Cat level (cc,c) <- catSkeleton t -- thus recursivity on Cat level
return $ any (== c) cc return $ any (== c) cc
isHigherOrderType :: Type -> Bool
isHigherOrderType t = errVal True $ do -- pessimistic choice
co <- contextOfType t
return $ not $ null [x | (x,Prod _ _ _) <- co]
contextOfType :: Type -> Err Context contextOfType :: Type -> Err Context
contextOfType typ = case typ of contextOfType typ = case typ of

View File

@@ -212,12 +212,12 @@ txtHelpFile =
"\n -fail show strings whose parse fails prefixed by #FAIL" ++ "\n -fail show strings whose parse fails prefixed by #FAIL" ++
"\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++ "\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
"\n options for selecting parsing method:" ++ "\n options for selecting parsing method:" ++
"\n (default)parse using an overgenerating CFG" ++ "\n -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)" ++
"\n -old parse using an overgenerating CFG (default if HOAS in grammar)" ++
"\n -cfg parse using a much less overgenerating CFG" ++ "\n -cfg parse using a much less overgenerating CFG" ++
"\n -mcfg parse using an even less overgenerating MCFG" ++ "\n -mcfg parse using an even less overgenerating MCFG" ++
"\n -fcfg parse using a faster variant of MCFG" ++ "\n Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++
"\n Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++ "\n options that only work for the -old default parsing method:" ++
"\n options that only work for the default parsing method:" ++
"\n -n non-strict: tolerates morphological errors" ++ "\n -n non-strict: tolerates morphological errors" ++
"\n -ign ignore unknown words when parsing" ++ "\n -ign ignore unknown words when parsing" ++
"\n -raw return context-free terms in raw form" ++ "\n -raw return context-free terms in raw form" ++
@@ -594,6 +594,7 @@ txtHelpFile =
"\n -lexer=chars each character is a token" ++ "\n -lexer=chars each character is a token" ++
"\n -lexer=code use Haskell's lex" ++ "\n -lexer=code use Haskell's lex" ++
"\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++ "\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++
"\n -lexer=textvars like text, but treat unknown words as variables, ?? as meta " ++
"\n -lexer=text with conventions on punctuation and capital letters" ++ "\n -lexer=text with conventions on punctuation and capital letters" ++
"\n -lexer=codelit like code, but treat unknown words as string literals" ++ "\n -lexer=codelit like code, but treat unknown words as string literals" ++
"\n -lexer=textlit like text, but treat unknown words as string literals" ++ "\n -lexer=textlit like text, but treat unknown words as string literals" ++

View File

@@ -184,8 +184,9 @@ optionsOfCommand co = case co of
CTransformGrammar _ -> flags "printer" CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark" CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob" CParse ->
"cat lang lexer parser number rawtrees" both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser" CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand" CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand" CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"

View File

@@ -450,6 +450,7 @@ customTokenizer =
,(strCI "chars", const $ sg . map (tS . singleton)) ,(strCI "chars", const $ sg . map (tS . singleton))
,(strCI "code", const $ sg . lexHaskell) ,(strCI "code", const $ sg . lexHaskell)
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr)) ,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
,(strCI "textvars", \gr -> sg . (lexTextVar $ stateIsWord gr))
,(strCI "text", const $ sg . lexText) ,(strCI "text", const $ sg . lexText)
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr)) ,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr)) ,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))

View File

@@ -54,34 +54,9 @@ parseStringMsg os sg cat s = do
return (ts, unlines $ reverse ss) return (ts, unlines $ reverse ss)
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree] parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
parseStringC opts0 sg cat s
---- to test peb's new parser 6/10/2003 | oElem (iOpt "old") opts0 ||
---- (obsoleted by "newer" below) (not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do
-- parseStringC opts0 sg cat s
-- | oElem newParser opts0 = do
-- let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm
-- ct = cfCat2Cat cat
-- ts <- checkErr $ NewOld.newParser pm sg ct s
-- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
-- to use peb's newer parser 7/4-05
parseStringC opts0 sg cat s
| oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do
let opts = unionOptions opts0 $ stateOptions sg
algorithm | oElem newCParser opts0 = "c"
| oElem newMParser opts0 = "m"
| oElem newFParser opts0 = "f"
| otherwise = "c" -- default algorithm
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = case tokenizer s of
t:_ -> t
_ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
parseStringC opts0 sg cat s = do
let opts = unionOptions opts0 $ stateOptions sg let opts = unionOptions opts0 $ stateOptions sg
cf = stateCF sg cf = stateCF sg
gr = stateGrammarST sg gr = stateGrammarST sg
@@ -92,6 +67,26 @@ parseStringC opts0 sg cat s = do
then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
else mapM (tokens2trms opts sg cn parser) toks >>= return . concat else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
---- | or [oElem p opts0 |
---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do
| otherwise = do
let opts = unionOptions opts0 $ stateOptions sg
algorithm | oElem newCParser opts0 = "c"
| oElem newMParser opts0 = "m"
| oElem newFParser opts0 = "f"
| otherwise = "f" -- default algorithm: FCFG
strategy = maybe "bottomup" id $ getOptVal opts useParser
-- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = case tokenizer s of
t:_ -> t
_ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
where result = parser toks where result = parser toks

View File

@@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords,
lexHaskellLiteral, lexHaskellLiteral,
lexHaskellVar, lexHaskellVar,
lexText, lexText,
lexTextVar,
lexC2M, lexC2M', lexC2M, lexC2M',
lexTextLiteral, lexTextLiteral,
lexIgnore, lexIgnore,
@@ -58,6 +59,10 @@ isFloat s = case s of
'.':cs@(_:_) -> all isDigit cs '.':cs@(_:_) -> all isDigit cs
_ -> False _ -> False
isString s = case s of
c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
_ -> False
mkCFTok :: String -> CFTok mkCFTok :: String -> CFTok
mkCFTok s = case s of mkCFTok s = case s of
@@ -86,6 +91,7 @@ mkLit s
| all isDigit s = tI s | all isDigit s = tI s
| otherwise = tL s | otherwise = tL s
-- obsolete
mkTL :: String -> CFTok mkTL :: String -> CFTok
mkTL s mkTL s
| isFloat s = tF s | isFloat s = tF s
@@ -104,6 +110,7 @@ lexText :: String -> [CFTok]
lexText = uncap . lx where lexText = uncap . lx where
lx s = case s of lx s = case s of
'?':'?':cs -> tS "??" : lx cs
p : cs | isMPunct p -> tS [p] : uncap (lx cs) p : cs | isMPunct p -> tS [p] : uncap (lx cs)
p : cs | isPunct p -> tS [p] : lx cs p : cs | isPunct p -> tS [p] : lx cs
s : cs | isSpace s -> lx cs s : cs | isSpace s -> lx cs
@@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where
| isFloat s = tF s | isFloat s = tF s
| all isDigit s = tI s | all isDigit s = tI s
| otherwise = tL s | otherwise = tL s
mkOne t@(TC s) = if isKnown s then t else mkTL s mkOne t@(TC s) = if isKnown s then t else mkLit s
mkOne t = t mkOne t = t
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok] unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
@@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where
mkOne t@(TS s) mkOne t@(TS s)
| isKnown s = t | isKnown s = t
| isFloat s = tF s | isFloat s = tF s
| isString s = tL (init (tail s))
| all isDigit s = tI s | all isDigit s = tI s
| otherwise = tV s | otherwise = tV s
mkOne t@(TC s) = if isKnown s then t else tV s mkOne t@(TC s) = if isKnown s then t else tV s
@@ -197,6 +205,8 @@ lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs) eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
eitherUpper isKnown w = isKnown w eitherUpper isKnown w = isKnown w

View File

@@ -183,12 +183,12 @@ p, parse: p String
-fail show strings whose parse fails prefixed by #FAIL -fail show strings whose parse fails prefixed by #FAIL
-ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
options for selecting parsing method: options for selecting parsing method:
(default)parse using an overgenerating CFG -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)
-old parse using an overgenerating CFG (default if HOAS in grammar)
-cfg parse using a much less overgenerating CFG -cfg parse using a much less overgenerating CFG
-mcfg parse using an even less overgenerating MCFG -mcfg parse using an even less overgenerating MCFG
-fcfg parse using a faster variant of MCFG Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time
Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time options that only work for the -old default parsing method:
options that only work for the default parsing method:
-n non-strict: tolerates morphological errors -n non-strict: tolerates morphological errors
-ign ignore unknown words when parsing -ign ignore unknown words when parsing
-raw return context-free terms in raw form -raw return context-free terms in raw form
@@ -565,6 +565,7 @@ q, quit: q
-lexer=chars each character is a token -lexer=chars each character is a token
-lexer=code use Haskell's lex -lexer=code use Haskell's lex
-lexer=codevars like code, but treat unknown words as variables, ?? as meta -lexer=codevars like code, but treat unknown words as variables, ?? as meta
-lexer=textvars like text, but treat unknown words as variables, ?? as meta
-lexer=text with conventions on punctuation and capital letters -lexer=text with conventions on punctuation and capital letters
-lexer=codelit like code, but treat unknown words as string literals -lexer=codelit like code, but treat unknown words as string literals
-lexer=textlit like text, but treat unknown words as string literals -lexer=textlit like text, but treat unknown words as string literals