diff --git a/doc/gf-history.html b/doc/gf-history.html index 57a425ca2..ca71a3b7c 100644 --- a/doc/gf-history.html +++ b/doc/gf-history.html @@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2 +
+ +20/6 (AR) The FCFG parser is know the default, as it even handles literals. +The old default can be selected by p -old. Since +FCFG does not support variable bindings, -old is automatically +selected if the grammar has bindings - and unless the -fcfg flag +is used. + +
+ 17/6 (AR) The FCFG parser is now the recommended method for parsing heavy grammars such as the resource grammars. It does not yet support literals and variable bindings. diff --git a/lib/resource-1.0/doc/index.txt b/lib/resource-1.0/doc/index.txt index f9a84ffbc..292cf134d 100644 --- a/lib/resource-1.0/doc/index.txt +++ b/lib/resource-1.0/doc/index.txt @@ -367,7 +367,6 @@ Russian Spanish -- no list of irregular verbs - multiple clitics (with V3) not always right diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 3a7115b34..88da1283b 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -188,6 +188,10 @@ stateOptions = loptions stateGrammarWords = allMorphoWords . stateMorpho stateGrammarLang st = (grammar st, cncId st) +---- this should be computed at compile time and stored +stateHasHOAS :: StateGrammar -> Bool +stateHasHOAS = hasHOAS . stateGrammarST + cncModuleIdST :: StateGrammar -> CanonGrammar cncModuleIdST = stateGrammarST diff --git a/src/GF/Grammar/LookAbs.hs b/src/GF/Grammar/LookAbs.hs index 0c86ae3e9..5bd4c1e41 100644 --- a/src/GF/Grammar/LookAbs.hs +++ b/src/GF/Grammar/LookAbs.hs @@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar, lookupRef, refsForType, funRulesOf, + hasHOAS, allCatsOf, allBindCatsOf, funsForType, @@ -130,6 +131,10 @@ funRulesOf gr = mtype m == MTAbstract, (f, C.AbsFun typ _) <- tree2list (jments m)] +-- testing for higher-order abstract syntax +hasHOAS :: GFCGrammar -> Bool +hasHOAS gr = any isHigherOrderType [t | (_,t) <- funRulesOf gr] where + allCatsOf :: GFCGrammar -> [(Cat,Context)] allCatsOf gr = [((i,c),cont) | (i, ModMod m) <- modules gr, diff --git a/src/GF/Grammar/Macros.hs b/src/GF/Grammar/Macros.hs index e7d073382..a3cad8bae 100644 --- a/src/GF/Grammar/Macros.hs +++ b/src/GF/Grammar/Macros.hs @@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do (cc,c) <- catSkeleton t -- thus recursivity on Cat level return $ any (== c) cc +isHigherOrderType :: Type -> Bool +isHigherOrderType t = errVal True $ do -- pessimistic choice + co <- contextOfType t + return $ not $ null [x | (x,Prod _ _ _) <- co] contextOfType :: Type -> Err Context contextOfType typ = case typ of diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index 215a33875..18dcd3579 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -212,12 +212,12 @@ txtHelpFile = "\n -fail show strings whose parse fails prefixed by #FAIL" ++ "\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++ "\n options for selecting parsing method:" ++ - "\n (default)parse using an overgenerating CFG" ++ + "\n -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)" ++ + "\n -old parse using an overgenerating CFG (default if HOAS in grammar)" ++ "\n -cfg parse using a much less overgenerating CFG" ++ "\n -mcfg parse using an even less overgenerating MCFG" ++ - "\n -fcfg parse using a faster variant of MCFG" ++ - "\n Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++ - "\n options that only work for the default parsing method:" ++ + "\n Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++ + "\n options that only work for the -old default parsing method:" ++ "\n -n non-strict: tolerates morphological errors" ++ "\n -ign ignore unknown words when parsing" ++ "\n -raw return context-free terms in raw form" ++ @@ -594,6 +594,7 @@ txtHelpFile = "\n -lexer=chars each character is a token" ++ "\n -lexer=code use Haskell's lex" ++ "\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++ + "\n -lexer=textvars like text, but treat unknown words as variables, ?? as meta " ++ "\n -lexer=text with conventions on punctuation and capital letters" ++ "\n -lexer=codelit like code, but treat unknown words as string literals" ++ "\n -lexer=textlit like text, but treat unknown words as string literals" ++ diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index ff3960eef..08b9720bd 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -184,8 +184,9 @@ optionsOfCommand co = case co of CTransformGrammar _ -> flags "printer" CConvertLatex _ -> none CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark" - CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob" - "cat lang lexer parser number rawtrees" + CParse -> + both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob" + "cat lang lexer parser number rawtrees" CTranslate _ _ -> opts "cat lexer parser" CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand" CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand" diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 4314747bf..07eda0a37 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -450,6 +450,7 @@ customTokenizer = ,(strCI "chars", const $ sg . map (tS . singleton)) ,(strCI "code", const $ sg . lexHaskell) ,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr)) + ,(strCI "textvars", \gr -> sg . (lexTextVar $ stateIsWord gr)) ,(strCI "text", const $ sg . lexText) ,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr)) ,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr)) diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs index 6e8965f08..e979579c9 100644 --- a/src/GF/UseGrammar/Parsing.hs +++ b/src/GF/UseGrammar/Parsing.hs @@ -54,34 +54,9 @@ parseStringMsg os sg cat s = do return (ts, unlines $ reverse ss) parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree] - ----- to test peb's new parser 6/10/2003 ----- (obsoleted by "newer" below) --- parseStringC opts0 sg cat s --- | oElem newParser opts0 = do --- let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm --- ct = cfCat2Cat cat --- ts <- checkErr $ NewOld.newParser pm sg ct s --- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts - --- to use peb's newer parser 7/4-05 -parseStringC opts0 sg cat s - | oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do - let opts = unionOptions opts0 $ stateOptions sg - algorithm | oElem newCParser opts0 = "c" - | oElem newMParser opts0 = "m" - | oElem newFParser opts0 = "f" - | otherwise = "c" -- default algorithm - strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown - tokenizer = customOrDefault opts useTokenizer customTokenizer sg - toks = case tokenizer s of - t:_ -> t - _ -> [] ---- no support for undet. tok. - ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks - ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts - return $ optIntOrAll opts flagNumber ts' - -parseStringC opts0 sg cat s = do +parseStringC opts0 sg cat s + | oElem (iOpt "old") opts0 || + (not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do let opts = unionOptions opts0 $ stateOptions sg cf = stateCF sg gr = stateGrammarST sg @@ -92,6 +67,26 @@ parseStringC opts0 sg cat s = do then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks else mapM (tokens2trms opts sg cn parser) toks >>= return . concat +---- | or [oElem p opts0 | +---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do + + | otherwise = do + let opts = unionOptions opts0 $ stateOptions sg + algorithm | oElem newCParser opts0 = "c" + | oElem newMParser opts0 = "m" + | oElem newFParser opts0 = "f" + | otherwise = "f" -- default algorithm: FCFG + strategy = maybe "bottomup" id $ getOptVal opts useParser + -- -parser=bottomup/topdown + tokenizer = customOrDefault opts useTokenizer customTokenizer sg + toks = case tokenizer s of + t:_ -> t + _ -> [] ---- no support for undet. tok. + ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks + ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts + return $ optIntOrAll opts flagNumber ts' + + tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info where result = parser toks diff --git a/src/GF/UseGrammar/Tokenize.hs b/src/GF/UseGrammar/Tokenize.hs index d16fdf32f..9f1ab5449 100644 --- a/src/GF/UseGrammar/Tokenize.hs +++ b/src/GF/UseGrammar/Tokenize.hs @@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords, lexHaskellLiteral, lexHaskellVar, lexText, + lexTextVar, lexC2M, lexC2M', lexTextLiteral, lexIgnore, @@ -58,6 +59,10 @@ isFloat s = case s of '.':cs@(_:_) -> all isDigit cs _ -> False +isString s = case s of + c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs + _ -> False + mkCFTok :: String -> CFTok mkCFTok s = case s of @@ -86,6 +91,7 @@ mkLit s | all isDigit s = tI s | otherwise = tL s +-- obsolete mkTL :: String -> CFTok mkTL s | isFloat s = tF s @@ -104,6 +110,7 @@ lexText :: String -> [CFTok] lexText = uncap . lx where lx s = case s of + '?':'?':cs -> tS "??" : lx cs p : cs | isMPunct p -> tS [p] : uncap (lx cs) p : cs | isPunct p -> tS [p] : lx cs s : cs | isSpace s -> lx cs @@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where | isFloat s = tF s | all isDigit s = tI s | otherwise = tL s - mkOne t@(TC s) = if isKnown s then t else mkTL s + mkOne t@(TC s) = if isKnown s then t else mkLit s mkOne t = t unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok] @@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where mkOne t@(TS s) | isKnown s = t | isFloat s = tF s + | isString s = tL (init (tail s)) | all isDigit s = tI s | otherwise = tV s mkOne t@(TC s) = if isKnown s then t else tV s @@ -197,6 +205,8 @@ lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell lexHaskellVar isKnown = unknown2var isKnown . lexHaskell +lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText + eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs) eitherUpper isKnown w = isKnown w diff --git a/src/HelpFile b/src/HelpFile index c3402c383..96afff0ec 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -183,12 +183,12 @@ p, parse: p String -fail show strings whose parse fails prefixed by #FAIL -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS options for selecting parsing method: - (default)parse using an overgenerating CFG + -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar) + -old parse using an overgenerating CFG (default if HOAS in grammar) -cfg parse using a much less overgenerating CFG -mcfg parse using an even less overgenerating MCFG - -fcfg parse using a faster variant of MCFG - Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time - options that only work for the default parsing method: + Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time + options that only work for the -old default parsing method: -n non-strict: tolerates morphological errors -ign ignore unknown words when parsing -raw return context-free terms in raw form @@ -565,6 +565,7 @@ q, quit: q -lexer=chars each character is a token -lexer=code use Haskell's lex -lexer=codevars like code, but treat unknown words as variables, ?? as meta + -lexer=textvars like text, but treat unknown words as variables, ?? as meta -lexer=text with conventions on punctuation and capital letters -lexer=codelit like code, but treat unknown words as string literals -lexer=textlit like text, but treat unknown words as string literals