made -fcfg default parser; added lexer textvars

2026-06-26 03:16:27 -06:00 · 2006-06-20 08:38:44 +00:00
parent cb168e92e2
commit 402a113b56
10 changed files with 71 additions and 39 deletions
@@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
 </center>
 <p>
 20/6 (AR) The FCFG parser is now the default, as it even handles literals.
 The old default can be selected by <tt>p -old</tt>. Since
 FCFG does not support variable bindings, <tt>-old</tt> is automatically
 selected if the grammar has bindings - and unless the <tt>-fcfg</tt> flag
 is used.
 <p>
 17/6 (AR) The FCFG parser is now the recommended method for parsing
 heavy grammars such as the resource grammars. It does not yet support
 literals and variable bindings.
@@ -188,6 +188,10 @@ stateOptions   = loptions
 stateGrammarWords = allMorphoWords . stateMorpho
 stateGrammarLang st = (grammar st, cncId st)
 ---- this should be computed at compile time and stored
 -- Tells whether the grammar of a state has higher-order abstract syntax:
 -- applies hasHOAS to the grammar returned by stateGrammarST.
 stateHasHOAS :: StateGrammar -> Bool
 stateHasHOAS st = hasHOAS (stateGrammarST st)
 cncModuleIdST :: StateGrammar -> CanonGrammar
 cncModuleIdST = stateGrammarST
@@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar,
 		lookupRef,
 		refsForType,
 		funRulesOf,
 		hasHOAS,
 		allCatsOf,
 		allBindCatsOf,
 		funsForType,
@@ -130,6 +131,10 @@ funRulesOf gr =
                 mtype m == MTAbstract,
                 (f, C.AbsFun typ _) <- tree2list (jments m)]
 -- testing for higher-order abstract syntax:
 -- True as soon as one of the types paired with a function by funRulesOf
 -- satisfies isHigherOrderType
 hasHOAS :: GFCGrammar -> Bool
 hasHOAS gr = or [isHigherOrderType typ | (_, typ) <- funRulesOf gr]
 allCatsOf :: GFCGrammar -> [(Cat,Context)]
 allCatsOf gr = 
  [((i,c),cont) | (i, ModMod m) <- modules gr,
@@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do
  (cc,c) <- catSkeleton t -- thus recursivity on Cat level
  return $ any (== c) cc
 -- A type is taken to be higher-order when at least one entry of its
 -- context (as given by contextOfType) has a Prod as its type component.
 -- True is the value handed to errVal as the fallback: the pessimistic choice.
 isHigherOrderType :: Type -> Bool
 isHigherOrderType typ = errVal True higherOrder where
  higherOrder = do
   hyps <- contextOfType typ
   return $ any (isProd . snd) hyps
  -- recognizes a product at the top of a hypothesis type
  isProd ty = case ty of
   Prod _ _ _ -> True
   _          -> False
 contextOfType :: Type -> Err Context
 contextOfType typ = case typ of
@@ -212,12 +212,12 @@ txtHelpFile =
  "\n      -fail      show strings whose parse fails prefixed by #FAIL" ++
  "\n      -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
  "\n  options for selecting parsing method:" ++
-  "\n      (default)parse using an overgenerating CFG" ++
+  "\n      -fcfg    parse using a fast variant of MCFG (default is no HOAS in grammar)" ++
  "\n      -old     parse using an overgenerating CFG (default if HOAS in grammar)" ++
  "\n      -cfg     parse using a much less overgenerating CFG" ++
  "\n      -mcfg    parse using an even less overgenerating MCFG" ++
-  "\n      -fcfg    parse using a faster variant of MCFG" ++
+  "\n      Note:    the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++
-  "\n      Note:    the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++
+  "\n  options that only work for the -old default parsing method:" ++
  "\n  options that only work for the default parsing method:" ++
  "\n      -n       non-strict: tolerates morphological errors" ++
  "\n      -ign     ignore unknown words when parsing" ++
  "\n      -raw     return context-free terms in raw form" ++
@@ -594,6 +594,7 @@ txtHelpFile =
  "\n    -lexer=chars         each character is a token" ++
  "\n    -lexer=code          use Haskell's lex" ++
  "\n    -lexer=codevars      like code, but treat unknown words as variables, ?? as meta " ++
  "\n    -lexer=textvars      like text, but treat unknown words as variables, ?? as meta " ++
  "\n    -lexer=text          with conventions on punctuation and capital letters" ++
  "\n    -lexer=codelit       like code, but treat unknown words as string literals" ++
  "\n    -lexer=textlit       like text, but treat unknown words as string literals" ++
@@ -184,8 +184,9 @@ optionsOfCommand co = case co of
  CTransformGrammar _ -> flags "printer"
  CConvertLatex _ -> none
  CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
-  CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob" 
+  CParse -> 
-                 "cat lang lexer parser number rawtrees"
+    both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob" 
         "cat lang lexer parser number rawtrees"
  CTranslate _ _ -> opts "cat lexer parser"
  CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
  CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"
@@ -450,6 +450,7 @@ customTokenizer =
  ,(strCI "chars",     const $ sg . map (tS . singleton))
  ,(strCI "code",      const $ sg . lexHaskell)
  ,(strCI "codevars",  \gr -> sg . (lexHaskellVar $ stateIsWord gr))
  ,(strCI "textvars",  \gr -> sg . (lexTextVar $ stateIsWord gr))
  ,(strCI "text",      const $ sg . lexText)
  ,(strCI "unglue",    \gr -> sg . map tS . decomposeWords (stateMorpho gr))
  ,(strCI "codelit",   \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
@@ -54,34 +54,9 @@ parseStringMsg os sg cat s = do
  return (ts, unlines $ reverse ss)
 parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
-
+parseStringC opts0 sg cat s 
---- to test peb's new parser 6/10/2003
+ | oElem (iOpt "old") opts0 ||
---- (obsoleted by "newer" below)
+   (not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do
 -- parseStringC opts0 sg cat s
 --  | oElem newParser opts0 = do  
 --   let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm
 --       ct = cfCat2Cat cat
 --   ts <- checkErr $ NewOld.newParser pm sg ct s
 --   mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
 -- to use peb's newer parser 7/4-05 
 parseStringC opts0 sg cat s
 | oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do  
  let opts      = unionOptions opts0 $ stateOptions sg
      algorithm | oElem newCParser opts0 = "c"
 		| oElem newMParser opts0 = "m"
 		| oElem newFParser opts0 = "f"
 		| otherwise              = "c" -- default algorithm
      strategy  = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
      tokenizer = customOrDefault opts useTokenizer customTokenizer sg
      toks = case tokenizer s of
               t:_ -> t
               _ -> [] ---- no support for undet. tok.
  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
  ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
  return $ optIntOrAll opts flagNumber ts'
 parseStringC opts0 sg cat s = do
  let opts = unionOptions opts0 $ stateOptions sg
      cf  = stateCF sg
      gr  = stateGrammarST sg
@@ -92,6 +67,26 @@ parseStringC opts0 sg cat s = do
    then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
    else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
 ---- | or [oElem p opts0 | 
 ----        p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do  
 | otherwise = do
  let opts      = unionOptions opts0 $ stateOptions sg
      algorithm | oElem newCParser opts0 = "c"
 		| oElem newMParser opts0 = "m"
 		| oElem newFParser opts0 = "f"
 		| otherwise              = "f" -- default algorithm: FCFG
      strategy  = maybe "bottomup" id $ getOptVal opts useParser 
                      -- -parser=bottomup/topdown
      tokenizer = customOrDefault opts useTokenizer customTokenizer sg
      toks = case tokenizer s of
               t:_ -> t
               _ -> [] ---- no support for undet. tok.
  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
  ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
  return $ optIntOrAll opts flagNumber ts'
 tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
 tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
    where result = parser toks
@@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords,
 		  lexHaskellLiteral,
 		  lexHaskellVar,
 		  lexText,
 		  lexTextVar,
 		  lexC2M, lexC2M',
 		  lexTextLiteral,
                  lexIgnore,
@@ -58,6 +59,10 @@ isFloat s = case s of
  '.':cs@(_:_)     -> all isDigit cs
  _ -> False
 -- A token is a quoted string if it has at least two characters and its
 -- first and last characters are the same quote mark (either ' or ").
 isString (open : rest@(_:_)) = open `elem` "'\"" && last rest == open
 isString _ = False
 mkCFTok :: String -> CFTok
 mkCFTok s = case s of
@@ -86,6 +91,7 @@ mkLit s
  | all isDigit s = tI s
  | otherwise = tL s
 -- obsolete
 mkTL :: String -> CFTok
 mkTL s 
  | isFloat s = tF s
@@ -104,6 +110,7 @@ lexText :: String -> [CFTok]
 lexText = uncap . lx where
  lx s = case s of
    '?':'?':cs          -> tS "??" : lx cs
    p : cs | isMPunct p -> tS [p] : uncap (lx cs)
    p : cs | isPunct p  -> tS [p] : lx cs
    s : cs | isSpace s  -> lx cs
@@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where
    | isFloat s = tF s
    | all isDigit s = tI s 
    | otherwise = tL s
-  mkOne t@(TC s) = if isKnown s then t else mkTL s
+  mkOne t@(TC s) = if isKnown s then t else mkLit s
  mkOne t        = t
 unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
@@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where
  mkOne t@(TS s) 
    | isKnown s = t
    | isFloat s = tF s
    | isString s = tL (init (tail s))
    | all isDigit s = tI s 
    | otherwise = tV s
  mkOne t@(TC s) = if isKnown s then t else tV s
@@ -197,6 +205,8 @@ lexTextLiteral    isKnown = unknown2string (eitherUpper isKnown) . lexText
 -- Lexer variants that post-process the token list of a base lexer
 -- (lexHaskell or lexText) with unknown2string or unknown2var.
 -- The text-based variant checks words through eitherUpper isKnown.
 lexHaskellLiteral isKnown str = unknown2string isKnown (lexHaskell str)
 lexHaskellVar     isKnown str = unknown2var isKnown (lexHaskell str)
 lexTextVar        isKnown str = unknown2var (eitherUpper isKnown) (lexText str)
 -- Accepts a non-empty word if it is known with its first character in
 -- lower case or in upper case (tried in that order); the empty word is
 -- passed to the predicate unchanged.
 eitherUpper isKnown w = case w of
  c:cs -> any isKnown [toLower c : cs, toUpper c : cs]
  _    -> isKnown w
@@ -183,12 +183,12 @@ p,  parse: p String
      -fail      show strings whose parse fails prefixed by #FAIL
      -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
  options for selecting parsing method:
-      (default)parse using an overgenerating CFG
+      -fcfg    parse using a fast variant of MCFG (default if no HOAS in grammar)
      -old     parse using an overgenerating CFG (default if HOAS in grammar)
      -cfg     parse using a much less overgenerating CFG
      -mcfg    parse using an even less overgenerating MCFG
-      -fcfg    parse using a faster variant of MCFG
+      Note:    the first time parsing with -cfg, -mcfg, and -fcfg may take a long time
-      Note:    the first time parsing with -cfg, -mcfg, and -fcfg might take a long time
+  options that only work for the -old default parsing method:
  options that only work for the default parsing method:
      -n       non-strict: tolerates morphological errors
      -ign     ignore unknown words when parsing
      -raw     return context-free terms in raw form
@@ -565,6 +565,7 @@ q, quit: q
    -lexer=chars         each character is a token
    -lexer=code          use Haskell's lex
    -lexer=codevars      like code, but treat unknown words as variables, ?? as meta 
    -lexer=textvars      like text, but treat unknown words as variables, ?? as meta 
    -lexer=text          with conventions on punctuation and capital letters
    -lexer=codelit       like code, but treat unknown words as string literals
    -lexer=textlit       like text, but treat unknown words as string literals