modules for embedded GF

2005-05-10 11:49:13 +00:00
parent b2357c9ddc
commit 8b0e66b45b
6 changed files with 363 additions and 6 deletions
--- a/doc/gf-index.html
+++ b/doc/gf-index.html
@@ -18,11 +18,14 @@ November 8, 2004.
 </p><h2>News</h2>
 <!--
 <i>May 9, 2005</i>. Version 2.2 coming soon. Here is a 
 <a href="download-2.2/GF-2.2.tgz">preliminary source release</a>.
 Here are the <a href="doc/gf2.2-highlights.html">highlights</a>.
 <p>
 -->
 <i>May 9, 2005</i>.
 PhD Thesis by
--- a/src/GF/API.hs
+++ b/src/GF/API.hs
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
-- > CVS $Date: 2005/04/21 16:45:57 $ 
+-- > CVS $Date: 2005/05/10 12:49:13 $ 
-- > CVS $Author: bringert $
+-- > CVS $Author: aarne $
-- > CVS $Revision: 1.33 $
+-- > CVS $Revision: 1.34 $
 --
 -- Application Programmer's Interface to GF; also used by Shell. AR 10/11/2001
 -----------------------------------------------------------------------------
@@ -88,6 +88,12 @@ linearize :: GFGrammar -> Tree -> String
 linearize sgr = err id id . optLinearizeTree opts sgr where
  opts = addOption firstLin $ stateOptions sgr
 term2tree :: GFGrammar -> G.Term -> Tree
 term2tree gr = errVal uTree . annotate (grammar gr) . qualifTerm (absId gr)
 tree2term :: Tree -> G.Term
 tree2term = tree2exp
 linearizeToAll :: [GFGrammar] -> Tree -> [String]
 linearizeToAll grs t = [linearize gr t | gr <- grs]
--- a/src/GF/Embed/EmbedAPI.hs
+++ b/src/GF/Embed/EmbedAPI.hs
@@ -0,0 +1,78 @@
 ----------------------------------------------------------------------
 -- |
 -- Module      : EmbedAPI
 -- Maintainer  : Aarne Ranta
 -- Stability   : (stable)
 -- Portability : (portable)
 --
 -- > CVS $Date: 
 -- > CVS $Author: 
 -- > CVS $Revision: 
 --
 -- Reduced Application Programmer's Interface to GF, meant for
 -- embedded GF systems. AR 10/5/2005
 -----------------------------------------------------------------------------
 module GF.Embed.EmbedAPI where
 import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar)
 import GF.UseGrammar.Linear (linTree2string)
 import GF.Embed.EmbedParsing (parseString)
 import GF.Canon.CMacros (noMark)
 import GF.Grammar.Grammar (Trm)
 import GF.Grammar.MMacros (exp2tree)
 import GF.Grammar.Macros (zIdent)
 import GF.Grammar.Values (tree2exp)
 import GF.Grammar.TypeCheck (annotate)
 import GF.Canon.GetGFC (getCanonGrammar)
 import GF.Infra.Modules (emptyMGrammar)
 import GF.CF.CFIdent (string2CFCat)
 import GF.Infra.UseIO
 import GF.Data.Operations
 import GF.Infra.Option (noOptions,useUntokenizer)
 import GF.Infra.Ident (prIdent)
 import GF.Embed.EmbedCustom
 -- This API is meant to be used when embedding GF grammars in Haskell 
 -- programs. The embedded system is supposed to use the
 -- .gfcm grammar format, which is first produced by the gf program.
 ---------------------------------------------------
 -- Interface
 ---------------------------------------------------
 type MultiGrammar = ShellState
 type Language     = String
 type Category     = String
 type Tree         = Trm
 file2grammar :: FilePath -> IO MultiGrammar
 linearize    :: MultiGrammar -> Language -> Tree -> String
 parse        :: MultiGrammar -> Language -> Category -> String -> [Tree]
 ---------------------------------------------------
 -- Implementation
 ---------------------------------------------------
 file2grammar file = do
  can <- useIOE (error "cannot parse grammar file") $ getCanonGrammar file
  return $ errVal (error "cannot build multigrammar") $ grammar2shellState noOptions (can,emptyMGrammar) 
 linearize mgr lang = 
  untok .
  linTree2string noMark (canModules mgr) (zIdent lang) . 
  errVal (error "illegal tree") . 
  annotate gr
 where
   gr    = grammar sgr
   sgr   = stateGrammarOfLang mgr (zIdent lang)
   untok = customOrDefault noOptions useUntokenizer customUntokenizer sgr
 parse mgr lang cat = 
  map tree2exp . 
  errVal [] . 
  parseString noOptions sgr cfcat
 where
   sgr   = stateGrammarOfLang mgr (zIdent lang)
   cfcat = string2CFCat abs cat
   abs   = maybe (error "no abstract syntax") prIdent $ abstract mgr
--- a/src/GF/Embed/EmbedCustom.hs
+++ b/src/GF/Embed/EmbedCustom.hs
@@ -0,0 +1,113 @@
 ----------------------------------------------------------------------
 -- |
 -- Module      : EmbedCustom
 -- Maintainer  : AR
 -- Stability   : (stable)
 -- Portability : (portable)
 --
 -- > CVS $Date: 
 -- > CVS $Author: 
 -- > CVS $Revision: 
 --
 -- A database for customizable lexers and unlexers. Reduced version of
 -- GF.API, intended for embedded GF grammars.
 -----------------------------------------------------------------------------
 module GF.Embed.EmbedCustom where
 import GF.Data.Operations
 import GF.Text.Text
 import GF.UseGrammar.Tokenize
 import GF.UseGrammar.Morphology
 import GF.Infra.Option
 import GF.CF.CFIdent
 import GF.Compile.ShellState
 import Data.Char
 -- | useTokenizer, \"-lexer=x\"
 customTokenizer      :: CustomData (StateGrammar -> String -> [CFTok])  
 -- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
 customUntokenizer    :: CustomData (StateGrammar -> String -> String)  
 -- | this is the way of selecting an item
 customOrDefault :: Options -> OptFun -> CustomData a -> a
 customOrDefault opts optfun db = maybe (defaultCustomVal db) id $ 
                                   customAsOptVal opts optfun db
 -- | to produce menus of custom operations
 customInfo :: CustomData a -> (String, [String])
 customInfo c = (titleCustomData c, map (ciStr . fst) (dbCustomData c))
 type CommandId = String
 strCI :: String -> CommandId
 strCI = id
 ciStr :: CommandId -> String
 ciStr = id
 ciOpt :: CommandId -> Option
 ciOpt = iOpt
 newtype CustomData a = CustomData (String, [(CommandId,a)])
 customData :: String -> [(CommandId, a)] -> CustomData a
 customData title db = CustomData (title,db)
 dbCustomData :: CustomData a -> [(CommandId, a)]
 dbCustomData (CustomData (_,db)) = db
 titleCustomData :: CustomData a -> String
 titleCustomData (CustomData (t,_)) = t
 lookupCustom :: CustomData a -> CommandId -> Maybe a
 lookupCustom = flip lookup . dbCustomData
 customAsOptVal :: Options -> OptFun -> CustomData a -> Maybe a
 customAsOptVal opts optfun db = do
  arg <- getOptVal opts optfun
  lookupCustom db (strCI arg)
 -- | take the first entry from the database
 defaultCustomVal :: CustomData a -> a
 defaultCustomVal (CustomData (s,db)) = 
  ifNull (error ("empty database:" +++ s)) (snd . head) db
 customTokenizer = 
  customData "Tokenizers, selected by option -lexer=x" $
  [
   (strCI "words",     const $ tokWords)
  ,(strCI "literals",  const $ tokLits)
  ,(strCI "vars",      const $ tokVars)
  ,(strCI "chars",     const $ map (tS . singleton))
  ,(strCI "code",      const $ lexHaskell)
  ,(strCI "codevars",  lexHaskellVar . stateIsWord)
  ,(strCI "text",      const $ lexText)
  ,(strCI "unglue",    \gr -> map tS . decomposeWords (stateMorpho gr))
  ,(strCI "codelit",   lexHaskellLiteral . stateIsWord)
  ,(strCI "textlit",   lexTextLiteral . stateIsWord)
  ,(strCI "codeC",     const $ lexC2M)
  ,(strCI "codeCHigh", const $ lexC2M' True)
 -- add your own tokenizers here
  ]
 customUntokenizer = 
  customData "Untokenizers, selected by option -unlexer=x" $
  [
   (strCI "unwords",   const $ id)   -- DEFAULT
  ,(strCI "text",      const $ formatAsText)
  ,(strCI "html",      const $ formatAsHTML)
  ,(strCI "latex",     const $ formatAsLatex)
  ,(strCI "code",      const $ formatAsCode)
  ,(strCI "concat",    const $ filter (not . isSpace))
  ,(strCI "textlit",   const $ formatAsTextLit)
  ,(strCI "codelit",   const $ formatAsCodeLit)
  ,(strCI "concat",    const $ concatRemSpace)
  ,(strCI "glue",      const $ performBinds)
  ,(strCI "reverse",   const $ reverse)
  ,(strCI "bind",      const $ performBinds) -- backward compat
 -- add your own untokenizers here
  ]
--- a/src/GF/Embed/EmbedParsing.hs
+++ b/src/GF/Embed/EmbedParsing.hs
@@ -0,0 +1,137 @@
 ----------------------------------------------------------------------
 -- |
 -- Module      : EmbedParsing
 -- Maintainer  : AR
 -- Stability   : (stable)
 -- Portability : (portable)
 --
 -- > CVS $Date: 
 -- > CVS $Author: 
 -- > CVS $Revision: 
 --
 -- just one parse method, for use in embedded GF systems 
 -----------------------------------------------------------------------------
 module GF.Embed.EmbedParsing where
 import GF.Infra.CheckM
 import qualified GF.Canon.AbsGFC as C
 import GF.Canon.GFC
 import GF.Canon.MkGFC (trExp) ----
 import GF.Canon.CMacros
 import GF.Grammar.MMacros (refreshMetas)
 import GF.UseGrammar.Linear
 import GF.Data.Str
 import GF.CF.CF
 import GF.CF.CFIdent
 import GF.Infra.Ident
 import GF.Grammar.TypeCheck
 import GF.Grammar.Values
 import GF.UseGrammar.Tokenize
 import GF.CF.Profile
 import GF.Infra.Option
 import GF.Compile.ShellState
 import GF.Embed.EmbedCustom
 import GF.CF.PPrCF (prCFTree)
 import qualified GF.OldParsing.ParseCF as PCFOld -- OBSOLETE
 -- import qualified GF.Parsing.GFC as New
 import GF.Data.Operations
 import Data.List (nub)
 import Control.Monad (liftM)
 -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
 parseString :: Options -> StateGrammar -> CFCat -> String -> Err [Tree]
 parseString os sg cat = liftM fst . parseStringMsg os sg cat
 parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
 parseStringMsg os sg cat s = do
  (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
  return (ts,unlines ss)
 parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
 parseStringC opts0 sg cat s
 | otherwise = do
  let opts = unionOptions opts0 $ stateOptions sg
      cf  = stateCF sg
      gr  = stateGrammarST sg
      cn  = cncId sg
      tok = customOrDefault opts useTokenizer customTokenizer sg
      parser = PCFOld.parse "ibn" (stateCF sg) cat -- customOrDefault opts useParser customParser sg cat
  tokens2trms opts sg cn parser (tok s)
 tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
 tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
    where result = parser toks
 	  info   = snd result
 	  trees  = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
 trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
 trees2trms opts sg cn as ts0 info = do
  ts  <- case () of
    _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
    _ | raw      -> do
      ts1 <- return (map cf2trm0 ts0) ----- should not need annot
      checks [
         mapM (checkErr . (annotate gr) . trExp) ts1 ---- complicated, often fails
        ,checkWarn (unlines ("Raw CF trees:":(map prCFTree ts0))) >> return []
        ]
    _ -> do
      let num = optIntOrN opts flagRawtrees 99999
      let (ts01,rest) = splitAt num ts0
      if null rest then return () 
         else checkWarn ("Warning: only" +++ show num +++ "raw parses out of" +++ 
                          show (length ts0) +++ 
                          "considered; use -rawtrees=<Int> to see more"
                     )
      (ts1,ss) <- checkErr $ mapErrN 1 postParse ts01
      if null ts1 then raise ss else return ()
      ts2 <- mapM (checkErr . annotate gr . refreshMetas [] . trExp) ts1 ---- 
      if forgive then return ts2 else do
        let tsss = [(t, allLinsOfTree gr cn t) | t <- ts2]
            ps = [t | (t,ss) <- tsss, 
                      any (compatToks as) (map str2cftoks ss)]
        if null ps 
           then raise $ "Failure in morphology." ++
                  if verb 
                     then "\nPossible corrections: " +++++
                          unlines (nub (map sstr (concatMap snd tsss)))
                     else ""
           else return ps
  if verb 
     then checkWarn $ " the token list" +++ show as ++++ unknown as +++++ info
     else return ()
  return $ optIntOrAll opts flagNumber $ nub ts
 where
   gr  = stateGrammarST sg
   raw     = oElem rawParse opts
   verb    = oElem beVerbose opts
   forgive = oElem forgiveParse opts
   unknown ts = case filter noMatch [t | t@(TS _) <- ts] of
     [] -> "where all words are known"
     us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals
   terminals = map TS $ stateGrammarWords sg
   noMatch t = all (not . compatTok t) terminals 
 --- too much type checking in building term info? return FullTerm to save work?
 -- | raw parsing: so simple it is for a context-free CF grammar
 cf2trm0 :: CFTree -> C.Exp
 cf2trm0 (CFTree (fun, (_, trees))) = mkAppAtom (cffun2trm fun) (map cf2trm0 trees)
 where
   cffun2trm (CFFun (fun,_)) = fun
   mkApp = foldl C.EApp
   mkAppAtom a = mkApp (C.EAtom a)
--- a/src/GF/Grammar/MMacros.hs
+++ b/src/GF/Grammar/MMacros.hs
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
-- > CVS $Date: 2005/04/21 16:22:24 $ 
+-- > CVS $Date: 2005/05/10 12:49:13 $ 
-- > CVS $Author: bringert $
+-- > CVS $Author: aarne $
-- > CVS $Revision: 1.8 $
+-- > CVS $Revision: 1.9 $
 --
 -- some more abstractions on grammars, esp. for Edit
 -----------------------------------------------------------------------------
@@ -318,3 +318,23 @@ reindexTerm = qualif (0,[]) where
    _ -> composSafeOp (qualif dg) t
  look x  = maybe x id . lookup x --- if x is not in scope it is unchanged
  ind x d = identC $ prIdent x ++ "_" ++ show d
 -- this method works for context-free abstract syntax
 -- and is meant to be used in simple embedded GF applications
 exp2tree :: Exp -> Err Tree
 exp2tree e = do
  (bs,f,xs) <- termForm e
  cont <- case bs of
    [] -> return []
    _  -> prtBad "cannot convert bindings in" e
  at <- case f of
    Q  m c -> return $ AtC (m,c)
    QC m c -> return $ AtC (m,c)
    Meta m -> return $ AtM m
    K s    -> return $ AtL s
    EInt n -> return $ AtI n
    _ -> prtBad "cannot convert to atom" f
  ts <- mapM exp2tree xs
  return $ Tr (N (cont,at,uVal,([],[]),True),ts)