From 5111762d31a50c4fccc8da02c7c327a2b51d0e77 Mon Sep 17 00:00:00 2001
From: aarne <unknown>
Date: Tue, 10 May 2005 11:49:13 +0000
Subject: [PATCH] modules for embedded GF

---
 doc/gf-index.html            |   3 +
 src/GF/API.hs                |  12 ++-
 src/GF/Embed/EmbedAPI.hs     |  78 ++++++++++++++++++++
 src/GF/Embed/EmbedCustom.hs  | 113 +++++++++++++++++++++++++++++
 src/GF/Embed/EmbedParsing.hs | 137 +++++++++++++++++++++++++++++++++++
 src/GF/Grammar/MMacros.hs    |  26 ++++++-
 6 files changed, 363 insertions(+), 6 deletions(-)
 create mode 100644 src/GF/Embed/EmbedAPI.hs
 create mode 100644 src/GF/Embed/EmbedCustom.hs
 create mode 100644 src/GF/Embed/EmbedParsing.hs
diff --git a/doc/gf-index.html b/doc/gf-index.html
index ae5511f91..340f64f44 100644
--- a/doc/gf-index.html
+++ b/doc/gf-index.html
@@ -18,11 +18,14 @@ November 8, 2004.
 
 </p><h2>News</h2>
 
+<!--
 <i>May 9, 2005</i>. Version 2.2 coming soon. Here is a 
 <a href="download-2.2/GF-2.2.tgz">preliminary source release</a>.
 Here are the <a href="doc/gf2.2-highlights.html">highlights</a>.
 
+
 <p>
+-->
 
 <i>May 9, 2005</i>.
 PhD Thesis by
diff --git a/src/GF/API.hs b/src/GF/API.hs
index 64572e041..3b252f4f3 100644
--- a/src/GF/API.hs
+++ b/src/GF/API.hs
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
--- > CVS $Date: 2005/04/21 16:45:57 $ 
--- > CVS $Author: bringert $
--- > CVS $Revision: 1.33 $
+-- > CVS $Date: 2005/05/10 12:49:13 $ 
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.34 $
 --
 -- Application Programmer's Interface to GF; also used by Shell. AR 10/11/2001
 -----------------------------------------------------------------------------
@@ -88,6 +88,12 @@ linearize :: GFGrammar -> Tree -> String
 linearize sgr = err id id . optLinearizeTree opts sgr where
   opts = addOption firstLin $ stateOptions sgr
 
+term2tree :: GFGrammar -> G.Term -> Tree
+term2tree gr = errVal uTree . annotate (grammar gr) . qualifTerm (absId gr)
+
+tree2term :: Tree -> G.Term
+tree2term = tree2exp
+
 linearizeToAll :: [GFGrammar] -> Tree -> [String]
 linearizeToAll grs t = [linearize gr t | gr <- grs]
 
diff --git a/src/GF/Embed/EmbedAPI.hs b/src/GF/Embed/EmbedAPI.hs
new file mode 100644
index 000000000..197c24de2
--- /dev/null
+++ b/src/GF/Embed/EmbedAPI.hs
@@ -0,0 +1,78 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : EmbedAPI
+-- Maintainer  : Aarne Ranta
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 
+-- > CVS $Author: 
+-- > CVS $Revision: 
+--
+-- Reduced Application Programmer's Interface to GF, meant for
+-- embedded GF systems. AR 10/5/2005
+-----------------------------------------------------------------------------
+
+module GF.Embed.EmbedAPI where
+
+import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar)
+import GF.UseGrammar.Linear (linTree2string)
+import GF.Embed.EmbedParsing (parseString)
+import GF.Canon.CMacros (noMark)
+import GF.Grammar.Grammar (Trm)
+import GF.Grammar.MMacros (exp2tree)
+import GF.Grammar.Macros (zIdent)
+import GF.Grammar.Values (tree2exp)
+import GF.Grammar.TypeCheck (annotate)
+import GF.Canon.GetGFC (getCanonGrammar)
+import GF.Infra.Modules (emptyMGrammar)
+import GF.CF.CFIdent (string2CFCat)
+import GF.Infra.UseIO
+import GF.Data.Operations
+import GF.Infra.Option (noOptions,useUntokenizer)
+import GF.Infra.Ident (prIdent)
+import GF.Embed.EmbedCustom
+
+-- This API is meant to be used when embedding GF grammars in Haskell 
+-- programs. The embedded system is supposed to use the
+-- .gfcm grammar format, which is first produced by the gf program.
+
+---------------------------------------------------
+-- Interface
+---------------------------------------------------
+
+type MultiGrammar = ShellState
+type Language     = String
+type Category     = String
+type Tree         = Trm
+
+file2grammar :: FilePath -> IO MultiGrammar
+linearize    :: MultiGrammar -> Language -> Tree -> String
+parse        :: MultiGrammar -> Language -> Category -> String -> [Tree]
+
+---------------------------------------------------
+-- Implementation
+---------------------------------------------------
+
+file2grammar file = do
+  can <- useIOE (error "cannot parse grammar file") $ getCanonGrammar file
+  return $ errVal (error "cannot build multigrammar") $ grammar2shellState noOptions (can,emptyMGrammar) 
+
+linearize mgr lang = 
+  untok .
+  linTree2string noMark (canModules mgr) (zIdent lang) . 
+  errVal (error "illegal tree") . 
+  annotate gr
+ where
+   gr    = grammar sgr
+   sgr   = stateGrammarOfLang mgr (zIdent lang)
+   untok = customOrDefault noOptions useUntokenizer customUntokenizer sgr
+
+parse mgr lang cat = 
+  map tree2exp . 
+  errVal [] . 
+  parseString noOptions sgr cfcat
+ where
+   sgr   = stateGrammarOfLang mgr (zIdent lang)
+   cfcat = string2CFCat abs cat
+   abs   = maybe (error "no abstract syntax") prIdent $ abstract mgr
diff --git a/src/GF/Embed/EmbedCustom.hs b/src/GF/Embed/EmbedCustom.hs
new file mode 100644
index 000000000..f315441c5
--- /dev/null
+++ b/src/GF/Embed/EmbedCustom.hs
@@ -0,0 +1,113 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : EmbedCustom
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 
+-- > CVS $Author: 
+-- > CVS $Revision: 
+--
+-- A database for customizable lexers and unlexers. Reduced version of
+-- GF.API, intended for embedded GF grammars.
+
+-----------------------------------------------------------------------------
+
+module GF.Embed.EmbedCustom where
+
+import GF.Data.Operations
+import GF.Text.Text
+import GF.UseGrammar.Tokenize
+import GF.UseGrammar.Morphology
+import GF.Infra.Option
+import GF.CF.CFIdent
+import GF.Compile.ShellState
+import Data.Char
+
+-- | useTokenizer, \"-lexer=x\"
+customTokenizer      :: CustomData (StateGrammar -> String -> [CFTok])  
+
+-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
+customUntokenizer    :: CustomData (StateGrammar -> String -> String)  
+
+-- | this is the way of selecting an item
+customOrDefault :: Options -> OptFun -> CustomData a -> a
+customOrDefault opts optfun db = maybe (defaultCustomVal db) id $ 
+                                   customAsOptVal opts optfun db
+
+-- | to produce menus of custom operations
+customInfo :: CustomData a -> (String, [String])
+customInfo c = (titleCustomData c, map (ciStr . fst) (dbCustomData c))
+
+type CommandId = String
+
+strCI :: String -> CommandId
+strCI = id
+
+ciStr :: CommandId -> String
+ciStr = id
+
+ciOpt :: CommandId -> Option
+ciOpt = iOpt
+
+newtype CustomData a = CustomData (String, [(CommandId,a)])
+
+customData :: String -> [(CommandId, a)] -> CustomData a
+customData title db = CustomData (title,db)
+
+dbCustomData :: CustomData a -> [(CommandId, a)]
+dbCustomData (CustomData (_,db)) = db
+
+titleCustomData :: CustomData a -> String
+titleCustomData (CustomData (t,_)) = t
+
+lookupCustom :: CustomData a -> CommandId -> Maybe a
+lookupCustom = flip lookup . dbCustomData
+
+customAsOptVal :: Options -> OptFun -> CustomData a -> Maybe a
+customAsOptVal opts optfun db = do
+  arg <- getOptVal opts optfun
+  lookupCustom db (strCI arg)
+
+-- | take the first entry from the database
+defaultCustomVal :: CustomData a -> a
+defaultCustomVal (CustomData (s,db)) = 
+  ifNull (error ("empty database:" +++ s)) (snd . head) db
+
+customTokenizer = 
+  customData "Tokenizers, selected by option -lexer=x" $
+  [
+   (strCI "words",     const $ tokWords)
+  ,(strCI "literals",  const $ tokLits)
+  ,(strCI "vars",      const $ tokVars)
+  ,(strCI "chars",     const $ map (tS . singleton))
+  ,(strCI "code",      const $ lexHaskell)
+  ,(strCI "codevars",  lexHaskellVar . stateIsWord)
+  ,(strCI "text",      const $ lexText)
+  ,(strCI "unglue",    \gr -> map tS . decomposeWords (stateMorpho gr))
+  ,(strCI "codelit",   lexHaskellLiteral . stateIsWord)
+  ,(strCI "textlit",   lexTextLiteral . stateIsWord)
+  ,(strCI "codeC",     const $ lexC2M)
+  ,(strCI "codeCHigh", const $ lexC2M' True)
+-- add your own tokenizers here
+  ]
+
+customUntokenizer = 
+  customData "Untokenizers, selected by option -unlexer=x" $
+  [
+   (strCI "unwords",   const $ id)   -- DEFAULT
+  ,(strCI "text",      const $ formatAsText)
+  ,(strCI "html",      const $ formatAsHTML)
+  ,(strCI "latex",     const $ formatAsLatex)
+  ,(strCI "code",      const $ formatAsCode)
+  ,(strCI "concat",    const $ filter (not . isSpace))
+  ,(strCI "textlit",   const $ formatAsTextLit)
+  ,(strCI "codelit",   const $ formatAsCodeLit)
+  ,(strCI "concat",    const $ concatRemSpace)
+  ,(strCI "glue",      const $ performBinds)
+  ,(strCI "reverse",   const $ reverse)
+  ,(strCI "bind",      const $ performBinds) -- backward compat
+-- add your own untokenizers here
+  ]
+
diff --git a/src/GF/Embed/EmbedParsing.hs b/src/GF/Embed/EmbedParsing.hs
new file mode 100644
index 000000000..485fa2379
--- /dev/null
+++ b/src/GF/Embed/EmbedParsing.hs
@@ -0,0 +1,137 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : EmbedParsing
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 
+-- > CVS $Author: 
+-- > CVS $Revision: 
+--
+-- just one parse method, for use in embedded GF systems 
+-----------------------------------------------------------------------------
+
+module GF.Embed.EmbedParsing where
+
+import GF.Infra.CheckM
+import qualified GF.Canon.AbsGFC as C
+import GF.Canon.GFC
+import GF.Canon.MkGFC (trExp) ----
+import GF.Canon.CMacros
+import GF.Grammar.MMacros (refreshMetas)
+import GF.UseGrammar.Linear
+import GF.Data.Str
+import GF.CF.CF
+import GF.CF.CFIdent
+import GF.Infra.Ident
+import GF.Grammar.TypeCheck
+import GF.Grammar.Values
+import GF.UseGrammar.Tokenize
+import GF.CF.Profile
+import GF.Infra.Option
+import GF.Compile.ShellState
+import GF.Embed.EmbedCustom
+import GF.CF.PPrCF (prCFTree)
+
+import qualified GF.OldParsing.ParseCF as PCFOld -- OBSOLETE
+
+
+-- import qualified GF.Parsing.GFC as New
+
+import GF.Data.Operations
+
+import Data.List (nub)
+import Control.Monad (liftM)
+
+-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
+
+parseString :: Options -> StateGrammar -> CFCat -> String -> Err [Tree]
+parseString os sg cat = liftM fst . parseStringMsg os sg cat
+
+parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
+parseStringMsg os sg cat s = do
+  (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
+  return (ts,unlines ss)
+
+parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
+parseStringC opts0 sg cat s
+
+ | otherwise = do
+  let opts = unionOptions opts0 $ stateOptions sg
+      cf  = stateCF sg
+      gr  = stateGrammarST sg
+      cn  = cncId sg
+      tok = customOrDefault opts useTokenizer customTokenizer sg
+      parser = PCFOld.parse "ibn" (stateCF sg) cat -- customOrDefault opts useParser customParser sg cat
+  tokens2trms opts sg cn parser (tok s)
+
+tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
+tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
+    where result = parser toks
+	  info   = snd result
+	  trees  = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
+
+trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
+trees2trms opts sg cn as ts0 info = do
+  ts  <- case () of
+    _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
+    _ | raw      -> do
+      ts1 <- return (map cf2trm0 ts0) ----- should not need annot
+      checks [
+         mapM (checkErr . (annotate gr) . trExp) ts1 ---- complicated, often fails
+        ,checkWarn (unlines ("Raw CF trees:":(map prCFTree ts0))) >> return []
+        ]
+    _ -> do
+      let num = optIntOrN opts flagRawtrees 99999
+      let (ts01,rest) = splitAt num ts0
+      if null rest then return () 
+         else checkWarn ("Warning: only" +++ show num +++ "raw parses out of" +++ 
+                          show (length ts0) +++ 
+                          "considered; use -rawtrees=<Int> to see more"
+                     )
+      (ts1,ss) <- checkErr $ mapErrN 1 postParse ts01
+      if null ts1 then raise ss else return ()
+      ts2 <- mapM (checkErr . annotate gr . refreshMetas [] . trExp) ts1 ---- 
+      if forgive then return ts2 else do
+        let tsss = [(t, allLinsOfTree gr cn t) | t <- ts2]
+            ps = [t | (t,ss) <- tsss, 
+                      any (compatToks as) (map str2cftoks ss)]
+        if null ps 
+           then raise $ "Failure in morphology." ++
+                  if verb 
+                     then "\nPossible corrections: " +++++
+                          unlines (nub (map sstr (concatMap snd tsss)))
+                     else ""
+           else return ps
+
+  if verb 
+     then checkWarn $ " the token list" +++ show as ++++ unknown as +++++ info
+     else return ()
+
+  return $ optIntOrAll opts flagNumber $ nub ts
+ where
+   gr  = stateGrammarST sg
+
+   raw     = oElem rawParse opts
+   verb    = oElem beVerbose opts
+   forgive = oElem forgiveParse opts
+
+   unknown ts = case filter noMatch [t | t@(TS _) <- ts] of
+     [] -> "where all words are known"
+     us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals
+   terminals = map TS $ stateGrammarWords sg
+   noMatch t = all (not . compatTok t) terminals 
+     
+
+--- too much type checking in building term info? return FullTerm to save work?
+
+-- | raw parsing: so simple it is for a context-free CF grammar
+cf2trm0 :: CFTree -> C.Exp
+cf2trm0 (CFTree (fun, (_, trees))) = mkAppAtom (cffun2trm fun) (map cf2trm0 trees)
+ where
+   cffun2trm (CFFun (fun,_)) = fun
+   mkApp = foldl C.EApp
+   mkAppAtom a = mkApp (C.EAtom a)
+
+
diff --git a/src/GF/Grammar/MMacros.hs b/src/GF/Grammar/MMacros.hs
index d4d1fa6e7..8370e102a 100644
--- a/src/GF/Grammar/MMacros.hs
+++ b/src/GF/Grammar/MMacros.hs
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
--- > CVS $Date: 2005/04/21 16:22:24 $ 
--- > CVS $Author: bringert $
--- > CVS $Revision: 1.8 $
+-- > CVS $Date: 2005/05/10 12:49:13 $ 
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.9 $
 --
 -- some more abstractions on grammars, esp. for Edit
 -----------------------------------------------------------------------------
@@ -318,3 +318,23 @@ reindexTerm = qualif (0,[]) where
     _ -> composSafeOp (qualif dg) t
   look x  = maybe x id . lookup x --- if x is not in scope it is unchanged
   ind x d = identC $ prIdent x ++ "_" ++ show d
+
+
+-- this method works for context-free abstract syntax
+-- and is meant to be used in simple embedded GF applications
+
+exp2tree :: Exp -> Err Tree
+exp2tree e = do
+  (bs,f,xs) <- termForm e
+  cont <- case bs of
+    [] -> return []
+    _  -> prtBad "cannot convert bindings in" e
+  at <- case f of
+    Q  m c -> return $ AtC (m,c)
+    QC m c -> return $ AtC (m,c)
+    Meta m -> return $ AtM m
+    K s    -> return $ AtL s
+    EInt n -> return $ AtI n
+    _ -> prtBad "cannot convert to atom" f
+  ts <- mapM exp2tree xs
+  return $ Tr (N (cont,at,uVal,([],[]),True),ts)