From 5111762d31a50c4fccc8da02c7c327a2b51d0e77 Mon Sep 17 00:00:00 2001 From: aarne Date: Tue, 10 May 2005 11:49:13 +0000 Subject: [PATCH] modules for embedded GF --- doc/gf-index.html | 3 + src/GF/API.hs | 12 ++- src/GF/Embed/EmbedAPI.hs | 78 ++++++++++++++++++++ src/GF/Embed/EmbedCustom.hs | 113 +++++++++++++++++++++++++++++ src/GF/Embed/EmbedParsing.hs | 137 +++++++++++++++++++++++++++++++++++ src/GF/Grammar/MMacros.hs | 26 ++++++- 6 files changed, 363 insertions(+), 6 deletions(-) create mode 100644 src/GF/Embed/EmbedAPI.hs create mode 100644 src/GF/Embed/EmbedCustom.hs create mode 100644 src/GF/Embed/EmbedParsing.hs diff --git a/doc/gf-index.html b/doc/gf-index.html index ae5511f91..340f64f44 100644 --- a/doc/gf-index.html +++ b/doc/gf-index.html @@ -18,11 +18,14 @@ November 8, 2004.

News

+ May 9, 2005. PhD Thesis by diff --git a/src/GF/API.hs b/src/GF/API.hs index 64572e041..3b252f4f3 100644 --- a/src/GF/API.hs +++ b/src/GF/API.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/04/21 16:45:57 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.33 $ +-- > CVS $Date: 2005/05/10 12:49:13 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.34 $ -- -- Application Programmer's Interface to GF; also used by Shell. AR 10/11/2001 ----------------------------------------------------------------------------- @@ -88,6 +88,12 @@ linearize :: GFGrammar -> Tree -> String linearize sgr = err id id . optLinearizeTree opts sgr where opts = addOption firstLin $ stateOptions sgr +term2tree :: GFGrammar -> G.Term -> Tree +term2tree gr = errVal uTree . annotate (grammar gr) . qualifTerm (absId gr) + +tree2term :: Tree -> G.Term +tree2term = tree2exp + linearizeToAll :: [GFGrammar] -> Tree -> [String] linearizeToAll grs t = [linearize gr t | gr <- grs] diff --git a/src/GF/Embed/EmbedAPI.hs b/src/GF/Embed/EmbedAPI.hs new file mode 100644 index 000000000..197c24de2 --- /dev/null +++ b/src/GF/Embed/EmbedAPI.hs @@ -0,0 +1,78 @@ +---------------------------------------------------------------------- +-- | +-- Module : EmbedAPI +-- Maintainer : Aarne Ranta +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: +-- > CVS $Author: +-- > CVS $Revision: +-- +-- Reduced Application Programmer's Interface to GF, meant for +-- embedded GF systems. AR 10/5/2005 +----------------------------------------------------------------------------- + +module GF.Embed.EmbedAPI where + +import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar) +import GF.UseGrammar.Linear (linTree2string) +import GF.Embed.EmbedParsing (parseString) +import GF.Canon.CMacros (noMark) +import GF.Grammar.Grammar (Trm) +import GF.Grammar.MMacros (exp2tree) +import GF.Grammar.Macros (zIdent) +import GF.Grammar.Values (tree2exp) +import GF.Grammar.TypeCheck (annotate) +import GF.Canon.GetGFC (getCanonGrammar) +import GF.Infra.Modules (emptyMGrammar) +import GF.CF.CFIdent (string2CFCat) +import GF.Infra.UseIO +import GF.Data.Operations +import GF.Infra.Option (noOptions,useUntokenizer) +import GF.Infra.Ident (prIdent) +import GF.Embed.EmbedCustom + +-- This API is meant to be used when embedding GF grammars in Haskell +-- programs. The embedded system is supposed to use the +-- .gfcm grammar format, which is first produced by the gf program. + +--------------------------------------------------- +-- Interface +--------------------------------------------------- + +type MultiGrammar = ShellState +type Language = String +type Category = String +type Tree = Trm + +file2grammar :: FilePath -> IO MultiGrammar +linearize :: MultiGrammar -> Language -> Tree -> String +parse :: MultiGrammar -> Language -> Category -> String -> [Tree] + +--------------------------------------------------- +-- Implementation +--------------------------------------------------- + +file2grammar file = do + can <- useIOE (error "cannot parse grammar file") $ getCanonGrammar file + return $ errVal (error "cannot build multigrammar") $ grammar2shellState noOptions (can,emptyMGrammar) + +linearize mgr lang = + untok . + linTree2string noMark (canModules mgr) (zIdent lang) . + errVal (error "illegal tree") . + annotate gr + where + gr = grammar sgr + sgr = stateGrammarOfLang mgr (zIdent lang) + untok = customOrDefault noOptions useUntokenizer customUntokenizer sgr + +parse mgr lang cat = + map tree2exp . + errVal [] . + parseString noOptions sgr cfcat + where + sgr = stateGrammarOfLang mgr (zIdent lang) + cfcat = string2CFCat abs cat + abs = maybe (error "no abstract syntax") prIdent $ abstract mgr diff --git a/src/GF/Embed/EmbedCustom.hs b/src/GF/Embed/EmbedCustom.hs new file mode 100644 index 000000000..f315441c5 --- /dev/null +++ b/src/GF/Embed/EmbedCustom.hs @@ -0,0 +1,113 @@ +---------------------------------------------------------------------- +-- | +-- Module : EmbedCustom +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: +-- > CVS $Author: +-- > CVS $Revision: +-- +-- A database for customizable lexers and unlexers. Reduced version of +-- GF.API, intended for embedded GF grammars. + +----------------------------------------------------------------------------- + +module GF.Embed.EmbedCustom where + +import GF.Data.Operations +import GF.Text.Text +import GF.UseGrammar.Tokenize +import GF.UseGrammar.Morphology +import GF.Infra.Option +import GF.CF.CFIdent +import GF.Compile.ShellState +import Data.Char + +-- | useTokenizer, \"-lexer=x\" +customTokenizer :: CustomData (StateGrammar -> String -> [CFTok]) + +-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string +customUntokenizer :: CustomData (StateGrammar -> String -> String) + +-- | this is the way of selecting an item +customOrDefault :: Options -> OptFun -> CustomData a -> a +customOrDefault opts optfun db = maybe (defaultCustomVal db) id $ + customAsOptVal opts optfun db + +-- | to produce menus of custom operations +customInfo :: CustomData a -> (String, [String]) +customInfo c = (titleCustomData c, map (ciStr . fst) (dbCustomData c)) + +type CommandId = String + +strCI :: String -> CommandId +strCI = id + +ciStr :: CommandId -> String +ciStr = id + +ciOpt :: CommandId -> Option +ciOpt = iOpt + +newtype CustomData a = CustomData (String, [(CommandId,a)]) + +customData :: String -> [(CommandId, a)] -> CustomData a +customData title db = CustomData (title,db) + +dbCustomData :: CustomData a -> [(CommandId, a)] +dbCustomData (CustomData (_,db)) = db + +titleCustomData :: CustomData a -> String +titleCustomData (CustomData (t,_)) = t + +lookupCustom :: CustomData a -> CommandId -> Maybe a +lookupCustom = flip lookup . dbCustomData + +customAsOptVal :: Options -> OptFun -> CustomData a -> Maybe a +customAsOptVal opts optfun db = do + arg <- getOptVal opts optfun + lookupCustom db (strCI arg) + +-- | take the first entry from the database +defaultCustomVal :: CustomData a -> a +defaultCustomVal (CustomData (s,db)) = + ifNull (error ("empty database:" +++ s)) (snd . head) db + +customTokenizer = + customData "Tokenizers, selected by option -lexer=x" $ + [ + (strCI "words", const $ tokWords) + ,(strCI "literals", const $ tokLits) + ,(strCI "vars", const $ tokVars) + ,(strCI "chars", const $ map (tS . singleton)) + ,(strCI "code", const $ lexHaskell) + ,(strCI "codevars", lexHaskellVar . stateIsWord) + ,(strCI "text", const $ lexText) + ,(strCI "unglue", \gr -> map tS . decomposeWords (stateMorpho gr)) + ,(strCI "codelit", lexHaskellLiteral . stateIsWord) + ,(strCI "textlit", lexTextLiteral . stateIsWord) + ,(strCI "codeC", const $ lexC2M) + ,(strCI "codeCHigh", const $ lexC2M' True) +-- add your own tokenizers here + ] + +customUntokenizer = + customData "Untokenizers, selected by option -unlexer=x" $ + [ + (strCI "unwords", const $ id) -- DEFAULT + ,(strCI "text", const $ formatAsText) + ,(strCI "html", const $ formatAsHTML) + ,(strCI "latex", const $ formatAsLatex) + ,(strCI "code", const $ formatAsCode) + ,(strCI "concat", const $ filter (not . isSpace)) + ,(strCI "textlit", const $ formatAsTextLit) + ,(strCI "codelit", const $ formatAsCodeLit) + ,(strCI "concat", const $ concatRemSpace) + ,(strCI "glue", const $ performBinds) + ,(strCI "reverse", const $ reverse) + ,(strCI "bind", const $ performBinds) -- backward compat +-- add your own untokenizers here + ] + diff --git a/src/GF/Embed/EmbedParsing.hs b/src/GF/Embed/EmbedParsing.hs new file mode 100644 index 000000000..485fa2379 --- /dev/null +++ b/src/GF/Embed/EmbedParsing.hs @@ -0,0 +1,137 @@ +---------------------------------------------------------------------- +-- | +-- Module : EmbedParsing +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: +-- > CVS $Author: +-- > CVS $Revision: +-- +-- just one parse method, for use in embedded GF systems +----------------------------------------------------------------------------- + +module GF.Embed.EmbedParsing where + +import GF.Infra.CheckM +import qualified GF.Canon.AbsGFC as C +import GF.Canon.GFC +import GF.Canon.MkGFC (trExp) ---- +import GF.Canon.CMacros +import GF.Grammar.MMacros (refreshMetas) +import GF.UseGrammar.Linear +import GF.Data.Str +import GF.CF.CF +import GF.CF.CFIdent +import GF.Infra.Ident +import GF.Grammar.TypeCheck +import GF.Grammar.Values +import GF.UseGrammar.Tokenize +import GF.CF.Profile +import GF.Infra.Option +import GF.Compile.ShellState +import GF.Embed.EmbedCustom +import GF.CF.PPrCF (prCFTree) + +import qualified GF.OldParsing.ParseCF as PCFOld -- OBSOLETE + + +-- import qualified GF.Parsing.GFC as New + +import GF.Data.Operations + +import Data.List (nub) +import Control.Monad (liftM) + +-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 + +parseString :: Options -> StateGrammar -> CFCat -> String -> Err [Tree] +parseString os sg cat = liftM fst . parseStringMsg os sg cat + +parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String) +parseStringMsg os sg cat s = do + (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s + return (ts,unlines ss) + +parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree] +parseStringC opts0 sg cat s + + | otherwise = do + let opts = unionOptions opts0 $ stateOptions sg + cf = stateCF sg + gr = stateGrammarST sg + cn = cncId sg + tok = customOrDefault opts useTokenizer customTokenizer sg + parser = PCFOld.parse "ibn" (stateCF sg) cat -- customOrDefault opts useParser customParser sg cat + tokens2trms opts sg cn parser (tok s) + +tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] +tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info + where result = parser toks + info = snd result + trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2)) + +trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree] +trees2trms opts sg cn as ts0 info = do + ts <- case () of + _ | null ts0 -> checkWarn "No success in cf parsing" >> return [] + _ | raw -> do + ts1 <- return (map cf2trm0 ts0) ----- should not need annot + checks [ + mapM (checkErr . (annotate gr) . trExp) ts1 ---- complicated, often fails + ,checkWarn (unlines ("Raw CF trees:":(map prCFTree ts0))) >> return [] + ] + _ -> do + let num = optIntOrN opts flagRawtrees 99999 + let (ts01,rest) = splitAt num ts0 + if null rest then return () + else checkWarn ("Warning: only" +++ show num +++ "raw parses out of" +++ + show (length ts0) +++ + "considered; use -rawtrees= to see more" + ) + (ts1,ss) <- checkErr $ mapErrN 1 postParse ts01 + if null ts1 then raise ss else return () + ts2 <- mapM (checkErr . annotate gr . refreshMetas [] . trExp) ts1 ---- + if forgive then return ts2 else do + let tsss = [(t, allLinsOfTree gr cn t) | t <- ts2] + ps = [t | (t,ss) <- tsss, + any (compatToks as) (map str2cftoks ss)] + if null ps + then raise $ "Failure in morphology." ++ + if verb + then "\nPossible corrections: " +++++ + unlines (nub (map sstr (concatMap snd tsss))) + else "" + else return ps + + if verb + then checkWarn $ " the token list" +++ show as ++++ unknown as +++++ info + else return () + + return $ optIntOrAll opts flagNumber $ nub ts + where + gr = stateGrammarST sg + + raw = oElem rawParse opts + verb = oElem beVerbose opts + forgive = oElem forgiveParse opts + + unknown ts = case filter noMatch [t | t@(TS _) <- ts] of + [] -> "where all words are known" + us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals + terminals = map TS $ stateGrammarWords sg + noMatch t = all (not . compatTok t) terminals + + +--- too much type checking in building term info? return FullTerm to save work? + +-- | raw parsing: so simple it is for a context-free CF grammar +cf2trm0 :: CFTree -> C.Exp +cf2trm0 (CFTree (fun, (_, trees))) = mkAppAtom (cffun2trm fun) (map cf2trm0 trees) + where + cffun2trm (CFFun (fun,_)) = fun + mkApp = foldl C.EApp + mkAppAtom a = mkApp (C.EAtom a) + + diff --git a/src/GF/Grammar/MMacros.hs b/src/GF/Grammar/MMacros.hs index d4d1fa6e7..8370e102a 100644 --- a/src/GF/Grammar/MMacros.hs +++ b/src/GF/Grammar/MMacros.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/04/21 16:22:24 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.8 $ +-- > CVS $Date: 2005/05/10 12:49:13 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.9 $ -- -- some more abstractions on grammars, esp. for Edit ----------------------------------------------------------------------------- @@ -318,3 +318,23 @@ reindexTerm = qualif (0,[]) where _ -> composSafeOp (qualif dg) t look x = maybe x id . lookup x --- if x is not in scope it is unchanged ind x d = identC $ prIdent x ++ "_" ++ show d + + +-- this method works for context-free abstract syntax +-- and is meant to be used in simple embedded GF applications + +exp2tree :: Exp -> Err Tree +exp2tree e = do + (bs,f,xs) <- termForm e + cont <- case bs of + [] -> return [] + _ -> prtBad "cannot convert bindings in" e + at <- case f of + Q m c -> return $ AtC (m,c) + QC m c -> return $ AtC (m,c) + Meta m -> return $ AtM m + K s -> return $ AtL s + EInt n -> return $ AtI n + _ -> prtBad "cannot convert to atom" f + ts <- mapM exp2tree xs + return $ Tr (N (cont,at,uVal,([],[]),True),ts)