GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3

This commit is contained in:
aarne
2008-05-21 09:26:44 +00:00
parent 915a1de717
commit 055c0d0d5a
536 changed files with 0 additions and 0 deletions

140
src-3.0/GF/GFCC/API.hs Normal file
View File

@@ -0,0 +1,140 @@
----------------------------------------------------------------------
-- |
-- Module : GFCCAPI
-- Maintainer : Aarne Ranta
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date:
-- > CVS $Author:
-- > CVS $Revision:
--
-- Reduced Application Programmer's Interface to GF, meant for
-- embedded GF systems. AR 19/9/2007
-----------------------------------------------------------------------------
module GF.GFCC.API where
import GF.GFCC.Linearize
import GF.GFCC.Generate
import GF.GFCC.Macros
import GF.GFCC.DataGFCC
import GF.GFCC.CId
import GF.GFCC.Raw.ConvertGFCC
import GF.GFCC.Raw.ParGFCCRaw
import GF.Command.PPrTree
import GF.Data.ErrM
import GF.Parsing.FCFG
--import GF.Data.Operations
--import GF.Infra.UseIO
import qualified Data.Map as Map
import System.Random (newStdGen)
import System.Directory (doesFileExist)
-- This API is meant to be used when embedding GF grammars in Haskell
-- programs. The embedded system is supposed to use the
-- .gfcc grammar format, which is first produced by the gf program.
---------------------------------------------------
-- Interface
---------------------------------------------------
data MultiGrammar = MultiGrammar {gfcc :: GFCC}
type Language = String
type Category = String
type Tree = Exp
file2grammar :: FilePath -> IO MultiGrammar
linearize :: MultiGrammar -> Language -> Tree -> String
parse :: MultiGrammar -> Language -> Category -> String -> [Tree]
linearizeAll :: MultiGrammar -> Tree -> [String]
linearizeAllLang :: MultiGrammar -> Tree -> [(Language,String)]
parseAll :: MultiGrammar -> Category -> String -> [[Tree]]
parseAllLang :: MultiGrammar -> Category -> String -> [(Language,[Tree])]
generateAll :: MultiGrammar -> Category -> [Tree]
generateRandom :: MultiGrammar -> Category -> IO [Tree]
generateAllDepth :: MultiGrammar -> Category -> Maybe Int -> [Tree]
readTree :: MultiGrammar -> String -> Tree
showTree :: Tree -> String
languages :: MultiGrammar -> [Language]
categories :: MultiGrammar -> [Category]
startCat :: MultiGrammar -> Category
---------------------------------------------------
-- Implementation
---------------------------------------------------
file2grammar f = do
gfcc <- file2gfcc f
return (MultiGrammar gfcc)
file2gfcc f = do
s <- readFileIf f
g <- parseGrammar s
return $ toGFCC g
linearize mgr lang = GF.GFCC.Linearize.linearize (gfcc mgr) (CId lang)
parse mgr lang cat s =
case lookParser (gfcc mgr) (CId lang) of
Nothing -> error "no parser"
Just pinfo -> case parseFCF "bottomup" pinfo (CId cat) (words s) of
Ok x -> x
Bad s -> error s
linearizeAll mgr = map snd . linearizeAllLang mgr
linearizeAllLang mgr t =
[(lang,linearThis mgr lang t) | lang <- languages mgr]
parseAll mgr cat = map snd . parseAllLang mgr cat
parseAllLang mgr cat s =
[(lang,ts) | lang <- languages mgr, let ts = parse mgr lang cat s, not (null ts)]
generateRandom mgr cat = do
gen <- newStdGen
return $ genRandom gen (gfcc mgr) (CId cat)
generateAll mgr cat = generate (gfcc mgr) (CId cat) Nothing
generateAllDepth mgr cat = generate (gfcc mgr) (CId cat)
readTree _ = pTree
showTree = prExp
prIdent :: CId -> String
prIdent (CId s) = s
abstractName mgr = prIdent (absname (gfcc mgr))
languages mgr = [l | CId l <- cncnames (gfcc mgr)]
categories mgr = [c | CId c <- Map.keys (cats (abstract (gfcc mgr)))]
startCat mgr = lookStartCat (gfcc mgr)
emptyMultiGrammar = MultiGrammar emptyGFCC
------------ for internal use only
linearThis = GF.GFCC.API.linearize
err f g ex = case ex of
Ok x -> g x
Bad s -> f s
readFileIf f = do
b <- doesFileExist f
if b then readFile f
else putStrLn ("file " ++ f ++ " not found") >> return ""

14
src-3.0/GF/GFCC/CId.hs Normal file
View File

@@ -0,0 +1,14 @@
module GF.GFCC.CId (
module GF.GFCC.Raw.AbsGFCCRaw,
prCId,
cId
) where
import GF.GFCC.Raw.AbsGFCCRaw (CId(CId))
prCId :: CId -> String
prCId (CId s) = s
cId :: String -> CId
cId = CId

View File

@@ -0,0 +1,186 @@
module GF.GFCC.CheckGFCC (checkGFCC, checkGFCCio, checkGFCCmaybe) where
import GF.GFCC.CId
import GF.GFCC.Macros
import GF.GFCC.DataGFCC
import GF.Data.ErrM
import qualified Data.Map as Map
import Control.Monad
import Debug.Trace
checkGFCCio :: GFCC -> IO GFCC
checkGFCCio gfcc = case checkGFCC gfcc of
Ok (gc,b) -> do
putStrLn $ if b then "OK" else "Corrupted GFCC"
return gc
Bad s -> do
putStrLn s
error "building GFCC failed"
---- needed in old Custom
checkGFCCmaybe :: GFCC -> Maybe GFCC
checkGFCCmaybe gfcc = case checkGFCC gfcc of
Ok (gc,b) -> return gc
Bad s -> Nothing
checkGFCC :: GFCC -> Err (GFCC,Bool)
checkGFCC gfcc = do
(cs,bs) <- mapM (checkConcrete gfcc)
(Map.assocs (concretes gfcc)) >>= return . unzip
return (gfcc {concretes = Map.fromAscList cs}, and bs)
-- errors are non-fatal; replace with 'fail' to change this
msg s = trace s (return ())
andMapM :: Monad m => (a -> m Bool) -> [a] -> m Bool
andMapM f xs = mapM f xs >>= return . and
labelBoolErr :: String -> Err (x,Bool) -> Err (x,Bool)
labelBoolErr ms iob = do
(x,b) <- iob
if b then return (x,b) else (msg ms >> return (x,b))
checkConcrete :: GFCC -> (CId,Concr) -> Err ((CId,Concr),Bool)
checkConcrete gfcc (lang,cnc) =
labelBoolErr ("happened in language " ++ printCId lang) $ do
(rs,bs) <- mapM checkl (Map.assocs (lins cnc)) >>= return . unzip
return ((lang,cnc{lins = Map.fromAscList rs}),and bs)
where
checkl = checkLin gfcc lang
checkLin :: GFCC -> CId -> (CId,Term) -> Err ((CId,Term),Bool)
checkLin gfcc lang (f,t) =
labelBoolErr ("happened in function " ++ printCId f) $ do
(t',b) <- checkTerm (lintype gfcc lang f) t --- $ inline gfcc lang t
return ((f,t'),b)
inferTerm :: [CType] -> Term -> Err (Term,CType)
inferTerm args trm = case trm of
K _ -> returnt str
C i -> returnt $ ints i
V i -> do
testErr (i < length args) ("too large index " ++ show i)
returnt $ args !! i
S ts -> do
(ts',tys) <- mapM infer ts >>= return . unzip
let tys' = filter (/=str) tys
testErr (null tys')
("expected Str in " ++ show trm ++ " not " ++ unwords (map show tys'))
return (S ts',str)
R ts -> do
(ts',tys) <- mapM infer ts >>= return . unzip
return $ (R ts',tuple tys)
P t u -> do
(t',tt) <- infer t
(u',tu) <- infer u
case tt of
R tys -> case tu of
R vs -> infer $ foldl P t' [P u' (C i) | i <- [0 .. length vs - 1]]
--- R [v] -> infer $ P t v
--- R (v:vs) -> infer $ P (head tys) (R vs)
C i -> do
testErr (i < length tys)
("required more than " ++ show i ++ " fields in " ++ show (R tys))
return (P t' u', tys !! i) -- record: index must be known
_ -> do
let typ = head tys
testErr (all (==typ) tys) ("different types in table " ++ show trm)
return (P t' u', typ) -- table: types must be same
_ -> Bad $ "projection from " ++ show t ++ " : " ++ show tt
FV [] -> returnt tm0 ----
FV (t:ts) -> do
(t',ty) <- infer t
(ts',tys) <- mapM infer ts >>= return . unzip
testErr (all (eqType ty) tys) ("different types in variants " ++ show trm)
return (FV (t':ts'),ty)
W s r -> infer r
_ -> Bad ("no type inference for " ++ show trm)
where
returnt ty = return (trm,ty)
infer = inferTerm args
checkTerm :: LinType -> Term -> Err (Term,Bool)
checkTerm (args,val) trm = case inferTerm args trm of
Ok (t,ty) -> if eqType ty val
then return (t,True)
else do
msg ("term: " ++ show trm ++
"\nexpected type: " ++ show val ++
"\ninferred type: " ++ show ty)
return (t,False)
Bad s -> do
msg s
return (trm,False)
eqType :: CType -> CType -> Bool
eqType inf exp = case (inf,exp) of
(C k, C n) -> k <= n -- only run-time corr.
(R rs,R ts) -> length rs == length ts && and [eqType r t | (r,t) <- zip rs ts]
(TM _, _) -> True ---- for variants [] ; not safe
_ -> inf == exp
-- should be in a generic module, but not in the run-time DataGFCC
type CType = Term
type LinType = ([CType],CType)
tuple :: [CType] -> CType
tuple = R
ints :: Int -> CType
ints = C
str :: CType
str = S []
lintype :: GFCC -> CId -> CId -> LinType
lintype gfcc lang fun = case typeSkeleton (lookType gfcc fun) of
(cs,c) -> (map vlinc cs, linc c) ---- HOAS
where
linc = lookLincat gfcc lang
vlinc (0,c) = linc c
vlinc (i,c) = case linc c of
R ts -> R (ts ++ replicate i str)
inline :: GFCC -> CId -> Term -> Term
inline gfcc lang t = case t of
F c -> inl $ look c
_ -> composSafeOp inl t
where
inl = inline gfcc lang
look = lookLin gfcc lang
composOp :: Monad m => (Term -> m Term) -> Term -> m Term
composOp f trm = case trm of
R ts -> liftM R $ mapM f ts
S ts -> liftM S $ mapM f ts
FV ts -> liftM FV $ mapM f ts
P t u -> liftM2 P (f t) (f u)
W s t -> liftM (W s) $ f t
_ -> return trm
composSafeOp :: (Term -> Term) -> Term -> Term
composSafeOp f = maybe undefined id . composOp (return . f)
-- from GF.Data.Oper
maybeErr :: String -> Maybe a -> Err a
maybeErr s = maybe (Bad s) Ok
testErr :: Bool -> String -> Err ()
testErr cond msg = if cond then return () else Bad msg
errVal :: a -> Err a -> a
errVal a = err (const a) id
errIn :: String -> Err a -> Err a
errIn msg = err (\s -> Bad (s ++ "\nOCCURRED IN\n" ++ msg)) return
err :: (String -> b) -> (a -> b) -> Err a -> b
err d f e = case e of
Ok a -> f a
Bad s -> d s

View File

@@ -0,0 +1,30 @@
{-# OPTIONS_GHC -fglasgow-exts #-}
module GF.GFCC.ComposOp (Compos(..),composOp,composOpM,composOpM_,composOpMonoid,
composOpMPlus,composOpFold) where
import Control.Monad.Identity
import Data.Monoid
class Compos t where
compos :: (forall a. a -> m a) -> (forall a b. m (a -> b) -> m a -> m b)
-> (forall a. t a -> m (t a)) -> t c -> m (t c)
composOp :: Compos t => (forall a. t a -> t a) -> t c -> t c
composOp f = runIdentity . composOpM (Identity . f)
composOpM :: (Compos t, Monad m) => (forall a. t a -> m (t a)) -> t c -> m (t c)
composOpM = compos return ap
composOpM_ :: (Compos t, Monad m) => (forall a. t a -> m ()) -> t c -> m ()
composOpM_ = composOpFold (return ()) (>>)
composOpMonoid :: (Compos t, Monoid m) => (forall a. t a -> m) -> t c -> m
composOpMonoid = composOpFold mempty mappend
composOpMPlus :: (Compos t, MonadPlus m) => (forall a. t a -> m b) -> t c -> m b
composOpMPlus = composOpFold mzero mplus
composOpFold :: Compos t => b -> (b -> b -> b) -> (forall a. t a -> b) -> t c -> b
composOpFold z c f = unC . compos (\_ -> C z) (\(C x) (C y) -> C (c x y)) (C . f)
newtype C b a = C { unC :: b }

152
src-3.0/GF/GFCC/DataGFCC.hs Normal file
View File

@@ -0,0 +1,152 @@
module GF.GFCC.DataGFCC where
import GF.GFCC.CId
import GF.Infra.CompactPrint
import GF.Text.UTF8
import GF.Formalism.FCFG
import GF.Parsing.FCFG.PInfo
import Data.Map
import Data.List
-- internal datatypes for GFCC
data GFCC = GFCC {
absname :: CId ,
cncnames :: [CId] ,
gflags :: Map CId String, -- value of a global flag
abstract :: Abstr ,
concretes :: Map CId Concr
}
data Abstr = Abstr {
aflags :: Map CId String, -- value of a flag
funs :: Map CId (Type,Exp), -- type and def of a fun
cats :: Map CId [Hypo], -- context of a cat
catfuns :: Map CId [CId] -- funs to a cat (redundant, for fast lookup)
}
data Concr = Concr {
cflags :: Map CId String, -- value of a flag
lins :: Map CId Term, -- lin of a fun
opers :: Map CId Term, -- oper generated by subex elim
lincats :: Map CId Term, -- lin type of a cat
lindefs :: Map CId Term, -- lin default of a cat
printnames :: Map CId Term, -- printname of a cat or a fun
paramlincats :: Map CId Term, -- lin type of cat, with printable param names
parser :: Maybe FCFPInfo -- parser
}
data Type =
DTyp [Hypo] CId [Exp]
deriving (Eq,Ord,Show)
data Exp =
DTr [CId] Atom [Exp]
| EEq [Equation]
deriving (Eq,Ord,Show)
data Atom =
AC CId
| AS String
| AI Integer
| AF Double
| AM Integer
| AV CId
deriving (Eq,Ord,Show)
data Term =
R [Term]
| P Term Term
| S [Term]
| K Tokn
| V Int
| C Int
| F CId
| FV [Term]
| W String Term
| TM String
| RP Term Term
deriving (Eq,Ord,Show)
data Tokn =
KS String
| KP [String] [Variant]
deriving (Eq,Ord,Show)
data Variant =
Var [String] [String]
deriving (Eq,Ord,Show)
data Hypo =
Hyp CId Type
deriving (Eq,Ord,Show)
data Equation =
Equ [Exp] Exp
deriving (Eq,Ord,Show)
-- print statistics
statGFCC :: GFCC -> String
statGFCC gfcc = unlines [
"Abstract\t" ++ pr (absname gfcc),
"Concretes\t" ++ unwords (lmap pr (cncnames gfcc)),
"Categories\t" ++ unwords (lmap pr (keys (cats (abstract gfcc))))
]
where pr (CId s) = s
printCId :: CId -> String
printCId (CId s) = s
-- merge two GFCCs; fails is differens absnames; priority to second arg
unionGFCC :: GFCC -> GFCC -> GFCC
unionGFCC one two = case absname one of
CId "" -> two -- extending empty grammar
n | n == absname two -> one { -- extending grammar with same abstract
concretes = Data.Map.union (concretes two) (concretes one),
cncnames = Data.List.union (cncnames two) (cncnames one)
}
_ -> one -- abstracts don't match ---- print error msg
emptyGFCC :: GFCC
emptyGFCC = GFCC {
absname = CId "",
cncnames = [] ,
gflags = empty,
abstract = error "empty grammar, no abstract",
concretes = empty
}
-- default map and filter are for Map here
lmap = Prelude.map
lfilter = Prelude.filter
mmap = Data.Map.map
-- encode idenfifiers and strings in UTF8
utf8GFCC :: GFCC -> GFCC
utf8GFCC gfcc = gfcc {
concretes = mmap u8concr (concretes gfcc)
}
where
u8concr cnc = cnc {
lins = mmap u8term (lins cnc),
opers = mmap u8term (opers cnc)
}
u8term = convertStringsInTerm encodeUTF8
---- TODO: convert identifiers and flags
convertStringsInTerm conv t = case t of
K (KS s) -> K (KS (conv s))
W s r -> W (conv s) (convs r)
R ts -> R $ lmap convs ts
S ts -> S $ lmap convs ts
FV ts -> FV $ lmap convs ts
P u v -> P (convs u) (convs v)
_ -> t
where
convs = convertStringsInTerm conv

81
src-3.0/GF/GFCC/GFCC.cf Normal file
View File

@@ -0,0 +1,81 @@
Grm. Grammar ::=
"grammar" CId "(" [CId] ")" "(" [Flag] ")" ";"
Abstract ";"
[Concrete] ;
Abs. Abstract ::=
"abstract" "{"
"flags" [Flag]
"fun" [FunDef]
"cat" [CatDef]
"}" ;
Cnc. Concrete ::=
"concrete" CId "{"
"flags" [Flag]
"lin" [LinDef]
"oper" [LinDef]
"lincat" [LinDef]
"lindef" [LinDef]
"printname" [LinDef]
"param" [LinDef] -- lincats with param value names
"}" ;
Flg. Flag ::= CId "=" String ;
Cat. CatDef ::= CId "[" [Hypo] "]" ;
Fun. FunDef ::= CId ":" Type "=" Exp ;
Lin. LinDef ::= CId "=" Term ;
DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ; -- dependent type
DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ; -- term with bindings
AC. Atom ::= CId ;
AS. Atom ::= String ;
AI. Atom ::= Integer ;
AF. Atom ::= Double ;
AM. Atom ::= "?" Integer ;
R. Term ::= "[" [Term] "]" ; -- record/table
P. Term ::= "(" Term "!" Term ")" ; -- projection/selection
S. Term ::= "(" [Term] ")" ; -- concatenated sequence
K. Term ::= Tokn ; -- token
V. Term ::= "$" Integer ; -- argument
C. Term ::= Integer ; -- parameter value/label
F. Term ::= CId ; -- global constant
FV. Term ::= "[|" [Term] "|]" ; -- free variation
W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
TM. Term ::= "?" ; -- lin of metavariable
KS. Tokn ::= String ;
KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
Var. Variant ::= [String] "/" [String] ;
RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED: record parameter alias
terminator Concrete ";" ;
terminator Flag ";" ;
terminator CatDef ";" ;
terminator FunDef ";" ;
terminator LinDef ";" ;
separator CId "," ;
separator Term "," ;
terminator Exp "" ;
terminator String "" ;
separator Variant "," ;
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
-- the following are needed if dependent types or HOAS or defs are present
Hyp. Hypo ::= CId ":" Type ;
AV. Atom ::= "$" CId ;
EEq. Exp ::= "{" [Equation] "}" ; -- list of pattern eqs; primitive: []
Equ. Equation ::= [Exp] "->" Exp ; -- patterns are encoded as exps
separator Hypo "," ;
terminator Equation ";" ;

View File

@@ -0,0 +1,70 @@
module GF.GFCC.Generate where
import GF.GFCC.Macros
import GF.GFCC.DataGFCC
import GF.GFCC.CId
import qualified Data.Map as M
import System.Random
-- generate an infinite list of trees exhaustively
generate :: GFCC -> CId -> Maybe Int -> [Exp]
generate gfcc cat dp = concatMap (\i -> gener i cat) depths
where
gener 0 c = [tree (AC f) [] | (f, ([],_)) <- fns c]
gener i c = [
tr |
(f, (cs,_)) <- fns c,
let alts = map (gener (i-1)) cs,
ts <- combinations alts,
let tr = tree (AC f) ts,
depth tr >= i
]
fns c = [(f,catSkeleton ty) | (f,ty) <- functionsToCat gfcc c]
depths = maybe [0 ..] (\d -> [0..d]) dp
-- generate an infinite list of trees randomly
genRandom :: StdGen -> GFCC -> CId -> [Exp]
genRandom gen gfcc cat = genTrees (randomRs (0.0, 1.0 :: Double) gen) cat where
timeout = 47 -- give up
genTrees ds0 cat =
let (ds,ds2) = splitAt (timeout+1) ds0 -- for time out, else ds
(t,k) = genTree ds cat
in (if k>timeout then id else (t:))
(genTrees ds2 cat) -- else (drop k ds)
genTree rs = gett rs where
gett ds (CId "String") = (tree (AS "foo") [], 1)
gett ds (CId "Int") = (tree (AI 12345) [], 1)
gett [] _ = (tree (AS "TIMEOUT") [], 1) ----
gett ds cat = case fns cat of
[] -> (tree (AM 0) [],1)
fs -> let
d:ds2 = ds
(f,args) = getf d fs
(ts,k) = getts ds2 args
in (tree (AC f) ts, k+1)
getf d fs = let lg = (length fs) in
fs !! (floor (d * fromIntegral lg))
getts ds cats = case cats of
c:cs -> let
(t, k) = gett ds c
(ts,ks) = getts (drop k ds) cs
in (t:ts, k + ks)
_ -> ([],0)
fns cat = [(f,(fst (catSkeleton ty))) | (f,ty) <- functionsToCat gfcc cat]
{-
-- brute-force parsing method; only returns the first result
-- note: you cannot throw away rules with unknown words from the grammar
-- because it is not known which field in each rule may match the input
searchParse :: Int -> GFCC -> CId -> [String] -> [Exp]
searchParse i gfcc cat ws = [t | t <- gen, s <- lins t, words s == ws] where
gen = take i $ generate gfcc cat
lins t = [linearize gfcc lang t | lang <- cncnames gfcc]
-}

349
src-3.0/GF/GFCC/LexGFCC.hs Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,91 @@
module GF.GFCC.Linearize where
import GF.GFCC.Macros
import GF.GFCC.DataGFCC
import GF.GFCC.CId
import Data.Map
import Data.List
import Debug.Trace
-- linearization and computation of concrete GFCC Terms
linearize :: GFCC -> CId -> Exp -> String
linearize mcfg lang = realize . linExp mcfg lang
realize :: Term -> String
realize trm = case trm of
R ts -> realize (ts !! 0)
S ss -> unwords $ lmap realize ss
K t -> case t of
KS s -> s
KP s _ -> unwords s ---- prefix choice TODO
W s t -> s ++ realize t
FV ts -> realize (ts !! 0) ---- other variants TODO
RP _ r -> realize r ---- DEPREC
TM s -> s
_ -> "ERROR " ++ show trm ---- debug
linExp :: GFCC -> CId -> Exp -> Term
linExp mcfg lang tree@(DTr xs at trees) =
addB $ case at of
AC fun -> comp (lmap lin trees) $ look fun
AS s -> R [kks (show s)] -- quoted
AI i -> R [kks (show i)]
--- [C lst, kks (show i), C size] where
--- lst = mod (fromInteger i) 10 ; size = if i < 10 then 0 else 1
AF d -> R [kks (show d)]
AV x -> TM (prCId x)
AM i -> TM (show i)
where
lin = linExp mcfg lang
comp = compute mcfg lang
look = lookLin mcfg lang
addB t
| Data.List.null xs = t
| otherwise = case t of
R ts -> R $ ts ++ (Data.List.map (kks . prCId) xs)
TM s -> R $ t : (Data.List.map (kks . prCId) xs)
compute :: GFCC -> CId -> [Term] -> Term -> Term
compute mcfg lang args = comp where
comp trm = case trm of
P r p -> proj (comp r) (comp p)
RP i t -> RP (comp i) (comp t) ---- DEPREC
W s t -> W s (comp t)
R ts -> R $ lmap comp ts
V i -> idx args i -- already computed
F c -> comp $ look c -- not computed (if contains argvar)
FV ts -> FV $ lmap comp ts
S ts -> S $ lfilter (/= S []) $ lmap comp ts
_ -> trm
look = lookOper mcfg lang
idx xs i = if i > length xs - 1
then trace
("too large " ++ show i ++ " for\n" ++ unlines (lmap show xs) ++ "\n") tm0
else xs !! i
proj r p = case (r,p) of
(_, FV ts) -> FV $ lmap (proj r) ts
(FV ts, _ ) -> FV $ lmap (\t -> proj t p) ts
(W s t, _) -> kks (s ++ getString (proj t p))
_ -> comp $ getField r (getIndex p)
getString t = case t of
K (KS s) -> s
_ -> error ("ERROR in grammar compiler: string from "++ show t) "ERR"
getIndex t = case t of
C i -> i
RP p _ -> getIndex p ---- DEPREC
TM _ -> 0 -- default value for parameter
_ -> trace ("ERROR in grammar compiler: index from " ++ show t) 666
getField t i = case t of
R rs -> idx rs i
RP _ r -> getField r i ---- DEPREC
TM s -> TM s
_ -> error ("ERROR in grammar compiler: field from " ++ show t) t

121
src-3.0/GF/GFCC/Macros.hs Normal file
View File

@@ -0,0 +1,121 @@
module GF.GFCC.Macros where
import GF.GFCC.CId
import GF.GFCC.DataGFCC
import GF.Formalism.FCFG (FGrammar)
import GF.Parsing.FCFG.PInfo (FCFPInfo, fcfPInfoToFGrammar)
----import GF.GFCC.PrintGFCC
import Control.Monad
import Data.Map
import Data.Maybe
import Data.List
-- operations for manipulating GFCC grammars and objects
lookLin :: GFCC -> CId -> CId -> Term
lookLin gfcc lang fun =
lookMap tm0 fun $ lins $ lookMap (error "no lang") lang $ concretes gfcc
lookOper :: GFCC -> CId -> CId -> Term
lookOper gfcc lang fun =
lookMap tm0 fun $ opers $ lookMap (error "no lang") lang $ concretes gfcc
lookLincat :: GFCC -> CId -> CId -> Term
lookLincat gfcc lang fun =
lookMap tm0 fun $ lincats $ lookMap (error "no lang") lang $ concretes gfcc
lookParamLincat :: GFCC -> CId -> CId -> Term
lookParamLincat gfcc lang fun =
lookMap tm0 fun $ paramlincats $ lookMap (error "no lang") lang $ concretes gfcc
lookType :: GFCC -> CId -> Type
lookType gfcc f =
fst $ lookMap (error $ "lookType " ++ show f) f (funs (abstract gfcc))
lookParser :: GFCC -> CId -> Maybe FCFPInfo
lookParser gfcc lang = parser $ lookMap (error "no lang") lang $ concretes gfcc
lookFCFG :: GFCC -> CId -> Maybe FGrammar
lookFCFG gfcc lang = fmap fcfPInfoToFGrammar $ lookParser gfcc lang
lookStartCat :: GFCC -> String
lookStartCat gfcc = fromMaybe "S" $ msum $ Data.List.map (Data.Map.lookup (CId "startcat"))
[gflags gfcc, aflags (abstract gfcc)]
lookGlobalFlag :: GFCC -> CId -> String
lookGlobalFlag gfcc f =
lookMap "?" f (gflags gfcc)
lookAbsFlag :: GFCC -> CId -> String
lookAbsFlag gfcc f =
lookMap "?" f (aflags (abstract gfcc))
lookCncFlag :: GFCC -> CId -> CId -> String
lookCncFlag gfcc lang f =
lookMap "?" f $ cflags $ lookMap (error "no lang") lang $ concretes gfcc
functionsToCat :: GFCC -> CId -> [(CId,Type)]
functionsToCat gfcc cat =
[(f,ty) | f <- fs, Just (ty,_) <- [Data.Map.lookup f $ funs $ abstract gfcc]]
where
fs = lookMap [] cat $ catfuns $ abstract gfcc
depth :: Exp -> Int
depth tr = case tr of
DTr _ _ [] -> 1
DTr _ _ ts -> maximum (lmap depth ts) + 1
tree :: Atom -> [Exp] -> Exp
tree = DTr []
cftype :: [CId] -> CId -> Type
cftype args val = DTyp [Hyp wildCId (cftype [] arg) | arg <- args] val []
catSkeleton :: Type -> ([CId],CId)
catSkeleton ty = case ty of
DTyp hyps val _ -> ([valCat ty | Hyp _ ty <- hyps],val)
typeSkeleton :: Type -> ([(Int,CId)],CId)
typeSkeleton ty = case ty of
DTyp hyps val _ -> ([(contextLength ty, valCat ty) | Hyp _ ty <- hyps],val)
valCat :: Type -> CId
valCat ty = case ty of
DTyp _ val _ -> val
contextLength :: Type -> Int
contextLength ty = case ty of
DTyp hyps _ _ -> length hyps
cid :: String -> CId
cid = CId
wildCId :: CId
wildCId = cid "_"
exp0 :: Exp
exp0 = tree (AM 0) []
primNotion :: Exp
primNotion = EEq []
term0 :: CId -> Term
term0 = TM . prCId
tm0 :: Term
tm0 = TM "?"
kks :: String -> Term
kks = K . KS
-- lookup with default value
lookMap :: (Show i, Ord i) => a -> i -> Map i a -> a
lookMap d c m = maybe d id $ Data.Map.lookup c m
--- from Operations
combinations :: [[a]] -> [[a]]
combinations t = case t of
[] -> [[]]
aa:uu -> [a:u | a <- aa, u <- combinations uu]

View File

@@ -0,0 +1,116 @@
module GF.GFCC.OptimizeGFCC where
import GF.GFCC.CId
import GF.GFCC.DataGFCC
import GF.Data.Operations
import Data.List
import qualified Data.Map as Map
-- back-end optimization:
-- suffix analysis followed by common subexpression elimination
optGFCC :: GFCC -> GFCC
optGFCC gfcc = gfcc {
concretes = Map.map opt (concretes gfcc)
}
where
opt cnc = subex $ cnc {
lins = Map.map optTerm (lins cnc),
lindefs = Map.map optTerm (lindefs cnc),
printnames = Map.map optTerm (printnames cnc)
}
-- analyse word form lists into prefix + suffixes
-- suffix sets can later be shared by subex elim
optTerm :: Term -> Term
optTerm tr = case tr of
R ts@(_:_:_) | all isK ts -> mkSuff $ optToks [s | K (KS s) <- ts]
R ts -> R $ map optTerm ts
P t v -> P (optTerm t) v
_ -> tr
where
optToks ss = prf : suffs where
prf = pref (head ss) (tail ss)
suffs = map (drop (length prf)) ss
pref cand ss = case ss of
s1:ss2 -> if isPrefixOf cand s1 then pref cand ss2 else pref (init cand) ss
_ -> cand
isK t = case t of
K (KS _) -> True
_ -> False
mkSuff ("":ws) = R (map (K . KS) ws)
mkSuff (p:ws) = W p (R (map (K . KS) ws))
-- common subexpression elimination
---subex :: [(CId,Term)] -> [(CId,Term)]
subex :: Concr -> Concr
subex cnc = errVal cnc $ do
(tree,_) <- appSTM (getSubtermsMod cnc) (Map.empty,0)
return $ addSubexpConsts tree cnc
type TermList = Map.Map Term (Int,Int) -- number of occs, id
type TermM a = STM (TermList,Int) a
addSubexpConsts :: TermList -> Concr -> Concr
addSubexpConsts tree cnc = cnc {
opers = Map.fromList [(f,recomp f trm) | (f,trm) <- ops],
lins = rec lins,
lindefs = rec lindefs,
printnames = rec printnames
}
where
ops = [(fid id, trm) | (trm,(_,id)) <- Map.assocs tree]
mkOne (f,trm) = (f, recomp f trm)
recomp f t = case Map.lookup t tree of
Just (_,id) | fid id /= f -> F $ fid id -- not to replace oper itself
_ -> case t of
R ts -> R $ map (recomp f) ts
S ts -> S $ map (recomp f) ts
W s t -> W s (recomp f t)
P t p -> P (recomp f t) (recomp f p)
_ -> t
fid n = CId $ "_" ++ show n
rec field = Map.fromAscList [(f,recomp f trm) | (f,trm) <- Map.assocs (field cnc)]
getSubtermsMod :: Concr -> TermM TermList
getSubtermsMod cnc = do
mapM getSubterms (Map.assocs (lins cnc))
mapM getSubterms (Map.assocs (lindefs cnc))
mapM getSubterms (Map.assocs (printnames cnc))
(tree0,_) <- readSTM
return $ Map.filter (\ (nu,_) -> nu > 1) tree0
where
getSubterms (f,trm) = collectSubterms trm >> return ()
collectSubterms :: Term -> TermM ()
collectSubterms t = case t of
R ts -> do
mapM collectSubterms ts
add t
S ts -> do
mapM collectSubterms ts
add t
W s u -> do
collectSubterms u
add t
P p u -> do
collectSubterms p
collectSubterms u
add t
_ -> return ()
where
add t = do
(ts,i) <- readSTM
let
((count,id),next) = case Map.lookup t ts of
Just (nu,id) -> ((nu+1,id), i)
_ -> ((1, i ), i+1)
writeSTM (Map.insert t (count,id) ts, next)

View File

@@ -0,0 +1,17 @@
module GF.GFCC.Raw.AbsGFCCRaw where
-- Haskell module generated by the BNF converter
newtype CId = CId String deriving (Eq,Ord,Show)
data Grammar =
Grm [RExp]
deriving (Eq,Ord,Show)
data RExp =
App CId [RExp]
| AInt Integer
| AStr String
| AFlt Double
| AMet
deriving (Eq,Ord,Show)

View File

@@ -0,0 +1,277 @@
module GF.GFCC.Raw.ConvertGFCC (toGFCC,fromGFCC) where
import GF.GFCC.DataGFCC
import GF.GFCC.Raw.AbsGFCCRaw
import GF.Data.Assoc
import GF.Formalism.FCFG
import GF.Formalism.Utilities (NameProfile(..), Profile(..), SyntaxForest(..))
import GF.Parsing.FCFG.PInfo (FCFPInfo(..), buildFCFPInfo)
import qualified Data.Array as Array
import Data.Map
pgfMajorVersion, pgfMinorVersion :: Integer
(pgfMajorVersion, pgfMinorVersion) = (1,0)
-- convert parsed grammar to internal GFCC
toGFCC :: Grammar -> GFCC
toGFCC (Grm [
App (CId "pgf") (AInt v1 : AInt v2 : App a []:cs),
App (CId "flags") gfs,
ab@(
App (CId "abstract") [
App (CId "fun") fs,
App (CId "cat") cts
]),
App (CId "concrete") ccs
]) = GFCC {
absname = a,
cncnames = [c | App c [] <- cs],
gflags = fromAscList [(f,v) | App f [AStr v] <- gfs],
abstract =
let
aflags = fromAscList [(f,v) | App f [AStr v] <- gfs]
lfuns = [(f,(toType typ,toExp def)) | App f [typ, def] <- fs]
funs = fromAscList lfuns
lcats = [(c, Prelude.map toHypo hyps) | App c hyps <- cts]
cats = fromAscList lcats
catfuns = fromAscList
[(cat,[f | (f, (DTyp _ c _,_)) <- lfuns, c==cat]) | (cat,_) <- lcats]
in Abstr aflags funs cats catfuns,
concretes = fromAscList [(lang, toConcr ts) | App lang ts <- ccs]
}
where
toConcr :: [RExp] -> Concr
toConcr = foldl add (Concr {
cflags = empty,
lins = empty,
opers = empty,
lincats = empty,
lindefs = empty,
printnames = empty,
paramlincats = empty,
parser = Nothing
})
where
add :: Concr -> RExp -> Concr
add cnc (App (CId "flags") ts) = cnc { cflags = fromAscList [(f,v) | App f [AStr v] <- ts] }
add cnc (App (CId "lin") ts) = cnc { lins = mkTermMap ts }
add cnc (App (CId "oper") ts) = cnc { opers = mkTermMap ts }
add cnc (App (CId "lincat") ts) = cnc { lincats = mkTermMap ts }
add cnc (App (CId "lindef") ts) = cnc { lindefs = mkTermMap ts }
add cnc (App (CId "printname") ts) = cnc { printnames = mkTermMap ts }
add cnc (App (CId "param") ts) = cnc { paramlincats = mkTermMap ts }
add cnc (App (CId "parser") ts) = cnc { parser = Just (toPInfo ts) }
toPInfo :: [RExp] -> FCFPInfo
toPInfo [App (CId "rules") rs, App (CId "startupcats") cs] = buildFCFPInfo (rules, cats)
where
rules = lmap toFRule rs
cats = fromList [(c, lmap expToInt fs) | App c fs <- cs]
toFRule :: RExp -> FRule
toFRule (App (CId "rule")
[n,
App (CId "cats") (rt:at),
App (CId "R") ls]) = FRule name args res lins
where
name = toFName n
args = lmap expToInt at
res = expToInt rt
lins = mkArray [mkArray [toSymbol s | s <- l] | App (CId "S") l <- ls]
toFName :: RExp -> FName
toFName (App (CId "_A") [x]) = Name (CId "_") [Unify [expToInt x]]
toFName (App f ts) = Name f (lmap toProfile ts)
where
toProfile :: RExp -> Profile (SyntaxForest CId)
toProfile AMet = Unify []
toProfile (App (CId "_A") [t]) = Unify [expToInt t]
toProfile (App (CId "_U") ts) = Unify [expToInt t | App (CId "_A") [t] <- ts]
toProfile t = Constant (toSyntaxForest t)
toSyntaxForest :: RExp -> SyntaxForest CId
toSyntaxForest AMet = FMeta
toSyntaxForest (App n ts) = FNode n [lmap toSyntaxForest ts]
toSyntaxForest (AStr s) = FString s
toSyntaxForest (AInt i) = FInt i
toSyntaxForest (AFlt f) = FFloat f
toSymbol :: RExp -> FSymbol
toSymbol (App (CId "P") [c,n,l]) = FSymCat (expToInt c) (expToInt l) (expToInt n)
toSymbol (AStr t) = FSymTok t
toType :: RExp -> Type
toType e = case e of
App cat [App (CId "H") hypos, App (CId "X") exps] ->
DTyp (lmap toHypo hypos) cat (lmap toExp exps)
_ -> error $ "type " ++ show e
toHypo :: RExp -> Hypo
toHypo e = case e of
App x [typ] -> Hyp x (toType typ)
_ -> error $ "hypo " ++ show e
toExp :: RExp -> Exp
toExp e = case e of
App (CId "App") [App fun [], App (CId "B") xs, App (CId "X") exps] ->
DTr [x | App x [] <- xs] (AC fun) (lmap toExp exps)
App (CId "Eq") eqs ->
EEq [Equ (lmap toExp ps) (toExp v) | App (CId "E") (v:ps) <- eqs]
App (CId "Var") [App i []] -> DTr [] (AV i) []
AMet -> DTr [] (AM 0) []
AInt i -> DTr [] (AI i) []
AFlt i -> DTr [] (AF i) []
AStr i -> DTr [] (AS i) []
_ -> error $ "exp " ++ show e
toTerm :: RExp -> Term
toTerm e = case e of
App (CId "R") es -> R (lmap toTerm es)
App (CId "S") es -> S (lmap toTerm es)
App (CId "FV") es -> FV (lmap toTerm es)
App (CId "P") [e,v] -> P (toTerm e) (toTerm v)
App (CId "RP") [e,v] -> RP (toTerm e) (toTerm v) ----
App (CId "W") [AStr s,v] -> W s (toTerm v)
App (CId "A") [AInt i] -> V (fromInteger i)
App f [] -> F f
AInt i -> C (fromInteger i)
AMet -> TM "?"
AStr s -> K (KS s) ----
_ -> error $ "term " ++ show e
------------------------------
--- from internal to parser --
------------------------------
fromGFCC :: GFCC -> Grammar
fromGFCC gfcc0 = Grm [
app "pgf" (AInt pgfMajorVersion:AInt pgfMinorVersion
: App (absname gfcc) [] : lmap (flip App []) (cncnames gfcc)),
app "flags" [App f [AStr v] | (f,v) <- toList (gflags gfcc `union` aflags agfcc)],
app "abstract" [
app "fun" [App f [fromType t,fromExp d] | (f,(t,d)) <- toList (funs agfcc)],
app "cat" [App f (lmap fromHypo hs) | (f,hs) <- toList (cats agfcc)]
],
app "concrete" [App lang (fromConcrete c) | (lang,c) <- toList (concretes gfcc)]
]
where
gfcc = utf8GFCC gfcc0
app s = App (CId s)
agfcc = abstract gfcc
fromConcrete cnc = [
app "flags" [App f [AStr v] | (f,v) <- toList (cflags cnc)],
app "lin" [App f [fromTerm v] | (f,v) <- toList (lins cnc)],
app "oper" [App f [fromTerm v] | (f,v) <- toList (opers cnc)],
app "lincat" [App f [fromTerm v] | (f,v) <- toList (lincats cnc)],
app "lindef" [App f [fromTerm v] | (f,v) <- toList (lindefs cnc)],
app "printname" [App f [fromTerm v] | (f,v) <- toList (printnames cnc)],
app "param" [App f [fromTerm v] | (f,v) <- toList (paramlincats cnc)]
] ++ maybe [] (\p -> [fromPInfo p]) (parser cnc)
fromType :: Type -> RExp
fromType e = case e of
DTyp hypos cat exps ->
App cat [
App (CId "H") (lmap fromHypo hypos),
App (CId "X") (lmap fromExp exps)]
fromHypo :: Hypo -> RExp
fromHypo e = case e of
Hyp x typ -> App x [fromType typ]
fromExp :: Exp -> RExp
fromExp e = case e of
DTr xs (AC fun) exps ->
App (CId "App") [App fun [], App (CId "B") (lmap (flip App []) xs), App (CId "X") (lmap fromExp exps)]
DTr [] (AV x) [] -> App (CId "Var") [App x []]
DTr [] (AS s) [] -> AStr s
DTr [] (AF d) [] -> AFlt d
DTr [] (AI i) [] -> AInt (toInteger i)
DTr [] (AM _) [] -> AMet ----
EEq eqs ->
App (CId "Eq") [App (CId "E") (lmap fromExp (v:ps)) | Equ ps v <- eqs]
_ -> error $ "exp " ++ show e
fromTerm :: Term -> RExp
fromTerm e = case e of
R es -> app "R" (lmap fromTerm es)
S es -> app "S" (lmap fromTerm es)
FV es -> app "FV" (lmap fromTerm es)
P e v -> app "P" [fromTerm e, fromTerm v]
RP e v -> app "RP" [fromTerm e, fromTerm v] ----
W s v -> app "W" [AStr s, fromTerm v]
C i -> AInt (toInteger i)
TM _ -> AMet
F f -> App f []
V i -> App (CId "A") [AInt (toInteger i)]
K (KS s) -> AStr s ----
K (KP d vs) -> app "FV" (str d : [str v | Var v _ <- vs]) ----
where
app = App . CId
str v = app "S" (lmap AStr v)
-- ** Parsing info
fromPInfo :: FCFPInfo -> RExp
fromPInfo p = app "parser" [
app "rules" [fromFRule rule | rule <- Array.elems (allRules p)],
app "startupcats" [App f (lmap intToExp cs) | (f,cs) <- toList (startupCats p)]
]
fromFRule :: FRule -> RExp
fromFRule (FRule n args res lins) =
app "rule" [fromFName n,
app "cats" (intToExp res:lmap intToExp args),
app "R" [app "S" [fromSymbol s | s <- Array.elems l] | l <- Array.elems lins]
]
fromFName :: FName -> RExp
fromFName n = case n of
Name (CId "_") [p] -> fromProfile p
Name f ps -> App f (lmap fromProfile ps)
where
fromProfile :: Profile (SyntaxForest CId) -> RExp
fromProfile (Unify []) = AMet
fromProfile (Unify [x]) = daughter x
fromProfile (Unify args) = app "_U" (lmap daughter args)
fromProfile (Constant forest) = fromSyntaxForest forest
daughter n = app "_A" [intToExp n]
fromSyntaxForest :: SyntaxForest CId -> RExp
fromSyntaxForest FMeta = AMet
-- FIXME: is there always just one element here?
fromSyntaxForest (FNode n [args]) = App n (lmap fromSyntaxForest args)
fromSyntaxForest (FString s) = AStr s
fromSyntaxForest (FInt i) = AInt i
fromSyntaxForest (FFloat f) = AFlt f
fromSymbol :: FSymbol -> RExp
fromSymbol (FSymCat c l n) = app "P" [intToExp c, intToExp n, intToExp l]
fromSymbol (FSymTok t) = AStr t
-- ** Utilities
mkTermMap :: [RExp] -> Map CId Term
mkTermMap ts = fromAscList [(f,toTerm v) | App f [v] <- ts]
app :: String -> [RExp] -> RExp
app = App . CId
mkArray :: [a] -> Array.Array Int a
mkArray xs = Array.listArray (0, length xs - 1) xs
expToInt :: Integral a => RExp -> a
expToInt (App (CId "neg") [AInt i]) = fromIntegral (negate i)
expToInt (AInt i) = fromIntegral i
expToStr :: RExp -> String
expToStr (AStr s) = s
intToExp :: Integral a => a -> RExp
intToExp x | x < 0 = App (CId "neg") [AInt (fromIntegral (negate x))]
| otherwise = AInt (fromIntegral x)

View File

@@ -0,0 +1,12 @@
Grm. Grammar ::= [RExp] ;
App. RExp ::= "(" CId [RExp] ")" ;
AId. RExp ::= CId ;
AInt. RExp ::= Integer ;
AStr. RExp ::= String ;
AFlt. RExp ::= Double ;
AMet. RExp ::= "?" ;
terminator RExp "" ;
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;

View File

@@ -0,0 +1,99 @@
module GF.GFCC.Raw.ParGFCCRaw (parseGrammar) where
import GF.GFCC.Raw.AbsGFCCRaw
import Control.Monad
import Data.Char
parseGrammar :: String -> IO Grammar
parseGrammar s = case runP pGrammar s of
Just (x,"") -> return x
_ -> fail "Parse error"
pGrammar :: P Grammar
pGrammar = liftM Grm pTerms
pTerms :: P [RExp]
pTerms = liftM2 (:) (pTerm 1) pTerms <++ (skipSpaces >> return [])
pTerm :: Int -> P RExp
pTerm n = skipSpaces >> (pParen <++ pApp <++ pNum <++ pStr <++ pMeta)
where pParen = between (char '(') (char ')') (pTerm 0)
pApp = liftM2 App pIdent (if n == 0 then pTerms else return [])
pStr = char '"' >> liftM AStr (manyTill (pEsc <++ get) (char '"'))
pEsc = char '\\' >> get
pNum = do x <- munch1 isDigit
((char '.' >> munch1 isDigit >>= \y -> return (AFlt (read (x++"."++y))))
<++
return (AInt (read x)))
pMeta = char '?' >> return AMet
pIdent = liftM CId $ liftM2 (:) (satisfy isIdentFirst) (munch isIdentRest)
isIdentFirst c = c == '_' || isAlpha c
isIdentRest c = c == '_' || c == '\'' || isAlphaNum c
-- Parser combinators with only left-biased choice
newtype P a = P { runP :: String -> Maybe (a,String) }
instance Monad P where
return x = P (\ts -> Just (x,ts))
P p >>= f = P (\ts -> p ts >>= \ (x,ts') -> runP (f x) ts')
fail _ = pfail
instance MonadPlus P where
mzero = pfail
mplus = (<++)
get :: P Char
get = P (\ts -> case ts of
[] -> Nothing
c:cs -> Just (c,cs))
look :: P String
look = P (\ts -> Just (ts,ts))
(<++) :: P a -> P a -> P a
P p <++ P q = P (\ts -> p ts `mplus` q ts)
pfail :: P a
pfail = P (\ts -> Nothing)
satisfy :: (Char -> Bool) -> P Char
satisfy p = do c <- get
if p c then return c else pfail
char :: Char -> P Char
char c = satisfy (c==)
string :: String -> P String
string this = look >>= scan this
where
scan [] _ = return this
scan (x:xs) (y:ys) | x == y = get >> scan xs ys
scan _ _ = pfail
skipSpaces :: P ()
skipSpaces = look >>= skip
where
skip (c:s) | isSpace c = get >> skip s
skip _ = return ()
manyTill :: P a -> P end -> P [a]
manyTill p end = scan
where scan = (end >> return []) <++ liftM2 (:) p scan
munch :: (Char -> Bool) -> P String
munch p = munch1 p <++ return []
munch1 :: (Char -> Bool) -> P String
munch1 p = liftM2 (:) (satisfy p) (munch p)
choice :: [P a] -> P a
choice = msum
between :: P open -> P close -> P a -> P a
between open close p = do open
x <- p
close
return x

View File

@@ -0,0 +1,36 @@
module GF.GFCC.Raw.PrintGFCCRaw (printTree) where
import GF.GFCC.Raw.AbsGFCCRaw
import Data.List (intersperse)
import Numeric (showFFloat)
printTree :: Grammar -> String
printTree g = prGrammar g ""
prGrammar :: Grammar -> ShowS
prGrammar (Grm xs) = prRExpList xs
prRExp :: Int -> RExp -> ShowS
prRExp _ (App x []) = prCId x
prRExp n (App x xs) = p (prCId x . showChar ' ' . prRExpList xs)
where p s = if n == 0 then s else showChar '(' . s . showChar ')'
prRExp _ (AInt x) = shows x
prRExp _ (AStr x) = showChar '"' . concatS (map mkEsc x) . showChar '"'
prRExp _ (AFlt x) = showFFloat Nothing x
prRExp _ AMet = showChar '?'
mkEsc :: Char -> ShowS
mkEsc s = case s of
'"' -> showString "\\\""
'\\' -> showString "\\\\"
_ -> showChar s
prRExpList :: [RExp] -> ShowS
prRExpList = concatS . intersperse (showChar ' ') . map (prRExp 1)
prCId :: CId -> ShowS
prCId (CId x) = showString x
concatS :: [ShowS] -> ShowS
concatS = foldr (.) id

View File

@@ -0,0 +1,87 @@
module GF.GFCC.ShowLinearize (
tableLinearize,
recordLinearize,
termLinearize,
allLinearize
) where
import GF.GFCC.Linearize
import GF.GFCC.Macros
import GF.GFCC.DataGFCC
import GF.GFCC.CId
--import GF.GFCC.PrintGFCC ----
import GF.Data.Operations
import Data.List
-- printing linearizations in different ways with source parameters
-- internal representation, only used internally in this module
data Record =
RR [(String,Record)]
| RT [(String,Record)]
| RFV [Record]
| RS String
| RCon String
prRecord :: Record -> String
prRecord = prr where
prr t = case t of
RR fs -> concat $
"{" :
(intersperse ";" (map (\ (l,v) -> unwords [l,"=", prr v]) fs)) ++ ["}"]
RT fs -> concat $
"table {" :
(intersperse ";" (map (\ (l,v) -> unwords [l,"=>",prr v]) fs)) ++ ["}"]
RFV ts -> concat $
"variants {" : (intersperse ";" (map prr ts)) ++ ["}"]
RS s -> prQuotedString s
RCon s -> s
-- uses the encoding of record types in GFCC.paramlincat
mkRecord :: Term -> Term -> Record
mkRecord typ trm = case (typ,trm) of
(R rs, R ts) -> RR [(str lab, mkRecord ty t) | (P lab ty, t) <- zip rs ts]
(S [FV ps,ty],R ts) -> RT [(str par, mkRecord ty t) | (par, t) <- zip ps ts]
(_,W s (R ts)) -> mkRecord typ (R [K (KS (s ++ u)) | K (KS u) <- ts])
(FV ps, C i) -> RCon $ str $ ps !! i
(S [], _) -> RS $ realize trm
_ -> RS $ show trm ---- printTree trm
where
str = realize
-- show all branches, without labels and params
allLinearize :: GFCC -> CId -> Exp -> String
allLinearize gfcc lang = concat . map pr . tabularLinearize gfcc lang where
pr (p,vs) = unlines vs
-- show all branches, with labels and params
tableLinearize :: GFCC -> CId -> Exp -> String
tableLinearize gfcc lang = unlines . map pr . tabularLinearize gfcc lang where
pr (p,vs) = p +++ ":" +++ unwords (intersperse "|" vs)
-- create a table from labels+params to variants
tabularLinearize :: GFCC -> CId -> Exp -> [(String,[String])]
tabularLinearize gfcc lang = branches . recLinearize gfcc lang where
branches r = case r of
RR fs -> [(lab +++ b,s) | (lab,t) <- fs, (b,s) <- branches t]
RT fs -> [(lab +++ b,s) | (lab,t) <- fs, (b,s) <- branches t]
RFV rs -> [([], ss) | (_,ss) <- concatMap branches rs]
RS s -> [([], [s])]
RCon _ -> []
-- show record in GF-source-like syntax
recordLinearize :: GFCC -> CId -> Exp -> String
recordLinearize gfcc lang = prRecord . recLinearize gfcc lang
-- create a GF-like record, forming the basis of all functions above
recLinearize :: GFCC -> CId -> Exp -> Record
recLinearize gfcc lang exp = mkRecord typ $ linExp gfcc lang exp where
typ = case exp of
DTr _ (AC f) _ -> lookParamLincat gfcc lang $ valCat $ lookType gfcc f
-- show GFCC term
termLinearize :: GFCC -> CId -> Exp -> String
termLinearize gfcc lang = show . linExp gfcc lang

109
src-3.0/GF/GFCC/SkelGFCC.hs Normal file
View File

@@ -0,0 +1,109 @@
module GF.GFCC.SkelGFCC where
-- Haskell module generated by the BNF converter
import GF.GFCC.AbsGFCC
import GF.Data.ErrM
type Result = Err String
failure :: Show a => a -> Result
failure x = Bad $ "Undefined case: " ++ show x
transCId :: CId -> Result
transCId x = case x of
CId str -> failure x
transGrammar :: Grammar -> Result
transGrammar x = case x of
Grm cid cids abstract concretes -> failure x
transAbstract :: Abstract -> Result
transAbstract x = case x of
Abs flags fundefs catdefs -> failure x
transConcrete :: Concrete -> Result
transConcrete x = case x of
Cnc cid flags lindefs0 lindefs1 lindefs2 lindefs3 lindefs -> failure x
transFlag :: Flag -> Result
transFlag x = case x of
Flg cid str -> failure x
transCatDef :: CatDef -> Result
transCatDef x = case x of
Cat cid hypos -> failure x
transFunDef :: FunDef -> Result
transFunDef x = case x of
Fun cid type' exp -> failure x
transLinDef :: LinDef -> Result
transLinDef x = case x of
Lin cid term -> failure x
transType :: Type -> Result
transType x = case x of
DTyp hypos cid exps -> failure x
transExp :: Exp -> Result
transExp x = case x of
DTr cids atom exps -> failure x
EEq equations -> failure x
transAtom :: Atom -> Result
transAtom x = case x of
AC cid -> failure x
AS str -> failure x
AI n -> failure x
AF d -> failure x
AM n -> failure x
AV cid -> failure x
transTerm :: Term -> Result
transTerm x = case x of
R terms -> failure x
P term0 term -> failure x
S terms -> failure x
K tokn -> failure x
V n -> failure x
C n -> failure x
F cid -> failure x
FV terms -> failure x
W str term -> failure x
TM -> failure x
RP term0 term -> failure x
transTokn :: Tokn -> Result
transTokn x = case x of
KS str -> failure x
KP strs variants -> failure x
transVariant :: Variant -> Result
transVariant x = case x of
Var strs0 strs -> failure x
transHypo :: Hypo -> Result
transHypo x = case x of
Hyp cid type' -> failure x
transEquation :: Equation -> Result
transEquation x = case x of
Equ exps exp -> failure x

View File

@@ -0,0 +1,58 @@
-- automatically generated by BNF Converter
module Main where
import IO ( stdin, hGetContents )
import System ( getArgs, getProgName )
import GF.GFCC.LexGFCC
import GF.GFCC.ParGFCC
import GF.GFCC.SkelGFCC
import GF.GFCC.PrintGFCC
import GF.GFCC.AbsGFCC
import GF.Data.ErrM
type ParseFun a = [Token] -> Err a
myLLexer = myLexer
type Verbosity = Int
putStrV :: Verbosity -> String -> IO ()
putStrV v s = if v > 1 then putStrLn s else return ()
runFile :: (Print a, Show a) => Verbosity -> ParseFun a -> FilePath -> IO ()
runFile v p f = putStrLn f >> readFile f >>= run v p
run :: (Print a, Show a) => Verbosity -> ParseFun a -> String -> IO ()
run v p s = let ts = myLLexer s in case p ts of
Bad s -> do putStrLn "\nParse Failed...\n"
putStrV v "Tokens:"
putStrV v $ show ts
putStrLn s
Ok tree -> do putStrLn "\nParse Successful!"
showTree v tree
showTree :: (Show a, Print a) => Int -> a -> IO ()
showTree v tree
= do
putStrV v $ "\n[Abstract Syntax]\n\n" ++ show tree
putStrV v $ "\n[Linearized tree]\n\n" ++ printTree tree
main :: IO ()
main = do args <- getArgs
case args of
[] -> hGetContents stdin >>= run 2 pGrammar
"-s":fs -> mapM_ (runFile 0 pGrammar) fs
fs -> mapM_ (runFile 2 pGrammar) fs

View File

@@ -0,0 +1,13 @@
concrete Eng of Ex = {
lincat
S = {s : Str} ;
NP = {s : Str ; n : Num} ;
VP = {s : Num => Str} ;
param
Num = Sg | Pl ;
lin
Pred np vp = {s = np.s ++ vp.s ! np.n} ;
She = {s = "she" ; n = Sg} ;
They = {s = "they" ; n = Pl} ;
Sleep = {s = table {Sg => "sleeps" ; Pl => "sleep"}} ;
}

View File

@@ -0,0 +1,8 @@
abstract Ex = {
cat
S ; NP ; VP ;
fun
Pred : NP -> VP -> S ;
She, They : NP ;
Sleep : VP ;
}

View File

@@ -0,0 +1,13 @@
concrete Swe of Ex = {
lincat
S = {s : Str} ;
NP = {s : Str} ;
VP = {s : Str} ;
param
Num = Sg | Pl ;
lin
Pred np vp = {s = np.s ++ vp.s} ;
She = {s = "hon"} ;
They = {s = "de"} ;
Sleep = {s = "sover"} ;
}

View File

@@ -0,0 +1,64 @@
-- to test GFCC compilation
flags coding=utf8 ;
cat S ; NP ; N ; VP ;
fun Pred : NP -> VP -> S ;
fun Pred2 : NP -> VP -> NP -> S ;
fun Det, Dets : N -> NP ;
fun Mina, Sina, Me, Te : NP ;
fun Raha, Paska, Pallo : N ;
fun Puhua, Munia, Sanoa : VP ;
param Person = P1 | P2 | P3 ;
param Number = Sg | Pl ;
param Case = Nom | Part ;
param NForm = NF Number Case ;
param VForm = VF Number Person ;
lincat N = Noun ;
lincat VP = Verb ;
oper Noun = {s : NForm => Str} ;
oper Verb = {s : VForm => Str} ;
lincat NP = {s : Case => Str ; a : {n : Number ; p : Person}} ;
lin Pred np vp = {s = np.s ! Nom ++ vp.s ! VF np.a.n np.a.p} ;
lin Pred2 np vp ob = {s = np.s ! Nom ++ vp.s ! VF np.a.n np.a.p ++ ob.s ! Part} ;
lin Det no = {s = \\c => no.s ! NF Sg c ; a = {n = Sg ; p = P3}} ;
lin Dets no = {s = \\c => no.s ! NF Pl c ; a = {n = Pl ; p = P3}} ;
lin Mina = {s = table Case ["minä" ; "minua"] ; a = {n = Sg ; p = P1}} ;
lin Te = {s = table Case ["te" ; "teitä"] ; a = {n = Pl ; p = P2}} ;
lin Sina = {s = table Case ["sinä" ; "sinua"] ; a = {n = Sg ; p = P2}} ;
lin Me = {s = table Case ["me" ; "meitä"] ; a = {n = Pl ; p = P1}} ;
lin Raha = mkN "raha" ;
lin Paska = mkN "paska" ;
lin Pallo = mkN "pallo" ;
lin Puhua = mkV "puhu" ;
lin Munia = mkV "muni" ;
lin Sanoa = mkV "sano" ;
oper mkN : Str -> Noun = \raha -> {
s = table {
NF Sg Nom => raha ;
NF Sg Part => raha + "a" ;
NF Pl Nom => raha + "t" ;
NF Pl Part => Predef.tk 1 raha + "oja"
}
} ;
oper mkV : Str -> Verb = \puhu -> {
s = table {
VF Sg P1 => puhu + "n" ;
VF Sg P2 => puhu + "t" ;
VF Sg P3 => puhu + Predef.dp 1 puhu ;
VF Pl P1 => puhu + "mme" ;
VF Pl P2 => puhu + "tte" ;
VF Pl P3 => puhu + "vat"
}
} ;

View File

@@ -0,0 +1,809 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
<META NAME="generator" CONTENT="http://txt2tags.sf.net">
<TITLE>The GFCC Grammar Format</TITLE>
</HEAD><BODY BGCOLOR="white" TEXT="black">
<P ALIGN="center"><CENTER><H1>The GFCC Grammar Format</H1>
<FONT SIZE="4">
<I>Aarne Ranta</I><BR>
October 5, 2007
</FONT></CENTER>
<P>
Author's address:
<A HREF="http://www.cs.chalmers.se/~aarne"><CODE>http://www.cs.chalmers.se/~aarne</CODE></A>
</P>
<P>
History:
</P>
<UL>
<LI>5 Oct 2007: new, better structured GFCC with full expressive power
<LI>19 Oct: translation of lincats, new figures on C++
<LI>3 Oct 2006: first version
</UL>
<H2>What is GFCC</H2>
<P>
GFCC is a low-level format for GF grammars. Its aim is to contain the minimum
that is needed to process GF grammars at runtime. This minimality has three
advantages:
</P>
<UL>
<LI>compact grammar files and run-time objects
<LI>time and space efficient processing
<LI>simple definition of interpreters
</UL>
<P>
Thus we also want to call GFCC the <B>portable grammar format</B>.
</P>
<P>
The idea is that all embedded GF applications use GFCC.
The GF system would be primarily used as a compiler and as a grammar
development tool.
</P>
<P>
Since GFCC is implemented in BNFC, a parser of the format is readily
available for C, C++, C#, Haskell, Java, and OCaml. Also an XML
representation can be generated in BNFC. A
<A HREF="../">reference implementation</A>
of linearization and some other functions has been written in Haskell.
</P>
<H2>GFCC vs. GFC</H2>
<P>
GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed
to be a run-time format, but also to
support separate compilation of grammars, i.e.
to store the results of compiling
individual GF modules. But this means that GFC has to contain extra information,
such as type annotations, which is only needed in compilation and not at
run-time. In particular, the pattern matching syntax and semantics of GFC is
complex and therefore difficult to implement in new platforms.
</P>
<P>
Actually, GFC is planned to be omitted also as the target format of
separate compilation, where plain GF (type annotated and partially evaluated)
will be used instead. GFC provides only marginal advantages as a target format
compared with GF, and it is therefore just extra weight to carry around this
format.
</P>
<P>
The main differences of GFCC compared with GFC (and GF) can be summarized as follows:
</P>
<UL>
<LI>there are no modules, and therefore no qualified names
<LI>a GFCC grammar is multilingual, and consists of a common abstract syntax
together with one concrete syntax per language
<LI>records and tables are replaced by arrays
<LI>record labels and parameter values are replaced by integers
<LI>record projection and table selection are replaced by array indexing
<LI>even though the format does support dependent types and higher-order abstract
syntax, there is no interpreted yet that does this
</UL>
<P>
Here is an example of a GF grammar, consisting of three modules,
as translated to GFCC. The representations are aligned; thus they do not completely
reflect the order of judgements in GFCC files, which have different orders of
blocks of judgements, and alphabetical sorting.
</P>
<PRE>
grammar Ex(Eng,Swe);
abstract Ex = { abstract {
cat cat
S ; NP ; VP ; NP[]; S[]; VP[];
fun fun
Pred : NP -&gt; VP -&gt; S ; Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))];
She, They : NP ; She=[0,"she"];
Sleep : VP ; They=[1,"they"];
Sleep=[["sleeps","sleep"]];
} } ;
concrete Eng of Ex = { concrete Eng {
lincat lincat
S = {s : Str} ; S=[()];
NP = {s : Str ; n : Num} ; NP=[1,()];
VP = {s : Num =&gt; Str} ; VP=[[(),()]];
param
Num = Sg | Pl ;
lin lin
Pred np vp = { Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))];
s = np.s ++ vp.s ! np.n} ;
She = {s = "she" ; n = Sg} ; She=[0,"she"];
They = {s = "they" ; n = Pl} ; They = [1, "they"];
Sleep = {s = table { Sleep=[["sleeps","sleep"]];
Sg =&gt; "sleeps" ;
Pl =&gt; "sleep"
}
} ;
} } ;
concrete Swe of Ex = { concrete Swe {
lincat lincat
S = {s : Str} ; S=[()];
NP = {s : Str} ; NP=[()];
VP = {s : Str} ; VP=[()];
param
Num = Sg | Pl ;
lin lin
Pred np vp = { Pred = [(($0!0),($1!0))];
s = np.s ++ vp.s} ;
She = {s = "hon"} ; She = ["hon"];
They = {s = "de"} ; They = ["de"];
Sleep = {s = "sover"} ; Sleep = ["sover"];
} } ;
</PRE>
<P></P>
<H2>The syntax of GFCC files</H2>
<P>
The complete BNFC grammar, from which
the rules in this section are taken, is in the file
<A HREF="../DataGFCC.cf"><CODE>GF/GFCC/GFCC.cf</CODE></A>.
</P>
<H3>Top level</H3>
<P>
A grammar has a header telling the name of the abstract syntax
(often specifying an application domain), and the names of
the concrete languages. The abstract syntax and the concrete
syntaxes themselves follow.
</P>
<PRE>
Grm. Grammar ::=
"grammar" CId "(" [CId] ")" ";"
Abstract ";"
[Concrete] ;
Abs. Abstract ::=
"abstract" "{"
"flags" [Flag]
"fun" [FunDef]
"cat" [CatDef]
"}" ;
Cnc. Concrete ::=
"concrete" CId "{"
"flags" [Flag]
"lin" [LinDef]
"oper" [LinDef]
"lincat" [LinDef]
"lindef" [LinDef]
"printname" [LinDef]
"}" ;
</PRE>
<P>
This syntax organizes each module to a sequence of <B>fields</B>, such
as flags, linearizations, operations, linearization types, etc.
It is envisaged that particular applications can ignore some
of the fields, typically so that earlier fields are more
important than later ones.
</P>
<P>
The judgement forms have the following syntax.
</P>
<PRE>
Flg. Flag ::= CId "=" String ;
Cat. CatDef ::= CId "[" [Hypo] "]" ;
Fun. FunDef ::= CId ":" Type "=" Exp ;
Lin. LinDef ::= CId "=" Term ;
</PRE>
<P>
For the run-time system, the reference implementation in Haskell
uses a structure that gives efficient look-up:
</P>
<PRE>
data GFCC = GFCC {
absname :: CId ,
cncnames :: [CId] ,
abstract :: Abstr ,
concretes :: Map CId Concr
}
data Abstr = Abstr {
aflags :: Map CId String, -- value of a flag
funs :: Map CId (Type,Exp), -- type and def of a fun
cats :: Map CId [Hypo], -- context of a cat
catfuns :: Map CId [CId] -- funs yielding a cat (redundant, for fast lookup)
}
data Concr = Concr {
flags :: Map CId String, -- value of a flag
lins :: Map CId Term, -- lin of a fun
opers :: Map CId Term, -- oper generated by subex elim
lincats :: Map CId Term, -- lin type of a cat
lindefs :: Map CId Term, -- lin default of a cat
printnames :: Map CId Term -- printname of a cat or a fun
}
</PRE>
<P>
These definitions are from <A HREF="../DataGFCC.hs"><CODE>GF/GFCC/DataGFCC.hs</CODE></A>.
</P>
<P>
Identifiers (<CODE>CId</CODE>) are like <CODE>Ident</CODE> in GF, except that
the compiler produces constants prefixed with <CODE>_</CODE> in
the common subterm elimination optimization.
</P>
<PRE>
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
</PRE>
<P></P>
<H3>Abstract syntax</H3>
<P>
Types are first-order function types built from argument type
contexts and value types.
category symbols. Syntax trees (<CODE>Exp</CODE>) are
rose trees with nodes consisting of a head (<CODE>Atom</CODE>) and
bound variables (<CODE>CId</CODE>).
</P>
<PRE>
DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ;
DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ;
Hyp. Hypo ::= CId ":" Type ;
</PRE>
<P>
The head Atom is either a function
constant, a bound variable, or a metavariable, or a string, integer, or float
literal.
</P>
<PRE>
AC. Atom ::= CId ;
AS. Atom ::= String ;
AI. Atom ::= Integer ;
AF. Atom ::= Double ;
AM. Atom ::= "?" Integer ;
</PRE>
<P>
The context-free types and trees of the "old GFCC" are special
cases, which can be defined as follows:
</P>
<PRE>
Typ. Type ::= [CId] "-&gt;" CId
Typ args val = DTyp [Hyp (CId "_") arg | arg &lt;- args] val
Tr. Exp ::= "(" CId [Exp] ")"
Tr fun exps = DTr [] fun exps
</PRE>
<P>
To store semantic (<CODE>def</CODE>) definitions by cases, the following expression
form is provided, but it is only meaningful in the last field of a function
declaration in an abstract syntax:
</P>
<PRE>
EEq. Exp ::= "{" [Equation] "}" ;
Equ. Equation ::= [Exp] "-&gt;" Exp ;
</PRE>
<P>
Notice that expressions are used to encode patterns. Primitive notions
(the default semantics in GF) are encoded as empty sets of equations
(<CODE>[]</CODE>). For a constructor (canonical form) of a category <CODE>C</CODE>, we
aim to use the encoding as the application <CODE>(_constr C)</CODE>.
</P>
<H3>Concrete syntax</H3>
<P>
Linearization terms (<CODE>Term</CODE>) are built as follows.
Constructor names are shown to make the later code
examples readable.
</P>
<PRE>
R. Term ::= "[" [Term] "]" ; -- array (record/table)
P. Term ::= "(" Term "!" Term ")" ; -- access to field (projection/selection)
S. Term ::= "(" [Term] ")" ; -- concatenated sequence
K. Term ::= Tokn ; -- token
V. Term ::= "$" Integer ; -- argument (subtree)
C. Term ::= Integer ; -- array index (label/parameter value)
FV. Term ::= "[|" [Term] "|]" ; -- free variation
TM. Term ::= "?" ; -- linearization of metavariable
</PRE>
<P>
Tokens are strings or (maybe obsolescent) prefix-dependent
variant lists.
</P>
<PRE>
KS. Tokn ::= String ;
KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
Var. Variant ::= [String] "/" [String] ;
</PRE>
<P>
Two special forms of terms are introduced by the compiler
as optimizations. They can in principle be eliminated, but
their presence makes grammars much more compact. Their semantics
will be explained in a later section.
</P>
<PRE>
F. Term ::= CId ; -- global constant
W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
</PRE>
<P>
There is also a deprecated form of "record parameter alias",
</P>
<PRE>
RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED
</PRE>
<P>
which will be removed when the migration to new GFCC is complete.
</P>
<H2>The semantics of concrete syntax terms</H2>
<P>
The code in this section is from <A HREF="../Linearize.hs"><CODE>GF/GFCC/Linearize.hs</CODE></A>.
</P>
<H3>Linearization and realization</H3>
<P>
The linearization algorithm is essentially the same as in
GFC: a tree is linearized by evaluating its linearization term
in the environment of the linearizations of the subtrees.
Literal atoms are linearized in the obvious way.
The function also needs to know the language (i.e. concrete syntax)
in which linearization is performed.
</P>
<PRE>
linExp :: GFCC -&gt; CId -&gt; Exp -&gt; Term
linExp gfcc lang tree@(DTr _ at trees) = case at of
AC fun -&gt; comp (Prelude.map lin trees) $ look fun
AS s -&gt; R [kks (show s)] -- quoted
AI i -&gt; R [kks (show i)]
AF d -&gt; R [kks (show d)]
AM -&gt; TM
where
lin = linExp gfcc lang
comp = compute gfcc lang
look = lookLin gfcc lang
</PRE>
<P>
TODO: bindings must be supported.
</P>
<P>
The result of linearization is usually a record, which is realized as
a string using the following algorithm.
</P>
<PRE>
realize :: Term -&gt; String
realize trm = case trm of
R (t:_) -&gt; realize t
S ss -&gt; unwords $ Prelude.map realize ss
K (KS s) -&gt; s
K (KP s _) -&gt; unwords s ---- prefix choice TODO
W s t -&gt; s ++ realize t
FV (t:_) -&gt; realize t
TM -&gt; "?"
</PRE>
<P>
Notice that realization always picks the first field of a record.
If a linearization type has more than one field, the first field
does not necessarily contain the desired string.
Also notice that the order of record fields in GFCC is not necessarily
the same as in GF source.
</P>
<H3>Term evaluation</H3>
<P>
Evaluation follows call-by-value order, with two environments
needed:
</P>
<UL>
<LI>the grammar (a concrete syntax) to give the global constants
<LI>an array of terms to give the subtree linearizations
</UL>
<P>
The code is presented in one-level pattern matching, to
enable reimplementations in languages that do not permit
deep patterns (such as Java and C++).
</P>
<PRE>
compute :: GFCC -&gt; CId -&gt; [Term] -&gt; Term -&gt; Term
compute gfcc lang args = comp where
comp trm = case trm of
P r p -&gt; proj (comp r) (comp p)
W s t -&gt; W s (comp t)
R ts -&gt; R $ Prelude.map comp ts
V i -&gt; idx args (fromInteger i) -- already computed
F c -&gt; comp $ look c -- not computed (if contains V)
FV ts -&gt; FV $ Prelude.map comp ts
S ts -&gt; S $ Prelude.filter (/= S []) $ Prelude.map comp ts
_ -&gt; trm
look = lookOper gfcc lang
idx xs i = xs !! i
proj r p = case (r,p) of
(_, FV ts) -&gt; FV $ Prelude.map (proj r) ts
(W s t, _) -&gt; kks (s ++ getString (proj t p))
_ -&gt; comp $ getField r (getIndex p)
getString t = case t of
K (KS s) -&gt; s
_ -&gt; trace ("ERROR in grammar compiler: string from "++ show t) "ERR"
getIndex t = case t of
C i -&gt; fromInteger i
RP p _ -&gt; getIndex p
TM -&gt; 0 -- default value for parameter
_ -&gt; trace ("ERROR in grammar compiler: index from " ++ show t) 0
getField t i = case t of
R rs -&gt; idx rs i
RP _ r -&gt; getField r i
TM -&gt; TM
_ -&gt; trace ("ERROR in grammar compiler: field from " ++ show t) t
</PRE>
<P></P>
<H3>The special term constructors</H3>
<P>
The three forms introduced by the compiler may a need special
explanation.
</P>
<P>
Global constants
</P>
<PRE>
Term ::= CId ;
</PRE>
<P>
are shorthands for complex terms. They are produced by the
compiler by (iterated) <B>common subexpression elimination</B>.
They are often more powerful than hand-devised code sharing in the source
code. They could be computed off-line by replacing each identifier by
its definition.
</P>
<P>
<B>Prefix-suffix tables</B>
</P>
<PRE>
Term ::= "(" String "+" Term ")" ;
</PRE>
<P>
represent tables of word forms divided to the longest common prefix
and its array of suffixes. In the example grammar above, we have
</P>
<PRE>
Sleep = [("sleep" + ["s",""])]
</PRE>
<P>
which in fact is equal to the array of full forms
</P>
<PRE>
["sleeps", "sleep"]
</PRE>
<P>
The power of this construction comes from the fact that suffix sets
tend to be repeated in a language, and can therefore be collected
by common subexpression elimination. It is this technique that
explains the used syntax rather than the more accurate
</P>
<PRE>
"(" String "+" [String] ")"
</PRE>
<P>
since we want the suffix part to be a <CODE>Term</CODE> for the optimization to
take effect.
</P>
<H2>Compiling to GFCC</H2>
<P>
Compilation to GFCC is performed by the GF grammar compiler, and
GFCC interpreters need not know what it does. For grammar writers,
however, it might be interesting to know what happens to the grammars
in the process.
</P>
<P>
The compilation phases are the following
</P>
<OL>
<LI>type check and partially evaluate GF source
<LI>create a symbol table mapping the GF parameter and record types to
fixed-size arrays, and parameter values and record labels to integers
<LI>traverse the linearization rules replacing parameters and labels by integers
<LI>reorganize the created GF grammar so that it has just one abstract syntax
and one concrete syntax per language
<LI>TODO: apply UTF8 encoding to the grammar, if not yet applied (this is told by the
<CODE>coding</CODE> flag)
<LI>translate the GF grammar object to a GFCC grammar object, using a simple
compositional mapping
<LI>perform the word-suffix optimization on GFCC linearization terms
<LI>perform subexpression elimination on each concrete syntax module
<LI>print out the GFCC code
</OL>
<H3>Problems in GFCC compilation</H3>
<P>
Two major problems had to be solved in compiling GF to GFCC:
</P>
<UL>
<LI>consistent order of tables and records, to permit the array translation
<LI>run-time variables in complex parameter values.
</UL>
<P>
The current implementation is still experimental and may fail
to generate correct code. Any errors remaining are likely to be
related to the two problems just mentioned.
</P>
<P>
The order problem is solved in slightly different ways for tables and records.
In both cases, <B>eta expansion</B> is used to establish a
canonical order. Tables are ordered by applying the preorder induced
by <CODE>param</CODE> definitions. Records are ordered by sorting them by labels.
This means that
e.g. the <CODE>s</CODE> field will in general no longer appear as the first
field, even if it does so in the GF source code. But relying on the
order of fields in a labelled record would be misplaced anyway.
</P>
<P>
The canonical form of records is further complicated by lock fields,
i.e. dummy fields of form <CODE>lock_C = &lt;&gt;</CODE>, which are added to grammar
libraries to force intensionality of linearization types. The problem
is that the absence of a lock field only generates a warning, not
an error. Therefore a GF grammar can contain objects of the same
type with and without a lock field. This problem was solved in GFCC
generation by just removing all lock fields (defined as fields whose
type is the empty record type). This has the further advantage of
(slightly) reducing the grammar size. More importantly, it is safe
to remove lock fields, because they are never used in computation,
and because intensional types are only needed in grammars reused
as libraries, not in grammars used at runtime.
</P>
<P>
While the order problem is rather bureaucratic in nature, run-time
variables are an interesting problem. They arise in the presence
of complex parameter values, created by argument-taking constructors
and parameter records. To give an example, consider the GF parameter
type system
</P>
<PRE>
Number = Sg | Pl ;
Person = P1 | P2 | P3 ;
Agr = Ag Number Person ;
</PRE>
<P>
The values can be translated to integers in the expected way,
</P>
<PRE>
Sg = 0, Pl = 1
P1 = 0, P2 = 1, P3 = 2
Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2,
Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5
</PRE>
<P>
However, an argument of <CODE>Agr</CODE> can be a run-time variable, as in
</P>
<PRE>
Ag np.n P3
</PRE>
<P>
This expression must first be translated to a case expression,
</P>
<PRE>
case np.n of {
0 =&gt; 2 ;
1 =&gt; 5
}
</PRE>
<P>
which can then be translated to the GFCC term
</P>
<PRE>
([2,5] ! ($0 ! $1))
</PRE>
<P>
assuming that the variable <CODE>np</CODE> is the first argument and that its
<CODE>Number</CODE> field is the second in the record.
</P>
<P>
This transformation of course has to be performed recursively, since
there can be several run-time variables in a parameter value:
</P>
<PRE>
Ag np.n np.p
</PRE>
<P>
A similar transformation would be possible to deal with the double
role of parameter records discussed above. Thus the type
</P>
<PRE>
RNP = {n : Number ; p : Person}
</PRE>
<P>
could be uniformly translated into the set <CODE>{0,1,2,3,4,5}</CODE>
as <CODE>Agr</CODE> above. Selections would be simple instances of indexing.
But any projection from the record should be translated into
a case expression,
</P>
<PRE>
rnp.n ===&gt;
case rnp of {
0 =&gt; 0 ;
1 =&gt; 0 ;
2 =&gt; 0 ;
3 =&gt; 1 ;
4 =&gt; 1 ;
5 =&gt; 1
}
</PRE>
<P>
To avoid the code bloat resulting from this, we have chosen to
deal with records by a <B>currying</B> transformation:
</P>
<PRE>
table {n : Number ; p : Person} {... ...}
===&gt;
table Number {Sg =&gt; table Person {...} ; table Person {...}}
</PRE>
<P>
This is performed when GFCC is generated. Selections with
records have to be treated likewise,
</P>
<PRE>
t ! r ===&gt; t ! r.n ! r.p
</PRE>
<P></P>
<H3>The representation of linearization types</H3>
<P>
Linearization types (<CODE>lincat</CODE>) are not needed when generating with
GFCC, but they have been added to enable parser generation directly from
GFCC. The linearization type definitions are shown as a part of the
concrete syntax, by using terms to represent types. Here is the table
showing how different linearization types are encoded.
</P>
<PRE>
P* = max(P) -- parameter type
{r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- record
(P =&gt; T)* = [T* ,...,T*] -- table, size(P) cases
Str* = ()
</PRE>
<P>
For example, the linearization type <CODE>present/CatEng.NP</CODE> is
translated as follows:
</P>
<PRE>
NP = {
a : { -- 6 = 2*3 values
n : {ParamX.Number} ; -- 2 values
p : {ParamX.Person} -- 3 values
} ;
s : {ResEng.Case} =&gt; Str -- 3 values
}
__NP = [[1,2],[(),(),()]]
</PRE>
<P></P>
<H3>Running the compiler and the GFCC interpreter</H3>
<P>
GFCC generation is a part of the
<A HREF="http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html">developers' version</A>
of GF since September 2006. To invoke the compiler, the flag
<CODE>-printer=gfcc</CODE> to the command
<CODE>pm = print_multi</CODE> is used. It is wise to recompile the grammar from
source, since previously compiled libraries may not obey the canonical
order of records.
Here is an example, performed in
<A HREF="../../../../../examples/bronzeage">example/bronzeage</A>.
</P>
<PRE>
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf
strip
pm -printer=gfcc | wf bronze.gfcc
</PRE>
<P>
There is also an experimental batch compiler, which does not use the GFC
format or the record aliases. It can be produced by
</P>
<PRE>
make gfc
</PRE>
<P>
in <CODE>GF/src</CODE>, and invoked by
</P>
<PRE>
gfc --make FILES
</PRE>
<P></P>
<H2>The reference interpreter</H2>
<P>
The reference interpreter written in Haskell consists of the following files:
</P>
<PRE>
-- source file for BNFC
GFCC.cf -- labelled BNF grammar of gfcc
-- files generated by BNFC
AbsGFCC.hs -- abstrac syntax datatypes
ErrM.hs -- error monad used internally
LexGFCC.hs -- lexer of gfcc files
ParGFCC.hs -- parser of gfcc files and syntax trees
PrintGFCC.hs -- printer of gfcc files and syntax trees
-- hand-written files
DataGFCC.hs -- grammar datatype, post-parser grammar creation
Linearize.hs -- linearization and evaluation
Macros.hs -- utilities abstracting away from GFCC datatypes
Generate.hs -- random and exhaustive generation, generate-and-test parsing
API.hs -- functionalities accessible in embedded GF applications
Generate.hs -- random and exhaustive generation
Shell.hs -- main function - a simple command interpreter
</PRE>
<P>
It is included in the
<A HREF="http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html">developers' version</A>
of GF, in the subdirectories <A HREF="../"><CODE>GF/src/GF/GFCC</CODE></A> and
<A HREF="../../Devel"><CODE>GF/src/GF/Devel</CODE></A>.
</P>
<P>
As of September 2007, default parsing in main GF uses GFCC (implemented by Krasimir
Angelov). The interpreter uses the relevant modules
</P>
<PRE>
GF/Conversions/SimpleToFCFG.hs -- generate parser from GFCC
GF/Parsing/FCFG.hs -- run the parser
</PRE>
<P></P>
<P>
To compile the interpreter, type
</P>
<PRE>
make gfcc
</PRE>
<P>
in <CODE>GF/src</CODE>. To run it, type
</P>
<PRE>
./gfcc &lt;GFCC-file&gt;
</PRE>
<P>
The available commands are
</P>
<UL>
<LI><CODE>gr &lt;Cat&gt; &lt;Int&gt;</CODE>: generate a number of random trees in category.
and show their linearizations in all languages
<LI><CODE>grt &lt;Cat&gt; &lt;Int&gt;</CODE>: generate a number of random trees in category.
and show the trees and their linearizations in all languages
<LI><CODE>gt &lt;Cat&gt; &lt;Int&gt;</CODE>: generate a number of trees in category from smallest,
and show their linearizations in all languages
<LI><CODE>gtt &lt;Cat&gt; &lt;Int&gt;</CODE>: generate a number of trees in category from smallest,
and show the trees and their linearizations in all languages
<LI><CODE>p &lt;Lang&gt; &lt;Cat&gt; &lt;String&gt;</CODE>: parse a string into a set of trees
<LI><CODE>lin &lt;Tree&gt;</CODE>: linearize tree in all languages, also showing full records
<LI><CODE>q</CODE>: terminate the system cleanly
</UL>
<H2>Embedded formats</H2>
<UL>
<LI>JavaScript: compiler of linearization and abstract syntax
<P></P>
<LI>Haskell: compiler of abstract syntax and interpreter with parsing,
linearization, and generation
<P></P>
<LI>C: compiler of linearization (old GFCC)
<P></P>
<LI>C++: embedded interpreter supporting linearization (old GFCC)
</UL>
<H2>Some things to do</H2>
<P>
Support for dependent types, higher-order abstract syntax, and
semantic definition in GFCC generation and interpreters.
</P>
<P>
Replacing the entire GF shell by one based on GFCC.
</P>
<P>
Interpreter in Java.
</P>
<P>
Hand-written parsers for GFCC grammars to reduce code size
(and efficiency?) of interpreters.
</P>
<P>
Binary format and/or file compression of GFCC output.
</P>
<P>
Syntax editor based on GFCC.
</P>
<P>
Rewriting of resource libraries in order to exploit the
word-suffix sharing better (depth-one tables, as in FM).
</P>
<!-- html code generated by txt2tags 2.3 (http://txt2tags.sf.net) -->
<!-- cmdline: txt2tags -thtml gfcc.txt -->
</BODY></HTML>

View File

@@ -0,0 +1,712 @@
The GFCC Grammar Format
Aarne Ranta
December 14, 2007
Author's address:
[``http://www.cs.chalmers.se/~aarne`` http://www.cs.chalmers.se/~aarne]
% to compile: txt2tags -thtml --toc gfcc.txt
History:
- 14 Dec 2007: simpler, Lisp-like concrete syntax of GFCC
- 5 Oct 2007: new, better structured GFCC with full expressive power
- 19 Oct: translation of lincats, new figures on C++
- 3 Oct 2006: first version
==What is GFCC==
GFCC is a low-level format for GF grammars. Its aim is to contain the minimum
that is needed to process GF grammars at runtime. This minimality has three
advantages:
- compact grammar files and run-time objects
- time and space efficient processing
- simple definition of interpreters
Thus we also want to call GFCC the **portable grammar format**.
The idea is that all embedded GF applications use GFCC.
The GF system would be primarily used as a compiler and as a grammar
development tool.
Since GFCC is implemented in BNFC, a parser of the format is readily
available for C, C++, C#, Haskell, Java, and OCaml. Also an XML
representation can be generated in BNFC. A
[reference implementation ../]
of linearization and some other functions has been written in Haskell.
==GFCC vs. GFC==
GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed
to be a run-time format, but also to
support separate compilation of grammars, i.e.
to store the results of compiling
individual GF modules. But this means that GFC has to contain extra information,
such as type annotations, which is only needed in compilation and not at
run-time. In particular, the pattern matching syntax and semantics of GFC is
complex and therefore difficult to implement in new platforms.
Actually, GFC is planned to be omitted also as the target format of
separate compilation, where plain GF (type annotated and partially evaluated)
will be used instead. GFC provides only marginal advantages as a target format
compared with GF, and it is therefore just extra weight to carry around this
format.
The main differences of GFCC compared with GFC (and GF) can be
summarized as follows:
- there are no modules, and therefore no qualified names
- a GFCC grammar is multilingual, and consists of a common abstract syntax
together with one concrete syntax per language
- records and tables are replaced by arrays
- record labels and parameter values are replaced by integers
- record projection and table selection are replaced by array indexing
- even though the format does support dependent types and higher-order abstract
syntax, there is no interpreted yet that does this
Here is an example of a GF grammar, consisting of three modules,
as translated to GFCC. The representations are aligned;
thus they do not completely
reflect the order of judgements in GFCC files, which have different orders of
blocks of judgements, and alphabetical sorting.
```
grammar Ex(Eng,Swe);
abstract Ex = { abstract {
cat cat
S ; NP ; VP ; NP[]; S[]; VP[];
fun fun
Pred : NP -> VP -> S ; Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))];
She, They : NP ; She=[0,"she"];
Sleep : VP ; They=[1,"they"];
Sleep=[["sleeps","sleep"]];
} } ;
concrete Eng of Ex = { concrete Eng {
lincat lincat
S = {s : Str} ; S=[()];
NP = {s : Str ; n : Num} ; NP=[1,()];
VP = {s : Num => Str} ; VP=[[(),()]];
param
Num = Sg | Pl ;
lin lin
Pred np vp = { Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))];
s = np.s ++ vp.s ! np.n} ;
She = {s = "she" ; n = Sg} ; She=[0,"she"];
They = {s = "they" ; n = Pl} ; They = [1, "they"];
Sleep = {s = table { Sleep=[["sleeps","sleep"]];
Sg => "sleeps" ;
Pl => "sleep"
}
} ;
} } ;
concrete Swe of Ex = { concrete Swe {
lincat lincat
S = {s : Str} ; S=[()];
NP = {s : Str} ; NP=[()];
VP = {s : Str} ; VP=[()];
param
Num = Sg | Pl ;
lin lin
Pred np vp = { Pred = [(($0!0),($1!0))];
s = np.s ++ vp.s} ;
She = {s = "hon"} ; She = ["hon"];
They = {s = "de"} ; They = ["de"];
Sleep = {s = "sover"} ; Sleep = ["sover"];
} } ;
```
==The syntax of GFCC files==
The complete BNFC grammar, from which
the rules in this section are taken, is in the file
[``GF/GFCC/GFCC.cf`` ../DataGFCC.cf].
===Top level===
A grammar has a header telling the name of the abstract syntax
(often specifying an application domain), and the names of
the concrete languages. The abstract syntax and the concrete
syntaxes themselves follow.
```
Grm. Grammar ::=
"grammar" CId "(" [CId] ")" ";"
Abstract ";"
[Concrete] ;
Abs. Abstract ::=
"abstract" "{"
"flags" [Flag]
"fun" [FunDef]
"cat" [CatDef]
"}" ;
Cnc. Concrete ::=
"concrete" CId "{"
"flags" [Flag]
"lin" [LinDef]
"oper" [LinDef]
"lincat" [LinDef]
"lindef" [LinDef]
"printname" [LinDef]
"}" ;
```
This syntax organizes each module to a sequence of **fields**, such
as flags, linearizations, operations, linearization types, etc.
It is envisaged that particular applications can ignore some
of the fields, typically so that earlier fields are more
important than later ones.
The judgement forms have the following syntax.
```
Flg. Flag ::= CId "=" String ;
Cat. CatDef ::= CId "[" [Hypo] "]" ;
Fun. FunDef ::= CId ":" Type "=" Exp ;
Lin. LinDef ::= CId "=" Term ;
```
For the run-time system, the reference implementation in Haskell
uses a structure that gives efficient look-up:
```
data GFCC = GFCC {
absname :: CId ,
cncnames :: [CId] ,
abstract :: Abstr ,
concretes :: Map CId Concr
}
data Abstr = Abstr {
aflags :: Map CId String, -- value of a flag
funs :: Map CId (Type,Exp), -- type and def of a fun
cats :: Map CId [Hypo], -- context of a cat
catfuns :: Map CId [CId] -- funs yielding a cat (redundant, for fast lookup)
}
data Concr = Concr {
flags :: Map CId String, -- value of a flag
lins :: Map CId Term, -- lin of a fun
opers :: Map CId Term, -- oper generated by subex elim
lincats :: Map CId Term, -- lin type of a cat
lindefs :: Map CId Term, -- lin default of a cat
printnames :: Map CId Term -- printname of a cat or a fun
}
```
These definitions are from [``GF/GFCC/DataGFCC.hs`` ../DataGFCC.hs].
Identifiers (``CId``) are like ``Ident`` in GF, except that
the compiler produces constants prefixed with ``_`` in
the common subterm elimination optimization.
```
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
```
===Abstract syntax===
Types are first-order function types built from argument type
contexts and value types.
category symbols. Syntax trees (``Exp``) are
rose trees with nodes consisting of a head (``Atom``) and
bound variables (``CId``).
```
DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ;
DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ;
Hyp. Hypo ::= CId ":" Type ;
```
The head Atom is either a function
constant, a bound variable, or a metavariable, or a string, integer, or float
literal.
```
AC. Atom ::= CId ;
AS. Atom ::= String ;
AI. Atom ::= Integer ;
AF. Atom ::= Double ;
AM. Atom ::= "?" Integer ;
```
The context-free types and trees of the "old GFCC" are special
cases, which can be defined as follows:
```
Typ. Type ::= [CId] "->" CId
Typ args val = DTyp [Hyp (CId "_") arg | arg <- args] val
Tr. Exp ::= "(" CId [Exp] ")"
Tr fun exps = DTr [] fun exps
```
To store semantic (``def``) definitions by cases, the following expression
form is provided, but it is only meaningful in the last field of a function
declaration in an abstract syntax:
```
EEq. Exp ::= "{" [Equation] "}" ;
Equ. Equation ::= [Exp] "->" Exp ;
```
Notice that expressions are used to encode patterns. Primitive notions
(the default semantics in GF) are encoded as empty sets of equations
(``[]``). For a constructor (canonical form) of a category ``C``, we
aim to use the encoding as the application ``(_constr C)``.
===Concrete syntax===
Linearization terms (``Term``) are built as follows.
Constructor names are shown to make the later code
examples readable.
```
R. Term ::= "[" [Term] "]" ; -- array (record/table)
P. Term ::= "(" Term "!" Term ")" ; -- access to field (projection/selection)
S. Term ::= "(" [Term] ")" ; -- concatenated sequence
K. Term ::= Tokn ; -- token
V. Term ::= "$" Integer ; -- argument (subtree)
C. Term ::= Integer ; -- array index (label/parameter value)
FV. Term ::= "[|" [Term] "|]" ; -- free variation
TM. Term ::= "?" ; -- linearization of metavariable
```
Tokens are strings or (maybe obsolescent) prefix-dependent
variant lists.
```
KS. Tokn ::= String ;
KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
Var. Variant ::= [String] "/" [String] ;
```
Two special forms of terms are introduced by the compiler
as optimizations. They can in principle be eliminated, but
their presence makes grammars much more compact. Their semantics
will be explained in a later section.
```
F. Term ::= CId ; -- global constant
W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
```
There is also a deprecated form of "record parameter alias",
```
RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED
```
which will be removed when the migration to new GFCC is complete.
==The semantics of concrete syntax terms==
The code in this section is from [``GF/GFCC/Linearize.hs`` ../Linearize.hs].
===Linearization and realization===
The linearization algorithm is essentially the same as in
GFC: a tree is linearized by evaluating its linearization term
in the environment of the linearizations of the subtrees.
Literal atoms are linearized in the obvious way.
The function also needs to know the language (i.e. concrete syntax)
in which linearization is performed.
```
linExp :: GFCC -> CId -> Exp -> Term
linExp gfcc lang tree@(DTr _ at trees) = case at of
AC fun -> comp (Prelude.map lin trees) $ look fun
AS s -> R [kks (show s)] -- quoted
AI i -> R [kks (show i)]
AF d -> R [kks (show d)]
AM -> TM
where
lin = linExp gfcc lang
comp = compute gfcc lang
look = lookLin gfcc lang
```
TODO: bindings must be supported.
The result of linearization is usually a record, which is realized as
a string using the following algorithm.
```
realize :: Term -> String
realize trm = case trm of
R (t:_) -> realize t
S ss -> unwords $ Prelude.map realize ss
K (KS s) -> s
K (KP s _) -> unwords s ---- prefix choice TODO
W s t -> s ++ realize t
FV (t:_) -> realize t
TM -> "?"
```
Notice that realization always picks the first field of a record.
If a linearization type has more than one field, the first field
does not necessarily contain the desired string.
Also notice that the order of record fields in GFCC is not necessarily
the same as in GF source.
===Term evaluation===
Evaluation follows call-by-value order, with two environments
needed:
- the grammar (a concrete syntax) to give the global constants
- an array of terms to give the subtree linearizations
The code is presented in one-level pattern matching, to
enable reimplementations in languages that do not permit
deep patterns (such as Java and C++).
```
compute :: GFCC -> CId -> [Term] -> Term -> Term
compute gfcc lang args = comp where
comp trm = case trm of
P r p -> proj (comp r) (comp p)
W s t -> W s (comp t)
R ts -> R $ Prelude.map comp ts
V i -> idx args (fromInteger i) -- already computed
F c -> comp $ look c -- not computed (if contains V)
FV ts -> FV $ Prelude.map comp ts
S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
_ -> trm
look = lookOper gfcc lang
idx xs i = xs !! i
proj r p = case (r,p) of
(_, FV ts) -> FV $ Prelude.map (proj r) ts
(FV ts, _ ) -> FV $ Prelude.map (\t -> proj t p) ts
(W s t, _) -> kks (s ++ getString (proj t p))
_ -> comp $ getField r (getIndex p)
getString t = case t of
K (KS s) -> s
_ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR"
getIndex t = case t of
C i -> fromInteger i
RP p _ -> getIndex p
TM -> 0 -- default value for parameter
_ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0
getField t i = case t of
R rs -> idx rs i
RP _ r -> getField r i
TM -> TM
_ -> trace ("ERROR in grammar compiler: field from " ++ show t) t
```
===The special term constructors===
The three forms introduced by the compiler may a need special
explanation.
Global constants
```
Term ::= CId ;
```
are shorthands for complex terms. They are produced by the
compiler by (iterated) **common subexpression elimination**.
They are often more powerful than hand-devised code sharing in the source
code. They could be computed off-line by replacing each identifier by
its definition.
**Prefix-suffix tables**
```
Term ::= "(" String "+" Term ")" ;
```
represent tables of word forms divided to the longest common prefix
and its array of suffixes. In the example grammar above, we have
```
Sleep = [("sleep" + ["s",""])]
```
which in fact is equal to the array of full forms
```
["sleeps", "sleep"]
```
The power of this construction comes from the fact that suffix sets
tend to be repeated in a language, and can therefore be collected
by common subexpression elimination. It is this technique that
explains the used syntax rather than the more accurate
```
"(" String "+" [String] ")"
```
since we want the suffix part to be a ``Term`` for the optimization to
take effect.
==Compiling to GFCC==
Compilation to GFCC is performed by the GF grammar compiler, and
GFCC interpreters need not know what it does. For grammar writers,
however, it might be interesting to know what happens to the grammars
in the process.
The compilation phases are the following
+ type check and partially evaluate GF source
+ create a symbol table mapping the GF parameter and record types to
fixed-size arrays, and parameter values and record labels to integers
+ traverse the linearization rules replacing parameters and labels by integers
+ reorganize the created GF grammar so that it has just one abstract syntax
and one concrete syntax per language
+ TODO: apply UTF8 encoding to the grammar, if not yet applied (this is told by the
``coding`` flag)
+ translate the GF grammar object to a GFCC grammar object, using a simple
compositional mapping
+ perform the word-suffix optimization on GFCC linearization terms
+ perform subexpression elimination on each concrete syntax module
+ print out the GFCC code
===Problems in GFCC compilation===
Two major problems had to be solved in compiling GF to GFCC:
- consistent order of tables and records, to permit the array translation
- run-time variables in complex parameter values.
The current implementation is still experimental and may fail
to generate correct code. Any errors remaining are likely to be
related to the two problems just mentioned.
The order problem is solved in slightly different ways for tables and records.
In both cases, **eta expansion** is used to establish a
canonical order. Tables are ordered by applying the preorder induced
by ``param`` definitions. Records are ordered by sorting them by labels.
This means that
e.g. the ``s`` field will in general no longer appear as the first
field, even if it does so in the GF source code. But relying on the
order of fields in a labelled record would be misplaced anyway.
The canonical form of records is further complicated by lock fields,
i.e. dummy fields of form ``lock_C = <>``, which are added to grammar
libraries to force intensionality of linearization types. The problem
is that the absence of a lock field only generates a warning, not
an error. Therefore a GF grammar can contain objects of the same
type with and without a lock field. This problem was solved in GFCC
generation by just removing all lock fields (defined as fields whose
type is the empty record type). This has the further advantage of
(slightly) reducing the grammar size. More importantly, it is safe
to remove lock fields, because they are never used in computation,
and because intensional types are only needed in grammars reused
as libraries, not in grammars used at runtime.
While the order problem is rather bureaucratic in nature, run-time
variables are an interesting problem. They arise in the presence
of complex parameter values, created by argument-taking constructors
and parameter records. To give an example, consider the GF parameter
type system
```
Number = Sg | Pl ;
Person = P1 | P2 | P3 ;
Agr = Ag Number Person ;
```
The values can be translated to integers in the expected way,
```
Sg = 0, Pl = 1
P1 = 0, P2 = 1, P3 = 2
Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2,
Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5
```
However, an argument of ``Agr`` can be a run-time variable, as in
```
Ag np.n P3
```
This expression must first be translated to a case expression,
```
case np.n of {
0 => 2 ;
1 => 5
}
```
which can then be translated to the GFCC term
```
([2,5] ! ($0 ! $1))
```
assuming that the variable ``np`` is the first argument and that its
``Number`` field is the second in the record.
This transformation of course has to be performed recursively, since
there can be several run-time variables in a parameter value:
```
Ag np.n np.p
```
A similar transformation would be possible to deal with the double
role of parameter records discussed above. Thus the type
```
RNP = {n : Number ; p : Person}
```
could be uniformly translated into the set ``{0,1,2,3,4,5}``
as ``Agr`` above. Selections would be simple instances of indexing.
But any projection from the record should be translated into
a case expression,
```
rnp.n ===>
case rnp of {
0 => 0 ;
1 => 0 ;
2 => 0 ;
3 => 1 ;
4 => 1 ;
5 => 1
}
```
To avoid the code bloat resulting from this, we have chosen to
deal with records by a **currying** transformation:
```
table {n : Number ; p : Person} {... ...}
===>
table Number {Sg => table Person {...} ; table Person {...}}
```
This is performed when GFCC is generated. Selections with
records have to be treated likewise,
```
t ! r ===> t ! r.n ! r.p
```
===The representation of linearization types===
Linearization types (``lincat``) are not needed when generating with
GFCC, but they have been added to enable parser generation directly from
GFCC. The linearization type definitions are shown as a part of the
concrete syntax, by using terms to represent types. Here is the table
showing how different linearization types are encoded.
```
P* = max(P) -- parameter type
{r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- record
(P => T)* = [T* ,...,T*] -- table, size(P) cases
Str* = ()
```
For example, the linearization type ``present/CatEng.NP`` is
translated as follows:
```
NP = {
a : { -- 6 = 2*3 values
n : {ParamX.Number} ; -- 2 values
p : {ParamX.Person} -- 3 values
} ;
s : {ResEng.Case} => Str -- 3 values
}
__NP = [[1,2],[(),(),()]]
```
===Running the compiler and the GFCC interpreter===
GFCC generation is a part of the
[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html]
of GF since September 2006. To invoke the compiler, the flag
``-printer=gfcc`` to the command
``pm = print_multi`` is used. It is wise to recompile the grammar from
source, since previously compiled libraries may not obey the canonical
order of records.
Here is an example, performed in
[example/bronzeage ../../../../../examples/bronzeage].
```
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf
strip
pm -printer=gfcc | wf bronze.gfcc
```
There is also an experimental batch compiler, which does not use the GFC
format or the record aliases. It can be produced by
```
make gfc
```
in ``GF/src``, and invoked by
```
gfc --make FILES
```
==The reference interpreter==
The reference interpreter written in Haskell consists of the following files:
```
-- source file for BNFC
GFCC.cf -- labelled BNF grammar of gfcc
-- files generated by BNFC
AbsGFCC.hs -- abstrac syntax datatypes
ErrM.hs -- error monad used internally
LexGFCC.hs -- lexer of gfcc files
ParGFCC.hs -- parser of gfcc files and syntax trees
PrintGFCC.hs -- printer of gfcc files and syntax trees
-- hand-written files
DataGFCC.hs -- grammar datatype, post-parser grammar creation
Linearize.hs -- linearization and evaluation
Macros.hs -- utilities abstracting away from GFCC datatypes
Generate.hs -- random and exhaustive generation, generate-and-test parsing
API.hs -- functionalities accessible in embedded GF applications
Generate.hs -- random and exhaustive generation
Shell.hs -- main function - a simple command interpreter
```
It is included in the
[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html]
of GF, in the subdirectories [``GF/src/GF/GFCC`` ../] and
[``GF/src/GF/Devel`` ../../Devel].
As of September 2007, default parsing in main GF uses GFCC (implemented by Krasimir
Angelov). The interpreter uses the relevant modules
```
GF/Conversions/SimpleToFCFG.hs -- generate parser from GFCC
GF/Parsing/FCFG.hs -- run the parser
```
To compile the interpreter, type
```
make gfcc
```
in ``GF/src``. To run it, type
```
./gfcc <GFCC-file>
```
The available commands are
- ``gr <Cat> <Int>``: generate a number of random trees in category.
and show their linearizations in all languages
- ``grt <Cat> <Int>``: generate a number of random trees in category.
and show the trees and their linearizations in all languages
- ``gt <Cat> <Int>``: generate a number of trees in category from smallest,
and show their linearizations in all languages
- ``gtt <Cat> <Int>``: generate a number of trees in category from smallest,
and show the trees and their linearizations in all languages
- ``p <Lang> <Cat> <String>``: parse a string into a set of trees
- ``lin <Tree>``: linearize tree in all languages, also showing full records
- ``q``: terminate the system cleanly
==Embedded formats==
- JavaScript: compiler of linearization and abstract syntax
- Haskell: compiler of abstract syntax and interpreter with parsing,
linearization, and generation
- C: compiler of linearization (old GFCC)
- C++: embedded interpreter supporting linearization (old GFCC)
==Some things to do==
Support for dependent types, higher-order abstract syntax, and
semantic definition in GFCC generation and interpreters.
Replacing the entire GF shell by one based on GFCC.
Interpreter in Java.
Hand-written parsers for GFCC grammars to reduce code size
(and efficiency?) of interpreters.
Binary format and/or file compression of GFCC output.
Syntax editor based on GFCC.
Rewriting of resource libraries in order to exploit the
word-suffix sharing better (depth-one tables, as in FM).

View File

@@ -0,0 +1,50 @@
Grm. Grammar ::= Header ";" Abstract ";" [Concrete] ;
Hdr. Header ::= "grammar" CId "(" [CId] ")" ;
Abs. Abstract ::= "abstract" "{" [AbsDef] "}" ;
Cnc. Concrete ::= "concrete" CId "{" [CncDef] "}" ;
Fun. AbsDef ::= CId ":" Type "=" Exp ;
--AFl. AbsDef ::= "%" CId "=" String ; -- flag
Lin. CncDef ::= CId "=" Term ;
--CFl. CncDef ::= "%" CId "=" String ; -- flag
Typ. Type ::= [CId] "->" CId ;
Tr. Exp ::= "(" Atom [Exp] ")" ;
AC. Atom ::= CId ;
AS. Atom ::= String ;
AI. Atom ::= Integer ;
AF. Atom ::= Double ;
AM. Atom ::= "?" ;
trA. Exp ::= Atom ;
define trA a = Tr a [] ;
R. Term ::= "[" [Term] "]" ; -- record/table
P. Term ::= "(" Term "!" Term ")" ; -- projection/selection
S. Term ::= "(" [Term] ")" ; -- sequence with ++
K. Term ::= Tokn ; -- token
V. Term ::= "$" Integer ; -- argument
C. Term ::= Integer ; -- parameter value/label
F. Term ::= CId ; -- global constant
FV. Term ::= "[|" [Term] "|]" ; -- free variation
W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
RP. Term ::= "(" Term "@" Term ")"; -- record parameter alias
TM. Term ::= "?" ; -- lin of metavariable
L. Term ::= "(" CId "->" Term ")" ; -- lambda abstracted table
BV. Term ::= "#" CId ; -- lambda-bound variable
KS. Tokn ::= String ;
KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
Var. Variant ::= [String] "/" [String] ;
terminator Concrete ";" ;
terminator AbsDef ";" ;
terminator CncDef ";" ;
separator CId "," ;
separator Term "," ;
terminator Exp "" ;
terminator String "" ;
separator Variant "," ;
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;

View File

@@ -0,0 +1,656 @@
The GFCC Grammar Format
Aarne Ranta
October 19, 2006
Author's address:
[``http://www.cs.chalmers.se/~aarne`` http://www.cs.chalmers.se/~aarne]
% to compile: txt2tags -thtml --toc gfcc.txt
History:
- 19 Oct: translation of lincats, new figures on C++
- 3 Oct 2006: first version
==What is GFCC==
GFCC is a low-level format for GF grammars. Its aim is to contain the minimum
that is needed to process GF grammars at runtime. This minimality has three
advantages:
- compact grammar files and run-time objects
- time and space efficient processing
- simple definition of interpreters
The idea is that all embedded GF applications are compiled to GFCC.
The GF system would be primarily used as a compiler and as a grammar
development tool.
Since GFCC is implemented in BNFC, a parser of the format is readily
available for C, C++, Haskell, Java, and OCaml. Also an XML
representation is generated in BNFC. A
[reference implementation ../]
of linearization and some other functions has been written in Haskell.
==GFCC vs. GFC==
GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed
to be a run-time format, but also to
support separate compilation of grammars, i.e.
to store the results of compiling
individual GF modules. But this means that GFC has to contain extra information,
such as type annotations, which is only needed in compilation and not at
run-time. In particular, the pattern matching syntax and semantics of GFC is
complex and therefore difficult to implement in new platforms.
The main differences of GFCC compared with GFC can be summarized as follows:
- there are no modules, and therefore no qualified names
- a GFCC grammar is multilingual, and consists of a common abstract syntax
together with one concrete syntax per language
- records and tables are replaced by arrays
- record labels and parameter values are replaced by integers
- record projection and table selection are replaced by array indexing
- there is (so far) no support for dependent types or higher-order abstract
syntax (which would be easy to add, but make interpreters much more difficult
to write)
Here is an example of a GF grammar, consisting of three modules,
as translated to GFCC. The representations are aligned, with the exceptions
due to the alphabetical sorting of GFCC grammars.
```
grammar Ex(Eng,Swe);
abstract Ex = { abstract {
cat
S ; NP ; VP ;
fun
Pred : NP -> VP -> S ; Pred : NP,VP -> S = (Pred);
She, They : NP ; She : -> NP = (She);
Sleep : VP ; Sleep : -> VP = (Sleep);
They : -> NP = (They);
} } ;
concrete Eng of Ex = { concrete Eng {
lincat
S = {s : Str} ;
NP = {s : Str ; n : Num} ;
VP = {s : Num => Str} ;
param
Num = Sg | Pl ;
lin
Pred np vp = { Pred = [(($0!1),(($1!0)!($0!0)))];
s = np.s ++ vp.s ! np.n} ;
She = {s = "she" ; n = Sg} ; She = [0, "she"];
They = {s = "they" ; n = Pl} ;
Sleep = {s = table { Sleep = [("sleep" + ["s",""])];
Sg => "sleeps" ;
Pl => "sleep" They = [1, "they"];
} } ;
} ;
}
concrete Swe of Ex = { concrete Swe {
lincat
S = {s : Str} ;
NP = {s : Str} ;
VP = {s : Str} ;
param
Num = Sg | Pl ;
lin
Pred np vp = { Pred = [(($0!0),($1!0))];
s = np.s ++ vp.s} ;
She = {s = "hon"} ; She = ["hon"];
They = {s = "de"} ; They = ["de"];
Sleep = {s = "sover"} ; Sleep = ["sover"];
} } ;
```
==The syntax of GFCC files==
===Top level===
A grammar has a header telling the name of the abstract syntax
(often specifying an application domain), and the names of
the concrete languages. The abstract syntax and the concrete
syntaxes themselves follow.
```
Grammar ::= Header ";" Abstract ";" [Concrete] ;
Header ::= "grammar" CId "(" [CId] ")" ;
Abstract ::= "abstract" "{" [AbsDef] "}" ;
Concrete ::= "concrete" CId "{" [CncDef] "}" ;
```
Abstract syntax judgements give typings and semantic definitions.
Concrete syntax judgements give linearizations.
```
AbsDef ::= CId ":" Type "=" Exp ;
CncDef ::= CId "=" Term ;
```
Also flags are possible, local to each "module" (i.e. abstract and concretes).
```
AbsDef ::= "%" CId "=" String ;
CncDef ::= "%" CId "=" String ;
```
For the run-time system, the reference implementation in Haskell
uses a structure that gives efficient look-up:
```
data GFCC = GFCC {
absname :: CId ,
cncnames :: [CId] ,
abstract :: Abstr ,
concretes :: Map CId Concr
}
data Abstr = Abstr {
funs :: Map CId Type, -- find the type of a fun
cats :: Map CId [CId] -- find the funs giving a cat
}
type Concr = Map CId Term
```
===Abstract syntax===
Types are first-order function types built from
category symbols. Syntax trees (``Exp``) are
rose trees with the head (``Atom``) either a function
constant, a metavariable, or a string, integer, or float
literal.
```
Type ::= [CId] "->" CId ;
Exp ::= "(" Atom [Exp] ")" ;
Atom ::= CId ; -- function constant
Atom ::= "?" ; -- metavariable
Atom ::= String ; -- string literal
Atom ::= Integer ; -- integer literal
Atom ::= Double ; -- float literal
```
===Concrete syntax===
Linearization terms (``Term``) are built as follows.
Constructor names are shown to make the later code
examples readable.
```
R. Term ::= "[" [Term] "]" ; -- array
P. Term ::= "(" Term "!" Term ")" ; -- access to indexed field
S. Term ::= "(" [Term] ")" ; -- sequence with ++
K. Term ::= Tokn ; -- token
V. Term ::= "$" Integer ; -- argument
C. Term ::= Integer ; -- array index
FV. Term ::= "[|" [Term] "|]" ; -- free variation
TM. Term ::= "?" ; -- linearization of metavariable
```
Tokens are strings or (maybe obsolescent) prefix-dependent
variant lists.
```
KS. Tokn ::= String ;
KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
Var. Variant ::= [String] "/" [String] ;
```
Three special forms of terms are introduced by the compiler
as optimizations. They can in principle be eliminated, but
their presence makes grammars much more compact. Their semantics
will be explained in a later section.
```
F. Term ::= CId ; -- global constant
W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
RP. Term ::= "(" Term "@" Term ")"; -- record parameter alias
```
Identifiers are like ``Ident`` in GF and GFC, except that
the compiler produces constants prefixed with ``_`` in
the common subterm elimination optimization.
```
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
```
==The semantics of concrete syntax terms==
===Linearization and realization===
The linearization algorithm is essentially the same as in
GFC: a tree is linearized by evaluating its linearization term
in the environment of the linearizations of the subtrees.
Literal atoms are linearized in the obvious way.
The function also needs to know the language (i.e. concrete syntax)
in which linearization is performed.
```
linExp :: GFCC -> CId -> Exp -> Term
linExp mcfg lang tree@(Tr at trees) = case at of
AC fun -> comp (Prelude.map lin trees) $ look fun
AS s -> R [kks (show s)] -- quoted
AI i -> R [kks (show i)]
AF d -> R [kks (show d)]
AM -> TM
where
lin = linExp mcfg lang
comp = compute mcfg lang
look = lookLin mcfg lang
```
The result of linearization is usually a record, which is realized as
a string using the following algorithm.
```
realize :: Term -> String
realize trm = case trm of
R (t:_) -> realize t
S ss -> unwords $ Prelude.map realize ss
K (KS s) -> s
K (KP s _) -> unwords s ---- prefix choice TODO
W s t -> s ++ realize t
FV (t:_) -> realize t
TM -> "?"
```
Since the order of record fields is not necessarily
the same as in GF source,
this realization does not work securely for
categories whose lincats more than one field.
===Term evaluation===
Evaluation follows call-by-value order, with two environments
needed:
- the grammar (a concrete syntax) to give the global constants
- an array of terms to give the subtree linearizations
The code is presented in one-level pattern matching, to
enable reimplementations in languages that do not permit
deep patterns (such as Java and C++).
```
compute :: GFCC -> CId -> [Term] -> Term -> Term
compute mcfg lang args = comp where
comp trm = case trm of
P r p -> proj (comp r) (comp p)
RP i t -> RP (comp i) (comp t)
W s t -> W s (comp t)
R ts -> R $ Prelude.map comp ts
V i -> idx args (fromInteger i) -- already computed
F c -> comp $ look c -- not computed (if contains V)
FV ts -> FV $ Prelude.map comp ts
S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
_ -> trm
look = lookLin mcfg lang
idx xs i = xs !! i
proj r p = case (r,p) of
(_, FV ts) -> FV $ Prelude.map (proj r) ts
(W s t, _) -> kks (s ++ getString (proj t p))
_ -> comp $ getField r (getIndex p)
getString t = case t of
K (KS s) -> s
_ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR"
getIndex t = case t of
C i -> fromInteger i
RP p _ -> getIndex p
TM -> 0 -- default value for parameter
_ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0
getField t i = case t of
R rs -> idx rs i
RP _ r -> getField r i
TM -> TM
_ -> trace ("ERROR in grammar compiler: field from " ++ show t) t
```
===The special term constructors===
The three forms introduced by the compiler may a need special
explanation.
Global constants
```
Term ::= CId ;
```
are shorthands for complex terms. They are produced by the
compiler by (iterated) common subexpression elimination.
They are often more powerful than hand-devised code sharing in the source
code. They could be computed off-line by replacing each identifier by
its definition.
Prefix-suffix tables
```
Term ::= "(" String "+" Term ")" ;
```
represent tables of word forms divided to the longest common prefix
and its array of suffixes. In the example grammar above, we have
```
Sleep = [("sleep" + ["s",""])]
```
which in fact is equal to the array of full forms
```
["sleeps", "sleep"]
```
The power of this construction comes from the fact that suffix sets
tend to be repeated in a language, and can therefore be collected
by common subexpression elimination. It is this technique that
explains the used syntax rather than the more accurate
```
"(" String "+" [String] ")"
```
since we want the suffix part to be a ``Term`` for the optimization to
take effect.
The most curious construct of GFCC is the parameter array alias,
```
Term ::= "(" Term "@" Term ")";
```
This form is used as the value of parameter records, such as the type
```
{n : Number ; p : Person}
```
The problem with parameter records is their double role.
They can be used like parameter values, as indices in selection,
```
VP.s ! {n = Sg ; p = P3}
```
but also as records, from which parameters can be projected:
```
{n = Sg ; p = P3}.n
```
Whichever use is selected as primary, a prohibitively complex
case expression must be generated at compilation to GFCC to get the
other use. The adopted
solution is to generate a pair containing both a parameter value index
and an array of indices of record fields. For instance, if we have
```
param Number = Sg | Pl ; Person = P1 | P2 | P3 ;
```
we get the encoding
```
{n = Sg ; p = P3} ---> (2 @ [0,2])
```
The GFCC computation rules are essentially
```
(t ! (i @ _)) = (t ! i)
((_ @ r) ! j) =(r ! j)
```
==Compiling to GFCC==
Compilation to GFCC is performed by the GF grammar compiler, and
GFCC interpreters need not know what it does. For grammar writers,
however, it might be interesting to know what happens to the grammars
in the process.
The compilation phases are the following
+ translate GF source to GFC, as always in GF
+ undo GFC back-end optimizations
+ perform the ``values`` optimization to normalize tables
+ create a symbol table mapping the GFC parameter and record types to
fixed-size arrays, and parameter values and record labels to integers
+ traverse the linearization rules replacing parameters and labels by integers
+ reorganize the created GFC grammar so that it has just one abstract syntax
and one concrete syntax per language
+ apply UTF8 encoding to the grammar, if not yet applied (this is told by the
``coding`` flag)
+ translate the GFC syntax tree to a GFCC syntax tree, using a simple
compositional mapping
+ perform the word-suffix optimization on GFCC linearization terms
+ perform subexpression elimination on each concrete syntax module
+ print out the GFCC code
Notice that a major part of the compilation is done within GFC, so that
GFC-related tasks (such as parser generation) could be performed by
using the old algorithms.
===Problems in GFCC compilation===
Two major problems had to be solved in compiling GFC to GFCC:
- consistent order of tables and records, to permit the array translation
- run-time variables in complex parameter values.
The current implementation is still experimental and may fail
to generate correct code. Any errors remaining are likely to be
related to the two problems just mentioned.
The order problem is solved in different ways for tables and records.
For tables, the ``values`` optimization of GFC already manages to
maintain a canonical order. But this order can be destroyed by the
``share`` optimization. To make sure that GFCC compilation works properly,
it is safest to recompile the GF grammar by using the ``values``
optimization flag.
Records can be canonically ordered by sorting them by labels.
In fact, this was done in connection of the GFCC work as a part
of the GFC generation, to guarantee consistency. This means that
e.g. the ``s`` field will in general no longer appear as the first
field, even if it does so in the GF source code. But relying on the
order of fields in a labelled record would be misplaced anyway.
The canonical form of records is further complicated by lock fields,
i.e. dummy fields of form ``lock_C = <>``, which are added to grammar
libraries to force intensionality of linearization types. The problem
is that the absence of a lock field only generates a warning, not
an error. Therefore a GFC grammar can contain objects of the same
type with and without a lock field. This problem was solved in GFCC
generation by just removing all lock fields (defined as fields whose
type is the empty record type). This has the further advantage of
(slightly) reducing the grammar size. More importantly, it is safe
to remove lock fields, because they are never used in computation,
and because intensional types are only needed in grammars reused
as libraries, not in grammars used at runtime.
While the order problem is rather bureaucratic in nature, run-time
variables are an interesting problem. They arise in the presence
of complex parameter values, created by argument-taking constructors
and parameter records. To give an example, consider the GF parameter
type system
```
Number = Sg | Pl ;
Person = P1 | P2 | P3 ;
Agr = Ag Number Person ;
```
The values can be translated to integers in the expected way,
```
Sg = 0, Pl = 1
P1 = 0, P2 = 1, P3 = 2
Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2,
Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5
```
However, an argument of ``Agr`` can be a run-time variable, as in
```
Ag np.n P3
```
This expression must first be translated to a case expression,
```
case np.n of {
0 => 2 ;
1 => 5
}
```
which can then be translated to the GFCC term
```
([2,5] ! ($0 ! $1))
```
assuming that the variable ``np`` is the first argument and that its
``Number`` field is the second in the record.
This transformation of course has to be performed recursively, since
there can be several run-time variables in a parameter value:
```
Ag np.n np.p
```
A similar transformation would be possible to deal with the double
role of parameter records discussed above. Thus the type
```
RNP = {n : Number ; p : Person}
```
could be uniformly translated into the set ``{0,1,2,3,4,5}``
as ``Agr`` above. Selections would be simple instances of indexing.
But any projection from the record should be translated into
a case expression,
```
rnp.n ===>
case rnp of {
0 => 0 ;
1 => 0 ;
2 => 0 ;
3 => 1 ;
4 => 1 ;
5 => 1
}
```
To avoid the code bloat resulting from this, we chose the alias representation
which is easy enough to deal with in interpreters.
===The representation of linearization types===
Linearization types (``lincat``) are not needed when generating with
GFCC, but they have been added to enable parser generation directly from
GFCC. The linearization type definitions are shown as a part of the
concrete syntax, by using terms to represent types. Here is the table
showing how different linearization types are encoded.
```
P* = size(P) -- parameter type
{_ : I ; __ : R}* = (I* @ R*) -- record of parameters
{r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- other record
(P => T)* = [T* ,...,T*] -- size(P) times
Str* = ()
```
The category symbols are prefixed with two underscores (``__``).
For example, the linearization type ``present/CatEng.NP`` is
translated as follows:
```
NP = {
a : { -- 6 = 2*3 values
n : {ParamX.Number} ; -- 2 values
p : {ParamX.Person} -- 3 values
} ;
s : {ResEng.Case} => Str -- 3 values
}
__NP = [(6@[2,3]),[(),(),()]]
```
===Running the compiler and the GFCC interpreter===
GFCC generation is a part of the
[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html]
of GF since September 2006. To invoke the compiler, the flag
``-printer=gfcc`` to the command
``pm = print_multi`` is used. It is wise to recompile the grammar from
source, since previously compiled libraries may not obey the canonical
order of records. To ``strip`` the grammar before
GFCC translation removes unnecessary interface references.
Here is an example, performed in
[example/bronzeage ../../../../../examples/bronzeage].
```
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf
i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf
strip
pm -printer=gfcc | wf bronze.gfcc
```
==The reference interpreter==
The reference interpreter written in Haskell consists of the following files:
```
-- source file for BNFC
GFCC.cf -- labelled BNF grammar of gfcc
-- files generated by BNFC
AbsGFCC.hs -- abstrac syntax of gfcc
ErrM.hs -- error monad used internally
LexGFCC.hs -- lexer of gfcc files
ParGFCC.hs -- parser of gfcc files and syntax trees
PrintGFCC.hs -- printer of gfcc files and syntax trees
-- hand-written files
DataGFCC.hs -- post-parser grammar creation, linearization and evaluation
GenGFCC.hs -- random and exhaustive generation, generate-and-test parsing
RunGFCC.hs -- main function - a simple command interpreter
```
It is included in the
[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html]
of GF, in the subdirectory [``GF/src/GF/Canon/GFCC`` ../].
To compile the interpreter, type
```
make gfcc
```
in ``GF/src``. To run it, type
```
./gfcc <GFCC-file>
```
The available commands are
- ``gr <Cat> <Int>``: generate a number of random trees in category.
and show their linearizations in all languages
- ``grt <Cat> <Int>``: generate a number of random trees in category.
and show the trees and their linearizations in all languages
- ``gt <Cat> <Int>``: generate a number of trees in category from smallest,
and show their linearizations in all languages
- ``gtt <Cat> <Int>``: generate a number of trees in category from smallest,
and show the trees and their linearizations in all languages
- ``p <Int> <Cat> <String>``: "parse", i.e. generate trees until match or
until the given number have been generated
- ``<Tree>``: linearize tree in all languages, also showing full records
- ``quit``: terminate the system cleanly
==Interpreter in C++==
A base-line interpreter in C++ has been started.
Its main functionality is random generation of trees and linearization of them.
Here are some results from running the different interpreters, compared
to running the same grammar in GF, saved in ``.gfcm`` format.
The grammar contains the English, German, and Norwegian
versions of Bronzeage. The experiment was carried out on
Ubuntu Linux laptop with 1.5 GHz Intel centrino processor.
|| | GF | gfcc(hs) | gfcc++ |
| program size | 7249k | 803k | 113k
| grammar size | 336k | 119k | 119k
| read grammar | 1150ms | 510ms | 100ms
| generate 222 | 9500ms | 450ms | 800ms
| memory | 21M | 10M | 20M
To summarize:
- going from GF to gfcc is a major win in both code size and efficiency
- going from Haskell to C++ interpreter is not a win yet, because of a space
leak in the C++ version
==Some things to do==
Interpreter in Java.
Parsing via MCFG
- the FCFG format can possibly be simplified
- parser grammars should be saved in files to make interpreters easier
Hand-written parsers for GFCC grammars to reduce code size
(and efficiency?) of interpreters.
Binary format and/or file compression of GFCC output.
Syntax editor based on GFCC.
Rewriting of resource libraries in order to exploit the
word-suffix sharing better (depth-one tables, as in FM).

View File

@@ -0,0 +1,180 @@
GFCC Syntax
==Syntax of GFCC files==
The parser syntax is very simple, as defined in BNF:
```
Grm. Grammar ::= [RExp] ;
App. RExp ::= "(" CId [RExp] ")" ;
AId. RExp ::= CId ;
AInt. RExp ::= Integer ;
AStr. RExp ::= String ;
AFlt. RExp ::= Double ;
AMet. RExp ::= "?" ;
terminator RExp "" ;
token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
```
While a parser and a printer can be generated for many languages
from this grammar by using the BNF Converter, a parser is also
easy to write by hand using recursive descent.
==Syntax of well-formed GFCC code==
Here is a summary of well-formed syntax,
with a comment on the semantics of each construction.
```
Grammar ::=
("grammar" CId CId*) -- abstract syntax name and concrete syntax names
"(" "flags" Flag* ")" -- global and abstract flags
"(" "abstract" Abstract ")" -- abstract syntax
"(" "concrete" Concrete* ")" -- concrete syntaxes
Abstract ::=
"(" "fun" FunDef* ")" -- function definitions
"(" "cat" CatDef* ")" -- category definitions
Concrete ::=
"(" CId -- language name
"flags" Flag* -- concrete flags
"lin" LinDef* -- linearization rules
"oper" LinDef* -- operations (macros)
"lincat" LinDef* -- linearization type definitions
"lindef" LinDef* -- linearization default definitions
"printname" LinDef* -- printname definitions
"param" LinDef* -- lincats with labels and parameter value names
")"
Flag ::= "(" CId String ")" -- flag and value
FunDef ::= "(" CId Type Exp ")" -- function, type, and definition
CatDef ::= "(" CId Hypo* ")" -- category and context
LinDef ::= "(" CId Term ")" -- function and definition
Type ::=
"(" CId -- value category
"(" "H" Hypo* ")" -- argument context
"(" "X" Exp* ")" ")" -- arguments (of dependent value type)
Exp ::=
"(" CId -- function
"(" "B" CId* ")" -- bindings
"(" "X" Exp* ")" ")" -- arguments
| CId -- variable
| "?" -- metavariable
| "(" "Eq" Equation* ")" -- group of pattern equations
| Integer -- integer literal (non-negative)
| Float -- floating-point literal (non-negative)
| String -- string literal (in double quotes)
Hypo ::= "(" CId Type ")" -- variable and type
Equation ::= "(" "E" Exp Exp* ")" -- value and pattern list
Term ::=
"(" "R" Term* ")" -- array (record or table)
| "(" "S" Term* ")" -- concatenated sequence
| "(" "FV" Term* ")" -- free variant list
| "(" "P" Term Term ")" -- access to index (projection or selection)
| "(" "W" String Term ")" -- token prefix with suffix list
| "(" "A" Integer ")" -- pointer to subtree
| String -- token (in double quotes)
| Integer -- index in array
| CId -- macro constant
| "?" -- metavariable
```
==GFCC interpreter==
The first phase in interpreting GFCC is to parse a GFCC file and
build an internal abstract syntax representation, as specified
in the previous section.
With this representation, linearization can be performed by
a straightforward function from expressions (``Exp``) to terms
(``Term``). All expressions except groups of pattern equations
can be linearized.
Here is a reference Haskell implementation of linearization:
```
linExp :: GFCC -> CId -> Exp -> Term
linExp gfcc lang tree@(DTr _ at trees) = case at of
AC fun -> comp (map lin trees) $ look fun
AS s -> R [K (show s)] -- quoted
AI i -> R [K (show i)]
AF d -> R [K (show d)]
AM -> TM
where
lin = linExp gfcc lang
comp = compute gfcc lang
look = lookLin gfcc lang
```
TODO: bindings must be supported.
Terms resulting from linearization are evaluated in
call-by-value order, with two environments needed:
- the grammar (a concrete syntax) to give the global constants
- an array of terms to give the subtree linearizations
The Haskell implementation works as follows:
```
compute :: GFCC -> CId -> [Term] -> Term -> Term
compute gfcc lang args = comp where
comp trm = case trm of
P r p -> proj (comp r) (comp p)
W s t -> W s (comp t)
R ts -> R $ map comp ts
V i -> idx args (fromInteger i) -- already computed
F c -> comp $ look c -- not computed (if contains V)
FV ts -> FV $ Prelude.map comp ts
S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
_ -> trm
look = lookOper gfcc lang
idx xs i = xs !! i
proj r p = case (r,p) of
(_, FV ts) -> FV $ Prelude.map (proj r) ts
(FV ts, _ ) -> FV $ Prelude.map (\t -> proj t p) ts
(W s t, _) -> kks (s ++ getString (proj t p))
_ -> comp $ getField r (getIndex p)
getString t = case t of
K (KS s) -> s
_ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR"
getIndex t = case t of
C i -> fromInteger i
RP p _ -> getIndex p
TM -> 0 -- default value for parameter
_ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0
getField t i = case t of
R rs -> idx rs i
RP _ r -> getField r i
TM -> TM
_ -> trace ("ERROR in grammar compiler: field from " ++ show t) t
```
The result of linearization is usually a record, which is realized as
a string using the following algorithm.
```
realize :: Term -> String
realize trm = case trm of
R (t:_) -> realize t
S ss -> unwords $ map realize ss
K s -> s
W s t -> s ++ realize t
FV (t:_) -> realize t -- TODO: all variants
TM -> "?"
```
Notice that realization always picks the first field of a record.
If a linearization type has more than one field, the first field
does not necessarily contain the desired string.
Also notice that the order of record fields in GFCC is not necessarily
the same as in GF source.