diff --git a/gf.cabal b/gf.cabal index 0a64e3d17..a162b40c3 100644 --- a/gf.cabal +++ b/gf.cabal @@ -109,12 +109,13 @@ library ghc-prof-options: -fprof-auto exposed-modules: + LPGF PGF PGF.Internal PGF.Haskell - LPGF other-modules: + LPGF.Internal PGF.Data PGF.Macros PGF.Binary @@ -540,6 +541,7 @@ test-suite lpgf GF.Text.Pretty GF.Text.Transliterations LPGF + LPGF.Internal PGF PGF.Binary PGF.ByteCode @@ -740,6 +742,7 @@ benchmark lpgf-bench GF.Text.Pretty GF.Text.Transliterations LPGF + LPGF.Internal PGF PGF.Binary PGF.ByteCode diff --git a/src/compiler/GF/Compile/GrammarToLPGF.hs b/src/compiler/GF/Compile/GrammarToLPGF.hs index e868fc14f..137a3a23f 100644 --- a/src/compiler/GF/Compile/GrammarToLPGF.hs +++ b/src/compiler/GF/Compile/GrammarToLPGF.hs @@ -1,7 +1,7 @@ module GF.Compile.GrammarToLPGF (mkCanon2lpgf) where -import LPGF (LPGF (..)) -import qualified LPGF as L +import LPGF.Internal (LPGF (..)) +import qualified LPGF.Internal as L import PGF.CId import GF.Grammar.Grammar diff --git a/src/compiler/GF/Compiler.hs b/src/compiler/GF/Compiler.hs index d4ea2b8be..524a514fc 100644 --- a/src/compiler/GF/Compiler.hs +++ b/src/compiler/GF/Compiler.hs @@ -4,7 +4,7 @@ import PGF import PGF.Internal(concretes,optimizePGF,unionPGF) import PGF.Internal(putSplitAbs,encodeFile,runPut) import LPGF(LPGF) -import qualified LPGF +import qualified LPGF.Internal as LPGF import GF.Compile as S(batchCompile,link,linkl,srcAbsName) import GF.CompileInParallel as P(parallelBatchCompile) import GF.Compile.Export @@ -193,7 +193,7 @@ writePGF opts pgf = writeLPGF :: Options -> LPGF -> IOE FilePath writeLPGF opts lpgf = do let - grammarName = fromMaybe (showCId (LPGF.abstractName lpgf)) (flag optName opts) + grammarName = fromMaybe (showCId (LPGF.absname lpgf)) (flag optName opts) outfile = outputPath opts (grammarName <.> "lpgf") writing opts outfile $ liftIO $ LPGF.encodeFile outfile lpgf return outfile diff --git a/src/runtime/haskell/LPGF.hs b/src/runtime/haskell/LPGF.hs index 2ef2fe323..7eda7c2a7 100644 --- a/src/runtime/haskell/LPGF.hs +++ b/src/runtime/haskell/LPGF.hs @@ -2,193 +2,100 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} --- | Linearisation-only grammar format. --- Closely follows description in Section 2 of Angelov, Bringert, Ranta (2009): +-- | Linearisation-only portable grammar format. +-- +-- LPGF is an output format from the GF compiler, intended as a smaller and faster alternative to PGF. +-- This API allows LPGF files to be used in Haskell programs. +-- +-- The implementation closely follows description in Section 2 of Angelov, Bringert, Ranta (2009): -- "PGF: A Portable Run-Time Format for Type-Theoretical Grammars". -- http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.640.6330&rep=rep1&type=pdf module LPGF ( - -- ** Types - LPGF (..), Abstract (..), Concrete (..), LinFun (..), + -- * LPGF + LPGF, + showLPGF, + readLPGF, - -- ** Reading/writing - readLPGF, LPGF.encodeFile, + -- * Identifiers + CId, + mkCId, + showCId, + readCId, + + -- * Abstract syntax + Abstract, + abstractName, + + -- ** Categories + + -- ** Functions + + -- ** Expressions + Expr, + PGF.showExpr, + PGF.readExpr, + + -- ** Types + + -- ** Type checking + + -- * Concrete syntax + Language, + PGF.showLanguage, + PGF.readLanguage, + languages, + Concrete, + LPGF.concretes, -- ** Linearization - linearize, linearizeText, linearizeConcrete, linearizeConcreteText, - - -- ** Other - abstractName, - PGF.showLanguage, PGF.readExpr, - - -- ** DEBUG only, to be removed - render, pp + linearize, + linearizeText, + linearizeConcrete, + linearizeConcreteText ) where +import LPGF.Internal import PGF (Language) import PGF.CId import PGF.Expr (Expr, Literal (..)) import PGF.Tree (Tree (..), expr2tree, prTree) import qualified PGF --- import qualified Control.Exception as EX -import Control.Monad (liftM, liftM2, forM_) -import qualified Control.Monad.Writer as CMW -import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile) +import Data.Binary (decodeFile) import Data.Either (isLeft) import qualified Data.IntMap as IntMap import qualified Data.Map.Strict as Map import Data.Text (Text) import qualified Data.Text as T -import qualified Data.Text.Encoding as TE import Numeric (showFFloat) import Text.Printf (printf) import Prelude hiding ((!!)) import qualified Prelude --- | Linearisation-only PGF -data LPGF = LPGF { - absname :: CId, - abstract :: Abstract, - concretes :: Map.Map CId Concrete -} deriving (Show) - --- | Abstract syntax (currently empty) -data Abstract = Abstract { -} deriving (Show) - --- | Concrete syntax -data Concrete = Concrete { - toks :: IntMap.IntMap Text, -- ^ all strings are stored exactly once here - -- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category - lins :: Map.Map CId LinFun -- ^ a linearization function for each function -} deriving (Show) - --- | Abstract function type --- data Type = Type [CId] CId --- deriving (Show) - --- -- | Linearisation type --- data LinType = --- StrType --- | IxType Int --- | ProductType [LinType] --- deriving (Show) - --- | Linearisation function -data LinFun = - -- Additions - Error String -- ^ a runtime error, should probably not be supported at all - | Bind -- ^ join adjacent tokens - | Space -- ^ space between adjacent tokens - | Capit -- ^ capitalise next character - | AllCapit -- ^ capitalise next word - | Pre [([Text], LinFun)] LinFun - | Missing CId -- ^ missing definition (inserted at runtime) - - -- From original definition in paper - | Empty - | Token Text - | Concat LinFun LinFun - | Ix Int - | Tuple [LinFun] - | Projection LinFun LinFun - | Argument Int - - -- For reducing LPGF file when stored - | PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list) - | TokenIx Int -- ^ index into `toks` map - - deriving (Show, Read) - -instance Binary LPGF where - put lpgf = do - put (absname lpgf) - put (abstract lpgf) - put (concretes lpgf) - get = do - an <- get - abs <- get - concs <- get - return $ LPGF { - absname = an, - abstract = abs, - concretes = concs - } - -instance Binary Abstract where - put abs = return () - get = return $ Abstract {} - -instance Binary Concrete where - put concr = do - put (toks concr) - put (lins concr) - get = do - ts <- get - ls <- get - return $ Concrete { - toks = ts, - lins = ls - } - -instance Binary LinFun where - put = \case - Error e -> putWord8 0 >> put e - Bind -> putWord8 1 - Space -> putWord8 2 - Capit -> putWord8 3 - AllCapit -> putWord8 4 - Pre ps d -> putWord8 5 >> put (ps,d) - Missing f -> putWord8 13 >> put f - - Empty -> putWord8 6 - Token t -> putWord8 7 >> put t - Concat l1 l2 -> putWord8 8 >> put (l1,l2) - Ix i -> putWord8 9 >> put i - Tuple ls -> putWord8 10 >> put ls - Projection l1 l2 -> putWord8 11 >> put (l1,l2) - Argument i -> putWord8 12 >> put i - - PreIx ps d -> putWord8 15 >> put (ps,d) - TokenIx i -> putWord8 14 >> put i - - get = do - tag <- getWord8 - case tag of - 0 -> liftM Error get - 1 -> return Bind - 2 -> return Space - 3 -> return Capit - 4 -> return AllCapit - 5 -> liftM2 Pre get get - 13 -> liftM Missing get - - 6 -> return Empty - 7 -> liftM Token get - 8 -> liftM2 Concat get get - 9 -> liftM Ix get - 10 -> liftM Tuple get - 11 -> liftM2 Projection get get - 12 -> liftM Argument get - - 15 -> liftM2 PreIx get get - 14 -> liftM TokenIx get - _ -> fail "Failed to decode LPGF binary format" - -instance Binary Text where - put = put . TE.encodeUtf8 - get = liftM TE.decodeUtf8 get - +-- | The abstract language name is the name of the top-level abstract module. abstractName :: LPGF -> CId abstractName = absname -encodeFile :: FilePath -> LPGF -> IO () -encodeFile = Data.Binary.encodeFile +-- | List of all languages available in the given grammar. +languages :: LPGF -> [Language] +languages = Map.keys . LPGF.Internal.concretes +-- | Map of all languages and their corresponding concrete sytaxes. +concretes :: LPGF -> Map.Map Language Concrete +concretes = LPGF.Internal.concretes + +-- | Reads file in LPGF and produces 'LPGF' term. +-- The file is usually produced with: +-- +-- > $ gf --make --output-format=lpgf readLPGF :: FilePath -> IO LPGF readLPGF = Data.Binary.decodeFile +-- | Produce pretty-printed representation of an LPGF. +showLPGF :: LPGF -> String +showLPGF = render . pp + -- | Main linearize function, to 'String' linearize :: LPGF -> Language -> Expr -> String linearize lpgf lang expr = T.unpack $ linearizeText lpgf lang expr @@ -196,7 +103,7 @@ linearize lpgf lang expr = T.unpack $ linearizeText lpgf lang expr -- | Main linearize function, to 'Data.Text.Text' linearizeText :: LPGF -> Language -> Expr -> Text linearizeText lpgf lang = - case Map.lookup lang (concretes lpgf) of + case Map.lookup lang (LPGF.Internal.concretes lpgf) of Just concr -> linearizeConcreteText concr Nothing -> error $ printf "Unknown language: %s" (showCId lang) @@ -223,13 +130,6 @@ linearizeConcreteText concr expr = lin2string $ lin (expr2tree expr) LFlt f -> showFFloat (Just 6) f "" x -> error $ printf "Cannot lin: %s" (prTree x) --- -- | Run a computation and catch any exception/errors. --- -- Ideally this library should never throw exceptions, but we're still in development... --- try :: a -> IO (Either String a) --- try comp = do --- let f = Right <$> EX.evaluate comp --- EX.catch f (\(e :: EX.SomeException) -> return $ Left (show e)) - -- | Evaluation context data Context = Context { cxArgs :: [LinFun], -- ^ is a sequence of terms @@ -331,75 +231,3 @@ lin2string lf = T.unwords $ join $ flatten [lf] isIx :: LinFun -> Bool isIx (Ix _) = True isIx _ = False - --- | Helper for building concat trees -mkConcat :: [LinFun] -> LinFun -mkConcat [] = Empty -mkConcat [x] = x -mkConcat xs = foldl1 Concat xs - --- | Helper for unfolding concat trees -unConcat :: LinFun -> [LinFun] -unConcat (Concat l1 l2) = concatMap unConcat [l1, l2] -unConcat lf = [lf] - ------------------------------------------------------------------------------- --- Pretty-printing - -type Doc = CMW.Writer [String] () - -render :: Doc -> String -render = unlines . CMW.execWriter - -class PP a where - pp :: a -> Doc - -instance PP LPGF where - pp (LPGF _ _ cncs) = mapM_ pp cncs - -instance PP Concrete where - pp (Concrete toks lins) = do - forM_ (IntMap.toList toks) $ \(i,tok) -> - CMW.tell [show i ++ " " ++ T.unpack tok] - CMW.tell [""] - forM_ (Map.toList lins) $ \(cid,lin) -> do - CMW.tell ["# " ++ showCId cid] - pp lin - CMW.tell [""] - -instance PP LinFun where - pp = pp' 0 - where - pp' n = \case - Pre ps d -> do - p "Pre" - CMW.tell [ replicate (2*(n+1)) ' ' ++ show p | p <- ps ] - pp' (n+1) d - - c@(Concat l1 l2) -> do - let ts = unConcat c - if any isDeep ts - then do - p "Concat" - mapM_ (pp' (n+1)) ts - else - p $ "Concat " ++ show ts - Tuple ls | any isDeep ls -> do - p "Tuple" - mapM_ (pp' (n+1)) ls - Projection l1 l2 | isDeep l1 || isDeep l2 -> do - p "Projection" - pp' (n+1) l1 - pp' (n+1) l2 - t -> p $ show t - where - p :: String -> Doc - p t = CMW.tell [ replicate (2*n) ' ' ++ t ] - - isDeep = not . isTerm - isTerm = \case - Pre _ _ -> False - Concat _ _ -> False - Tuple _ -> False - Projection _ _ -> False - _ -> True diff --git a/src/runtime/haskell/LPGF/Internal.hs b/src/runtime/haskell/LPGF/Internal.hs new file mode 100644 index 000000000..7490a214a --- /dev/null +++ b/src/runtime/haskell/LPGF/Internal.hs @@ -0,0 +1,227 @@ +{-# LANGUAGE LambdaCase #-} + +module LPGF.Internal where + +import PGF.CId +import PGF () + +import Control.Monad (liftM, liftM2, forM_) +import qualified Control.Monad.Writer as CMW +import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile) +import qualified Data.IntMap as IntMap +import qualified Data.Map.Strict as Map +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE + +-- | Linearisation-only PGF +data LPGF = LPGF { + absname :: CId, + abstract :: Abstract, + concretes :: Map.Map CId Concrete +} deriving (Show) + +-- | Abstract syntax (currently empty) +data Abstract = Abstract { +} deriving (Show) + +-- | Concrete syntax +data Concrete = Concrete { + toks :: IntMap.IntMap Text, -- ^ all strings are stored exactly once here + -- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category + lins :: Map.Map CId LinFun -- ^ a linearization function for each function +} deriving (Show) + +-- | Abstract function type +-- data Type = Type [CId] CId +-- deriving (Show) + +-- -- | Linearisation type +-- data LinType = +-- StrType +-- | IxType Int +-- | ProductType [LinType] +-- deriving (Show) + +-- | Linearisation function +data LinFun = + -- Additions + Error String -- ^ a runtime error, should probably not be supported at all + | Bind -- ^ join adjacent tokens + | Space -- ^ space between adjacent tokens + | Capit -- ^ capitalise next character + | AllCapit -- ^ capitalise next word + | Pre [([Text], LinFun)] LinFun + | Missing CId -- ^ missing definition (inserted at runtime) + + -- From original definition in paper + | Empty + | Token Text + | Concat LinFun LinFun + | Ix Int + | Tuple [LinFun] + | Projection LinFun LinFun + | Argument Int + + -- For reducing LPGF file when stored + | PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list) + | TokenIx Int -- ^ index into `toks` map + + deriving (Show, Read) + +instance Binary LPGF where + put lpgf = do + put (absname lpgf) + put (abstract lpgf) + put (concretes lpgf) + get = do + an <- get + abs <- get + concs <- get + return $ LPGF { + absname = an, + abstract = abs, + concretes = concs + } + +instance Binary Abstract where + put abs = return () + get = return $ Abstract {} + +instance Binary Concrete where + put concr = do + put (toks concr) + put (lins concr) + get = do + ts <- get + ls <- get + return $ Concrete { + toks = ts, + lins = ls + } + +instance Binary LinFun where + put = \case + Error e -> putWord8 0 >> put e + Bind -> putWord8 1 + Space -> putWord8 2 + Capit -> putWord8 3 + AllCapit -> putWord8 4 + Pre ps d -> putWord8 5 >> put (ps,d) + Missing f -> putWord8 13 >> put f + + Empty -> putWord8 6 + Token t -> putWord8 7 >> put t + Concat l1 l2 -> putWord8 8 >> put (l1,l2) + Ix i -> putWord8 9 >> put i + Tuple ls -> putWord8 10 >> put ls + Projection l1 l2 -> putWord8 11 >> put (l1,l2) + Argument i -> putWord8 12 >> put i + + PreIx ps d -> putWord8 15 >> put (ps,d) + TokenIx i -> putWord8 14 >> put i + + get = do + tag <- getWord8 + case tag of + 0 -> liftM Error get + 1 -> return Bind + 2 -> return Space + 3 -> return Capit + 4 -> return AllCapit + 5 -> liftM2 Pre get get + 13 -> liftM Missing get + + 6 -> return Empty + 7 -> liftM Token get + 8 -> liftM2 Concat get get + 9 -> liftM Ix get + 10 -> liftM Tuple get + 11 -> liftM2 Projection get get + 12 -> liftM Argument get + + 15 -> liftM2 PreIx get get + 14 -> liftM TokenIx get + _ -> fail "Failed to decode LPGF binary format" + +instance Binary Text where + put = put . TE.encodeUtf8 + get = liftM TE.decodeUtf8 get + +encodeFile :: FilePath -> LPGF -> IO () +encodeFile = Data.Binary.encodeFile + +------------------------------------------------------------------------------ +-- Utilities + +-- | Helper for building concat trees +mkConcat :: [LinFun] -> LinFun +mkConcat [] = Empty +mkConcat [x] = x +mkConcat xs = foldl1 Concat xs + +-- | Helper for unfolding concat trees +unConcat :: LinFun -> [LinFun] +unConcat (Concat l1 l2) = concatMap unConcat [l1, l2] +unConcat lf = [lf] + +------------------------------------------------------------------------------ +-- Pretty-printing + +type Doc = CMW.Writer [String] () + +render :: Doc -> String +render = unlines . CMW.execWriter + +class PP a where + pp :: a -> Doc + +instance PP LPGF where + pp (LPGF _ _ cncs) = mapM_ pp cncs + +instance PP Concrete where + pp (Concrete toks lins) = do + forM_ (IntMap.toList toks) $ \(i,tok) -> + CMW.tell [show i ++ " " ++ T.unpack tok] + CMW.tell [""] + forM_ (Map.toList lins) $ \(cid,lin) -> do + CMW.tell ["# " ++ showCId cid] + pp lin + CMW.tell [""] + +instance PP LinFun where + pp = pp' 0 + where + pp' n = \case + Pre ps d -> do + p "Pre" + CMW.tell [ replicate (2*(n+1)) ' ' ++ show p | p <- ps ] + pp' (n+1) d + + c@(Concat l1 l2) -> do + let ts = unConcat c + if any isDeep ts + then do + p "Concat" + mapM_ (pp' (n+1)) ts + else + p $ "Concat " ++ show ts + Tuple ls | any isDeep ls -> do + p "Tuple" + mapM_ (pp' (n+1)) ls + Projection l1 l2 | isDeep l1 || isDeep l2 -> do + p "Projection" + pp' (n+1) l1 + pp' (n+1) l2 + t -> p $ show t + where + p :: String -> Doc + p t = CMW.tell [ replicate (2*n) ' ' ++ t ] + + isDeep = not . isTerm + isTerm = \case + Pre _ _ -> False + Concat _ _ -> False + Tuple _ -> False + Projection _ _ -> False + _ -> True