diff --git a/gf.cabal b/gf.cabal index f5133ced3..8f83e641f 100644 --- a/gf.cabal +++ b/gf.cabal @@ -87,7 +87,9 @@ Library -- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant. transformers-compat, ghc-prim, - text + text, + hashable, + unordered-containers hs-source-dirs: src/runtime/haskell other-modules: @@ -216,6 +218,7 @@ Library GF.Data.ErrM GF.Data.Graph GF.Data.Graphviz + GF.Data.IntMapBuilder GF.Data.Relation GF.Data.Str GF.Data.Utilities @@ -423,6 +426,7 @@ test-suite lpgf GF.Data.ErrM GF.Data.Graph GF.Data.Graphviz + GF.Data.IntMapBuilder GF.Data.Operations GF.Data.Relation GF.Data.Str @@ -529,6 +533,7 @@ test-suite lpgf directory, filepath, ghc-prim, + hashable, haskeline, json, mtl, @@ -541,6 +546,7 @@ test-suite lpgf time, transformers-compat, unix, + unordered-containers, utf8-string default-language: Haskell2010 @@ -610,6 +616,7 @@ benchmark lpgf-bench GF.Data.ErrM GF.Data.Graph GF.Data.Graphviz + GF.Data.IntMapBuilder GF.Data.Operations GF.Data.Relation GF.Data.Str @@ -734,6 +741,7 @@ benchmark lpgf-bench directory, filepath, ghc-prim, + hashable, haskeline, json, mtl, @@ -746,5 +754,6 @@ benchmark lpgf-bench time, transformers-compat, unix, + unordered-containers, utf8-string default-language: Haskell2010 diff --git a/src/compiler/GF/Compile/GrammarToLPGF.hs b/src/compiler/GF/Compile/GrammarToLPGF.hs index 167ef9a80..b96e553dd 100644 --- a/src/compiler/GF/Compile/GrammarToLPGF.hs +++ b/src/compiler/GF/Compile/GrammarToLPGF.hs @@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C import GF.Compile.GrammarToCanonical (grammar2canonical) import GF.Data.Operations (ErrorMonad (..)) +import qualified GF.Data.IntMapBuilder as IntMapBuilder import GF.Infra.Option (Options) import GF.Infra.UseIO (IOE) import GF.Text.Pretty (pp, render) @@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map import Data.Maybe (fromJust, isJust) import Data.Text (Text) import qualified Data.Text as T -import Data.Tuple (swap) import System.Environment (lookupEnv) import System.FilePath ((), (<.>)) import Text.Printf (printf) @@ -292,38 +292,36 @@ isRecordType _ = False -- | Find all token strings, put them in a map and replace with token indexes extractStrings :: L.Concrete -> L.Concrete -extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'} +extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' } where - state = ( - L.toks concr, -- the IntMap we're building - Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups - ) - (lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state + imb = IntMapBuilder.fromIntMap (L.toks concr) + (lins',imb') = CMS.runState (go0 (L.lins concr)) imb + toks' = IntMapBuilder.toIntMap imb' - go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun) + go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun) go0 mp = do xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp) return $ Map.fromList xs - go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun + go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun go lf = case lf of L.Token str -> do - (intMap,revMap) <- CMS.get - case Map.lookup str revMap of - Just i -> return $ L.TokenIx i - Nothing -> do - -- add string to map and replace with index - let i = IntMap.size intMap - let intMap' = IntMap.insert i str intMap - let revMap' = Map.insert str i revMap - CMS.put (intMap',revMap') - return $ L.TokenIx i + imb <- CMS.get + let (ix,imb') = IntMapBuilder.insert' str imb + CMS.put imb' + return $ L.TokenIx ix - -- TODO big savings can be made by putting prefixes in string map too L.Pre pts df -> do - pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts + -- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts + pts' <- forM pts $ \(pfxs,lv) -> do + imb <- CMS.get + let str = T.pack $ show pfxs + let (ix,imb') = IntMapBuilder.insert' str imb + CMS.put imb' + lv' <- go lv + return (ix,lv') df' <- go df - return $ L.Pre pts' df' + return $ L.PreIx pts' df' L.Concat s t -> do s' <- go s t' <- go t diff --git a/src/compiler/GF/Data/IntMapBuilder.hs b/src/compiler/GF/Data/IntMapBuilder.hs new file mode 100644 index 000000000..77be56b47 --- /dev/null +++ b/src/compiler/GF/Data/IntMapBuilder.hs @@ -0,0 +1,57 @@ +-- | In order to build an IntMap in one pass, we need a map data structure with +-- fast lookup in both keys and values. +-- This is achieved by keeping a separate reversed map of values to keys during building. +module GF.Data.IntMapBuilder where + +import Data.IntMap (IntMap) +import qualified Data.IntMap as IntMap +import Data.Hashable (Hashable) +import Data.HashMap.Strict (HashMap) +import qualified Data.HashMap.Strict as HashMap +import Data.Tuple (swap) +import Prelude hiding (lookup) + +data IMB a = IMB { + intMap :: IntMap a, + valMap :: HashMap a Int +} + +-- | An empty IMB +empty :: (Eq a, Hashable a) => IMB a +empty = IMB { + intMap = IntMap.empty, + valMap = HashMap.empty +} + +-- | Lookup a value +lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int +lookup a IMB { valMap = vm } = HashMap.lookup a vm + +-- | Insert without any lookup +insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a) +insert a IMB { intMap = im, valMap = vm } = + let + ix = IntMap.size im + im' = IntMap.insert ix a im + vm' = HashMap.insert a ix vm + imb' = IMB { intMap = im', valMap = vm' } + in + (ix, imb') + +-- | Insert only when lookup fails +insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a) +insert' a imb = + case lookup a imb of + Just ix -> (ix, imb) + Nothing -> insert a imb + +-- | Build IMB from existing IntMap +fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a +fromIntMap im = IMB { + intMap = im, + valMap = HashMap.fromList (map swap (IntMap.toList im)) +} + +-- | Get IntMap from IMB +toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a +toIntMap = intMap diff --git a/src/runtime/haskell/LPGF.hs b/src/runtime/haskell/LPGF.hs index fc9a19749..944f3888d 100644 --- a/src/runtime/haskell/LPGF.hs +++ b/src/runtime/haskell/LPGF.hs @@ -68,12 +68,16 @@ data LinFun = -- From original definition in paper | Empty | Token Text - | TokenIx Int -- ^ index into `toks` map | Concat LinFun LinFun | Ix Int | Tuple [LinFun] | Projection LinFun LinFun | Argument Int + + -- For reducing LPGF file when stored + | PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list) + | TokenIx Int -- ^ index into `toks` map + deriving (Show, Read) instance Binary LPGF where @@ -116,14 +120,17 @@ instance Binary LinFun where AllCapit -> putWord8 4 Pre ps d -> putWord8 5 >> put (ps,d) Missing f -> putWord8 13 >> put f + Empty -> putWord8 6 Token t -> putWord8 7 >> put t - TokenIx i -> putWord8 14 >> put i Concat l1 l2 -> putWord8 8 >> put (l1,l2) Ix i -> putWord8 9 >> put i Tuple ls -> putWord8 10 >> put ls Projection l1 l2 -> putWord8 11 >> put (l1,l2) Argument i -> putWord8 12 >> put i + + PreIx ps d -> putWord8 15 >> put (ps,d) + TokenIx i -> putWord8 14 >> put i get = do tag <- getWord8 case tag of @@ -134,14 +141,17 @@ instance Binary LinFun where 4 -> return AllCapit 5 -> liftM2 Pre get get 13 -> liftM Missing get + 6 -> return Empty 7 -> liftM Token get - 14 -> liftM TokenIx get 8 -> liftM2 Concat get get 9 -> liftM Ix get 10 -> liftM Tuple get 11 -> liftM2 Projection get get 12 -> liftM Argument get + + 15 -> liftM2 PreIx get get + 14 -> liftM TokenIx get _ -> fail "Failed to decode LPGF binary format" instance Binary Text where @@ -199,7 +209,7 @@ eval cxt t = case t of where pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts] df' = eval cxt df - TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt) + Concat s t -> Concat v w where v = eval cxt s @@ -214,6 +224,13 @@ eval cxt t = case t of (tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv) (t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u') Argument i -> cxArgs cxt !! (i-1) + + PreIx pts df -> Pre pts' df' + where + pts' = [(pfxs, eval cxt t) | (ix, t) <- pts, let pfxs = maybe [] (read . T.unpack) $ IntMap.lookup ix (cxToks cxt)] + df' = eval cxt df + TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt) + _ -> t flattenTuple :: LinFun -> [LinFun] diff --git a/testsuite/lpgf/README.md b/testsuite/lpgf/README.md index 443588c4a..cad885b0b 100644 --- a/testsuite/lpgf/README.md +++ b/testsuite/lpgf/README.md @@ -17,6 +17,8 @@ stack test gf:test:lpgf --test-arguments="unittests/Params" # specific grammar stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages ``` +Set environment variable `DEBUG=1` to enable dumping of intermediate formats. + ## Benchmark Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.