Also store Pre prefixes in token map. Introduce IntMapBuilder data structure.

Storing of prefixes uses show/read, which isn't a great solution but avoids having yet another token map.
This commit is contained in:
John J. Camilleri
2021-03-04 09:58:17 +01:00
parent 4082c006c3
commit 30b016032d
5 changed files with 110 additions and 27 deletions

View File

@@ -87,7 +87,9 @@ Library
-- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant. -- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant.
transformers-compat, transformers-compat,
ghc-prim, ghc-prim,
text text,
hashable,
unordered-containers
hs-source-dirs: src/runtime/haskell hs-source-dirs: src/runtime/haskell
other-modules: other-modules:
@@ -216,6 +218,7 @@ Library
GF.Data.ErrM GF.Data.ErrM
GF.Data.Graph GF.Data.Graph
GF.Data.Graphviz GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Relation GF.Data.Relation
GF.Data.Str GF.Data.Str
GF.Data.Utilities GF.Data.Utilities
@@ -423,6 +426,7 @@ test-suite lpgf
GF.Data.ErrM GF.Data.ErrM
GF.Data.Graph GF.Data.Graph
GF.Data.Graphviz GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Operations GF.Data.Operations
GF.Data.Relation GF.Data.Relation
GF.Data.Str GF.Data.Str
@@ -529,6 +533,7 @@ test-suite lpgf
directory, directory,
filepath, filepath,
ghc-prim, ghc-prim,
hashable,
haskeline, haskeline,
json, json,
mtl, mtl,
@@ -541,6 +546,7 @@ test-suite lpgf
time, time,
transformers-compat, transformers-compat,
unix, unix,
unordered-containers,
utf8-string utf8-string
default-language: Haskell2010 default-language: Haskell2010
@@ -610,6 +616,7 @@ benchmark lpgf-bench
GF.Data.ErrM GF.Data.ErrM
GF.Data.Graph GF.Data.Graph
GF.Data.Graphviz GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Operations GF.Data.Operations
GF.Data.Relation GF.Data.Relation
GF.Data.Str GF.Data.Str
@@ -734,6 +741,7 @@ benchmark lpgf-bench
directory, directory,
filepath, filepath,
ghc-prim, ghc-prim,
hashable,
haskeline, haskeline,
json, json,
mtl, mtl,
@@ -746,5 +754,6 @@ benchmark lpgf-bench
time, time,
transformers-compat, transformers-compat,
unix, unix,
unordered-containers,
utf8-string utf8-string
default-language: Haskell2010 default-language: Haskell2010

View File

@@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C
import GF.Compile.GrammarToCanonical (grammar2canonical) import GF.Compile.GrammarToCanonical (grammar2canonical)
import GF.Data.Operations (ErrorMonad (..)) import GF.Data.Operations (ErrorMonad (..))
import qualified GF.Data.IntMapBuilder as IntMapBuilder
import GF.Infra.Option (Options) import GF.Infra.Option (Options)
import GF.Infra.UseIO (IOE) import GF.Infra.UseIO (IOE)
import GF.Text.Pretty (pp, render) import GF.Text.Pretty (pp, render)
@@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map
import Data.Maybe (fromJust, isJust) import Data.Maybe (fromJust, isJust)
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Tuple (swap)
import System.Environment (lookupEnv) import System.Environment (lookupEnv)
import System.FilePath ((</>), (<.>)) import System.FilePath ((</>), (<.>))
import Text.Printf (printf) import Text.Printf (printf)
@@ -292,38 +292,36 @@ isRecordType _ = False
-- | Find all token strings, put them in a map and replace with token indexes -- | Find all token strings, put them in a map and replace with token indexes
extractStrings :: L.Concrete -> L.Concrete extractStrings :: L.Concrete -> L.Concrete
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'} extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' }
where where
state = ( imb = IntMapBuilder.fromIntMap (L.toks concr)
L.toks concr, -- the IntMap we're building (lins',imb') = CMS.runState (go0 (L.lins concr)) imb
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups toks' = IntMapBuilder.toIntMap imb'
)
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun) go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun)
go0 mp = do go0 mp = do
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp) xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
return $ Map.fromList xs return $ Map.fromList xs
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun
go lf = case lf of go lf = case lf of
L.Token str -> do L.Token str -> do
(intMap,revMap) <- CMS.get imb <- CMS.get
case Map.lookup str revMap of let (ix,imb') = IntMapBuilder.insert' str imb
Just i -> return $ L.TokenIx i CMS.put imb'
Nothing -> do return $ L.TokenIx ix
-- add string to map and replace with index
let i = IntMap.size intMap
let intMap' = IntMap.insert i str intMap
let revMap' = Map.insert str i revMap
CMS.put (intMap',revMap')
return $ L.TokenIx i
-- TODO big savings can be made by putting prefixes in string map too
L.Pre pts df -> do L.Pre pts df -> do
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts -- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
pts' <- forM pts $ \(pfxs,lv) -> do
imb <- CMS.get
let str = T.pack $ show pfxs
let (ix,imb') = IntMapBuilder.insert' str imb
CMS.put imb'
lv' <- go lv
return (ix,lv')
df' <- go df df' <- go df
return $ L.Pre pts' df' return $ L.PreIx pts' df'
L.Concat s t -> do L.Concat s t -> do
s' <- go s s' <- go s
t' <- go t t' <- go t

View File

@@ -0,0 +1,57 @@
-- | In order to build an IntMap in one pass, we need a map data structure with
-- fast lookup in both keys and values.
-- This is achieved by keeping a separate reversed map of values to keys during building.
module GF.Data.IntMapBuilder where
import Data.IntMap (IntMap)
import qualified Data.IntMap as IntMap
import Data.Hashable (Hashable)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Tuple (swap)
import Prelude hiding (lookup)
data IMB a = IMB {
intMap :: IntMap a,
valMap :: HashMap a Int
}
-- | An empty IMB
empty :: (Eq a, Hashable a) => IMB a
empty = IMB {
intMap = IntMap.empty,
valMap = HashMap.empty
}
-- | Lookup a value
lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int
lookup a IMB { valMap = vm } = HashMap.lookup a vm
-- | Insert without any lookup
insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert a IMB { intMap = im, valMap = vm } =
let
ix = IntMap.size im
im' = IntMap.insert ix a im
vm' = HashMap.insert a ix vm
imb' = IMB { intMap = im', valMap = vm' }
in
(ix, imb')
-- | Insert only when lookup fails
insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert' a imb =
case lookup a imb of
Just ix -> (ix, imb)
Nothing -> insert a imb
-- | Build IMB from existing IntMap
fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a
fromIntMap im = IMB {
intMap = im,
valMap = HashMap.fromList (map swap (IntMap.toList im))
}
-- | Get IntMap from IMB
toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a
toIntMap = intMap

View File

@@ -68,12 +68,16 @@ data LinFun =
-- From original definition in paper -- From original definition in paper
| Empty | Empty
| Token Text | Token Text
| TokenIx Int -- ^ index into `toks` map
| Concat LinFun LinFun | Concat LinFun LinFun
| Ix Int | Ix Int
| Tuple [LinFun] | Tuple [LinFun]
| Projection LinFun LinFun | Projection LinFun LinFun
| Argument Int | Argument Int
-- For reducing LPGF file when stored
| PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list)
| TokenIx Int -- ^ index into `toks` map
deriving (Show, Read) deriving (Show, Read)
instance Binary LPGF where instance Binary LPGF where
@@ -116,14 +120,17 @@ instance Binary LinFun where
AllCapit -> putWord8 4 AllCapit -> putWord8 4
Pre ps d -> putWord8 5 >> put (ps,d) Pre ps d -> putWord8 5 >> put (ps,d)
Missing f -> putWord8 13 >> put f Missing f -> putWord8 13 >> put f
Empty -> putWord8 6 Empty -> putWord8 6
Token t -> putWord8 7 >> put t Token t -> putWord8 7 >> put t
TokenIx i -> putWord8 14 >> put i
Concat l1 l2 -> putWord8 8 >> put (l1,l2) Concat l1 l2 -> putWord8 8 >> put (l1,l2)
Ix i -> putWord8 9 >> put i Ix i -> putWord8 9 >> put i
Tuple ls -> putWord8 10 >> put ls Tuple ls -> putWord8 10 >> put ls
Projection l1 l2 -> putWord8 11 >> put (l1,l2) Projection l1 l2 -> putWord8 11 >> put (l1,l2)
Argument i -> putWord8 12 >> put i Argument i -> putWord8 12 >> put i
PreIx ps d -> putWord8 15 >> put (ps,d)
TokenIx i -> putWord8 14 >> put i
get = do get = do
tag <- getWord8 tag <- getWord8
case tag of case tag of
@@ -134,14 +141,17 @@ instance Binary LinFun where
4 -> return AllCapit 4 -> return AllCapit
5 -> liftM2 Pre get get 5 -> liftM2 Pre get get
13 -> liftM Missing get 13 -> liftM Missing get
6 -> return Empty 6 -> return Empty
7 -> liftM Token get 7 -> liftM Token get
14 -> liftM TokenIx get
8 -> liftM2 Concat get get 8 -> liftM2 Concat get get
9 -> liftM Ix get 9 -> liftM Ix get
10 -> liftM Tuple get 10 -> liftM Tuple get
11 -> liftM2 Projection get get 11 -> liftM2 Projection get get
12 -> liftM Argument get 12 -> liftM Argument get
15 -> liftM2 PreIx get get
14 -> liftM TokenIx get
_ -> fail "Failed to decode LPGF binary format" _ -> fail "Failed to decode LPGF binary format"
instance Binary Text where instance Binary Text where
@@ -199,7 +209,7 @@ eval cxt t = case t of
where where
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts] pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
df' = eval cxt df df' = eval cxt df
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
Concat s t -> Concat v w Concat s t -> Concat v w
where where
v = eval cxt s v = eval cxt s
@@ -214,6 +224,13 @@ eval cxt t = case t of
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv) (tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u') (t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
Argument i -> cxArgs cxt !! (i-1) Argument i -> cxArgs cxt !! (i-1)
PreIx pts df -> Pre pts' df'
where
pts' = [(pfxs, eval cxt t) | (ix, t) <- pts, let pfxs = maybe [] (read . T.unpack) $ IntMap.lookup ix (cxToks cxt)]
df' = eval cxt df
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
_ -> t _ -> t
flattenTuple :: LinFun -> [LinFun] flattenTuple :: LinFun -> [LinFun]

View File

@@ -17,6 +17,8 @@ stack test gf:test:lpgf --test-arguments="unittests/Params" # specific grammar
stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages
``` ```
Set environment variable `DEBUG=1` to enable dumping of intermediate formats.
## Benchmark ## Benchmark
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here. Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.