Also store Pre prefixes in token map. Introduce IntMapBuilder data structure.

Storing of prefixes uses show/read, which isn't a great solution but avoids having yet another token map.
This commit is contained in:
John J. Camilleri
2021-03-04 09:58:17 +01:00
parent 4082c006c3
commit 30b016032d
5 changed files with 110 additions and 27 deletions

View File

@@ -87,7 +87,9 @@ Library
-- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant.
transformers-compat,
ghc-prim,
text
text,
hashable,
unordered-containers
hs-source-dirs: src/runtime/haskell
other-modules:
@@ -216,6 +218,7 @@ Library
GF.Data.ErrM
GF.Data.Graph
GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Relation
GF.Data.Str
GF.Data.Utilities
@@ -423,6 +426,7 @@ test-suite lpgf
GF.Data.ErrM
GF.Data.Graph
GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Operations
GF.Data.Relation
GF.Data.Str
@@ -529,6 +533,7 @@ test-suite lpgf
directory,
filepath,
ghc-prim,
hashable,
haskeline,
json,
mtl,
@@ -541,6 +546,7 @@ test-suite lpgf
time,
transformers-compat,
unix,
unordered-containers,
utf8-string
default-language: Haskell2010
@@ -610,6 +616,7 @@ benchmark lpgf-bench
GF.Data.ErrM
GF.Data.Graph
GF.Data.Graphviz
GF.Data.IntMapBuilder
GF.Data.Operations
GF.Data.Relation
GF.Data.Str
@@ -734,6 +741,7 @@ benchmark lpgf-bench
directory,
filepath,
ghc-prim,
hashable,
haskeline,
json,
mtl,
@@ -746,5 +754,6 @@ benchmark lpgf-bench
time,
transformers-compat,
unix,
unordered-containers,
utf8-string
default-language: Haskell2010

View File

@@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C
import GF.Compile.GrammarToCanonical (grammar2canonical)
import GF.Data.Operations (ErrorMonad (..))
import qualified GF.Data.IntMapBuilder as IntMapBuilder
import GF.Infra.Option (Options)
import GF.Infra.UseIO (IOE)
import GF.Text.Pretty (pp, render)
@@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map
import Data.Maybe (fromJust, isJust)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Tuple (swap)
import System.Environment (lookupEnv)
import System.FilePath ((</>), (<.>))
import Text.Printf (printf)
@@ -292,38 +292,36 @@ isRecordType _ = False
-- | Find all token strings, put them in a map and replace with token indexes
extractStrings :: L.Concrete -> L.Concrete
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' }
where
state = (
L.toks concr, -- the IntMap we're building
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
)
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
imb = IntMapBuilder.fromIntMap (L.toks concr)
(lins',imb') = CMS.runState (go0 (L.lins concr)) imb
toks' = IntMapBuilder.toIntMap imb'
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun)
go0 mp = do
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
return $ Map.fromList xs
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun
go lf = case lf of
L.Token str -> do
(intMap,revMap) <- CMS.get
case Map.lookup str revMap of
Just i -> return $ L.TokenIx i
Nothing -> do
-- add string to map and replace with index
let i = IntMap.size intMap
let intMap' = IntMap.insert i str intMap
let revMap' = Map.insert str i revMap
CMS.put (intMap',revMap')
return $ L.TokenIx i
imb <- CMS.get
let (ix,imb') = IntMapBuilder.insert' str imb
CMS.put imb'
return $ L.TokenIx ix
-- TODO big savings can be made by putting prefixes in string map too
L.Pre pts df -> do
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
-- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
pts' <- forM pts $ \(pfxs,lv) -> do
imb <- CMS.get
let str = T.pack $ show pfxs
let (ix,imb') = IntMapBuilder.insert' str imb
CMS.put imb'
lv' <- go lv
return (ix,lv')
df' <- go df
return $ L.Pre pts' df'
return $ L.PreIx pts' df'
L.Concat s t -> do
s' <- go s
t' <- go t

View File

@@ -0,0 +1,57 @@
-- | In order to build an IntMap in one pass, we need a map data structure with
-- fast lookup in both keys and values.
-- This is achieved by keeping a separate reversed map of values to keys during building.
module GF.Data.IntMapBuilder where
import Data.IntMap (IntMap)
import qualified Data.IntMap as IntMap
import Data.Hashable (Hashable)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Tuple (swap)
import Prelude hiding (lookup)
data IMB a = IMB {
intMap :: IntMap a,
valMap :: HashMap a Int
}
-- | An empty IMB
empty :: (Eq a, Hashable a) => IMB a
empty = IMB {
intMap = IntMap.empty,
valMap = HashMap.empty
}
-- | Lookup a value
lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int
lookup a IMB { valMap = vm } = HashMap.lookup a vm
-- | Insert without any lookup
insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert a IMB { intMap = im, valMap = vm } =
let
ix = IntMap.size im
im' = IntMap.insert ix a im
vm' = HashMap.insert a ix vm
imb' = IMB { intMap = im', valMap = vm' }
in
(ix, imb')
-- | Insert only when lookup fails
insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert' a imb =
case lookup a imb of
Just ix -> (ix, imb)
Nothing -> insert a imb
-- | Build IMB from existing IntMap
fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a
fromIntMap im = IMB {
intMap = im,
valMap = HashMap.fromList (map swap (IntMap.toList im))
}
-- | Get IntMap from IMB
toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a
toIntMap = intMap

View File

@@ -68,12 +68,16 @@ data LinFun =
-- From original definition in paper
| Empty
| Token Text
| TokenIx Int -- ^ index into `toks` map
| Concat LinFun LinFun
| Ix Int
| Tuple [LinFun]
| Projection LinFun LinFun
| Argument Int
-- For reducing LPGF file when stored
| PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list)
| TokenIx Int -- ^ index into `toks` map
deriving (Show, Read)
instance Binary LPGF where
@@ -116,14 +120,17 @@ instance Binary LinFun where
AllCapit -> putWord8 4
Pre ps d -> putWord8 5 >> put (ps,d)
Missing f -> putWord8 13 >> put f
Empty -> putWord8 6
Token t -> putWord8 7 >> put t
TokenIx i -> putWord8 14 >> put i
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
Ix i -> putWord8 9 >> put i
Tuple ls -> putWord8 10 >> put ls
Projection l1 l2 -> putWord8 11 >> put (l1,l2)
Argument i -> putWord8 12 >> put i
PreIx ps d -> putWord8 15 >> put (ps,d)
TokenIx i -> putWord8 14 >> put i
get = do
tag <- getWord8
case tag of
@@ -134,14 +141,17 @@ instance Binary LinFun where
4 -> return AllCapit
5 -> liftM2 Pre get get
13 -> liftM Missing get
6 -> return Empty
7 -> liftM Token get
14 -> liftM TokenIx get
8 -> liftM2 Concat get get
9 -> liftM Ix get
10 -> liftM Tuple get
11 -> liftM2 Projection get get
12 -> liftM Argument get
15 -> liftM2 PreIx get get
14 -> liftM TokenIx get
_ -> fail "Failed to decode LPGF binary format"
instance Binary Text where
@@ -199,7 +209,7 @@ eval cxt t = case t of
where
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
df' = eval cxt df
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
Concat s t -> Concat v w
where
v = eval cxt s
@@ -214,6 +224,13 @@ eval cxt t = case t of
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
Argument i -> cxArgs cxt !! (i-1)
PreIx pts df -> Pre pts' df'
where
pts' = [(pfxs, eval cxt t) | (ix, t) <- pts, let pfxs = maybe [] (read . T.unpack) $ IntMap.lookup ix (cxToks cxt)]
df' = eval cxt df
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
_ -> t
flattenTuple :: LinFun -> [LinFun]

View File

@@ -17,6 +17,8 @@ stack test gf:test:lpgf --test-arguments="unittests/Params" # specific grammar
stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages
```
Set environment variable `DEBUG=1` to enable dumping of intermediate formats.
## Benchmark
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.