Also store Pre prefixes in token map. Introduce IntMapBuilder data structure.

Storing of prefixes uses show/read, which isn't a great solution but avoids having yet another token map.
This commit is contained in:
John J. Camilleri
2021-03-04 09:58:17 +01:00
parent 4082c006c3
commit 30b016032d
5 changed files with 110 additions and 27 deletions

View File

@@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C
import GF.Compile.GrammarToCanonical (grammar2canonical)
import GF.Data.Operations (ErrorMonad (..))
import qualified GF.Data.IntMapBuilder as IntMapBuilder
import GF.Infra.Option (Options)
import GF.Infra.UseIO (IOE)
import GF.Text.Pretty (pp, render)
@@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map
import Data.Maybe (fromJust, isJust)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Tuple (swap)
import System.Environment (lookupEnv)
import System.FilePath ((</>), (<.>))
import Text.Printf (printf)
@@ -292,38 +292,36 @@ isRecordType _ = False
-- | Find all token strings, put them in a map and replace with token indexes
extractStrings :: L.Concrete -> L.Concrete
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' }
where
state = (
L.toks concr, -- the IntMap we're building
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
)
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
imb = IntMapBuilder.fromIntMap (L.toks concr)
(lins',imb') = CMS.runState (go0 (L.lins concr)) imb
toks' = IntMapBuilder.toIntMap imb'
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun)
go0 mp = do
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
return $ Map.fromList xs
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun
go lf = case lf of
L.Token str -> do
(intMap,revMap) <- CMS.get
case Map.lookup str revMap of
Just i -> return $ L.TokenIx i
Nothing -> do
-- add string to map and replace with index
let i = IntMap.size intMap
let intMap' = IntMap.insert i str intMap
let revMap' = Map.insert str i revMap
CMS.put (intMap',revMap')
return $ L.TokenIx i
imb <- CMS.get
let (ix,imb') = IntMapBuilder.insert' str imb
CMS.put imb'
return $ L.TokenIx ix
-- TODO big savings can be made by putting prefixes in string map too
L.Pre pts df -> do
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
-- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
pts' <- forM pts $ \(pfxs,lv) -> do
imb <- CMS.get
let str = T.pack $ show pfxs
let (ix,imb') = IntMapBuilder.insert' str imb
CMS.put imb'
lv' <- go lv
return (ix,lv')
df' <- go df
return $ L.Pre pts' df'
return $ L.PreIx pts' df'
L.Concat s t -> do
s' <- go s
t' <- go t

View File

@@ -0,0 +1,57 @@
-- | In order to build an IntMap in one pass, we need a map data structure with
-- fast lookup in both keys and values.
-- This is achieved by keeping a separate reversed map of values to keys during building.
module GF.Data.IntMapBuilder where
import Data.IntMap (IntMap)
import qualified Data.IntMap as IntMap
import Data.Hashable (Hashable)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Tuple (swap)
import Prelude hiding (lookup)
data IMB a = IMB {
intMap :: IntMap a,
valMap :: HashMap a Int
}
-- | An empty IMB
empty :: (Eq a, Hashable a) => IMB a
empty = IMB {
intMap = IntMap.empty,
valMap = HashMap.empty
}
-- | Lookup a value
lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int
lookup a IMB { valMap = vm } = HashMap.lookup a vm
-- | Insert without any lookup
insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert a IMB { intMap = im, valMap = vm } =
let
ix = IntMap.size im
im' = IntMap.insert ix a im
vm' = HashMap.insert a ix vm
imb' = IMB { intMap = im', valMap = vm' }
in
(ix, imb')
-- | Insert only when lookup fails
insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
insert' a imb =
case lookup a imb of
Just ix -> (ix, imb)
Nothing -> insert a imb
-- | Build IMB from existing IntMap
fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a
fromIntMap im = IMB {
intMap = im,
valMap = HashMap.fromList (map swap (IntMap.toList im))
}
-- | Get IntMap from IMB
toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a
toIntMap = intMap