mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
Also store Pre prefixes in token map. Introduce IntMapBuilder data structure.
Storing of prefixes uses show/read, which isn't a great solution but avoids having yet another token map.
This commit is contained in:
11
gf.cabal
11
gf.cabal
@@ -87,7 +87,9 @@ Library
|
|||||||
-- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant.
|
-- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant.
|
||||||
transformers-compat,
|
transformers-compat,
|
||||||
ghc-prim,
|
ghc-prim,
|
||||||
text
|
text,
|
||||||
|
hashable,
|
||||||
|
unordered-containers
|
||||||
hs-source-dirs: src/runtime/haskell
|
hs-source-dirs: src/runtime/haskell
|
||||||
|
|
||||||
other-modules:
|
other-modules:
|
||||||
@@ -216,6 +218,7 @@ Library
|
|||||||
GF.Data.ErrM
|
GF.Data.ErrM
|
||||||
GF.Data.Graph
|
GF.Data.Graph
|
||||||
GF.Data.Graphviz
|
GF.Data.Graphviz
|
||||||
|
GF.Data.IntMapBuilder
|
||||||
GF.Data.Relation
|
GF.Data.Relation
|
||||||
GF.Data.Str
|
GF.Data.Str
|
||||||
GF.Data.Utilities
|
GF.Data.Utilities
|
||||||
@@ -423,6 +426,7 @@ test-suite lpgf
|
|||||||
GF.Data.ErrM
|
GF.Data.ErrM
|
||||||
GF.Data.Graph
|
GF.Data.Graph
|
||||||
GF.Data.Graphviz
|
GF.Data.Graphviz
|
||||||
|
GF.Data.IntMapBuilder
|
||||||
GF.Data.Operations
|
GF.Data.Operations
|
||||||
GF.Data.Relation
|
GF.Data.Relation
|
||||||
GF.Data.Str
|
GF.Data.Str
|
||||||
@@ -529,6 +533,7 @@ test-suite lpgf
|
|||||||
directory,
|
directory,
|
||||||
filepath,
|
filepath,
|
||||||
ghc-prim,
|
ghc-prim,
|
||||||
|
hashable,
|
||||||
haskeline,
|
haskeline,
|
||||||
json,
|
json,
|
||||||
mtl,
|
mtl,
|
||||||
@@ -541,6 +546,7 @@ test-suite lpgf
|
|||||||
time,
|
time,
|
||||||
transformers-compat,
|
transformers-compat,
|
||||||
unix,
|
unix,
|
||||||
|
unordered-containers,
|
||||||
utf8-string
|
utf8-string
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
@@ -610,6 +616,7 @@ benchmark lpgf-bench
|
|||||||
GF.Data.ErrM
|
GF.Data.ErrM
|
||||||
GF.Data.Graph
|
GF.Data.Graph
|
||||||
GF.Data.Graphviz
|
GF.Data.Graphviz
|
||||||
|
GF.Data.IntMapBuilder
|
||||||
GF.Data.Operations
|
GF.Data.Operations
|
||||||
GF.Data.Relation
|
GF.Data.Relation
|
||||||
GF.Data.Str
|
GF.Data.Str
|
||||||
@@ -734,6 +741,7 @@ benchmark lpgf-bench
|
|||||||
directory,
|
directory,
|
||||||
filepath,
|
filepath,
|
||||||
ghc-prim,
|
ghc-prim,
|
||||||
|
hashable,
|
||||||
haskeline,
|
haskeline,
|
||||||
json,
|
json,
|
||||||
mtl,
|
mtl,
|
||||||
@@ -746,5 +754,6 @@ benchmark lpgf-bench
|
|||||||
time,
|
time,
|
||||||
transformers-compat,
|
transformers-compat,
|
||||||
unix,
|
unix,
|
||||||
|
unordered-containers,
|
||||||
utf8-string
|
utf8-string
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C
|
|||||||
import GF.Compile.GrammarToCanonical (grammar2canonical)
|
import GF.Compile.GrammarToCanonical (grammar2canonical)
|
||||||
|
|
||||||
import GF.Data.Operations (ErrorMonad (..))
|
import GF.Data.Operations (ErrorMonad (..))
|
||||||
|
import qualified GF.Data.IntMapBuilder as IntMapBuilder
|
||||||
import GF.Infra.Option (Options)
|
import GF.Infra.Option (Options)
|
||||||
import GF.Infra.UseIO (IOE)
|
import GF.Infra.UseIO (IOE)
|
||||||
import GF.Text.Pretty (pp, render)
|
import GF.Text.Pretty (pp, render)
|
||||||
@@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map
|
|||||||
import Data.Maybe (fromJust, isJust)
|
import Data.Maybe (fromJust, isJust)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import Data.Tuple (swap)
|
|
||||||
import System.Environment (lookupEnv)
|
import System.Environment (lookupEnv)
|
||||||
import System.FilePath ((</>), (<.>))
|
import System.FilePath ((</>), (<.>))
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
@@ -292,38 +292,36 @@ isRecordType _ = False
|
|||||||
|
|
||||||
-- | Find all token strings, put them in a map and replace with token indexes
|
-- | Find all token strings, put them in a map and replace with token indexes
|
||||||
extractStrings :: L.Concrete -> L.Concrete
|
extractStrings :: L.Concrete -> L.Concrete
|
||||||
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
|
extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' }
|
||||||
where
|
where
|
||||||
state = (
|
imb = IntMapBuilder.fromIntMap (L.toks concr)
|
||||||
L.toks concr, -- the IntMap we're building
|
(lins',imb') = CMS.runState (go0 (L.lins concr)) imb
|
||||||
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
|
toks' = IntMapBuilder.toIntMap imb'
|
||||||
)
|
|
||||||
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
|
|
||||||
|
|
||||||
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
|
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun)
|
||||||
go0 mp = do
|
go0 mp = do
|
||||||
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
|
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
|
||||||
return $ Map.fromList xs
|
return $ Map.fromList xs
|
||||||
|
|
||||||
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
|
go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun
|
||||||
go lf = case lf of
|
go lf = case lf of
|
||||||
L.Token str -> do
|
L.Token str -> do
|
||||||
(intMap,revMap) <- CMS.get
|
imb <- CMS.get
|
||||||
case Map.lookup str revMap of
|
let (ix,imb') = IntMapBuilder.insert' str imb
|
||||||
Just i -> return $ L.TokenIx i
|
CMS.put imb'
|
||||||
Nothing -> do
|
return $ L.TokenIx ix
|
||||||
-- add string to map and replace with index
|
|
||||||
let i = IntMap.size intMap
|
|
||||||
let intMap' = IntMap.insert i str intMap
|
|
||||||
let revMap' = Map.insert str i revMap
|
|
||||||
CMS.put (intMap',revMap')
|
|
||||||
return $ L.TokenIx i
|
|
||||||
|
|
||||||
-- TODO big savings can be made by putting prefixes in string map too
|
|
||||||
L.Pre pts df -> do
|
L.Pre pts df -> do
|
||||||
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
-- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
||||||
|
pts' <- forM pts $ \(pfxs,lv) -> do
|
||||||
|
imb <- CMS.get
|
||||||
|
let str = T.pack $ show pfxs
|
||||||
|
let (ix,imb') = IntMapBuilder.insert' str imb
|
||||||
|
CMS.put imb'
|
||||||
|
lv' <- go lv
|
||||||
|
return (ix,lv')
|
||||||
df' <- go df
|
df' <- go df
|
||||||
return $ L.Pre pts' df'
|
return $ L.PreIx pts' df'
|
||||||
L.Concat s t -> do
|
L.Concat s t -> do
|
||||||
s' <- go s
|
s' <- go s
|
||||||
t' <- go t
|
t' <- go t
|
||||||
|
|||||||
57
src/compiler/GF/Data/IntMapBuilder.hs
Normal file
57
src/compiler/GF/Data/IntMapBuilder.hs
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
-- | In order to build an IntMap in one pass, we need a map data structure with
|
||||||
|
-- fast lookup in both keys and values.
|
||||||
|
-- This is achieved by keeping a separate reversed map of values to keys during building.
|
||||||
|
module GF.Data.IntMapBuilder where
|
||||||
|
|
||||||
|
import Data.IntMap (IntMap)
|
||||||
|
import qualified Data.IntMap as IntMap
|
||||||
|
import Data.Hashable (Hashable)
|
||||||
|
import Data.HashMap.Strict (HashMap)
|
||||||
|
import qualified Data.HashMap.Strict as HashMap
|
||||||
|
import Data.Tuple (swap)
|
||||||
|
import Prelude hiding (lookup)
|
||||||
|
|
||||||
|
data IMB a = IMB {
|
||||||
|
intMap :: IntMap a,
|
||||||
|
valMap :: HashMap a Int
|
||||||
|
}
|
||||||
|
|
||||||
|
-- | An empty IMB
|
||||||
|
empty :: (Eq a, Hashable a) => IMB a
|
||||||
|
empty = IMB {
|
||||||
|
intMap = IntMap.empty,
|
||||||
|
valMap = HashMap.empty
|
||||||
|
}
|
||||||
|
|
||||||
|
-- | Lookup a value
|
||||||
|
lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int
|
||||||
|
lookup a IMB { valMap = vm } = HashMap.lookup a vm
|
||||||
|
|
||||||
|
-- | Insert without any lookup
|
||||||
|
insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
|
||||||
|
insert a IMB { intMap = im, valMap = vm } =
|
||||||
|
let
|
||||||
|
ix = IntMap.size im
|
||||||
|
im' = IntMap.insert ix a im
|
||||||
|
vm' = HashMap.insert a ix vm
|
||||||
|
imb' = IMB { intMap = im', valMap = vm' }
|
||||||
|
in
|
||||||
|
(ix, imb')
|
||||||
|
|
||||||
|
-- | Insert only when lookup fails
|
||||||
|
insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
|
||||||
|
insert' a imb =
|
||||||
|
case lookup a imb of
|
||||||
|
Just ix -> (ix, imb)
|
||||||
|
Nothing -> insert a imb
|
||||||
|
|
||||||
|
-- | Build IMB from existing IntMap
|
||||||
|
fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a
|
||||||
|
fromIntMap im = IMB {
|
||||||
|
intMap = im,
|
||||||
|
valMap = HashMap.fromList (map swap (IntMap.toList im))
|
||||||
|
}
|
||||||
|
|
||||||
|
-- | Get IntMap from IMB
|
||||||
|
toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a
|
||||||
|
toIntMap = intMap
|
||||||
@@ -68,12 +68,16 @@ data LinFun =
|
|||||||
-- From original definition in paper
|
-- From original definition in paper
|
||||||
| Empty
|
| Empty
|
||||||
| Token Text
|
| Token Text
|
||||||
| TokenIx Int -- ^ index into `toks` map
|
|
||||||
| Concat LinFun LinFun
|
| Concat LinFun LinFun
|
||||||
| Ix Int
|
| Ix Int
|
||||||
| Tuple [LinFun]
|
| Tuple [LinFun]
|
||||||
| Projection LinFun LinFun
|
| Projection LinFun LinFun
|
||||||
| Argument Int
|
| Argument Int
|
||||||
|
|
||||||
|
-- For reducing LPGF file when stored
|
||||||
|
| PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list)
|
||||||
|
| TokenIx Int -- ^ index into `toks` map
|
||||||
|
|
||||||
deriving (Show, Read)
|
deriving (Show, Read)
|
||||||
|
|
||||||
instance Binary LPGF where
|
instance Binary LPGF where
|
||||||
@@ -116,14 +120,17 @@ instance Binary LinFun where
|
|||||||
AllCapit -> putWord8 4
|
AllCapit -> putWord8 4
|
||||||
Pre ps d -> putWord8 5 >> put (ps,d)
|
Pre ps d -> putWord8 5 >> put (ps,d)
|
||||||
Missing f -> putWord8 13 >> put f
|
Missing f -> putWord8 13 >> put f
|
||||||
|
|
||||||
Empty -> putWord8 6
|
Empty -> putWord8 6
|
||||||
Token t -> putWord8 7 >> put t
|
Token t -> putWord8 7 >> put t
|
||||||
TokenIx i -> putWord8 14 >> put i
|
|
||||||
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
||||||
Ix i -> putWord8 9 >> put i
|
Ix i -> putWord8 9 >> put i
|
||||||
Tuple ls -> putWord8 10 >> put ls
|
Tuple ls -> putWord8 10 >> put ls
|
||||||
Projection l1 l2 -> putWord8 11 >> put (l1,l2)
|
Projection l1 l2 -> putWord8 11 >> put (l1,l2)
|
||||||
Argument i -> putWord8 12 >> put i
|
Argument i -> putWord8 12 >> put i
|
||||||
|
|
||||||
|
PreIx ps d -> putWord8 15 >> put (ps,d)
|
||||||
|
TokenIx i -> putWord8 14 >> put i
|
||||||
get = do
|
get = do
|
||||||
tag <- getWord8
|
tag <- getWord8
|
||||||
case tag of
|
case tag of
|
||||||
@@ -134,14 +141,17 @@ instance Binary LinFun where
|
|||||||
4 -> return AllCapit
|
4 -> return AllCapit
|
||||||
5 -> liftM2 Pre get get
|
5 -> liftM2 Pre get get
|
||||||
13 -> liftM Missing get
|
13 -> liftM Missing get
|
||||||
|
|
||||||
6 -> return Empty
|
6 -> return Empty
|
||||||
7 -> liftM Token get
|
7 -> liftM Token get
|
||||||
14 -> liftM TokenIx get
|
|
||||||
8 -> liftM2 Concat get get
|
8 -> liftM2 Concat get get
|
||||||
9 -> liftM Ix get
|
9 -> liftM Ix get
|
||||||
10 -> liftM Tuple get
|
10 -> liftM Tuple get
|
||||||
11 -> liftM2 Projection get get
|
11 -> liftM2 Projection get get
|
||||||
12 -> liftM Argument get
|
12 -> liftM Argument get
|
||||||
|
|
||||||
|
15 -> liftM2 PreIx get get
|
||||||
|
14 -> liftM TokenIx get
|
||||||
_ -> fail "Failed to decode LPGF binary format"
|
_ -> fail "Failed to decode LPGF binary format"
|
||||||
|
|
||||||
instance Binary Text where
|
instance Binary Text where
|
||||||
@@ -199,7 +209,7 @@ eval cxt t = case t of
|
|||||||
where
|
where
|
||||||
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
|
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
|
||||||
df' = eval cxt df
|
df' = eval cxt df
|
||||||
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
|
||||||
Concat s t -> Concat v w
|
Concat s t -> Concat v w
|
||||||
where
|
where
|
||||||
v = eval cxt s
|
v = eval cxt s
|
||||||
@@ -214,6 +224,13 @@ eval cxt t = case t of
|
|||||||
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
||||||
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
||||||
Argument i -> cxArgs cxt !! (i-1)
|
Argument i -> cxArgs cxt !! (i-1)
|
||||||
|
|
||||||
|
PreIx pts df -> Pre pts' df'
|
||||||
|
where
|
||||||
|
pts' = [(pfxs, eval cxt t) | (ix, t) <- pts, let pfxs = maybe [] (read . T.unpack) $ IntMap.lookup ix (cxToks cxt)]
|
||||||
|
df' = eval cxt df
|
||||||
|
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
||||||
|
|
||||||
_ -> t
|
_ -> t
|
||||||
|
|
||||||
flattenTuple :: LinFun -> [LinFun]
|
flattenTuple :: LinFun -> [LinFun]
|
||||||
|
|||||||
@@ -17,6 +17,8 @@ stack test gf:test:lpgf --test-arguments="unittests/Params" # specific grammar
|
|||||||
stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages
|
stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Set environment variable `DEBUG=1` to enable dumping of intermediate formats.
|
||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
|
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
|
||||||
|
|||||||
Reference in New Issue
Block a user