mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Also store Pre prefixes in token map. Introduce IntMapBuilder data structure.
Storing of prefixes uses show/read, which isn't a great solution but avoids having yet another token map.
This commit is contained in:
11
gf.cabal
11
gf.cabal
@@ -87,7 +87,9 @@ Library
|
||||
-- We need transformers-compat >= 0.6.3, but that is only in newer snapshots where it is redundant.
|
||||
transformers-compat,
|
||||
ghc-prim,
|
||||
text
|
||||
text,
|
||||
hashable,
|
||||
unordered-containers
|
||||
hs-source-dirs: src/runtime/haskell
|
||||
|
||||
other-modules:
|
||||
@@ -216,6 +218,7 @@ Library
|
||||
GF.Data.ErrM
|
||||
GF.Data.Graph
|
||||
GF.Data.Graphviz
|
||||
GF.Data.IntMapBuilder
|
||||
GF.Data.Relation
|
||||
GF.Data.Str
|
||||
GF.Data.Utilities
|
||||
@@ -423,6 +426,7 @@ test-suite lpgf
|
||||
GF.Data.ErrM
|
||||
GF.Data.Graph
|
||||
GF.Data.Graphviz
|
||||
GF.Data.IntMapBuilder
|
||||
GF.Data.Operations
|
||||
GF.Data.Relation
|
||||
GF.Data.Str
|
||||
@@ -529,6 +533,7 @@ test-suite lpgf
|
||||
directory,
|
||||
filepath,
|
||||
ghc-prim,
|
||||
hashable,
|
||||
haskeline,
|
||||
json,
|
||||
mtl,
|
||||
@@ -541,6 +546,7 @@ test-suite lpgf
|
||||
time,
|
||||
transformers-compat,
|
||||
unix,
|
||||
unordered-containers,
|
||||
utf8-string
|
||||
default-language: Haskell2010
|
||||
|
||||
@@ -610,6 +616,7 @@ benchmark lpgf-bench
|
||||
GF.Data.ErrM
|
||||
GF.Data.Graph
|
||||
GF.Data.Graphviz
|
||||
GF.Data.IntMapBuilder
|
||||
GF.Data.Operations
|
||||
GF.Data.Relation
|
||||
GF.Data.Str
|
||||
@@ -734,6 +741,7 @@ benchmark lpgf-bench
|
||||
directory,
|
||||
filepath,
|
||||
ghc-prim,
|
||||
hashable,
|
||||
haskeline,
|
||||
json,
|
||||
mtl,
|
||||
@@ -746,5 +754,6 @@ benchmark lpgf-bench
|
||||
time,
|
||||
transformers-compat,
|
||||
unix,
|
||||
unordered-containers,
|
||||
utf8-string
|
||||
default-language: Haskell2010
|
||||
|
||||
@@ -9,6 +9,7 @@ import qualified GF.Grammar.Canonical as C
|
||||
import GF.Compile.GrammarToCanonical (grammar2canonical)
|
||||
|
||||
import GF.Data.Operations (ErrorMonad (..))
|
||||
import qualified GF.Data.IntMapBuilder as IntMapBuilder
|
||||
import GF.Infra.Option (Options)
|
||||
import GF.Infra.UseIO (IOE)
|
||||
import GF.Text.Pretty (pp, render)
|
||||
@@ -24,7 +25,6 @@ import qualified Data.Map.Strict as Map
|
||||
import Data.Maybe (fromJust, isJust)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Data.Tuple (swap)
|
||||
import System.Environment (lookupEnv)
|
||||
import System.FilePath ((</>), (<.>))
|
||||
import Text.Printf (printf)
|
||||
@@ -292,38 +292,36 @@ isRecordType _ = False
|
||||
|
||||
-- | Find all token strings, put them in a map and replace with token indexes
|
||||
extractStrings :: L.Concrete -> L.Concrete
|
||||
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
|
||||
extractStrings concr = L.Concrete { L.toks = toks', L.lins = lins' }
|
||||
where
|
||||
state = (
|
||||
L.toks concr, -- the IntMap we're building
|
||||
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
|
||||
)
|
||||
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
|
||||
imb = IntMapBuilder.fromIntMap (L.toks concr)
|
||||
(lins',imb') = CMS.runState (go0 (L.lins concr)) imb
|
||||
toks' = IntMapBuilder.toIntMap imb'
|
||||
|
||||
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
|
||||
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMapBuilder.IMB Text) (Map.Map CId L.LinFun)
|
||||
go0 mp = do
|
||||
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
|
||||
return $ Map.fromList xs
|
||||
|
||||
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
|
||||
go :: L.LinFun -> CMS.State (IntMapBuilder.IMB Text) L.LinFun
|
||||
go lf = case lf of
|
||||
L.Token str -> do
|
||||
(intMap,revMap) <- CMS.get
|
||||
case Map.lookup str revMap of
|
||||
Just i -> return $ L.TokenIx i
|
||||
Nothing -> do
|
||||
-- add string to map and replace with index
|
||||
let i = IntMap.size intMap
|
||||
let intMap' = IntMap.insert i str intMap
|
||||
let revMap' = Map.insert str i revMap
|
||||
CMS.put (intMap',revMap')
|
||||
return $ L.TokenIx i
|
||||
imb <- CMS.get
|
||||
let (ix,imb') = IntMapBuilder.insert' str imb
|
||||
CMS.put imb'
|
||||
return $ L.TokenIx ix
|
||||
|
||||
-- TODO big savings can be made by putting prefixes in string map too
|
||||
L.Pre pts df -> do
|
||||
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
||||
-- pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
||||
pts' <- forM pts $ \(pfxs,lv) -> do
|
||||
imb <- CMS.get
|
||||
let str = T.pack $ show pfxs
|
||||
let (ix,imb') = IntMapBuilder.insert' str imb
|
||||
CMS.put imb'
|
||||
lv' <- go lv
|
||||
return (ix,lv')
|
||||
df' <- go df
|
||||
return $ L.Pre pts' df'
|
||||
return $ L.PreIx pts' df'
|
||||
L.Concat s t -> do
|
||||
s' <- go s
|
||||
t' <- go t
|
||||
|
||||
57
src/compiler/GF/Data/IntMapBuilder.hs
Normal file
57
src/compiler/GF/Data/IntMapBuilder.hs
Normal file
@@ -0,0 +1,57 @@
|
||||
-- | In order to build an IntMap in one pass, we need a map data structure with
|
||||
-- fast lookup in both keys and values.
|
||||
-- This is achieved by keeping a separate reversed map of values to keys during building.
|
||||
module GF.Data.IntMapBuilder where
|
||||
|
||||
import Data.IntMap (IntMap)
|
||||
import qualified Data.IntMap as IntMap
|
||||
import Data.Hashable (Hashable)
|
||||
import Data.HashMap.Strict (HashMap)
|
||||
import qualified Data.HashMap.Strict as HashMap
|
||||
import Data.Tuple (swap)
|
||||
import Prelude hiding (lookup)
|
||||
|
||||
data IMB a = IMB {
|
||||
intMap :: IntMap a,
|
||||
valMap :: HashMap a Int
|
||||
}
|
||||
|
||||
-- | An empty IMB
|
||||
empty :: (Eq a, Hashable a) => IMB a
|
||||
empty = IMB {
|
||||
intMap = IntMap.empty,
|
||||
valMap = HashMap.empty
|
||||
}
|
||||
|
||||
-- | Lookup a value
|
||||
lookup :: (Eq a, Hashable a) => a -> IMB a -> Maybe Int
|
||||
lookup a IMB { valMap = vm } = HashMap.lookup a vm
|
||||
|
||||
-- | Insert without any lookup
|
||||
insert :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
|
||||
insert a IMB { intMap = im, valMap = vm } =
|
||||
let
|
||||
ix = IntMap.size im
|
||||
im' = IntMap.insert ix a im
|
||||
vm' = HashMap.insert a ix vm
|
||||
imb' = IMB { intMap = im', valMap = vm' }
|
||||
in
|
||||
(ix, imb')
|
||||
|
||||
-- | Insert only when lookup fails
|
||||
insert' :: (Eq a, Hashable a) => a -> IMB a -> (Int, IMB a)
|
||||
insert' a imb =
|
||||
case lookup a imb of
|
||||
Just ix -> (ix, imb)
|
||||
Nothing -> insert a imb
|
||||
|
||||
-- | Build IMB from existing IntMap
|
||||
fromIntMap :: (Eq a, Hashable a) => IntMap a -> IMB a
|
||||
fromIntMap im = IMB {
|
||||
intMap = im,
|
||||
valMap = HashMap.fromList (map swap (IntMap.toList im))
|
||||
}
|
||||
|
||||
-- | Get IntMap from IMB
|
||||
toIntMap :: (Eq a, Hashable a) => IMB a -> IntMap a
|
||||
toIntMap = intMap
|
||||
@@ -68,12 +68,16 @@ data LinFun =
|
||||
-- From original definition in paper
|
||||
| Empty
|
||||
| Token Text
|
||||
| TokenIx Int -- ^ index into `toks` map
|
||||
| Concat LinFun LinFun
|
||||
| Ix Int
|
||||
| Tuple [LinFun]
|
||||
| Projection LinFun LinFun
|
||||
| Argument Int
|
||||
|
||||
-- For reducing LPGF file when stored
|
||||
| PreIx [(Int, LinFun)] LinFun -- ^ index into `toks` map (must apply read to convert to list)
|
||||
| TokenIx Int -- ^ index into `toks` map
|
||||
|
||||
deriving (Show, Read)
|
||||
|
||||
instance Binary LPGF where
|
||||
@@ -116,14 +120,17 @@ instance Binary LinFun where
|
||||
AllCapit -> putWord8 4
|
||||
Pre ps d -> putWord8 5 >> put (ps,d)
|
||||
Missing f -> putWord8 13 >> put f
|
||||
|
||||
Empty -> putWord8 6
|
||||
Token t -> putWord8 7 >> put t
|
||||
TokenIx i -> putWord8 14 >> put i
|
||||
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
||||
Ix i -> putWord8 9 >> put i
|
||||
Tuple ls -> putWord8 10 >> put ls
|
||||
Projection l1 l2 -> putWord8 11 >> put (l1,l2)
|
||||
Argument i -> putWord8 12 >> put i
|
||||
|
||||
PreIx ps d -> putWord8 15 >> put (ps,d)
|
||||
TokenIx i -> putWord8 14 >> put i
|
||||
get = do
|
||||
tag <- getWord8
|
||||
case tag of
|
||||
@@ -134,14 +141,17 @@ instance Binary LinFun where
|
||||
4 -> return AllCapit
|
||||
5 -> liftM2 Pre get get
|
||||
13 -> liftM Missing get
|
||||
|
||||
6 -> return Empty
|
||||
7 -> liftM Token get
|
||||
14 -> liftM TokenIx get
|
||||
8 -> liftM2 Concat get get
|
||||
9 -> liftM Ix get
|
||||
10 -> liftM Tuple get
|
||||
11 -> liftM2 Projection get get
|
||||
12 -> liftM Argument get
|
||||
|
||||
15 -> liftM2 PreIx get get
|
||||
14 -> liftM TokenIx get
|
||||
_ -> fail "Failed to decode LPGF binary format"
|
||||
|
||||
instance Binary Text where
|
||||
@@ -199,7 +209,7 @@ eval cxt t = case t of
|
||||
where
|
||||
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
|
||||
df' = eval cxt df
|
||||
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
||||
|
||||
Concat s t -> Concat v w
|
||||
where
|
||||
v = eval cxt s
|
||||
@@ -214,6 +224,13 @@ eval cxt t = case t of
|
||||
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
||||
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
||||
Argument i -> cxArgs cxt !! (i-1)
|
||||
|
||||
PreIx pts df -> Pre pts' df'
|
||||
where
|
||||
pts' = [(pfxs, eval cxt t) | (ix, t) <- pts, let pfxs = maybe [] (read . T.unpack) $ IntMap.lookup ix (cxToks cxt)]
|
||||
df' = eval cxt df
|
||||
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
||||
|
||||
_ -> t
|
||||
|
||||
flattenTuple :: LinFun -> [LinFun]
|
||||
|
||||
@@ -17,6 +17,8 @@ stack test gf:test:lpgf --test-arguments="unittests/Params" # specific grammar
|
||||
stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific grammar and languages
|
||||
```
|
||||
|
||||
Set environment variable `DEBUG=1` to enable dumping of intermediate formats.
|
||||
|
||||
## Benchmark
|
||||
|
||||
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
|
||||
|
||||
Reference in New Issue
Block a user