forked from GitHub/gf-core
Extract token strings and put them in map which linfuns refer to by index, to reduce LPGF sizes.
This commit is contained in:
@@ -15,12 +15,16 @@ import GF.Text.Pretty (pp, render)
|
|||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (when, unless, forM, forM_)
|
import Control.Monad (when, unless, forM, forM_)
|
||||||
|
import qualified Control.Monad.State as CMS
|
||||||
import Data.Either (lefts, rights)
|
import Data.Either (lefts, rights)
|
||||||
|
import qualified Data.IntMap as IntMap
|
||||||
import Data.List (elemIndex)
|
import Data.List (elemIndex)
|
||||||
import qualified Data.List as L
|
import qualified Data.List as L
|
||||||
import qualified Data.Map as Map
|
import qualified Data.Map.Strict as Map
|
||||||
import Data.Maybe (fromJust, isJust)
|
import Data.Maybe (fromJust, isJust)
|
||||||
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
|
import Data.Tuple (swap)
|
||||||
import System.Environment (lookupEnv)
|
import System.Environment (lookupEnv)
|
||||||
import System.FilePath ((</>), (<.>))
|
import System.FilePath ((</>), (<.>))
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
@@ -234,9 +238,11 @@ mkCanon2lpgf opts gr am = do
|
|||||||
|
|
||||||
unless (null $ lefts es) (raise $ unlines (lefts es))
|
unless (null $ lefts es) (raise $ unlines (lefts es))
|
||||||
|
|
||||||
return (mdi2i modId, L.Concrete {
|
let concr = extractStrings $ L.Concrete {
|
||||||
|
L.toks = IntMap.empty,
|
||||||
L.lins = lins
|
L.lins = lins
|
||||||
})
|
}
|
||||||
|
return (mdi2i modId, concr)
|
||||||
|
|
||||||
-- | Remove ParamAliasDefs by inlining their definitions
|
-- | Remove ParamAliasDefs by inlining their definitions
|
||||||
inlineParamAliases :: [C.ParamDef] -> [C.ParamDef]
|
inlineParamAliases :: [C.ParamDef] -> [C.ParamDef]
|
||||||
@@ -284,22 +290,52 @@ isRecordType :: C.LinType -> Bool
|
|||||||
isRecordType (C.RecordType _) = True
|
isRecordType (C.RecordType _) = True
|
||||||
isRecordType _ = False
|
isRecordType _ = False
|
||||||
|
|
||||||
-- | Minimise a linfun by evaluating projections where possible
|
-- | Find all token strings, put them in a map and replace with token indexes
|
||||||
-- This code closely matches the runtime's `eval` function, except we have no context
|
extractStrings :: L.Concrete -> L.Concrete
|
||||||
reduce :: L.LinFun -> L.LinFun
|
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
|
||||||
reduce lf = case lf of
|
where
|
||||||
L.Pre pts df -> L.Pre pts' df'
|
state = (
|
||||||
where
|
L.toks concr, -- the IntMap we're building
|
||||||
pts' = [ (strs,reduce t) | (strs,t) <- pts]
|
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
|
||||||
df' = reduce df
|
)
|
||||||
L.Concat s t -> L.Concat (reduce s) (reduce t)
|
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
|
||||||
L.Tuple ts -> L.Tuple (map reduce ts)
|
|
||||||
L.Projection t u ->
|
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
|
||||||
case (reduce t, reduce u) of
|
go0 mp = do
|
||||||
(L.Tuple vs, L.Ix i) -> reduce $ vs !! (i-1)
|
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
|
||||||
(tp@(L.Tuple _), L.Tuple is) | all L.isIx is -> foldl (\(L.Tuple vs) (L.Ix i) -> vs !! (i-1)) tp is
|
return $ Map.fromList xs
|
||||||
(t',u') -> L.Projection t' u'
|
|
||||||
t -> t
|
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
|
||||||
|
go lf = case lf of
|
||||||
|
L.Token str -> do
|
||||||
|
(intMap,revMap) <- CMS.get
|
||||||
|
case Map.lookup str revMap of
|
||||||
|
Just i -> return $ L.TokenIx i
|
||||||
|
Nothing -> do
|
||||||
|
-- add string to map and replace with index
|
||||||
|
let i = IntMap.size intMap
|
||||||
|
let intMap' = IntMap.insert i str intMap
|
||||||
|
let revMap' = Map.insert str i revMap
|
||||||
|
CMS.put (intMap',revMap')
|
||||||
|
return $ L.TokenIx i
|
||||||
|
|
||||||
|
-- TODO big savings can be made by putting prefixes in string map too
|
||||||
|
L.Pre pts df -> do
|
||||||
|
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
||||||
|
df' <- go df
|
||||||
|
return $ L.Pre pts' df'
|
||||||
|
L.Concat s t -> do
|
||||||
|
s' <- go s
|
||||||
|
t' <- go t
|
||||||
|
return $ L.Concat s' t'
|
||||||
|
L.Tuple ts -> do
|
||||||
|
ts' <- mapM go ts
|
||||||
|
return $ L.Tuple ts'
|
||||||
|
L.Projection t u -> do
|
||||||
|
t' <- go t
|
||||||
|
u' <- go u
|
||||||
|
return $ L.Projection t' u'
|
||||||
|
t -> return t
|
||||||
|
|
||||||
-- | Convert Maybe to Either value with error
|
-- | Convert Maybe to Either value with error
|
||||||
m2e :: String -> Maybe a -> Either String a
|
m2e :: String -> Maybe a -> Either String a
|
||||||
@@ -352,7 +388,7 @@ dumpCanonical path (C.Grammar ab cncs) = do
|
|||||||
ppLPGF :: FilePath -> LPGF -> IO ()
|
ppLPGF :: FilePath -> LPGF -> IO ()
|
||||||
ppLPGF path lpgf =
|
ppLPGF path lpgf =
|
||||||
forM_ (Map.toList $ L.concretes lpgf) $ \(cid,concr) ->
|
forM_ (Map.toList $ L.concretes lpgf) $ \(cid,concr) ->
|
||||||
writeFile' (path </> showCId cid <.> "lpgf.txt") (T.unpack $ L.render $ L.pp concr)
|
writeFile' (path </> showCId cid <.> "lpgf.txt") (L.render $ L.pp concr)
|
||||||
|
|
||||||
-- | Dump LPGF to file
|
-- | Dump LPGF to file
|
||||||
dumpLPGF :: FilePath -> LPGF -> IO ()
|
dumpLPGF :: FilePath -> LPGF -> IO ()
|
||||||
|
|||||||
@@ -15,7 +15,8 @@ import PGF.Tree (Tree (..), expr2tree, prTree)
|
|||||||
import Control.Monad (liftM, liftM2, forM_)
|
import Control.Monad (liftM, liftM2, forM_)
|
||||||
import qualified Control.Monad.Writer as CMW
|
import qualified Control.Monad.Writer as CMW
|
||||||
import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile)
|
import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile)
|
||||||
import qualified Data.Map as Map
|
import qualified Data.IntMap as IntMap
|
||||||
|
import qualified Data.Map.Strict as Map
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import qualified Data.Text.Encoding as TE
|
import qualified Data.Text.Encoding as TE
|
||||||
@@ -36,7 +37,8 @@ data Abstract = Abstract {
|
|||||||
} deriving (Show)
|
} deriving (Show)
|
||||||
|
|
||||||
-- | Concrete syntax
|
-- | Concrete syntax
|
||||||
newtype Concrete = Concrete {
|
data Concrete = Concrete {
|
||||||
|
toks :: IntMap.IntMap Text, -- ^ all strings are stored exactly once here
|
||||||
-- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category
|
-- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category
|
||||||
lins :: Map.Map CId LinFun -- ^ a linearization function for each function
|
lins :: Map.Map CId LinFun -- ^ a linearization function for each function
|
||||||
} deriving (Show)
|
} deriving (Show)
|
||||||
@@ -66,6 +68,7 @@ data LinFun =
|
|||||||
-- From original definition in paper
|
-- From original definition in paper
|
||||||
| Empty
|
| Empty
|
||||||
| Token Text
|
| Token Text
|
||||||
|
| TokenIx Int -- ^ index into `toks` map
|
||||||
| Concat LinFun LinFun
|
| Concat LinFun LinFun
|
||||||
| Ix Int
|
| Ix Int
|
||||||
| Tuple [LinFun]
|
| Tuple [LinFun]
|
||||||
@@ -93,10 +96,14 @@ instance Binary Abstract where
|
|||||||
get = return $ Abstract {}
|
get = return $ Abstract {}
|
||||||
|
|
||||||
instance Binary Concrete where
|
instance Binary Concrete where
|
||||||
put concr = put (lins concr)
|
put concr = do
|
||||||
|
put (toks concr)
|
||||||
|
put (lins concr)
|
||||||
get = do
|
get = do
|
||||||
|
ts <- get
|
||||||
ls <- get
|
ls <- get
|
||||||
return $ Concrete {
|
return $ Concrete {
|
||||||
|
toks = ts,
|
||||||
lins = ls
|
lins = ls
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,10 +114,11 @@ instance Binary LinFun where
|
|||||||
Space -> putWord8 2
|
Space -> putWord8 2
|
||||||
Capit -> putWord8 3
|
Capit -> putWord8 3
|
||||||
AllCapit -> putWord8 4
|
AllCapit -> putWord8 4
|
||||||
Pre ps d -> putWord8 5 >> put ([(map TE.encodeUtf8 p,l) | (p,l) <- ps],d)
|
Pre ps d -> putWord8 5 >> put (ps,d)
|
||||||
Missing f -> putWord8 13 >> put f
|
Missing f -> putWord8 13 >> put f
|
||||||
Empty -> putWord8 6
|
Empty -> putWord8 6
|
||||||
Token t -> putWord8 7 >> put (TE.encodeUtf8 t)
|
Token t -> putWord8 7 >> put t
|
||||||
|
TokenIx i -> putWord8 14 >> put i
|
||||||
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
||||||
Ix i -> putWord8 9 >> put i
|
Ix i -> putWord8 9 >> put i
|
||||||
Tuple ls -> putWord8 10 >> put ls
|
Tuple ls -> putWord8 10 >> put ls
|
||||||
@@ -124,16 +132,21 @@ instance Binary LinFun where
|
|||||||
2 -> return Space
|
2 -> return Space
|
||||||
3 -> return Capit
|
3 -> return Capit
|
||||||
4 -> return AllCapit
|
4 -> return AllCapit
|
||||||
5 -> liftM2 (\ps -> Pre [(map TE.decodeUtf8 p,l) | (p,l) <- ps]) get get
|
5 -> liftM2 Pre get get
|
||||||
13 -> liftM Missing get
|
13 -> liftM Missing get
|
||||||
6 -> return Empty
|
6 -> return Empty
|
||||||
7 -> liftM (Token . TE.decodeUtf8) get
|
7 -> liftM Token get
|
||||||
|
14 -> liftM TokenIx get
|
||||||
8 -> liftM2 Concat get get
|
8 -> liftM2 Concat get get
|
||||||
9 -> liftM Ix get
|
9 -> liftM Ix get
|
||||||
10 -> liftM Tuple get
|
10 -> liftM Tuple get
|
||||||
11 -> liftM2 Projection get get
|
11 -> liftM2 Projection get get
|
||||||
12 -> liftM Argument get
|
12 -> liftM Argument get
|
||||||
_ -> fail "Failed to decode LPGF binary format"
|
_ -> fail "Failed to decode LPGF binary format"
|
||||||
|
|
||||||
|
instance Binary Text where
|
||||||
|
put = put . TE.encodeUtf8
|
||||||
|
get = liftM TE.decodeUtf8 get
|
||||||
|
|
||||||
abstractName :: LPGF -> CId
|
abstractName :: LPGF -> CId
|
||||||
abstractName = absname
|
abstractName = absname
|
||||||
@@ -167,13 +180,16 @@ linearizeConcreteText concr expr = lin2string $ lin (expr2tree expr)
|
|||||||
lin tree = case tree of
|
lin tree = case tree of
|
||||||
Fun f as ->
|
Fun f as ->
|
||||||
case Map.lookup f (lins concr) of
|
case Map.lookup f (lins concr) of
|
||||||
Just t -> eval (map lin as) t
|
Just t -> eval cxt t
|
||||||
-- _ -> error $ printf "Lookup failed for function: %s" (showCId f)
|
where cxt = Context { cxToks = toks concr, cxArgs = map lin as }
|
||||||
_ -> Missing f
|
_ -> Missing f
|
||||||
x -> error $ printf "Cannot lin: %s" (prTree x)
|
x -> error $ printf "Cannot lin: %s" (prTree x)
|
||||||
|
|
||||||
-- | Evaluation context is a sequence of terms
|
-- | Evaluation context
|
||||||
type Context = [LinFun]
|
data Context = Context {
|
||||||
|
cxArgs :: [LinFun], -- ^ is a sequence of terms
|
||||||
|
cxToks :: IntMap.IntMap Text -- ^ token map
|
||||||
|
}
|
||||||
|
|
||||||
-- | Operational semantics
|
-- | Operational semantics
|
||||||
eval :: Context -> LinFun -> LinFun
|
eval :: Context -> LinFun -> LinFun
|
||||||
@@ -181,8 +197,9 @@ eval cxt t = case t of
|
|||||||
Error err -> error err
|
Error err -> error err
|
||||||
Pre pts df -> Pre pts' df'
|
Pre pts df -> Pre pts' df'
|
||||||
where
|
where
|
||||||
pts' = [ (strs, eval cxt t) | (strs,t) <- pts]
|
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
|
||||||
df' = eval cxt df
|
df' = eval cxt df
|
||||||
|
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
||||||
Concat s t -> Concat v w
|
Concat s t -> Concat v w
|
||||||
where
|
where
|
||||||
v = eval cxt s
|
v = eval cxt s
|
||||||
@@ -196,7 +213,7 @@ eval cxt t = case t of
|
|||||||
(Tuple vs, Ix i) -> vs !! (i-1)
|
(Tuple vs, Ix i) -> vs !! (i-1)
|
||||||
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
||||||
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
||||||
Argument i -> cxt !! (i-1)
|
Argument i -> cxArgs cxt !! (i-1)
|
||||||
_ -> t
|
_ -> t
|
||||||
|
|
||||||
flattenTuple :: LinFun -> [LinFun]
|
flattenTuple :: LinFun -> [LinFun]
|
||||||
@@ -252,10 +269,10 @@ unConcat lf = [lf]
|
|||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
-- Pretty-printing
|
-- Pretty-printing
|
||||||
|
|
||||||
type Doc = CMW.Writer [Text] ()
|
type Doc = CMW.Writer [String] ()
|
||||||
|
|
||||||
render :: Doc -> Text
|
render :: Doc -> String
|
||||||
render = T.unlines . CMW.execWriter
|
render = unlines . CMW.execWriter
|
||||||
|
|
||||||
class PP a where
|
class PP a where
|
||||||
pp :: a -> Doc
|
pp :: a -> Doc
|
||||||
@@ -264,9 +281,12 @@ instance PP LPGF where
|
|||||||
pp (LPGF _ _ cncs) = mapM_ pp cncs
|
pp (LPGF _ _ cncs) = mapM_ pp cncs
|
||||||
|
|
||||||
instance PP Concrete where
|
instance PP Concrete where
|
||||||
pp (Concrete lins) =
|
pp (Concrete toks lins) = do
|
||||||
|
forM_ (IntMap.toList toks) $ \(i,tok) ->
|
||||||
|
CMW.tell [show i ++ " " ++ T.unpack tok]
|
||||||
|
CMW.tell [""]
|
||||||
forM_ (Map.toList lins) $ \(cid,lin) -> do
|
forM_ (Map.toList lins) $ \(cid,lin) -> do
|
||||||
CMW.tell [T.pack ("# " ++ showCId cid)]
|
CMW.tell ["# " ++ showCId cid]
|
||||||
pp lin
|
pp lin
|
||||||
CMW.tell [""]
|
CMW.tell [""]
|
||||||
|
|
||||||
@@ -276,7 +296,7 @@ instance PP LinFun where
|
|||||||
pp' n = \case
|
pp' n = \case
|
||||||
Pre ps d -> do
|
Pre ps d -> do
|
||||||
p "Pre"
|
p "Pre"
|
||||||
CMW.tell [ T.replicate (n+1) " " `T.append` T.pack (show p) | p <- ps ]
|
CMW.tell [ replicate (2*(n+1)) ' ' ++ show p | p <- ps ]
|
||||||
pp' (n+1) d
|
pp' (n+1) d
|
||||||
|
|
||||||
c@(Concat l1 l2) -> do
|
c@(Concat l1 l2) -> do
|
||||||
@@ -286,7 +306,7 @@ instance PP LinFun where
|
|||||||
p "Concat"
|
p "Concat"
|
||||||
mapM_ (pp' (n+1)) ts
|
mapM_ (pp' (n+1)) ts
|
||||||
else
|
else
|
||||||
ps $ "Concat " ++ show ts
|
p $ "Concat " ++ show ts
|
||||||
Tuple ls | any isDeep ls -> do
|
Tuple ls | any isDeep ls -> do
|
||||||
p "Tuple"
|
p "Tuple"
|
||||||
mapM_ (pp' (n+1)) ls
|
mapM_ (pp' (n+1)) ls
|
||||||
@@ -294,12 +314,10 @@ instance PP LinFun where
|
|||||||
p "Projection"
|
p "Projection"
|
||||||
pp' (n+1) l1
|
pp' (n+1) l1
|
||||||
pp' (n+1) l2
|
pp' (n+1) l2
|
||||||
t -> ps $ show t
|
t -> p $ show t
|
||||||
where
|
where
|
||||||
p :: Text -> Doc
|
p :: String -> Doc
|
||||||
p t = CMW.tell [ T.replicate n " " `T.append` t ]
|
p t = CMW.tell [ replicate (2*n) ' ' ++ t ]
|
||||||
ps :: String -> Doc
|
|
||||||
ps = p . T.pack
|
|
||||||
|
|
||||||
isDeep = not . isTerm
|
isDeep = not . isTerm
|
||||||
isTerm = \case
|
isTerm = \case
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific gramma
|
|||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
|
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
|
||||||
|
|
||||||
### Compilation
|
### Compilation
|
||||||
|
|
||||||
Comparing PGF, LPGF along following criteria:
|
Comparing PGF, LPGF along following criteria:
|
||||||
|
|||||||
Reference in New Issue
Block a user