forked from GitHub/gf-core
Extract token strings and put them in map which linfuns refer to by index, to reduce LPGF sizes.
This commit is contained in:
@@ -15,12 +15,16 @@ import GF.Text.Pretty (pp, render)
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (when, unless, forM, forM_)
|
||||
import qualified Control.Monad.State as CMS
|
||||
import Data.Either (lefts, rights)
|
||||
import qualified Data.IntMap as IntMap
|
||||
import Data.List (elemIndex)
|
||||
import qualified Data.List as L
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.Map.Strict as Map
|
||||
import Data.Maybe (fromJust, isJust)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Data.Tuple (swap)
|
||||
import System.Environment (lookupEnv)
|
||||
import System.FilePath ((</>), (<.>))
|
||||
import Text.Printf (printf)
|
||||
@@ -234,9 +238,11 @@ mkCanon2lpgf opts gr am = do
|
||||
|
||||
unless (null $ lefts es) (raise $ unlines (lefts es))
|
||||
|
||||
return (mdi2i modId, L.Concrete {
|
||||
let concr = extractStrings $ L.Concrete {
|
||||
L.toks = IntMap.empty,
|
||||
L.lins = lins
|
||||
})
|
||||
}
|
||||
return (mdi2i modId, concr)
|
||||
|
||||
-- | Remove ParamAliasDefs by inlining their definitions
|
||||
inlineParamAliases :: [C.ParamDef] -> [C.ParamDef]
|
||||
@@ -284,22 +290,52 @@ isRecordType :: C.LinType -> Bool
|
||||
isRecordType (C.RecordType _) = True
|
||||
isRecordType _ = False
|
||||
|
||||
-- | Minimise a linfun by evaluating projections where possible
|
||||
-- This code closely matches the runtime's `eval` function, except we have no context
|
||||
reduce :: L.LinFun -> L.LinFun
|
||||
reduce lf = case lf of
|
||||
L.Pre pts df -> L.Pre pts' df'
|
||||
where
|
||||
pts' = [ (strs,reduce t) | (strs,t) <- pts]
|
||||
df' = reduce df
|
||||
L.Concat s t -> L.Concat (reduce s) (reduce t)
|
||||
L.Tuple ts -> L.Tuple (map reduce ts)
|
||||
L.Projection t u ->
|
||||
case (reduce t, reduce u) of
|
||||
(L.Tuple vs, L.Ix i) -> reduce $ vs !! (i-1)
|
||||
(tp@(L.Tuple _), L.Tuple is) | all L.isIx is -> foldl (\(L.Tuple vs) (L.Ix i) -> vs !! (i-1)) tp is
|
||||
(t',u') -> L.Projection t' u'
|
||||
t -> t
|
||||
-- | Find all token strings, put them in a map and replace with token indexes
|
||||
extractStrings :: L.Concrete -> L.Concrete
|
||||
extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
|
||||
where
|
||||
state = (
|
||||
L.toks concr, -- the IntMap we're building
|
||||
Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
|
||||
)
|
||||
(lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
|
||||
|
||||
go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
|
||||
go0 mp = do
|
||||
xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
|
||||
return $ Map.fromList xs
|
||||
|
||||
go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
|
||||
go lf = case lf of
|
||||
L.Token str -> do
|
||||
(intMap,revMap) <- CMS.get
|
||||
case Map.lookup str revMap of
|
||||
Just i -> return $ L.TokenIx i
|
||||
Nothing -> do
|
||||
-- add string to map and replace with index
|
||||
let i = IntMap.size intMap
|
||||
let intMap' = IntMap.insert i str intMap
|
||||
let revMap' = Map.insert str i revMap
|
||||
CMS.put (intMap',revMap')
|
||||
return $ L.TokenIx i
|
||||
|
||||
-- TODO big savings can be made by putting prefixes in string map too
|
||||
L.Pre pts df -> do
|
||||
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
|
||||
df' <- go df
|
||||
return $ L.Pre pts' df'
|
||||
L.Concat s t -> do
|
||||
s' <- go s
|
||||
t' <- go t
|
||||
return $ L.Concat s' t'
|
||||
L.Tuple ts -> do
|
||||
ts' <- mapM go ts
|
||||
return $ L.Tuple ts'
|
||||
L.Projection t u -> do
|
||||
t' <- go t
|
||||
u' <- go u
|
||||
return $ L.Projection t' u'
|
||||
t -> return t
|
||||
|
||||
-- | Convert Maybe to Either value with error
|
||||
m2e :: String -> Maybe a -> Either String a
|
||||
@@ -352,7 +388,7 @@ dumpCanonical path (C.Grammar ab cncs) = do
|
||||
ppLPGF :: FilePath -> LPGF -> IO ()
|
||||
ppLPGF path lpgf =
|
||||
forM_ (Map.toList $ L.concretes lpgf) $ \(cid,concr) ->
|
||||
writeFile' (path </> showCId cid <.> "lpgf.txt") (T.unpack $ L.render $ L.pp concr)
|
||||
writeFile' (path </> showCId cid <.> "lpgf.txt") (L.render $ L.pp concr)
|
||||
|
||||
-- | Dump LPGF to file
|
||||
dumpLPGF :: FilePath -> LPGF -> IO ()
|
||||
|
||||
@@ -15,7 +15,8 @@ import PGF.Tree (Tree (..), expr2tree, prTree)
|
||||
import Control.Monad (liftM, liftM2, forM_)
|
||||
import qualified Control.Monad.Writer as CMW
|
||||
import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile)
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.IntMap as IntMap
|
||||
import qualified Data.Map.Strict as Map
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.Encoding as TE
|
||||
@@ -36,7 +37,8 @@ data Abstract = Abstract {
|
||||
} deriving (Show)
|
||||
|
||||
-- | Concrete syntax
|
||||
newtype Concrete = Concrete {
|
||||
data Concrete = Concrete {
|
||||
toks :: IntMap.IntMap Text, -- ^ all strings are stored exactly once here
|
||||
-- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category
|
||||
lins :: Map.Map CId LinFun -- ^ a linearization function for each function
|
||||
} deriving (Show)
|
||||
@@ -66,6 +68,7 @@ data LinFun =
|
||||
-- From original definition in paper
|
||||
| Empty
|
||||
| Token Text
|
||||
| TokenIx Int -- ^ index into `toks` map
|
||||
| Concat LinFun LinFun
|
||||
| Ix Int
|
||||
| Tuple [LinFun]
|
||||
@@ -93,10 +96,14 @@ instance Binary Abstract where
|
||||
get = return $ Abstract {}
|
||||
|
||||
instance Binary Concrete where
|
||||
put concr = put (lins concr)
|
||||
put concr = do
|
||||
put (toks concr)
|
||||
put (lins concr)
|
||||
get = do
|
||||
ts <- get
|
||||
ls <- get
|
||||
return $ Concrete {
|
||||
toks = ts,
|
||||
lins = ls
|
||||
}
|
||||
|
||||
@@ -107,10 +114,11 @@ instance Binary LinFun where
|
||||
Space -> putWord8 2
|
||||
Capit -> putWord8 3
|
||||
AllCapit -> putWord8 4
|
||||
Pre ps d -> putWord8 5 >> put ([(map TE.encodeUtf8 p,l) | (p,l) <- ps],d)
|
||||
Pre ps d -> putWord8 5 >> put (ps,d)
|
||||
Missing f -> putWord8 13 >> put f
|
||||
Empty -> putWord8 6
|
||||
Token t -> putWord8 7 >> put (TE.encodeUtf8 t)
|
||||
Token t -> putWord8 7 >> put t
|
||||
TokenIx i -> putWord8 14 >> put i
|
||||
Concat l1 l2 -> putWord8 8 >> put (l1,l2)
|
||||
Ix i -> putWord8 9 >> put i
|
||||
Tuple ls -> putWord8 10 >> put ls
|
||||
@@ -124,16 +132,21 @@ instance Binary LinFun where
|
||||
2 -> return Space
|
||||
3 -> return Capit
|
||||
4 -> return AllCapit
|
||||
5 -> liftM2 (\ps -> Pre [(map TE.decodeUtf8 p,l) | (p,l) <- ps]) get get
|
||||
5 -> liftM2 Pre get get
|
||||
13 -> liftM Missing get
|
||||
6 -> return Empty
|
||||
7 -> liftM (Token . TE.decodeUtf8) get
|
||||
7 -> liftM Token get
|
||||
14 -> liftM TokenIx get
|
||||
8 -> liftM2 Concat get get
|
||||
9 -> liftM Ix get
|
||||
10 -> liftM Tuple get
|
||||
11 -> liftM2 Projection get get
|
||||
12 -> liftM Argument get
|
||||
_ -> fail "Failed to decode LPGF binary format"
|
||||
_ -> fail "Failed to decode LPGF binary format"
|
||||
|
||||
instance Binary Text where
|
||||
put = put . TE.encodeUtf8
|
||||
get = liftM TE.decodeUtf8 get
|
||||
|
||||
abstractName :: LPGF -> CId
|
||||
abstractName = absname
|
||||
@@ -167,13 +180,16 @@ linearizeConcreteText concr expr = lin2string $ lin (expr2tree expr)
|
||||
lin tree = case tree of
|
||||
Fun f as ->
|
||||
case Map.lookup f (lins concr) of
|
||||
Just t -> eval (map lin as) t
|
||||
-- _ -> error $ printf "Lookup failed for function: %s" (showCId f)
|
||||
Just t -> eval cxt t
|
||||
where cxt = Context { cxToks = toks concr, cxArgs = map lin as }
|
||||
_ -> Missing f
|
||||
x -> error $ printf "Cannot lin: %s" (prTree x)
|
||||
|
||||
-- | Evaluation context is a sequence of terms
|
||||
type Context = [LinFun]
|
||||
-- | Evaluation context
|
||||
data Context = Context {
|
||||
cxArgs :: [LinFun], -- ^ is a sequence of terms
|
||||
cxToks :: IntMap.IntMap Text -- ^ token map
|
||||
}
|
||||
|
||||
-- | Operational semantics
|
||||
eval :: Context -> LinFun -> LinFun
|
||||
@@ -181,8 +197,9 @@ eval cxt t = case t of
|
||||
Error err -> error err
|
||||
Pre pts df -> Pre pts' df'
|
||||
where
|
||||
pts' = [ (strs, eval cxt t) | (strs,t) <- pts]
|
||||
pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
|
||||
df' = eval cxt df
|
||||
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
|
||||
Concat s t -> Concat v w
|
||||
where
|
||||
v = eval cxt s
|
||||
@@ -196,7 +213,7 @@ eval cxt t = case t of
|
||||
(Tuple vs, Ix i) -> vs !! (i-1)
|
||||
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
|
||||
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
|
||||
Argument i -> cxt !! (i-1)
|
||||
Argument i -> cxArgs cxt !! (i-1)
|
||||
_ -> t
|
||||
|
||||
flattenTuple :: LinFun -> [LinFun]
|
||||
@@ -252,10 +269,10 @@ unConcat lf = [lf]
|
||||
------------------------------------------------------------------------------
|
||||
-- Pretty-printing
|
||||
|
||||
type Doc = CMW.Writer [Text] ()
|
||||
type Doc = CMW.Writer [String] ()
|
||||
|
||||
render :: Doc -> Text
|
||||
render = T.unlines . CMW.execWriter
|
||||
render :: Doc -> String
|
||||
render = unlines . CMW.execWriter
|
||||
|
||||
class PP a where
|
||||
pp :: a -> Doc
|
||||
@@ -264,9 +281,12 @@ instance PP LPGF where
|
||||
pp (LPGF _ _ cncs) = mapM_ pp cncs
|
||||
|
||||
instance PP Concrete where
|
||||
pp (Concrete lins) =
|
||||
pp (Concrete toks lins) = do
|
||||
forM_ (IntMap.toList toks) $ \(i,tok) ->
|
||||
CMW.tell [show i ++ " " ++ T.unpack tok]
|
||||
CMW.tell [""]
|
||||
forM_ (Map.toList lins) $ \(cid,lin) -> do
|
||||
CMW.tell [T.pack ("# " ++ showCId cid)]
|
||||
CMW.tell ["# " ++ showCId cid]
|
||||
pp lin
|
||||
CMW.tell [""]
|
||||
|
||||
@@ -276,7 +296,7 @@ instance PP LinFun where
|
||||
pp' n = \case
|
||||
Pre ps d -> do
|
||||
p "Pre"
|
||||
CMW.tell [ T.replicate (n+1) " " `T.append` T.pack (show p) | p <- ps ]
|
||||
CMW.tell [ replicate (2*(n+1)) ' ' ++ show p | p <- ps ]
|
||||
pp' (n+1) d
|
||||
|
||||
c@(Concat l1 l2) -> do
|
||||
@@ -286,7 +306,7 @@ instance PP LinFun where
|
||||
p "Concat"
|
||||
mapM_ (pp' (n+1)) ts
|
||||
else
|
||||
ps $ "Concat " ++ show ts
|
||||
p $ "Concat " ++ show ts
|
||||
Tuple ls | any isDeep ls -> do
|
||||
p "Tuple"
|
||||
mapM_ (pp' (n+1)) ls
|
||||
@@ -294,12 +314,10 @@ instance PP LinFun where
|
||||
p "Projection"
|
||||
pp' (n+1) l1
|
||||
pp' (n+1) l2
|
||||
t -> ps $ show t
|
||||
t -> p $ show t
|
||||
where
|
||||
p :: Text -> Doc
|
||||
p t = CMW.tell [ T.replicate n " " `T.append` t ]
|
||||
ps :: String -> Doc
|
||||
ps = p . T.pack
|
||||
p :: String -> Doc
|
||||
p t = CMW.tell [ replicate (2*n) ' ' ++ t ]
|
||||
|
||||
isDeep = not . isTerm
|
||||
isTerm = \case
|
||||
|
||||
@@ -19,6 +19,8 @@ stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific gramma
|
||||
|
||||
## Benchmark
|
||||
|
||||
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
|
||||
|
||||
### Compilation
|
||||
|
||||
Comparing PGF, LPGF along following criteria:
|
||||
|
||||
Reference in New Issue
Block a user