1
0
forked from GitHub/gf-core

Extract token strings and put them in map which linfuns refer to by index, to reduce LPGF sizes.

This commit is contained in:
John J. Camilleri
2021-03-04 00:16:12 +01:00
parent adc162b374
commit 4082c006c3
3 changed files with 102 additions and 46 deletions

View File

@@ -15,12 +15,16 @@ import GF.Text.Pretty (pp, render)
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (when, unless, forM, forM_) import Control.Monad (when, unless, forM, forM_)
import qualified Control.Monad.State as CMS
import Data.Either (lefts, rights) import Data.Either (lefts, rights)
import qualified Data.IntMap as IntMap
import Data.List (elemIndex) import Data.List (elemIndex)
import qualified Data.List as L import qualified Data.List as L
import qualified Data.Map as Map import qualified Data.Map.Strict as Map
import Data.Maybe (fromJust, isJust) import Data.Maybe (fromJust, isJust)
import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Tuple (swap)
import System.Environment (lookupEnv) import System.Environment (lookupEnv)
import System.FilePath ((</>), (<.>)) import System.FilePath ((</>), (<.>))
import Text.Printf (printf) import Text.Printf (printf)
@@ -234,9 +238,11 @@ mkCanon2lpgf opts gr am = do
unless (null $ lefts es) (raise $ unlines (lefts es)) unless (null $ lefts es) (raise $ unlines (lefts es))
return (mdi2i modId, L.Concrete { let concr = extractStrings $ L.Concrete {
L.toks = IntMap.empty,
L.lins = lins L.lins = lins
}) }
return (mdi2i modId, concr)
-- | Remove ParamAliasDefs by inlining their definitions -- | Remove ParamAliasDefs by inlining their definitions
inlineParamAliases :: [C.ParamDef] -> [C.ParamDef] inlineParamAliases :: [C.ParamDef] -> [C.ParamDef]
@@ -284,22 +290,52 @@ isRecordType :: C.LinType -> Bool
isRecordType (C.RecordType _) = True isRecordType (C.RecordType _) = True
isRecordType _ = False isRecordType _ = False
-- | Minimise a linfun by evaluating projections where possible -- | Find all token strings, put them in a map and replace with token indexes
-- This code closely matches the runtime's `eval` function, except we have no context extractStrings :: L.Concrete -> L.Concrete
reduce :: L.LinFun -> L.LinFun extractStrings concr = L.Concrete { L.toks = strMap, L.lins = lins'}
reduce lf = case lf of where
L.Pre pts df -> L.Pre pts' df' state = (
where L.toks concr, -- the IntMap we're building
pts' = [ (strs,reduce t) | (strs,t) <- pts] Map.fromList (map swap (IntMap.toList (L.toks concr))) -- a reversed map for lookups
df' = reduce df )
L.Concat s t -> L.Concat (reduce s) (reduce t) (lins',(strMap,_)) = CMS.runState (go0 (L.lins concr)) state
L.Tuple ts -> L.Tuple (map reduce ts)
L.Projection t u -> go0 :: Map.Map CId L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) (Map.Map CId L.LinFun)
case (reduce t, reduce u) of go0 mp = do
(L.Tuple vs, L.Ix i) -> reduce $ vs !! (i-1) xs <- mapM (\(cid,lin) -> go lin >>= \lin' -> return (cid,lin')) (Map.toList mp)
(tp@(L.Tuple _), L.Tuple is) | all L.isIx is -> foldl (\(L.Tuple vs) (L.Ix i) -> vs !! (i-1)) tp is return $ Map.fromList xs
(t',u') -> L.Projection t' u'
t -> t go :: L.LinFun -> CMS.State (IntMap.IntMap Text, Map.Map Text Int) L.LinFun
go lf = case lf of
L.Token str -> do
(intMap,revMap) <- CMS.get
case Map.lookup str revMap of
Just i -> return $ L.TokenIx i
Nothing -> do
-- add string to map and replace with index
let i = IntMap.size intMap
let intMap' = IntMap.insert i str intMap
let revMap' = Map.insert str i revMap
CMS.put (intMap',revMap')
return $ L.TokenIx i
-- TODO big savings can be made by putting prefixes in string map too
L.Pre pts df -> do
pts' <- mapM (\(pfxs,lv) -> go lv >>= \lv' -> return (pfxs,lv')) pts
df' <- go df
return $ L.Pre pts' df'
L.Concat s t -> do
s' <- go s
t' <- go t
return $ L.Concat s' t'
L.Tuple ts -> do
ts' <- mapM go ts
return $ L.Tuple ts'
L.Projection t u -> do
t' <- go t
u' <- go u
return $ L.Projection t' u'
t -> return t
-- | Convert Maybe to Either value with error -- | Convert Maybe to Either value with error
m2e :: String -> Maybe a -> Either String a m2e :: String -> Maybe a -> Either String a
@@ -352,7 +388,7 @@ dumpCanonical path (C.Grammar ab cncs) = do
ppLPGF :: FilePath -> LPGF -> IO () ppLPGF :: FilePath -> LPGF -> IO ()
ppLPGF path lpgf = ppLPGF path lpgf =
forM_ (Map.toList $ L.concretes lpgf) $ \(cid,concr) -> forM_ (Map.toList $ L.concretes lpgf) $ \(cid,concr) ->
writeFile' (path </> showCId cid <.> "lpgf.txt") (T.unpack $ L.render $ L.pp concr) writeFile' (path </> showCId cid <.> "lpgf.txt") (L.render $ L.pp concr)
-- | Dump LPGF to file -- | Dump LPGF to file
dumpLPGF :: FilePath -> LPGF -> IO () dumpLPGF :: FilePath -> LPGF -> IO ()

View File

@@ -15,7 +15,8 @@ import PGF.Tree (Tree (..), expr2tree, prTree)
import Control.Monad (liftM, liftM2, forM_) import Control.Monad (liftM, liftM2, forM_)
import qualified Control.Monad.Writer as CMW import qualified Control.Monad.Writer as CMW
import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile) import Data.Binary (Binary, put, get, putWord8, getWord8, encodeFile, decodeFile)
import qualified Data.Map as Map import qualified Data.IntMap as IntMap
import qualified Data.Map.Strict as Map
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.Encoding as TE import qualified Data.Text.Encoding as TE
@@ -36,7 +37,8 @@ data Abstract = Abstract {
} deriving (Show) } deriving (Show)
-- | Concrete syntax -- | Concrete syntax
newtype Concrete = Concrete { data Concrete = Concrete {
toks :: IntMap.IntMap Text, -- ^ all strings are stored exactly once here
-- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category -- lincats :: Map.Map CId LinType, -- ^ a linearization type for each category
lins :: Map.Map CId LinFun -- ^ a linearization function for each function lins :: Map.Map CId LinFun -- ^ a linearization function for each function
} deriving (Show) } deriving (Show)
@@ -66,6 +68,7 @@ data LinFun =
-- From original definition in paper -- From original definition in paper
| Empty | Empty
| Token Text | Token Text
| TokenIx Int -- ^ index into `toks` map
| Concat LinFun LinFun | Concat LinFun LinFun
| Ix Int | Ix Int
| Tuple [LinFun] | Tuple [LinFun]
@@ -93,10 +96,14 @@ instance Binary Abstract where
get = return $ Abstract {} get = return $ Abstract {}
instance Binary Concrete where instance Binary Concrete where
put concr = put (lins concr) put concr = do
put (toks concr)
put (lins concr)
get = do get = do
ts <- get
ls <- get ls <- get
return $ Concrete { return $ Concrete {
toks = ts,
lins = ls lins = ls
} }
@@ -107,10 +114,11 @@ instance Binary LinFun where
Space -> putWord8 2 Space -> putWord8 2
Capit -> putWord8 3 Capit -> putWord8 3
AllCapit -> putWord8 4 AllCapit -> putWord8 4
Pre ps d -> putWord8 5 >> put ([(map TE.encodeUtf8 p,l) | (p,l) <- ps],d) Pre ps d -> putWord8 5 >> put (ps,d)
Missing f -> putWord8 13 >> put f Missing f -> putWord8 13 >> put f
Empty -> putWord8 6 Empty -> putWord8 6
Token t -> putWord8 7 >> put (TE.encodeUtf8 t) Token t -> putWord8 7 >> put t
TokenIx i -> putWord8 14 >> put i
Concat l1 l2 -> putWord8 8 >> put (l1,l2) Concat l1 l2 -> putWord8 8 >> put (l1,l2)
Ix i -> putWord8 9 >> put i Ix i -> putWord8 9 >> put i
Tuple ls -> putWord8 10 >> put ls Tuple ls -> putWord8 10 >> put ls
@@ -124,16 +132,21 @@ instance Binary LinFun where
2 -> return Space 2 -> return Space
3 -> return Capit 3 -> return Capit
4 -> return AllCapit 4 -> return AllCapit
5 -> liftM2 (\ps -> Pre [(map TE.decodeUtf8 p,l) | (p,l) <- ps]) get get 5 -> liftM2 Pre get get
13 -> liftM Missing get 13 -> liftM Missing get
6 -> return Empty 6 -> return Empty
7 -> liftM (Token . TE.decodeUtf8) get 7 -> liftM Token get
14 -> liftM TokenIx get
8 -> liftM2 Concat get get 8 -> liftM2 Concat get get
9 -> liftM Ix get 9 -> liftM Ix get
10 -> liftM Tuple get 10 -> liftM Tuple get
11 -> liftM2 Projection get get 11 -> liftM2 Projection get get
12 -> liftM Argument get 12 -> liftM Argument get
_ -> fail "Failed to decode LPGF binary format" _ -> fail "Failed to decode LPGF binary format"
instance Binary Text where
put = put . TE.encodeUtf8
get = liftM TE.decodeUtf8 get
abstractName :: LPGF -> CId abstractName :: LPGF -> CId
abstractName = absname abstractName = absname
@@ -167,13 +180,16 @@ linearizeConcreteText concr expr = lin2string $ lin (expr2tree expr)
lin tree = case tree of lin tree = case tree of
Fun f as -> Fun f as ->
case Map.lookup f (lins concr) of case Map.lookup f (lins concr) of
Just t -> eval (map lin as) t Just t -> eval cxt t
-- _ -> error $ printf "Lookup failed for function: %s" (showCId f) where cxt = Context { cxToks = toks concr, cxArgs = map lin as }
_ -> Missing f _ -> Missing f
x -> error $ printf "Cannot lin: %s" (prTree x) x -> error $ printf "Cannot lin: %s" (prTree x)
-- | Evaluation context is a sequence of terms -- | Evaluation context
type Context = [LinFun] data Context = Context {
cxArgs :: [LinFun], -- ^ is a sequence of terms
cxToks :: IntMap.IntMap Text -- ^ token map
}
-- | Operational semantics -- | Operational semantics
eval :: Context -> LinFun -> LinFun eval :: Context -> LinFun -> LinFun
@@ -181,8 +197,9 @@ eval cxt t = case t of
Error err -> error err Error err -> error err
Pre pts df -> Pre pts' df' Pre pts df -> Pre pts' df'
where where
pts' = [ (strs, eval cxt t) | (strs,t) <- pts] pts' = [(pfxs, eval cxt t) | (pfxs, t) <- pts]
df' = eval cxt df df' = eval cxt df
TokenIx i -> maybe Empty Token $ IntMap.lookup i (cxToks cxt)
Concat s t -> Concat v w Concat s t -> Concat v w
where where
v = eval cxt s v = eval cxt s
@@ -196,7 +213,7 @@ eval cxt t = case t of
(Tuple vs, Ix i) -> vs !! (i-1) (Tuple vs, Ix i) -> vs !! (i-1)
(tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv) (tp@(Tuple _), tv@(Tuple _)) | all isIx (flattenTuple tv) -> foldl (\(Tuple vs) (Ix i) -> vs !! (i-1)) tp (flattenTuple tv)
(t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u') (t',u') -> error $ printf "Incompatible projection:\n- %s\n⇓ %s\n- %s\n⇓ %s" (show t) (show t') (show u) (show u')
Argument i -> cxt !! (i-1) Argument i -> cxArgs cxt !! (i-1)
_ -> t _ -> t
flattenTuple :: LinFun -> [LinFun] flattenTuple :: LinFun -> [LinFun]
@@ -252,10 +269,10 @@ unConcat lf = [lf]
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
-- Pretty-printing -- Pretty-printing
type Doc = CMW.Writer [Text] () type Doc = CMW.Writer [String] ()
render :: Doc -> Text render :: Doc -> String
render = T.unlines . CMW.execWriter render = unlines . CMW.execWriter
class PP a where class PP a where
pp :: a -> Doc pp :: a -> Doc
@@ -264,9 +281,12 @@ instance PP LPGF where
pp (LPGF _ _ cncs) = mapM_ pp cncs pp (LPGF _ _ cncs) = mapM_ pp cncs
instance PP Concrete where instance PP Concrete where
pp (Concrete lins) = pp (Concrete toks lins) = do
forM_ (IntMap.toList toks) $ \(i,tok) ->
CMW.tell [show i ++ " " ++ T.unpack tok]
CMW.tell [""]
forM_ (Map.toList lins) $ \(cid,lin) -> do forM_ (Map.toList lins) $ \(cid,lin) -> do
CMW.tell [T.pack ("# " ++ showCId cid)] CMW.tell ["# " ++ showCId cid]
pp lin pp lin
CMW.tell [""] CMW.tell [""]
@@ -276,7 +296,7 @@ instance PP LinFun where
pp' n = \case pp' n = \case
Pre ps d -> do Pre ps d -> do
p "Pre" p "Pre"
CMW.tell [ T.replicate (n+1) " " `T.append` T.pack (show p) | p <- ps ] CMW.tell [ replicate (2*(n+1)) ' ' ++ show p | p <- ps ]
pp' (n+1) d pp' (n+1) d
c@(Concat l1 l2) -> do c@(Concat l1 l2) -> do
@@ -286,7 +306,7 @@ instance PP LinFun where
p "Concat" p "Concat"
mapM_ (pp' (n+1)) ts mapM_ (pp' (n+1)) ts
else else
ps $ "Concat " ++ show ts p $ "Concat " ++ show ts
Tuple ls | any isDeep ls -> do Tuple ls | any isDeep ls -> do
p "Tuple" p "Tuple"
mapM_ (pp' (n+1)) ls mapM_ (pp' (n+1)) ls
@@ -294,12 +314,10 @@ instance PP LinFun where
p "Projection" p "Projection"
pp' (n+1) l1 pp' (n+1) l1
pp' (n+1) l2 pp' (n+1) l2
t -> ps $ show t t -> p $ show t
where where
p :: Text -> Doc p :: String -> Doc
p t = CMW.tell [ T.replicate n " " `T.append` t ] p t = CMW.tell [ replicate (2*n) ' ' ++ t ]
ps :: String -> Doc
ps = p . T.pack
isDeep = not . isTerm isDeep = not . isTerm
isTerm = \case isTerm = \case

View File

@@ -19,6 +19,8 @@ stack test gf:test:lpgf --test-arguments="foods/Foods Fre Ger" # specific gramma
## Benchmark ## Benchmark
Compare performance metrics between LPGF and PGF[2]. Note: correctness is not checked here.
### Compilation ### Compilation
Comparing PGF, LPGF along following criteria: Comparing PGF, LPGF along following criteria: