diff --git a/GF.cabal b/GF.cabal index 864a9005a..fa2e7708e 100644 --- a/GF.cabal +++ b/GF.cabal @@ -575,23 +575,25 @@ library PGF.Parsing.FCFG PGF.Expr PGF.Type - PGF.Raw.Parse - PGF.Raw.Print - PGF.Raw.Convert - PGF.Raw.Abstract PGF.AbsCompute PGF.Paraphrase PGF.TypeCheck + PGF.Binary GF.Data.MultiMap GF.Data.Utilities GF.Data.SortedList GF.Data.Assoc GF.Data.ErrM - GF.Text.UTF8 -- needed only for the on demand generation of PMCFG GF.Data.BacktrackM GF.Compile.GenerateFCFG GF.Compile.GeneratePMCFG +-- not really part of GF but I have changed the original binary library +-- and we have to keep the copy for now. + Data.Binary + Data.Binary.Put + Data.Binary.Get + Data.Binary.Builder executable gf build-depends: base, @@ -701,6 +703,7 @@ executable gf PGF.AbsCompute PGF.Paraphrase PGF.TypeCheck + PGF.Binary GFC GFI diff --git a/src/Data/Binary.hs b/src/Data/Binary.hs new file mode 100644 index 000000000..fd0ca6c98 --- /dev/null +++ b/src/Data/Binary.hs @@ -0,0 +1,792 @@ +{-# LANGUAGE CPP, FlexibleInstances, FlexibleContexts #-} +----------------------------------------------------------------------------- +-- | +-- Module : Data.Binary +-- Copyright : Lennart Kolmodin +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Lennart Kolmodin +-- Stability : unstable +-- Portability : portable to Hugs and GHC. Requires the FFI and some flexible instances +-- +-- Binary serialisation of Haskell values to and from lazy ByteStrings. +-- The Binary library provides methods for encoding Haskell values as +-- streams of bytes directly in memory. The resulting @ByteString@ can +-- then be written to disk, sent over the network, or futher processed +-- (for example, compressed with gzip). +-- +-- The 'Binary' package is notable in that it provides both pure, and +-- high performance serialisation. +-- +-- Values are always encoded in network order (big endian) form, and +-- encoded data should be portable across machine endianess, word size, +-- or compiler version. For example, data encoded using the Binary class +-- could be written from GHC, and read back in Hugs. +-- +----------------------------------------------------------------------------- + +module Data.Binary ( + + -- * The Binary class + Binary(..) + + -- $example + + -- * The Get and Put monads + , Get + , Put + + -- * Useful helpers for writing instances + , putWord8 + , getWord8 + + -- * Binary serialisation + , encode -- :: Binary a => a -> ByteString + , decode -- :: Binary a => ByteString -> a + + -- * IO functions for serialisation + , encodeFile -- :: Binary a => FilePath -> a -> IO () + , decodeFile -- :: Binary a => FilePath -> IO a + +-- Lazy put and get +-- , lazyPut +-- , lazyGet + + , module Data.Word -- useful + + ) where + +import Data.Word + +import Data.Binary.Put +import Data.Binary.Get + +import Control.Monad +import Foreign +import System.IO + +import Data.ByteString.Lazy (ByteString) +import qualified Data.ByteString.Lazy as L + +import Data.Char (chr,ord) +import Data.List (unfoldr) + +-- And needed for the instances: +import qualified Data.ByteString as B +import qualified Data.Map as Map +import qualified Data.Set as Set +import qualified Data.IntMap as IntMap +import qualified Data.IntSet as IntSet +import qualified Data.Ratio as R + +import qualified Data.Tree as T + +import Data.Array.Unboxed + +-- +-- This isn't available in older Hugs or older GHC +-- +#if __GLASGOW_HASKELL__ >= 606 +import qualified Data.Sequence as Seq +import qualified Data.Foldable as Fold +#endif + +------------------------------------------------------------------------ + +-- | The @Binary@ class provides 'put' and 'get', methods to encode and +-- decode a Haskell value to a lazy ByteString. It mirrors the Read and +-- Show classes for textual representation of Haskell types, and is +-- suitable for serialising Haskell values to disk, over the network. +-- +-- For parsing and generating simple external binary formats (e.g. C +-- structures), Binary may be used, but in general is not suitable +-- for complex protocols. Instead use the Put and Get primitives +-- directly. +-- +-- Instances of Binary should satisfy the following property: +-- +-- > decode . encode == id +-- +-- That is, the 'get' and 'put' methods should be the inverse of each +-- other. A range of instances are provided for basic Haskell types. +-- +class Binary t where + -- | Encode a value in the Put monad. + put :: t -> Put + -- | Decode a value in the Get monad + get :: Get t + +-- $example +-- To serialise a custom type, an instance of Binary for that type is +-- required. For example, suppose we have a data structure: +-- +-- > data Exp = IntE Int +-- > | OpE String Exp Exp +-- > deriving Show +-- +-- We can encode values of this type into bytestrings using the +-- following instance, which proceeds by recursively breaking down the +-- structure to serialise: +-- +-- > instance Binary Exp where +-- > put (IntE i) = do put (0 :: Word8) +-- > put i +-- > put (OpE s e1 e2) = do put (1 :: Word8) +-- > put s +-- > put e1 +-- > put e2 +-- > +-- > get = do t <- get :: Get Word8 +-- > case t of +-- > 0 -> do i <- get +-- > return (IntE i) +-- > 1 -> do s <- get +-- > e1 <- get +-- > e2 <- get +-- > return (OpE s e1 e2) +-- +-- Note how we write an initial tag byte to indicate each variant of the +-- data type. +-- +-- We can simplify the writing of 'get' instances using monadic +-- combinators: +-- +-- > get = do tag <- getWord8 +-- > case tag of +-- > 0 -> liftM IntE get +-- > 1 -> liftM3 OpE get get get +-- +-- The generation of Binary instances has been automated by a script +-- using Scrap Your Boilerplate generics. Use the script here: +-- . +-- +-- To derive the instance for a type, load this script into GHCi, and +-- bring your type into scope. Your type can then have its Binary +-- instances derived as follows: +-- +-- > $ ghci -fglasgow-exts BinaryDerive.hs +-- > *BinaryDerive> :l Example.hs +-- > *Main> deriveM (undefined :: Drinks) +-- > +-- > instance Binary Main.Drinks where +-- > put (Beer a) = putWord8 0 >> put a +-- > put Coffee = putWord8 1 +-- > put Tea = putWord8 2 +-- > put EnergyDrink = putWord8 3 +-- > put Water = putWord8 4 +-- > put Wine = putWord8 5 +-- > put Whisky = putWord8 6 +-- > get = do +-- > tag_ <- getWord8 +-- > case tag_ of +-- > 0 -> get >>= \a -> return (Beer a) +-- > 1 -> return Coffee +-- > 2 -> return Tea +-- > 3 -> return EnergyDrink +-- > 4 -> return Water +-- > 5 -> return Wine +-- > 6 -> return Whisky +-- > +-- +-- To serialise this to a bytestring, we use 'encode', which packs the +-- data structure into a binary format, in a lazy bytestring +-- +-- > > let e = OpE "*" (IntE 7) (OpE "/" (IntE 4) (IntE 2)) +-- > > let v = encode e +-- +-- Where 'v' is a binary encoded data structure. To reconstruct the +-- original data, we use 'decode' +-- +-- > > decode v :: Exp +-- > OpE "*" (IntE 7) (OpE "/" (IntE 4) (IntE 2)) +-- +-- The lazy ByteString that results from 'encode' can be written to +-- disk, and read from disk using Data.ByteString.Lazy IO functions, +-- such as hPutStr or writeFile: +-- +-- > > writeFile "/tmp/exp.txt" (encode e) +-- +-- And read back with: +-- +-- > > readFile "/tmp/exp.txt" >>= return . decode :: IO Exp +-- > OpE "*" (IntE 7) (OpE "/" (IntE 4) (IntE 2)) +-- +-- We can also directly serialise a value to and from a Handle, or a file: +-- +-- > > v <- decodeFile "/tmp/exp.txt" :: IO Exp +-- > OpE "*" (IntE 7) (OpE "/" (IntE 4) (IntE 2)) +-- +-- And write a value to disk +-- +-- > > encodeFile "/tmp/a.txt" v +-- + +------------------------------------------------------------------------ +-- Wrappers to run the underlying monad + +-- | Encode a value using binary serialisation to a lazy ByteString. +-- +encode :: Binary a => a -> ByteString +encode = runPut . put +{-# INLINE encode #-} + +-- | Decode a value from a lazy ByteString, reconstructing the original structure. +-- +decode :: Binary a => ByteString -> a +decode = runGet get + +------------------------------------------------------------------------ +-- Convenience IO operations + +-- | Lazily serialise a value to a file +-- +-- This is just a convenience function, it's defined simply as: +-- +-- > encodeFile f = B.writeFile f . encode +-- +-- So for example if you wanted to compress as well, you could use: +-- +-- > B.writeFile f . compress . encode +-- +encodeFile :: Binary a => FilePath -> a -> IO () +encodeFile f v = L.writeFile f (encode v) + +-- | Lazily reconstruct a value previously written to a file. +-- +-- This is just a convenience function, it's defined simply as: +-- +-- > decodeFile f = return . decode =<< B.readFile f +-- +-- So for example if you wanted to decompress as well, you could use: +-- +-- > return . decode . decompress =<< B.readFile f +-- +-- After contructing the data from the input file, 'decodeFile' checks +-- if the file is empty, and in doing so will force the associated file +-- handle closed, if it is indeed empty. If the file is not empty, +-- it is up to the decoding instance to consume the rest of the data, +-- or otherwise finalise the resource. +-- +decodeFile :: Binary a => FilePath -> IO a +decodeFile f = do + s <- L.readFile f + return $ runGet (do v <- get + m <- isEmpty + m `seq` return v) s + +-- needs bytestring 0.9.1.x to work + +------------------------------------------------------------------------ +-- Lazy put and get + +-- lazyPut :: (Binary a) => a -> Put +-- lazyPut a = put (encode a) + +-- lazyGet :: (Binary a) => Get a +-- lazyGet = fmap decode get + +------------------------------------------------------------------------ +-- Simple instances + +-- The () type need never be written to disk: values of singleton type +-- can be reconstructed from the type alone +instance Binary () where + put () = return () + get = return () + +-- Bools are encoded as a byte in the range 0 .. 1 +instance Binary Bool where + put = putWord8 . fromIntegral . fromEnum + get = liftM (toEnum . fromIntegral) getWord8 + +-- Values of type 'Ordering' are encoded as a byte in the range 0 .. 2 +instance Binary Ordering where + put = putWord8 . fromIntegral . fromEnum + get = liftM (toEnum . fromIntegral) getWord8 + +------------------------------------------------------------------------ +-- Words and Ints + +-- Words8s are written as bytes +instance Binary Word8 where + put = putWord8 + get = getWord8 + +-- Words16s are written as 2 bytes in big-endian (network) order +instance Binary Word16 where + put = putWord16be + get = getWord16be + +-- Words32s are written as 4 bytes in big-endian (network) order +instance Binary Word32 where + put = putWord32be + get = getWord32be + +-- Words64s are written as 8 bytes in big-endian (network) order +instance Binary Word64 where + put = putWord64be + get = getWord64be + +-- Int8s are written as a single byte. +instance Binary Int8 where + put i = put (fromIntegral i :: Word8) + get = liftM fromIntegral (get :: Get Word8) + +-- Int16s are written as a 2 bytes in big endian format +instance Binary Int16 where + put i = put (fromIntegral i :: Word16) + get = liftM fromIntegral (get :: Get Word16) + +-- Int32s are written as a 4 bytes in big endian format +instance Binary Int32 where + put i = put (fromIntegral i :: Word32) + get = liftM fromIntegral (get :: Get Word32) + +-- Int64s are written as a 4 bytes in big endian format +instance Binary Int64 where + put i = put (fromIntegral i :: Word64) + get = liftM fromIntegral (get :: Get Word64) + +------------------------------------------------------------------------ + +-- Words are written as sequence of bytes. The last bit of each +-- byte indicates whether there are more bytes to be read +instance Binary Word where + put i | i <= 0x7f = do put a + | i <= 0x3fff = do put (a .|. 0x80) + put b + | i <= 0x1fffff = do put (a .|. 0x80) + put (b .|. 0x80) + put c + | i <= 0xfffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put d +#if WORD_SIZE_IN_BITS < 64 + | otherwise = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put e +#else + | i <= 0x7ffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put e + | i <= 0x3ffffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put f + | i <= 0x1ffffffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put (f .|. 0x80) + put g + | i <= 0xffffffffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put (f .|. 0x80) + put (g .|. 0x80) + put h + | i <= 0xffffffffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put (f .|. 0x80) + put (g .|. 0x80) + put h + | i <= 0x7fffffffffffffff = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put (f .|. 0x80) + put (g .|. 0x80) + put (h .|. 0x80) + put j + | otherwise = do put (a .|. 0x80) + put (b .|. 0x80) + put (c .|. 0x80) + put (d .|. 0x80) + put (e .|. 0x80) + put (f .|. 0x80) + put (g .|. 0x80) + put (h .|. 0x80) + put (j .|. 0x80) + put k +#endif + where + a = fromIntegral ( i .&. 0x7f) :: Word8 + b = fromIntegral (shiftR i 7 .&. 0x7f) :: Word8 + c = fromIntegral (shiftR i 14 .&. 0x7f) :: Word8 + d = fromIntegral (shiftR i 21 .&. 0x7f) :: Word8 + e = fromIntegral (shiftR i 28 .&. 0x7f) :: Word8 + f = fromIntegral (shiftR i 35 .&. 0x7f) :: Word8 + g = fromIntegral (shiftR i 42 .&. 0x7f) :: Word8 + h = fromIntegral (shiftR i 49 .&. 0x7f) :: Word8 + j = fromIntegral (shiftR i 56 .&. 0x7f) :: Word8 + k = fromIntegral (shiftR i 63 .&. 0x7f) :: Word8 + + get = do i <- getWord8 + (if i <= 0x7f + then return (fromIntegral i) + else do n <- get + return $ (n `shiftL` 7) .|. (fromIntegral (i .&. 0x7f))) + +-- Int has the same representation as Word +instance Binary Int where + put i = put (fromIntegral i :: Word) + get = liftM fromIntegral (get :: Get Word) + +------------------------------------------------------------------------ +-- +-- Portable, and pretty efficient, serialisation of Integer +-- + +-- Fixed-size type for a subset of Integer +type SmallInt = Int32 + +-- Integers are encoded in two ways: if they fit inside a SmallInt, +-- they're written as a byte tag, and that value. If the Integer value +-- is too large to fit in a SmallInt, it is written as a byte array, +-- along with a sign and length field. + +instance Binary Integer where + + {-# INLINE put #-} + put n | n >= lo && n <= hi = do + putWord8 0 + put (fromIntegral n :: SmallInt) -- fast path + where + lo = fromIntegral (minBound :: SmallInt) :: Integer + hi = fromIntegral (maxBound :: SmallInt) :: Integer + + put n = do + putWord8 1 + put sign + put (unroll (abs n)) -- unroll the bytes + where + sign = fromIntegral (signum n) :: Word8 + + {-# INLINE get #-} + get = do + tag <- get :: Get Word8 + case tag of + 0 -> liftM fromIntegral (get :: Get SmallInt) + _ -> do sign <- get + bytes <- get + let v = roll bytes + return $! if sign == (1 :: Word8) then v else - v + +-- +-- Fold and unfold an Integer to and from a list of its bytes +-- +unroll :: Integer -> [Word8] +unroll = unfoldr step + where + step 0 = Nothing + step i = Just (fromIntegral i, i `shiftR` 8) + +roll :: [Word8] -> Integer +roll = foldr unstep 0 + where + unstep b a = a `shiftL` 8 .|. fromIntegral b + +{- + +-- +-- An efficient, raw serialisation for Integer (GHC only) +-- + +-- TODO This instance is not architecture portable. GMP stores numbers as +-- arrays of machine sized words, so the byte format is not portable across +-- architectures with different endianess and word size. + +import Data.ByteString.Base (toForeignPtr,unsafePackAddress, memcpy) +import GHC.Base hiding (ord, chr) +import GHC.Prim +import GHC.Ptr (Ptr(..)) +import GHC.IOBase (IO(..)) + +instance Binary Integer where + put (S# i) = putWord8 0 >> put (I# i) + put (J# s ba) = do + putWord8 1 + put (I# s) + put (BA ba) + + get = do + b <- getWord8 + case b of + 0 -> do (I# i#) <- get + return (S# i#) + _ -> do (I# s#) <- get + (BA a#) <- get + return (J# s# a#) + +instance Binary ByteArray where + + -- Pretty safe. + put (BA ba) = + let sz = sizeofByteArray# ba -- (primitive) in *bytes* + addr = byteArrayContents# ba + bs = unsafePackAddress (I# sz) addr + in put bs -- write as a ByteString. easy, yay! + + -- Pretty scary. Should be quick though + get = do + (fp, off, n@(I# sz)) <- liftM toForeignPtr get -- so decode a ByteString + assert (off == 0) $ return $ unsafePerformIO $ do + (MBA arr) <- newByteArray sz -- and copy it into a ByteArray# + let to = byteArrayContents# (unsafeCoerce# arr) -- urk, is this safe? + withForeignPtr fp $ \from -> memcpy (Ptr to) from (fromIntegral n) + freezeByteArray arr + +-- wrapper for ByteArray# +data ByteArray = BA {-# UNPACK #-} !ByteArray# +data MBA = MBA {-# UNPACK #-} !(MutableByteArray# RealWorld) + +newByteArray :: Int# -> IO MBA +newByteArray sz = IO $ \s -> + case newPinnedByteArray# sz s of { (# s', arr #) -> + (# s', MBA arr #) } + +freezeByteArray :: MutableByteArray# RealWorld -> IO ByteArray +freezeByteArray arr = IO $ \s -> + case unsafeFreezeByteArray# arr s of { (# s', arr' #) -> + (# s', BA arr' #) } + +-} + +instance (Binary a,Integral a) => Binary (R.Ratio a) where + put r = put (R.numerator r) >> put (R.denominator r) + get = liftM2 (R.%) get get + +------------------------------------------------------------------------ + +-- Char is serialised as UTF-8 +instance Binary Char where + put a | c <= 0x7f = put (fromIntegral c :: Word8) + | c <= 0x7ff = do put (0xc0 .|. y) + put (0x80 .|. z) + | c <= 0xffff = do put (0xe0 .|. x) + put (0x80 .|. y) + put (0x80 .|. z) + | c <= 0x10ffff = do put (0xf0 .|. w) + put (0x80 .|. x) + put (0x80 .|. y) + put (0x80 .|. z) + | otherwise = error "Not a valid Unicode code point" + where + c = ord a + z, y, x, w :: Word8 + z = fromIntegral (c .&. 0x3f) + y = fromIntegral (shiftR c 6 .&. 0x3f) + x = fromIntegral (shiftR c 12 .&. 0x3f) + w = fromIntegral (shiftR c 18 .&. 0x7) + + get = do + let getByte = liftM (fromIntegral :: Word8 -> Int) get + shiftL6 = flip shiftL 6 :: Int -> Int + w <- getByte + r <- case () of + _ | w < 0x80 -> return w + | w < 0xe0 -> do + x <- liftM (xor 0x80) getByte + return (x .|. shiftL6 (xor 0xc0 w)) + | w < 0xf0 -> do + x <- liftM (xor 0x80) getByte + y <- liftM (xor 0x80) getByte + return (y .|. shiftL6 (x .|. shiftL6 + (xor 0xe0 w))) + | otherwise -> do + x <- liftM (xor 0x80) getByte + y <- liftM (xor 0x80) getByte + z <- liftM (xor 0x80) getByte + return (z .|. shiftL6 (y .|. shiftL6 + (x .|. shiftL6 (xor 0xf0 w)))) + return $! chr r + +------------------------------------------------------------------------ +-- Instances for the first few tuples + +instance (Binary a, Binary b) => Binary (a,b) where + put (a,b) = put a >> put b + get = liftM2 (,) get get + +instance (Binary a, Binary b, Binary c) => Binary (a,b,c) where + put (a,b,c) = put a >> put b >> put c + get = liftM3 (,,) get get get + +instance (Binary a, Binary b, Binary c, Binary d) => Binary (a,b,c,d) where + put (a,b,c,d) = put a >> put b >> put c >> put d + get = liftM4 (,,,) get get get get + +instance (Binary a, Binary b, Binary c, Binary d, Binary e) => Binary (a,b,c,d,e) where + put (a,b,c,d,e) = put a >> put b >> put c >> put d >> put e + get = liftM5 (,,,,) get get get get get + +-- +-- and now just recurse: +-- + +instance (Binary a, Binary b, Binary c, Binary d, Binary e, Binary f) + => Binary (a,b,c,d,e,f) where + put (a,b,c,d,e,f) = put (a,(b,c,d,e,f)) + get = do (a,(b,c,d,e,f)) <- get ; return (a,b,c,d,e,f) + +instance (Binary a, Binary b, Binary c, Binary d, Binary e, Binary f, Binary g) + => Binary (a,b,c,d,e,f,g) where + put (a,b,c,d,e,f,g) = put (a,(b,c,d,e,f,g)) + get = do (a,(b,c,d,e,f,g)) <- get ; return (a,b,c,d,e,f,g) + +instance (Binary a, Binary b, Binary c, Binary d, Binary e, + Binary f, Binary g, Binary h) + => Binary (a,b,c,d,e,f,g,h) where + put (a,b,c,d,e,f,g,h) = put (a,(b,c,d,e,f,g,h)) + get = do (a,(b,c,d,e,f,g,h)) <- get ; return (a,b,c,d,e,f,g,h) + +instance (Binary a, Binary b, Binary c, Binary d, Binary e, + Binary f, Binary g, Binary h, Binary i) + => Binary (a,b,c,d,e,f,g,h,i) where + put (a,b,c,d,e,f,g,h,i) = put (a,(b,c,d,e,f,g,h,i)) + get = do (a,(b,c,d,e,f,g,h,i)) <- get ; return (a,b,c,d,e,f,g,h,i) + +instance (Binary a, Binary b, Binary c, Binary d, Binary e, + Binary f, Binary g, Binary h, Binary i, Binary j) + => Binary (a,b,c,d,e,f,g,h,i,j) where + put (a,b,c,d,e,f,g,h,i,j) = put (a,(b,c,d,e,f,g,h,i,j)) + get = do (a,(b,c,d,e,f,g,h,i,j)) <- get ; return (a,b,c,d,e,f,g,h,i,j) + +------------------------------------------------------------------------ +-- Container types + +instance Binary a => Binary [a] where + put l = put (length l) >> mapM_ put l + get = do n <- get :: Get Int + xs <- replicateM n get + return xs + +instance (Binary a) => Binary (Maybe a) where + put Nothing = putWord8 0 + put (Just x) = putWord8 1 >> put x + get = do + w <- getWord8 + case w of + 0 -> return Nothing + _ -> liftM Just get + +instance (Binary a, Binary b) => Binary (Either a b) where + put (Left a) = putWord8 0 >> put a + put (Right b) = putWord8 1 >> put b + get = do + w <- getWord8 + case w of + 0 -> liftM Left get + _ -> liftM Right get + +------------------------------------------------------------------------ +-- ByteStrings (have specially efficient instances) + +instance Binary B.ByteString where + put bs = do put (B.length bs) + putByteString bs + get = get >>= getByteString + +-- +-- Using old versions of fps, this is a type synonym, and non portable +-- +-- Requires 'flexible instances' +-- +instance Binary ByteString where + put bs = do put (fromIntegral (L.length bs) :: Int) + putLazyByteString bs + get = get >>= getLazyByteString + +------------------------------------------------------------------------ +-- Maps and Sets + +instance (Ord a, Binary a) => Binary (Set.Set a) where + put s = put (Set.size s) >> mapM_ put (Set.toAscList s) + get = liftM Set.fromDistinctAscList get + +instance (Ord k, Binary k, Binary e) => Binary (Map.Map k e) where + put m = put (Map.size m) >> mapM_ put (Map.toAscList m) + get = liftM Map.fromDistinctAscList get + +instance Binary IntSet.IntSet where + put s = put (IntSet.size s) >> mapM_ put (IntSet.toAscList s) + get = liftM IntSet.fromDistinctAscList get + +instance (Binary e) => Binary (IntMap.IntMap e) where + put m = put (IntMap.size m) >> mapM_ put (IntMap.toAscList m) + get = liftM IntMap.fromDistinctAscList get + +------------------------------------------------------------------------ +-- Queues and Sequences + +#if __GLASGOW_HASKELL__ >= 606 +-- +-- This is valid Hugs, but you need the most recent Hugs +-- + +instance (Binary e) => Binary (Seq.Seq e) where + -- any better way to do this? + put = put . Fold.toList + get = fmap Seq.fromList get + +#endif + +------------------------------------------------------------------------ +-- Floating point + +instance Binary Double where + put d = put (decodeFloat d) + get = liftM2 encodeFloat get get + +instance Binary Float where + put f = put (decodeFloat f) + get = liftM2 encodeFloat get get + +------------------------------------------------------------------------ +-- Trees + +instance (Binary e) => Binary (T.Tree e) where + put (T.Node r s) = put r >> put s + get = liftM2 T.Node get get + +------------------------------------------------------------------------ +-- Arrays + +instance (Binary i, Ix i, Binary e) => Binary (Array i e) where + put a = do + put (bounds a) + put (rangeSize $ bounds a) -- write the length + mapM_ put (elems a) -- now the elems. + get = do + bs <- get + n <- get -- read the length + xs <- replicateM n get -- now the elems. + return (listArray bs xs) + +-- +-- The IArray UArray e constraint is non portable. Requires flexible instances +-- +instance (Binary i, Ix i, Binary e, IArray UArray e) => Binary (UArray i e) where + put a = do + put (bounds a) + put (rangeSize $ bounds a) -- now write the length + mapM_ put (elems a) + get = do + bs <- get + n <- get + xs <- replicateM n get + return (listArray bs xs) diff --git a/src/Data/Binary/Builder.hs b/src/Data/Binary/Builder.hs new file mode 100644 index 000000000..cccbe6fa4 --- /dev/null +++ b/src/Data/Binary/Builder.hs @@ -0,0 +1,426 @@ +{-# LANGUAGE CPP #-} +{-# OPTIONS_GHC -fglasgow-exts #-} +-- for unboxed shifts + +----------------------------------------------------------------------------- +-- | +-- Module : Data.Binary.Builder +-- Copyright : Lennart Kolmodin, Ross Paterson +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Lennart Kolmodin +-- Stability : experimental +-- Portability : portable to Hugs and GHC +-- +-- Efficient construction of lazy bytestrings. +-- +----------------------------------------------------------------------------- + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +#include "MachDeps.h" +#endif + +module Data.Binary.Builder ( + + -- * The Builder type + Builder + , toLazyByteString + + -- * Constructing Builders + , empty + , singleton + , append + , fromByteString -- :: S.ByteString -> Builder + , fromLazyByteString -- :: L.ByteString -> Builder + + -- * Flushing the buffer state + , flush + + -- * Derived Builders + -- ** Big-endian writes + , putWord16be -- :: Word16 -> Builder + , putWord32be -- :: Word32 -> Builder + , putWord64be -- :: Word64 -> Builder + + -- ** Little-endian writes + , putWord16le -- :: Word16 -> Builder + , putWord32le -- :: Word32 -> Builder + , putWord64le -- :: Word64 -> Builder + + -- ** Host-endian, unaligned writes + , putWordhost -- :: Word -> Builder + , putWord16host -- :: Word16 -> Builder + , putWord32host -- :: Word32 -> Builder + , putWord64host -- :: Word64 -> Builder + + ) where + +import Foreign +import Data.Monoid +import Data.Word +import qualified Data.ByteString as S +import qualified Data.ByteString.Lazy as L + +#ifdef BYTESTRING_IN_BASE +import Data.ByteString.Base (inlinePerformIO) +import qualified Data.ByteString.Base as S +#else +import Data.ByteString.Internal (inlinePerformIO) +import qualified Data.ByteString.Internal as S +import qualified Data.ByteString.Lazy.Internal as L +#endif + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +import GHC.Base +import GHC.Word (Word32(..),Word16(..),Word64(..)) + +#if WORD_SIZE_IN_BITS < 64 && __GLASGOW_HASKELL__ >= 608 +import GHC.Word (uncheckedShiftRL64#) +#endif +#endif + +------------------------------------------------------------------------ + +-- | A 'Builder' is an efficient way to build lazy 'L.ByteString's. +-- There are several functions for constructing 'Builder's, but only one +-- to inspect them: to extract any data, you have to turn them into lazy +-- 'L.ByteString's using 'toLazyByteString'. +-- +-- Internally, a 'Builder' constructs a lazy 'L.Bytestring' by filling byte +-- arrays piece by piece. As each buffer is filled, it is \'popped\' +-- off, to become a new chunk of the resulting lazy 'L.ByteString'. +-- All this is hidden from the user of the 'Builder'. + +newtype Builder = Builder { + -- Invariant (from Data.ByteString.Lazy): + -- The lists include no null ByteStrings. + runBuilder :: (Buffer -> [S.ByteString]) -> Buffer -> [S.ByteString] + } + +instance Monoid Builder where + mempty = empty + {-# INLINE mempty #-} + mappend = append + {-# INLINE mappend #-} + +------------------------------------------------------------------------ + +-- | /O(1)./ The empty Builder, satisfying +-- +-- * @'toLazyByteString' 'empty' = 'L.empty'@ +-- +empty :: Builder +empty = Builder id +{-# INLINE empty #-} + +-- | /O(1)./ A Builder taking a single byte, satisfying +-- +-- * @'toLazyByteString' ('singleton' b) = 'L.singleton' b@ +-- +singleton :: Word8 -> Builder +singleton = writeN 1 . flip poke +{-# INLINE singleton #-} + +------------------------------------------------------------------------ + +-- | /O(1)./ The concatenation of two Builders, an associative operation +-- with identity 'empty', satisfying +-- +-- * @'toLazyByteString' ('append' x y) = 'L.append' ('toLazyByteString' x) ('toLazyByteString' y)@ +-- +append :: Builder -> Builder -> Builder +append (Builder f) (Builder g) = Builder (f . g) +{-# INLINE append #-} + +-- | /O(1)./ A Builder taking a 'S.ByteString', satisfying +-- +-- * @'toLazyByteString' ('fromByteString' bs) = 'L.fromChunks' [bs]@ +-- +fromByteString :: S.ByteString -> Builder +fromByteString bs + | S.null bs = empty + | otherwise = flush `append` mapBuilder (bs :) +{-# INLINE fromByteString #-} + +-- | /O(1)./ A Builder taking a lazy 'L.ByteString', satisfying +-- +-- * @'toLazyByteString' ('fromLazyByteString' bs) = bs@ +-- +fromLazyByteString :: L.ByteString -> Builder +fromLazyByteString bss = flush `append` mapBuilder (L.toChunks bss ++) +{-# INLINE fromLazyByteString #-} + +------------------------------------------------------------------------ + +-- Our internal buffer type +data Buffer = Buffer {-# UNPACK #-} !(ForeignPtr Word8) + {-# UNPACK #-} !Int -- offset + {-# UNPACK #-} !Int -- used bytes + {-# UNPACK #-} !Int -- length left + +------------------------------------------------------------------------ + +-- | /O(n)./ Extract a lazy 'L.ByteString' from a 'Builder'. +-- The construction work takes place if and when the relevant part of +-- the lazy 'L.ByteString' is demanded. +-- +toLazyByteString :: Builder -> L.ByteString +toLazyByteString m = L.fromChunks $ unsafePerformIO $ do + buf <- newBuffer defaultSize + return (runBuilder (m `append` flush) (const []) buf) + +-- | /O(1)./ Pop the 'S.ByteString' we have constructed so far, if any, +-- yielding a new chunk in the result lazy 'L.ByteString'. +flush :: Builder +flush = Builder $ \ k buf@(Buffer p o u l) -> + if u == 0 + then k buf + else S.PS p o u : k (Buffer p (o+u) 0 l) + +------------------------------------------------------------------------ + +-- +-- copied from Data.ByteString.Lazy +-- +defaultSize :: Int +defaultSize = 32 * k - overhead + where k = 1024 + overhead = 2 * sizeOf (undefined :: Int) + +------------------------------------------------------------------------ + +-- | Sequence an IO operation on the buffer +unsafeLiftIO :: (Buffer -> IO Buffer) -> Builder +unsafeLiftIO f = Builder $ \ k buf -> inlinePerformIO $ do + buf' <- f buf + return (k buf') +{-# INLINE unsafeLiftIO #-} + +-- | Get the size of the buffer +withSize :: (Int -> Builder) -> Builder +withSize f = Builder $ \ k buf@(Buffer _ _ _ l) -> + runBuilder (f l) k buf + +-- | Map the resulting list of bytestrings. +mapBuilder :: ([S.ByteString] -> [S.ByteString]) -> Builder +mapBuilder f = Builder (f .) + +------------------------------------------------------------------------ + +-- | Ensure that there are at least @n@ many bytes available. +ensureFree :: Int -> Builder +ensureFree n = n `seq` withSize $ \ l -> + if n <= l then empty else + flush `append` unsafeLiftIO (const (newBuffer (max n defaultSize))) +{-# INLINE ensureFree #-} + +-- | Ensure that @n@ many bytes are available, and then use @f@ to write some +-- bytes into the memory. +writeN :: Int -> (Ptr Word8 -> IO ()) -> Builder +writeN n f = ensureFree n `append` unsafeLiftIO (writeNBuffer n f) +{-# INLINE writeN #-} + +writeNBuffer :: Int -> (Ptr Word8 -> IO ()) -> Buffer -> IO Buffer +writeNBuffer n f (Buffer fp o u l) = do + withForeignPtr fp (\p -> f (p `plusPtr` (o+u))) + return (Buffer fp o (u+n) (l-n)) +{-# INLINE writeNBuffer #-} + +newBuffer :: Int -> IO Buffer +newBuffer size = do + fp <- S.mallocByteString size + return $! Buffer fp 0 0 size +{-# INLINE newBuffer #-} + +------------------------------------------------------------------------ +-- Aligned, host order writes of storable values + +-- | Ensure that @n@ many bytes are available, and then use @f@ to write some +-- storable values into the memory. +writeNbytes :: Storable a => Int -> (Ptr a -> IO ()) -> Builder +writeNbytes n f = ensureFree n `append` unsafeLiftIO (writeNBufferBytes n f) +{-# INLINE writeNbytes #-} + +writeNBufferBytes :: Storable a => Int -> (Ptr a -> IO ()) -> Buffer -> IO Buffer +writeNBufferBytes n f (Buffer fp o u l) = do + withForeignPtr fp (\p -> f (p `plusPtr` (o+u))) + return (Buffer fp o (u+n) (l-n)) +{-# INLINE writeNBufferBytes #-} + +------------------------------------------------------------------------ + +-- +-- We rely on the fromIntegral to do the right masking for us. +-- The inlining here is critical, and can be worth 4x performance +-- + +-- | Write a Word16 in big endian format +putWord16be :: Word16 -> Builder +putWord16be w = writeN 2 $ \p -> do + poke p (fromIntegral (shiftr_w16 w 8) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (w) :: Word8) +{-# INLINE putWord16be #-} + +-- | Write a Word16 in little endian format +putWord16le :: Word16 -> Builder +putWord16le w = writeN 2 $ \p -> do + poke p (fromIntegral (w) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w16 w 8) :: Word8) +{-# INLINE putWord16le #-} + +-- putWord16le w16 = writeN 2 (\p -> poke (castPtr p) w16) + +-- | Write a Word32 in big endian format +putWord32be :: Word32 -> Builder +putWord32be w = writeN 4 $ \p -> do + poke p (fromIntegral (shiftr_w32 w 24) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w32 w 16) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w32 w 8) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (w) :: Word8) +{-# INLINE putWord32be #-} + +-- +-- a data type to tag Put/Check. writes construct these which are then +-- inlined and flattened. matching Checks will be more robust with rules. +-- + +-- | Write a Word32 in little endian format +putWord32le :: Word32 -> Builder +putWord32le w = writeN 4 $ \p -> do + poke p (fromIntegral (w) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w32 w 8) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w32 w 16) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (shiftr_w32 w 24) :: Word8) +{-# INLINE putWord32le #-} + +-- on a little endian machine: +-- putWord32le w32 = writeN 4 (\p -> poke (castPtr p) w32) + +-- | Write a Word64 in big endian format +putWord64be :: Word64 -> Builder +#if WORD_SIZE_IN_BITS < 64 +-- +-- To avoid expensive 64 bit shifts on 32 bit machines, we cast to +-- Word32, and write that +-- +putWord64be w = + let a = fromIntegral (shiftr_w64 w 32) :: Word32 + b = fromIntegral w :: Word32 + in writeN 8 $ \p -> do + poke p (fromIntegral (shiftr_w32 a 24) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w32 a 16) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w32 a 8) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (a) :: Word8) + poke (p `plusPtr` 4) (fromIntegral (shiftr_w32 b 24) :: Word8) + poke (p `plusPtr` 5) (fromIntegral (shiftr_w32 b 16) :: Word8) + poke (p `plusPtr` 6) (fromIntegral (shiftr_w32 b 8) :: Word8) + poke (p `plusPtr` 7) (fromIntegral (b) :: Word8) +#else +putWord64be w = writeN 8 $ \p -> do + poke p (fromIntegral (shiftr_w64 w 56) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w64 w 48) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w64 w 40) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (shiftr_w64 w 32) :: Word8) + poke (p `plusPtr` 4) (fromIntegral (shiftr_w64 w 24) :: Word8) + poke (p `plusPtr` 5) (fromIntegral (shiftr_w64 w 16) :: Word8) + poke (p `plusPtr` 6) (fromIntegral (shiftr_w64 w 8) :: Word8) + poke (p `plusPtr` 7) (fromIntegral (w) :: Word8) +#endif +{-# INLINE putWord64be #-} + +-- | Write a Word64 in little endian format +putWord64le :: Word64 -> Builder + +#if WORD_SIZE_IN_BITS < 64 +putWord64le w = + let b = fromIntegral (shiftr_w64 w 32) :: Word32 + a = fromIntegral w :: Word32 + in writeN 8 $ \p -> do + poke (p) (fromIntegral (a) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w32 a 8) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w32 a 16) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (shiftr_w32 a 24) :: Word8) + poke (p `plusPtr` 4) (fromIntegral (b) :: Word8) + poke (p `plusPtr` 5) (fromIntegral (shiftr_w32 b 8) :: Word8) + poke (p `plusPtr` 6) (fromIntegral (shiftr_w32 b 16) :: Word8) + poke (p `plusPtr` 7) (fromIntegral (shiftr_w32 b 24) :: Word8) +#else +putWord64le w = writeN 8 $ \p -> do + poke p (fromIntegral (w) :: Word8) + poke (p `plusPtr` 1) (fromIntegral (shiftr_w64 w 8) :: Word8) + poke (p `plusPtr` 2) (fromIntegral (shiftr_w64 w 16) :: Word8) + poke (p `plusPtr` 3) (fromIntegral (shiftr_w64 w 24) :: Word8) + poke (p `plusPtr` 4) (fromIntegral (shiftr_w64 w 32) :: Word8) + poke (p `plusPtr` 5) (fromIntegral (shiftr_w64 w 40) :: Word8) + poke (p `plusPtr` 6) (fromIntegral (shiftr_w64 w 48) :: Word8) + poke (p `plusPtr` 7) (fromIntegral (shiftr_w64 w 56) :: Word8) +#endif +{-# INLINE putWord64le #-} + +-- on a little endian machine: +-- putWord64le w64 = writeN 8 (\p -> poke (castPtr p) w64) + +------------------------------------------------------------------------ +-- Unaligned, word size ops + +-- | /O(1)./ A Builder taking a single native machine word. The word is +-- written in host order, host endian form, for the machine you're on. +-- On a 64 bit machine the Word is an 8 byte value, on a 32 bit machine, +-- 4 bytes. Values written this way are not portable to +-- different endian or word sized machines, without conversion. +-- +putWordhost :: Word -> Builder +putWordhost w = writeNbytes (sizeOf (undefined :: Word)) (\p -> poke p w) +{-# INLINE putWordhost #-} + +-- | Write a Word16 in native host order and host endianness. +-- 2 bytes will be written, unaligned. +putWord16host :: Word16 -> Builder +putWord16host w16 = writeNbytes (sizeOf (undefined :: Word16)) (\p -> poke p w16) +{-# INLINE putWord16host #-} + +-- | Write a Word32 in native host order and host endianness. +-- 4 bytes will be written, unaligned. +putWord32host :: Word32 -> Builder +putWord32host w32 = writeNbytes (sizeOf (undefined :: Word32)) (\p -> poke p w32) +{-# INLINE putWord32host #-} + +-- | Write a Word64 in native host order. +-- On a 32 bit machine we write two host order Word32s, in big endian form. +-- 8 bytes will be written, unaligned. +putWord64host :: Word64 -> Builder +putWord64host w = writeNbytes (sizeOf (undefined :: Word64)) (\p -> poke p w) +{-# INLINE putWord64host #-} + +------------------------------------------------------------------------ +-- Unchecked shifts + +{-# INLINE shiftr_w16 #-} +shiftr_w16 :: Word16 -> Int -> Word16 +{-# INLINE shiftr_w32 #-} +shiftr_w32 :: Word32 -> Int -> Word32 +{-# INLINE shiftr_w64 #-} +shiftr_w64 :: Word64 -> Int -> Word64 + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +shiftr_w16 (W16# w) (I# i) = W16# (w `uncheckedShiftRL#` i) +shiftr_w32 (W32# w) (I# i) = W32# (w `uncheckedShiftRL#` i) + +#if WORD_SIZE_IN_BITS < 64 +shiftr_w64 (W64# w) (I# i) = W64# (w `uncheckedShiftRL64#` i) + +#if __GLASGOW_HASKELL__ <= 606 +-- Exported by GHC.Word in GHC 6.8 and higher +foreign import ccall unsafe "stg_uncheckedShiftRL64" + uncheckedShiftRL64# :: Word64# -> Int# -> Word64# +#endif + +#else +shiftr_w64 (W64# w) (I# i) = W64# (w `uncheckedShiftRL#` i) +#endif + +#else +shiftr_w16 = shiftR +shiftr_w32 = shiftR +shiftr_w64 = shiftR +#endif diff --git a/src/Data/Binary/Get.hs b/src/Data/Binary/Get.hs new file mode 100644 index 000000000..d92567e45 --- /dev/null +++ b/src/Data/Binary/Get.hs @@ -0,0 +1,539 @@ +{-# LANGUAGE CPP #-} +{-# OPTIONS_GHC -fglasgow-exts #-} +-- for unboxed shifts + +----------------------------------------------------------------------------- +-- | +-- Module : Data.Binary.Get +-- Copyright : Lennart Kolmodin +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Lennart Kolmodin +-- Stability : experimental +-- Portability : portable to Hugs and GHC. +-- +-- The Get monad. A monad for efficiently building structures from +-- encoded lazy ByteStrings +-- +----------------------------------------------------------------------------- + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +#include "MachDeps.h" +#endif + +module Data.Binary.Get ( + + -- * The Get type + Get + , runGet + , runGetState + + -- * Parsing + , skip + , uncheckedSkip + , lookAhead + , lookAheadM + , lookAheadE + , uncheckedLookAhead + + -- * Utility + , bytesRead + , getBytes + , remaining + , isEmpty + + -- * Parsing particular types + , getWord8 + + -- ** ByteStrings + , getByteString + , getLazyByteString + , getLazyByteStringNul + , getRemainingLazyByteString + + -- ** Big-endian reads + , getWord16be + , getWord32be + , getWord64be + + -- ** Little-endian reads + , getWord16le + , getWord32le + , getWord64le + + -- ** Host-endian, unaligned reads + , getWordhost + , getWord16host + , getWord32host + , getWord64host + + ) where + +import Control.Monad (when,liftM,ap) +import Control.Monad.Fix +import Data.Maybe (isNothing) + +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as L + +#ifdef BYTESTRING_IN_BASE +import qualified Data.ByteString.Base as B +#else +import qualified Data.ByteString.Internal as B +import qualified Data.ByteString.Lazy.Internal as L +#endif + +#ifdef APPLICATIVE_IN_BASE +import Control.Applicative (Applicative(..)) +#endif + +import Foreign + +-- used by splitAtST +import Control.Monad.ST +import Data.STRef + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +import GHC.Base +import GHC.Word +import GHC.Int +#endif + +-- | The parse state +data S = S {-# UNPACK #-} !B.ByteString -- current chunk + L.ByteString -- the rest of the input + {-# UNPACK #-} !Int64 -- bytes read + +-- | The Get monad is just a State monad carrying around the input ByteString +newtype Get a = Get { unGet :: S -> (a, S) } + +instance Functor Get where + fmap f m = Get (\s -> let (a, s') = unGet m s + in (f a, s')) + {-# INLINE fmap #-} + +#ifdef APPLICATIVE_IN_BASE +instance Applicative Get where + pure = return + (<*>) = ap +#endif + +instance Monad Get where + return a = Get (\s -> (a, s)) + {-# INLINE return #-} + + m >>= k = Get (\s -> case unGet m s of + (a, s') -> unGet (k a) s') + {-# INLINE (>>=) #-} + + fail = failDesc + +instance MonadFix Get where + mfix f = Get (\s -> let (a,s') = unGet (f a) s + in (a,s')) + +------------------------------------------------------------------------ + +get :: Get S +get = Get (\s -> (s, s)) + +put :: S -> Get () +put s = Get (\_ -> ((), s)) + +------------------------------------------------------------------------ + +initState :: L.ByteString -> S +initState xs = mkState xs 0 +{-# INLINE initState #-} + +{- +initState (B.LPS xs) = + case xs of + [] -> S B.empty L.empty 0 + (x:xs') -> S x (B.LPS xs') 0 +-} + +#ifndef BYTESTRING_IN_BASE +mkState :: L.ByteString -> Int64 -> S +mkState l = case l of + L.Empty -> S B.empty L.empty + L.Chunk x xs -> S x xs +{-# INLINE mkState #-} + +#else +mkState :: L.ByteString -> Int64 -> S +mkState (B.LPS xs) = + case xs of + [] -> S B.empty L.empty + (x:xs') -> S x (B.LPS xs') +#endif + +-- | Run the Get monad applies a 'get'-based parser on the input ByteString +runGet :: Get a -> L.ByteString -> a +runGet m str = case unGet m (initState str) of (a, _) -> a + +-- | Run the Get monad applies a 'get'-based parser on the input +-- ByteString. Additional to the result of get it returns the number of +-- consumed bytes and the rest of the input. +runGetState :: Get a -> L.ByteString -> Int64 -> (a, L.ByteString, Int64) +runGetState m str off = + case unGet m (mkState str off) of + (a, ~(S s ss newOff)) -> (a, s `join` ss, newOff) + +------------------------------------------------------------------------ + +failDesc :: String -> Get a +failDesc err = do + S _ _ bytes <- get + Get (error (err ++ ". Failed reading at byte position " ++ show bytes)) + +-- | Skip ahead @n@ bytes. Fails if fewer than @n@ bytes are available. +skip :: Int -> Get () +skip n = readN (fromIntegral n) (const ()) + +-- | Skip ahead @n@ bytes. No error if there isn't enough bytes. +uncheckedSkip :: Int64 -> Get () +uncheckedSkip n = do + S s ss bytes <- get + if fromIntegral (B.length s) >= n + then put (S (B.drop (fromIntegral n) s) ss (bytes + n)) + else do + let rest = L.drop (n - fromIntegral (B.length s)) ss + put $! mkState rest (bytes + n) + +-- | Run @ga@, but return without consuming its input. +-- Fails if @ga@ fails. +lookAhead :: Get a -> Get a +lookAhead ga = do + s <- get + a <- ga + put s + return a + +-- | Like 'lookAhead', but consume the input if @gma@ returns 'Just _'. +-- Fails if @gma@ fails. +lookAheadM :: Get (Maybe a) -> Get (Maybe a) +lookAheadM gma = do + s <- get + ma <- gma + when (isNothing ma) $ + put s + return ma + +-- | Like 'lookAhead', but consume the input if @gea@ returns 'Right _'. +-- Fails if @gea@ fails. +lookAheadE :: Get (Either a b) -> Get (Either a b) +lookAheadE gea = do + s <- get + ea <- gea + case ea of + Left _ -> put s + _ -> return () + return ea + +-- | Get the next up to @n@ bytes as a lazy ByteString, without consuming them. +uncheckedLookAhead :: Int64 -> Get L.ByteString +uncheckedLookAhead n = do + S s ss _ <- get + if n <= fromIntegral (B.length s) + then return (L.fromChunks [B.take (fromIntegral n) s]) + else return $ L.take n (s `join` ss) + +------------------------------------------------------------------------ +-- Utility + +-- | Get the total number of bytes read to this point. +bytesRead :: Get Int64 +bytesRead = do + S _ _ b <- get + return b + +-- | Get the number of remaining unparsed bytes. +-- Useful for checking whether all input has been consumed. +-- Note that this forces the rest of the input. +remaining :: Get Int64 +remaining = do + S s ss _ <- get + return (fromIntegral (B.length s) + L.length ss) + +-- | Test whether all input has been consumed, +-- i.e. there are no remaining unparsed bytes. +isEmpty :: Get Bool +isEmpty = do + S s ss _ <- get + return (B.null s && L.null ss) + +------------------------------------------------------------------------ +-- Utility with ByteStrings + +-- | An efficient 'get' method for strict ByteStrings. Fails if fewer +-- than @n@ bytes are left in the input. +getByteString :: Int -> Get B.ByteString +getByteString n = readN n id +{-# INLINE getByteString #-} + +-- | An efficient 'get' method for lazy ByteStrings. Does not fail if fewer than +-- @n@ bytes are left in the input. +getLazyByteString :: Int64 -> Get L.ByteString +getLazyByteString n = do + S s ss bytes <- get + let big = s `join` ss + case splitAtST n big of + (consume, rest) -> do put $ mkState rest (bytes + n) + return consume +{-# INLINE getLazyByteString #-} + +-- | Get a lazy ByteString that is terminated with a NUL byte. Fails +-- if it reaches the end of input without hitting a NUL. +getLazyByteStringNul :: Get L.ByteString +getLazyByteStringNul = do + S s ss bytes <- get + let big = s `join` ss + (consume, t) = L.break (== 0) big + (h, rest) = L.splitAt 1 t + if L.null h + then fail "too few bytes" + else do + put $ mkState rest (bytes + L.length consume + 1) + return consume +{-# INLINE getLazyByteStringNul #-} + +-- | Get the remaining bytes as a lazy ByteString +getRemainingLazyByteString :: Get L.ByteString +getRemainingLazyByteString = do + S s ss _ <- get + return (s `join` ss) + +------------------------------------------------------------------------ +-- Helpers + +-- | Pull @n@ bytes from the input, as a strict ByteString. +getBytes :: Int -> Get B.ByteString +getBytes n = do + S s ss bytes <- get + if n <= B.length s + then do let (consume,rest) = B.splitAt n s + put $! S rest ss (bytes + fromIntegral n) + return $! consume + else + case L.splitAt (fromIntegral n) (s `join` ss) of + (consuming, rest) -> + do let now = B.concat . L.toChunks $ consuming + put $! mkState rest (bytes + fromIntegral n) + -- forces the next chunk before this one is returned + if (B.length now < n) + then + fail "too few bytes" + else + return now +{-# INLINE getBytes #-} +-- ^ important + +#ifndef BYTESTRING_IN_BASE +join :: B.ByteString -> L.ByteString -> L.ByteString +join bb lb + | B.null bb = lb + | otherwise = L.Chunk bb lb + +#else +join :: B.ByteString -> L.ByteString -> L.ByteString +join bb (B.LPS lb) + | B.null bb = B.LPS lb + | otherwise = B.LPS (bb:lb) +#endif + -- don't use L.append, it's strict in it's second argument :/ +{-# INLINE join #-} + +-- | Split a ByteString. If the first result is consumed before the -- +-- second, this runs in constant heap space. +-- +-- You must force the returned tuple for that to work, e.g. +-- +-- > case splitAtST n xs of +-- > (ys,zs) -> consume ys ... consume zs +-- +splitAtST :: Int64 -> L.ByteString -> (L.ByteString, L.ByteString) +splitAtST i ps | i <= 0 = (L.empty, ps) +#ifndef BYTESTRING_IN_BASE +splitAtST i ps = runST ( + do r <- newSTRef undefined + xs <- first r i ps + ys <- unsafeInterleaveST (readSTRef r) + return (xs, ys)) + + where + first r 0 xs@(L.Chunk _ _) = writeSTRef r xs >> return L.Empty + first r _ L.Empty = writeSTRef r L.Empty >> return L.Empty + + first r n (L.Chunk x xs) + | n < l = do writeSTRef r (L.Chunk (B.drop (fromIntegral n) x) xs) + return $ L.Chunk (B.take (fromIntegral n) x) L.Empty + | otherwise = do writeSTRef r (L.drop (n - l) xs) + liftM (L.Chunk x) $ unsafeInterleaveST (first r (n - l) xs) + + where l = fromIntegral (B.length x) +#else +splitAtST i (B.LPS ps) = runST ( + do r <- newSTRef undefined + xs <- first r i ps + ys <- unsafeInterleaveST (readSTRef r) + return (B.LPS xs, B.LPS ys)) + + where first r 0 xs = writeSTRef r xs >> return [] + first r _ [] = writeSTRef r [] >> return [] + first r n (x:xs) + | n < l = do writeSTRef r (B.drop (fromIntegral n) x : xs) + return [B.take (fromIntegral n) x] + | otherwise = do writeSTRef r (L.toChunks (L.drop (n - l) (B.LPS xs))) + fmap (x:) $ unsafeInterleaveST (first r (n - l) xs) + + where l = fromIntegral (B.length x) +#endif +{-# INLINE splitAtST #-} + +-- Pull n bytes from the input, and apply a parser to those bytes, +-- yielding a value. If less than @n@ bytes are available, fail with an +-- error. This wraps @getBytes@. +readN :: Int -> (B.ByteString -> a) -> Get a +readN n f = fmap f $ getBytes n +{-# INLINE readN #-} +-- ^ important + +------------------------------------------------------------------------ +-- Primtives + +-- helper, get a raw Ptr onto a strict ByteString copied out of the +-- underlying lazy byteString. So many indirections from the raw parser +-- state that my head hurts... + +getPtr :: Storable a => Int -> Get a +getPtr n = do + (fp,o,_) <- readN n B.toForeignPtr + return . B.inlinePerformIO $ withForeignPtr fp $ \p -> peek (castPtr $ p `plusPtr` o) +{-# INLINE getPtr #-} + +------------------------------------------------------------------------ + +-- | Read a Word8 from the monad state +getWord8 :: Get Word8 +getWord8 = getPtr (sizeOf (undefined :: Word8)) +{-# INLINE getWord8 #-} + +-- | Read a Word16 in big endian format +getWord16be :: Get Word16 +getWord16be = do + s <- readN 2 id + return $! (fromIntegral (s `B.index` 0) `shiftl_w16` 8) .|. + (fromIntegral (s `B.index` 1)) +{-# INLINE getWord16be #-} + +-- | Read a Word16 in little endian format +getWord16le :: Get Word16 +getWord16le = do + s <- readN 2 id + return $! (fromIntegral (s `B.index` 1) `shiftl_w16` 8) .|. + (fromIntegral (s `B.index` 0) ) +{-# INLINE getWord16le #-} + +-- | Read a Word32 in big endian format +getWord32be :: Get Word32 +getWord32be = do + s <- readN 4 id + return $! (fromIntegral (s `B.index` 0) `shiftl_w32` 24) .|. + (fromIntegral (s `B.index` 1) `shiftl_w32` 16) .|. + (fromIntegral (s `B.index` 2) `shiftl_w32` 8) .|. + (fromIntegral (s `B.index` 3) ) +{-# INLINE getWord32be #-} + +-- | Read a Word32 in little endian format +getWord32le :: Get Word32 +getWord32le = do + s <- readN 4 id + return $! (fromIntegral (s `B.index` 3) `shiftl_w32` 24) .|. + (fromIntegral (s `B.index` 2) `shiftl_w32` 16) .|. + (fromIntegral (s `B.index` 1) `shiftl_w32` 8) .|. + (fromIntegral (s `B.index` 0) ) +{-# INLINE getWord32le #-} + +-- | Read a Word64 in big endian format +getWord64be :: Get Word64 +getWord64be = do + s <- readN 8 id + return $! (fromIntegral (s `B.index` 0) `shiftl_w64` 56) .|. + (fromIntegral (s `B.index` 1) `shiftl_w64` 48) .|. + (fromIntegral (s `B.index` 2) `shiftl_w64` 40) .|. + (fromIntegral (s `B.index` 3) `shiftl_w64` 32) .|. + (fromIntegral (s `B.index` 4) `shiftl_w64` 24) .|. + (fromIntegral (s `B.index` 5) `shiftl_w64` 16) .|. + (fromIntegral (s `B.index` 6) `shiftl_w64` 8) .|. + (fromIntegral (s `B.index` 7) ) +{-# INLINE getWord64be #-} + +-- | Read a Word64 in little endian format +getWord64le :: Get Word64 +getWord64le = do + s <- readN 8 id + return $! (fromIntegral (s `B.index` 7) `shiftl_w64` 56) .|. + (fromIntegral (s `B.index` 6) `shiftl_w64` 48) .|. + (fromIntegral (s `B.index` 5) `shiftl_w64` 40) .|. + (fromIntegral (s `B.index` 4) `shiftl_w64` 32) .|. + (fromIntegral (s `B.index` 3) `shiftl_w64` 24) .|. + (fromIntegral (s `B.index` 2) `shiftl_w64` 16) .|. + (fromIntegral (s `B.index` 1) `shiftl_w64` 8) .|. + (fromIntegral (s `B.index` 0) ) +{-# INLINE getWord64le #-} + +------------------------------------------------------------------------ +-- Host-endian reads + +-- | /O(1)./ Read a single native machine word. The word is read in +-- host order, host endian form, for the machine you're on. On a 64 bit +-- machine the Word is an 8 byte value, on a 32 bit machine, 4 bytes. +getWordhost :: Get Word +getWordhost = getPtr (sizeOf (undefined :: Word)) +{-# INLINE getWordhost #-} + +-- | /O(1)./ Read a 2 byte Word16 in native host order and host endianness. +getWord16host :: Get Word16 +getWord16host = getPtr (sizeOf (undefined :: Word16)) +{-# INLINE getWord16host #-} + +-- | /O(1)./ Read a Word32 in native host order and host endianness. +getWord32host :: Get Word32 +getWord32host = getPtr (sizeOf (undefined :: Word32)) +{-# INLINE getWord32host #-} + +-- | /O(1)./ Read a Word64 in native host order and host endianess. +getWord64host :: Get Word64 +getWord64host = getPtr (sizeOf (undefined :: Word64)) +{-# INLINE getWord64host #-} + +------------------------------------------------------------------------ +-- Unchecked shifts + +shiftl_w16 :: Word16 -> Int -> Word16 +shiftl_w32 :: Word32 -> Int -> Word32 +shiftl_w64 :: Word64 -> Int -> Word64 + +#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__) +shiftl_w16 (W16# w) (I# i) = W16# (w `uncheckedShiftL#` i) +shiftl_w32 (W32# w) (I# i) = W32# (w `uncheckedShiftL#` i) + +#if WORD_SIZE_IN_BITS < 64 +shiftl_w64 (W64# w) (I# i) = W64# (w `uncheckedShiftL64#` i) + +#if __GLASGOW_HASKELL__ <= 606 +-- Exported by GHC.Word in GHC 6.8 and higher +foreign import ccall unsafe "stg_uncheckedShiftL64" + uncheckedShiftL64# :: Word64# -> Int# -> Word64# +#endif + +#else +shiftl_w64 (W64# w) (I# i) = W64# (w `uncheckedShiftL#` i) +#endif + +#else +shiftl_w16 = shiftL +shiftl_w32 = shiftL +shiftl_w64 = shiftL +#endif diff --git a/src/Data/Binary/Put.hs b/src/Data/Binary/Put.hs new file mode 100644 index 000000000..353bfb7b1 --- /dev/null +++ b/src/Data/Binary/Put.hs @@ -0,0 +1,199 @@ +{-# LANGUAGE CPP #-} +----------------------------------------------------------------------------- +-- | +-- Module : Data.Binary.Put +-- Copyright : Lennart Kolmodin +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Lennart Kolmodin +-- Stability : stable +-- Portability : Portable to Hugs and GHC. Requires MPTCs +-- +-- The Put monad. A monad for efficiently constructing lazy bytestrings. +-- +----------------------------------------------------------------------------- + +module Data.Binary.Put ( + + -- * The Put type + Put + , PutM(..) + , runPut + + -- * Flushing the implicit parse state + , flush + + -- * Primitives + , putWord8 + , putByteString + , putLazyByteString + + -- * Big-endian primitives + , putWord16be + , putWord32be + , putWord64be + + -- * Little-endian primitives + , putWord16le + , putWord32le + , putWord64le + + -- * Host-endian, unaligned writes + , putWordhost -- :: Word -> Put + , putWord16host -- :: Word16 -> Put + , putWord32host -- :: Word32 -> Put + , putWord64host -- :: Word64 -> Put + + ) where + +import Data.Monoid +import Data.Binary.Builder (Builder, toLazyByteString) +import qualified Data.Binary.Builder as B + +import Data.Word +import qualified Data.ByteString as S +import qualified Data.ByteString.Lazy as L + +#ifdef APPLICATIVE_IN_BASE +import Control.Applicative +#endif + + +------------------------------------------------------------------------ + +-- XXX Strict in buffer only. +data PairS a = PairS a {-# UNPACK #-}!Builder + +sndS :: PairS a -> Builder +sndS (PairS _ b) = b + +-- | The PutM type. A Writer monad over the efficient Builder monoid. +newtype PutM a = Put { unPut :: PairS a } + +-- | Put merely lifts Builder into a Writer monad, applied to (). +type Put = PutM () + +instance Functor PutM where + fmap f m = Put $ let PairS a w = unPut m in PairS (f a) w + {-# INLINE fmap #-} + +#ifdef APPLICATIVE_IN_BASE +instance Applicative PutM where + pure = return + m <*> k = Put $ + let PairS f w = unPut m + PairS x w' = unPut k + in PairS (f x) (w `mappend` w') +#endif + +-- Standard Writer monad, with aggressive inlining +instance Monad PutM where + return a = Put $ PairS a mempty + {-# INLINE return #-} + + m >>= k = Put $ + let PairS a w = unPut m + PairS b w' = unPut (k a) + in PairS b (w `mappend` w') + {-# INLINE (>>=) #-} + + m >> k = Put $ + let PairS _ w = unPut m + PairS b w' = unPut k + in PairS b (w `mappend` w') + {-# INLINE (>>) #-} + +tell :: Builder -> Put +tell b = Put $ PairS () b +{-# INLINE tell #-} + +-- | Run the 'Put' monad with a serialiser +runPut :: Put -> L.ByteString +runPut = toLazyByteString . sndS . unPut +{-# INLINE runPut #-} + +------------------------------------------------------------------------ + +-- | Pop the ByteString we have constructed so far, if any, yielding a +-- new chunk in the result ByteString. +flush :: Put +flush = tell B.flush +{-# INLINE flush #-} + +-- | Efficiently write a byte into the output buffer +putWord8 :: Word8 -> Put +putWord8 = tell . B.singleton +{-# INLINE putWord8 #-} + +-- | An efficient primitive to write a strict ByteString into the output buffer. +-- It flushes the current buffer, and writes the argument into a new chunk. +putByteString :: S.ByteString -> Put +putByteString = tell . B.fromByteString +{-# INLINE putByteString #-} + +-- | Write a lazy ByteString efficiently, simply appending the lazy +-- ByteString chunks to the output buffer +putLazyByteString :: L.ByteString -> Put +putLazyByteString = tell . B.fromLazyByteString +{-# INLINE putLazyByteString #-} + +-- | Write a Word16 in big endian format +putWord16be :: Word16 -> Put +putWord16be = tell . B.putWord16be +{-# INLINE putWord16be #-} + +-- | Write a Word16 in little endian format +putWord16le :: Word16 -> Put +putWord16le = tell . B.putWord16le +{-# INLINE putWord16le #-} + +-- | Write a Word32 in big endian format +putWord32be :: Word32 -> Put +putWord32be = tell . B.putWord32be +{-# INLINE putWord32be #-} + +-- | Write a Word32 in little endian format +putWord32le :: Word32 -> Put +putWord32le = tell . B.putWord32le +{-# INLINE putWord32le #-} + +-- | Write a Word64 in big endian format +putWord64be :: Word64 -> Put +putWord64be = tell . B.putWord64be +{-# INLINE putWord64be #-} + +-- | Write a Word64 in little endian format +putWord64le :: Word64 -> Put +putWord64le = tell . B.putWord64le +{-# INLINE putWord64le #-} + +------------------------------------------------------------------------ + +-- | /O(1)./ Write a single native machine word. The word is +-- written in host order, host endian form, for the machine you're on. +-- On a 64 bit machine the Word is an 8 byte value, on a 32 bit machine, +-- 4 bytes. Values written this way are not portable to +-- different endian or word sized machines, without conversion. +-- +putWordhost :: Word -> Put +putWordhost = tell . B.putWordhost +{-# INLINE putWordhost #-} + +-- | /O(1)./ Write a Word16 in native host order and host endianness. +-- For portability issues see @putWordhost@. +putWord16host :: Word16 -> Put +putWord16host = tell . B.putWord16host +{-# INLINE putWord16host #-} + +-- | /O(1)./ Write a Word32 in native host order and host endianness. +-- For portability issues see @putWordhost@. +putWord32host :: Word32 -> Put +putWord32host = tell . B.putWord32host +{-# INLINE putWord32host #-} + +-- | /O(1)./ Write a Word64 in native host order +-- On a 32 bit machine we write two host order Word32s, in big endian form. +-- For portability issues see @putWordhost@. +putWord64host :: Word64 -> Put +putWord64host = tell . B.putWord64host +{-# INLINE putWord64host #-} diff --git a/src/GF/Compile/Export.hs b/src/GF/Compile/Export.hs index 575a9dc84..64b4aeabf 100644 --- a/src/GF/Compile/Export.hs +++ b/src/GF/Compile/Export.hs @@ -2,8 +2,6 @@ module GF.Compile.Export where import PGF.CId import PGF.Data (PGF(..)) -import PGF.Raw.Print (printTree) -import PGF.Raw.Convert (fromPGF) import GF.Compile.GFCCtoHaskell import GF.Compile.GFCCtoProlog import GF.Compile.GFCCtoJS @@ -32,7 +30,6 @@ exportPGF :: Options -> [(FilePath,String)] -- ^ List of recommended file names and contents. exportPGF opts fmt pgf = case fmt of - FmtPGF -> multi "pgf" printPGF FmtPGFPretty -> multi "txt" prPGFPretty FmtJavaScript -> multi "js" pgf2js FmtHaskell -> multi "hs" (grammar2haskell opts name) @@ -65,7 +62,3 @@ outputConcr :: PGF -> CId outputConcr pgf = case cncnames pgf of [] -> error "No concrete syntax." cnc:_ -> cnc - -printPGF :: PGF -> String -printPGF = encodeUTF8 . printTree . fromPGF - diff --git a/src/GF/Compile/GrammarToGFCC.hs b/src/GF/Compile/GrammarToGFCC.hs index 4b5cf24bb..267015fb6 100644 --- a/src/GF/Compile/GrammarToGFCC.hs +++ b/src/GF/Compile/GrammarToGFCC.hs @@ -1,5 +1,5 @@ {-# LANGUAGE PatternGuards #-} -module GF.Compile.GrammarToGFCC (prGrammar2gfcc,mkCanon2gfcc,addParsers) where +module GF.Compile.GrammarToGFCC (mkCanon2gfcc,addParsers) where import GF.Compile.Export import GF.Compile.OptimizeGF (unshareModule) @@ -37,11 +37,6 @@ traceD s t = t -- the main function: generate PGF from GF. - -prGrammar2gfcc :: Options -> String -> SourceGrammar -> (String,String) -prGrammar2gfcc opts cnc gr = (abs,printPGF gc) where - (abs,gc) = mkCanon2gfcc opts cnc gr - mkCanon2gfcc :: Options -> String -> SourceGrammar -> (String,D.PGF) mkCanon2gfcc opts cnc gr = (prIdent abs, (canon2gfcc opts pars . reorder abs . canon2canon abs) gr) diff --git a/src/GF/Infra/Option.hs b/src/GF/Infra/Option.hs index 25ccb09a2..f9221b233 100644 --- a/src/GF/Infra/Option.hs +++ b/src/GF/Infra/Option.hs @@ -80,8 +80,7 @@ data Phase = Preproc | Convert | Compile | Link data Encoding = UTF_8 | ISO_8859_1 | CP_1251 deriving (Eq,Ord) -data OutputFormat = FmtPGF - | FmtPGFPretty +data OutputFormat = FmtPGFPretty | FmtJavaScript | FmtHaskell | FmtProlog @@ -239,7 +238,7 @@ defaultFlags = Flags { optShowCPUTime = False, optEmitGFO = True, optGFODir = ".", - optOutputFormats = [FmtPGF], + optOutputFormats = [], optSISR = Nothing, optHaskellOptions = Set.empty, optLexicalCats = Set.empty, @@ -427,8 +426,7 @@ optDescr = outputFormats :: [(String,OutputFormat)] outputFormats = - [("pgf", FmtPGF), - ("pgf-pretty", FmtPGFPretty), + [("pgf-pretty", FmtPGFPretty), ("js", FmtJavaScript), ("haskell", FmtHaskell), ("prolog", FmtProlog), diff --git a/src/GFC.hs b/src/GFC.hs index 8af23d6e1..337acb87a 100644 --- a/src/GFC.hs +++ b/src/GFC.hs @@ -4,8 +4,6 @@ module GFC (mainGFC) where import PGF import PGF.CId import PGF.Data -import PGF.Raw.Parse -import PGF.Raw.Convert import GF.Compile import GF.Compile.Export @@ -16,6 +14,7 @@ import GF.Infra.Option import GF.Data.ErrM import Data.Maybe +import Data.Binary import System.FilePath @@ -57,10 +56,17 @@ unionPGFFiles opts fs = where readPGFVerbose f = putPointE Normal opts ("Reading " ++ f ++ "...") $ ioeIO $ readPGF f writeOutputs :: Options -> PGF -> IOE () -writeOutputs opts pgf = - sequence_ [writeOutput opts name str - | fmt <- flag optOutputFormats opts, - (name,str) <- exportPGF opts fmt pgf] +writeOutputs opts pgf = do + writePGF opts pgf + sequence_ [writeOutput opts name str + | fmt <- flag optOutputFormats opts, + (name,str) <- exportPGF opts fmt pgf] + +writePGF :: Options -> PGF -> IOE () +writePGF opts pgf = do + let name = fromMaybe (prCId (absname pgf)) (flag optName opts) + outfile = name <.> "pgf" + putPointE Normal opts ("Writing " ++ outfile ++ "...") $ ioeIO $ encodeFile outfile pgf writeOutput :: Options -> FilePath-> String -> IOE () writeOutput opts file str = diff --git a/src/PGF.hs b/src/PGF.hs index 38031dcbd..ac7deb537 100644 --- a/src/PGF.hs +++ b/src/PGF.hs @@ -66,9 +66,7 @@ import PGF.TypeCheck import PGF.Paraphrase import PGF.Macros import PGF.Data -import PGF.Raw.Convert -import PGF.Raw.Parse -import PGF.Raw.Print (printTree) +import PGF.Binary import PGF.Parsing.FCFG import qualified PGF.Parsing.FCFG.Incremental as Incremental import qualified GF.Compile.GeneratePMCFG as PMCFG @@ -80,6 +78,7 @@ import GF.Data.Utilities (replace) import Data.Char import qualified Data.Map as Map import Data.Maybe +import Data.Binary import System.Random (newStdGen) import Control.Monad @@ -210,9 +209,8 @@ readLanguage = readCId showLanguage = prCId readPGF f = do - s <- readFile f >>= return . decodeUTF8 -- pgf is in UTF8, internal in unicode - g <- parseGrammar s - return $! addParsers $ toPGF g + g <- decodeFile f + return $! addParsers g -- Adds parsers for all concretes that don't have a parser and that have parser=ondemand. addParsers :: PGF -> PGF diff --git a/src/PGF/Raw/Abstract.hs b/src/PGF/Raw/Abstract.hs deleted file mode 100644 index 77d919a2d..000000000 --- a/src/PGF/Raw/Abstract.hs +++ /dev/null @@ -1,14 +0,0 @@ -module PGF.Raw.Abstract where - -data Grammar = - Grm [RExp] - deriving (Eq,Ord,Show) - -data RExp = - App String [RExp] - | AInt Integer - | AStr String - | AFlt Double - | AMet - deriving (Eq,Ord,Show) - diff --git a/src/PGF/Raw/Convert.hs b/src/PGF/Raw/Convert.hs deleted file mode 100644 index 85799a3a2..000000000 --- a/src/PGF/Raw/Convert.hs +++ /dev/null @@ -1,273 +0,0 @@ -module PGF.Raw.Convert (toPGF,fromPGF) where - -import PGF.CId -import PGF.Data -import PGF.Raw.Abstract - -import Data.Array.IArray -import qualified Data.Map as Map -import qualified Data.Set as Set -import qualified Data.IntMap as IntMap - -pgfMajorVersion, pgfMinorVersion :: Integer -(pgfMajorVersion, pgfMinorVersion) = (1,0) - --- convert parsed grammar to internal PGF - -toPGF :: Grammar -> PGF -toPGF (Grm [ - App "pgf" (AInt v1 : AInt v2 : App a []:cs), - App "flags" gfs, - ab@( - App "abstract" [ - App "fun" fs, - App "cat" cts - ]), - App "concrete" ccs - ]) = let pgf = PGF { - absname = mkCId a, - cncnames = [mkCId c | App c [] <- cs], - gflags = Map.fromAscList [(mkCId f,v) | App f [AStr v] <- gfs], - abstract = - let - aflags = Map.fromAscList [(mkCId f,v) | App f [AStr v] <- gfs] - lfuns = [(mkCId f,(toType typ,toExp def)) | App f [typ, def] <- fs] - funs = Map.fromAscList lfuns - lcats = [(mkCId c, Prelude.map toHypo hyps) | App c hyps <- cts] - cats = Map.fromAscList lcats - catfuns = Map.fromAscList - [(cat,[f | (f, (DTyp _ c _,_)) <- lfuns, c==cat]) | (cat,_) <- lcats] - in Abstr aflags funs cats catfuns, - concretes = Map.fromAscList [(mkCId lang, toConcr pgf ts) | App lang ts <- ccs] - } - in pgf - where - -toConcr :: PGF -> [RExp] -> Concr -toConcr pgf rexp = - let cnc = foldl add (Concr {cflags = Map.empty, - lins = Map.empty, - opers = Map.empty, - lincats = Map.empty, - lindefs = Map.empty, - printnames = Map.empty, - paramlincats = Map.empty, - parser = Nothing - }) rexp - in cnc - where - add :: Concr -> RExp -> Concr - add cnc (App "flags" ts) = cnc { cflags = Map.fromAscList [(mkCId f,v) | App f [AStr v] <- ts] } - add cnc (App "lin" ts) = cnc { lins = mkTermMap ts } - add cnc (App "oper" ts) = cnc { opers = mkTermMap ts } - add cnc (App "lincat" ts) = cnc { lincats = mkTermMap ts } - add cnc (App "lindef" ts) = cnc { lindefs = mkTermMap ts } - add cnc (App "printname" ts) = cnc { printnames = mkTermMap ts } - add cnc (App "param" ts) = cnc { paramlincats = mkTermMap ts } - add cnc (App "parser" ts) = cnc { parser = Just (toPInfo ts) } - -toPInfo :: [RExp] -> ParserInfo -toPInfo [App "functions" fs, App "sequences" ss, App "productions" ps,App "categories" (t:cs)] = - ParserInfo { functions = functions - , sequences = seqs - , productions = productions - , startCats = cats - , totalCats = expToInt t - } - where - functions = mkArray (map toFFun fs) - seqs = mkArray (map toFSeq ss) - productions = IntMap.fromList (map toProductionSet ps) - cats = Map.fromList [(mkCId c, (map expToInt xs)) | App c xs <- cs] - - toFFun :: RExp -> FFun - toFFun (App f [App "P" ts,App "R" ls]) = FFun fun prof lins - where - fun = mkCId f - prof = map toProfile ts - lins = mkArray [fromIntegral seqid | AInt seqid <- ls] - - toProfile :: RExp -> Profile - toProfile AMet = [] - toProfile (App "_A" [t]) = [expToInt t] - toProfile (App "_U" ts) = [expToInt t | App "_A" [t] <- ts] - - toFSeq :: RExp -> FSeq - toFSeq (App "seq" ss) = mkArray [toSymbol s | s <- ss] - - toProductionSet :: RExp -> (FCat,Set.Set Production) - toProductionSet (App "td" (rt : xs)) = (expToInt rt, Set.fromList (map toProduction xs)) - where - toProduction (App "A" (ruleid : at)) = FApply (expToInt ruleid) (map expToInt at) - toProduction (App "C" [fcat]) = FCoerce (expToInt fcat) - -toSymbol :: RExp -> FSymbol -toSymbol (App "P" [n,l]) = FSymCat (expToInt n) (expToInt l) -toSymbol (App "PL" [n,l]) = FSymLit (expToInt n) (expToInt l) -toSymbol (App "KP" (d:alts)) = FSymTok (toKP d alts) -toSymbol (AStr t) = FSymTok (KS t) - -toType :: RExp -> Type -toType e = case e of - App cat [App "H" hypos, App "X" exps] -> - DTyp (map toHypo hypos) (mkCId cat) (map toExp exps) - _ -> error $ "type " ++ show e - -toHypo :: RExp -> Hypo -toHypo e = case e of - App x [typ] -> Hyp (mkCId x) (toType typ) - _ -> error $ "hypo " ++ show e - -toExp :: RExp -> Expr -toExp e = case e of - App "Abs" [App x [], exp] -> EAbs (mkCId x) (toExp exp) - App "App" [e1,e2] -> EApp (toExp e1) (toExp e2) - App "Eq" eqs -> EEq [Equ (map toExp ps) (toExp v) | App "E" (v:ps) <- eqs] - App "Var" [App i []] -> EVar (mkCId i) - AMet -> EMeta 0 - AInt i -> ELit (LInt i) - AFlt i -> ELit (LFlt i) - AStr i -> ELit (LStr i) - _ -> error $ "exp " ++ show e - -toTerm :: RExp -> Term -toTerm e = case e of - App "R" es -> R (map toTerm es) - App "S" es -> S (map toTerm es) - App "FV" es -> FV (map toTerm es) - App "P" [e,v] -> P (toTerm e) (toTerm v) - App "W" [AStr s,v] -> W s (toTerm v) - App "A" [AInt i] -> V (fromInteger i) - App f [] -> F (mkCId f) - AInt i -> C (fromInteger i) - AMet -> TM "?" - App "KP" (d:alts) -> K (toKP d alts) - AStr s -> K (KS s) - _ -> error $ "term " ++ show e - -toKP d alts = KP (toStr d) (map toAlt alts) - where - toStr (App "S" vs) = [v | AStr v <- vs] - toAlt (App "A" [x,y]) = Alt (toStr x) (toStr y) - - ------------------------------- ---- from internal to parser -- ------------------------------- - -fromPGF :: PGF -> Grammar -fromPGF pgf = Grm [ - App "pgf" (AInt pgfMajorVersion:AInt pgfMinorVersion - : App (prCId (absname pgf)) [] : map (flip App [] . prCId) (cncnames pgf)), - App "flags" [App (prCId f) [AStr v] | (f,v) <- Map.toList (gflags pgf `Map.union` aflags apgf)], - App "abstract" [ - App "fun" [App (prCId f) [fromType t,fromExp d] | (f,(t,d)) <- Map.toList (funs apgf)], - App "cat" [App (prCId f) (map fromHypo hs) | (f,hs) <- Map.toList (cats apgf)] - ], - App "concrete" [App (prCId lang) (fromConcrete c) | (lang,c) <- Map.toList (concretes pgf)] - ] - where - apgf = abstract pgf - fromConcrete cnc = [ - App "flags" [App (prCId f) [AStr v] | (f,v) <- Map.toList (cflags cnc)], - App "lin" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (lins cnc)], - App "oper" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (opers cnc)], - App "lincat" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (lincats cnc)], - App "lindef" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (lindefs cnc)], - App "printname" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (printnames cnc)], - App "param" [App (prCId f) [fromTerm v] | (f,v) <- Map.toList (paramlincats cnc)] - ] ++ maybe [] (\p -> [fromPInfo p]) (parser cnc) - -fromType :: Type -> RExp -fromType e = case e of - DTyp hypos cat exps -> - App (prCId cat) [ - App "H" (map fromHypo hypos), - App "X" (map fromExp exps)] - -fromHypo :: Hypo -> RExp -fromHypo e = case e of - Hyp x typ -> App (prCId x) [fromType typ] - -fromExp :: Expr -> RExp -fromExp e = case e of - EAbs x exp -> App "Abs" [App (prCId x) [], fromExp exp] - EApp e1 e2 -> App "App" [fromExp e1, fromExp e2] - EVar x -> App "Var" [App (prCId x) []] - ELit (LStr s) -> AStr s - ELit (LFlt d) -> AFlt d - ELit (LInt i) -> AInt (toInteger i) - EMeta _ -> AMet ---- - EEq eqs -> App "Eq" [App "E" (map fromExp (v:ps)) | Equ ps v <- eqs] - -fromTerm :: Term -> RExp -fromTerm e = case e of - R es -> App "R" (map fromTerm es) - S es -> App "S" (map fromTerm es) - FV es -> App "FV" (map fromTerm es) - P e v -> App "P" [fromTerm e, fromTerm v] - W s v -> App "W" [AStr s, fromTerm v] - C i -> AInt (toInteger i) - TM _ -> AMet - F f -> App (prCId f) [] - V i -> App "A" [AInt (toInteger i)] - K t -> fromTokn t - -fromTokn :: Tokn -> RExp -fromTokn (KS s) = AStr s -fromTokn (KP d vs) = App "KP" (str d : [App "A" [str v, str x] | Alt v x <- vs]) - where - str v = App "S" (map AStr v) - --- ** Parsing info - -fromPInfo :: ParserInfo -> RExp -fromPInfo p = App "parser" [ - App "functions" [fromFFun fun | fun <- elems (functions p)], - App "sequences" [fromFSeq seq | seq <- elems (sequences p)], - App "productions" [fromProductionSet xs | xs <- IntMap.toList (productions p)], - App "categories" (intToExp (totalCats p) : [App (prCId f) (map intToExp xs) | (f,xs) <- Map.toList (startCats p)]) - ] - -fromFFun :: FFun -> RExp -fromFFun (FFun fun prof lins) = App (prCId fun) [App "P" (map fromProfile prof), App "R" [intToExp seqid | seqid <- elems lins]] - where - fromProfile :: Profile -> RExp - fromProfile [] = AMet - fromProfile [x] = daughter x - fromProfile args = App "_U" (map daughter args) - - daughter n = App "_A" [intToExp n] - -fromSymbol :: FSymbol -> RExp -fromSymbol (FSymCat n l) = App "P" [intToExp n, intToExp l] -fromSymbol (FSymLit n l) = App "PL" [intToExp n, intToExp l] -fromSymbol (FSymTok t) = fromTokn t - -fromFSeq :: FSeq -> RExp -fromFSeq seq = App "seq" [fromSymbol s | s <- elems seq] - -fromProductionSet :: (FCat,Set.Set Production) -> RExp -fromProductionSet (cat,xs) = App "td" (intToExp cat : map fromPassive (Set.toList xs)) - where - fromPassive (FApply ruleid args) = App "A" (intToExp ruleid : map intToExp args) - fromPassive (FCoerce fcat) = App "C" [intToExp fcat] - --- ** Utilities - -mkTermMap :: [RExp] -> Map.Map CId Term -mkTermMap ts = Map.fromAscList [(mkCId f,toTerm v) | App f [v] <- ts] - -mkArray :: IArray a e => [e] -> a Int e -mkArray xs = listArray (0, length xs - 1) xs - -expToInt :: Integral a => RExp -> a -expToInt (App "neg" [AInt i]) = fromIntegral (negate i) -expToInt (AInt i) = fromIntegral i - -expToStr :: RExp -> String -expToStr (AStr s) = s - -intToExp :: Integral a => a -> RExp -intToExp x | x < 0 = App "neg" [AInt (fromIntegral (negate x))] - | otherwise = AInt (fromIntegral x) diff --git a/src/PGF/Raw/Parse.hs b/src/PGF/Raw/Parse.hs deleted file mode 100644 index 671183ba4..000000000 --- a/src/PGF/Raw/Parse.hs +++ /dev/null @@ -1,101 +0,0 @@ -module PGF.Raw.Parse (parseGrammar) where - -import PGF.CId -import PGF.Raw.Abstract - -import Control.Monad -import Data.Char -import qualified Data.ByteString.Char8 as BS - -parseGrammar :: String -> IO Grammar -parseGrammar s = case runP pGrammar s of - Just (x,"") -> return x - _ -> fail "Parse error" - -pGrammar :: P Grammar -pGrammar = liftM Grm pTerms - -pTerms :: P [RExp] -pTerms = liftM2 (:) (pTerm 1) pTerms <++ (skipSpaces >> return []) - -pTerm :: Int -> P RExp -pTerm n = skipSpaces >> (pParen <++ pApp <++ pNum <++ pStr <++ pMeta) - where pParen = between (char '(') (char ')') (pTerm 0) - pApp = liftM2 App pIdent (if n == 0 then pTerms else return []) - pStr = char '"' >> liftM AStr (manyTill (pEsc <++ get) (char '"')) - pEsc = char '\\' >> get - pNum = do x <- munch1 isDigit - ((char '.' >> munch1 isDigit >>= \y -> return (AFlt (read (x++"."++y)))) - <++ - return (AInt (read x))) - pMeta = char '?' >> return AMet - pIdent = liftM2 (:) (satisfy isIdentFirst) (munch isIdentRest) - isIdentFirst c = c == '_' || isAlpha c - isIdentRest c = c == '_' || c == '\'' || isAlphaNum c - --- Parser combinators with only left-biased choice - -newtype P a = P { runP :: String -> Maybe (a,String) } - -instance Monad P where - return x = P (\ts -> Just (x,ts)) - P p >>= f = P (\ts -> p ts >>= \ (x,ts') -> runP (f x) ts') - fail _ = pfail - -instance MonadPlus P where - mzero = pfail - mplus = (<++) - - -get :: P Char -get = P (\ts -> case ts of - [] -> Nothing - c:cs -> Just (c,cs)) - -look :: P String -look = P (\ts -> Just (ts,ts)) - -(<++) :: P a -> P a -> P a -P p <++ P q = P (\ts -> p ts `mplus` q ts) - -pfail :: P a -pfail = P (\ts -> Nothing) - -satisfy :: (Char -> Bool) -> P Char -satisfy p = do c <- get - if p c then return c else pfail - -char :: Char -> P Char -char c = satisfy (c==) - -string :: String -> P String -string this = look >>= scan this - where - scan [] _ = return this - scan (x:xs) (y:ys) | x == y = get >> scan xs ys - scan _ _ = pfail - -skipSpaces :: P () -skipSpaces = look >>= skip - where - skip (c:s) | isSpace c = get >> skip s - skip _ = return () - -manyTill :: P a -> P end -> P [a] -manyTill p end = scan - where scan = (end >> return []) <++ liftM2 (:) p scan - -munch :: (Char -> Bool) -> P String -munch p = munch1 p <++ return [] - -munch1 :: (Char -> Bool) -> P String -munch1 p = liftM2 (:) (satisfy p) (munch p) - -choice :: [P a] -> P a -choice = msum - -between :: P open -> P close -> P a -> P a -between open close p = do open - x <- p - close - return x diff --git a/src/PGF/Raw/Print.hs b/src/PGF/Raw/Print.hs deleted file mode 100644 index d34adbc2b..000000000 --- a/src/PGF/Raw/Print.hs +++ /dev/null @@ -1,35 +0,0 @@ -module PGF.Raw.Print (printTree) where - -import PGF.CId -import PGF.Raw.Abstract - -import Data.List (intersperse) -import Numeric (showFFloat) -import qualified Data.ByteString.Char8 as BS - -printTree :: Grammar -> String -printTree g = prGrammar g "" - -prGrammar :: Grammar -> ShowS -prGrammar (Grm xs) = prRExpList xs - -prRExp :: Int -> RExp -> ShowS -prRExp _ (App x []) = showString x -prRExp n (App x xs) = p (showString x . showChar ' ' . prRExpList xs) - where p s = if n == 0 then s else showChar '(' . s . showChar ')' -prRExp _ (AInt x) = shows x -prRExp _ (AStr x) = showChar '"' . concatS (map mkEsc x) . showChar '"' -prRExp _ (AFlt x) = showFFloat Nothing x -prRExp _ AMet = showChar '?' - -mkEsc :: Char -> ShowS -mkEsc s = case s of - '"' -> showString "\\\"" - '\\' -> showString "\\\\" - _ -> showChar s - -prRExpList :: [RExp] -> ShowS -prRExpList = concatS . intersperse (showChar ' ') . map (prRExp 1) - -concatS :: [ShowS] -> ShowS -concatS = foldr (.) id