Represent identifiers as UTF-8-encoded ByteStrings

This was a fairly simple change thanks to previous work on making the Ident
type abstract and the fact that PGF.CId already uses UTF-8-encoded
ByteStrings.

One potential pitfall is that Data.ByteString.UTF8 uses the same type for
ByteStrings as Data.ByteString. I renamed ident2bs to ident2utf8 and
bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded
ByteStrings.

Since both the compiler input and identifiers are now UTF-8-encoded
ByteStrings, the lexer now creates identifiers without copying any characters.
**END OF DESCRIPTION***

Place the long patch description above the ***END OF DESCRIPTION*** marker.
The first line of this file will be the patch name.


This patch contains the following changes:

M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3
M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2
M ./src/compiler/GF/Grammar/Binary.hs -5 +1
M ./src/compiler/GF/Grammar/Lexer.x -11 +13
M ./src/compiler/GF/Infra/Ident.hs -19 +36
M ./src/runtime/haskell/PGF.hs -1 +1
M ./src/runtime/haskell/PGF/CId.hs -2 +3
This commit is contained in:
hallgren
2013-11-26 16:12:03 +00:00
parent 9d7fdf7c9a
commit 3f57151cc3
7 changed files with 60 additions and 44 deletions

View File

@@ -298,9 +298,9 @@ checkInfo opts sgr (m,mo) c info = do
-- | for grammars obtained otherwise than by parsing ---- update!! -- | for grammars obtained otherwise than by parsing ---- update!!
checkReservedId :: Ident -> Check () checkReservedId :: Ident -> Check ()
checkReservedId x checkReservedId x =
| isReservedWord (ident2bs x) = checkWarn (text "reserved word used as identifier:" <+> ppIdent x) when (isReservedWord x) $
| otherwise = return () checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
-- auxiliaries -- auxiliaries

View File

@@ -5,7 +5,7 @@ module GF.Compile.GrammarToPGF (mkCanon2pgf) where
import GF.Compile.GeneratePMCFG import GF.Compile.GeneratePMCFG
import GF.Compile.GenerateBC import GF.Compile.GenerateBC
import PGF(CId,mkCId,bsCId) import PGF(CId,mkCId,utf8CId)
import PGF.Data(fidInt,fidFloat,fidString,fidVar) import PGF.Data(fidInt,fidFloat,fidString,fidVar)
import PGF.Optimize(updateProductionIndices) import PGF.Optimize(updateProductionIndices)
--import qualified PGF.Macros as CM --import qualified PGF.Macros as CM
@@ -103,7 +103,7 @@ mkCanon2pgf opts gr am = do
return (seqs, ((m,id), info) : is) return (seqs, ((m,id), info) : is)
i2i :: Ident -> CId i2i :: Ident -> CId
i2i = bsCId . ident2bs i2i = utf8CId . ident2utf8
mkType :: [Ident] -> A.Type -> C.Type mkType :: [Ident] -> A.Type -> C.Type
mkType scope t = mkType scope t =

View File

@@ -30,7 +30,7 @@ import PGF.Data(Literal(..))
gfoVersion = "GF03" gfoVersion = "GF03"
instance Binary Ident where instance Binary Ident where
put id = put (ident2bs id) put id = put (ident2utf8 id)
get = do bs <- get get = do bs <- get
if bs == BS.pack "_" if bs == BS.pack "_"
then return identW then return identW
@@ -295,10 +295,6 @@ instance Binary Label where
1 -> fmap LVar get 1 -> fmap LVar get
_ -> decodingError _ -> decodingError
instance Binary RawIdent where
put = put . rawId2bs
get = fmap rawIdentC get
--putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion --putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion
--getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8) --getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8)
--putGFOVersion = put gfoVersion --putGFOVersion = put gfoVersion

View File

@@ -33,9 +33,9 @@ $u = [.\n] -- universal: any character
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; "{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ; $white+ ;
@rsyms { tok (res (T_Ident . identS)) } @rsyms { tok ident }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) } \' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (res T_Ident . identS . unescapeInitTail . unpack) }
(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) } (\_ | $l)($l | $d | \_ | \')* { tok ident }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) } \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
@@ -43,10 +43,12 @@ $white+ ;
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) } (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{ {
--unpack = BS.unpack unpack = UTF8.toString
unpack = id --unpack = id
tok :: (String->Token) -> Posn -> String -> Token ident = res T_Ident . identC . rawIdentC
--tok :: (String->Token) -> Posn -> String -> Token
tok f p s = f s tok f p s = f s
data Token data Token
@@ -126,14 +128,14 @@ data Token
-- deriving Show -- debug -- deriving Show -- debug
res = eitherResIdent res = eitherResIdent
eitherResIdent :: (String -> Token) -> String -> Token eitherResIdent :: (Ident -> Token) -> Ident -> Token
eitherResIdent tv s = eitherResIdent tv s =
case Map.lookup s resWords of case Map.lookup s resWords of
Just t -> t Just t -> t
Nothing -> tv s Nothing -> tv s
isReservedWord :: BS.ByteString -> Bool isReservedWord :: Ident -> Bool
isReservedWord s = Map.member (BS.unpack s) resWords isReservedWord ident = Map.member ident resWords
resWords = Map.fromList resWords = Map.fromList
[ b "!" T_exclmark [ b "!" T_exclmark
@@ -205,7 +207,7 @@ resWords = Map.fromList
, b "where" T_where , b "where" T_where
, b "with" T_with , b "with" T_with
] ]
where b s t = (s, t) where b s t = (identS s, t)
unescapeInitTail :: String -> String unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where unescapeInitTail = unesc . tail where
@@ -278,7 +280,7 @@ lexer cont = P go
AlexEOF -> unP (cont T_EOF) inp AlexEOF -> unP (cont T_EOF) inp
AlexError (AI pos _ _) -> PFailed pos "lexical error" AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> {-trace (show len) $-} go inp' AlexSkip inp' len -> {-trace (show len) $-} go inp'
AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp' AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp'
getPosn :: P Posn getPosn :: P Posn
getPosn = P $ \inp@(AI pos _ _) -> POk pos getPosn = P $ \inp@(AI pos _ _) -> POk pos

View File

@@ -13,20 +13,24 @@
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
module GF.Infra.Ident (-- * Identifiers module GF.Infra.Ident (-- * Identifiers
Ident, ident2bs, showIdent, ppIdent, prefixIdent, Ident, ident2utf8, showIdent, ppIdent, prefixIdent,
identS, identC, identV, identA, identAV, identW, identS, identC, identV, identA, identAV, identW,
argIdent, isArgIdent, getArgIndex, argIdent, isArgIdent, getArgIndex,
varStr, varX, isWildIdent, varIndex, varStr, varX, isWildIdent, varIndex,
-- * Raw Identifiers -- * Raw Identifiers
RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent, RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent,
isPrefixOf, showRawIdent, rawId2bs{-, isPrefixOf, showRawIdent{-,
-- * Refreshing identifiers -- * Refreshing identifiers
IdState, initIdStateN, initIdState, IdState, initIdStateN, initIdState,
lookVar, refVar, refVarPlus-} lookVar, refVar, refVarPlus-}
) where ) where
import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.ByteString.Char8 as BS(append,isPrefixOf)
-- Limit use of BS functions to the ones that work correctly on
-- UTF-8-encoded bytestrings!
import Data.Char(isDigit) import Data.Char(isDigit)
import Data.Binary(Binary(..))
import Text.PrettyPrint(Doc,text) import Text.PrettyPrint(Doc,text)
@@ -41,31 +45,41 @@ data Ident =
| IA {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat at position | IA {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat at position
| IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat with bindings at position | IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat with bindings at position
-- --
deriving (Eq, Ord, Show, Read) deriving (Eq, Ord, Show, Read)
newtype RawIdent = Id { rawId2bs :: BS.ByteString } -- | Identifiers are stored as UTF-8-encoded bytestrings.
newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString }
deriving (Eq, Ord, Show, Read) deriving (Eq, Ord, Show, Read)
rawIdentS = Id . BS.pack pack = UTF8.fromString
unpack = UTF8.toString
rawIdentS = Id . pack
rawIdentC = Id rawIdentC = Id
showRawIdent = BS.unpack . rawId2bs showRawIdent = unpack . rawId2utf8
prefixRawIdent (Id x) (Id y) = Id (BS.append x y) prefixRawIdent (Id x) (Id y) = Id (BS.append x y)
isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y
ident2bs :: Ident -> BS.ByteString instance Binary RawIdent where
ident2bs i = case i of put = put . rawId2utf8
IC (Id s) -> s get = fmap rawIdentC get
IV (Id s) n -> BS.append s (BS.pack ('_':show n))
IA (Id s) j -> BS.append s (BS.pack ('_':show j))
IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j))
IW -> BS.pack "_"
ident2raw = Id . ident2bs
-- | This function should be used with care, since the returned ByteString is
-- UTF-8-encoded.
ident2utf8 :: Ident -> UTF8.ByteString
ident2utf8 i = case i of
IC (Id s) -> s
IV (Id s) n -> BS.append s (pack ('_':show n))
IA (Id s) j -> BS.append s (pack ('_':show j))
IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j))
IW -> pack "_"
ident2raw = Id . ident2utf8
showIdent :: Ident -> String showIdent :: Ident -> String
showIdent i = BS.unpack $! ident2bs i showIdent i = unpack $! ident2utf8 i
ppIdent :: Ident -> Doc ppIdent :: Ident -> Doc
ppIdent = text . showIdent ppIdent = text . showIdent
@@ -83,7 +97,7 @@ identW :: Ident
prefixIdent :: String -> Ident -> Ident prefixIdent :: String -> Ident -> Ident
prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8
-- normal identifier -- normal identifier
-- ident s = IC s -- ident s = IC s
@@ -99,8 +113,11 @@ isArgIdent _ = False
getArgIndex (IA _ i) = Just i getArgIndex (IA _ i) = Just i
getArgIndex (IAV _ _ i) = Just i getArgIndex (IAV _ _ i) = Just i
getArgIndex (IC (Id s)) getArgIndex (IC (Id bs))
| isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s | isDigit c =
-- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8
(Just . read . reverse . takeWhile isDigit) s
where s@(c:_) = reverse (unpack bs)
getArgIndex x = Nothing getArgIndex x = Nothing
-- | used in lin defaults -- | used in lin defaults
@@ -117,7 +134,7 @@ isWildIdent x = case x of
IC s | s == wild -> True IC s | s == wild -> True
_ -> False _ -> False
wild = Id (BS.pack "_") wild = Id (pack "_")
varIndex :: Ident -> Int varIndex :: Ident -> Int
varIndex (IV _ n) = n varIndex (IV _ n) = n

View File

@@ -22,7 +22,7 @@ module PGF(
CId, mkCId, wildCId, CId, mkCId, wildCId,
showCId, readCId, showCId, readCId,
-- extra -- extra
ppCId, pIdent, bsCId, ppCId, pIdent, utf8CId,
-- * Languages -- * Languages
Language, Language,

View File

@@ -3,7 +3,7 @@ module PGF.CId (CId(..),
readCId, showCId, readCId, showCId,
-- utils -- utils
bsCId, pCId, pIdent, ppCId) where utf8CId, pCId, pIdent, ppCId) where
import Control.Monad import Control.Monad
import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Char8 as BS
@@ -24,7 +24,8 @@ wildCId = CId (BS.singleton '_')
mkCId :: String -> CId mkCId :: String -> CId
mkCId s = CId (UTF8.fromString s) mkCId s = CId (UTF8.fromString s)
bsCId = CId -- | Creates an identifier from a UTF-8-encoded 'ByteString'
utf8CId = CId
-- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier. -- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier.
readCId :: String -> Maybe CId readCId :: String -> Maybe CId