From a617e50d955c456ae2c46ba69f4c1ecd0d831017 Mon Sep 17 00:00:00 2001 From: hallgren Date: Tue, 26 Nov 2013 16:12:03 +0000 Subject: [PATCH] Represent identifiers as UTF-8-encoded ByteStrings This was a fairly simple change thanks to previous work on making the Ident type abstract and the fact that PGF.CId already uses UTF-8-encoded ByteStrings. One potential pitfall is that Data.ByteString.UTF8 uses the same type for ByteStrings as Data.ByteString. I renamed ident2bs to ident2utf8 and bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded ByteStrings. Since both the compiler input and identifiers are now UTF-8-encoded ByteStrings, the lexer now creates identifiers without copying any characters. ***END OF DESCRIPTION*** Place the long patch description above the ***END OF DESCRIPTION*** marker. The first line of this file will be the patch name. This patch contains the following changes: M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3 M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2 M ./src/compiler/GF/Grammar/Binary.hs -5 +1 M ./src/compiler/GF/Grammar/Lexer.x -11 +13 M ./src/compiler/GF/Infra/Ident.hs -19 +36 M ./src/runtime/haskell/PGF.hs -1 +1 M ./src/runtime/haskell/PGF/CId.hs -2 +3 --- src/compiler/GF/Compile/CheckGrammar.hs | 6 +-- src/compiler/GF/Compile/GrammarToPGF.hs | 4 +- src/compiler/GF/Grammar/Binary.hs | 6 +-- src/compiler/GF/Grammar/Lexer.x | 24 ++++++----- src/compiler/GF/Infra/Ident.hs | 57 ++++++++++++++++--------- src/runtime/haskell/PGF.hs | 2 +- src/runtime/haskell/PGF/CId.hs | 5 ++- 7 files changed, 60 insertions(+), 44 deletions(-) diff --git a/src/compiler/GF/Compile/CheckGrammar.hs b/src/compiler/GF/Compile/CheckGrammar.hs index 568686f92..d924e413e 100644 --- a/src/compiler/GF/Compile/CheckGrammar.hs +++ b/src/compiler/GF/Compile/CheckGrammar.hs @@ -298,9 +298,9 @@ checkInfo opts sgr (m,mo) c info = do -- | for grammars obtained otherwise than by parsing ---- update!! 
checkReservedId :: Ident -> Check () -checkReservedId x - | isReservedWord (ident2bs x) = checkWarn (text "reserved word used as identifier:" <+> ppIdent x) - | otherwise = return () +checkReservedId x = + when (isReservedWord x) $ + checkWarn (text "reserved word used as identifier:" <+> ppIdent x) -- auxiliaries diff --git a/src/compiler/GF/Compile/GrammarToPGF.hs b/src/compiler/GF/Compile/GrammarToPGF.hs index 25db4bac7..5917b709c 100644 --- a/src/compiler/GF/Compile/GrammarToPGF.hs +++ b/src/compiler/GF/Compile/GrammarToPGF.hs @@ -5,7 +5,7 @@ module GF.Compile.GrammarToPGF (mkCanon2pgf) where import GF.Compile.GeneratePMCFG import GF.Compile.GenerateBC -import PGF(CId,mkCId,bsCId) +import PGF(CId,mkCId,utf8CId) import PGF.Data(fidInt,fidFloat,fidString,fidVar) import PGF.Optimize(updateProductionIndices) --import qualified PGF.Macros as CM @@ -103,7 +103,7 @@ mkCanon2pgf opts gr am = do return (seqs, ((m,id), info) : is) i2i :: Ident -> CId -i2i = bsCId . ident2bs +i2i = utf8CId . ident2utf8 mkType :: [Ident] -> A.Type -> C.Type mkType scope t = diff --git a/src/compiler/GF/Grammar/Binary.hs b/src/compiler/GF/Grammar/Binary.hs index 6cd3832c0..7b4540ce5 100644 --- a/src/compiler/GF/Grammar/Binary.hs +++ b/src/compiler/GF/Grammar/Binary.hs @@ -30,7 +30,7 @@ import PGF.Data(Literal(..)) gfoVersion = "GF03" instance Binary Ident where - put id = put (ident2bs id) + put id = put (ident2utf8 id) get = do bs <- get if bs == BS.pack "_" then return identW @@ -295,10 +295,6 @@ instance Binary Label where 1 -> fmap LVar get _ -> decodingError -instance Binary RawIdent where - put = put . rawId2bs - get = fmap rawIdentC get - --putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion --getGFOVersion = replicateM (length gfoVersion) (fmap (chr . 
fromIntegral) getWord8) --putGFOVersion = put gfoVersion diff --git a/src/compiler/GF/Grammar/Lexer.x b/src/compiler/GF/Grammar/Lexer.x index 60c51f814..c4f7159a2 100644 --- a/src/compiler/GF/Grammar/Lexer.x +++ b/src/compiler/GF/Grammar/Lexer.x @@ -33,9 +33,9 @@ $u = [.\n] -- universal: any character "{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; $white+ ; -@rsyms { tok (res (T_Ident . identS)) } -\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) } -(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) } +@rsyms { tok ident } +\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (T_Ident . identS . unescapeInitTail . unpack) } +(\_ | $l)($l | $d | \_ | \')* { tok ident } \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) } @@ -43,10 +43,12 @@ $white+ ; (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) } { ---unpack = BS.unpack -unpack = id +unpack = UTF8.toString +--unpack = id -tok :: (String->Token) -> Posn -> String -> Token +ident = res T_Ident . identC . rawIdentC + +--tok :: (String->Token) -> Posn -> String -> Token tok f p s = f s data Token @@ -126,14 +128,14 @@ data Token -- deriving Show -- debug res = eitherResIdent -eitherResIdent :: (String -> Token) -> String -> Token +eitherResIdent :: (Ident -> Token) -> Ident -> Token eitherResIdent tv s = case Map.lookup s resWords of Just t -> t Nothing -> tv s -isReservedWord :: BS.ByteString -> Bool -isReservedWord s = Map.member (BS.unpack s) resWords +isReservedWord :: Ident -> Bool +isReservedWord ident = Map.member ident resWords resWords = Map.fromList [ b "!" T_exclmark @@ -205,7 +207,7 @@ resWords = Map.fromList , b "where" T_where , b "with" T_with ] - where b s t = (s, t) + where b s t = (identS s, t) unescapeInitTail :: String -> String unescapeInitTail = unesc . 
tail where @@ -278,7 +280,7 @@ lexer cont = P go AlexEOF -> unP (cont T_EOF) inp AlexError (AI pos _ _) -> PFailed pos "lexical error" AlexSkip inp' len -> {-trace (show len) $-} go inp' - AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp' + AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp' getPosn :: P Posn getPosn = P $ \inp@(AI pos _ _) -> POk pos diff --git a/src/compiler/GF/Infra/Ident.hs b/src/compiler/GF/Infra/Ident.hs index 4792852dd..a5874b744 100644 --- a/src/compiler/GF/Infra/Ident.hs +++ b/src/compiler/GF/Infra/Ident.hs @@ -13,20 +13,24 @@ ----------------------------------------------------------------------------- module GF.Infra.Ident (-- * Identifiers - Ident, ident2bs, showIdent, ppIdent, prefixIdent, + Ident, ident2utf8, showIdent, ppIdent, prefixIdent, identS, identC, identV, identA, identAV, identW, argIdent, isArgIdent, getArgIndex, varStr, varX, isWildIdent, varIndex, -- * Raw Identifiers RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent, - isPrefixOf, showRawIdent, rawId2bs{-, + isPrefixOf, showRawIdent{-, -- * Refreshing identifiers IdState, initIdStateN, initIdState, lookVar, refVar, refVarPlus-} ) where -import qualified Data.ByteString.Char8 as BS +import qualified Data.ByteString.UTF8 as UTF8 +import qualified Data.ByteString.Char8 as BS(append,isPrefixOf) + -- Limit use of BS functions to the ones that work correctly on + -- UTF-8-encoded bytestrings! import Data.Char(isDigit) +import Data.Binary(Binary(..)) import Text.PrettyPrint(Doc,text) @@ -41,31 +45,41 @@ data Ident = | IA {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat at position | IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat with bindings at position -- - deriving (Eq, Ord, Show, Read) -newtype RawIdent = Id { rawId2bs :: BS.ByteString } +-- | Identifiers are stored as UTF-8-encoded bytestrings. 
+newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString } deriving (Eq, Ord, Show, Read) -rawIdentS = Id . BS.pack +pack = UTF8.fromString +unpack = UTF8.toString + +rawIdentS = Id . pack rawIdentC = Id -showRawIdent = BS.unpack . rawId2bs +showRawIdent = unpack . rawId2utf8 prefixRawIdent (Id x) (Id y) = Id (BS.append x y) isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y -ident2bs :: Ident -> BS.ByteString -ident2bs i = case i of - IC (Id s) -> s - IV (Id s) n -> BS.append s (BS.pack ('_':show n)) - IA (Id s) j -> BS.append s (BS.pack ('_':show j)) - IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j)) - IW -> BS.pack "_" +instance Binary RawIdent where + put = put . rawId2utf8 + get = fmap rawIdentC get -ident2raw = Id . ident2bs + +-- | This function should be used with care, since the returned ByteString is +-- UTF-8-encoded. +ident2utf8 :: Ident -> UTF8.ByteString +ident2utf8 i = case i of + IC (Id s) -> s + IV (Id s) n -> BS.append s (pack ('_':show n)) + IA (Id s) j -> BS.append s (pack ('_':show j)) + IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j)) + IW -> pack "_" + +ident2raw = Id . ident2utf8 showIdent :: Ident -> String -showIdent i = BS.unpack $! ident2bs i +showIdent i = unpack $! ident2utf8 i ppIdent :: Ident -> Doc ppIdent = text . showIdent @@ -83,7 +97,7 @@ identW :: Ident prefixIdent :: String -> Ident -> Ident -prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs +prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8 -- normal identifier -- ident s = IC s @@ -99,8 +113,11 @@ isArgIdent _ = False getArgIndex (IA _ i) = Just i getArgIndex (IAV _ _ i) = Just i -getArgIndex (IC (Id s)) - | isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s +getArgIndex (IC (Id bs)) + | not (null ds) = + -- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8 + (Just . read . 
reverse) ds + where ds = takeWhile isDigit (reverse (unpack bs)) getArgIndex x = Nothing -- | used in lin defaults @@ -117,7 +134,7 @@ isWildIdent x = case x of IC s | s == wild -> True _ -> False -wild = Id (BS.pack "_") +wild = Id (pack "_") varIndex :: Ident -> Int varIndex (IV _ n) = n diff --git a/src/runtime/haskell/PGF.hs b/src/runtime/haskell/PGF.hs index 0e3c79f40..c1d903f4f 100644 --- a/src/runtime/haskell/PGF.hs +++ b/src/runtime/haskell/PGF.hs @@ -22,7 +22,7 @@ module PGF( CId, mkCId, wildCId, showCId, readCId, -- extra - ppCId, pIdent, bsCId, + ppCId, pIdent, utf8CId, -- * Languages Language, diff --git a/src/runtime/haskell/PGF/CId.hs b/src/runtime/haskell/PGF/CId.hs index 0594d9fc1..785642cdf 100644 --- a/src/runtime/haskell/PGF/CId.hs +++ b/src/runtime/haskell/PGF/CId.hs @@ -3,7 +3,7 @@ module PGF.CId (CId(..), readCId, showCId, -- utils - bsCId, pCId, pIdent, ppCId) where + utf8CId, pCId, pIdent, ppCId) where import Control.Monad import qualified Data.ByteString.Char8 as BS @@ -24,7 +24,8 @@ wildCId = CId (BS.singleton '_') mkCId :: String -> CId mkCId s = CId (UTF8.fromString s) -bsCId = CId +-- | Creates an identifier from a UTF-8-encoded 'ByteString' +utf8CId = CId -- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier. readCId :: String -> Maybe CId