Represent identifiers as UTF-8-encoded ByteStrings

This was a fairly simple change thanks to previous work on making the Ident type abstract and the fact that PGF.CId already uses UTF-8-encoded ByteStrings. One potential pitfall is that Data.ByteString.UTF8 uses the same type for ByteStrings as Data.ByteString. I renamed ident2bs to ident2utf8 and bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded ByteStrings. Since both the compiler input and identifiers are now UTF-8-encoded ByteStrings, the lexer now creates identifiers without copying any characters. **END OF DESCRIPTION*** Place the long patch description above the ***END OF DESCRIPTION*** marker. The first line of this file will be the patch name. This patch contains the following changes: M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3 M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2 M ./src/compiler/GF/Grammar/Binary.hs -5 +1 M ./src/compiler/GF/Grammar/Lexer.x -11 +13 M ./src/compiler/GF/Infra/Ident.hs -19 +36 M ./src/runtime/haskell/PGF.hs -1 +1 M ./src/runtime/haskell/PGF/CId.hs -2 +3
2026-04-09 04:59:31 -06:00 · 2013-11-26 16:12:03 +00:00
parent 9d7fdf7c9a
commit 3f57151cc3
7 changed files with 60 additions and 44 deletions
--- a/src/compiler/GF/Compile/CheckGrammar.hs
+++ b/src/compiler/GF/Compile/CheckGrammar.hs
@@ -298,9 +298,9 @@ checkInfo opts sgr (m,mo) c info = do

 -- | for grammars obtained otherwise than by parsing ---- update!!
 checkReservedId :: Ident -> Check ()
-checkReservedId x
-  | isReservedWord (ident2bs x) = checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
-  | otherwise                   = return ()
+checkReservedId x =
+  when (isReservedWord x) $
+       checkWarn (text "reserved word used as identifier:" <+> ppIdent x)

 -- auxiliaries

--- a/src/compiler/GF/Compile/GrammarToPGF.hs
+++ b/src/compiler/GF/Compile/GrammarToPGF.hs
@@ -5,7 +5,7 @@ module GF.Compile.GrammarToPGF (mkCanon2pgf) where
 import GF.Compile.GeneratePMCFG
 import GF.Compile.GenerateBC

-import PGF(CId,mkCId,bsCId)
+import PGF(CId,mkCId,utf8CId)
 import PGF.Data(fidInt,fidFloat,fidString,fidVar)
 import PGF.Optimize(updateProductionIndices)
 --import qualified PGF.Macros as CM
@@ -103,7 +103,7 @@ mkCanon2pgf opts gr am = do
          return (seqs, ((m,id), info) : is)

 i2i :: Ident -> CId
-i2i = bsCId . ident2bs
+i2i = utf8CId . ident2utf8

 mkType :: [Ident] -> A.Type -> C.Type
 mkType scope t =
--- a/src/compiler/GF/Grammar/Binary.hs
+++ b/src/compiler/GF/Grammar/Binary.hs
@@ -30,7 +30,7 @@ import PGF.Data(Literal(..))
 gfoVersion = "GF03"

 instance Binary Ident where
-  put id = put (ident2bs id)
+  put id = put (ident2utf8 id)
  get    = do bs <- get
              if bs == BS.pack "_"
                then return identW
@@ -295,10 +295,6 @@ instance Binary Label where
             1 -> fmap LVar   get
             _ -> decodingError

-instance Binary RawIdent where
-  put = put . rawId2bs
-  get = fmap rawIdentC get
-
 --putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion
 --getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8)
 --putGFOVersion = put gfoVersion
--- a/src/compiler/GF/Grammar/Lexer.x
+++ b/src/compiler/GF/Grammar/Lexer.x
@@ -33,9 +33,9 @@ $u = [.\n]                -- universal: any character
 "{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; 

 $white+ ;
-@rsyms                          { tok (res (T_Ident . identS)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
-(\_ | $l)($l | $d | \_ | \')*   { tok (res (T_Ident . identS)) }
+@rsyms                          { tok ident }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (res T_Ident . identS . unescapeInitTail . unpack) }
+(\_ | $l)($l | $d | \_ | \')*   { tok ident }

 \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }

@@ -43,10 +43,12 @@ $white+ ;
 (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }

 {
--unpack = BS.unpack
-unpack = id
+unpack = UTF8.toString
+--unpack = id

-tok :: (String->Token) -> Posn -> String -> Token
+ident = res T_Ident . identC . rawIdentC
+
+--tok :: (String->Token) -> Posn -> String -> Token
 tok f p s = f s

 data Token
@@ -126,14 +128,14 @@ data Token
 -- deriving Show -- debug

 res = eitherResIdent
-eitherResIdent :: (String -> Token) -> String -> Token
+eitherResIdent :: (Ident -> Token) -> Ident -> Token
 eitherResIdent tv s = 
  case Map.lookup s resWords of
    Just t  -> t
    Nothing -> tv s

-isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member (BS.unpack s) resWords
+isReservedWord :: Ident -> Bool
+isReservedWord ident = Map.member ident resWords

 resWords = Map.fromList
 [ b "!"  T_exclmark
@@ -205,7 +207,7 @@ resWords = Map.fromList
 , b "where"      T_where
 , b "with"       T_with
 ]
- where b s t = (s, t)
+ where b s t = (identS s, t)

 unescapeInitTail :: String -> String
 unescapeInitTail = unesc . tail where
@@ -278,7 +280,7 @@ lexer cont = P go
        AlexEOF                -> unP (cont T_EOF) inp
        AlexError (AI pos _ _) -> PFailed pos "lexical error"
        AlexSkip  inp' len     -> {-trace (show len) $-} go inp'
-        AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
+        AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp'

 getPosn :: P Posn
 getPosn = P $ \inp@(AI pos _ _) -> POk pos
--- a/src/compiler/GF/Infra/Ident.hs
+++ b/src/compiler/GF/Infra/Ident.hs
@@ -13,20 +13,24 @@
 -----------------------------------------------------------------------------

 module GF.Infra.Ident (-- * Identifiers
-	      Ident, ident2bs, showIdent, ppIdent, prefixIdent,
+	      Ident, ident2utf8, showIdent, ppIdent, prefixIdent,
 	      identS, identC, identV, identA, identAV, identW,
 	      argIdent, isArgIdent, getArgIndex,
              varStr, varX, isWildIdent, varIndex,
              -- * Raw Identifiers
              RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent,
-              isPrefixOf, showRawIdent, rawId2bs{-,
+              isPrefixOf, showRawIdent{-,
 	      -- * Refreshing identifiers
 	      IdState, initIdStateN, initIdState,
 	      lookVar, refVar, refVarPlus-}
 	     ) where

-import qualified Data.ByteString.Char8 as BS
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Data.ByteString.Char8 as BS(append,isPrefixOf)
+                 -- Limit use of BS functions to the ones that work correctly on
+                 -- UTF-8-encoded bytestrings!
 import Data.Char(isDigit)
+import Data.Binary(Binary(..))
 import Text.PrettyPrint(Doc,text)


@@ -41,31 +45,41 @@ data Ident =
 | IA  {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int                       -- ^ /INTERNAL/ argument of cat at position
 | IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int   -- ^ /INTERNAL/ argument of cat with bindings at position
 -- 
-
  deriving (Eq, Ord, Show, Read)

-newtype RawIdent = Id { rawId2bs :: BS.ByteString }
+-- | Identifiers are stored as UTF-8-encoded bytestrings.
+newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString }
  deriving (Eq, Ord, Show, Read)

-rawIdentS = Id . BS.pack
+pack = UTF8.fromString
+unpack = UTF8.toString
+
+rawIdentS = Id . pack
 rawIdentC = Id
-showRawIdent = BS.unpack . rawId2bs
+showRawIdent = unpack . rawId2utf8

 prefixRawIdent (Id x) (Id y) = Id (BS.append x y) 
 isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y

-ident2bs :: Ident -> BS.ByteString
-ident2bs i = case i of
-  IC (Id s) -> s
-  IV (Id s) n -> BS.append s (BS.pack ('_':show n))
-  IA (Id s) j -> BS.append s (BS.pack ('_':show j))
-  IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j))
-  IW -> BS.pack "_"
+instance Binary RawIdent where
+  put = put . rawId2utf8
+  get = fmap rawIdentC get

-ident2raw = Id . ident2bs
+
+-- | This function should be used with care, since the returned ByteString is
+-- UTF-8-encoded.
+ident2utf8 :: Ident -> UTF8.ByteString
+ident2utf8 i = case i of
+  IC (Id s) -> s
+  IV (Id s) n -> BS.append s (pack ('_':show n))
+  IA (Id s) j -> BS.append s (pack ('_':show j))
+  IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j))
+  IW -> pack "_"
+
+ident2raw = Id . ident2utf8

 showIdent :: Ident -> String
-showIdent i = BS.unpack $! ident2bs i
+showIdent i = unpack $! ident2utf8 i

 ppIdent :: Ident -> Doc
 ppIdent = text . showIdent
@@ -83,7 +97,7 @@ identW :: Ident


 prefixIdent :: String -> Ident -> Ident
-prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs
+prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8

 -- normal identifier
 -- ident s = IC s
@@ -99,8 +113,11 @@ isArgIdent _     = False

 getArgIndex (IA _ i)    = Just i
 getArgIndex (IAV _ _ i) = Just i
-getArgIndex (IC (Id s))
-  | isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s
+getArgIndex (IC (Id bs))
+  | isDigit c =
+   -- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8
+      (Just . read . reverse . takeWhile isDigit) s
+  where s@(c:_) = reverse (unpack bs)
 getArgIndex x = Nothing

 -- | used in lin defaults
@@ -117,7 +134,7 @@ isWildIdent x = case x of
  IC s | s == wild -> True
  _ -> False

-wild = Id (BS.pack "_")
+wild = Id (pack "_")

 varIndex :: Ident -> Int
 varIndex (IV _ n) = n
--- a/src/runtime/haskell/PGF.hs
+++ b/src/runtime/haskell/PGF.hs
@@ -22,7 +22,7 @@ module PGF(
           CId, mkCId, wildCId,
           showCId, readCId,
           -- extra
-           ppCId, pIdent, bsCId,
+           ppCId, pIdent, utf8CId,

           -- * Languages
           Language, 
--- a/src/runtime/haskell/PGF/CId.hs
+++ b/src/runtime/haskell/PGF/CId.hs
@@ -3,7 +3,7 @@ module PGF.CId (CId(..),
                readCId, showCId,
                
                -- utils
-                bsCId, pCId, pIdent, ppCId) where
+                utf8CId, pCId, pIdent, ppCId) where

 import Control.Monad
 import qualified Data.ByteString.Char8 as BS
@@ -24,7 +24,8 @@ wildCId = CId (BS.singleton '_')
 mkCId :: String -> CId
 mkCId s = CId (UTF8.fromString s)

-bsCId = CId
+-- | Creates an identifier from a UTF-8-encoded 'ByteString'
+utf8CId = CId

 -- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier.
 readCId :: String -> Maybe CId