From a617e50d955c456ae2c46ba69f4c1ecd0d831017 Mon Sep 17 00:00:00 2001
From: hallgren <hallgren@chalmers.se>
Date: Tue, 26 Nov 2013 16:12:03 +0000
Subject: [PATCH] Represent identifiers as UTF-8-encoded ByteStrings

This was a fairly simple change thanks to previous work on making the Ident
type abstract and the fact that PGF.CId already uses UTF-8-encoded
ByteStrings.

One potential pitfall is that Data.ByteString.UTF8 uses the same type for
ByteStrings as Data.ByteString, so the type checker cannot detect when a
byte-oriented function is applied to UTF-8-encoded data. I renamed ident2bs
to ident2utf8 and bsCId to utf8CId, to make it clearer that they work with
UTF-8-encoded ByteStrings.

Since both the compiler input and identifiers are now UTF-8-encoded
ByteStrings, the lexer now creates identifiers without copying any characters.
***END OF DESCRIPTION***

Place the long patch description above the ***END OF DESCRIPTION*** marker.
The first line of this file will be the patch name.


This patch contains the following changes:

M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3
M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2
M ./src/compiler/GF/Grammar/Binary.hs -5 +1
M ./src/compiler/GF/Grammar/Lexer.x -11 +13
M ./src/compiler/GF/Infra/Ident.hs -19 +36
M ./src/runtime/haskell/PGF.hs -1 +1
M ./src/runtime/haskell/PGF/CId.hs -2 +3
---
 src/compiler/GF/Compile/CheckGrammar.hs |  6 +--
 src/compiler/GF/Compile/GrammarToPGF.hs |  4 +-
 src/compiler/GF/Grammar/Binary.hs       |  6 +--
 src/compiler/GF/Grammar/Lexer.x         | 24 ++++++-----
 src/compiler/GF/Infra/Ident.hs          | 57 ++++++++++++++++---------
 src/runtime/haskell/PGF.hs              |  2 +-
 src/runtime/haskell/PGF/CId.hs          |  5 ++-
 7 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/src/compiler/GF/Compile/CheckGrammar.hs b/src/compiler/GF/Compile/CheckGrammar.hs
index 568686f92..d924e413e 100644
--- a/src/compiler/GF/Compile/CheckGrammar.hs
+++ b/src/compiler/GF/Compile/CheckGrammar.hs
@@ -298,9 +298,9 @@ checkInfo opts sgr (m,mo) c info = do
 
 -- | for grammars obtained otherwise than by parsing ---- update!!
 checkReservedId :: Ident -> Check ()
-checkReservedId x
-  | isReservedWord (ident2bs x) = checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
-  | otherwise                   = return ()
+checkReservedId x =
+  when (isReservedWord x) $
+       checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
 
 -- auxiliaries
 
diff --git a/src/compiler/GF/Compile/GrammarToPGF.hs b/src/compiler/GF/Compile/GrammarToPGF.hs
index 25db4bac7..5917b709c 100644
--- a/src/compiler/GF/Compile/GrammarToPGF.hs
+++ b/src/compiler/GF/Compile/GrammarToPGF.hs
@@ -5,7 +5,7 @@ module GF.Compile.GrammarToPGF (mkCanon2pgf) where
 import GF.Compile.GeneratePMCFG
 import GF.Compile.GenerateBC
 
-import PGF(CId,mkCId,bsCId)
+import PGF(CId,mkCId,utf8CId)
 import PGF.Data(fidInt,fidFloat,fidString,fidVar)
 import PGF.Optimize(updateProductionIndices)
 --import qualified PGF.Macros as CM
@@ -103,7 +103,7 @@ mkCanon2pgf opts gr am = do
           return (seqs, ((m,id), info) : is)
 
 i2i :: Ident -> CId
-i2i = bsCId . ident2bs
+i2i = utf8CId . ident2utf8
 
 mkType :: [Ident] -> A.Type -> C.Type
 mkType scope t =
diff --git a/src/compiler/GF/Grammar/Binary.hs b/src/compiler/GF/Grammar/Binary.hs
index 6cd3832c0..7b4540ce5 100644
--- a/src/compiler/GF/Grammar/Binary.hs
+++ b/src/compiler/GF/Grammar/Binary.hs
@@ -30,7 +30,7 @@ import PGF.Data(Literal(..))
 gfoVersion = "GF03"
 
 instance Binary Ident where
-  put id = put (ident2bs id)
+  put id = put (ident2utf8 id)
   get    = do bs <- get
               if bs == BS.pack "_"
                 then return identW
@@ -295,10 +295,6 @@ instance Binary Label where
              1 -> fmap LVar   get
              _ -> decodingError
 
-instance Binary RawIdent where
-  put = put . rawId2bs
-  get = fmap rawIdentC get
-
 --putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion
 --getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8)
 --putGFOVersion = put gfoVersion
diff --git a/src/compiler/GF/Grammar/Lexer.x b/src/compiler/GF/Grammar/Lexer.x
index 60c51f814..c4f7159a2 100644
--- a/src/compiler/GF/Grammar/Lexer.x
+++ b/src/compiler/GF/Grammar/Lexer.x
@@ -33,9 +33,9 @@ $u = [.\n]                -- universal: any character
 "{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; 
 
 $white+ ;
-@rsyms                          { tok (res (T_Ident . identS)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
-(\_ | $l)($l | $d | \_ | \')*   { tok (res (T_Ident . identS)) }
+@rsyms                          { tok ident }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (res T_Ident . identS . unescapeInitTail . unpack) }
+(\_ | $l)($l | $d | \_ | \')*   { tok ident }
 
 \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
 
@@ -43,10 +43,12 @@ $white+ ;
 (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }
 
 {
---unpack = BS.unpack
-unpack = id
+unpack = UTF8.toString
+--unpack = id
 
-tok :: (String->Token) -> Posn -> String -> Token
+ident = res T_Ident . identC . rawIdentC
+
+--tok :: (String->Token) -> Posn -> String -> Token
 tok f p s = f s
 
 data Token
@@ -126,14 +128,14 @@ data Token
 -- deriving Show -- debug
 
 res = eitherResIdent
-eitherResIdent :: (String -> Token) -> String -> Token
+eitherResIdent :: (Ident -> Token) -> Ident -> Token
 eitherResIdent tv s = 
   case Map.lookup s resWords of
     Just t  -> t
     Nothing -> tv s
 
-isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member (BS.unpack s) resWords
+isReservedWord :: Ident -> Bool
+isReservedWord ident = Map.member ident resWords
 
 resWords = Map.fromList
  [ b "!"  T_exclmark
@@ -205,7 +207,7 @@ resWords = Map.fromList
  , b "where"      T_where
  , b "with"       T_with
  ]
- where b s t = (s, t)
+ where b s t = (identS s, t)
 
 unescapeInitTail :: String -> String
 unescapeInitTail = unesc . tail where
@@ -278,7 +280,7 @@ lexer cont = P go
         AlexEOF                -> unP (cont T_EOF) inp
         AlexError (AI pos _ _) -> PFailed pos "lexical error"
         AlexSkip  inp' len     -> {-trace (show len) $-} go inp'
-        AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
+        AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp'
 
 getPosn :: P Posn
 getPosn = P $ \inp@(AI pos _ _) -> POk pos
diff --git a/src/compiler/GF/Infra/Ident.hs b/src/compiler/GF/Infra/Ident.hs
index 4792852dd..a5874b744 100644
--- a/src/compiler/GF/Infra/Ident.hs
+++ b/src/compiler/GF/Infra/Ident.hs
@@ -13,20 +13,24 @@
 -----------------------------------------------------------------------------
 
 module GF.Infra.Ident (-- * Identifiers
-	      Ident, ident2bs, showIdent, ppIdent, prefixIdent,
+	      Ident, ident2utf8, showIdent, ppIdent, prefixIdent,
 	      identS, identC, identV, identA, identAV, identW,
 	      argIdent, isArgIdent, getArgIndex,
               varStr, varX, isWildIdent, varIndex,
               -- * Raw Identifiers
               RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent,
-              isPrefixOf, showRawIdent, rawId2bs{-,
+              isPrefixOf, showRawIdent{-,
 	      -- * Refreshing identifiers
 	      IdState, initIdStateN, initIdState,
 	      lookVar, refVar, refVarPlus-}
 	     ) where
 
-import qualified Data.ByteString.Char8 as BS
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Data.ByteString.Char8 as BS(append,isPrefixOf)
+                 -- Limit use of BS functions to the ones that work correctly on
+                 -- UTF-8-encoded bytestrings!
 import Data.Char(isDigit)
+import Data.Binary(Binary(..))
 import Text.PrettyPrint(Doc,text)
 
 
@@ -41,31 +45,41 @@ data Ident =
  | IA  {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int                       -- ^ /INTERNAL/ argument of cat at position
  | IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int   -- ^ /INTERNAL/ argument of cat with bindings at position
 -- 
-
   deriving (Eq, Ord, Show, Read)
 
-newtype RawIdent = Id { rawId2bs :: BS.ByteString }
+-- | Identifiers are stored as UTF-8-encoded bytestrings.
+newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString }
   deriving (Eq, Ord, Show, Read)
 
-rawIdentS = Id . BS.pack
+pack = UTF8.fromString
+unpack = UTF8.toString
+
+rawIdentS = Id . pack
 rawIdentC = Id
-showRawIdent = BS.unpack . rawId2bs
+showRawIdent = unpack . rawId2utf8
 
 prefixRawIdent (Id x) (Id y) = Id (BS.append x y) 
 isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y
 
-ident2bs :: Ident -> BS.ByteString
-ident2bs i = case i of
-  IC (Id s) -> s
-  IV (Id s) n -> BS.append s (BS.pack ('_':show n))
-  IA (Id s) j -> BS.append s (BS.pack ('_':show j))
-  IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j))
-  IW -> BS.pack "_"
+instance Binary RawIdent where
+  put = put . rawId2utf8
+  get = fmap rawIdentC get
 
-ident2raw = Id . ident2bs
+
+-- | This function should be used with care, since the returned ByteString is
+-- UTF-8-encoded.
+ident2utf8 :: Ident -> UTF8.ByteString
+ident2utf8 i = case i of
+  IC (Id s) -> s
+  IV (Id s) n -> BS.append s (pack ('_':show n))
+  IA (Id s) j -> BS.append s (pack ('_':show j))
+  IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j))
+  IW -> pack "_"
+
+ident2raw = Id . ident2utf8
 
 showIdent :: Ident -> String
-showIdent i = BS.unpack $! ident2bs i
+showIdent i = unpack $! ident2utf8 i
 
 ppIdent :: Ident -> Doc
 ppIdent = text . showIdent
@@ -83,7 +97,7 @@ identW :: Ident
 
 
 prefixIdent :: String -> Ident -> Ident
-prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs
+prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8
 
 -- normal identifier
 -- ident s = IC s
@@ -99,8 +113,11 @@ isArgIdent _     = False
 
 getArgIndex (IA _ i)    = Just i
 getArgIndex (IAV _ _ i) = Just i
-getArgIndex (IC (Id s))
-  | isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s
+getArgIndex (IC (Id bs))
+  | isDigit c =
+   -- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8
+      (Just . read . reverse . takeWhile isDigit) s
+  where s@(c:_) = reverse (unpack bs)
 getArgIndex x = Nothing
 
 -- | used in lin defaults
@@ -117,7 +134,7 @@ isWildIdent x = case x of
   IC s | s == wild -> True
   _ -> False
 
-wild = Id (BS.pack "_")
+wild = Id (pack "_")
 
 varIndex :: Ident -> Int
 varIndex (IV _ n) = n
diff --git a/src/runtime/haskell/PGF.hs b/src/runtime/haskell/PGF.hs
index 0e3c79f40..c1d903f4f 100644
--- a/src/runtime/haskell/PGF.hs
+++ b/src/runtime/haskell/PGF.hs
@@ -22,7 +22,7 @@ module PGF(
            CId, mkCId, wildCId,
            showCId, readCId,
            -- extra
-           ppCId, pIdent, bsCId,
+           ppCId, pIdent, utf8CId,
 
            -- * Languages
            Language, 
diff --git a/src/runtime/haskell/PGF/CId.hs b/src/runtime/haskell/PGF/CId.hs
index 0594d9fc1..785642cdf 100644
--- a/src/runtime/haskell/PGF/CId.hs
+++ b/src/runtime/haskell/PGF/CId.hs
@@ -3,7 +3,7 @@ module PGF.CId (CId(..),
                 readCId, showCId,
                 
                 -- utils
-                bsCId, pCId, pIdent, ppCId) where
+                utf8CId, pCId, pIdent, ppCId) where
 
 import Control.Monad
 import qualified Data.ByteString.Char8 as BS
@@ -24,7 +24,8 @@ wildCId = CId (BS.singleton '_')
 mkCId :: String -> CId
 mkCId s = CId (UTF8.fromString s)
 
-bsCId = CId
+-- | Creates an identifier from a UTF-8-encoded 'ByteString'
+utf8CId = CId
 
 -- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier.
 readCId :: String -> Maybe CId