mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Represent identifiers as UTF-8-encoded ByteStrings
This was a fairly simple change thanks to previous work on making the Ident type abstract and the fact that PGF.CId already uses UTF-8-encoded ByteStrings. One potential pitfall is that Data.ByteString.UTF8 uses the same type for ByteStrings as Data.ByteString. I renamed ident2bs to ident2utf8 and bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded ByteStrings. Since both the compiler input and identifiers are now UTF-8-encoded ByteStrings, the lexer now creates identifiers without copying any characters. **END OF DESCRIPTION*** Place the long patch description above the ***END OF DESCRIPTION*** marker. The first line of this file will be the patch name. This patch contains the following changes: M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3 M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2 M ./src/compiler/GF/Grammar/Binary.hs -5 +1 M ./src/compiler/GF/Grammar/Lexer.x -11 +13 M ./src/compiler/GF/Infra/Ident.hs -19 +36 M ./src/runtime/haskell/PGF.hs -1 +1 M ./src/runtime/haskell/PGF/CId.hs -2 +3
This commit is contained in:
@@ -298,9 +298,9 @@ checkInfo opts sgr (m,mo) c info = do
|
||||
|
||||
-- | for grammars obtained otherwise than by parsing ---- update!!
|
||||
checkReservedId :: Ident -> Check ()
|
||||
checkReservedId x
|
||||
| isReservedWord (ident2bs x) = checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
|
||||
| otherwise = return ()
|
||||
checkReservedId x =
|
||||
when (isReservedWord x) $
|
||||
checkWarn (text "reserved word used as identifier:" <+> ppIdent x)
|
||||
|
||||
-- auxiliaries
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ module GF.Compile.GrammarToPGF (mkCanon2pgf) where
|
||||
import GF.Compile.GeneratePMCFG
|
||||
import GF.Compile.GenerateBC
|
||||
|
||||
import PGF(CId,mkCId,bsCId)
|
||||
import PGF(CId,mkCId,utf8CId)
|
||||
import PGF.Data(fidInt,fidFloat,fidString,fidVar)
|
||||
import PGF.Optimize(updateProductionIndices)
|
||||
--import qualified PGF.Macros as CM
|
||||
@@ -103,7 +103,7 @@ mkCanon2pgf opts gr am = do
|
||||
return (seqs, ((m,id), info) : is)
|
||||
|
||||
i2i :: Ident -> CId
|
||||
i2i = bsCId . ident2bs
|
||||
i2i = utf8CId . ident2utf8
|
||||
|
||||
mkType :: [Ident] -> A.Type -> C.Type
|
||||
mkType scope t =
|
||||
|
||||
@@ -30,7 +30,7 @@ import PGF.Data(Literal(..))
|
||||
gfoVersion = "GF03"
|
||||
|
||||
instance Binary Ident where
|
||||
put id = put (ident2bs id)
|
||||
put id = put (ident2utf8 id)
|
||||
get = do bs <- get
|
||||
if bs == BS.pack "_"
|
||||
then return identW
|
||||
@@ -295,10 +295,6 @@ instance Binary Label where
|
||||
1 -> fmap LVar get
|
||||
_ -> decodingError
|
||||
|
||||
instance Binary RawIdent where
|
||||
put = put . rawId2bs
|
||||
get = fmap rawIdentC get
|
||||
|
||||
--putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion
|
||||
--getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8)
|
||||
--putGFOVersion = put gfoVersion
|
||||
|
||||
@@ -33,9 +33,9 @@ $u = [.\n] -- universal: any character
|
||||
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
|
||||
|
||||
$white+ ;
|
||||
@rsyms { tok (res (T_Ident . identS)) }
|
||||
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
|
||||
(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) }
|
||||
@rsyms { tok ident }
|
||||
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (res T_Ident . identS . unescapeInitTail . unpack) }
|
||||
(\_ | $l)($l | $d | \_ | \')* { tok ident }
|
||||
|
||||
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
|
||||
|
||||
@@ -43,10 +43,12 @@ $white+ ;
|
||||
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
|
||||
|
||||
{
|
||||
--unpack = BS.unpack
|
||||
unpack = id
|
||||
unpack = UTF8.toString
|
||||
--unpack = id
|
||||
|
||||
tok :: (String->Token) -> Posn -> String -> Token
|
||||
ident = res T_Ident . identC . rawIdentC
|
||||
|
||||
--tok :: (String->Token) -> Posn -> String -> Token
|
||||
tok f p s = f s
|
||||
|
||||
data Token
|
||||
@@ -126,14 +128,14 @@ data Token
|
||||
-- deriving Show -- debug
|
||||
|
||||
res = eitherResIdent
|
||||
eitherResIdent :: (String -> Token) -> String -> Token
|
||||
eitherResIdent :: (Ident -> Token) -> Ident -> Token
|
||||
eitherResIdent tv s =
|
||||
case Map.lookup s resWords of
|
||||
Just t -> t
|
||||
Nothing -> tv s
|
||||
|
||||
isReservedWord :: BS.ByteString -> Bool
|
||||
isReservedWord s = Map.member (BS.unpack s) resWords
|
||||
isReservedWord :: Ident -> Bool
|
||||
isReservedWord ident = Map.member ident resWords
|
||||
|
||||
resWords = Map.fromList
|
||||
[ b "!" T_exclmark
|
||||
@@ -205,7 +207,7 @@ resWords = Map.fromList
|
||||
, b "where" T_where
|
||||
, b "with" T_with
|
||||
]
|
||||
where b s t = (s, t)
|
||||
where b s t = (identS s, t)
|
||||
|
||||
unescapeInitTail :: String -> String
|
||||
unescapeInitTail = unesc . tail where
|
||||
@@ -278,7 +280,7 @@ lexer cont = P go
|
||||
AlexEOF -> unP (cont T_EOF) inp
|
||||
AlexError (AI pos _ _) -> PFailed pos "lexical error"
|
||||
AlexSkip inp' len -> {-trace (show len) $-} go inp'
|
||||
AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
|
||||
AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp'
|
||||
|
||||
getPosn :: P Posn
|
||||
getPosn = P $ \inp@(AI pos _ _) -> POk pos
|
||||
|
||||
@@ -13,20 +13,24 @@
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Infra.Ident (-- * Identifiers
|
||||
Ident, ident2bs, showIdent, ppIdent, prefixIdent,
|
||||
Ident, ident2utf8, showIdent, ppIdent, prefixIdent,
|
||||
identS, identC, identV, identA, identAV, identW,
|
||||
argIdent, isArgIdent, getArgIndex,
|
||||
varStr, varX, isWildIdent, varIndex,
|
||||
-- * Raw Identifiers
|
||||
RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent,
|
||||
isPrefixOf, showRawIdent, rawId2bs{-,
|
||||
isPrefixOf, showRawIdent{-,
|
||||
-- * Refreshing identifiers
|
||||
IdState, initIdStateN, initIdState,
|
||||
lookVar, refVar, refVarPlus-}
|
||||
) where
|
||||
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
import qualified Data.ByteString.UTF8 as UTF8
|
||||
import qualified Data.ByteString.Char8 as BS(append,isPrefixOf)
|
||||
-- Limit use of BS functions to the ones that work correctly on
|
||||
-- UTF-8-encoded bytestrings!
|
||||
import Data.Char(isDigit)
|
||||
import Data.Binary(Binary(..))
|
||||
import Text.PrettyPrint(Doc,text)
|
||||
|
||||
|
||||
@@ -41,31 +45,41 @@ data Ident =
|
||||
| IA {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat at position
|
||||
| IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat with bindings at position
|
||||
--
|
||||
|
||||
deriving (Eq, Ord, Show, Read)
|
||||
|
||||
newtype RawIdent = Id { rawId2bs :: BS.ByteString }
|
||||
-- | Identifiers are stored as UTF-8-encoded bytestrings.
|
||||
newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString }
|
||||
deriving (Eq, Ord, Show, Read)
|
||||
|
||||
rawIdentS = Id . BS.pack
|
||||
pack = UTF8.fromString
|
||||
unpack = UTF8.toString
|
||||
|
||||
rawIdentS = Id . pack
|
||||
rawIdentC = Id
|
||||
showRawIdent = BS.unpack . rawId2bs
|
||||
showRawIdent = unpack . rawId2utf8
|
||||
|
||||
prefixRawIdent (Id x) (Id y) = Id (BS.append x y)
|
||||
isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y
|
||||
|
||||
ident2bs :: Ident -> BS.ByteString
|
||||
ident2bs i = case i of
|
||||
IC (Id s) -> s
|
||||
IV (Id s) n -> BS.append s (BS.pack ('_':show n))
|
||||
IA (Id s) j -> BS.append s (BS.pack ('_':show j))
|
||||
IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j))
|
||||
IW -> BS.pack "_"
|
||||
instance Binary RawIdent where
|
||||
put = put . rawId2utf8
|
||||
get = fmap rawIdentC get
|
||||
|
||||
ident2raw = Id . ident2bs
|
||||
|
||||
-- | This function should be used with care, since the returned ByteString is
|
||||
-- UTF-8-encoded.
|
||||
ident2utf8 :: Ident -> UTF8.ByteString
|
||||
ident2utf8 i = case i of
|
||||
IC (Id s) -> s
|
||||
IV (Id s) n -> BS.append s (pack ('_':show n))
|
||||
IA (Id s) j -> BS.append s (pack ('_':show j))
|
||||
IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j))
|
||||
IW -> pack "_"
|
||||
|
||||
ident2raw = Id . ident2utf8
|
||||
|
||||
showIdent :: Ident -> String
|
||||
showIdent i = BS.unpack $! ident2bs i
|
||||
showIdent i = unpack $! ident2utf8 i
|
||||
|
||||
ppIdent :: Ident -> Doc
|
||||
ppIdent = text . showIdent
|
||||
@@ -83,7 +97,7 @@ identW :: Ident
|
||||
|
||||
|
||||
prefixIdent :: String -> Ident -> Ident
|
||||
prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs
|
||||
prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8
|
||||
|
||||
-- normal identifier
|
||||
-- ident s = IC s
|
||||
@@ -99,8 +113,11 @@ isArgIdent _ = False
|
||||
|
||||
getArgIndex (IA _ i) = Just i
|
||||
getArgIndex (IAV _ _ i) = Just i
|
||||
getArgIndex (IC (Id s))
|
||||
| isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s
|
||||
getArgIndex (IC (Id bs))
|
||||
| isDigit c =
|
||||
-- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8
|
||||
(Just . read . reverse . takeWhile isDigit) s
|
||||
where s@(c:_) = reverse (unpack bs)
|
||||
getArgIndex x = Nothing
|
||||
|
||||
-- | used in lin defaults
|
||||
@@ -117,7 +134,7 @@ isWildIdent x = case x of
|
||||
IC s | s == wild -> True
|
||||
_ -> False
|
||||
|
||||
wild = Id (BS.pack "_")
|
||||
wild = Id (pack "_")
|
||||
|
||||
varIndex :: Ident -> Int
|
||||
varIndex (IV _ n) = n
|
||||
|
||||
@@ -22,7 +22,7 @@ module PGF(
|
||||
CId, mkCId, wildCId,
|
||||
showCId, readCId,
|
||||
-- extra
|
||||
ppCId, pIdent, bsCId,
|
||||
ppCId, pIdent, utf8CId,
|
||||
|
||||
-- * Languages
|
||||
Language,
|
||||
|
||||
@@ -3,7 +3,7 @@ module PGF.CId (CId(..),
|
||||
readCId, showCId,
|
||||
|
||||
-- utils
|
||||
bsCId, pCId, pIdent, ppCId) where
|
||||
utf8CId, pCId, pIdent, ppCId) where
|
||||
|
||||
import Control.Monad
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
@@ -24,7 +24,8 @@ wildCId = CId (BS.singleton '_')
|
||||
mkCId :: String -> CId
|
||||
mkCId s = CId (UTF8.fromString s)
|
||||
|
||||
bsCId = CId
|
||||
-- | Creates an identifier from a UTF-8-encoded 'ByteString'
|
||||
utf8CId = CId
|
||||
|
||||
-- | Reads an identifier from 'String'. The function returns 'Nothing' if the string is not valid identifier.
|
||||
readCId :: String -> Maybe CId
|
||||
|
||||
Reference in New Issue
Block a user