1
0
forked from GitHub/gf-core

Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8.

2. Alternate encodings should be specified with a "--# -coding=enc" pragma; the
   old "flags coding=enc" declarations no longer have any effect, but they are
   still checked for consistency.

3. A transitional warning is generated for files that contain non-ASCII
   characters without specifying a character encoding:

	"Warning: default encoding has changed from Latin-1 to UTF-8"

4. Conversion to Unicode is now done *before* lexing. This makes it possible
   to use arbitrary Unicode characters in identifiers. However, identifiers
   are still stored as ByteStrings, so for now they are limited to Latin-1
   characters.

5. Lexer.hs is no longer part of the repository. We now generate the lexer
   from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
   needed. These bugs might already be fixed in newer versions of alex, but
   we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
hallgren
2013-11-25 21:12:11 +00:00
parent 3210a50648
commit 9d7fdf7c9a
9 changed files with 157 additions and 559 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,3 +1,4 @@
-- -*- haskell -*-
{
module GF.Grammar.Lexer
( Token(..), Posn(..)
@@ -6,9 +7,14 @@ module GF.Grammar.Lexer
) where
import GF.Infra.Ident
--import GF.Data.Operations
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as WBS
import qualified Data.ByteString.Internal as BS(w2c)
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Map as Map
import Data.Word(Word8)
--import Debug.Trace(trace)
}
@@ -17,7 +23,7 @@ $c = [A-Z\192-\221] # [\215]
$s = [a-z\222-\255] # [\247]
$d = [0-9] -- digit
$i = [$l $d _ '] -- identifier character
$u = [\0-\255] -- universal: any character
$u = [.\n] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -27,17 +33,20 @@ $u = [\0-\255] -- universal: any character
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ;
@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
@rsyms { tok (res (T_Ident . identS)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
(\-)? $d+ { tok (T_Integer . read . BS.unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . BS.unpack) }
(\-)? $d+ { tok (T_Integer . read . unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{
--unpack = BS.unpack
unpack = id
tok :: (String->Token) -> Posn -> String -> Token
tok f p s = f s
data Token
@@ -114,15 +123,17 @@ data Token
| T_Double Double -- double precision float literals
| T_Ident Ident
| T_EOF
-- deriving Show -- debug
eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
res = eitherResIdent
eitherResIdent :: (String -> Token) -> String -> Token
eitherResIdent tv s =
case Map.lookup s resWords of
Just t -> t
Nothing -> tv s
isReservedWord :: BS.ByteString -> Bool
isReservedWord s = Map.member s resWords
isReservedWord s = Map.member (BS.unpack s) resWords
resWords = Map.fromList
[ b "!" T_exclmark
@@ -194,7 +205,7 @@ resWords = Map.fromList
, b "where" T_where
, b "with" T_with
]
where b s t = (BS.pack s, t)
where b s t = (s, t)
unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where
@@ -202,7 +213,7 @@ unescapeInitTail = unesc . tail where
'\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
'\\':'n':cs -> '\n' : unesc cs
'\\':'t':cs -> '\t' : unesc cs
'"':[] -> []
'\"':[] -> []
'\'':[] -> []
c:cs -> c : unesc cs
_ -> []
@@ -219,13 +230,14 @@ alexMove :: Posn -> Char -> Posn
alexMove (Pn l c) '\n' = Pn (l+1) 1
alexMove (Pn l c) _ = Pn l (c+1)
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (AI p _ s) =
case BS.uncons s of
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (AI p _ s) =
case WBS.uncons s of
Nothing -> Nothing
Just (c,s) ->
Just (w,s) ->
let p' = alexMove p c
in p' `seq` Just (c, (AI p' c s))
c = BS.w2c w
in p' `seq` Just (w, (AI p' c s))
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI p c s) = c
@@ -260,12 +272,13 @@ failLoc pos msg = P $ \_ -> PFailed pos msg
lexer :: (Token -> P a) -> P a
lexer cont = P go
where
--cont' t = trace (show t) (cont t)
go inp@(AI pos _ str) =
case alexScan inp 0 of
AlexEOF -> unP (cont T_EOF) inp
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> go inp'
AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
AlexSkip inp' len -> {-trace (show len) $-} go inp'
AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
getPosn :: P Posn
getPosn = P $ \inp@(AI pos _ _) -> POk pos

View File

@@ -17,8 +17,8 @@ import GF.Grammar.Grammar
import GF.Grammar.Macros
import GF.Grammar.Lexer
import GF.Compile.Update (buildAnyTree)
import Codec.Binary.UTF8.String(decodeString)
import Data.Char(toLower)
--import Codec.Binary.UTF8.String(decodeString)
--import Data.Char(toLower)
}
%name pModDef ModDef
@@ -616,9 +616,9 @@ happyError = fail "syntax error"
-- Quick fix to render error messages from UTF-8-encoded source files correctly.
optDecode opts =
if map toLower (flag optEncoding opts) `elem` ["utf8","utf-8"]
{-if map toLower (getEncoding opts) `elem` ["utf8","utf-8"]
then decodeString
else id
else-} id
mkListId,mkConsId,mkBaseId :: Ident -> Ident
mkListId = prefixIdent "List"