1
0
forked from GitHub/gf-core

Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8.

2. Alternate encodings should be specified with a "--# -coding=enc" pragma; the
   old "flags coding=enc" declarations no longer have any effect, but they are
   still checked for consistency.

3. A transitional warning is generated for files that contain non-ASCII
   characters without specifying a character encoding:

	"Warning: default encoding has changed from Latin-1 to UTF-8"

4. Conversion to Unicode is now done *before* lexing. This makes it possible
   to use arbitrary Unicode characters in identifiers. However, identifiers
   are still stored as ByteStrings, so for now they are limited to Latin-1
   characters.

5. Lexer.hs is no longer part of the repository. We now generate the lexer
   from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
   needed. These bugs might already be fixed in newer versions of alex, but
   we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
hallgren
2013-11-25 21:12:11 +00:00
parent 3210a50648
commit 9d7fdf7c9a
9 changed files with 157 additions and 559 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,3 +1,4 @@
-- -*- haskell -*-
{
module GF.Grammar.Lexer
( Token(..), Posn(..)
@@ -6,9 +7,14 @@ module GF.Grammar.Lexer
) where
import GF.Infra.Ident
--import GF.Data.Operations
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as WBS
import qualified Data.ByteString.Internal as BS(w2c)
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Map as Map
import Data.Word(Word8)
--import Debug.Trace(trace)
}
@@ -17,7 +23,7 @@ $c = [A-Z\192-\221] # [\215]
$s = [a-z\222-\255] # [\247]
$d = [0-9] -- digit
$i = [$l $d _ '] -- identifier character
$u = [\0-\255] -- universal: any character
$u = [.\n] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -27,17 +33,20 @@ $u = [\0-\255] -- universal: any character
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ;
@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
@rsyms { tok (res (T_Ident . identS)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
(\-)? $d+ { tok (T_Integer . read . BS.unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . BS.unpack) }
(\-)? $d+ { tok (T_Integer . read . unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{
--unpack = BS.unpack
unpack = id
tok :: (String->Token) -> Posn -> String -> Token
tok f p s = f s
data Token
@@ -114,15 +123,17 @@ data Token
| T_Double Double -- double precision float literals
| T_Ident Ident
| T_EOF
-- deriving Show -- debug
eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
res = eitherResIdent
eitherResIdent :: (String -> Token) -> String -> Token
eitherResIdent tv s =
case Map.lookup s resWords of
Just t -> t
Nothing -> tv s
isReservedWord :: BS.ByteString -> Bool
isReservedWord s = Map.member s resWords
isReservedWord s = Map.member (BS.unpack s) resWords
resWords = Map.fromList
[ b "!" T_exclmark
@@ -194,7 +205,7 @@ resWords = Map.fromList
, b "where" T_where
, b "with" T_with
]
where b s t = (BS.pack s, t)
where b s t = (s, t)
unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where
@@ -202,7 +213,7 @@ unescapeInitTail = unesc . tail where
'\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
'\\':'n':cs -> '\n' : unesc cs
'\\':'t':cs -> '\t' : unesc cs
'"':[] -> []
'\"':[] -> []
'\'':[] -> []
c:cs -> c : unesc cs
_ -> []
@@ -219,13 +230,14 @@ alexMove :: Posn -> Char -> Posn
alexMove (Pn l c) '\n' = Pn (l+1) 1
alexMove (Pn l c) _ = Pn l (c+1)
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (AI p _ s) =
case BS.uncons s of
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (AI p _ s) =
case WBS.uncons s of
Nothing -> Nothing
Just (c,s) ->
Just (w,s) ->
let p' = alexMove p c
in p' `seq` Just (c, (AI p' c s))
c = BS.w2c w
in p' `seq` Just (w, (AI p' c s))
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI p c s) = c
@@ -260,12 +272,13 @@ failLoc pos msg = P $ \_ -> PFailed pos msg
lexer :: (Token -> P a) -> P a
lexer cont = P go
where
--cont' t = trace (show t) (cont t)
go inp@(AI pos _ str) =
case alexScan inp 0 of
AlexEOF -> unP (cont T_EOF) inp
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> go inp'
AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
AlexSkip inp' len -> {-trace (show len) $-} go inp'
AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
getPosn :: P Posn
getPosn = P $ \inp@(AI pos _ _) -> POk pos

View File

@@ -17,8 +17,8 @@ import GF.Grammar.Grammar
import GF.Grammar.Macros
import GF.Grammar.Lexer
import GF.Compile.Update (buildAnyTree)
import Codec.Binary.UTF8.String(decodeString)
import Data.Char(toLower)
--import Codec.Binary.UTF8.String(decodeString)
--import Data.Char(toLower)
}
%name pModDef ModDef
@@ -616,9 +616,9 @@ happyError = fail "syntax error"
-- Quick fix to render error messages from UTF-8-encoded source files correctly.
optDecode opts =
if map toLower (flag optEncoding opts) `elem` ["utf8","utf-8"]
{-if map toLower (getEncoding opts) `elem` ["utf8","utf-8"]
then decodeString
else id
else-} id
mkListId,mkConsId,mkBaseId :: Ident -> Ident
mkListId = prefixIdent "List"