Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc", the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8" 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
2013-11-25 21:12:11 +00:00
parent 3210a50648
commit 9d7fdf7c9a
9 changed files with 157 additions and 559 deletions
--- a/src/compiler/GF/Grammar/Lexer.x
+++ b/src/compiler/GF/Grammar/Lexer.x
@@ -0,0 +1,286 @@
+-- -*- haskell -*-
+{
+module GF.Grammar.Lexer
+         ( Token(..), Posn(..)
+         , P, runP, lexer, getPosn, failLoc
+         , isReservedWord
+         ) where
+
+import GF.Infra.Ident
+--import GF.Data.Operations
+import qualified Data.ByteString.Char8 as BS
+import qualified Data.ByteString as WBS
+import qualified Data.ByteString.Internal as BS(w2c)
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Data.Map as Map
+import Data.Word(Word8)
+--import Debug.Trace(trace)
+}
+
+
+$l = [a-zA-Z\192 - \255] # [\215 \247]
+$c = [A-Z\192-\221] # [\215]
+$s = [a-z\222-\255] # [\247]
+$d = [0-9]                -- digit
+$i = [$l $d _ ']          -- identifier character
+$u = [.\n]                -- universal: any character
+
+@rsyms =    -- symbols and non-identifier-like reserved words
+   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
+
+:-
+"--" [.]* ; -- Toss single line comments
+"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; 
+
+$white+ ;
+@rsyms                          { tok (res (T_Ident . identS)) }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
+(\_ | $l)($l | $d | \_ | \')*   { tok (res (T_Ident . identS)) }
+
+\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
+
+(\-)? $d+                       { tok (T_Integer . read . unpack) }
+(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }
+
+{
+--unpack = BS.unpack
+unpack = id
+
+tok :: (String->Token) -> Posn -> String -> Token
+tok f p s = f s
+
+data Token
+ = T_exclmark
+ | T_patt
+ | T_int_label
+ | T_oparen
+ | T_cparen
+ | T_tilde
+ | T_star
+ | T_starstar
+ | T_plus
+ | T_plusplus
+ | T_comma
+ | T_minus
+ | T_rarrow
+ | T_dot
+ | T_alt
+ | T_colon
+ | T_semicolon
+ | T_less
+ | T_equal
+ | T_big_rarrow
+ | T_great
+ | T_questmark
+ | T_obrack
+ | T_lam
+ | T_lamlam
+ | T_cbrack
+ | T_ocurly
+ | T_bar
+ | T_ccurly
+ | T_underscore
+ | T_at
+ | T_PType
+ | T_Str
+ | T_Strs
+ | T_Tok
+ | T_Type
+ | T_abstract
+ | T_case
+ | T_cat
+ | T_concrete
+ | T_data
+ | T_def
+ | T_flags
+ | T_fn
+ | T_fun
+ | T_in
+ | T_incomplete
+ | T_instance
+ | T_interface
+ | T_let
+ | T_lin
+ | T_lincat
+ | T_lindef
+ | T_linref
+ | T_of
+ | T_open
+ | T_oper
+ | T_param
+ | T_pattern
+ | T_pre
+ | T_printname
+ | T_resource
+ | T_strs
+ | T_table
+ | T_transfer
+ | T_variants
+ | T_where
+ | T_with
+ | T_String  String          -- string literals
+ | T_Integer Int             -- integer literals
+ | T_Double  Double          -- double precision float literals
+ | T_Ident   Ident
+ | T_EOF
+-- deriving Show -- debug
+
+res = eitherResIdent
+eitherResIdent :: (String -> Token) -> String -> Token
+eitherResIdent tv s = 
+  case Map.lookup s resWords of
+    Just t  -> t
+    Nothing -> tv s
+
+isReservedWord :: BS.ByteString -> Bool
+isReservedWord s = Map.member (BS.unpack s) resWords
+
+resWords = Map.fromList
+ [ b "!"  T_exclmark
+ , b "#"  T_patt
+ , b "$"  T_int_label
+ , b "("  T_oparen
+ , b ")"  T_cparen
+ , b "~"  T_tilde
+ , b "*"  T_star
+ , b "**" T_starstar
+ , b "+"  T_plus
+ , b "++" T_plusplus
+ , b ","  T_comma
+ , b "-"  T_minus
+ , b "->" T_rarrow
+ , b "."  T_dot
+ , b "/"  T_alt
+ , b ":"  T_colon
+ , b ";"  T_semicolon
+ , b "<"  T_less
+ , b "="  T_equal
+ , b "=>" T_big_rarrow
+ , b ">"  T_great
+ , b "?"  T_questmark
+ , b "["  T_obrack
+ , b "]"  T_cbrack
+ , b "\\" T_lam
+ , b "\\\\" T_lamlam
+ , b "{"  T_ocurly
+ , b "}"  T_ccurly
+ , b "|"  T_bar
+ , b "_"  T_underscore
+ , b "@"  T_at
+ , b "PType"      T_PType
+ , b "Str"        T_Str
+ , b "Strs"       T_Strs
+ , b "Tok"        T_Tok
+ , b "Type"       T_Type
+ , b "abstract"   T_abstract
+ , b "case"       T_case
+ , b "cat"        T_cat
+ , b "concrete"   T_concrete
+ , b "data"       T_data
+ , b "def"        T_def
+ , b "flags"      T_flags
+ , b "fn"         T_fn
+ , b "fun"        T_fun
+ , b "in"         T_in
+ , b "incomplete" T_incomplete
+ , b "instance"   T_instance
+ , b "interface"  T_interface
+ , b "let"        T_let
+ , b "lin"        T_lin
+ , b "lincat"     T_lincat
+ , b "lindef"     T_lindef
+ , b "linref"     T_linref
+ , b "of"         T_of
+ , b "open"       T_open
+ , b "oper"       T_oper
+ , b "param"      T_param
+ , b "pattern"    T_pattern
+ , b "pre"        T_pre
+ , b "printname"  T_printname
+ , b "resource"   T_resource
+ , b "strs"       T_strs
+ , b "table"      T_table
+ , b "transfer"   T_transfer
+ , b "variants"   T_variants
+ , b "where"      T_where
+ , b "with"       T_with
+ ]
+ where b s t = (s, t)
+
+unescapeInitTail :: String -> String
+unescapeInitTail = unesc . tail where
+  unesc s = case s of
+    '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
+    '\\':'n':cs  -> '\n' : unesc cs
+    '\\':'t':cs  -> '\t' : unesc cs
+    '\"':[]    -> []
+    '\'':[]    -> []
+    c:cs      -> c : unesc cs
+    _         -> []
+
+-------------------------------------------------------------------
+-- Alex wrapper code.
+-- A modified "posn" wrapper.
+-------------------------------------------------------------------
+
+data Posn = Pn {-# UNPACK #-} !Int
+               {-# UNPACK #-} !Int
+
+alexMove :: Posn -> Char -> Posn
+alexMove (Pn l c) '\n' = Pn (l+1) 1
+alexMove (Pn l c) _    = Pn l     (c+1)
+
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+alexGetByte (AI p _ s) =
+  case WBS.uncons s of
+    Nothing  -> Nothing
+    Just (w,s) ->
+             let p' = alexMove p c
+                 c = BS.w2c w
+              in p' `seq` Just (w, (AI p' c s))
+
+alexInputPrevChar :: AlexInput -> Char
+alexInputPrevChar (AI p c s) = c
+
+data AlexInput = AI {-# UNPACK #-} !Posn            -- current position,
+                    {-# UNPACK #-} !Char            -- previous char
+                    {-# UNPACK #-} !BS.ByteString   -- current input string
+
+data ParseResult a
+  = POk a
+  | PFailed Posn	-- The position of the error
+            String      -- The error message
+
+newtype P a = P { unP :: AlexInput -> ParseResult a }
+
+instance Monad P where
+  return a    = a `seq` (P $ \s -> POk a)
+  (P m) >>= k = P $ \ s -> case m s of
+                             POk a         -> unP (k a) s
+                             PFailed posn err -> PFailed posn err
+  fail msg    = P $ \(AI posn _ _) -> PFailed posn msg
+
+runP :: P a -> BS.ByteString -> Either (Posn,String) a
+runP (P f) txt =
+  case f (AI (Pn 1 0) ' ' txt) of
+    POk x           -> Right x
+    PFailed pos msg -> Left  (pos,msg)
+
+failLoc :: Posn -> String -> P a
+failLoc pos msg = P $ \_ -> PFailed pos msg
+
+lexer :: (Token -> P a) -> P a
+lexer cont = P go
+  where
+  --cont' t = trace (show t) (cont t)
+    go inp@(AI pos _ str) =
+      case alexScan inp 0 of
+        AlexEOF                -> unP (cont T_EOF) inp
+        AlexError (AI pos _ _) -> PFailed pos "lexical error"
+        AlexSkip  inp' len     -> {-trace (show len) $-} go inp'
+        AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
+
+getPosn :: P Posn
+getPosn = P $ \inp@(AI pos _ _) -> POk pos
+
+}