Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc", the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8" 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
2013-11-25 21:12:11 +00:00
parent 1ae58146cd
commit 30fc46e934
9 changed files with 157 additions and 559 deletions
--- a/gf.cabal
+++ b/gf.cabal
@@ -1,5 +1,5 @@
 name: gf
-version: 3.5-darcs
+version: 3.5.11-darcs
 cabal-version: >= 1.8
 build-type: Custom
@@ -140,8 +140,8 @@ Executable gf
  if flag(new-comp)
    cpp-options: -DNEW_COMP
-  build-tools: happy
+  build-tools: happy, alex>=3
-           --, alex>=2 && <3 -- tricky to install in Ubuntu 12.04
+
  if os(windows)
    build-depends: Win32
  else
--- a/src/compiler/GF/Compile/GetGrammar.hs
+++ b/src/compiler/GF/Compile/GetGrammar.hs
@@ -18,40 +18,61 @@ import Prelude hiding (catch)
 import GF.Data.Operations
-import GF.System.Catch
+--import GF.System.Catch
 import GF.Infra.UseIO
-import GF.Infra.Option(Options,optPreprocessors,addOptions,optEncoding,flag,renameEncoding)
+import GF.Infra.Option(Options,optPreprocessors,addOptions,renameEncoding,optEncoding,flag,defaultEncoding)
 import GF.Grammar.Lexer
 import GF.Grammar.Parser
 import GF.Grammar.Grammar
-import GF.Compile.Coding
+--import GF.Compile.Coding
 import GF.Compile.ReadFiles(parseSource,lift)
 --import GF.Text.Coding(decodeUnicodeIO)
 import qualified Data.ByteString.Char8 as BS
-import Control.Monad (foldM)
+import Data.Char(isAscii)
 import Control.Monad (foldM,when,unless)
 import System.Cmd (system)
-import System.IO(mkTextEncoding)
+--import System.IO(mkTextEncoding) --,utf8
 import System.Directory(removeFile)
 getSourceModule :: Options -> FilePath -> IOE SourceModule
-getSourceModule opts file0 = ioe $
+getSourceModule opts file0 = 
-  do tmp <- foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
+  errIn file0 $
-     content <- keepTemp tmp
+  do tmp <- lift $ foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
-     case runP pModDef content of
+     raw <- lift $ keepTemp tmp
-       Left (Pn l c,msg) -> do file <- writeTemp tmp
+   --ePutStrLn $ "1 "++file0
     (optCoding,parsed) <- parseSource opts pModDef raw
     case parsed of
       Left (Pn l c,msg) -> do file <- lift $ writeTemp tmp
                               let location = file++":"++show l++":"++show c
-                               return (Bad (location++":\n   "++msg))
+                               raise (location++":\n   "++msg)
-       Right (i,mi00) ->
+       Right (i,mi0) ->
-         do removeTemp tmp
+         do lift $ removeTemp tmp
-            let mi0 =mi00 {mflags=mflags mi00 `addOptions` opts, msrc=file0}
+            let mi =mi0 {mflags=mflags mi0 `addOptions` opts, msrc=file0}
-            mi <- transcodeModule (i,mi0)
+                optCoding' = renameEncoding `fmap` flag optEncoding (mflags mi0)
-            return (Ok mi)
+            case (optCoding,optCoding') of
-  `catch` (return . Bad . show)
+              (Nothing,Nothing) ->
                  unless (BS.all isAscii raw) $
                    ePutStrLn $ file0++":\n    Warning: default encoding has changed from Latin-1 to UTF-8"
              (_,Just coding') -> 
                  when (coding/=coding') $
                  raise $ "Encoding mismatch: "++coding++" /= "++coding'
                where coding = maybe defaultEncoding renameEncoding optCoding
              _ -> return ()
          --lift $ transcodeModule' (i,mi) -- old lexer
            return (i,mi) -- new lexer
 {-
 transcodeModule sm00 =
-  do enc <- mkTextEncoding (renameEncoding (flag optEncoding (mflags (snd sm00))))
+  do enc <- mkTextEncoding (getEncoding (mflags (snd sm00)))
     let sm = decodeStringsInModule enc sm00
     return sm
 transcodeModule' sm00 =
  do let enc = utf8
     let sm = decodeStringsInModule enc sm00
     return sm
 -}
 runPreprocessor :: Temporary -> String -> IO Temporary
 runPreprocessor tmp0 p =
--- a/src/compiler/GF/Compile/ReadFiles.hs
+++ b/src/compiler/GF/Compile/ReadFiles.hs
@@ -21,7 +21,8 @@
 module GF.Compile.ReadFiles 
           ( getAllFiles,ModName,ModEnv,importsOfModule,
             gfoFile,gfFile,isGFO,gf2gfo,
-             getOptionsFromFile) where
+             parseSource,lift,
             getOptionsFromFile,getPragmas) where
 import Prelude hiding (catch)
 import GF.System.Catch
@@ -34,6 +35,10 @@ import GF.Grammar.Parser
 import GF.Grammar.Grammar
 import GF.Grammar.Binary
 import System.IO(mkTextEncoding)
 import qualified Data.ByteString.UTF8 as UTF8
 import GF.Text.Coding(decodeUnicodeIO)
 import Control.Monad
 import Data.Maybe(isJust)
 import qualified Data.ByteString.Char8 as BS
@@ -50,7 +55,7 @@ type ModEnv  = Map.Map ModName (UTCTime,[ModName])
 -- | Returns a list of all files to be compiled in topological order i.e.
 -- the low level (leaf) modules are first.
-getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
+--getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
 getAllFiles opts ps env file = do
  -- read module headers from all files recursively
  ds <- liftM reverse $ get [] [] (justModuleName file)
@@ -117,14 +122,10 @@ getAllFiles opts ps env file = do
                                       Just mo -> return (st,importsOfModule mo)
                                       Nothing
                                         | isGFO file -> raise (file ++ " is compiled with different GF version and I can't find the source file")
-                                         | otherwise  -> do s <- liftIO $ BS.readFile file
+                                         | otherwise  -> do mo <- parseModHeader opts file
-                                                            case runP pModHeader s of
+                                                            return (CSComp,importsOfModule mo)
-                                                              Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
+                        CSComp -> do mo <- parseModHeader opts file
-                                                              Right mo          -> return (CSComp,importsOfModule mo)
+                                     return (st,importsOfModule mo)
                        CSComp -> do s <- liftIO $ BS.readFile file
                                     case runP pModHeader s of
                                       Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
                                       Right mo          -> return (st,importsOfModule mo)
      testErr (mname == name)
              ("module name" +++ mname +++ "differs from file name" +++ name)
      return (name,st,t,isJust gfTime,imps,dropFileName file)
@@ -209,17 +210,55 @@ importsOfModule (m,mi) = (modName m,depModInfo mi [])
    modName = showIdent
 parseModHeader opts file =
  do --ePutStrLn file
     (_,parsed) <- parseSource opts pModHeader =<< lift (BS.readFile file)
     case parsed of
       Right mo          -> return mo
       Left (Pn l c,msg) ->
                  raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
 parseSource opts p raw =
  do (coding,utf8) <- toUTF8 opts raw
     return (coding,runP p utf8)
 toUTF8 opts0 raw =
  do opts <- getPragmas raw
     let given = flag optEncoding opts -- explicitly given encoding
         coding = getEncoding $ opts0 `addOptions` opts
     utf8 <- if coding=="UTF-8"
             then return raw
             else lift $ do --ePutStrLn $ "toUTF8 from "++coding
                            enc <- mkTextEncoding coding
                            -- decodeUnicodeIO uses a lot of stack space,
                            -- so we need to split the file into smaller pieces
                            ls <- mapM (decodeUnicodeIO enc) (BS.lines raw)
                            return $ UTF8.fromString (unlines ls)
     return (given,utf8)
 --lift io = ioe (fmap Ok io `catch` (return . Bad . show))
 lift io = liftIO io
 -- | options can be passed to the compiler by comments in @--#@, in the main file
 getOptionsFromFile :: (MonadIO m,ErrorMonad m) => FilePath -> m Options
 getOptionsFromFile file = do
  s <- either (\_ -> raise $ "File " ++ file ++ " does not exist") return =<<
       liftIO (try $ BS.readFile file)
-  let ls = filter (BS.isPrefixOf (BS.pack "--#")) $ BS.lines s
+  opts <- getPragmas s
-      fs = map (BS.unpack . BS.unwords . BS.words . BS.drop 3) ls
+  -- The coding flag should not be inherited by other files
-  parseModuleOptions fs
+  return (addOptions opts (modifyFlags $ \ f -> f{optEncoding=Nothing}))
-getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
+
-getFilePath paths file = liftIO $ get paths
+getPragmas :: (ErrorMonad m) => BS.ByteString -> m Options
 getPragmas = parseModuleOptions . 
             map (BS.unpack . BS.unwords . BS.words . BS.drop 3) .
             filter (BS.isPrefixOf (BS.pack "--#")) . BS.lines
 --getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
 getFilePath paths file =
   liftIO $ do --ePutStrLn $ "getFilePath "++show paths++" "++show file
               get paths
  where
    get []     = return Nothing
    get (p:ps) = do
--- a/src/compiler/GF/Grammar/Lexer.hs
+++ b/src/compiler/GF/Grammar/Lexer.hs
--- a/src/compiler/GF/Grammar/lexer/Lexer.x
+++ b/src/compiler/GF/Grammar/lexer/Lexer.x
@@ -1,3 +1,4 @@
 -- -*- haskell -*-
 {
 module GF.Grammar.Lexer
         ( Token(..), Posn(..)
@@ -6,9 +7,14 @@ module GF.Grammar.Lexer
         ) where
 import GF.Infra.Ident
 --import GF.Data.Operations
 import qualified Data.ByteString.Char8 as BS
 import qualified Data.ByteString as WBS
 import qualified Data.ByteString.Internal as BS(w2c)
 import qualified Data.ByteString.UTF8 as UTF8
 import qualified Data.Map as Map
-
+import Data.Word(Word8)
 --import Debug.Trace(trace)
 }
@@ -17,7 +23,7 @@ $c = [A-Z\192-\221] # [\215]
 $s = [a-z\222-\255] # [\247]
 $d = [0-9]                -- digit
 $i = [$l $d _ ']          -- identifier character
-$u = [\0-\255]            -- universal: any character
+$u = [.\n]                -- universal: any character
@rsyms =    -- symbols and non-identifier-like reserved words
   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -27,17 +33,20 @@ $u = [\0-\255]            -- universal: any character
 "{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; 
 $white+ ;
-@rsyms                          { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
+@rsyms                          { tok (res (T_Ident . identS)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
-(\_ | $l)($l | $d | \_ | \')*   { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
+(\_ | $l)($l | $d | \_ | \')*   { tok (res (T_Ident . identS)) }
-\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
+\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
-(\-)? $d+                       { tok (T_Integer . read . BS.unpack) }
+(\-)? $d+                       { tok (T_Integer . read . unpack) }
-(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . BS.unpack) }
+(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }
 {
 --unpack = BS.unpack
 unpack = id
 tok :: (String->Token) -> Posn -> String -> Token
 tok f p s = f s
 data Token
@@ -114,15 +123,17 @@ data Token
 | T_Double  Double          -- double precision float literals
 | T_Ident   Ident
 | T_EOF
 -- deriving Show -- debug
-eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
+res = eitherResIdent
 eitherResIdent :: (String -> Token) -> String -> Token
 eitherResIdent tv s = 
  case Map.lookup s resWords of
    Just t  -> t
    Nothing -> tv s
 isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member s resWords
+isReservedWord s = Map.member (BS.unpack s) resWords
 resWords = Map.fromList
 [ b "!"  T_exclmark
@@ -194,7 +205,7 @@ resWords = Map.fromList
 , b "where"      T_where
 , b "with"       T_with
 ]
- where b s t = (BS.pack s, t)
+ where b s t = (s, t)
 unescapeInitTail :: String -> String
 unescapeInitTail = unesc . tail where
@@ -202,7 +213,7 @@ unescapeInitTail = unesc . tail where
    '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
    '\\':'n':cs  -> '\n' : unesc cs
    '\\':'t':cs  -> '\t' : unesc cs
-    '"':[]    -> []
+    '\"':[]    -> []
    '\'':[]    -> []
    c:cs      -> c : unesc cs
    _         -> []
@@ -219,13 +230,14 @@ alexMove :: Posn -> Char -> Posn
 alexMove (Pn l c) '\n' = Pn (l+1) 1
 alexMove (Pn l c) _    = Pn l     (c+1)
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetChar (AI p _ s) =
+alexGetByte (AI p _ s) =
-  case BS.uncons s of
+  case WBS.uncons s of
    Nothing  -> Nothing
-    Just (c,s) ->
+    Just (w,s) ->
             let p' = alexMove p c
-              in p' `seq` Just (c, (AI p' c s))
+                 c = BS.w2c w
              in p' `seq` Just (w, (AI p' c s))
 alexInputPrevChar :: AlexInput -> Char
 alexInputPrevChar (AI p c s) = c
@@ -260,12 +272,13 @@ failLoc pos msg = P $ \_ -> PFailed pos msg
 lexer :: (Token -> P a) -> P a
 lexer cont = P go
  where
  --cont' t = trace (show t) (cont t)
    go inp@(AI pos _ str) =
      case alexScan inp 0 of
        AlexEOF                -> unP (cont T_EOF) inp
        AlexError (AI pos _ _) -> PFailed pos "lexical error"
-        AlexSkip  inp' len     -> go inp'
+        AlexSkip  inp' len     -> {-trace (show len) $-} go inp'
-        AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
+        AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
 getPosn :: P Posn
 getPosn = P $ \inp@(AI pos _ _) -> POk pos
--- a/src/compiler/GF/Grammar/Parser.y
+++ b/src/compiler/GF/Grammar/Parser.y
@@ -17,8 +17,8 @@ import GF.Grammar.Grammar
 import GF.Grammar.Macros
 import GF.Grammar.Lexer
 import GF.Compile.Update (buildAnyTree)
-import Codec.Binary.UTF8.String(decodeString)
+--import Codec.Binary.UTF8.String(decodeString)
-import Data.Char(toLower)
+--import Data.Char(toLower)
 }
 %name pModDef ModDef
@@ -616,9 +616,9 @@ happyError = fail "syntax error"
 -- Quick fix to render error messages from UTF-8-encoded source files correctly.
 optDecode opts =
-    if map toLower (flag optEncoding opts) `elem` ["utf8","utf-8"]
+    {-if map toLower (getEncoding opts) `elem` ["utf8","utf-8"]
    then decodeString
-    else id
+    else-} id
 mkListId,mkConsId,mkBaseId  :: Ident -> Ident
 mkListId = prefixIdent "List"
--- a/src/compiler/GF/Infra/Option.hs
+++ b/src/compiler/GF/Infra/Option.hs
@@ -20,7 +20,7 @@ module GF.Infra.Option
     helpMessage,
     -- * Checking specific options
     flag, cfgTransform, haskellOption, readOutputFormat,
-     isLexicalCat, isLiteralCat, renameEncoding,
+     isLexicalCat, isLiteralCat, renameEncoding, getEncoding, defaultEncoding,
     -- * Setting specific options
     setOptimization, setCFGTransform,
     -- * Convenience methods for checking options    
@@ -157,7 +157,7 @@ data Flags = Flags {
      optRetainResource  :: Bool,
      optName            :: Maybe String,
      optPreprocessors   :: [String],
-      optEncoding        :: String,
+      optEncoding        :: Maybe String,
      optPMCFG           :: Bool,
      optOptimizations   :: Set Optimization,
      optOptimizePGF     :: Bool,
@@ -213,7 +213,7 @@ fixRelativeLibPaths curr_dir lib_dir (Options o) = Options (fixPathFlags . o)
 -- | Pretty-print the options that are preserved in .gfo files.
 optionsGFO :: Options -> [(String,Literal)]
 optionsGFO opts = optionsPGF opts
-      ++ [("coding", LStr (flag optEncoding opts))]
+      ++ [("coding", LStr (getEncoding opts))]
 -- | Pretty-print the options that are preserved in .pgf files.
 optionsPGF :: Options -> [(String,Literal)]
@@ -241,6 +241,10 @@ concatOptions = foldr addOptions noOptions
 modifyFlags :: (Flags -> Flags) -> Options
 modifyFlags = Options
 getEncoding :: Options -> String
 getEncoding = renameEncoding . maybe defaultEncoding id . flag optEncoding
 defaultEncoding = "UTF-8"
 -- Default options
 defaultFlags :: Flags
@@ -264,7 +268,7 @@ defaultFlags = Flags {
      optName            = Nothing,
      optPreprocessors   = [],
-      optEncoding        = "latin1",
+      optEncoding        = Nothing,
      optPMCFG           = True,
      optOptimizations   = Set.fromList [OptStem,OptCSE,OptExpand,OptParametrize],
      optOptimizePGF     = False,
@@ -419,7 +423,7 @@ optDescr =
       addLibDir   x = set $ \o -> o { optLibraryPath = x:optLibraryPath o }
       setLibPath  x = set $ \o -> o { optLibraryPath = splitInModuleSearchPath x }
       preproc     x = set $ \o -> o { optPreprocessors = optPreprocessors o ++ [x] }
-       coding      x = set $ \o -> o { optEncoding = x }
+       coding      x = set $ \o -> o { optEncoding = Just x }
       startcat    x = set $ \o -> o { optStartCat = Just x }
       language    x = set $ \o -> o { optSpeechLanguage = Just x }
       lexer       x = set $ \o -> o { optLexer = Just x }
--- a/src/compiler/GF/Text/Coding.hs
+++ b/src/compiler/GF/Text/Coding.hs
@@ -31,7 +31,7 @@ encodeUnicode enc s =
                       (cbuf,bbuf) <- cod cbuf bbuf
 #endif
                       if isEmptyBuffer bbuf
-                         then ioe_invalidCharacter
+                         then ioe_invalidCharacter1
                         else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
                                 bss <- translate cod cbuf
                                 return (bs:bss)
@@ -41,8 +41,9 @@ encodeUnicode enc s =
        w = bufR cbuf
 decodeUnicode :: TextEncoding -> ByteString -> String
-decodeUnicode enc (PS fptr l len) =
+decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
-  unsafePerformIO $ do
+
 decodeUnicodeIO enc (PS fptr l len) = do
    let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
    cbuf <- newCharBuffer 128 WriteBuffer
    case enc of
@@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) =
                       (bbuf,cbuf) <- cod bbuf cbuf
 #endif
                       if isEmptyBuffer cbuf
-                         then ioe_invalidCharacter
+                         then ioe_invalidCharacter2
                         else unpack cod bbuf cbuf
      | otherwise = return []
      where
@@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) =
        i = bufL cbuf
        w = bufR cbuf
-ioe_invalidCharacter = ioException
+ioe_invalidCharacter1 = ioException
   (IOError Nothing InvalidArgument ""
        ("invalid byte sequence for this encoding") Nothing Nothing)
 ioe_invalidCharacter2 = ioException
   (IOError Nothing InvalidArgument ""
        ("invalid byte sequence for this decoding") Nothing Nothing)
--- a/src/compiler/GFC.hs
+++ b/src/compiler/GFC.hs
@@ -24,7 +24,7 @@ import qualified Data.ByteString.Lazy as BSL
 import System.FilePath
 import System.IO
 import Control.Exception
-
+import Control.Monad(unless)
 mainGFC :: Options -> [FilePath] -> IO ()
 mainGFC opts fs = do
@@ -46,9 +46,8 @@ compileSourceFiles :: Options -> [FilePath] -> IOE ()
 compileSourceFiles opts fs = 
    do gr <- batchCompile opts fs
       let cnc = justModuleName (last fs)
-       if flag optStopAfterPhase opts == Compile 
+       unless (flag optStopAfterPhase opts == Compile) $
-         then return ()
+              do pgf <- link opts (identS cnc) gr
         else do pgf <- link opts (identS cnc) gr
                 writePGF opts pgf
                 writeByteCode opts pgf
                 writeOutputs opts pgf
@@ -57,11 +56,9 @@ compileCFFiles :: Options -> [FilePath] -> IOE ()
 compileCFFiles opts fs = 
    do s  <- liftIO $ fmap unlines $ mapM readFile fs
       let cnc = justModuleName (last fs)
-       gf <- getCF cnc s
+       gr <- compileSourceGrammar opts =<< getCF cnc s
-       gr <- compileSourceGrammar opts gf
+       unless (flag optStopAfterPhase opts == Compile) $
-       if flag optStopAfterPhase opts == Compile 
+              do pgf <- link opts (identS cnc) gr
         then return ()
         else do pgf <- link opts (identS cnc) gr
                 writePGF opts pgf
                 writeOutputs opts pgf