forked from GitHub/gf-core
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc", the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8" 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
@@ -21,7 +21,8 @@
|
||||
module GF.Compile.ReadFiles
|
||||
( getAllFiles,ModName,ModEnv,importsOfModule,
|
||||
gfoFile,gfFile,isGFO,gf2gfo,
|
||||
getOptionsFromFile) where
|
||||
parseSource,lift,
|
||||
getOptionsFromFile,getPragmas) where
|
||||
|
||||
import Prelude hiding (catch)
|
||||
import GF.System.Catch
|
||||
@@ -34,6 +35,10 @@ import GF.Grammar.Parser
|
||||
import GF.Grammar.Grammar
|
||||
import GF.Grammar.Binary
|
||||
|
||||
import System.IO(mkTextEncoding)
|
||||
import qualified Data.ByteString.UTF8 as UTF8
|
||||
import GF.Text.Coding(decodeUnicodeIO)
|
||||
|
||||
import Control.Monad
|
||||
import Data.Maybe(isJust)
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
@@ -50,7 +55,7 @@ type ModEnv = Map.Map ModName (UTCTime,[ModName])
|
||||
|
||||
-- | Returns a list of all files to be compiled in topological order i.e.
|
||||
-- the low level (leaf) modules are first.
|
||||
getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
|
||||
--getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
|
||||
getAllFiles opts ps env file = do
|
||||
-- read module headers from all files recursively
|
||||
ds <- liftM reverse $ get [] [] (justModuleName file)
|
||||
@@ -117,14 +122,10 @@ getAllFiles opts ps env file = do
|
||||
Just mo -> return (st,importsOfModule mo)
|
||||
Nothing
|
||||
| isGFO file -> raise (file ++ " is compiled with different GF version and I can't find the source file")
|
||||
| otherwise -> do s <- liftIO $ BS.readFile file
|
||||
case runP pModHeader s of
|
||||
Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
|
||||
Right mo -> return (CSComp,importsOfModule mo)
|
||||
CSComp -> do s <- liftIO $ BS.readFile file
|
||||
case runP pModHeader s of
|
||||
Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
|
||||
Right mo -> return (st,importsOfModule mo)
|
||||
| otherwise -> do mo <- parseModHeader opts file
|
||||
return (CSComp,importsOfModule mo)
|
||||
CSComp -> do mo <- parseModHeader opts file
|
||||
return (st,importsOfModule mo)
|
||||
testErr (mname == name)
|
||||
("module name" +++ mname +++ "differs from file name" +++ name)
|
||||
return (name,st,t,isJust gfTime,imps,dropFileName file)
|
||||
@@ -209,17 +210,55 @@ importsOfModule (m,mi) = (modName m,depModInfo mi [])
|
||||
|
||||
modName = showIdent
|
||||
|
||||
|
||||
parseModHeader opts file =
|
||||
do --ePutStrLn file
|
||||
(_,parsed) <- parseSource opts pModHeader =<< lift (BS.readFile file)
|
||||
case parsed of
|
||||
Right mo -> return mo
|
||||
Left (Pn l c,msg) ->
|
||||
raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
|
||||
|
||||
parseSource opts p raw =
|
||||
do (coding,utf8) <- toUTF8 opts raw
|
||||
return (coding,runP p utf8)
|
||||
|
||||
toUTF8 opts0 raw =
|
||||
do opts <- getPragmas raw
|
||||
let given = flag optEncoding opts -- explicitly given encoding
|
||||
coding = getEncoding $ opts0 `addOptions` opts
|
||||
utf8 <- if coding=="UTF-8"
|
||||
then return raw
|
||||
else lift $ do --ePutStrLn $ "toUTF8 from "++coding
|
||||
enc <- mkTextEncoding coding
|
||||
-- decodeUnicodeIO uses a lot of stack space,
|
||||
-- so we need to split the file into smaller pieces
|
||||
ls <- mapM (decodeUnicodeIO enc) (BS.lines raw)
|
||||
return $ UTF8.fromString (unlines ls)
|
||||
return (given,utf8)
|
||||
|
||||
--lift io = ioe (fmap Ok io `catch` (return . Bad . show))
|
||||
lift io = liftIO io
|
||||
|
||||
-- | options can be passed to the compiler by comments in @--#@, in the main file
|
||||
getOptionsFromFile :: (MonadIO m,ErrorMonad m) => FilePath -> m Options
|
||||
getOptionsFromFile file = do
|
||||
s <- either (\_ -> raise $ "File " ++ file ++ " does not exist") return =<<
|
||||
liftIO (try $ BS.readFile file)
|
||||
let ls = filter (BS.isPrefixOf (BS.pack "--#")) $ BS.lines s
|
||||
fs = map (BS.unpack . BS.unwords . BS.words . BS.drop 3) ls
|
||||
parseModuleOptions fs
|
||||
opts <- getPragmas s
|
||||
-- The coding flag should not be inherited by other files
|
||||
return (addOptions opts (modifyFlags $ \ f -> f{optEncoding=Nothing}))
|
||||
|
||||
getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
|
||||
getFilePath paths file = liftIO $ get paths
|
||||
|
||||
getPragmas :: (ErrorMonad m) => BS.ByteString -> m Options
|
||||
getPragmas = parseModuleOptions .
|
||||
map (BS.unpack . BS.unwords . BS.words . BS.drop 3) .
|
||||
filter (BS.isPrefixOf (BS.pack "--#")) . BS.lines
|
||||
|
||||
--getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
|
||||
getFilePath paths file =
|
||||
liftIO $ do --ePutStrLn $ "getFilePath "++show paths++" "++show file
|
||||
get paths
|
||||
where
|
||||
get [] = return Nothing
|
||||
get (p:ps) = do
|
||||
|
||||
Reference in New Issue
Block a user