1
0
forked from GitHub/gf-core

Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8.

2. Alternate encodings should be specified as "--# -coding=enc", the old
   "flags coding=enc" declarations have no effect but are still checked for
   consistency.

3. A transitional warning is generated for files that contain non-ASCII
   characters without specifying a character encoding:

	"Warning: default encoding has changed from Latin-1 to UTF-8"

4. Conversion to Unicode is now done *before* lexing. This makes it possible
   to allow arbitrary Unicode characters in identifiers. But identifiers are
   still stored as ByteStrings, so they are limited to Latin-1 characters
   for now.

5. Lexer.hs is no longer part of the repository. We now generate the lexer
   from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
   needed. These bugs might already be fixed in newer versions of alex, but
   we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
hallgren
2013-11-25 21:12:11 +00:00
parent 1ae58146cd
commit 30fc46e934
9 changed files with 157 additions and 559 deletions

View File

@@ -1,5 +1,5 @@
name: gf
version: 3.5-darcs
version: 3.5.11-darcs
cabal-version: >= 1.8
build-type: Custom
@@ -140,8 +140,8 @@ Executable gf
if flag(new-comp)
cpp-options: -DNEW_COMP
build-tools: happy
--, alex>=2 && <3 -- tricky to install in Ubuntu 12.04
build-tools: happy, alex>=3
if os(windows)
build-depends: Win32
else

View File

@@ -18,40 +18,61 @@ import Prelude hiding (catch)
import GF.Data.Operations
import GF.System.Catch
--import GF.System.Catch
import GF.Infra.UseIO
import GF.Infra.Option(Options,optPreprocessors,addOptions,optEncoding,flag,renameEncoding)
import GF.Infra.Option(Options,optPreprocessors,addOptions,renameEncoding,optEncoding,flag,defaultEncoding)
import GF.Grammar.Lexer
import GF.Grammar.Parser
import GF.Grammar.Grammar
import GF.Compile.Coding
--import GF.Compile.Coding
import GF.Compile.ReadFiles(parseSource,lift)
--import GF.Text.Coding(decodeUnicodeIO)
import qualified Data.ByteString.Char8 as BS
import Control.Monad (foldM)
import Data.Char(isAscii)
import Control.Monad (foldM,when,unless)
import System.Cmd (system)
import System.IO(mkTextEncoding)
--import System.IO(mkTextEncoding) --,utf8
import System.Directory(removeFile)
getSourceModule :: Options -> FilePath -> IOE SourceModule
getSourceModule opts file0 = ioe $
do tmp <- foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
content <- keepTemp tmp
case runP pModDef content of
Left (Pn l c,msg) -> do file <- writeTemp tmp
getSourceModule opts file0 =
errIn file0 $
do tmp <- lift $ foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
raw <- lift $ keepTemp tmp
--ePutStrLn $ "1 "++file0
(optCoding,parsed) <- parseSource opts pModDef raw
case parsed of
Left (Pn l c,msg) -> do file <- lift $ writeTemp tmp
let location = file++":"++show l++":"++show c
return (Bad (location++":\n "++msg))
Right (i,mi00) ->
do removeTemp tmp
let mi0 =mi00 {mflags=mflags mi00 `addOptions` opts, msrc=file0}
mi <- transcodeModule (i,mi0)
return (Ok mi)
`catch` (return . Bad . show)
raise (location++":\n "++msg)
Right (i,mi0) ->
do lift $ removeTemp tmp
let mi =mi0 {mflags=mflags mi0 `addOptions` opts, msrc=file0}
optCoding' = renameEncoding `fmap` flag optEncoding (mflags mi0)
case (optCoding,optCoding') of
(Nothing,Nothing) ->
unless (BS.all isAscii raw) $
ePutStrLn $ file0++":\n Warning: default encoding has changed from Latin-1 to UTF-8"
(_,Just coding') ->
when (coding/=coding') $
raise $ "Encoding mismatch: "++coding++" /= "++coding'
where coding = maybe defaultEncoding renameEncoding optCoding
_ -> return ()
--lift $ transcodeModule' (i,mi) -- old lexer
return (i,mi) -- new lexer
{-
transcodeModule sm00 =
do enc <- mkTextEncoding (renameEncoding (flag optEncoding (mflags (snd sm00))))
do enc <- mkTextEncoding (getEncoding (mflags (snd sm00)))
let sm = decodeStringsInModule enc sm00
return sm
transcodeModule' sm00 =
do let enc = utf8
let sm = decodeStringsInModule enc sm00
return sm
-}
runPreprocessor :: Temporary -> String -> IO Temporary
runPreprocessor tmp0 p =

View File

@@ -21,7 +21,8 @@
module GF.Compile.ReadFiles
( getAllFiles,ModName,ModEnv,importsOfModule,
gfoFile,gfFile,isGFO,gf2gfo,
getOptionsFromFile) where
parseSource,lift,
getOptionsFromFile,getPragmas) where
import Prelude hiding (catch)
import GF.System.Catch
@@ -34,6 +35,10 @@ import GF.Grammar.Parser
import GF.Grammar.Grammar
import GF.Grammar.Binary
import System.IO(mkTextEncoding)
import qualified Data.ByteString.UTF8 as UTF8
import GF.Text.Coding(decodeUnicodeIO)
import Control.Monad
import Data.Maybe(isJust)
import qualified Data.ByteString.Char8 as BS
@@ -50,7 +55,7 @@ type ModEnv = Map.Map ModName (UTCTime,[ModName])
-- | Returns a list of all files to be compiled in topological order i.e.
-- the low level (leaf) modules are first.
getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
--getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
getAllFiles opts ps env file = do
-- read module headers from all files recursively
ds <- liftM reverse $ get [] [] (justModuleName file)
@@ -117,14 +122,10 @@ getAllFiles opts ps env file = do
Just mo -> return (st,importsOfModule mo)
Nothing
| isGFO file -> raise (file ++ " is compiled with different GF version and I can't find the source file")
| otherwise -> do s <- liftIO $ BS.readFile file
case runP pModHeader s of
Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
Right mo -> return (CSComp,importsOfModule mo)
CSComp -> do s <- liftIO $ BS.readFile file
case runP pModHeader s of
Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
Right mo -> return (st,importsOfModule mo)
| otherwise -> do mo <- parseModHeader opts file
return (CSComp,importsOfModule mo)
CSComp -> do mo <- parseModHeader opts file
return (st,importsOfModule mo)
testErr (mname == name)
("module name" +++ mname +++ "differs from file name" +++ name)
return (name,st,t,isJust gfTime,imps,dropFileName file)
@@ -209,17 +210,55 @@ importsOfModule (m,mi) = (modName m,depModInfo mi [])
modName = showIdent
parseModHeader opts file =
do --ePutStrLn file
(_,parsed) <- parseSource opts pModHeader =<< lift (BS.readFile file)
case parsed of
Right mo -> return mo
Left (Pn l c,msg) ->
raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
parseSource opts p raw =
do (coding,utf8) <- toUTF8 opts raw
return (coding,runP p utf8)
toUTF8 opts0 raw =
do opts <- getPragmas raw
let given = flag optEncoding opts -- explicitly given encoding
coding = getEncoding $ opts0 `addOptions` opts
utf8 <- if coding=="UTF-8"
then return raw
else lift $ do --ePutStrLn $ "toUTF8 from "++coding
enc <- mkTextEncoding coding
-- decodeUnicodeIO uses a lot of stack space,
-- so we need to split the file into smaller pieces
ls <- mapM (decodeUnicodeIO enc) (BS.lines raw)
return $ UTF8.fromString (unlines ls)
return (given,utf8)
--lift io = ioe (fmap Ok io `catch` (return . Bad . show))
lift io = liftIO io
-- | options can be passed to the compiler by comments in @--#@, in the main file
getOptionsFromFile :: (MonadIO m,ErrorMonad m) => FilePath -> m Options
getOptionsFromFile file = do
s <- either (\_ -> raise $ "File " ++ file ++ " does not exist") return =<<
liftIO (try $ BS.readFile file)
let ls = filter (BS.isPrefixOf (BS.pack "--#")) $ BS.lines s
fs = map (BS.unpack . BS.unwords . BS.words . BS.drop 3) ls
parseModuleOptions fs
opts <- getPragmas s
-- The coding flag should not be inherited by other files
return (addOptions opts (modifyFlags $ \ f -> f{optEncoding=Nothing}))
getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
getFilePath paths file = liftIO $ get paths
getPragmas :: (ErrorMonad m) => BS.ByteString -> m Options
getPragmas = parseModuleOptions .
map (BS.unpack . BS.unwords . BS.words . BS.drop 3) .
filter (BS.isPrefixOf (BS.pack "--#")) . BS.lines
--getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
getFilePath paths file =
liftIO $ do --ePutStrLn $ "getFilePath "++show paths++" "++show file
get paths
where
get [] = return Nothing
get (p:ps) = do

File diff suppressed because one or more lines are too long

View File

@@ -1,3 +1,4 @@
-- -*- haskell -*-
{
module GF.Grammar.Lexer
( Token(..), Posn(..)
@@ -6,9 +7,14 @@ module GF.Grammar.Lexer
) where
import GF.Infra.Ident
--import GF.Data.Operations
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as WBS
import qualified Data.ByteString.Internal as BS(w2c)
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Map as Map
import Data.Word(Word8)
--import Debug.Trace(trace)
}
@@ -17,7 +23,7 @@ $c = [A-Z\192-\221] # [\215]
$s = [a-z\222-\255] # [\247]
$d = [0-9] -- digit
$i = [$l $d _ '] -- identifier character
$u = [\0-\255] -- universal: any character
$u = [.\n] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -27,17 +33,20 @@ $u = [\0-\255] -- universal: any character
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ;
@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
@rsyms { tok (res (T_Ident . identS)) }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
(\-)? $d+ { tok (T_Integer . read . BS.unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . BS.unpack) }
(\-)? $d+ { tok (T_Integer . read . unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{
--unpack = BS.unpack
unpack = id
tok :: (String->Token) -> Posn -> String -> Token
tok f p s = f s
data Token
@@ -114,15 +123,17 @@ data Token
| T_Double Double -- double precision float literals
| T_Ident Ident
| T_EOF
-- deriving Show -- debug
eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
res = eitherResIdent
eitherResIdent :: (String -> Token) -> String -> Token
eitherResIdent tv s =
case Map.lookup s resWords of
Just t -> t
Nothing -> tv s
isReservedWord :: BS.ByteString -> Bool
isReservedWord s = Map.member s resWords
isReservedWord s = Map.member (BS.unpack s) resWords
resWords = Map.fromList
[ b "!" T_exclmark
@@ -194,7 +205,7 @@ resWords = Map.fromList
, b "where" T_where
, b "with" T_with
]
where b s t = (BS.pack s, t)
where b s t = (s, t)
unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where
@@ -202,7 +213,7 @@ unescapeInitTail = unesc . tail where
'\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
'\\':'n':cs -> '\n' : unesc cs
'\\':'t':cs -> '\t' : unesc cs
'"':[] -> []
'\"':[] -> []
'\'':[] -> []
c:cs -> c : unesc cs
_ -> []
@@ -219,13 +230,14 @@ alexMove :: Posn -> Char -> Posn
alexMove (Pn l c) '\n' = Pn (l+1) 1
alexMove (Pn l c) _ = Pn l (c+1)
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (AI p _ s) =
case BS.uncons s of
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (AI p _ s) =
case WBS.uncons s of
Nothing -> Nothing
Just (c,s) ->
Just (w,s) ->
let p' = alexMove p c
in p' `seq` Just (c, (AI p' c s))
c = BS.w2c w
in p' `seq` Just (w, (AI p' c s))
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI p c s) = c
@@ -260,12 +272,13 @@ failLoc pos msg = P $ \_ -> PFailed pos msg
lexer :: (Token -> P a) -> P a
lexer cont = P go
where
--cont' t = trace (show t) (cont t)
go inp@(AI pos _ str) =
case alexScan inp 0 of
AlexEOF -> unP (cont T_EOF) inp
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> go inp'
AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
AlexSkip inp' len -> {-trace (show len) $-} go inp'
AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
getPosn :: P Posn
getPosn = P $ \inp@(AI pos _ _) -> POk pos

View File

@@ -17,8 +17,8 @@ import GF.Grammar.Grammar
import GF.Grammar.Macros
import GF.Grammar.Lexer
import GF.Compile.Update (buildAnyTree)
import Codec.Binary.UTF8.String(decodeString)
import Data.Char(toLower)
--import Codec.Binary.UTF8.String(decodeString)
--import Data.Char(toLower)
}
%name pModDef ModDef
@@ -616,9 +616,9 @@ happyError = fail "syntax error"
-- Quick fix to render error messages from UTF-8-encoded source files correctly.
optDecode opts =
if map toLower (flag optEncoding opts) `elem` ["utf8","utf-8"]
{-if map toLower (getEncoding opts) `elem` ["utf8","utf-8"]
then decodeString
else id
else-} id
mkListId,mkConsId,mkBaseId :: Ident -> Ident
mkListId = prefixIdent "List"

View File

@@ -20,7 +20,7 @@ module GF.Infra.Option
helpMessage,
-- * Checking specific options
flag, cfgTransform, haskellOption, readOutputFormat,
isLexicalCat, isLiteralCat, renameEncoding,
isLexicalCat, isLiteralCat, renameEncoding, getEncoding, defaultEncoding,
-- * Setting specific options
setOptimization, setCFGTransform,
-- * Convenience methods for checking options
@@ -157,7 +157,7 @@ data Flags = Flags {
optRetainResource :: Bool,
optName :: Maybe String,
optPreprocessors :: [String],
optEncoding :: String,
optEncoding :: Maybe String,
optPMCFG :: Bool,
optOptimizations :: Set Optimization,
optOptimizePGF :: Bool,
@@ -213,7 +213,7 @@ fixRelativeLibPaths curr_dir lib_dir (Options o) = Options (fixPathFlags . o)
-- | Pretty-print the options that are preserved in .gfo files.
optionsGFO :: Options -> [(String,Literal)]
optionsGFO opts = optionsPGF opts
++ [("coding", LStr (flag optEncoding opts))]
++ [("coding", LStr (getEncoding opts))]
-- | Pretty-print the options that are preserved in .pgf files.
optionsPGF :: Options -> [(String,Literal)]
@@ -241,6 +241,10 @@ concatOptions = foldr addOptions noOptions
modifyFlags :: (Flags -> Flags) -> Options
modifyFlags = Options
getEncoding :: Options -> String
getEncoding = renameEncoding . maybe defaultEncoding id . flag optEncoding
defaultEncoding = "UTF-8"
-- Default options
defaultFlags :: Flags
@@ -264,7 +268,7 @@ defaultFlags = Flags {
optName = Nothing,
optPreprocessors = [],
optEncoding = "latin1",
optEncoding = Nothing,
optPMCFG = True,
optOptimizations = Set.fromList [OptStem,OptCSE,OptExpand,OptParametrize],
optOptimizePGF = False,
@@ -419,7 +423,7 @@ optDescr =
addLibDir x = set $ \o -> o { optLibraryPath = x:optLibraryPath o }
setLibPath x = set $ \o -> o { optLibraryPath = splitInModuleSearchPath x }
preproc x = set $ \o -> o { optPreprocessors = optPreprocessors o ++ [x] }
coding x = set $ \o -> o { optEncoding = x }
coding x = set $ \o -> o { optEncoding = Just x }
startcat x = set $ \o -> o { optStartCat = Just x }
language x = set $ \o -> o { optSpeechLanguage = Just x }
lexer x = set $ \o -> o { optLexer = Just x }

View File

@@ -31,7 +31,7 @@ encodeUnicode enc s =
(cbuf,bbuf) <- cod cbuf bbuf
#endif
if isEmptyBuffer bbuf
then ioe_invalidCharacter
then ioe_invalidCharacter1
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
bss <- translate cod cbuf
return (bs:bss)
@@ -41,8 +41,9 @@ encodeUnicode enc s =
w = bufR cbuf
decodeUnicode :: TextEncoding -> ByteString -> String
decodeUnicode enc (PS fptr l len) =
unsafePerformIO $ do
decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
decodeUnicodeIO enc (PS fptr l len) = do
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
cbuf <- newCharBuffer 128 WriteBuffer
case enc of
@@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) =
(bbuf,cbuf) <- cod bbuf cbuf
#endif
if isEmptyBuffer cbuf
then ioe_invalidCharacter
then ioe_invalidCharacter2
else unpack cod bbuf cbuf
| otherwise = return []
where
@@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) =
i = bufL cbuf
w = bufR cbuf
ioe_invalidCharacter = ioException
ioe_invalidCharacter1 = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this encoding") Nothing Nothing)
ioe_invalidCharacter2 = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this decoding") Nothing Nothing)

View File

@@ -24,7 +24,7 @@ import qualified Data.ByteString.Lazy as BSL
import System.FilePath
import System.IO
import Control.Exception
import Control.Monad(unless)
mainGFC :: Options -> [FilePath] -> IO ()
mainGFC opts fs = do
@@ -46,9 +46,8 @@ compileSourceFiles :: Options -> [FilePath] -> IOE ()
compileSourceFiles opts fs =
do gr <- batchCompile opts fs
let cnc = justModuleName (last fs)
if flag optStopAfterPhase opts == Compile
then return ()
else do pgf <- link opts (identS cnc) gr
unless (flag optStopAfterPhase opts == Compile) $
do pgf <- link opts (identS cnc) gr
writePGF opts pgf
writeByteCode opts pgf
writeOutputs opts pgf
@@ -57,11 +56,9 @@ compileCFFiles :: Options -> [FilePath] -> IOE ()
compileCFFiles opts fs =
do s <- liftIO $ fmap unlines $ mapM readFile fs
let cnc = justModuleName (last fs)
gf <- getCF cnc s
gr <- compileSourceGrammar opts gf
if flag optStopAfterPhase opts == Compile
then return ()
else do pgf <- link opts (identS cnc) gr
gr <- compileSourceGrammar opts =<< getCF cnc s
unless (flag optStopAfterPhase opts == Compile) $
do pgf <- link opts (identS cnc) gr
writePGF opts pgf
writeOutputs opts pgf