mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-28 22:12:51 -06:00
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8.
2. Alternate encodings should be specified as "--# -coding=enc"; the old "flags coding=enc" declarations have no effect but are still checked for consistency.
3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8".
4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now.
5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
@@ -18,40 +18,61 @@ import Prelude hiding (catch)
|
||||
|
||||
import GF.Data.Operations
|
||||
|
||||
import GF.System.Catch
|
||||
--import GF.System.Catch
|
||||
import GF.Infra.UseIO
|
||||
import GF.Infra.Option(Options,optPreprocessors,addOptions,optEncoding,flag,renameEncoding)
|
||||
import GF.Infra.Option(Options,optPreprocessors,addOptions,renameEncoding,optEncoding,flag,defaultEncoding)
|
||||
import GF.Grammar.Lexer
|
||||
import GF.Grammar.Parser
|
||||
import GF.Grammar.Grammar
|
||||
import GF.Compile.Coding
|
||||
--import GF.Compile.Coding
|
||||
import GF.Compile.ReadFiles(parseSource,lift)
|
||||
--import GF.Text.Coding(decodeUnicodeIO)
|
||||
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
import Control.Monad (foldM)
|
||||
import Data.Char(isAscii)
|
||||
import Control.Monad (foldM,when,unless)
|
||||
import System.Cmd (system)
|
||||
import System.IO(mkTextEncoding)
|
||||
--import System.IO(mkTextEncoding) --,utf8
|
||||
import System.Directory(removeFile)
|
||||
|
||||
getSourceModule :: Options -> FilePath -> IOE SourceModule
|
||||
getSourceModule opts file0 = ioe $
|
||||
do tmp <- foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
|
||||
content <- keepTemp tmp
|
||||
case runP pModDef content of
|
||||
Left (Pn l c,msg) -> do file <- writeTemp tmp
|
||||
getSourceModule opts file0 =
|
||||
errIn file0 $
|
||||
do tmp <- lift $ foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
|
||||
raw <- lift $ keepTemp tmp
|
||||
--ePutStrLn $ "1 "++file0
|
||||
(optCoding,parsed) <- parseSource opts pModDef raw
|
||||
case parsed of
|
||||
Left (Pn l c,msg) -> do file <- lift $ writeTemp tmp
|
||||
let location = file++":"++show l++":"++show c
|
||||
return (Bad (location++":\n "++msg))
|
||||
Right (i,mi00) ->
|
||||
do removeTemp tmp
|
||||
let mi0 =mi00 {mflags=mflags mi00 `addOptions` opts, msrc=file0}
|
||||
mi <- transcodeModule (i,mi0)
|
||||
return (Ok mi)
|
||||
`catch` (return . Bad . show)
|
||||
raise (location++":\n "++msg)
|
||||
Right (i,mi0) ->
|
||||
do lift $ removeTemp tmp
|
||||
let mi =mi0 {mflags=mflags mi0 `addOptions` opts, msrc=file0}
|
||||
optCoding' = renameEncoding `fmap` flag optEncoding (mflags mi0)
|
||||
case (optCoding,optCoding') of
|
||||
(Nothing,Nothing) ->
|
||||
unless (BS.all isAscii raw) $
|
||||
ePutStrLn $ file0++":\n Warning: default encoding has changed from Latin-1 to UTF-8"
|
||||
(_,Just coding') ->
|
||||
when (coding/=coding') $
|
||||
raise $ "Encoding mismatch: "++coding++" /= "++coding'
|
||||
where coding = maybe defaultEncoding renameEncoding optCoding
|
||||
_ -> return ()
|
||||
--lift $ transcodeModule' (i,mi) -- old lexer
|
||||
return (i,mi) -- new lexer
|
||||
|
||||
{-
|
||||
transcodeModule sm00 =
|
||||
do enc <- mkTextEncoding (renameEncoding (flag optEncoding (mflags (snd sm00))))
|
||||
do enc <- mkTextEncoding (getEncoding (mflags (snd sm00)))
|
||||
let sm = decodeStringsInModule enc sm00
|
||||
return sm
|
||||
|
||||
transcodeModule' sm00 =
|
||||
do let enc = utf8
|
||||
let sm = decodeStringsInModule enc sm00
|
||||
return sm
|
||||
-}
|
||||
|
||||
runPreprocessor :: Temporary -> String -> IO Temporary
|
||||
runPreprocessor tmp0 p =
|
||||
|
||||
Reference in New Issue
Block a user