Quick fix to render some parser error messages from UTF-8-encoded source files correctly.

The parser works on raw byte sequences read from source files. If parsing
succeeds, the raw byte sequences are converted to proper Unicode characters
in a later phase. But the parser calls the function buildAnyTree, which can 
fail and generate error messages containing source code fragments, which might
then contain raw byte sequences. To render these error messages correctly,
they need to be converted in accordance with the coding flag in the source 
file. This is now done for UTF-8-encoded source files, but should ideally also
be done for other character encodings. (Latin-1-encoded files never suffered 
from this problem, since raw bytes are proper Unicode characters in this case.)
This commit is contained in:
hallgren
2013-01-28 17:23:02 +00:00
parent 764b649959
commit c14e75706e

View File

@@ -17,6 +17,8 @@ import GF.Grammar.Macros
import GF.Grammar.Lexer
import qualified Data.ByteString.Char8 as BS
import GF.Compile.Update (buildAnyTree)
import Codec.Binary.UTF8.String(decodeString)
import Data.Char(toLower)
}
%name pModDef ModDef
@@ -116,7 +118,7 @@ ModDef
jments <- mapM (checkInfoType mtype) jments
defs <- case buildAnyTree id jments of
Ok x -> return x
Bad msg -> fail msg
Bad msg -> fail (optDecode opts msg)
return (id, ModInfo mtype mstat opts extends with opens [] "" Nothing defs) }
ModHeader :: { SourceModule }
@@ -605,6 +607,12 @@ Posn
-- | Parse-failure action: aborts the 'P' computation via 'fail' with the
-- fixed message "syntax error".
-- NOTE(review): presumably this is the error hook the Happy-generated parser
-- invokes on an unexpected token -- confirm against the grammar directives.
happyError :: P a
happyError = fail "syntax error"
-- Quick fix so that error messages coming from UTF-8-encoded source files
-- are rendered correctly.
--
-- Looks up the encoding flag in the given options and compares it,
-- case-insensitively, against the accepted spellings of UTF-8. On a match
-- the result is 'decodeString'; for any other encoding the message is
-- passed through unchanged ('id').
optDecode opts
  | encoding `elem` utf8Names = decodeString
  | otherwise                 = id
  where
    -- Lower-cased value of the encoding flag, so "UTF-8" and "utf-8" agree.
    encoding  = map toLower (flag optEncoding opts)
    utf8Names = ["utf8","utf-8"]
mkListId,mkConsId,mkBaseId :: Ident -> Ident
mkListId = prefixId (BS.pack "List")
mkConsId = prefixId (BS.pack "Cons")