Quick fix to render some parser error messages from UTF-8-encoded source files correctly.

The parser works on raw byte sequences read from source files. If parsing succeeds, the raw byte sequences are converted to proper Unicode characters in a later phase. But the parser calls the function buildAnyTree, which can fail and generate error messages containing source code fragments, which might then contain raw byte sequences. To render these error messages correctly, they need to be converted in accordance with the coding flag in the source file. This is now done for UTF-8-encoded source files, but should ideally also be done for other character encodings. (Latin-1-encoded files never suffered from this problem, since raw bytes are proper Unicode characters in this case.)
2026-05-24 18:28:55 -06:00 · 2013-01-28 17:23:02 +00:00
parent 764b649959
commit c14e75706e
1 changed files with 9 additions and 1 deletions
--- a/src/compiler/GF/Grammar/Parser.y
+++ b/src/compiler/GF/Grammar/Parser.y
@@ -17,6 +17,8 @@ import GF.Grammar.Macros
 import GF.Grammar.Lexer
 import qualified Data.ByteString.Char8 as BS
 import GF.Compile.Update (buildAnyTree)
+import Codec.Binary.UTF8.String(decodeString)
+import Data.Char(toLower)
 }

 %name pModDef ModDef
@@ -116,7 +118,7 @@ ModDef
                                      jments <- mapM (checkInfoType mtype) jments
                                      defs <- case buildAnyTree id jments of
                                                Ok x    -> return x
-                                                Bad msg -> fail msg
+                                                Bad msg -> fail (optDecode opts msg)
                                      return (id, ModInfo mtype mstat opts extends with opens [] "" Nothing defs)  }

 ModHeader :: { SourceModule }
@@ -605,6 +607,12 @@ Posn
 happyError :: P a
 happyError = fail "syntax error"

+-- Quick fix to render error messages from UTF-8-encoded source files correctly: yields decodeString when the encoding flag is "utf8" or "utf-8" (compared case-insensitively), and id otherwise.
+optDecode opts =
+    if map toLower (flag optEncoding opts) `elem` ["utf8","utf-8"]
+    then decodeString
+    else id
+
 mkListId,mkConsId,mkBaseId  :: Ident -> Ident
 mkListId = prefixId (BS.pack "List")
 mkConsId = prefixId (BS.pack "Cons")