forked from GitHub/gf-core
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8.
2. Alternate encodings should be specified as "--# -coding=enc"; the old "flags coding=enc" declarations have no effect but are still checked for consistency.
3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8"
4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now.
5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
This commit is contained in:
@@ -31,7 +31,7 @@ encodeUnicode enc s =
|
||||
(cbuf,bbuf) <- cod cbuf bbuf
|
||||
#endif
|
||||
if isEmptyBuffer bbuf
|
||||
then ioe_invalidCharacter
|
||||
then ioe_invalidCharacter1
|
||||
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
|
||||
bss <- translate cod cbuf
|
||||
return (bs:bss)
|
||||
@@ -41,8 +41,9 @@ encodeUnicode enc s =
|
||||
w = bufR cbuf
|
||||
|
||||
decodeUnicode :: TextEncoding -> ByteString -> String
|
||||
decodeUnicode enc (PS fptr l len) =
|
||||
unsafePerformIO $ do
|
||||
decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
|
||||
|
||||
decodeUnicodeIO enc (PS fptr l len) = do
|
||||
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
|
||||
cbuf <- newCharBuffer 128 WriteBuffer
|
||||
case enc of
|
||||
@@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) =
|
||||
(bbuf,cbuf) <- cod bbuf cbuf
|
||||
#endif
|
||||
if isEmptyBuffer cbuf
|
||||
then ioe_invalidCharacter
|
||||
then ioe_invalidCharacter2
|
||||
else unpack cod bbuf cbuf
|
||||
| otherwise = return []
|
||||
where
|
||||
@@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) =
|
||||
i = bufL cbuf
|
||||
w = bufR cbuf
|
||||
|
||||
ioe_invalidCharacter = ioException
|
||||
ioe_invalidCharacter1 = ioException
|
||||
(IOError Nothing InvalidArgument ""
|
||||
("invalid byte sequence for this encoding") Nothing Nothing)
|
||||
|
||||
ioe_invalidCharacter2 = ioException
|
||||
(IOError Nothing InvalidArgument ""
|
||||
("invalid byte sequence for this decoding") Nothing Nothing)
|
||||
|
||||
Reference in New Issue
Block a user