1
0
forked from GitHub/gf-core

use the native unicode support from GHC 6.12

This commit is contained in:
krasimir
2010-04-19 09:38:36 +00:00
parent 4c757f4683
commit 0b6b30d4a8
23 changed files with 177 additions and 490 deletions

View File

@@ -1,91 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1250
-- Maintainer : Krasimir Angelov
--
-- cp1250 is a code page used under Microsoft Windows to represent texts
-- in Central European and Eastern European languages that use Latin script,
-- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
-- Serbian (Latin script), Romanian and Albanian. It may also be used with
-- the German language; German-language texts encoded with cp1250 and cp1252
-- are identical.
--
-----------------------------------------------------------------------------
module GF.Text.CP1250 where
import Data.Char
decodeCP1250 = map convert where
convert c
| c == '\x80' = chr 0x20AC
| c == '\x82' = chr 0x201A
| c == '\x84' = chr 0x201E
| c == '\x85' = chr 0x2026
| c == '\x86' = chr 0x2020
| c == '\x87' = chr 0x2021
| c == '\x89' = chr 0x2030
| c == '\x8A' = chr 0x0160
| c == '\x8B' = chr 0x2039
| c == '\x8C' = chr 0x015A
| c == '\x8D' = chr 0x0164
| c == '\x8E' = chr 0x017D
| c == '\x8F' = chr 0x0179
| c == '\x91' = chr 0x2018
| c == '\x92' = chr 0x2019
| c == '\x93' = chr 0x201C
| c == '\x94' = chr 0x201D
| c == '\x95' = chr 0x2022
| c == '\x96' = chr 0x2013
| c == '\x97' = chr 0x2014
| c == '\x99' = chr 0x2122
| c == '\x9A' = chr 0x0161
| c == '\x9B' = chr 0x203A
| c == '\x9C' = chr 0x015B
| c == '\x9D' = chr 0x0165
| c == '\x9E' = chr 0x017E
| c == '\x9F' = chr 0x017A
| c == '\xA1' = chr 0x02C7
| c == '\xA5' = chr 0x0104
| c == '\xB9' = chr 0x0105
| c == '\xBC' = chr 0x013D
| c == '\xBE' = chr 0x013E
| otherwise = c
encodeCP1250 = map convert where
convert c
| oc == 0x20AC = '\x80'
| oc == 0x201A = '\x82'
| oc == 0x201E = '\x84'
| oc == 0x2026 = '\x85'
| oc == 0x2020 = '\x86'
| oc == 0x2021 = '\x87'
| oc == 0x2030 = '\x89'
| oc == 0x0160 = '\x8A'
| oc == 0x2039 = '\x8B'
| oc == 0x015A = '\x8C'
| oc == 0x0164 = '\x8D'
| oc == 0x017D = '\x8E'
| oc == 0x0179 = '\x8F'
| oc == 0x2018 = '\x91'
| oc == 0x2019 = '\x92'
| oc == 0x201C = '\x93'
| oc == 0x201D = '\x94'
| oc == 0x2022 = '\x95'
| oc == 0x2013 = '\x96'
| oc == 0x2014 = '\x97'
| oc == 0x2122 = '\x99'
| oc == 0x0161 = '\x9A'
| oc == 0x203A = '\x9B'
| oc == 0x015B = '\x9C'
| oc == 0x0165 = '\x9D'
| oc == 0x017E = '\x9E'
| oc == 0x017A = '\x9F'
| oc == 0x02C7 = '\xA1'
| oc == 0x0104 = '\xA5'
| oc == 0x0105 = '\xB9'
| oc == 0x013D = '\xBC'
| oc == 0x013E = '\xBE'
| otherwise = c
where oc = ord c

View File

@@ -1,86 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1251
-- Maintainer : Krasimir Angelov
--
-- cp1251 is a popular 8-bit character encoding, designed to cover languages
-- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
-- and other languages. It is the most widely used for encoding the Bulgarian,
-- Serbian and Macedonian languages.
--
-----------------------------------------------------------------------------
module GF.Text.CP1251 where
import Data.Char
decodeCP1251 = map convert where
convert c
| c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
| c == '\xA8' = chr 0x401 -- cyrillic capital letter lo
| c == '\x80' = chr 0x402
| c == '\x81' = chr 0x403
| c == '\xAA' = chr 0x404
| c == '\xBD' = chr 0x405
| c == '\xB2' = chr 0x406
| c == '\xAF' = chr 0x407
| c == '\xA3' = chr 0x408
| c == '\x8A' = chr 0x409
| c == '\x8C' = chr 0x40A
| c == '\x8E' = chr 0x40B
| c == '\x8D' = chr 0x40C
| c == '\xA1' = chr 0x40E
| c == '\x8F' = chr 0x40F
| c == '\xB8' = chr 0x451 -- cyrillic small letter lo
| c == '\x90' = chr 0x452
| c == '\x83' = chr 0x453
| c == '\xBA' = chr 0x454
| c == '\xBE' = chr 0x455
| c == '\xB3' = chr 0x456
| c == '\xBF' = chr 0x457
| c == '\xBC' = chr 0x458
| c == '\x9A' = chr 0x459
| c == '\x9C' = chr 0x45A
| c == '\x9E' = chr 0x45B
| c == '\x9D' = chr 0x45C
| c == '\xA2' = chr 0x45E
| c == '\x9F' = chr 0x45F
| c == '\xA5' = chr 0x490
| c == '\xB4' = chr 0x491
| otherwise = c
encodeCP1251 = map convert where
convert c
| oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
| oc == 0x401 = '\xA8' -- cyrillic capital letter lo
| oc == 0x402 = '\x80'
| oc == 0x403 = '\x81'
| oc == 0x404 = '\xAA'
| oc == 0x405 = '\xBD'
| oc == 0x406 = '\xB2'
| oc == 0x407 = '\xAF'
| oc == 0x408 = '\xA3'
| oc == 0x409 = '\x8A'
| oc == 0x40A = '\x8C'
| oc == 0x40B = '\x8E'
| oc == 0x40C = '\x8D'
| oc == 0x40E = '\xA1'
| oc == 0x40F = '\x8F'
| oc == 0x451 = '\xB8' -- cyrillic small letter lo
| oc == 0x452 = '\x90'
| oc == 0x453 = '\x83'
| oc == 0x454 = '\xBA'
| oc == 0x455 = '\xBE'
| oc == 0x456 = '\xB3'
| oc == 0x457 = '\xBF'
| oc == 0x458 = '\xBC'
| oc == 0x459 = '\x9A'
| oc == 0x45A = '\x9C'
| oc == 0x45B = '\x9E'
| oc == 0x45C = '\x9D'
| oc == 0x45E = '\xA2'
| oc == 0x45F = '\x9F'
| oc == 0x490 = '\xA5'
| oc == 0x491 = '\xB4'
| otherwise = c
where oc = ord c

View File

@@ -1,17 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1252
-- Maintainer : Krasimir Angelov
--
-- cp1252 is a character encoding of the Latin alphabet, used by default in
-- the legacy components of Microsoft Windows in English and some other
-- Western languages.
--
-----------------------------------------------------------------------------
module GF.Text.CP1252 where
import Data.Char
decodeCP1252 = map id
encodeCP1252 = map (\x -> if x <= '\255' then x else '?')

View File

@@ -1,84 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1254
-- Maintainer : Krasimir Angelov
--
-- cp1254 is a code page used under Microsoft Windows to write Turkish.
-- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
--
-----------------------------------------------------------------------------
module GF.Text.CP1254 where
import Data.Char
decodeCP1254 = map convert where
convert c
| c == '\x80' = chr 0x20AC
| c == '\x82' = chr 0x201A
| c == '\x83' = chr 0x192
| c == '\x84' = chr 0x201E
| c == '\x85' = chr 0x2026
| c == '\x86' = chr 0x2020
| c == '\x87' = chr 0x2021
| c == '\x88' = chr 0x2C6
| c == '\x89' = chr 0x2030
| c == '\x8A' = chr 0x160
| c == '\x8B' = chr 0x2039
| c == '\x8C' = chr 0x152
| c == '\x91' = chr 0x2018
| c == '\x92' = chr 0x2019
| c == '\x93' = chr 0x201C
| c == '\x94' = chr 0x201D
| c == '\x95' = chr 0x2022
| c == '\x96' = chr 0x2013
| c == '\x97' = chr 0x2014
| c == '\x98' = chr 0x2DC
| c == '\x99' = chr 0x2122
| c == '\x9A' = chr 0x161
| c == '\x9B' = chr 0x203A
| c == '\x9C' = chr 0x153
| c == '\x9F' = chr 0x178
| c == '\xD0' = chr 0x11E
| c == '\xDD' = chr 0x130
| c == '\xDE' = chr 0x15E
| c == '\xF0' = chr 0x11F
| c == '\xFD' = chr 0x131
| c == '\xFE' = chr 0x15F
| otherwise = c
encodeCP1254 = map convert where
convert c
| oc == 0x20AC = '\x80'
| oc == 0x201A = '\x82'
| oc == 0x192 = '\x83'
| oc == 0x201E = '\x84'
| oc == 0x2026 = '\x85'
| oc == 0x2020 = '\x86'
| oc == 0x2021 = '\x87'
| oc == 0x2C6 = '\x88'
| oc == 0x2030 = '\x89'
| oc == 0x160 = '\x8A'
| oc == 0x2039 = '\x8B'
| oc == 0x152 = '\x8C'
| oc == 0x2018 = '\x91'
| oc == 0x2019 = '\x92'
| oc == 0x201C = '\x93'
| oc == 0x201D = '\x94'
| oc == 0x2022 = '\x95'
| oc == 0x2013 = '\x96'
| oc == 0x2014 = '\x97'
| oc == 0x2DC = '\x98'
| oc == 0x2122 = '\x99'
| oc == 0x161 = '\x9A'
| oc == 0x203A = '\x9B'
| oc == 0x153 = '\x9C'
| oc == 0x178 = '\x9F'
| oc == 0x11E = '\xD0'
| oc == 0x130 = '\xDD'
| oc == 0x15E = '\xDE'
| oc == 0x11F = '\xF0'
| oc == 0x131 = '\xFD'
| oc == 0x15F = '\xFE'
| otherwise = c
where oc = ord c

View File

@@ -1,24 +1,69 @@
module GF.Text.Coding where
import GF.Infra.Option
import GF.Text.UTF8
import GF.Text.CP1250
import GF.Text.CP1251
import GF.Text.CP1252
import GF.Text.CP1254
import qualified Data.ByteString as BS
import Data.ByteString.Internal
import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Encoding
import GHC.IO.Exception
import Control.Monad
encodeUnicode e = case e of
UTF_8 -> encodeUTF8
CP_1250 -> encodeCP1250
CP_1251 -> encodeCP1251
CP_1252 -> encodeCP1252
CP_1254 -> encodeCP1254
_ -> id
encodeUnicode :: TextEncoding -> String -> ByteString
encodeUnicode enc s =
unsafePerformIO $ do
let len = length s
cbuf0 <- newCharBuffer (len*4) ReadBuffer
foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
let cbuf = cbuf0{bufR=len}
case enc of
TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
bss <- translate (encode encoder) cbuf
close encoder
return (BS.concat bss)
where
translate cod cbuf
| i < w = do bbuf <- newByteBuffer 128 WriteBuffer
(cbuf,bbuf) <- cod cbuf bbuf
if isEmptyBuffer bbuf
then ioe_invalidCharacter
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
bss <- translate cod cbuf
return (bs:bss)
| otherwise = return []
where
i = bufL cbuf
w = bufR cbuf
decodeUnicode e = case e of
UTF_8 -> decodeUTF8
CP_1250 -> decodeCP1250
CP_1251 -> decodeCP1251
CP_1252 -> decodeCP1252
CP_1254 -> decodeCP1254
_ -> id
decodeUnicode :: TextEncoding -> ByteString -> String
decodeUnicode enc (PS fptr l len) =
unsafePerformIO $ do
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
cbuf <- newCharBuffer 128 WriteBuffer
case enc of
TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
s <- translate (encode decoder) bbuf cbuf
close decoder
return s
where
translate cod bbuf cbuf
| i < w = do (bbuf,cbuf) <- cod bbuf cbuf
if isEmptyBuffer cbuf
then ioe_invalidCharacter
else unpack cod bbuf cbuf
| otherwise = return []
where
i = bufL bbuf
w = bufR bbuf
unpack cod bbuf cbuf
| i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i
cs <- unpack cod bbuf cbuf{bufL=i'}
return (c:cs)
| otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
where
i = bufL cbuf
w = bufR cbuf
ioe_invalidCharacter = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this encoding") Nothing Nothing)

View File

@@ -1,8 +1,6 @@
module GF.Text.Lexing (stringOp,opInEnv) where
import GF.Text.Transliterations
import GF.Text.UTF8
import GF.Text.CP1251
import Data.Char
import Data.List (intersperse)
@@ -23,10 +21,6 @@ stringOp name = case name of
"unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
"to_utf8" -> Just encodeUTF8
"from_utf8" -> Just decodeUTF8
"to_cp1251" -> Just encodeCP1251
"from_cp1251" -> Just decodeCP1251
_ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--"

View File

@@ -5,8 +5,6 @@ module GF.Text.Transliterations (
transliterationPrintNames
) where
import GF.Text.UTF8
import Data.Char
import Numeric
import qualified Data.Map as Map

View File

@@ -1,48 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : UTF8
-- Maintainer : AR
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- From the Char module supplied with HBC.
-- code by Thomas Hallgren (Jul 10 1999)
-----------------------------------------------------------------------------
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
-- | Take a Unicode string and encode it as a string
-- with the UTF8 method.
decodeUTF8 :: String -> String
decodeUTF8 "" = ""
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
'\x80' <= c' && c' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
'\x80' <= c' && c' <= '\xbf' &&
'\x80' <= c'' && c'' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 s = s ---- AR workaround 22/6/2006
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
encodeUTF8 :: String -> String
encodeUTF8 "" = ""
encodeUTF8 (c:cs) =
if c > '\x0000' && c < '\x0080' then
c : encodeUTF8 cs
else if c < toEnum 0x0800 then
let i = fromEnum c
in toEnum (0xc0 + i `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs
else
let i = fromEnum c
in toEnum (0xe0 + i `div` 0x1000) :
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs