forked from GitHub/gf-core
use the native unicode support from GHC 6.12
This commit is contained in:
@@ -1,91 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1250
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1250 is a code page used under Microsoft Windows to represent texts
|
||||
-- in Central European and Eastern European languages that use Latin script,
|
||||
-- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
|
||||
-- Serbian (Latin script), Romanian and Albanian. It may also be used with
|
||||
-- the German language; German-language texts encoded with cp1250 and cp1252
|
||||
-- are identical.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1250 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1250 = map convert where
|
||||
convert c
|
||||
| c == '\x80' = chr 0x20AC
|
||||
| c == '\x82' = chr 0x201A
|
||||
| c == '\x84' = chr 0x201E
|
||||
| c == '\x85' = chr 0x2026
|
||||
| c == '\x86' = chr 0x2020
|
||||
| c == '\x87' = chr 0x2021
|
||||
| c == '\x89' = chr 0x2030
|
||||
| c == '\x8A' = chr 0x0160
|
||||
| c == '\x8B' = chr 0x2039
|
||||
| c == '\x8C' = chr 0x015A
|
||||
| c == '\x8D' = chr 0x0164
|
||||
| c == '\x8E' = chr 0x017D
|
||||
| c == '\x8F' = chr 0x0179
|
||||
| c == '\x91' = chr 0x2018
|
||||
| c == '\x92' = chr 0x2019
|
||||
| c == '\x93' = chr 0x201C
|
||||
| c == '\x94' = chr 0x201D
|
||||
| c == '\x95' = chr 0x2022
|
||||
| c == '\x96' = chr 0x2013
|
||||
| c == '\x97' = chr 0x2014
|
||||
| c == '\x99' = chr 0x2122
|
||||
| c == '\x9A' = chr 0x0161
|
||||
| c == '\x9B' = chr 0x203A
|
||||
| c == '\x9C' = chr 0x015B
|
||||
| c == '\x9D' = chr 0x0165
|
||||
| c == '\x9E' = chr 0x017E
|
||||
| c == '\x9F' = chr 0x017A
|
||||
| c == '\xA1' = chr 0x02C7
|
||||
| c == '\xA5' = chr 0x0104
|
||||
| c == '\xB9' = chr 0x0105
|
||||
| c == '\xBC' = chr 0x013D
|
||||
| c == '\xBE' = chr 0x013E
|
||||
| otherwise = c
|
||||
|
||||
|
||||
encodeCP1250 = map convert where
|
||||
convert c
|
||||
| oc == 0x20AC = '\x80'
|
||||
| oc == 0x201A = '\x82'
|
||||
| oc == 0x201E = '\x84'
|
||||
| oc == 0x2026 = '\x85'
|
||||
| oc == 0x2020 = '\x86'
|
||||
| oc == 0x2021 = '\x87'
|
||||
| oc == 0x2030 = '\x89'
|
||||
| oc == 0x0160 = '\x8A'
|
||||
| oc == 0x2039 = '\x8B'
|
||||
| oc == 0x015A = '\x8C'
|
||||
| oc == 0x0164 = '\x8D'
|
||||
| oc == 0x017D = '\x8E'
|
||||
| oc == 0x0179 = '\x8F'
|
||||
| oc == 0x2018 = '\x91'
|
||||
| oc == 0x2019 = '\x92'
|
||||
| oc == 0x201C = '\x93'
|
||||
| oc == 0x201D = '\x94'
|
||||
| oc == 0x2022 = '\x95'
|
||||
| oc == 0x2013 = '\x96'
|
||||
| oc == 0x2014 = '\x97'
|
||||
| oc == 0x2122 = '\x99'
|
||||
| oc == 0x0161 = '\x9A'
|
||||
| oc == 0x203A = '\x9B'
|
||||
| oc == 0x015B = '\x9C'
|
||||
| oc == 0x0165 = '\x9D'
|
||||
| oc == 0x017E = '\x9E'
|
||||
| oc == 0x017A = '\x9F'
|
||||
| oc == 0x02C7 = '\xA1'
|
||||
| oc == 0x0104 = '\xA5'
|
||||
| oc == 0x0105 = '\xB9'
|
||||
| oc == 0x013D = '\xBC'
|
||||
| oc == 0x013E = '\xBE'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,86 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1251
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1251 is a popular 8-bit character encoding, designed to cover languages
|
||||
-- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
|
||||
-- and other languages. It is the most widely used for encoding the Bulgarian,
|
||||
-- Serbian and Macedonian languages.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1251 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1251 = map convert where
|
||||
convert c
|
||||
| c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
|
||||
| c == '\xA8' = chr 0x401 -- cyrillic capital letter lo
|
||||
| c == '\x80' = chr 0x402
|
||||
| c == '\x81' = chr 0x403
|
||||
| c == '\xAA' = chr 0x404
|
||||
| c == '\xBD' = chr 0x405
|
||||
| c == '\xB2' = chr 0x406
|
||||
| c == '\xAF' = chr 0x407
|
||||
| c == '\xA3' = chr 0x408
|
||||
| c == '\x8A' = chr 0x409
|
||||
| c == '\x8C' = chr 0x40A
|
||||
| c == '\x8E' = chr 0x40B
|
||||
| c == '\x8D' = chr 0x40C
|
||||
| c == '\xA1' = chr 0x40E
|
||||
| c == '\x8F' = chr 0x40F
|
||||
| c == '\xB8' = chr 0x451 -- cyrillic small letter lo
|
||||
| c == '\x90' = chr 0x452
|
||||
| c == '\x83' = chr 0x453
|
||||
| c == '\xBA' = chr 0x454
|
||||
| c == '\xBE' = chr 0x455
|
||||
| c == '\xB3' = chr 0x456
|
||||
| c == '\xBF' = chr 0x457
|
||||
| c == '\xBC' = chr 0x458
|
||||
| c == '\x9A' = chr 0x459
|
||||
| c == '\x9C' = chr 0x45A
|
||||
| c == '\x9E' = chr 0x45B
|
||||
| c == '\x9D' = chr 0x45C
|
||||
| c == '\xA2' = chr 0x45E
|
||||
| c == '\x9F' = chr 0x45F
|
||||
| c == '\xA5' = chr 0x490
|
||||
| c == '\xB4' = chr 0x491
|
||||
| otherwise = c
|
||||
|
||||
encodeCP1251 = map convert where
|
||||
convert c
|
||||
| oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
|
||||
| oc == 0x401 = '\xA8' -- cyrillic capital letter lo
|
||||
| oc == 0x402 = '\x80'
|
||||
| oc == 0x403 = '\x81'
|
||||
| oc == 0x404 = '\xAA'
|
||||
| oc == 0x405 = '\xBD'
|
||||
| oc == 0x406 = '\xB2'
|
||||
| oc == 0x407 = '\xAF'
|
||||
| oc == 0x408 = '\xA3'
|
||||
| oc == 0x409 = '\x8A'
|
||||
| oc == 0x40A = '\x8C'
|
||||
| oc == 0x40B = '\x8E'
|
||||
| oc == 0x40C = '\x8D'
|
||||
| oc == 0x40E = '\xA1'
|
||||
| oc == 0x40F = '\x8F'
|
||||
| oc == 0x451 = '\xB8' -- cyrillic small letter lo
|
||||
| oc == 0x452 = '\x90'
|
||||
| oc == 0x453 = '\x83'
|
||||
| oc == 0x454 = '\xBA'
|
||||
| oc == 0x455 = '\xBE'
|
||||
| oc == 0x456 = '\xB3'
|
||||
| oc == 0x457 = '\xBF'
|
||||
| oc == 0x458 = '\xBC'
|
||||
| oc == 0x459 = '\x9A'
|
||||
| oc == 0x45A = '\x9C'
|
||||
| oc == 0x45B = '\x9E'
|
||||
| oc == 0x45C = '\x9D'
|
||||
| oc == 0x45E = '\xA2'
|
||||
| oc == 0x45F = '\x9F'
|
||||
| oc == 0x490 = '\xA5'
|
||||
| oc == 0x491 = '\xB4'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,17 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1252
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1252 is a character encoding of the Latin alphabet, used by default in
|
||||
-- the legacy components of Microsoft Windows in English and some other
|
||||
-- Western languages.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1252 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1252 = map id
|
||||
encodeCP1252 = map (\x -> if x <= '\255' then x else '?')
|
||||
@@ -1,84 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1254
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1254 is a code page used under Microsoft Windows to write Turkish.
|
||||
-- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1254 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1254 = map convert where
|
||||
convert c
|
||||
| c == '\x80' = chr 0x20AC
|
||||
| c == '\x82' = chr 0x201A
|
||||
| c == '\x83' = chr 0x192
|
||||
| c == '\x84' = chr 0x201E
|
||||
| c == '\x85' = chr 0x2026
|
||||
| c == '\x86' = chr 0x2020
|
||||
| c == '\x87' = chr 0x2021
|
||||
| c == '\x88' = chr 0x2C6
|
||||
| c == '\x89' = chr 0x2030
|
||||
| c == '\x8A' = chr 0x160
|
||||
| c == '\x8B' = chr 0x2039
|
||||
| c == '\x8C' = chr 0x152
|
||||
| c == '\x91' = chr 0x2018
|
||||
| c == '\x92' = chr 0x2019
|
||||
| c == '\x93' = chr 0x201C
|
||||
| c == '\x94' = chr 0x201D
|
||||
| c == '\x95' = chr 0x2022
|
||||
| c == '\x96' = chr 0x2013
|
||||
| c == '\x97' = chr 0x2014
|
||||
| c == '\x98' = chr 0x2DC
|
||||
| c == '\x99' = chr 0x2122
|
||||
| c == '\x9A' = chr 0x161
|
||||
| c == '\x9B' = chr 0x203A
|
||||
| c == '\x9C' = chr 0x153
|
||||
| c == '\x9F' = chr 0x178
|
||||
| c == '\xD0' = chr 0x11E
|
||||
| c == '\xDD' = chr 0x130
|
||||
| c == '\xDE' = chr 0x15E
|
||||
| c == '\xF0' = chr 0x11F
|
||||
| c == '\xFD' = chr 0x131
|
||||
| c == '\xFE' = chr 0x15F
|
||||
| otherwise = c
|
||||
|
||||
encodeCP1254 = map convert where
|
||||
convert c
|
||||
| oc == 0x20AC = '\x80'
|
||||
| oc == 0x201A = '\x82'
|
||||
| oc == 0x192 = '\x83'
|
||||
| oc == 0x201E = '\x84'
|
||||
| oc == 0x2026 = '\x85'
|
||||
| oc == 0x2020 = '\x86'
|
||||
| oc == 0x2021 = '\x87'
|
||||
| oc == 0x2C6 = '\x88'
|
||||
| oc == 0x2030 = '\x89'
|
||||
| oc == 0x160 = '\x8A'
|
||||
| oc == 0x2039 = '\x8B'
|
||||
| oc == 0x152 = '\x8C'
|
||||
| oc == 0x2018 = '\x91'
|
||||
| oc == 0x2019 = '\x92'
|
||||
| oc == 0x201C = '\x93'
|
||||
| oc == 0x201D = '\x94'
|
||||
| oc == 0x2022 = '\x95'
|
||||
| oc == 0x2013 = '\x96'
|
||||
| oc == 0x2014 = '\x97'
|
||||
| oc == 0x2DC = '\x98'
|
||||
| oc == 0x2122 = '\x99'
|
||||
| oc == 0x161 = '\x9A'
|
||||
| oc == 0x203A = '\x9B'
|
||||
| oc == 0x153 = '\x9C'
|
||||
| oc == 0x178 = '\x9F'
|
||||
| oc == 0x11E = '\xD0'
|
||||
| oc == 0x130 = '\xDD'
|
||||
| oc == 0x15E = '\xDE'
|
||||
| oc == 0x11F = '\xF0'
|
||||
| oc == 0x131 = '\xFD'
|
||||
| oc == 0x15F = '\xFE'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,24 +1,69 @@
|
||||
module GF.Text.Coding where
|
||||
|
||||
import GF.Infra.Option
|
||||
import GF.Text.UTF8
|
||||
import GF.Text.CP1250
|
||||
import GF.Text.CP1251
|
||||
import GF.Text.CP1252
|
||||
import GF.Text.CP1254
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.ByteString.Internal
|
||||
import GHC.IO
|
||||
import GHC.IO.Buffer
|
||||
import GHC.IO.Encoding
|
||||
import GHC.IO.Exception
|
||||
import Control.Monad
|
||||
|
||||
encodeUnicode e = case e of
|
||||
UTF_8 -> encodeUTF8
|
||||
CP_1250 -> encodeCP1250
|
||||
CP_1251 -> encodeCP1251
|
||||
CP_1252 -> encodeCP1252
|
||||
CP_1254 -> encodeCP1254
|
||||
_ -> id
|
||||
encodeUnicode :: TextEncoding -> String -> ByteString
|
||||
encodeUnicode enc s =
|
||||
unsafePerformIO $ do
|
||||
let len = length s
|
||||
cbuf0 <- newCharBuffer (len*4) ReadBuffer
|
||||
foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
|
||||
let cbuf = cbuf0{bufR=len}
|
||||
case enc of
|
||||
TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
|
||||
bss <- translate (encode encoder) cbuf
|
||||
close encoder
|
||||
return (BS.concat bss)
|
||||
where
|
||||
translate cod cbuf
|
||||
| i < w = do bbuf <- newByteBuffer 128 WriteBuffer
|
||||
(cbuf,bbuf) <- cod cbuf bbuf
|
||||
if isEmptyBuffer bbuf
|
||||
then ioe_invalidCharacter
|
||||
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
|
||||
bss <- translate cod cbuf
|
||||
return (bs:bss)
|
||||
| otherwise = return []
|
||||
where
|
||||
i = bufL cbuf
|
||||
w = bufR cbuf
|
||||
|
||||
decodeUnicode e = case e of
|
||||
UTF_8 -> decodeUTF8
|
||||
CP_1250 -> decodeCP1250
|
||||
CP_1251 -> decodeCP1251
|
||||
CP_1252 -> decodeCP1252
|
||||
CP_1254 -> decodeCP1254
|
||||
_ -> id
|
||||
decodeUnicode :: TextEncoding -> ByteString -> String
|
||||
decodeUnicode enc (PS fptr l len) =
|
||||
unsafePerformIO $ do
|
||||
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
|
||||
cbuf <- newCharBuffer 128 WriteBuffer
|
||||
case enc of
|
||||
TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
|
||||
s <- translate (encode decoder) bbuf cbuf
|
||||
close decoder
|
||||
return s
|
||||
where
|
||||
translate cod bbuf cbuf
|
||||
| i < w = do (bbuf,cbuf) <- cod bbuf cbuf
|
||||
if isEmptyBuffer cbuf
|
||||
then ioe_invalidCharacter
|
||||
else unpack cod bbuf cbuf
|
||||
| otherwise = return []
|
||||
where
|
||||
i = bufL bbuf
|
||||
w = bufR bbuf
|
||||
|
||||
unpack cod bbuf cbuf
|
||||
| i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i
|
||||
cs <- unpack cod bbuf cbuf{bufL=i'}
|
||||
return (c:cs)
|
||||
| otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
|
||||
where
|
||||
i = bufL cbuf
|
||||
w = bufR cbuf
|
||||
|
||||
ioe_invalidCharacter = ioException
|
||||
(IOError Nothing InvalidArgument ""
|
||||
("invalid byte sequence for this encoding") Nothing Nothing)
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
module GF.Text.Lexing (stringOp,opInEnv) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import GF.Text.UTF8
|
||||
import GF.Text.CP1251
|
||||
|
||||
import Data.Char
|
||||
import Data.List (intersperse)
|
||||
@@ -23,10 +21,6 @@ stringOp name = case name of
|
||||
"unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
|
||||
"unwords" -> Just $ appUnlexer unwords
|
||||
"to_html" -> Just wrapHTML
|
||||
"to_utf8" -> Just encodeUTF8
|
||||
"from_utf8" -> Just decodeUTF8
|
||||
"to_cp1251" -> Just encodeCP1251
|
||||
"from_cp1251" -> Just decodeCP1251
|
||||
_ -> transliterate name
|
||||
|
||||
-- perform op in environments beg--end, t.ex. between "--"
|
||||
|
||||
@@ -5,8 +5,6 @@ module GF.Text.Transliterations (
|
||||
transliterationPrintNames
|
||||
) where
|
||||
|
||||
import GF.Text.UTF8
|
||||
|
||||
import Data.Char
|
||||
import Numeric
|
||||
import qualified Data.Map as Map
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : UTF8
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:42 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- From the Char module supplied with HBC.
|
||||
-- code by Thomas Hallgren (Jul 10 1999)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
|
||||
|
||||
-- | Take a Unicode string and encode it as a string
|
||||
-- with the UTF8 method.
|
||||
decodeUTF8 :: String -> String
|
||||
decodeUTF8 "" = ""
|
||||
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
|
||||
'\x80' <= c' && c' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
|
||||
'\x80' <= c' && c' <= '\xbf' &&
|
||||
'\x80' <= c'' && c'' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 s = s ---- AR workaround 22/6/2006
|
||||
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
|
||||
|
||||
encodeUTF8 :: String -> String
|
||||
encodeUTF8 "" = ""
|
||||
encodeUTF8 (c:cs) =
|
||||
if c > '\x0000' && c < '\x0080' then
|
||||
c : encodeUTF8 cs
|
||||
else if c < toEnum 0x0800 then
|
||||
let i = fromEnum c
|
||||
in toEnum (0xc0 + i `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
else
|
||||
let i = fromEnum c
|
||||
in toEnum (0xe0 + i `div` 0x1000) :
|
||||
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
Reference in New Issue
Block a user