use the native unicode support from GHC 6.12

2010-04-19 09:38:36 +00:00
parent 4c757f4683
commit 0b6b30d4a8
23 changed files with 177 additions and 490 deletions
--- a/src/compiler/GF/Text/CP1250.hs
+++ b/src/compiler/GF/Text/CP1250.hs
@@ -1,91 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module      : GF.Text.CP1250
-- Maintainer  : Krasimir Angelov
--
-- cp1250 is a code page used under Microsoft Windows to represent texts 
-- in Central European and Eastern European languages that use Latin script, 
-- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, 
-- Serbian (Latin script), Romanian and Albanian. It may also be used with 
-- the German language; German-language texts encoded with cp1250 and cp1252 
-- are identical.
--
-----------------------------------------------------------------------------
-
-module GF.Text.CP1250 where
-
-import Data.Char
-
-decodeCP1250 = map convert where
-  convert c
-   | c == '\x80'                = chr 0x20AC
-   | c == '\x82'                = chr 0x201A
-   | c == '\x84'                = chr 0x201E
-   | c == '\x85'                = chr 0x2026
-   | c == '\x86'                = chr 0x2020
-   | c == '\x87'                = chr 0x2021
-   | c == '\x89'                = chr 0x2030
-   | c == '\x8A'                = chr 0x0160
-   | c == '\x8B'                = chr 0x2039
-   | c == '\x8C'                = chr 0x015A
-   | c == '\x8D'                = chr 0x0164
-   | c == '\x8E'                = chr 0x017D
-   | c == '\x8F'                = chr 0x0179
-   | c == '\x91'                = chr 0x2018
-   | c == '\x92'                = chr 0x2019
-   | c == '\x93'                = chr 0x201C
-   | c == '\x94'                = chr 0x201D
-   | c == '\x95'                = chr 0x2022
-   | c == '\x96'                = chr 0x2013
-   | c == '\x97'                = chr 0x2014
-   | c == '\x99'                = chr 0x2122
-   | c == '\x9A'                = chr 0x0161
-   | c == '\x9B'                = chr 0x203A
-   | c == '\x9C'                = chr 0x015B
-   | c == '\x9D'                = chr 0x0165
-   | c == '\x9E'                = chr 0x017E
-   | c == '\x9F'                = chr 0x017A
-   | c == '\xA1'                = chr 0x02C7
-   | c == '\xA5'                = chr 0x0104
-   | c == '\xB9'                = chr 0x0105
-   | c == '\xBC'                = chr 0x013D
-   | c == '\xBE'                = chr 0x013E
-   | otherwise                  = c
-
-
-encodeCP1250 = map convert where
-  convert c
-   | oc == 0x20AC               = '\x80'
-   | oc == 0x201A               = '\x82'
-   | oc == 0x201E               = '\x84'
-   | oc == 0x2026               = '\x85'
-   | oc == 0x2020               = '\x86'
-   | oc == 0x2021               = '\x87'
-   | oc == 0x2030               = '\x89'
-   | oc == 0x0160               = '\x8A'
-   | oc == 0x2039               = '\x8B'
-   | oc == 0x015A               = '\x8C'
-   | oc == 0x0164               = '\x8D'
-   | oc == 0x017D               = '\x8E'
-   | oc == 0x0179               = '\x8F'
-   | oc == 0x2018               = '\x91'
-   | oc == 0x2019               = '\x92'
-   | oc == 0x201C               = '\x93'
-   | oc == 0x201D               = '\x94'
-   | oc == 0x2022               = '\x95'
-   | oc == 0x2013               = '\x96'
-   | oc == 0x2014               = '\x97'
-   | oc == 0x2122               = '\x99'
-   | oc == 0x0161               = '\x9A'
-   | oc == 0x203A               = '\x9B'
-   | oc == 0x015B               = '\x9C'
-   | oc == 0x0165               = '\x9D'
-   | oc == 0x017E               = '\x9E'
-   | oc == 0x017A               = '\x9F'
-   | oc == 0x02C7               = '\xA1'
-   | oc == 0x0104               = '\xA5'
-   | oc == 0x0105               = '\xB9'
-   | oc == 0x013D               = '\xBC'
-   | oc == 0x013E               = '\xBE'
-   | otherwise                  = c
-   where oc = ord c
--- a/src/compiler/GF/Text/CP1251.hs
+++ b/src/compiler/GF/Text/CP1251.hs
@@ -1,86 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module      : GF.Text.CP1251
-- Maintainer  : Krasimir Angelov
--
-- cp1251 is a popular 8-bit character encoding, designed to cover languages
-- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
-- and other languages. It is the most widely used for encoding the Bulgarian,
-- Serbian and Macedonian languages.
--
-----------------------------------------------------------------------------
-
-module GF.Text.CP1251 where
-
-import Data.Char
-
-decodeCP1251 = map convert where
-  convert c
-   | c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
-   | c == '\xA8'                = chr 0x401      -- cyrillic capital letter lo
-   | c == '\x80'                = chr 0x402
-   | c == '\x81'                = chr 0x403
-   | c == '\xAA'                = chr 0x404
-   | c == '\xBD'                = chr 0x405
-   | c == '\xB2'                = chr 0x406
-   | c == '\xAF'                = chr 0x407
-   | c == '\xA3'                = chr 0x408
-   | c == '\x8A'                = chr 0x409
-   | c == '\x8C'                = chr 0x40A
-   | c == '\x8E'                = chr 0x40B
-   | c == '\x8D'                = chr 0x40C
-   | c == '\xA1'                = chr 0x40E
-   | c == '\x8F'                = chr 0x40F
-   | c == '\xB8'                = chr 0x451      -- cyrillic small   letter lo
-   | c == '\x90'                = chr 0x452
-   | c == '\x83'                = chr 0x453
-   | c == '\xBA'                = chr 0x454
-   | c == '\xBE'                = chr 0x455
-   | c == '\xB3'                = chr 0x456
-   | c == '\xBF'                = chr 0x457
-   | c == '\xBC'                = chr 0x458
-   | c == '\x9A'                = chr 0x459
-   | c == '\x9C'                = chr 0x45A
-   | c == '\x9E'                = chr 0x45B
-   | c == '\x9D'                = chr 0x45C
-   | c == '\xA2'                = chr 0x45E
-   | c == '\x9F'                = chr 0x45F
-   | c == '\xA5'                = chr 0x490
-   | c == '\xB4'                = chr 0x491
-   | otherwise                  = c
-
-encodeCP1251 = map convert where
-  convert c
-   | oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
-   | oc == 0x401                = '\xA8'         -- cyrillic capital letter lo
-   | oc == 0x402                = '\x80'
-   | oc == 0x403                = '\x81'
-   | oc == 0x404                = '\xAA'
-   | oc == 0x405                = '\xBD'
-   | oc == 0x406                = '\xB2'
-   | oc == 0x407                = '\xAF'
-   | oc == 0x408                = '\xA3'
-   | oc == 0x409                = '\x8A'
-   | oc == 0x40A                = '\x8C'
-   | oc == 0x40B                = '\x8E'
-   | oc == 0x40C                = '\x8D'
-   | oc == 0x40E                = '\xA1'
-   | oc == 0x40F                = '\x8F'
-   | oc == 0x451                = '\xB8'         -- cyrillic small   letter lo
-   | oc == 0x452                = '\x90'
-   | oc == 0x453                = '\x83'
-   | oc == 0x454                = '\xBA'
-   | oc == 0x455                = '\xBE'
-   | oc == 0x456                = '\xB3'
-   | oc == 0x457                = '\xBF'
-   | oc == 0x458                = '\xBC'
-   | oc == 0x459                = '\x9A'
-   | oc == 0x45A                = '\x9C'
-   | oc == 0x45B                = '\x9E'
-   | oc == 0x45C                = '\x9D'
-   | oc == 0x45E                = '\xA2'
-   | oc == 0x45F                = '\x9F'
-   | oc == 0x490                = '\xA5'
-   | oc == 0x491                = '\xB4'
-   | otherwise                  = c
-   where oc = ord c
--- a/src/compiler/GF/Text/CP1252.hs
+++ b/src/compiler/GF/Text/CP1252.hs
@@ -1,17 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module      : GF.Text.CP1252
-- Maintainer  : Krasimir Angelov
--
-- cp1252 is a character encoding of the Latin alphabet, used by default in 
-- the legacy components of Microsoft Windows in English and some other 
-- Western languages.
--
-----------------------------------------------------------------------------
-
-module GF.Text.CP1252 where
-
-import Data.Char
-
-decodeCP1252 = map id
-encodeCP1252 = map (\x -> if x <= '\255' then x else '?')
--- a/src/compiler/GF/Text/CP1254.hs
+++ b/src/compiler/GF/Text/CP1254.hs
@@ -1,84 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module      : GF.Text.CP1254
-- Maintainer  : Krasimir Angelov
--
-- cp1254 is a code page used under Microsoft Windows to write Turkish.
-- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
--
-----------------------------------------------------------------------------
-
-module GF.Text.CP1254 where
-
-import Data.Char
-
-decodeCP1254 = map convert where
-  convert c
-   | c == '\x80'                = chr 0x20AC
-   | c == '\x82'                = chr 0x201A
-   | c == '\x83'                = chr 0x192
-   | c == '\x84'                = chr 0x201E
-   | c == '\x85'                = chr 0x2026
-   | c == '\x86'                = chr 0x2020
-   | c == '\x87'                = chr 0x2021
-   | c == '\x88'                = chr 0x2C6
-   | c == '\x89'                = chr 0x2030
-   | c == '\x8A'                = chr 0x160
-   | c == '\x8B'                = chr 0x2039
-   | c == '\x8C'                = chr 0x152
-   | c == '\x91'                = chr 0x2018
-   | c == '\x92'                = chr 0x2019
-   | c == '\x93'                = chr 0x201C
-   | c == '\x94'                = chr 0x201D
-   | c == '\x95'                = chr 0x2022
-   | c == '\x96'                = chr 0x2013
-   | c == '\x97'                = chr 0x2014
-   | c == '\x98'                = chr 0x2DC
-   | c == '\x99'                = chr 0x2122
-   | c == '\x9A'                = chr 0x161
-   | c == '\x9B'                = chr 0x203A
-   | c == '\x9C'                = chr 0x153
-   | c == '\x9F'                = chr 0x178
-   | c == '\xD0'                = chr 0x11E
-   | c == '\xDD'                = chr 0x130
-   | c == '\xDE'                = chr 0x15E
-   | c == '\xF0'                = chr 0x11F
-   | c == '\xFD'                = chr 0x131
-   | c == '\xFE'                = chr 0x15F
-   | otherwise                  = c
-
-encodeCP1254 = map convert where
-  convert c
-   | oc == 0x20AC               = '\x80'
-   | oc == 0x201A               = '\x82'
-   | oc == 0x192                = '\x83'
-   | oc == 0x201E               = '\x84'
-   | oc == 0x2026               = '\x85'
-   | oc == 0x2020               = '\x86'
-   | oc == 0x2021               = '\x87'
-   | oc == 0x2C6                = '\x88'
-   | oc == 0x2030               = '\x89'
-   | oc == 0x160                = '\x8A'
-   | oc == 0x2039               = '\x8B'
-   | oc == 0x152                = '\x8C'
-   | oc == 0x2018               = '\x91'
-   | oc == 0x2019               = '\x92'
-   | oc == 0x201C               = '\x93'
-   | oc == 0x201D               = '\x94'
-   | oc == 0x2022               = '\x95'
-   | oc == 0x2013               = '\x96'
-   | oc == 0x2014               = '\x97'
-   | oc == 0x2DC                = '\x98'
-   | oc == 0x2122               = '\x99'
-   | oc == 0x161                = '\x9A'
-   | oc == 0x203A               = '\x9B'
-   | oc == 0x153                = '\x9C'
-   | oc == 0x178                = '\x9F'
-   | oc == 0x11E                = '\xD0'
-   | oc == 0x130                = '\xDD'
-   | oc == 0x15E                = '\xDE'
-   | oc == 0x11F                = '\xF0'
-   | oc == 0x131                = '\xFD'
-   | oc == 0x15F                = '\xFE'
-   | otherwise                  = c
-   where oc = ord c
--- a/src/compiler/GF/Text/Coding.hs
+++ b/src/compiler/GF/Text/Coding.hs
@@ -1,24 +1,69 @@
 module GF.Text.Coding where

-import GF.Infra.Option
-import GF.Text.UTF8
-import GF.Text.CP1250
-import GF.Text.CP1251
-import GF.Text.CP1252
-import GF.Text.CP1254
+import qualified Data.ByteString as BS
+import Data.ByteString.Internal
+import GHC.IO
+import GHC.IO.Buffer
+import GHC.IO.Encoding
+import GHC.IO.Exception
+import Control.Monad

-encodeUnicode e = case e of
-  UTF_8   -> encodeUTF8
-  CP_1250 -> encodeCP1250
-  CP_1251 -> encodeCP1251
-  CP_1252 -> encodeCP1252
-  CP_1254 -> encodeCP1254
-  _       -> id
+encodeUnicode :: TextEncoding -> String -> ByteString
+encodeUnicode enc s =
+  unsafePerformIO $ do
+    let len = length s
+    cbuf0 <- newCharBuffer (len*4) ReadBuffer
+    foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
+    let cbuf = cbuf0{bufR=len}
+    case enc of
+      TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
+                                            bss <- translate (encode encoder) cbuf
+                                            close encoder
+                                            return (BS.concat bss)
+  where
+    translate cod cbuf
+      | i < w     = do bbuf <- newByteBuffer 128 WriteBuffer
+                       (cbuf,bbuf) <- cod cbuf bbuf
+                       if isEmptyBuffer bbuf
+                         then ioe_invalidCharacter
+                         else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
+                                 bss <- translate cod cbuf
+                                 return (bs:bss)
+      | otherwise = return []
+      where
+        i = bufL cbuf
+        w = bufR cbuf

-decodeUnicode e = case e of
-  UTF_8   -> decodeUTF8
-  CP_1250 -> decodeCP1250
-  CP_1251 -> decodeCP1251
-  CP_1252 -> decodeCP1252
-  CP_1254 -> decodeCP1254
-  _       -> id
+decodeUnicode :: TextEncoding -> ByteString -> String
+decodeUnicode enc (PS fptr l len) =
+  unsafePerformIO $ do
+    let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
+    cbuf <- newCharBuffer 128 WriteBuffer
+    case enc of
+      TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
+                                            s <- translate (encode decoder) bbuf cbuf
+                                            close decoder
+                                            return s
+  where
+    translate cod bbuf cbuf
+      | i < w     = do (bbuf,cbuf) <- cod bbuf cbuf
+                       if isEmptyBuffer cbuf
+                         then ioe_invalidCharacter
+                         else unpack cod bbuf cbuf
+      | otherwise = return []
+      where
+        i = bufL bbuf
+        w = bufR bbuf
+                                            
+    unpack cod bbuf cbuf
+      | i < w     = do (c,i') <- readCharBuf (bufRaw cbuf) i
+                       cs <- unpack cod bbuf cbuf{bufL=i'}
+                       return (c:cs)
+      | otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
+      where
+        i = bufL cbuf
+        w = bufR cbuf
+
+ioe_invalidCharacter = ioException
+   (IOError Nothing InvalidArgument ""
+        ("invalid byte sequence for this encoding") Nothing Nothing)
--- a/src/compiler/GF/Text/Lexing.hs
+++ b/src/compiler/GF/Text/Lexing.hs
@@ -1,8 +1,6 @@
 module GF.Text.Lexing (stringOp,opInEnv) where

 import GF.Text.Transliterations
-import GF.Text.UTF8
-import GF.Text.CP1251

 import Data.Char
 import Data.List (intersperse)
@@ -23,10 +21,6 @@ stringOp name = case name of
  "unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
-  "to_utf8"    -> Just encodeUTF8
-  "from_utf8"  -> Just decodeUTF8
-  "to_cp1251"  -> Just encodeCP1251
-  "from_cp1251" -> Just decodeCP1251
  _ -> transliterate name

 -- perform op in environments beg--end, t.ex. between "--"
--- a/src/compiler/GF/Text/Transliterations.hs
+++ b/src/compiler/GF/Text/Transliterations.hs
@@ -5,8 +5,6 @@ module GF.Text.Transliterations (
  transliterationPrintNames
  ) where

-import GF.Text.UTF8
-
 import Data.Char
 import Numeric
 import qualified Data.Map as Map
--- a/src/compiler/GF/Text/UTF8.hs
+++ b/src/compiler/GF/Text/UTF8.hs
@@ -1,48 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module      : UTF8
-- Maintainer  : AR
-- Stability   : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $ 
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- From the Char module supplied with HBC.
-- code by Thomas Hallgren (Jul 10 1999)
-----------------------------------------------------------------------------
-
-module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
-
-- | Take a Unicode string and encode it as a string
-- with the UTF8 method.
-decodeUTF8 :: String -> String
-decodeUTF8 "" = ""
-decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
-decodeUTF8 (c:c':cs) | '\xc0' <= c  && c  <= '\xdf' && 
-		      '\x80' <= c' && c' <= '\xbf' =
-	toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
-decodeUTF8 (c:c':c'':cs) | '\xe0' <= c   && c   <= '\xef' && 
-		          '\x80' <= c'  && c'  <= '\xbf' &&
-		          '\x80' <= c'' && c'' <= '\xbf' =
-	toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
-decodeUTF8 s = s ---- AR workaround 22/6/2006
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
-
-encodeUTF8 :: String -> String
-encodeUTF8 "" = ""
-encodeUTF8 (c:cs) =
-	if c > '\x0000' && c < '\x0080' then
-	    c : encodeUTF8 cs
-	else if c < toEnum 0x0800 then
-	    let i = fromEnum c
-	    in  toEnum (0xc0 + i `div` 0x40) : 
-	        toEnum (0x80 + i `mod` 0x40) : 
-		encodeUTF8 cs
-	else
-	    let i = fromEnum c
-	    in  toEnum (0xe0 + i `div` 0x1000) : 
-	        toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : 
-		toEnum (0x80 + i `mod` 0x40) : 
-		encodeUTF8 cs