use the native unicode support from GHC 6.12

This commit is contained in:
krasimir
2010-04-19 09:38:36 +00:00
parent 4c757f4683
commit 0b6b30d4a8
23 changed files with 177 additions and 490 deletions

View File

@@ -89,7 +89,6 @@ executable gf
GF.JavaScript.AbsJS
GF.JavaScript.PrintJS
GF.Infra.CompactPrint
GF.Text.UTF8
GF.Data.TrieMap
GF.Data.Utilities
GF.Data.SortedList

View File

@@ -24,6 +24,10 @@ main = do
codepage <- getACP
setConsoleCP codepage
setConsoleOutputCP codepage
enc <- mkTextEncoding ("CP"++show codepage)
hSetEncoding stdin enc
hSetEncoding stdout enc
hSetEncoding stderr enc
#endif
args <- getArgs
case parseOptions args of

View File

@@ -24,7 +24,7 @@ import PGF.Probabilistic -- (getProbsFromFile,prProbabilities,defaultProbabiliti
import PGF.Generate (generateRandomFrom) ----
import GF.Compile.Export
import GF.Compile.ExampleBased
import GF.Infra.Option (noOptions, readOutputFormat, Encoding(..))
import GF.Infra.Option (noOptions, readOutputFormat)
import GF.Infra.UseIO
import GF.Data.ErrM ----
import GF.Command.Abstract
@@ -36,7 +36,6 @@ import GF.Quiz
import GF.Command.TreeOperations ---- temporary place for typecheck and compute
import GF.Data.Operations
import GF.Text.Coding
import Data.List
import Data.Maybe
@@ -77,10 +76,10 @@ emptyCommandInfo = CommandInfo {
lookCommand :: String -> Map.Map String CommandInfo -> Maybe CommandInfo
lookCommand = Map.lookup
commandHelpAll :: Encoding -> PGFEnv -> [Option] -> String
commandHelpAll cod pgf opts = unlines
commandHelpAll :: PGFEnv -> [Option] -> String
commandHelpAll pgf opts = unlines
[commandHelp (isOpt "full" opts) (co,info)
| (co,info) <- Map.assocs (allCommands cod pgf)]
| (co,info) <- Map.assocs (allCommands pgf)]
commandHelp :: Bool -> (String,CommandInfo) -> String
commandHelp full (co,info) = unlines $ [
@@ -120,8 +119,8 @@ commandHelpTags full (co,info) = unlines $ [
type PGFEnv = (PGF, Map.Map Language Morpho)
-- this list must no more be kept sorted by the command name
allCommands :: Encoding -> PGFEnv -> Map.Map String CommandInfo
allCommands cod env@(pgf, mos) = Map.fromList [
allCommands :: PGFEnv -> Map.Map String CommandInfo
allCommands env@(pgf, mos) = Map.fromList [
("!", emptyCommandInfo {
synopsis = "system command: escape to system shell",
syntax = "! SYSTEMCOMMAND",
@@ -156,7 +155,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let file s = "_grph." ++ s
let view = optViewGraph opts ++ " "
let format = optViewFormat opts
writeFile (file "dot") (enc grph)
writeFile (file "dot") grph
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
" ; " ++ view ++ file format
return void
@@ -365,10 +364,10 @@ allCommands cod env@(pgf, mos) = Map.fromList [
_ | isOpt "coding" opts -> codingMsg
_ | isOpt "license" opts -> licenseMsg
[t] -> let co = getCommandOp (showExpr [] t) in
case lookCommand co (allCommands cod env) of ---- new map ??!!
case lookCommand co (allCommands env) of ---- new map ??!!
Just info -> commandHelp True (co,info)
_ -> "command not found"
_ -> commandHelpAll cod env opts
_ -> commandHelpAll env opts
in return (fromString msg),
needsTypeCheck = False
}),
@@ -458,7 +457,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let typ = optType opts
mprobs <- optProbs opts pgf
let mt = mexp xs
morphologyQuiz mt mprobs cod pgf lang typ
morphologyQuiz mt mprobs pgf lang typ
return void,
flags = [
("lang","language of the quiz"),
@@ -656,7 +655,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let typ = optType opts
let mt = mexp xs
mprobs <- optProbs opts pgf
translationQuiz mt mprobs cod pgf from to typ
translationQuiz mt mprobs pgf from to typ
return void,
flags = [
("from","translate from this language"),
@@ -687,7 +686,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
exec = \opts arg -> do
let tmpi = "_tmpi" ---
let tmpo = "_tmpo"
writeFile tmpi $ enc $ toString arg
writeFile tmpi $ toString arg
let syst = optComm opts ++ " " ++ tmpi
system $ syst ++ " <" ++ tmpi ++ " >" ++ tmpo
s <- readFile tmpo
@@ -738,7 +737,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let file s = "_grphd." ++ s
let view = optViewGraph opts ++ " "
let format = optViewFormat opts
writeFile (file "dot") (enc grphs)
writeFile (file "dot") grphs
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
" ; " ++ view ++ file format
return void
@@ -779,7 +778,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let file s = "_grph." ++ s
let view = optViewGraph opts ++ " "
let format = optViewFormat opts
writeFile (file "dot") (enc grph)
writeFile (file "dot") grph
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
" ; " ++ view ++ file format
return void
@@ -819,7 +818,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
let file s = "_grph." ++ s
let view = optViewGraph opts ++ " "
let format = optViewFormat opts
writeFile (file "dot") (enc grph)
writeFile (file "dot") grph
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
" ; " ++ view ++ file format
return void
@@ -844,8 +843,8 @@ allCommands cod env@(pgf, mos) = Map.fromList [
exec = \opts arg -> do
let file = valStrOpts "file" "_gftmp" opts
if isOpt "append" opts
then appendFile file (enc (toString arg))
else writeFile file (enc (toString arg))
then appendFile file (toString arg)
else writeFile file (toString arg)
return void,
options = [
("append","append to file, instead of overwriting it")
@@ -889,8 +888,6 @@ allCommands cod env@(pgf, mos) = Map.fromList [
})
]
where
enc = encodeUnicode cod
par opts s = case optOpenTypes opts of
[] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts]
open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts]
@@ -1063,17 +1060,17 @@ stringOpOptions = sort $ [
treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf]
treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf]
translationQuiz :: Maybe Expr -> Maybe Probabilities -> Encoding ->
translationQuiz :: Maybe Expr -> Maybe Probabilities ->
PGF -> Language -> Language -> Type -> IO ()
translationQuiz mex mprobs cod pgf ig og typ = do
translationQuiz mex mprobs pgf ig og typ = do
tts <- translationList mex mprobs pgf ig og typ infinity
mkQuiz cod "Welcome to GF Translation Quiz." tts
mkQuiz "Welcome to GF Translation Quiz." tts
morphologyQuiz :: Maybe Expr -> Maybe Probabilities -> Encoding ->
morphologyQuiz :: Maybe Expr -> Maybe Probabilities ->
PGF -> Language -> Type -> IO ()
morphologyQuiz mex mprobs cod pgf ig typ = do
morphologyQuiz mex mprobs pgf ig typ = do
tts <- morphologyList mex mprobs pgf ig typ infinity
mkQuiz cod "Welcome to GF Morphology Quiz." tts
mkQuiz "Welcome to GF Morphology Quiz." tts
-- | the maximal number of precompiled quiz problems
infinity :: Int

View File

@@ -29,24 +29,24 @@ data CommandEnv = CommandEnv {
expmacros :: Map.Map String Expr
}
mkCommandEnv :: Encoding -> PGF -> CommandEnv
mkCommandEnv enc pgf =
mkCommandEnv :: PGF -> CommandEnv
mkCommandEnv pgf =
let mos = Map.fromList [(la,buildMorpho pgf la) | la <- languages pgf] in
CommandEnv pgf mos (allCommands enc (pgf, mos)) Map.empty Map.empty
CommandEnv pgf mos (allCommands (pgf, mos)) Map.empty Map.empty
emptyCommandEnv :: CommandEnv
emptyCommandEnv = mkCommandEnv UTF_8 emptyPGF
emptyCommandEnv = mkCommandEnv emptyPGF
interpretCommandLine :: (String -> String) -> CommandEnv -> String -> IO ()
interpretCommandLine enc env line =
interpretCommandLine :: CommandEnv -> String -> IO ()
interpretCommandLine env line =
case readCommandLine line of
Just [] -> return ()
Just pipes -> mapM_ (interpretPipe enc env) pipes
Just pipes -> mapM_ (interpretPipe env) pipes
Nothing -> putStrLnFlush "command not parsed"
interpretPipe enc env cs = do
interpretPipe env cs = do
v@(_,s) <- intercs ([],"") cs
putStrLnFlush $ enc s
putStrLnFlush s
return v
where
intercs treess [] = return treess
@@ -57,14 +57,14 @@ interpretPipe enc env cs = do
'%':f -> case Map.lookup f (commandmacros env) of
Just css ->
case getCommandTrees env False arg es of
Right es -> do mapM_ (interpretPipe enc env) (appLine es css)
Right es -> do mapM_ (interpretPipe env) (appLine es css)
return ([],[])
Left msg -> do putStrLn ('\n':msg)
return ([],[])
Nothing -> do
putStrLn $ "command macro " ++ co ++ " not interpreted"
return ([],[])
_ -> interpret enc env es comm
_ -> interpret env es comm
appLine es = map (map (appCommand es))
-- macro definition applications: replace ?i by (exps !! i)
@@ -81,14 +81,14 @@ appCommand xs c@(Command i os arg) = case arg of
EFun x -> EFun x
-- return the trees to be sent in pipe, and the output possibly printed
interpret :: (String -> String) -> CommandEnv -> [Expr] -> Command -> IO CommandOutput
interpret enc env trees comm =
interpret :: CommandEnv -> [Expr] -> Command -> IO CommandOutput
interpret env trees comm =
case getCommand env trees comm of
Left msg -> do putStrLn ('\n':msg)
return ([],[])
Right (info,opts,trees) -> do tss@(_,s) <- exec info opts trees
if isOpt "tr" opts
then putStrLn (enc s)
then putStrLn s
else return ()
return tss

View File

@@ -12,7 +12,6 @@ import GF.Compile.Update
import GF.Compile.Refresh
import GF.Compile.Coding
import GF.Text.UTF8 ----
import GF.Grammar.Grammar
import GF.Grammar.Lookup
@@ -82,7 +81,7 @@ compileSourceGrammar opts gr@(MGrammar ms) = do
-- to output an intermediate stage
intermOut :: Options -> Dump -> Doc -> IOE ()
intermOut opts d doc
| dump opts d = ioeIO (hPutStrLn stderr (encodeUTF8 (render (text "\n\n--#" <+> text (show d) $$ doc))))
| dump opts d = ioeIO (hPutStrLn stderr (render (text "\n\n--#" <+> text (show d) $$ doc)))
| otherwise = return ()
-- | the environment
@@ -162,7 +161,8 @@ compileOne opts env@(_,srcgr,_) file = do
sm00 <- putpOpt ("- parsing" +++ file) ("- compiling" +++ file ++ "... ") $
getSourceModule opts file
let sm0 = decodeStringsInModule sm00
enc <- ioeIO $ mkTextEncoding (renameEncoding (flag optEncoding (flagsModule sm00)))
let sm0 = decodeStringsInModule enc sm00
intermOut opts DumpSource (ppModule Qualified sm0)

View File

@@ -8,12 +8,14 @@ import GF.Infra.Option
import GF.Data.Operations
import Data.Char
import System.IO
import qualified Data.ByteString.Char8 as BS
encodeStringsInModule :: SourceModule -> SourceModule
encodeStringsInModule = codeSourceModule (encodeUnicode UTF_8)
encodeStringsInModule :: TextEncoding -> SourceModule -> SourceModule
encodeStringsInModule enc = codeSourceModule (BS.unpack . encodeUnicode enc)
decodeStringsInModule :: SourceModule -> SourceModule
decodeStringsInModule mo = codeSourceModule (decodeUnicode (flag optEncoding (flagsModule mo))) mo
decodeStringsInModule :: TextEncoding -> SourceModule -> SourceModule
decodeStringsInModule enc mo = codeSourceModule (decodeUnicode enc . BS.pack) mo
codeSourceModule :: (String -> String) -> SourceModule -> SourceModule
codeSourceModule co (id,mo) = (id,replaceJudgements mo (mapTree codj (jments mo)))

View File

@@ -22,7 +22,6 @@ import PGF.Macros
import GF.Data.Operations
import GF.Infra.Option
import GF.Text.UTF8
import Data.List --(isPrefixOf, find, intersperse)
import qualified Data.Map as Map
@@ -34,7 +33,7 @@ grammar2haskell :: Options
-> String -- ^ Module name.
-> PGF
-> String
grammar2haskell opts name gr = encodeUTF8 $ foldr (++++) [] $
grammar2haskell opts name gr = foldr (++++) [] $
pragmas ++ haskPreamble name ++ [types, gfinstances gId lexical gr']
where gr' = hSkeleton gr
gadt = haskellOption opts HaskellGADT

View File

@@ -6,7 +6,6 @@ import qualified PGF.Macros as M
import qualified GF.JavaScript.AbsJS as JS
import qualified GF.JavaScript.PrintJS as JS
import GF.Text.UTF8
import GF.Data.ErrM
import GF.Infra.Option
@@ -21,7 +20,7 @@ import qualified Data.IntMap as IntMap
pgf2js :: PGF -> String
pgf2js pgf =
encodeUTF8 $ JS.printTree $ JS.Program [JS.ElStmt $ JS.SDeclOrExpr $ JS.Decl [JS.DInit (JS.Ident n) grammar]]
JS.printTree $ JS.Program [JS.ElStmt $ JS.SDeclOrExpr $ JS.Decl [JS.DInit (JS.Ident n) grammar]]
where
n = showCId $ absname pgf
as = abstract pgf

View File

@@ -15,7 +15,6 @@ import PGF.Data
import PGF.Macros
import GF.Data.Operations
import GF.Text.UTF8
import qualified Data.Map as Map
import Data.Char (isAlphaNum, isAsciiLower, isAsciiUpper, ord)

View File

@@ -7,7 +7,6 @@
module GF.Data.XML (XML(..), Attr, comments, showXMLDoc, showsXMLDoc, showsXML, bottomUpXML) where
import GF.Data.Utilities
import GF.Text.UTF8
data XML = Data String | CData String | Tag String [Attr] [XML] | ETag String [Attr] | Comment String | Empty
deriving (Ord,Eq,Show)
@@ -21,7 +20,7 @@ showXMLDoc :: XML -> String
showXMLDoc xml = showsXMLDoc xml ""
showsXMLDoc :: XML -> ShowS
showsXMLDoc xml = encodeUTF8 . showString header . showsXML xml
showsXMLDoc xml = showString header . showsXML xml
where header = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
showsXML :: XML -> ShowS

View File

@@ -441,9 +441,7 @@ Exp6
| '?' { Meta 0 }
| '[' ']' { Empty }
| '[' Ident Exps ']' { foldl App (Vr (mkListId $2)) $3 }
| '[' String ']' { case $2 of
[] -> Empty
str -> foldr1 C (map K (words str)) }
| '[' String ']' { K $2 }
| '{' ListLocDef '}' {% mkR $2 }
| '<' ListTupleComp '>' { R (tuple2record $2) }
| '<' Exp ':' Exp '>' { Typed $2 $4 }

View File

@@ -3,7 +3,7 @@ module GF.Infra.Option
-- * Option types
Options,
Flags(..),
Mode(..), Phase(..), Verbosity(..), Encoding(..), OutputFormat(..),
Mode(..), Phase(..), Verbosity(..), OutputFormat(..),
SISRFormat(..), Optimization(..), CFGTransform(..), HaskellOption(..),
Dump(..), Printer(..), Recomp(..),
-- * Option parsing
@@ -17,7 +17,7 @@ module GF.Infra.Option
helpMessage,
-- * Checking specific options
flag, cfgTransform, haskellOption, readOutputFormat,
isLexicalCat, encodings,
isLexicalCat, renameEncoding,
-- * Setting specific options
setOptimization, setCFGTransform,
-- * Convenience methods for checking options
@@ -25,12 +25,13 @@ module GF.Infra.Option
) where
import Control.Monad
import Data.Char (toLower)
import Data.Char (toLower, isDigit)
import Data.List
import Data.Maybe
import GF.Infra.GetOpt
--import System.Console.GetOpt
import System.FilePath
import System.IO
import GF.Data.ErrM
@@ -77,9 +78,6 @@ data Verbosity = Quiet | Normal | Verbose | Debug
data Phase = Preproc | Convert | Compile | Link
deriving (Show,Eq,Ord)
data Encoding = UTF_8 | ISO_8859_1 | CP_1250 | CP_1251 | CP_1252 | CP_1254
deriving (Eq,Ord)
data OutputFormat = FmtPGFPretty
| FmtJavaScript
| FmtHaskell
@@ -161,7 +159,7 @@ data Flags = Flags {
optCncName :: Maybe String,
optResName :: Maybe String,
optPreprocessors :: [String],
optEncoding :: Encoding,
optEncoding :: String,
optOptimizations :: Set Optimization,
optCFGTransforms :: Set CFGTransform,
optLibraryPath :: [FilePath],
@@ -207,7 +205,7 @@ fixRelativeLibPaths curr_dir lib_dir (Options o) = Options (fixPathFlags . o)
-- | Pretty-print the options that are preserved in .gfo files.
optionsGFO :: Options -> [(String,String)]
optionsGFO opts = optionsPGF opts
++ [("coding", show (flag optEncoding opts))]
++ [("coding", flag optEncoding opts)]
-- | Pretty-print the options that are preserved in .pgf files.
optionsPGF :: Options -> [(String,String)]
@@ -260,7 +258,7 @@ defaultFlags = Flags {
optCncName = Nothing,
optResName = Nothing,
optPreprocessors = [],
optEncoding = ISO_8859_1,
optEncoding = "latin1",
optOptimizations = Set.fromList [OptStem,OptCSE,OptExpand,OptParametrize],
optCFGTransforms = Set.fromList [CFGRemoveCycles, CFGBottomUpFilter,
CFGTopDownFilter, CFGMergeIdentical],
@@ -343,8 +341,7 @@ optDescr =
(unlines ["Use CMD to preprocess input files.",
"Multiple preprocessors can be used by giving this option multiple times."]),
Option [] ["coding"] (ReqArg coding "ENCODING")
("Character encoding of the source grammar, ENCODING = "
++ concat (intersperse " | " (map fst encodings)) ++ "."),
("Character encoding of the source grammar, ENCODING = utf8, latin1, cp1251, ..."),
Option [] ["startcat"] (ReqArg startcat "CAT") "Grammar start category.",
Option [] ["language"] (ReqArg language "LANG") "Set the speech language flag to LANG in the generated grammar.",
Option [] ["lexer"] (ReqArg lexer "LEXER") "Use lexer LEXER.",
@@ -400,9 +397,7 @@ optDescr =
addLibDir x = set $ \o -> o { optLibraryPath = x:optLibraryPath o }
setLibPath x = set $ \o -> o { optLibraryPath = splitInModuleSearchPath x }
preproc x = set $ \o -> o { optPreprocessors = optPreprocessors o ++ [x] }
coding x = case lookup x encodings of
Just c -> set $ \o -> o { optEncoding = c }
Nothing -> fail $ "Unknown character encoding: " ++ x
coding x = set $ \o -> o { optEncoding = x }
startcat x = set $ \o -> o { optStartCat = Just x }
language x = set $ \o -> o { optSpeechLanguage = Just x }
lexer x = set $ \o -> o { optLexer = Just x }
@@ -483,18 +478,14 @@ haskellOptionNames =
("gadt", HaskellGADT),
("lexical", HaskellLexical)]
encodings :: [(String,Encoding)]
encodings =
[("utf8", UTF_8),
("cp1250", CP_1250),
("cp1251", CP_1251),
("cp1252", CP_1252),
("cp1254", CP_1254),
("latin1", ISO_8859_1)
]
instance Show Encoding where
show = lookupShow encodings
-- | This is for bacward compatibility. Since GHC 6.12 we
-- started using the native Unicode support in GHC but it
-- uses different names for the code pages.
renameEncoding :: String -> String
renameEncoding "utf8" = "UTF-8"
renameEncoding "latin1" = "CP1252"
renameEncoding ('c':'p':s) | all isDigit s = 'C':'P':s
renameEncoding s = s
lookupShow :: Eq a => [(String,a)] -> a -> String
lookupShow xs z = fromMaybe "lookupShow" $ lookup z [(y,x) | (x,y) <- xs]

View File

@@ -23,7 +23,6 @@ import PGF.Linearize
import GF.Data.Operations
import GF.Infra.UseIO
import GF.Infra.Option
import GF.Text.Coding
import PGF.Probabilistic
import System.Random
@@ -33,9 +32,9 @@ import Data.List (nub)
-- generic quiz function
mkQuiz :: Encoding -> String -> [(String,[String])] -> IO ()
mkQuiz cod msg tts = do
let qas = [ (encodeUnicode cod q, mkAnswer cod as) | (q,as) <- tts]
mkQuiz :: String -> [(String,[String])] -> IO ()
mkQuiz msg tts = do
let qas = [(q, mkAnswer as) | (q,as) <- tts]
teachDialogue qas msg
translationList ::
@@ -62,14 +61,13 @@ morphologyList mex mprobs pgf ig typ number = do
(pwss@(pws0:_),i) <- zip ss forms, let ws = map (\pws -> snd (pws !! i)) pwss]
-- | compare answer to the list of right answers, increase score and give feedback
mkAnswer :: Encoding -> [String] -> String -> (Integer, String)
mkAnswer cod as s =
mkAnswer :: [String] -> String -> (Integer, String)
mkAnswer as s =
if (elem (norm s) as)
then (1,"Yes.")
else (0,"No, not" +++ s ++ ", but" ++++ enc (unlines as))
else (0,"No, not" +++ s ++ ", but" ++++ unlines as)
where
norm = unwords . words . decodeUnicode cod
enc = encodeUnicode cod
norm = unwords . words
norml = unwords . words

View File

@@ -1,91 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1250
-- Maintainer : Krasimir Angelov
--
-- cp1250 is a code page used under Microsoft Windows to represent texts
-- in Central European and Eastern European languages that use Latin script,
-- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
-- Serbian (Latin script), Romanian and Albanian. It may also be used with
-- the German language; German-language texts encoded with cp1250 and cp1252
-- are identical.
--
-----------------------------------------------------------------------------
module GF.Text.CP1250 where
import Data.Char
decodeCP1250 = map convert where
convert c
| c == '\x80' = chr 0x20AC
| c == '\x82' = chr 0x201A
| c == '\x84' = chr 0x201E
| c == '\x85' = chr 0x2026
| c == '\x86' = chr 0x2020
| c == '\x87' = chr 0x2021
| c == '\x89' = chr 0x2030
| c == '\x8A' = chr 0x0160
| c == '\x8B' = chr 0x2039
| c == '\x8C' = chr 0x015A
| c == '\x8D' = chr 0x0164
| c == '\x8E' = chr 0x017D
| c == '\x8F' = chr 0x0179
| c == '\x91' = chr 0x2018
| c == '\x92' = chr 0x2019
| c == '\x93' = chr 0x201C
| c == '\x94' = chr 0x201D
| c == '\x95' = chr 0x2022
| c == '\x96' = chr 0x2013
| c == '\x97' = chr 0x2014
| c == '\x99' = chr 0x2122
| c == '\x9A' = chr 0x0161
| c == '\x9B' = chr 0x203A
| c == '\x9C' = chr 0x015B
| c == '\x9D' = chr 0x0165
| c == '\x9E' = chr 0x017E
| c == '\x9F' = chr 0x017A
| c == '\xA1' = chr 0x02C7
| c == '\xA5' = chr 0x0104
| c == '\xB9' = chr 0x0105
| c == '\xBC' = chr 0x013D
| c == '\xBE' = chr 0x013E
| otherwise = c
encodeCP1250 = map convert where
convert c
| oc == 0x20AC = '\x80'
| oc == 0x201A = '\x82'
| oc == 0x201E = '\x84'
| oc == 0x2026 = '\x85'
| oc == 0x2020 = '\x86'
| oc == 0x2021 = '\x87'
| oc == 0x2030 = '\x89'
| oc == 0x0160 = '\x8A'
| oc == 0x2039 = '\x8B'
| oc == 0x015A = '\x8C'
| oc == 0x0164 = '\x8D'
| oc == 0x017D = '\x8E'
| oc == 0x0179 = '\x8F'
| oc == 0x2018 = '\x91'
| oc == 0x2019 = '\x92'
| oc == 0x201C = '\x93'
| oc == 0x201D = '\x94'
| oc == 0x2022 = '\x95'
| oc == 0x2013 = '\x96'
| oc == 0x2014 = '\x97'
| oc == 0x2122 = '\x99'
| oc == 0x0161 = '\x9A'
| oc == 0x203A = '\x9B'
| oc == 0x015B = '\x9C'
| oc == 0x0165 = '\x9D'
| oc == 0x017E = '\x9E'
| oc == 0x017A = '\x9F'
| oc == 0x02C7 = '\xA1'
| oc == 0x0104 = '\xA5'
| oc == 0x0105 = '\xB9'
| oc == 0x013D = '\xBC'
| oc == 0x013E = '\xBE'
| otherwise = c
where oc = ord c

View File

@@ -1,86 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1251
-- Maintainer : Krasimir Angelov
--
-- cp1251 is a popular 8-bit character encoding, designed to cover languages
-- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
-- and other languages. It is the most widely used for encoding the Bulgarian,
-- Serbian and Macedonian languages.
--
-----------------------------------------------------------------------------
module GF.Text.CP1251 where
import Data.Char
decodeCP1251 = map convert where
convert c
| c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
| c == '\xA8' = chr 0x401 -- cyrillic capital letter lo
| c == '\x80' = chr 0x402
| c == '\x81' = chr 0x403
| c == '\xAA' = chr 0x404
| c == '\xBD' = chr 0x405
| c == '\xB2' = chr 0x406
| c == '\xAF' = chr 0x407
| c == '\xA3' = chr 0x408
| c == '\x8A' = chr 0x409
| c == '\x8C' = chr 0x40A
| c == '\x8E' = chr 0x40B
| c == '\x8D' = chr 0x40C
| c == '\xA1' = chr 0x40E
| c == '\x8F' = chr 0x40F
| c == '\xB8' = chr 0x451 -- cyrillic small letter lo
| c == '\x90' = chr 0x452
| c == '\x83' = chr 0x453
| c == '\xBA' = chr 0x454
| c == '\xBE' = chr 0x455
| c == '\xB3' = chr 0x456
| c == '\xBF' = chr 0x457
| c == '\xBC' = chr 0x458
| c == '\x9A' = chr 0x459
| c == '\x9C' = chr 0x45A
| c == '\x9E' = chr 0x45B
| c == '\x9D' = chr 0x45C
| c == '\xA2' = chr 0x45E
| c == '\x9F' = chr 0x45F
| c == '\xA5' = chr 0x490
| c == '\xB4' = chr 0x491
| otherwise = c
encodeCP1251 = map convert where
convert c
| oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
| oc == 0x401 = '\xA8' -- cyrillic capital letter lo
| oc == 0x402 = '\x80'
| oc == 0x403 = '\x81'
| oc == 0x404 = '\xAA'
| oc == 0x405 = '\xBD'
| oc == 0x406 = '\xB2'
| oc == 0x407 = '\xAF'
| oc == 0x408 = '\xA3'
| oc == 0x409 = '\x8A'
| oc == 0x40A = '\x8C'
| oc == 0x40B = '\x8E'
| oc == 0x40C = '\x8D'
| oc == 0x40E = '\xA1'
| oc == 0x40F = '\x8F'
| oc == 0x451 = '\xB8' -- cyrillic small letter lo
| oc == 0x452 = '\x90'
| oc == 0x453 = '\x83'
| oc == 0x454 = '\xBA'
| oc == 0x455 = '\xBE'
| oc == 0x456 = '\xB3'
| oc == 0x457 = '\xBF'
| oc == 0x458 = '\xBC'
| oc == 0x459 = '\x9A'
| oc == 0x45A = '\x9C'
| oc == 0x45B = '\x9E'
| oc == 0x45C = '\x9D'
| oc == 0x45E = '\xA2'
| oc == 0x45F = '\x9F'
| oc == 0x490 = '\xA5'
| oc == 0x491 = '\xB4'
| otherwise = c
where oc = ord c

View File

@@ -1,17 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1252
-- Maintainer : Krasimir Angelov
--
-- cp1252 is a character encoding of the Latin alphabet, used by default in
-- the legacy components of Microsoft Windows in English and some other
-- Western languages.
--
-----------------------------------------------------------------------------
module GF.Text.CP1252 where
import Data.Char
decodeCP1252 = map id
encodeCP1252 = map (\x -> if x <= '\255' then x else '?')

View File

@@ -1,84 +0,0 @@
-----------------------------------------------------------------------------
-- |
-- Module : GF.Text.CP1254
-- Maintainer : Krasimir Angelov
--
-- cp1254 is a code page used under Microsoft Windows to write Turkish.
-- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
--
-----------------------------------------------------------------------------
module GF.Text.CP1254 where
import Data.Char
decodeCP1254 = map convert where
convert c
| c == '\x80' = chr 0x20AC
| c == '\x82' = chr 0x201A
| c == '\x83' = chr 0x192
| c == '\x84' = chr 0x201E
| c == '\x85' = chr 0x2026
| c == '\x86' = chr 0x2020
| c == '\x87' = chr 0x2021
| c == '\x88' = chr 0x2C6
| c == '\x89' = chr 0x2030
| c == '\x8A' = chr 0x160
| c == '\x8B' = chr 0x2039
| c == '\x8C' = chr 0x152
| c == '\x91' = chr 0x2018
| c == '\x92' = chr 0x2019
| c == '\x93' = chr 0x201C
| c == '\x94' = chr 0x201D
| c == '\x95' = chr 0x2022
| c == '\x96' = chr 0x2013
| c == '\x97' = chr 0x2014
| c == '\x98' = chr 0x2DC
| c == '\x99' = chr 0x2122
| c == '\x9A' = chr 0x161
| c == '\x9B' = chr 0x203A
| c == '\x9C' = chr 0x153
| c == '\x9F' = chr 0x178
| c == '\xD0' = chr 0x11E
| c == '\xDD' = chr 0x130
| c == '\xDE' = chr 0x15E
| c == '\xF0' = chr 0x11F
| c == '\xFD' = chr 0x131
| c == '\xFE' = chr 0x15F
| otherwise = c
encodeCP1254 = map convert where
convert c
| oc == 0x20AC = '\x80'
| oc == 0x201A = '\x82'
| oc == 0x192 = '\x83'
| oc == 0x201E = '\x84'
| oc == 0x2026 = '\x85'
| oc == 0x2020 = '\x86'
| oc == 0x2021 = '\x87'
| oc == 0x2C6 = '\x88'
| oc == 0x2030 = '\x89'
| oc == 0x160 = '\x8A'
| oc == 0x2039 = '\x8B'
| oc == 0x152 = '\x8C'
| oc == 0x2018 = '\x91'
| oc == 0x2019 = '\x92'
| oc == 0x201C = '\x93'
| oc == 0x201D = '\x94'
| oc == 0x2022 = '\x95'
| oc == 0x2013 = '\x96'
| oc == 0x2014 = '\x97'
| oc == 0x2DC = '\x98'
| oc == 0x2122 = '\x99'
| oc == 0x161 = '\x9A'
| oc == 0x203A = '\x9B'
| oc == 0x153 = '\x9C'
| oc == 0x178 = '\x9F'
| oc == 0x11E = '\xD0'
| oc == 0x130 = '\xDD'
| oc == 0x15E = '\xDE'
| oc == 0x11F = '\xF0'
| oc == 0x131 = '\xFD'
| oc == 0x15F = '\xFE'
| otherwise = c
where oc = ord c

View File

@@ -1,24 +1,69 @@
module GF.Text.Coding where
import GF.Infra.Option
import GF.Text.UTF8
import GF.Text.CP1250
import GF.Text.CP1251
import GF.Text.CP1252
import GF.Text.CP1254
import qualified Data.ByteString as BS
import Data.ByteString.Internal
import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Encoding
import GHC.IO.Exception
import Control.Monad
encodeUnicode e = case e of
UTF_8 -> encodeUTF8
CP_1250 -> encodeCP1250
CP_1251 -> encodeCP1251
CP_1252 -> encodeCP1252
CP_1254 -> encodeCP1254
_ -> id
encodeUnicode :: TextEncoding -> String -> ByteString
encodeUnicode enc s =
unsafePerformIO $ do
let len = length s
cbuf0 <- newCharBuffer (len*4) ReadBuffer
foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
let cbuf = cbuf0{bufR=len}
case enc of
TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
bss <- translate (encode encoder) cbuf
close encoder
return (BS.concat bss)
where
translate cod cbuf
| i < w = do bbuf <- newByteBuffer 128 WriteBuffer
(cbuf,bbuf) <- cod cbuf bbuf
if isEmptyBuffer bbuf
then ioe_invalidCharacter
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
bss <- translate cod cbuf
return (bs:bss)
| otherwise = return []
where
i = bufL cbuf
w = bufR cbuf
decodeUnicode e = case e of
UTF_8 -> decodeUTF8
CP_1250 -> decodeCP1250
CP_1251 -> decodeCP1251
CP_1252 -> decodeCP1252
CP_1254 -> decodeCP1254
_ -> id
decodeUnicode :: TextEncoding -> ByteString -> String
decodeUnicode enc (PS fptr l len) =
unsafePerformIO $ do
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
cbuf <- newCharBuffer 128 WriteBuffer
case enc of
TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
s <- translate (encode decoder) bbuf cbuf
close decoder
return s
where
translate cod bbuf cbuf
| i < w = do (bbuf,cbuf) <- cod bbuf cbuf
if isEmptyBuffer cbuf
then ioe_invalidCharacter
else unpack cod bbuf cbuf
| otherwise = return []
where
i = bufL bbuf
w = bufR bbuf
unpack cod bbuf cbuf
| i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i
cs <- unpack cod bbuf cbuf{bufL=i'}
return (c:cs)
| otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
where
i = bufL cbuf
w = bufR cbuf
ioe_invalidCharacter = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this encoding") Nothing Nothing)

View File

@@ -1,8 +1,6 @@
module GF.Text.Lexing (stringOp,opInEnv) where
import GF.Text.Transliterations
import GF.Text.UTF8
import GF.Text.CP1251
import Data.Char
import Data.List (intersperse)
@@ -23,10 +21,6 @@ stringOp name = case name of
"unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
"to_utf8" -> Just encodeUTF8
"from_utf8" -> Just decodeUTF8
"to_cp1251" -> Just encodeCP1251
"from_cp1251" -> Just decodeCP1251
_ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--"

View File

@@ -5,8 +5,6 @@ module GF.Text.Transliterations (
transliterationPrintNames
) where
import GF.Text.UTF8
import Data.Char
import Numeric
import qualified Data.Map as Map

View File

@@ -1,48 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : UTF8
-- Maintainer : AR
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- From the Char module supplied with HBC.
-- code by Thomas Hallgren (Jul 10 1999)
-----------------------------------------------------------------------------
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
-- | Take a Unicode string and encode it as a string
-- with the UTF8 method.
decodeUTF8 :: String -> String
decodeUTF8 "" = ""
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
'\x80' <= c' && c' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
'\x80' <= c' && c' <= '\xbf' &&
'\x80' <= c'' && c'' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 s = s ---- AR workaround 22/6/2006
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
encodeUTF8 :: String -> String
encodeUTF8 "" = ""
encodeUTF8 (c:cs) =
if c > '\x0000' && c < '\x0080' then
c : encodeUTF8 cs
else if c < toEnum 0x0800 then
let i = fromEnum c
in toEnum (0xc0 + i `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs
else
let i = fromEnum c
in toEnum (0xe0 + i `div` 0x1000) :
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs

View File

@@ -17,6 +17,7 @@ import Data.Maybe
import Data.Binary
import System.FilePath
import System.IO
import Control.Exception
mainGFC :: Options -> [FilePath] -> IOE ()
@@ -81,8 +82,8 @@ writeOutput opts file str =
do let path = case flag optOutputDir opts of
Nothing -> file
Just dir -> dir </> file
writeOutputFile opts path str
writeOutputFile :: Options -> FilePath -> String -> IOE ()
writeOutputFile opts outfile output =
do putPointE Normal opts ("Writing " ++ outfile ++ "...") $ ioeIO $ writeFile outfile output
putPointE Normal opts ("Writing " ++ path ++ "...") $ ioeIO $
bracket
(openFile path WriteMode)
(hClose)
(\h -> hSetEncoding h utf8 >> hPutStr h str)

View File

@@ -21,7 +21,6 @@ import GF.Infra.Option
import GF.Infra.Modules (greatestResource, modules, emptyModInfo)
import GF.System.Readline
import GF.Text.Coding
import GF.Compile.Coding
import PGF
@@ -34,6 +33,7 @@ import Data.List(isPrefixOf)
import qualified Data.Map as Map
import qualified Data.ByteString.Char8 as BS
import qualified Text.ParserCombinators.ReadP as RP
import System.IO
import System.Cmd
import System.CPUTime
import System.Directory
@@ -86,9 +86,7 @@ loop opts gfenv0 = do
s0 <- fetch
let gfenv = gfenv0 {history = s0 : history gfenv0}
let
enc = encode gfenv
s = decode gfenv s0
pwords = case words s of
pwords = case words s0 of
w:ws -> getCommandOp w :ws
ws -> ws
@@ -130,8 +128,8 @@ loop opts gfenv0 = do
case runP pExp (BS.pack s) of
Left (_,msg) -> putStrLn msg
Right t -> case checkComputeTerm sgr (codeTerm (decode gfenv) (L (0,0) t)) of
Ok x -> putStrLn $ enc (showTerm sgr style q x)
Bad s -> putStrLn $ enc s
Ok x -> putStrLn $ showTerm sgr style q x
Bad s -> putStrLn $ s
loopNewCPU gfenv
"dg":ws -> do
let stop = case ws of
@@ -141,7 +139,7 @@ loop opts gfenv0 = do
putStrLn "wrote graph in file _gfdepgraph.dot"
loopNewCPU gfenv
"eh":w:_ -> do
cs <- readFile w >>= return . map (interpretCommandLine enc env) . lines
cs <- readFile w >>= return . map (interpretCommandLine env) . lines
loopNewCPU gfenv
"i":args -> do
@@ -179,25 +177,28 @@ loop opts gfenv0 = do
_ -> putStrLn "value definition not parsed" >> loopNewCPU gfenv
"ph":_ ->
mapM_ (putStrLn . enc) (reverse (history gfenv0)) >> loopNewCPU gfenv
"se":c:_ ->
case lookup c encodings of
Just cod -> do
mapM_ putStrLn (reverse (history gfenv0)) >> loopNewCPU gfenv
"se":c:_ -> do
let cod = renameEncoding c
#ifdef mingw32_HOST_OS
case c of
'c':'p':c -> case reads c of
[(cp,"")] -> setConsoleCP cp >> setConsoleOutputCP cp
_ -> return ()
"utf8" -> setConsoleCP 65001 >> setConsoleOutputCP 65001
_ -> return ()
case cod of
'C':'P':c -> case reads c of
[(cp,"")] -> do setConsoleCP cp
setConsoleOutputCP cp
_ -> return ()
"UTF-8" -> do setConsoleCP 65001
setConsoleOutputCP 65001
_ -> return ()
#endif
loopNewCPU $ gfenv {coding = cod}
Nothing -> do putStrLn "unknown encoding"
loopNewCPU gfenv
enc <- mkTextEncoding cod
hSetEncoding stdin enc
hSetEncoding stdout enc
hSetEncoding stderr enc
loopNewCPU gfenv
-- ordinary commands, working on CommandEnv
_ -> do
interpretCommandLine enc env s
interpretCommandLine env s0
loopNewCPU gfenv
-- gfenv' <- return $ either (const gfenv) id r
gfenv' <- either (\e -> (print e >> return gfenv)) return r
@@ -215,7 +216,7 @@ importInEnv gfenv opts files
if (verbAtLeast opts Normal)
then putStrLnFlush $ unwords $ "\nLanguages:" : map showCId (languages pgf1)
else return ()
return $ gfenv { commandenv = mkCommandEnv (coding gfenv) pgf1 }
return $ gfenv { commandenv = mkCommandEnv pgf1 }
tryGetLine = do
res <- try getLine
@@ -252,24 +253,16 @@ data GFEnv = GFEnv {
sourcegrammar :: SourceGrammar, -- gfo grammar -retain
commandenv :: CommandEnv,
history :: [String],
cputime :: Integer,
coding :: Encoding
cputime :: Integer
}
emptyGFEnv :: IO GFEnv
emptyGFEnv = do
#ifdef mingw32_HOST_OS
codepage <- getACP
let coding = fromMaybe UTF_8 (lookup ("cp"++show codepage) encodings)
#else
let coding = UTF_8
#endif
return $ GFEnv emptySourceGrammar{modules=[(identW,emptyModInfo)]} (mkCommandEnv coding emptyPGF) [] 0 coding
return $ GFEnv emptySourceGrammar{modules=[(identW,emptyModInfo)]} (mkCommandEnv emptyPGF) [] 0
encode = encodeUnicode . coding
decode = decodeUnicode . coding
decode _ = id -- decodeUnicode . coding
wordCompletion gfenv line0 prefix0 p =
wordCompletion gfenv line prefix p =
case wc_type (take p line) of
CmplCmd pref
-> ret ' ' [name | name <- Map.keys (commands cmdEnv), isPrefixOf pref name]
@@ -280,7 +273,7 @@ wordCompletion gfenv line0 prefix0 p =
in case loop state0 ws of
Nothing -> ret ' ' []
Just state -> let compls = getCompletions state prefix
in ret ' ' (map (encode gfenv) (Map.keys compls))
in ret ' ' (Map.keys compls)
Left (_ :: SomeException) -> ret ' ' []
CmplOpt (Just (Command n _ _)) pref
-> case Map.lookup n (commands cmdEnv) of
@@ -298,9 +291,6 @@ wordCompletion gfenv line0 prefix0 p =
Left (_ :: SomeException) -> ret ' ' []
_ -> ret ' ' []
where
line = decode gfenv line0
prefix = decode gfenv prefix0
pgf = multigrammar cmdEnv
cmdEnv = commandenv gfenv
optLang opts = valCIdOpts "lang" (head (languages pgf)) opts