mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
use the native unicode support from GHC 6.12
This commit is contained in:
1
GF.cabal
1
GF.cabal
@@ -89,7 +89,6 @@ executable gf
|
||||
GF.JavaScript.AbsJS
|
||||
GF.JavaScript.PrintJS
|
||||
GF.Infra.CompactPrint
|
||||
GF.Text.UTF8
|
||||
GF.Data.TrieMap
|
||||
GF.Data.Utilities
|
||||
GF.Data.SortedList
|
||||
|
||||
@@ -24,6 +24,10 @@ main = do
|
||||
codepage <- getACP
|
||||
setConsoleCP codepage
|
||||
setConsoleOutputCP codepage
|
||||
enc <- mkTextEncoding ("CP"++show codepage)
|
||||
hSetEncoding stdin enc
|
||||
hSetEncoding stdout enc
|
||||
hSetEncoding stderr enc
|
||||
#endif
|
||||
args <- getArgs
|
||||
case parseOptions args of
|
||||
|
||||
@@ -24,7 +24,7 @@ import PGF.Probabilistic -- (getProbsFromFile,prProbabilities,defaultProbabiliti
|
||||
import PGF.Generate (generateRandomFrom) ----
|
||||
import GF.Compile.Export
|
||||
import GF.Compile.ExampleBased
|
||||
import GF.Infra.Option (noOptions, readOutputFormat, Encoding(..))
|
||||
import GF.Infra.Option (noOptions, readOutputFormat)
|
||||
import GF.Infra.UseIO
|
||||
import GF.Data.ErrM ----
|
||||
import GF.Command.Abstract
|
||||
@@ -36,7 +36,6 @@ import GF.Quiz
|
||||
import GF.Command.TreeOperations ---- temporary place for typecheck and compute
|
||||
|
||||
import GF.Data.Operations
|
||||
import GF.Text.Coding
|
||||
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
@@ -77,10 +76,10 @@ emptyCommandInfo = CommandInfo {
|
||||
lookCommand :: String -> Map.Map String CommandInfo -> Maybe CommandInfo
|
||||
lookCommand = Map.lookup
|
||||
|
||||
commandHelpAll :: Encoding -> PGFEnv -> [Option] -> String
|
||||
commandHelpAll cod pgf opts = unlines
|
||||
commandHelpAll :: PGFEnv -> [Option] -> String
|
||||
commandHelpAll pgf opts = unlines
|
||||
[commandHelp (isOpt "full" opts) (co,info)
|
||||
| (co,info) <- Map.assocs (allCommands cod pgf)]
|
||||
| (co,info) <- Map.assocs (allCommands pgf)]
|
||||
|
||||
commandHelp :: Bool -> (String,CommandInfo) -> String
|
||||
commandHelp full (co,info) = unlines $ [
|
||||
@@ -120,8 +119,8 @@ commandHelpTags full (co,info) = unlines $ [
|
||||
type PGFEnv = (PGF, Map.Map Language Morpho)
|
||||
|
||||
-- this list must no more be kept sorted by the command name
|
||||
allCommands :: Encoding -> PGFEnv -> Map.Map String CommandInfo
|
||||
allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
allCommands :: PGFEnv -> Map.Map String CommandInfo
|
||||
allCommands env@(pgf, mos) = Map.fromList [
|
||||
("!", emptyCommandInfo {
|
||||
synopsis = "system command: escape to system shell",
|
||||
syntax = "! SYSTEMCOMMAND",
|
||||
@@ -156,7 +155,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let file s = "_grph." ++ s
|
||||
let view = optViewGraph opts ++ " "
|
||||
let format = optViewFormat opts
|
||||
writeFile (file "dot") (enc grph)
|
||||
writeFile (file "dot") grph
|
||||
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
|
||||
" ; " ++ view ++ file format
|
||||
return void
|
||||
@@ -365,10 +364,10 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
_ | isOpt "coding" opts -> codingMsg
|
||||
_ | isOpt "license" opts -> licenseMsg
|
||||
[t] -> let co = getCommandOp (showExpr [] t) in
|
||||
case lookCommand co (allCommands cod env) of ---- new map ??!!
|
||||
case lookCommand co (allCommands env) of ---- new map ??!!
|
||||
Just info -> commandHelp True (co,info)
|
||||
_ -> "command not found"
|
||||
_ -> commandHelpAll cod env opts
|
||||
_ -> commandHelpAll env opts
|
||||
in return (fromString msg),
|
||||
needsTypeCheck = False
|
||||
}),
|
||||
@@ -458,7 +457,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let typ = optType opts
|
||||
mprobs <- optProbs opts pgf
|
||||
let mt = mexp xs
|
||||
morphologyQuiz mt mprobs cod pgf lang typ
|
||||
morphologyQuiz mt mprobs pgf lang typ
|
||||
return void,
|
||||
flags = [
|
||||
("lang","language of the quiz"),
|
||||
@@ -656,7 +655,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let typ = optType opts
|
||||
let mt = mexp xs
|
||||
mprobs <- optProbs opts pgf
|
||||
translationQuiz mt mprobs cod pgf from to typ
|
||||
translationQuiz mt mprobs pgf from to typ
|
||||
return void,
|
||||
flags = [
|
||||
("from","translate from this language"),
|
||||
@@ -687,7 +686,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
exec = \opts arg -> do
|
||||
let tmpi = "_tmpi" ---
|
||||
let tmpo = "_tmpo"
|
||||
writeFile tmpi $ enc $ toString arg
|
||||
writeFile tmpi $ toString arg
|
||||
let syst = optComm opts ++ " " ++ tmpi
|
||||
system $ syst ++ " <" ++ tmpi ++ " >" ++ tmpo
|
||||
s <- readFile tmpo
|
||||
@@ -738,7 +737,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let file s = "_grphd." ++ s
|
||||
let view = optViewGraph opts ++ " "
|
||||
let format = optViewFormat opts
|
||||
writeFile (file "dot") (enc grphs)
|
||||
writeFile (file "dot") grphs
|
||||
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
|
||||
" ; " ++ view ++ file format
|
||||
return void
|
||||
@@ -779,7 +778,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let file s = "_grph." ++ s
|
||||
let view = optViewGraph opts ++ " "
|
||||
let format = optViewFormat opts
|
||||
writeFile (file "dot") (enc grph)
|
||||
writeFile (file "dot") grph
|
||||
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
|
||||
" ; " ++ view ++ file format
|
||||
return void
|
||||
@@ -819,7 +818,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
let file s = "_grph." ++ s
|
||||
let view = optViewGraph opts ++ " "
|
||||
let format = optViewFormat opts
|
||||
writeFile (file "dot") (enc grph)
|
||||
writeFile (file "dot") grph
|
||||
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
|
||||
" ; " ++ view ++ file format
|
||||
return void
|
||||
@@ -844,8 +843,8 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
exec = \opts arg -> do
|
||||
let file = valStrOpts "file" "_gftmp" opts
|
||||
if isOpt "append" opts
|
||||
then appendFile file (enc (toString arg))
|
||||
else writeFile file (enc (toString arg))
|
||||
then appendFile file (toString arg)
|
||||
else writeFile file (toString arg)
|
||||
return void,
|
||||
options = [
|
||||
("append","append to file, instead of overwriting it")
|
||||
@@ -889,8 +888,6 @@ allCommands cod env@(pgf, mos) = Map.fromList [
|
||||
})
|
||||
]
|
||||
where
|
||||
enc = encodeUnicode cod
|
||||
|
||||
par opts s = case optOpenTypes opts of
|
||||
[] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts]
|
||||
open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts]
|
||||
@@ -1063,17 +1060,17 @@ stringOpOptions = sort $ [
|
||||
treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf]
|
||||
treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf]
|
||||
|
||||
translationQuiz :: Maybe Expr -> Maybe Probabilities -> Encoding ->
|
||||
translationQuiz :: Maybe Expr -> Maybe Probabilities ->
|
||||
PGF -> Language -> Language -> Type -> IO ()
|
||||
translationQuiz mex mprobs cod pgf ig og typ = do
|
||||
translationQuiz mex mprobs pgf ig og typ = do
|
||||
tts <- translationList mex mprobs pgf ig og typ infinity
|
||||
mkQuiz cod "Welcome to GF Translation Quiz." tts
|
||||
mkQuiz "Welcome to GF Translation Quiz." tts
|
||||
|
||||
morphologyQuiz :: Maybe Expr -> Maybe Probabilities -> Encoding ->
|
||||
morphologyQuiz :: Maybe Expr -> Maybe Probabilities ->
|
||||
PGF -> Language -> Type -> IO ()
|
||||
morphologyQuiz mex mprobs cod pgf ig typ = do
|
||||
morphologyQuiz mex mprobs pgf ig typ = do
|
||||
tts <- morphologyList mex mprobs pgf ig typ infinity
|
||||
mkQuiz cod "Welcome to GF Morphology Quiz." tts
|
||||
mkQuiz "Welcome to GF Morphology Quiz." tts
|
||||
|
||||
-- | the maximal number of precompiled quiz problems
|
||||
infinity :: Int
|
||||
|
||||
@@ -29,24 +29,24 @@ data CommandEnv = CommandEnv {
|
||||
expmacros :: Map.Map String Expr
|
||||
}
|
||||
|
||||
mkCommandEnv :: Encoding -> PGF -> CommandEnv
|
||||
mkCommandEnv enc pgf =
|
||||
mkCommandEnv :: PGF -> CommandEnv
|
||||
mkCommandEnv pgf =
|
||||
let mos = Map.fromList [(la,buildMorpho pgf la) | la <- languages pgf] in
|
||||
CommandEnv pgf mos (allCommands enc (pgf, mos)) Map.empty Map.empty
|
||||
CommandEnv pgf mos (allCommands (pgf, mos)) Map.empty Map.empty
|
||||
|
||||
emptyCommandEnv :: CommandEnv
|
||||
emptyCommandEnv = mkCommandEnv UTF_8 emptyPGF
|
||||
emptyCommandEnv = mkCommandEnv emptyPGF
|
||||
|
||||
interpretCommandLine :: (String -> String) -> CommandEnv -> String -> IO ()
|
||||
interpretCommandLine enc env line =
|
||||
interpretCommandLine :: CommandEnv -> String -> IO ()
|
||||
interpretCommandLine env line =
|
||||
case readCommandLine line of
|
||||
Just [] -> return ()
|
||||
Just pipes -> mapM_ (interpretPipe enc env) pipes
|
||||
Just pipes -> mapM_ (interpretPipe env) pipes
|
||||
Nothing -> putStrLnFlush "command not parsed"
|
||||
|
||||
interpretPipe enc env cs = do
|
||||
interpretPipe env cs = do
|
||||
v@(_,s) <- intercs ([],"") cs
|
||||
putStrLnFlush $ enc s
|
||||
putStrLnFlush s
|
||||
return v
|
||||
where
|
||||
intercs treess [] = return treess
|
||||
@@ -57,14 +57,14 @@ interpretPipe enc env cs = do
|
||||
'%':f -> case Map.lookup f (commandmacros env) of
|
||||
Just css ->
|
||||
case getCommandTrees env False arg es of
|
||||
Right es -> do mapM_ (interpretPipe enc env) (appLine es css)
|
||||
Right es -> do mapM_ (interpretPipe env) (appLine es css)
|
||||
return ([],[])
|
||||
Left msg -> do putStrLn ('\n':msg)
|
||||
return ([],[])
|
||||
Nothing -> do
|
||||
putStrLn $ "command macro " ++ co ++ " not interpreted"
|
||||
return ([],[])
|
||||
_ -> interpret enc env es comm
|
||||
_ -> interpret env es comm
|
||||
appLine es = map (map (appCommand es))
|
||||
|
||||
-- macro definition applications: replace ?i by (exps !! i)
|
||||
@@ -81,14 +81,14 @@ appCommand xs c@(Command i os arg) = case arg of
|
||||
EFun x -> EFun x
|
||||
|
||||
-- return the trees to be sent in pipe, and the output possibly printed
|
||||
interpret :: (String -> String) -> CommandEnv -> [Expr] -> Command -> IO CommandOutput
|
||||
interpret enc env trees comm =
|
||||
interpret :: CommandEnv -> [Expr] -> Command -> IO CommandOutput
|
||||
interpret env trees comm =
|
||||
case getCommand env trees comm of
|
||||
Left msg -> do putStrLn ('\n':msg)
|
||||
return ([],[])
|
||||
Right (info,opts,trees) -> do tss@(_,s) <- exec info opts trees
|
||||
if isOpt "tr" opts
|
||||
then putStrLn (enc s)
|
||||
then putStrLn s
|
||||
else return ()
|
||||
return tss
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ import GF.Compile.Update
|
||||
import GF.Compile.Refresh
|
||||
|
||||
import GF.Compile.Coding
|
||||
import GF.Text.UTF8 ----
|
||||
|
||||
import GF.Grammar.Grammar
|
||||
import GF.Grammar.Lookup
|
||||
@@ -82,7 +81,7 @@ compileSourceGrammar opts gr@(MGrammar ms) = do
|
||||
-- to output an intermediate stage
|
||||
intermOut :: Options -> Dump -> Doc -> IOE ()
|
||||
intermOut opts d doc
|
||||
| dump opts d = ioeIO (hPutStrLn stderr (encodeUTF8 (render (text "\n\n--#" <+> text (show d) $$ doc))))
|
||||
| dump opts d = ioeIO (hPutStrLn stderr (render (text "\n\n--#" <+> text (show d) $$ doc)))
|
||||
| otherwise = return ()
|
||||
|
||||
-- | the environment
|
||||
@@ -162,7 +161,8 @@ compileOne opts env@(_,srcgr,_) file = do
|
||||
|
||||
sm00 <- putpOpt ("- parsing" +++ file) ("- compiling" +++ file ++ "... ") $
|
||||
getSourceModule opts file
|
||||
let sm0 = decodeStringsInModule sm00
|
||||
enc <- ioeIO $ mkTextEncoding (renameEncoding (flag optEncoding (flagsModule sm00)))
|
||||
let sm0 = decodeStringsInModule enc sm00
|
||||
|
||||
intermOut opts DumpSource (ppModule Qualified sm0)
|
||||
|
||||
|
||||
@@ -8,12 +8,14 @@ import GF.Infra.Option
|
||||
import GF.Data.Operations
|
||||
|
||||
import Data.Char
|
||||
import System.IO
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
|
||||
encodeStringsInModule :: SourceModule -> SourceModule
|
||||
encodeStringsInModule = codeSourceModule (encodeUnicode UTF_8)
|
||||
encodeStringsInModule :: TextEncoding -> SourceModule -> SourceModule
|
||||
encodeStringsInModule enc = codeSourceModule (BS.unpack . encodeUnicode enc)
|
||||
|
||||
decodeStringsInModule :: SourceModule -> SourceModule
|
||||
decodeStringsInModule mo = codeSourceModule (decodeUnicode (flag optEncoding (flagsModule mo))) mo
|
||||
decodeStringsInModule :: TextEncoding -> SourceModule -> SourceModule
|
||||
decodeStringsInModule enc mo = codeSourceModule (decodeUnicode enc . BS.pack) mo
|
||||
|
||||
codeSourceModule :: (String -> String) -> SourceModule -> SourceModule
|
||||
codeSourceModule co (id,mo) = (id,replaceJudgements mo (mapTree codj (jments mo)))
|
||||
|
||||
@@ -22,7 +22,6 @@ import PGF.Macros
|
||||
|
||||
import GF.Data.Operations
|
||||
import GF.Infra.Option
|
||||
import GF.Text.UTF8
|
||||
|
||||
import Data.List --(isPrefixOf, find, intersperse)
|
||||
import qualified Data.Map as Map
|
||||
@@ -34,7 +33,7 @@ grammar2haskell :: Options
|
||||
-> String -- ^ Module name.
|
||||
-> PGF
|
||||
-> String
|
||||
grammar2haskell opts name gr = encodeUTF8 $ foldr (++++) [] $
|
||||
grammar2haskell opts name gr = foldr (++++) [] $
|
||||
pragmas ++ haskPreamble name ++ [types, gfinstances gId lexical gr']
|
||||
where gr' = hSkeleton gr
|
||||
gadt = haskellOption opts HaskellGADT
|
||||
|
||||
@@ -6,7 +6,6 @@ import qualified PGF.Macros as M
|
||||
import qualified GF.JavaScript.AbsJS as JS
|
||||
import qualified GF.JavaScript.PrintJS as JS
|
||||
|
||||
import GF.Text.UTF8
|
||||
import GF.Data.ErrM
|
||||
import GF.Infra.Option
|
||||
|
||||
@@ -21,7 +20,7 @@ import qualified Data.IntMap as IntMap
|
||||
|
||||
pgf2js :: PGF -> String
|
||||
pgf2js pgf =
|
||||
encodeUTF8 $ JS.printTree $ JS.Program [JS.ElStmt $ JS.SDeclOrExpr $ JS.Decl [JS.DInit (JS.Ident n) grammar]]
|
||||
JS.printTree $ JS.Program [JS.ElStmt $ JS.SDeclOrExpr $ JS.Decl [JS.DInit (JS.Ident n) grammar]]
|
||||
where
|
||||
n = showCId $ absname pgf
|
||||
as = abstract pgf
|
||||
|
||||
@@ -15,7 +15,6 @@ import PGF.Data
|
||||
import PGF.Macros
|
||||
|
||||
import GF.Data.Operations
|
||||
import GF.Text.UTF8
|
||||
|
||||
import qualified Data.Map as Map
|
||||
import Data.Char (isAlphaNum, isAsciiLower, isAsciiUpper, ord)
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
module GF.Data.XML (XML(..), Attr, comments, showXMLDoc, showsXMLDoc, showsXML, bottomUpXML) where
|
||||
|
||||
import GF.Data.Utilities
|
||||
import GF.Text.UTF8
|
||||
|
||||
data XML = Data String | CData String | Tag String [Attr] [XML] | ETag String [Attr] | Comment String | Empty
|
||||
deriving (Ord,Eq,Show)
|
||||
@@ -21,7 +20,7 @@ showXMLDoc :: XML -> String
|
||||
showXMLDoc xml = showsXMLDoc xml ""
|
||||
|
||||
showsXMLDoc :: XML -> ShowS
|
||||
showsXMLDoc xml = encodeUTF8 . showString header . showsXML xml
|
||||
showsXMLDoc xml = showString header . showsXML xml
|
||||
where header = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
|
||||
|
||||
showsXML :: XML -> ShowS
|
||||
|
||||
@@ -441,9 +441,7 @@ Exp6
|
||||
| '?' { Meta 0 }
|
||||
| '[' ']' { Empty }
|
||||
| '[' Ident Exps ']' { foldl App (Vr (mkListId $2)) $3 }
|
||||
| '[' String ']' { case $2 of
|
||||
[] -> Empty
|
||||
str -> foldr1 C (map K (words str)) }
|
||||
| '[' String ']' { K $2 }
|
||||
| '{' ListLocDef '}' {% mkR $2 }
|
||||
| '<' ListTupleComp '>' { R (tuple2record $2) }
|
||||
| '<' Exp ':' Exp '>' { Typed $2 $4 }
|
||||
|
||||
@@ -3,7 +3,7 @@ module GF.Infra.Option
|
||||
-- * Option types
|
||||
Options,
|
||||
Flags(..),
|
||||
Mode(..), Phase(..), Verbosity(..), Encoding(..), OutputFormat(..),
|
||||
Mode(..), Phase(..), Verbosity(..), OutputFormat(..),
|
||||
SISRFormat(..), Optimization(..), CFGTransform(..), HaskellOption(..),
|
||||
Dump(..), Printer(..), Recomp(..),
|
||||
-- * Option parsing
|
||||
@@ -17,7 +17,7 @@ module GF.Infra.Option
|
||||
helpMessage,
|
||||
-- * Checking specific options
|
||||
flag, cfgTransform, haskellOption, readOutputFormat,
|
||||
isLexicalCat, encodings,
|
||||
isLexicalCat, renameEncoding,
|
||||
-- * Setting specific options
|
||||
setOptimization, setCFGTransform,
|
||||
-- * Convenience methods for checking options
|
||||
@@ -25,12 +25,13 @@ module GF.Infra.Option
|
||||
) where
|
||||
|
||||
import Control.Monad
|
||||
import Data.Char (toLower)
|
||||
import Data.Char (toLower, isDigit)
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import GF.Infra.GetOpt
|
||||
--import System.Console.GetOpt
|
||||
import System.FilePath
|
||||
import System.IO
|
||||
|
||||
import GF.Data.ErrM
|
||||
|
||||
@@ -77,9 +78,6 @@ data Verbosity = Quiet | Normal | Verbose | Debug
|
||||
data Phase = Preproc | Convert | Compile | Link
|
||||
deriving (Show,Eq,Ord)
|
||||
|
||||
data Encoding = UTF_8 | ISO_8859_1 | CP_1250 | CP_1251 | CP_1252 | CP_1254
|
||||
deriving (Eq,Ord)
|
||||
|
||||
data OutputFormat = FmtPGFPretty
|
||||
| FmtJavaScript
|
||||
| FmtHaskell
|
||||
@@ -161,7 +159,7 @@ data Flags = Flags {
|
||||
optCncName :: Maybe String,
|
||||
optResName :: Maybe String,
|
||||
optPreprocessors :: [String],
|
||||
optEncoding :: Encoding,
|
||||
optEncoding :: String,
|
||||
optOptimizations :: Set Optimization,
|
||||
optCFGTransforms :: Set CFGTransform,
|
||||
optLibraryPath :: [FilePath],
|
||||
@@ -207,7 +205,7 @@ fixRelativeLibPaths curr_dir lib_dir (Options o) = Options (fixPathFlags . o)
|
||||
-- | Pretty-print the options that are preserved in .gfo files.
|
||||
optionsGFO :: Options -> [(String,String)]
|
||||
optionsGFO opts = optionsPGF opts
|
||||
++ [("coding", show (flag optEncoding opts))]
|
||||
++ [("coding", flag optEncoding opts)]
|
||||
|
||||
-- | Pretty-print the options that are preserved in .pgf files.
|
||||
optionsPGF :: Options -> [(String,String)]
|
||||
@@ -260,7 +258,7 @@ defaultFlags = Flags {
|
||||
optCncName = Nothing,
|
||||
optResName = Nothing,
|
||||
optPreprocessors = [],
|
||||
optEncoding = ISO_8859_1,
|
||||
optEncoding = "latin1",
|
||||
optOptimizations = Set.fromList [OptStem,OptCSE,OptExpand,OptParametrize],
|
||||
optCFGTransforms = Set.fromList [CFGRemoveCycles, CFGBottomUpFilter,
|
||||
CFGTopDownFilter, CFGMergeIdentical],
|
||||
@@ -343,8 +341,7 @@ optDescr =
|
||||
(unlines ["Use CMD to preprocess input files.",
|
||||
"Multiple preprocessors can be used by giving this option multiple times."]),
|
||||
Option [] ["coding"] (ReqArg coding "ENCODING")
|
||||
("Character encoding of the source grammar, ENCODING = "
|
||||
++ concat (intersperse " | " (map fst encodings)) ++ "."),
|
||||
("Character encoding of the source grammar, ENCODING = utf8, latin1, cp1251, ..."),
|
||||
Option [] ["startcat"] (ReqArg startcat "CAT") "Grammar start category.",
|
||||
Option [] ["language"] (ReqArg language "LANG") "Set the speech language flag to LANG in the generated grammar.",
|
||||
Option [] ["lexer"] (ReqArg lexer "LEXER") "Use lexer LEXER.",
|
||||
@@ -400,9 +397,7 @@ optDescr =
|
||||
addLibDir x = set $ \o -> o { optLibraryPath = x:optLibraryPath o }
|
||||
setLibPath x = set $ \o -> o { optLibraryPath = splitInModuleSearchPath x }
|
||||
preproc x = set $ \o -> o { optPreprocessors = optPreprocessors o ++ [x] }
|
||||
coding x = case lookup x encodings of
|
||||
Just c -> set $ \o -> o { optEncoding = c }
|
||||
Nothing -> fail $ "Unknown character encoding: " ++ x
|
||||
coding x = set $ \o -> o { optEncoding = x }
|
||||
startcat x = set $ \o -> o { optStartCat = Just x }
|
||||
language x = set $ \o -> o { optSpeechLanguage = Just x }
|
||||
lexer x = set $ \o -> o { optLexer = Just x }
|
||||
@@ -483,18 +478,14 @@ haskellOptionNames =
|
||||
("gadt", HaskellGADT),
|
||||
("lexical", HaskellLexical)]
|
||||
|
||||
encodings :: [(String,Encoding)]
|
||||
encodings =
|
||||
[("utf8", UTF_8),
|
||||
("cp1250", CP_1250),
|
||||
("cp1251", CP_1251),
|
||||
("cp1252", CP_1252),
|
||||
("cp1254", CP_1254),
|
||||
("latin1", ISO_8859_1)
|
||||
]
|
||||
|
||||
instance Show Encoding where
|
||||
show = lookupShow encodings
|
||||
-- | This is for bacward compatibility. Since GHC 6.12 we
|
||||
-- started using the native Unicode support in GHC but it
|
||||
-- uses different names for the code pages.
|
||||
renameEncoding :: String -> String
|
||||
renameEncoding "utf8" = "UTF-8"
|
||||
renameEncoding "latin1" = "CP1252"
|
||||
renameEncoding ('c':'p':s) | all isDigit s = 'C':'P':s
|
||||
renameEncoding s = s
|
||||
|
||||
lookupShow :: Eq a => [(String,a)] -> a -> String
|
||||
lookupShow xs z = fromMaybe "lookupShow" $ lookup z [(y,x) | (x,y) <- xs]
|
||||
|
||||
@@ -23,7 +23,6 @@ import PGF.Linearize
|
||||
import GF.Data.Operations
|
||||
import GF.Infra.UseIO
|
||||
import GF.Infra.Option
|
||||
import GF.Text.Coding
|
||||
import PGF.Probabilistic
|
||||
|
||||
import System.Random
|
||||
@@ -33,9 +32,9 @@ import Data.List (nub)
|
||||
|
||||
-- generic quiz function
|
||||
|
||||
mkQuiz :: Encoding -> String -> [(String,[String])] -> IO ()
|
||||
mkQuiz cod msg tts = do
|
||||
let qas = [ (encodeUnicode cod q, mkAnswer cod as) | (q,as) <- tts]
|
||||
mkQuiz :: String -> [(String,[String])] -> IO ()
|
||||
mkQuiz msg tts = do
|
||||
let qas = [(q, mkAnswer as) | (q,as) <- tts]
|
||||
teachDialogue qas msg
|
||||
|
||||
translationList ::
|
||||
@@ -62,14 +61,13 @@ morphologyList mex mprobs pgf ig typ number = do
|
||||
(pwss@(pws0:_),i) <- zip ss forms, let ws = map (\pws -> snd (pws !! i)) pwss]
|
||||
|
||||
-- | compare answer to the list of right answers, increase score and give feedback
|
||||
mkAnswer :: Encoding -> [String] -> String -> (Integer, String)
|
||||
mkAnswer cod as s =
|
||||
mkAnswer :: [String] -> String -> (Integer, String)
|
||||
mkAnswer as s =
|
||||
if (elem (norm s) as)
|
||||
then (1,"Yes.")
|
||||
else (0,"No, not" +++ s ++ ", but" ++++ enc (unlines as))
|
||||
else (0,"No, not" +++ s ++ ", but" ++++ unlines as)
|
||||
where
|
||||
norm = unwords . words . decodeUnicode cod
|
||||
enc = encodeUnicode cod
|
||||
norm = unwords . words
|
||||
|
||||
norml = unwords . words
|
||||
|
||||
|
||||
@@ -1,91 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1250
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1250 is a code page used under Microsoft Windows to represent texts
|
||||
-- in Central European and Eastern European languages that use Latin script,
|
||||
-- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
|
||||
-- Serbian (Latin script), Romanian and Albanian. It may also be used with
|
||||
-- the German language; German-language texts encoded with cp1250 and cp1252
|
||||
-- are identical.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1250 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1250 = map convert where
|
||||
convert c
|
||||
| c == '\x80' = chr 0x20AC
|
||||
| c == '\x82' = chr 0x201A
|
||||
| c == '\x84' = chr 0x201E
|
||||
| c == '\x85' = chr 0x2026
|
||||
| c == '\x86' = chr 0x2020
|
||||
| c == '\x87' = chr 0x2021
|
||||
| c == '\x89' = chr 0x2030
|
||||
| c == '\x8A' = chr 0x0160
|
||||
| c == '\x8B' = chr 0x2039
|
||||
| c == '\x8C' = chr 0x015A
|
||||
| c == '\x8D' = chr 0x0164
|
||||
| c == '\x8E' = chr 0x017D
|
||||
| c == '\x8F' = chr 0x0179
|
||||
| c == '\x91' = chr 0x2018
|
||||
| c == '\x92' = chr 0x2019
|
||||
| c == '\x93' = chr 0x201C
|
||||
| c == '\x94' = chr 0x201D
|
||||
| c == '\x95' = chr 0x2022
|
||||
| c == '\x96' = chr 0x2013
|
||||
| c == '\x97' = chr 0x2014
|
||||
| c == '\x99' = chr 0x2122
|
||||
| c == '\x9A' = chr 0x0161
|
||||
| c == '\x9B' = chr 0x203A
|
||||
| c == '\x9C' = chr 0x015B
|
||||
| c == '\x9D' = chr 0x0165
|
||||
| c == '\x9E' = chr 0x017E
|
||||
| c == '\x9F' = chr 0x017A
|
||||
| c == '\xA1' = chr 0x02C7
|
||||
| c == '\xA5' = chr 0x0104
|
||||
| c == '\xB9' = chr 0x0105
|
||||
| c == '\xBC' = chr 0x013D
|
||||
| c == '\xBE' = chr 0x013E
|
||||
| otherwise = c
|
||||
|
||||
|
||||
encodeCP1250 = map convert where
|
||||
convert c
|
||||
| oc == 0x20AC = '\x80'
|
||||
| oc == 0x201A = '\x82'
|
||||
| oc == 0x201E = '\x84'
|
||||
| oc == 0x2026 = '\x85'
|
||||
| oc == 0x2020 = '\x86'
|
||||
| oc == 0x2021 = '\x87'
|
||||
| oc == 0x2030 = '\x89'
|
||||
| oc == 0x0160 = '\x8A'
|
||||
| oc == 0x2039 = '\x8B'
|
||||
| oc == 0x015A = '\x8C'
|
||||
| oc == 0x0164 = '\x8D'
|
||||
| oc == 0x017D = '\x8E'
|
||||
| oc == 0x0179 = '\x8F'
|
||||
| oc == 0x2018 = '\x91'
|
||||
| oc == 0x2019 = '\x92'
|
||||
| oc == 0x201C = '\x93'
|
||||
| oc == 0x201D = '\x94'
|
||||
| oc == 0x2022 = '\x95'
|
||||
| oc == 0x2013 = '\x96'
|
||||
| oc == 0x2014 = '\x97'
|
||||
| oc == 0x2122 = '\x99'
|
||||
| oc == 0x0161 = '\x9A'
|
||||
| oc == 0x203A = '\x9B'
|
||||
| oc == 0x015B = '\x9C'
|
||||
| oc == 0x0165 = '\x9D'
|
||||
| oc == 0x017E = '\x9E'
|
||||
| oc == 0x017A = '\x9F'
|
||||
| oc == 0x02C7 = '\xA1'
|
||||
| oc == 0x0104 = '\xA5'
|
||||
| oc == 0x0105 = '\xB9'
|
||||
| oc == 0x013D = '\xBC'
|
||||
| oc == 0x013E = '\xBE'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,86 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1251
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1251 is a popular 8-bit character encoding, designed to cover languages
|
||||
-- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
|
||||
-- and other languages. It is the most widely used for encoding the Bulgarian,
|
||||
-- Serbian and Macedonian languages.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1251 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1251 = map convert where
|
||||
convert c
|
||||
| c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
|
||||
| c == '\xA8' = chr 0x401 -- cyrillic capital letter lo
|
||||
| c == '\x80' = chr 0x402
|
||||
| c == '\x81' = chr 0x403
|
||||
| c == '\xAA' = chr 0x404
|
||||
| c == '\xBD' = chr 0x405
|
||||
| c == '\xB2' = chr 0x406
|
||||
| c == '\xAF' = chr 0x407
|
||||
| c == '\xA3' = chr 0x408
|
||||
| c == '\x8A' = chr 0x409
|
||||
| c == '\x8C' = chr 0x40A
|
||||
| c == '\x8E' = chr 0x40B
|
||||
| c == '\x8D' = chr 0x40C
|
||||
| c == '\xA1' = chr 0x40E
|
||||
| c == '\x8F' = chr 0x40F
|
||||
| c == '\xB8' = chr 0x451 -- cyrillic small letter lo
|
||||
| c == '\x90' = chr 0x452
|
||||
| c == '\x83' = chr 0x453
|
||||
| c == '\xBA' = chr 0x454
|
||||
| c == '\xBE' = chr 0x455
|
||||
| c == '\xB3' = chr 0x456
|
||||
| c == '\xBF' = chr 0x457
|
||||
| c == '\xBC' = chr 0x458
|
||||
| c == '\x9A' = chr 0x459
|
||||
| c == '\x9C' = chr 0x45A
|
||||
| c == '\x9E' = chr 0x45B
|
||||
| c == '\x9D' = chr 0x45C
|
||||
| c == '\xA2' = chr 0x45E
|
||||
| c == '\x9F' = chr 0x45F
|
||||
| c == '\xA5' = chr 0x490
|
||||
| c == '\xB4' = chr 0x491
|
||||
| otherwise = c
|
||||
|
||||
encodeCP1251 = map convert where
|
||||
convert c
|
||||
| oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
|
||||
| oc == 0x401 = '\xA8' -- cyrillic capital letter lo
|
||||
| oc == 0x402 = '\x80'
|
||||
| oc == 0x403 = '\x81'
|
||||
| oc == 0x404 = '\xAA'
|
||||
| oc == 0x405 = '\xBD'
|
||||
| oc == 0x406 = '\xB2'
|
||||
| oc == 0x407 = '\xAF'
|
||||
| oc == 0x408 = '\xA3'
|
||||
| oc == 0x409 = '\x8A'
|
||||
| oc == 0x40A = '\x8C'
|
||||
| oc == 0x40B = '\x8E'
|
||||
| oc == 0x40C = '\x8D'
|
||||
| oc == 0x40E = '\xA1'
|
||||
| oc == 0x40F = '\x8F'
|
||||
| oc == 0x451 = '\xB8' -- cyrillic small letter lo
|
||||
| oc == 0x452 = '\x90'
|
||||
| oc == 0x453 = '\x83'
|
||||
| oc == 0x454 = '\xBA'
|
||||
| oc == 0x455 = '\xBE'
|
||||
| oc == 0x456 = '\xB3'
|
||||
| oc == 0x457 = '\xBF'
|
||||
| oc == 0x458 = '\xBC'
|
||||
| oc == 0x459 = '\x9A'
|
||||
| oc == 0x45A = '\x9C'
|
||||
| oc == 0x45B = '\x9E'
|
||||
| oc == 0x45C = '\x9D'
|
||||
| oc == 0x45E = '\xA2'
|
||||
| oc == 0x45F = '\x9F'
|
||||
| oc == 0x490 = '\xA5'
|
||||
| oc == 0x491 = '\xB4'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,17 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1252
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1252 is a character encoding of the Latin alphabet, used by default in
|
||||
-- the legacy components of Microsoft Windows in English and some other
|
||||
-- Western languages.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1252 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1252 = map id
|
||||
encodeCP1252 = map (\x -> if x <= '\255' then x else '?')
|
||||
@@ -1,84 +0,0 @@
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : GF.Text.CP1254
|
||||
-- Maintainer : Krasimir Angelov
|
||||
--
|
||||
-- cp1254 is a code page used under Microsoft Windows to write Turkish.
|
||||
-- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
|
||||
--
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.CP1254 where
|
||||
|
||||
import Data.Char
|
||||
|
||||
decodeCP1254 = map convert where
|
||||
convert c
|
||||
| c == '\x80' = chr 0x20AC
|
||||
| c == '\x82' = chr 0x201A
|
||||
| c == '\x83' = chr 0x192
|
||||
| c == '\x84' = chr 0x201E
|
||||
| c == '\x85' = chr 0x2026
|
||||
| c == '\x86' = chr 0x2020
|
||||
| c == '\x87' = chr 0x2021
|
||||
| c == '\x88' = chr 0x2C6
|
||||
| c == '\x89' = chr 0x2030
|
||||
| c == '\x8A' = chr 0x160
|
||||
| c == '\x8B' = chr 0x2039
|
||||
| c == '\x8C' = chr 0x152
|
||||
| c == '\x91' = chr 0x2018
|
||||
| c == '\x92' = chr 0x2019
|
||||
| c == '\x93' = chr 0x201C
|
||||
| c == '\x94' = chr 0x201D
|
||||
| c == '\x95' = chr 0x2022
|
||||
| c == '\x96' = chr 0x2013
|
||||
| c == '\x97' = chr 0x2014
|
||||
| c == '\x98' = chr 0x2DC
|
||||
| c == '\x99' = chr 0x2122
|
||||
| c == '\x9A' = chr 0x161
|
||||
| c == '\x9B' = chr 0x203A
|
||||
| c == '\x9C' = chr 0x153
|
||||
| c == '\x9F' = chr 0x178
|
||||
| c == '\xD0' = chr 0x11E
|
||||
| c == '\xDD' = chr 0x130
|
||||
| c == '\xDE' = chr 0x15E
|
||||
| c == '\xF0' = chr 0x11F
|
||||
| c == '\xFD' = chr 0x131
|
||||
| c == '\xFE' = chr 0x15F
|
||||
| otherwise = c
|
||||
|
||||
encodeCP1254 = map convert where
|
||||
convert c
|
||||
| oc == 0x20AC = '\x80'
|
||||
| oc == 0x201A = '\x82'
|
||||
| oc == 0x192 = '\x83'
|
||||
| oc == 0x201E = '\x84'
|
||||
| oc == 0x2026 = '\x85'
|
||||
| oc == 0x2020 = '\x86'
|
||||
| oc == 0x2021 = '\x87'
|
||||
| oc == 0x2C6 = '\x88'
|
||||
| oc == 0x2030 = '\x89'
|
||||
| oc == 0x160 = '\x8A'
|
||||
| oc == 0x2039 = '\x8B'
|
||||
| oc == 0x152 = '\x8C'
|
||||
| oc == 0x2018 = '\x91'
|
||||
| oc == 0x2019 = '\x92'
|
||||
| oc == 0x201C = '\x93'
|
||||
| oc == 0x201D = '\x94'
|
||||
| oc == 0x2022 = '\x95'
|
||||
| oc == 0x2013 = '\x96'
|
||||
| oc == 0x2014 = '\x97'
|
||||
| oc == 0x2DC = '\x98'
|
||||
| oc == 0x2122 = '\x99'
|
||||
| oc == 0x161 = '\x9A'
|
||||
| oc == 0x203A = '\x9B'
|
||||
| oc == 0x153 = '\x9C'
|
||||
| oc == 0x178 = '\x9F'
|
||||
| oc == 0x11E = '\xD0'
|
||||
| oc == 0x130 = '\xDD'
|
||||
| oc == 0x15E = '\xDE'
|
||||
| oc == 0x11F = '\xF0'
|
||||
| oc == 0x131 = '\xFD'
|
||||
| oc == 0x15F = '\xFE'
|
||||
| otherwise = c
|
||||
where oc = ord c
|
||||
@@ -1,24 +1,69 @@
|
||||
module GF.Text.Coding where
|
||||
|
||||
import GF.Infra.Option
|
||||
import GF.Text.UTF8
|
||||
import GF.Text.CP1250
|
||||
import GF.Text.CP1251
|
||||
import GF.Text.CP1252
|
||||
import GF.Text.CP1254
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.ByteString.Internal
|
||||
import GHC.IO
|
||||
import GHC.IO.Buffer
|
||||
import GHC.IO.Encoding
|
||||
import GHC.IO.Exception
|
||||
import Control.Monad
|
||||
|
||||
encodeUnicode e = case e of
|
||||
UTF_8 -> encodeUTF8
|
||||
CP_1250 -> encodeCP1250
|
||||
CP_1251 -> encodeCP1251
|
||||
CP_1252 -> encodeCP1252
|
||||
CP_1254 -> encodeCP1254
|
||||
_ -> id
|
||||
encodeUnicode :: TextEncoding -> String -> ByteString
|
||||
encodeUnicode enc s =
|
||||
unsafePerformIO $ do
|
||||
let len = length s
|
||||
cbuf0 <- newCharBuffer (len*4) ReadBuffer
|
||||
foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
|
||||
let cbuf = cbuf0{bufR=len}
|
||||
case enc of
|
||||
TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
|
||||
bss <- translate (encode encoder) cbuf
|
||||
close encoder
|
||||
return (BS.concat bss)
|
||||
where
|
||||
translate cod cbuf
|
||||
| i < w = do bbuf <- newByteBuffer 128 WriteBuffer
|
||||
(cbuf,bbuf) <- cod cbuf bbuf
|
||||
if isEmptyBuffer bbuf
|
||||
then ioe_invalidCharacter
|
||||
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
|
||||
bss <- translate cod cbuf
|
||||
return (bs:bss)
|
||||
| otherwise = return []
|
||||
where
|
||||
i = bufL cbuf
|
||||
w = bufR cbuf
|
||||
|
||||
decodeUnicode e = case e of
|
||||
UTF_8 -> decodeUTF8
|
||||
CP_1250 -> decodeCP1250
|
||||
CP_1251 -> decodeCP1251
|
||||
CP_1252 -> decodeCP1252
|
||||
CP_1254 -> decodeCP1254
|
||||
_ -> id
|
||||
decodeUnicode :: TextEncoding -> ByteString -> String
|
||||
decodeUnicode enc (PS fptr l len) =
|
||||
unsafePerformIO $ do
|
||||
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
|
||||
cbuf <- newCharBuffer 128 WriteBuffer
|
||||
case enc of
|
||||
TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
|
||||
s <- translate (encode decoder) bbuf cbuf
|
||||
close decoder
|
||||
return s
|
||||
where
|
||||
translate cod bbuf cbuf
|
||||
| i < w = do (bbuf,cbuf) <- cod bbuf cbuf
|
||||
if isEmptyBuffer cbuf
|
||||
then ioe_invalidCharacter
|
||||
else unpack cod bbuf cbuf
|
||||
| otherwise = return []
|
||||
where
|
||||
i = bufL bbuf
|
||||
w = bufR bbuf
|
||||
|
||||
unpack cod bbuf cbuf
|
||||
| i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i
|
||||
cs <- unpack cod bbuf cbuf{bufL=i'}
|
||||
return (c:cs)
|
||||
| otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
|
||||
where
|
||||
i = bufL cbuf
|
||||
w = bufR cbuf
|
||||
|
||||
ioe_invalidCharacter = ioException
|
||||
(IOError Nothing InvalidArgument ""
|
||||
("invalid byte sequence for this encoding") Nothing Nothing)
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
module GF.Text.Lexing (stringOp,opInEnv) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import GF.Text.UTF8
|
||||
import GF.Text.CP1251
|
||||
|
||||
import Data.Char
|
||||
import Data.List (intersperse)
|
||||
@@ -23,10 +21,6 @@ stringOp name = case name of
|
||||
"unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
|
||||
"unwords" -> Just $ appUnlexer unwords
|
||||
"to_html" -> Just wrapHTML
|
||||
"to_utf8" -> Just encodeUTF8
|
||||
"from_utf8" -> Just decodeUTF8
|
||||
"to_cp1251" -> Just encodeCP1251
|
||||
"from_cp1251" -> Just decodeCP1251
|
||||
_ -> transliterate name
|
||||
|
||||
-- perform op in environments beg--end, t.ex. between "--"
|
||||
|
||||
@@ -5,8 +5,6 @@ module GF.Text.Transliterations (
|
||||
transliterationPrintNames
|
||||
) where
|
||||
|
||||
import GF.Text.UTF8
|
||||
|
||||
import Data.Char
|
||||
import Numeric
|
||||
import qualified Data.Map as Map
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : UTF8
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:42 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- From the Char module supplied with HBC.
|
||||
-- code by Thomas Hallgren (Jul 10 1999)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
|
||||
|
||||
-- | Take a Unicode string and encode it as a string
|
||||
-- with the UTF8 method.
|
||||
decodeUTF8 :: String -> String
|
||||
decodeUTF8 "" = ""
|
||||
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
|
||||
'\x80' <= c' && c' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
|
||||
'\x80' <= c' && c' <= '\xbf' &&
|
||||
'\x80' <= c'' && c'' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 s = s ---- AR workaround 22/6/2006
|
||||
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
|
||||
|
||||
encodeUTF8 :: String -> String
|
||||
encodeUTF8 "" = ""
|
||||
encodeUTF8 (c:cs) =
|
||||
if c > '\x0000' && c < '\x0080' then
|
||||
c : encodeUTF8 cs
|
||||
else if c < toEnum 0x0800 then
|
||||
let i = fromEnum c
|
||||
in toEnum (0xc0 + i `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
else
|
||||
let i = fromEnum c
|
||||
in toEnum (0xe0 + i `div` 0x1000) :
|
||||
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
@@ -17,6 +17,7 @@ import Data.Maybe
|
||||
import Data.Binary
|
||||
import System.FilePath
|
||||
import System.IO
|
||||
import Control.Exception
|
||||
|
||||
|
||||
mainGFC :: Options -> [FilePath] -> IOE ()
|
||||
@@ -81,8 +82,8 @@ writeOutput opts file str =
|
||||
do let path = case flag optOutputDir opts of
|
||||
Nothing -> file
|
||||
Just dir -> dir </> file
|
||||
writeOutputFile opts path str
|
||||
|
||||
writeOutputFile :: Options -> FilePath -> String -> IOE ()
|
||||
writeOutputFile opts outfile output =
|
||||
do putPointE Normal opts ("Writing " ++ outfile ++ "...") $ ioeIO $ writeFile outfile output
|
||||
putPointE Normal opts ("Writing " ++ path ++ "...") $ ioeIO $
|
||||
bracket
|
||||
(openFile path WriteMode)
|
||||
(hClose)
|
||||
(\h -> hSetEncoding h utf8 >> hPutStr h str)
|
||||
|
||||
@@ -21,7 +21,6 @@ import GF.Infra.Option
|
||||
import GF.Infra.Modules (greatestResource, modules, emptyModInfo)
|
||||
import GF.System.Readline
|
||||
|
||||
import GF.Text.Coding
|
||||
import GF.Compile.Coding
|
||||
|
||||
import PGF
|
||||
@@ -34,6 +33,7 @@ import Data.List(isPrefixOf)
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.ByteString.Char8 as BS
|
||||
import qualified Text.ParserCombinators.ReadP as RP
|
||||
import System.IO
|
||||
import System.Cmd
|
||||
import System.CPUTime
|
||||
import System.Directory
|
||||
@@ -86,9 +86,7 @@ loop opts gfenv0 = do
|
||||
s0 <- fetch
|
||||
let gfenv = gfenv0 {history = s0 : history gfenv0}
|
||||
let
|
||||
enc = encode gfenv
|
||||
s = decode gfenv s0
|
||||
pwords = case words s of
|
||||
pwords = case words s0 of
|
||||
w:ws -> getCommandOp w :ws
|
||||
ws -> ws
|
||||
|
||||
@@ -130,8 +128,8 @@ loop opts gfenv0 = do
|
||||
case runP pExp (BS.pack s) of
|
||||
Left (_,msg) -> putStrLn msg
|
||||
Right t -> case checkComputeTerm sgr (codeTerm (decode gfenv) (L (0,0) t)) of
|
||||
Ok x -> putStrLn $ enc (showTerm sgr style q x)
|
||||
Bad s -> putStrLn $ enc s
|
||||
Ok x -> putStrLn $ showTerm sgr style q x
|
||||
Bad s -> putStrLn $ s
|
||||
loopNewCPU gfenv
|
||||
"dg":ws -> do
|
||||
let stop = case ws of
|
||||
@@ -141,7 +139,7 @@ loop opts gfenv0 = do
|
||||
putStrLn "wrote graph in file _gfdepgraph.dot"
|
||||
loopNewCPU gfenv
|
||||
"eh":w:_ -> do
|
||||
cs <- readFile w >>= return . map (interpretCommandLine enc env) . lines
|
||||
cs <- readFile w >>= return . map (interpretCommandLine env) . lines
|
||||
loopNewCPU gfenv
|
||||
|
||||
"i":args -> do
|
||||
@@ -179,25 +177,28 @@ loop opts gfenv0 = do
|
||||
_ -> putStrLn "value definition not parsed" >> loopNewCPU gfenv
|
||||
|
||||
"ph":_ ->
|
||||
mapM_ (putStrLn . enc) (reverse (history gfenv0)) >> loopNewCPU gfenv
|
||||
"se":c:_ ->
|
||||
case lookup c encodings of
|
||||
Just cod -> do
|
||||
mapM_ putStrLn (reverse (history gfenv0)) >> loopNewCPU gfenv
|
||||
"se":c:_ -> do
|
||||
let cod = renameEncoding c
|
||||
#ifdef mingw32_HOST_OS
|
||||
case c of
|
||||
'c':'p':c -> case reads c of
|
||||
[(cp,"")] -> setConsoleCP cp >> setConsoleOutputCP cp
|
||||
_ -> return ()
|
||||
"utf8" -> setConsoleCP 65001 >> setConsoleOutputCP 65001
|
||||
_ -> return ()
|
||||
case cod of
|
||||
'C':'P':c -> case reads c of
|
||||
[(cp,"")] -> do setConsoleCP cp
|
||||
setConsoleOutputCP cp
|
||||
_ -> return ()
|
||||
"UTF-8" -> do setConsoleCP 65001
|
||||
setConsoleOutputCP 65001
|
||||
_ -> return ()
|
||||
#endif
|
||||
loopNewCPU $ gfenv {coding = cod}
|
||||
Nothing -> do putStrLn "unknown encoding"
|
||||
loopNewCPU gfenv
|
||||
enc <- mkTextEncoding cod
|
||||
hSetEncoding stdin enc
|
||||
hSetEncoding stdout enc
|
||||
hSetEncoding stderr enc
|
||||
loopNewCPU gfenv
|
||||
|
||||
-- ordinary commands, working on CommandEnv
|
||||
_ -> do
|
||||
interpretCommandLine enc env s
|
||||
interpretCommandLine env s0
|
||||
loopNewCPU gfenv
|
||||
-- gfenv' <- return $ either (const gfenv) id r
|
||||
gfenv' <- either (\e -> (print e >> return gfenv)) return r
|
||||
@@ -215,7 +216,7 @@ importInEnv gfenv opts files
|
||||
if (verbAtLeast opts Normal)
|
||||
then putStrLnFlush $ unwords $ "\nLanguages:" : map showCId (languages pgf1)
|
||||
else return ()
|
||||
return $ gfenv { commandenv = mkCommandEnv (coding gfenv) pgf1 }
|
||||
return $ gfenv { commandenv = mkCommandEnv pgf1 }
|
||||
|
||||
tryGetLine = do
|
||||
res <- try getLine
|
||||
@@ -252,24 +253,16 @@ data GFEnv = GFEnv {
|
||||
sourcegrammar :: SourceGrammar, -- gfo grammar -retain
|
||||
commandenv :: CommandEnv,
|
||||
history :: [String],
|
||||
cputime :: Integer,
|
||||
coding :: Encoding
|
||||
cputime :: Integer
|
||||
}
|
||||
|
||||
emptyGFEnv :: IO GFEnv
|
||||
emptyGFEnv = do
|
||||
#ifdef mingw32_HOST_OS
|
||||
codepage <- getACP
|
||||
let coding = fromMaybe UTF_8 (lookup ("cp"++show codepage) encodings)
|
||||
#else
|
||||
let coding = UTF_8
|
||||
#endif
|
||||
return $ GFEnv emptySourceGrammar{modules=[(identW,emptyModInfo)]} (mkCommandEnv coding emptyPGF) [] 0 coding
|
||||
return $ GFEnv emptySourceGrammar{modules=[(identW,emptyModInfo)]} (mkCommandEnv emptyPGF) [] 0
|
||||
|
||||
encode = encodeUnicode . coding
|
||||
decode = decodeUnicode . coding
|
||||
decode _ = id -- decodeUnicode . coding
|
||||
|
||||
wordCompletion gfenv line0 prefix0 p =
|
||||
wordCompletion gfenv line prefix p =
|
||||
case wc_type (take p line) of
|
||||
CmplCmd pref
|
||||
-> ret ' ' [name | name <- Map.keys (commands cmdEnv), isPrefixOf pref name]
|
||||
@@ -280,7 +273,7 @@ wordCompletion gfenv line0 prefix0 p =
|
||||
in case loop state0 ws of
|
||||
Nothing -> ret ' ' []
|
||||
Just state -> let compls = getCompletions state prefix
|
||||
in ret ' ' (map (encode gfenv) (Map.keys compls))
|
||||
in ret ' ' (Map.keys compls)
|
||||
Left (_ :: SomeException) -> ret ' ' []
|
||||
CmplOpt (Just (Command n _ _)) pref
|
||||
-> case Map.lookup n (commands cmdEnv) of
|
||||
@@ -298,9 +291,6 @@ wordCompletion gfenv line0 prefix0 p =
|
||||
Left (_ :: SomeException) -> ret ' ' []
|
||||
_ -> ret ' ' []
|
||||
where
|
||||
line = decode gfenv line0
|
||||
prefix = decode gfenv prefix0
|
||||
|
||||
pgf = multigrammar cmdEnv
|
||||
cmdEnv = commandenv gfenv
|
||||
optLang opts = valCIdOpts "lang" (head (languages pgf)) opts
|
||||
|
||||
Reference in New Issue
Block a user