From 239f310eb560763c215400a41af7e21fa0f2d51f Mon Sep 17 00:00:00 2001 From: aarne Date: Tue, 24 Jun 2008 13:58:04 +0000 Subject: [PATCH] cp1251 coding ; trying to recognize the coding flag in grammar --- src-3.0/GF/Command/Abstract.hs | 4 ++++ src-3.0/GF/Command/Commands.hs | 23 ++++++++++++++++------- src-3.0/GF/Infra/Option.hs | 8 +++++--- src-3.0/GF/Text/Lexing.hs | 16 ++++++++++++++++ 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src-3.0/GF/Command/Abstract.hs b/src-3.0/GF/Command/Abstract.hs index 23f76fa82..16905c2f9 100644 --- a/src-3.0/GF/Command/Abstract.hs +++ b/src-3.0/GF/Command/Abstract.hs @@ -60,4 +60,8 @@ isFlag o opts = elem o [x | OFlag x _ <- opts] prOpt :: Option -> String prOpt (OOpt i) = i ---- +mkOpt :: String -> Option +mkOpt = OOpt + + diff --git a/src-3.0/GF/Command/Commands.hs b/src-3.0/GF/Command/Commands.hs index 68e2c5526..b5ba99f6f 100644 --- a/src-3.0/GF/Command/Commands.hs +++ b/src-3.0/GF/Command/Commands.hs @@ -32,6 +32,8 @@ import Data.Maybe import qualified Data.Map as Map import System.Cmd +import Debug.Trace + type CommandOutput = ([Tree],String) ---- errors, etc data CommandInfo = CommandInfo { @@ -343,7 +345,7 @@ allCommands pgf = Map.fromList [ "ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal", "ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal" ], - exec = \opts -> return . fromString . stringOps opts . toString, + exec = \opts -> return . fromString . stringOps (map prOpt opts) . toString, options = stringOpOptions }), ("q", emptyCommandInfo { @@ -497,11 +499,18 @@ allCommands pgf = Map.fromList [ (abstractName pgf ++ ": " ++ showTree t) : [lang ++ ": " ++ linear opts lang t | lang <- optLangs opts] - unlex opts lang = stringOps (exceptUTF8 opts) where - exceptUTF8 = if isUTF8 then filter ((/="to_UTF8") . prOpt) else id - isUTF8 = case lookFlag pgf lang "coding" of - Just "utf8" -> True - _ -> False +-- logic of coding in unlexing: +-- - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used. +-- - If lang has flag coding=utf8, -to_utf8 is ignored. +-- - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first. + + unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where + optsC = case lookFlag pgf lang "coding" of + Just "utf8" -> filter (/="to_utf8") $ map prOpt opts + Just other | isOpt "to_utf8" opts -> + let cod = ("from_" ++ other) + in cod : filter (/=cod) (map prOpt opts) + _ -> map prOpt opts optRestricted opts = restrictPGF (hasLin pgf (mkCId (optLang opts))) pgf @@ -536,7 +545,7 @@ allCommands pgf = Map.fromList [ [lookupMorpho (buildMorpho pgf (mkCId la)) s | la <- optLangs opts] -- ps -f -g s returns g (f s) - stringOps opts s = foldr app s (reverse (map prOpt opts)) where + stringOps opts s = foldr app s (reverse opts) where app f = maybe id id (stringOp f) stringOpOptions = [ diff --git a/src-3.0/GF/Infra/Option.hs b/src-3.0/GF/Infra/Option.hs index 44d4adfa5..6c9d3550b 100644 --- a/src-3.0/GF/Infra/Option.hs +++ b/src-3.0/GF/Infra/Option.hs @@ -76,7 +76,7 @@ data Verbosity = Quiet | Normal | Verbose | Debug data Phase = Preproc | Convert | Compile | Link deriving (Show,Eq,Ord) -data Encoding = UTF_8 | ISO_8859_1 +data Encoding = UTF_8 | ISO_8859_1 | CP_1251 deriving (Show,Eq,Ord) data OutputFormat = FmtPGF @@ -469,7 +469,9 @@ optimizationPackages = encodings :: [(String,Encoding)] encodings = [("utf8", UTF_8), - ("latin1", ISO_8859_1)] + ("cp1251", CP_1251), + ("latin1", ISO_8859_1) + ] lookupShow :: Eq a => [(String,a)] -> a -> String lookupShow xs z = fromMaybe "lookupShow" $ lookup z [(y,x) | (x,y) <- xs] @@ -542,4 +544,4 @@ instance Functor OptDescr where instance Functor ArgDescr where fmap f (NoArg x) = NoArg (f x) fmap f (ReqArg g s) = ReqArg (f . g) s - fmap f (OptArg g s) = OptArg (f . g) s \ No newline at end of file + fmap f (OptArg g s) = OptArg (f . g) s diff --git a/src-3.0/GF/Text/Lexing.hs b/src-3.0/GF/Text/Lexing.hs index 1ac2eb498..2c6b417b8 100644 --- a/src-3.0/GF/Text/Lexing.hs +++ b/src-3.0/GF/Text/Lexing.hs @@ -24,6 +24,8 @@ stringOp name = case name of "to_html" -> Just wrapHTML "to_utf8" -> Just encodeUTF8 "from_utf8" -> Just decodeUTF8 + "to_cp1251" -> Just encodeCP1251 + "from_cp1251" -> Just decodeCP1251 _ -> transliterate name appLexer :: (String -> [String]) -> String -> String @@ -97,3 +99,17 @@ isPunct = flip elem ".?!,:;" isParen = flip elem "()[]{}" isClosing = flip elem ")]}" + +-- might be in a file of its own: Windows Cyrillic, used in Bulgarian resource + +decodeCP1251 = map convert where + convert c + | c >= '\192' && c <= '\255' = chr (ord c + 848) + | otherwise = c + +encodeCP1251 = map convert where + convert c + | oc >= 1040 && oc <= 1103 = chr (oc - 848) + | otherwise = c + where oc = ord c +