cp1251 coding ; trying to recognize the coding flag in grammar

This commit is contained in:
aarne
2008-06-24 13:58:04 +00:00
parent 223480bb77
commit 239f310eb5
4 changed files with 41 additions and 10 deletions

View File

@@ -60,4 +60,8 @@ isFlag o opts = elem o [x | OFlag x _ <- opts]
-- | Render an option as a plain string.
--
-- A switch @OOpt i@ is rendered as its name @i@. A flag @OFlag f _@ is
-- rendered as its name @f@ only; the flag's value is dropped.
--
-- The 'OFlag' clause makes the function total over both constructors, so
-- mapping 'prOpt' over an option list that contains flags no longer fails
-- with a pattern-match error.
--
-- NOTE(review): assumes the first field of 'OFlag' is a 'String', like the
-- field of 'OOpt' -- confirm against the declaration of 'Option'.
prOpt :: Option -> String
prOpt (OOpt i)    = i
prOpt (OFlag f _) = f
-- | Wrap an option name in the 'OOpt' constructor.
mkOpt :: String -> Option
mkOpt name = OOpt name

View File

@@ -32,6 +32,8 @@ import Data.Maybe
import qualified Data.Map as Map
import System.Cmd
import Debug.Trace
type CommandOutput = ([Tree],String) ---- errors, etc
data CommandInfo = CommandInfo {
@@ -343,7 +345,7 @@ allCommands pgf = Map.fromList [
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal",
"ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal"
],
exec = \opts -> return . fromString . stringOps opts . toString,
exec = \opts -> return . fromString . stringOps (map prOpt opts) . toString,
options = stringOpOptions
}),
("q", emptyCommandInfo {
@@ -497,11 +499,18 @@ allCommands pgf = Map.fromList [
(abstractName pgf ++ ": " ++ showTree t) :
[lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
unlex opts lang = stringOps (exceptUTF8 opts) where
exceptUTF8 = if isUTF8 then filter ((/="to_UTF8") . prOpt) else id
isUTF8 = case lookFlag pgf lang "coding" of
Just "utf8" -> True
_ -> False
-- logic of coding in unlexing:
-- - If lang has no coding flag, or -to_utf8 is not in opts, opts are used unchanged.
-- - If lang has flag coding=utf8, -to_utf8 is ignored.
-- - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
optsC = case lookFlag pgf lang "coding" of
Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
Just other | isOpt "to_utf8" opts ->
let cod = ("from_" ++ other)
in cod : filter (/=cod) (map prOpt opts)
_ -> map prOpt opts
optRestricted opts = restrictPGF (hasLin pgf (mkCId (optLang opts))) pgf
@@ -536,7 +545,7 @@ allCommands pgf = Map.fromList [
[lookupMorpho (buildMorpho pgf (mkCId la)) s | la <- optLangs opts]
-- ps -f -g s returns g (f s)
stringOps opts s = foldr app s (reverse (map prOpt opts)) where
stringOps opts s = foldr app s (reverse opts) where
app f = maybe id id (stringOp f)
stringOpOptions = [

View File

@@ -76,7 +76,7 @@ data Verbosity = Quiet | Normal | Verbose | Debug
data Phase = Preproc | Convert | Compile | Link
deriving (Show,Eq,Ord)
data Encoding = UTF_8 | ISO_8859_1
data Encoding = UTF_8 | ISO_8859_1 | CP_1251
deriving (Show,Eq,Ord)
data OutputFormat = FmtPGF
@@ -469,7 +469,9 @@ optimizationPackages =
encodings :: [(String,Encoding)]
encodings =
[("utf8", UTF_8),
("latin1", ISO_8859_1)]
("cp1251", CP_1251),
("latin1", ISO_8859_1)
]
-- | Reverse lookup in a table of (name, value) pairs.
--
-- Returns the name of the first entry whose value equals @z@. When no entry
-- matches, the literal string @"lookupShow"@ is returned instead.
lookupShow :: Eq a => [(String,a)] -> a -> String
lookupShow xs z =
  case [name | (name, value) <- xs, value == z] of
    name : _ -> name
    []       -> "lookupShow"
@@ -542,4 +544,4 @@ instance Functor OptDescr where
instance Functor ArgDescr where
fmap f (NoArg x) = NoArg (f x)
fmap f (ReqArg g s) = ReqArg (f . g) s
fmap f (OptArg g s) = OptArg (f . g) s
fmap f (OptArg g s) = OptArg (f . g) s

View File

@@ -24,6 +24,8 @@ stringOp name = case name of
"to_html" -> Just wrapHTML
"to_utf8" -> Just encodeUTF8
"from_utf8" -> Just decodeUTF8
"to_cp1251" -> Just encodeCP1251
"from_cp1251" -> Just decodeCP1251
_ -> transliterate name
appLexer :: (String -> [String]) -> String -> String
@@ -97,3 +99,17 @@ isPunct = flip elem ".?!,:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"
-- Windows Cyrillic (CP1251), used in the Bulgarian resource; this might belong in a file of its own
-- | Convert a string of CP1251 (Windows Cyrillic) character codes to Unicode.
--
-- Characters @\\192@..@\\255@ are shifted by 848 onto U+0410..U+044F, the
-- contiguous Cyrillic block from capital A to small ya. The letters IO,
-- which CP1251 stores outside that range, are mapped individually:
-- @\\168@ to U+0401 and @\\184@ to U+0451. Every other character is
-- returned unchanged, so the remaining CP1251-specific codes below @\\192@
-- are still not translated.
decodeCP1251 :: String -> String
decodeCP1251 = map convert where
  convert c
    | c >= '\192' && c <= '\255' = chr (ord c + 848)
    | c == '\168'                = '\1025'  -- capital IO, U+0401
    | c == '\184'                = '\1105'  -- small io, U+0451
    | otherwise                  = c
-- | Convert Unicode Cyrillic characters to their CP1251 (Windows Cyrillic)
-- character codes.
--
-- Code points 1040..1103 (U+0410..U+044F) are shifted down by 848 onto
-- @\\192@..@\\255@. The letters IO, which CP1251 stores outside that
-- range, are mapped individually: U+0401 to @\\168@ and U+0451 to @\\184@.
-- Every other character is returned unchanged, including characters that
-- have no CP1251 representation.
encodeCP1251 :: String -> String
encodeCP1251 = map convert where
  convert c
    | oc >= 1040 && oc <= 1103 = chr (oc - 848)
    | oc == 1025               = '\168'  -- capital IO, U+0401
    | oc == 1105               = '\184'  -- small io, U+0451
    | otherwise                = c
    where oc = ord c