From ed72a2ef79e69c57ba2a6f41d88d8822d26548c4 Mon Sep 17 00:00:00 2001
From: aarne <aarne@cs.chalmers.se>
Date: Tue, 24 Jun 2008 21:52:07 +0000
Subject: [PATCH] added default decodings to Make, to enable multilingual utf8
 generation

---
 lib/resource-1.4/Make.hs       | 61 ++++++++++++++++++++++------------
 src-3.0/GF/Command/Abstract.hs |  6 ++--
 src-3.0/GF/Command/Commands.hs | 28 ++++++++++++----
 3 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/lib/resource-1.4/Make.hs b/lib/resource-1.4/Make.hs
index a76bb59d7..f815540ff 100644
--- a/lib/resource-1.4/Make.hs
+++ b/lib/resource-1.4/Make.hs
@@ -13,25 +13,31 @@ import System
 -- With no argument, lang and api are done, in this order.
 -- See 'make' below for what is done by which command.
 
-langs = [
-  ("arabic",   "Ara"),
-  ("bulgarian","Bul"),
-  ("catalan",  "Cat"),
-  ("danish",   "Dan"),
-  ("english",  "Eng"),
-  ("finnish",  "Fin"),
-  ("french",   "Fre"),
-  ("hindi",    "Hin"),
-  ("german",   "Ger"),
-  ("interlingua","Ina"),
-  ("italian",  "Ita"),
-  ("norwegian","Nor"),
-  ("russian",  "Rus"),
-  ("spanish",  "Spa"),
-  ("swedish",  "Swe"),
-  ("thai",     "Tha")
+-- Each language has a long directory name and a short ISO code (3 letters).
+-- We also give the decodings for postprocessing linearizations (needed as long as the
+-- grammars don't support all required flags); the test and demo targets use them ("" = none).
+ 
+langsCoding = [
+  (("arabic",   "Ara"),""),
+  (("bulgarian","Bul"),"from_cp1251,to_utf8"),
+  (("catalan",  "Cat"),"to_utf8"),
+  (("danish",   "Dan"),"to_utf8"),
+  (("english",  "Eng"),""),
+  (("finnish",  "Fin"),"to_utf8"),
+  (("french",   "Fre"),"to_utf8"),
+  (("hindi",    "Hin"),"to_devanagari,to_utf8"),
+  (("german",   "Ger"),"to_utf8"),
+  (("interlingua","Ina"),""),
+  (("italian",  "Ita"),"to_utf8"),
+  (("norwegian","Nor"),"to_utf8"),
+  (("russian",  "Rus"),""),
+  (("spanish",  "Spa"),"to_utf8"),
+  (("swedish",  "Swe"),"to_utf8"), 
+  (("thai",     "Tha"),"to_thai,to_utf8")
   ]
 
+langs = [names | (names,_) <- langsCoding]  -- (directory name, ISO code) pairs, decodings dropped
+
 -- languagues for which to compile Lang
 langsLang = langs `except` ["Ara"]
 
@@ -81,9 +87,11 @@ make xx = do
               unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsPGF] ++
               " +RTS -K100M"
   ifxx "test" $ do
-    gf treeb $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsTest]
+    let ls = optl langsTest
+    gf (treeb "Lang" ls) $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- ls] 
   ifxx "demo" $ do
-    gf demos $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- optl langsDemo]
+    let ls = optl langsDemo
+    gf (demos "Demo" ls) $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- ls]
   ifxx "clean" $ do
     system "rm */*.gfo ../alltenses/*.gfo ../present/*.gfo"
   ifxx "clone" $ do
@@ -104,10 +112,11 @@ gf comm file = do
   putStrLn $ "reading " ++ file
   system $ "echo \"" ++ comm ++ "\" | gf3 -s " ++ file
 
-treeb = "rf -lines -tree -file=" ++ treebankExx ++ 
-        " | l -treebank | wf -file=" ++ treebankResults
+treeb abstr ls = concat ["rf -lines -tree -file=", treebankExx, " | l -treebank ",
+        unlexer abstr ls, " | wf -file=", treebankResults] -- command string for the "test" target
 
-demos = "gr -number=100 | l -treebank | ps -to_utf8 -to_html | wf -file=resdemo.html"
+demos abstr ls = unwords ["gr -number=100 | l -treebank", unlexer abstr ls, -- "demo" target command
+           "| ps -to_html | wf -file=resdemo.html"]  -- unwords supplies the single separating spaces
 
 lang (lla,la) = lla ++ "/Lang" ++ la ++ ".gf"
 try  (lla,la) = "api/Try"  ++ la ++ ".gf"
@@ -140,3 +149,11 @@ replaceLang s1 s2 = repl where
     _ -> s
   lgs = 3 -- length s1
 
+unlexer abstr ls =
+  "-unlexer=\\\"" ++ unwords entries ++ "\\\""
+    where
+      -- one <abstr><la>=<ops> entry per language whose decoding is non-empty
+      entries = [abstr ++ la ++ "=" ++ ops | l@(_,la) <- ls, ops <- decoding l]
+      -- zero or one decodings: none for languages missing from langsCoding or mapped to ""
+      decoding l = filter (not . null) (maybe [] (: []) (lookup l langsCoding))
+
diff --git a/src-3.0/GF/Command/Abstract.hs b/src-3.0/GF/Command/Abstract.hs
index 16905c2f9..29111b432 100644
--- a/src-3.0/GF/Command/Abstract.hs
+++ b/src-3.0/GF/Command/Abstract.hs
@@ -58,10 +58,10 @@ isFlag :: String -> [Option] -> Bool
 isFlag o opts = elem o [x | OFlag x _ <- opts]
 
 prOpt :: Option -> String
-prOpt (OOpt i) = i ----
+-- Render an option as text: a bare option is its name, a flag is name=value (value via show).
+prOpt (OOpt i)    = i
+prOpt (OFlag f x) = f ++ "=" ++ show x
 
 mkOpt :: String -> Option
 mkOpt = OOpt
 
-
-
diff --git a/src-3.0/GF/Command/Commands.hs b/src-3.0/GF/Command/Commands.hs
index b5ba99f6f..96e7c57f4 100644
--- a/src-3.0/GF/Command/Commands.hs
+++ b/src-3.0/GF/Command/Commands.hs
@@ -228,11 +228,16 @@ allCommands pgf = Map.fromList [
        "The -lang flag can be used to restrict this to fewer languages.",
        "A sequence of string operations (see command ps) can be given",
        "as options, and works then like a pipe to the ps command, except",
-       "that it only affect the strings, not e.g. the table labels."
+       "that it only affect the strings, not e.g. the table labels.",
+       "These can be given separately to each language with the unlexer flag",
+       "whose results are prepended to the other lexer flags. The value of the",
+       "unlexer flag is a space-separated list of comma-separated string operation",
+       "sequences; see example."
        ],
      examples = [
        "l -langs=LangSwe,LangNor no_Utt   -- linearize tree to LangSwe and LangNor",
-       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table"
+       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table",
+       "l -unlexer=\"LangSwe=to_utf8 LangHin=to_devanagari,to_utf8\" -- different lexers"
        ],
      exec = \opts -> return . fromStrings . map (optLin opts),
      options = [
@@ -243,7 +248,8 @@ allCommands pgf = Map.fromList [
        ("treebank","show the tree and tag linearizations with language names")
        ] ++ stringOpOptions,
      flags = [
-       ("lang","the languages of linearization (comma-separated, no spaces)")
+       ("lang","the languages of linearization (comma-separated, no spaces)"),
+       ("unlexer","set unlexers separately to each language (space-separated)")
        ]
      }),
   ("ma", emptyCommandInfo {
@@ -499,12 +505,20 @@ allCommands pgf = Map.fromList [
      (abstractName pgf ++ ": " ++ showTree t) :
      [lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
 
--- logic of coding in unlexing:
+   -- the language's own unlexer operations are prepended to the rendered command options
+   unlex opts lang = stringOps (getUnlex opts lang ++ map prOpt opts)
+
+   -- find the "lang=op1,op2" entry in the unlexer flag value and split it at the commas
+   getUnlex opts lang = maybe [] (chunks ',') (lookup lang entries) where
+     entries = [(la,ops) | item <- words (valStrOpts "unlexer" "" opts),
+                           (la,_:ops) <- [break (=='=') item]]
+
+-- Proposed logic of coding in unlexing:
 --   - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used.
 --   - If lang has flag coding=utf8, -to_utf8 is ignored.
 --   - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
-
-   unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
+-- THIS DOES NOT WORK UNFORTUNATELY - can't use the grammar flag properly
+   unlexx opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
      optsC = case lookFlag pgf lang "coding" of
        Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
        Just other | isOpt "to_utf8" opts -> 
@@ -551,12 +565,14 @@ allCommands pgf = Map.fromList [
 stringOpOptions = [
        ("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
        ("chars","lexer that makes every non-space character a token"),
+       ("from_cp1251","decode from cp1251 (Cyrillic used in Bulgarian resource)"),
        ("from_devanagari","from unicode to GF Devanagari transliteration"),
        ("from_thai","from unicode to GF Thai transliteration"),
        ("from_utf8","decode from utf8"),
        ("lextext","text-like lexer"),
        ("lexcode","code-like lexer"),
        ("lexmixed","mixture of text and code (code between $...$)"), 
+       ("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
        ("to_devanagari","from GF Devanagari transliteration to unicode"),
        ("to_html","wrap in a html file with linebreaks"),
        ("to_thai","from GF Thai transliteration to unicode"),