added default decodings to Make, to enable multilingual utf8 generation

2026-04-23 11:42:49 -06:00 · 2008-06-24 21:52:07 +00:00
parent 06eb01ac7e
commit ed72a2ef79
3 changed files with 64 additions and 31 deletions
--- a/lib/resource-1.4/Make.hs
+++ b/lib/resource-1.4/Make.hs
@@ -13,25 +13,31 @@ import System
 -- With no argument, lang and api are done, in this order.
 -- See 'make' below for what is done by which command.
-langs = [
+-- the languages have long directory names and short ISO codes (3 letters)
-  ("arabic",   "Ara"),
+-- we also give the decodings for postprocessing linearizations, as long as grammars
-  ("bulgarian","Bul"),
+-- don't support all flags needed; they are used in tests
-  ("catalan",  "Cat"),
+ 
-  ("danish",   "Dan"),
+langsCoding = [
-  ("english",  "Eng"),
+  (("arabic",   "Ara"),""),
-  ("finnish",  "Fin"),
+  (("bulgarian","Bul"),"from_cp1251,to_utf8"),
-  ("french",   "Fre"),
+  (("catalan",  "Cat"),"to_utf8"),
-  ("hindi",    "Hin"),
+  (("danish",   "Dan"),"to_utf8"),
-  ("german",   "Ger"),
+  (("english",  "Eng"),""),
-  ("interlingua","Ina"),
+  (("finnish",  "Fin"),"to_utf8"),
-  ("italian",  "Ita"),
+  (("french",   "Fre"),"to_utf8"),
-  ("norwegian","Nor"),
+  (("hindi",    "Hin"),"to_devanagari,to_utf8"),
-  ("russian",  "Rus"),
+  (("german",   "Ger"),"to_utf8"),
-  ("spanish",  "Spa"),
+  (("interlingua","Ina"),""),
-  ("swedish",  "Swe"),
+  (("italian",  "Ita"),"to_utf8"),
-  ("thai",     "Tha")
+  (("norwegian","Nor"),"to_utf8"),
  (("russian",  "Rus"),""),
  (("spanish",  "Spa"),"to_utf8"),
  (("swedish",  "Swe"),"to_utf8"), 
  (("thai",     "Tha"),"to_thai,to_utf8")
  ]
 langs = map fst langsCoding
 -- languages for which to compile Lang
 langsLang = langs `except` ["Ara"]
@@ -81,9 +87,11 @@ make xx = do
              unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsPGF] ++
              " +RTS -K100M"
  ifxx "test" $ do
-    gf treeb $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsTest]
+    let ls = optl langsTest
    gf (treeb "Lang" ls) $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- ls] 
  ifxx "demo" $ do
-    gf demos $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- optl langsDemo]
+    let ls = optl langsDemo
    gf (demos "Demo" ls) $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- ls]
  ifxx "clean" $ do
    system "rm */*.gfo ../alltenses/*.gfo ../present/*.gfo"
  ifxx "clone" $ do
@@ -104,10 +112,11 @@ gf comm file = do
  putStrLn $ "reading " ++ file
  system $ "echo \"" ++ comm ++ "\" | gf3 -s " ++ file
-treeb = "rf -lines -tree -file=" ++ treebankExx ++ 
+treeb abstr ls = "rf -lines -tree -file=" ++ treebankExx ++ 
-        " | l -treebank | wf -file=" ++ treebankResults
+        " | l -treebank " ++ unlexer abstr ls ++ " | wf -file=" ++ treebankResults
-demos = "gr -number=100 | l -treebank | ps -to_utf8 -to_html | wf -file=resdemo.html"
+demos abstr ls = "gr -number=100 | l -treebank " ++ unlexer abstr ls ++ 
           " | ps -to_html | wf -file=resdemo.html"
 lang (lla,la) = lla ++ "/Lang" ++ la ++ ".gf"
 try  (lla,la) = "api/Try"  ++ la ++ ".gf"
@@ -140,3 +149,11 @@ replaceLang s1 s2 = repl where
    _ -> s
  lgs = 3 -- length s1
 -- Build the -unlexer flag that 'treeb' and 'demos' splice into their gf
 -- command strings. The flag value is a space-separated list of entries
 -- <abstr><la>=<decoding>, one for each language in ls that has a non-empty
 -- decoding in langsCoding. The surrounding quotes are emitted as \" because
 -- 'gf' places the whole command inside a double-quoted shell echo.
 unlexer abstr ls = 
  "-unlexer=\\\"" ++ unwords 
      [abstr ++ la ++ "=" ++ unl | 
        lla@(_,la) <- ls, let unl = unlex lla, not (null unl)] ++ 
      "\\\""
    where
      -- decoding for a (directory, code) pair; "" if the pair is not listed
      unlex lla = maybe "" id $ lookup lla langsCoding
--- a/src-3.0/GF/Command/Abstract.hs
+++ b/src-3.0/GF/Command/Abstract.hs
@@ -58,10 +58,10 @@ isFlag :: String -> [Option] -> Bool
 isFlag o opts = elem o [x | OFlag x _ <- opts]
 -- Print an option as a string: a plain option (OOpt) is printed as its name,
 -- a flag (OFlag) as name=value with the value rendered by show.
 prOpt :: Option -> String
-prOpt (OOpt i) = i ----
+prOpt o = case o of
  OOpt i    -> i
  OFlag f x -> f ++ "=" ++ show x
 mkOpt :: String -> Option
 mkOpt = OOpt
--- a/src-3.0/GF/Command/Commands.hs
+++ b/src-3.0/GF/Command/Commands.hs
@@ -228,11 +228,16 @@ allCommands pgf = Map.fromList [
       "The -lang flag can be used to restrict this to fewer languages.",
       "A sequence of string operations (see command ps) can be given",
       "as options, and works then like a pipe to the ps command, except",
-       "that it only affect the strings, not e.g. the table labels."
+       "that it only affect the strings, not e.g. the table labels.",
       "These can be given separately to each language with the unlexer flag",
       "whose results are prepended to the other lexer flags. The value of the",
       "unlexer flag is a space-separated list of comma-separated string operation",
       "sequences; see example."
       ],
     examples = [
       "l -langs=LangSwe,LangNor no_Utt   -- linearize tree to LangSwe and LangNor",
-       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table"
+       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table",
       "l -unlexer=\"LangSwe=to_utf8 LangHin=to_devanagari,to_utf8\" -- different lexers"
       ],
     exec = \opts -> return . fromStrings . map (optLin opts),
     options = [
@@ -243,7 +248,8 @@ allCommands pgf = Map.fromList [
       ("treebank","show the tree and tag linearizations with language names")
       ] ++ stringOpOptions,
     flags = [
-       ("lang","the languages of linearization (comma-separated, no spaces)")
+       ("lang","the languages of linearization (comma-separated, no spaces)"),
       ("unlexer","set unlexers separately to each language (space-separated)")
       ]
     }),
  ("ma", emptyCommandInfo {
@@ -499,12 +505,20 @@ allCommands pgf = Map.fromList [
     (abstractName pgf ++ ": " ++ showTree t) :
     [lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
-- logic of coding in unlexing:
+   unlex opts lang = stringOps (getUnlex opts lang ++ map prOpt opts)
   -- String operations requested for lang through the unlexer flag.
   -- The flag value is split at whitespace into words of the form Lang=ops;
   -- words containing no '=' are skipped, and the first word whose Lang part
   -- equals lang is used. If no word matches, the result is [].
   -- NOTE(review): valStrOpts is defined elsewhere; its "" argument looks like
   -- the value used when the flag is absent - confirm.
   getUnlex opts lang = case words (valStrOpts "unlexer" "" opts) of
     lexs -> case lookup lang 
               -- span stops at the first '=', so le starts with '='; tail drops it
               [(la,tail le) | lex <- lexs, let (la,le) = span (/='=') lex, not (null le)] of
       -- presumably splits the operation list at commas (chunks is defined elsewhere)
       Just le -> chunks ',' le
       _ -> []
 -- Proposed logic of coding in unlexing:
 --   - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used.
 --   - If lang has flag coding=utf8, -to_utf8 is ignored.
 --   - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
-
+-- THIS DOES NOT WORK UNFORTUNATELY - can't use the grammar flag properly
-   unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
+   unlexx opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
     optsC = case lookFlag pgf lang "coding" of
       Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
       Just other | isOpt "to_utf8" opts -> 
@@ -551,12 +565,14 @@ allCommands pgf = Map.fromList [
 stringOpOptions = [
       ("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
       ("chars","lexer that makes every non-space character a token"),
       ("from_cp1251","decode from cp1251 (Cyrillic used in Bulgarian resource)"),
       ("from_devanagari","from unicode to GF Devanagari transliteration"),
       ("from_thai","from unicode to GF Thai transliteration"),
       ("from_utf8","decode from utf8"),
       ("lextext","text-like lexer"),
       ("lexcode","code-like lexer"),
       ("lexmixed","mixture of text and code (code between $...$)"), 
       ("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
       ("to_devanagari","from GF Devanagari transliteration to unicode"),
       ("to_html","wrap in a html file with linebreaks"),
       ("to_thai","from GF Thai transliteration to unicode"),