From ed72a2ef79e69c57ba2a6f41d88d8822d26548c4 Mon Sep 17 00:00:00 2001
From: aarne
Date: Tue, 24 Jun 2008 21:52:07 +0000
Subject: [PATCH] added default decodings to Make, to enable multilingual utf8 generation

---
Note (review): this patch was reconstructed from a whitespace-collapsed copy.
Line breaks were re-derived from the hunk headers and the diffstat, which all
agree with the text below. Indentation and column alignment inside lines are a
best guess; apply with whitespace-insensitive matching and confirm against the
repository.

 lib/resource-1.4/Make.hs       | 61 ++++++++++++++++++++++------------
 src-3.0/GF/Command/Abstract.hs |  6 ++--
 src-3.0/GF/Command/Commands.hs | 28 ++++++++++++----
 3 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/lib/resource-1.4/Make.hs b/lib/resource-1.4/Make.hs
index a76bb59d7..f815540ff 100644
--- a/lib/resource-1.4/Make.hs
+++ b/lib/resource-1.4/Make.hs
@@ -13,25 +13,31 @@ import System
 -- With no argument, lang and api are done, in this order.
 -- See 'make' below for what is done by which command.
 
-langs = [
-  ("arabic",   "Ara"),
-  ("bulgarian","Bul"),
-  ("catalan",  "Cat"),
-  ("danish",   "Dan"),
-  ("english",  "Eng"),
-  ("finnish",  "Fin"),
-  ("french",   "Fre"),
-  ("hindi",    "Hin"),
-  ("german",   "Ger"),
-  ("interlingua","Ina"),
-  ("italian",  "Ita"),
-  ("norwegian","Nor"),
-  ("russian",  "Rus"),
-  ("spanish",  "Spa"),
-  ("swedish",  "Swe"),
-  ("thai",     "Tha")
+-- the languages have long directory names and short ISO codes (3 letters)
+-- we also give the decodings for postprocessing linearizations, as long as grammars
+-- don't support all flags needed; they are used in tests
+
+langsCoding = [
+  (("arabic",   "Ara"),""),
+  (("bulgarian","Bul"),"from_cp1251,to_utf8"),
+  (("catalan",  "Cat"),"to_utf8"),
+  (("danish",   "Dan"),"to_utf8"),
+  (("english",  "Eng"),""),
+  (("finnish",  "Fin"),"to_utf8"),
+  (("french",   "Fre"),"to_utf8"),
+  (("hindi",    "Hin"),"to_devanagari,to_utf8"),
+  (("german",   "Ger"),"to_utf8"),
+  (("interlingua","Ina"),""),
+  (("italian",  "Ita"),"to_utf8"),
+  (("norwegian","Nor"),"to_utf8"),
+  (("russian",  "Rus"),""),
+  (("spanish",  "Spa"),"to_utf8"),
+  (("swedish",  "Swe"),"to_utf8"),
+  (("thai",     "Tha"),"to_thai,to_utf8")
   ]
 
+langs = map fst langsCoding
+
 -- languagues for which to compile Lang
 langsLang = langs `except` ["Ara"]
 
@@ -81,9 +87,11 @@ make xx = do
       unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsPGF] ++
       " +RTS -K100M"
   ifxx "test" $ do
-    gf treeb $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsTest]
+    let ls = optl langsTest
+    gf (treeb "Lang" ls) $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- ls]
   ifxx "demo" $ do
-    gf demos $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- optl langsDemo]
+    let ls = optl langsDemo
+    gf (demos "Demo" ls) $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- ls]
   ifxx "clean" $ do
     system "rm */*.gfo ../alltenses/*.gfo ../present/*.gfo"
   ifxx "clone" $ do
@@ -104,10 +112,11 @@ gf comm file = do
   putStrLn $ "reading " ++ file
   system $ "echo \"" ++ comm ++ "\" | gf3 -s " ++ file
 
-treeb = "rf -lines -tree -file=" ++ treebankExx ++
-  " | l -treebank | wf -file=" ++ treebankResults
+treeb abstr ls = "rf -lines -tree -file=" ++ treebankExx ++
+  " | l -treebank " ++ unlexer abstr ls ++ " | wf -file=" ++ treebankResults
 
-demos = "gr -number=100 | l -treebank | ps -to_utf8 -to_html | wf -file=resdemo.html"
+demos abstr ls = "gr -number=100 | l -treebank " ++ unlexer abstr ls ++
+  " | ps -to_html | wf -file=resdemo.html"
 
 lang (lla,la) = lla ++ "/Lang" ++ la ++ ".gf"
 try (lla,la) = "api/Try" ++ la ++ ".gf"
@@ -140,3 +149,11 @@ replaceLang s1 s2 = repl where
     _ -> s
   lgs = 3 -- length s1
 
+unlexer abstr ls =
+  "-unlexer=\\\"" ++ unwords
+    [abstr ++ la ++ "=" ++ unl |
+      lla@(_,la) <- ls, let unl = unlex lla, not (null unl)] ++
+    "\\\""
+  where
+    unlex lla = maybe "" id $ lookup lla langsCoding
+
diff --git a/src-3.0/GF/Command/Abstract.hs b/src-3.0/GF/Command/Abstract.hs
index 16905c2f9..29111b432 100644
--- a/src-3.0/GF/Command/Abstract.hs
+++ b/src-3.0/GF/Command/Abstract.hs
@@ -58,10 +58,10 @@ isFlag :: String -> [Option] -> Bool
 isFlag o opts = elem o [x | OFlag x _ <- opts]
 
 prOpt :: Option -> String
-prOpt (OOpt i) = i ----
+prOpt o = case o of
+  OOpt i -> i
+  OFlag f x -> f ++ "=" ++ show x
 
 mkOpt :: String -> Option
 mkOpt = OOpt
 
-
-
diff --git a/src-3.0/GF/Command/Commands.hs b/src-3.0/GF/Command/Commands.hs
index b5ba99f6f..96e7c57f4 100644
--- a/src-3.0/GF/Command/Commands.hs
+++ b/src-3.0/GF/Command/Commands.hs
@@ -228,11 +228,16 @@ allCommands pgf = Map.fromList [
        "The -lang flag can be used to restrict this to fewer languages.",
        "A sequence of string operations (see command ps) can be given",
        "as options, and works then like a pipe to the ps command, except",
-       "that it only affect the strings, not e.g. the table labels."
+       "that it only affect the strings, not e.g. the table labels.",
+       "These can be given separately to each language with the unlexer flag",
+       "whose results are prepended to the other lexer flags. The value of the",
+       "unlexer flag is a space-separated list of comma-separated string operation",
+       "sequences; see example."
        ],
      examples = [
        "l -langs=LangSwe,LangNor no_Utt -- linearize tree to LangSwe and LangNor",
-       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table"
+       "gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table",
+       "l -unlexer=\"LangSwe=to_utf8 LangHin=to_devanagari,to_utf8\" -- different lexers"
        ],
      exec = \opts -> return . fromStrings . map (optLin opts),
      options = [
@@ -243,7 +248,8 @@ allCommands pgf = Map.fromList [
        ("treebank","show the tree and tag linearizations with language names")
        ] ++ stringOpOptions,
      flags = [
-       ("lang","the languages of linearization (comma-separated, no spaces)")
+       ("lang","the languages of linearization (comma-separated, no spaces)"),
+       ("unlexer","set unlexers separately to each language (space-separated)")
        ]
      }),
   ("ma", emptyCommandInfo {
@@ -499,12 +505,20 @@ allCommands pgf = Map.fromList [
      (abstractName pgf ++ ": " ++ showTree t) :
      [lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
 
--- logic of coding in unlexing:
+   unlex opts lang = stringOps (getUnlex opts lang ++ map prOpt opts)
+
+   getUnlex opts lang = case words (valStrOpts "unlexer" "" opts) of
+     lexs -> case lookup lang
+               [(la,tail le) | lex <- lexs, let (la,le) = span (/='=') lex, not (null le)] of
+       Just le -> chunks ',' le
+       _ -> []
+
+-- Proposed logic of coding in unlexing:
 -- - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used.
 -- - If lang has flag coding=utf8, -to_utf8 is ignored.
 -- - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
-
-   unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
+-- THIS DOES NOT WORK UNFORTUNATELY - can't use the grammar flag properly
+   unlexx opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
      optsC = case lookFlag pgf lang "coding" of
        Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
        Just other | isOpt "to_utf8" opts ->
@@ -551,12 +565,14 @@ allCommands pgf = Map.fromList [
    stringOpOptions = [
        ("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
        ("chars","lexer that makes every non-space character a token"),
+       ("from_cp1251","decode from cp1251 (Cyrillic used in Bulgarian resource)"),
        ("from_devanagari","from unicode to GF Devanagari transliteration"),
        ("from_thai","from unicode to GF Thai transliteration"),
        ("from_utf8","decode from utf8"),
        ("lextext","text-like lexer"),
        ("lexcode","code-like lexer"),
        ("lexmixed","mixture of text and code (code between $...$)"),
+       ("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
        ("to_devanagari","from GF Devanagari transliteration to unicode"),
        ("to_html","wrap in a html file with linebreaks"),
        ("to_thai","from GF Thai transliteration to unicode"),