mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
added default decodings to Make, to enable multilingual utf8 generation
This commit is contained in:
@@ -13,25 +13,31 @@ import System
|
|||||||
-- With no argument, lang and api are done, in this order.
|
-- With no argument, lang and api are done, in this order.
|
||||||
-- See 'make' below for what is done by which command.
|
-- See 'make' below for what is done by which command.
|
||||||
|
|
||||||
langs = [
|
-- the languages have long directory names and short ISO codes (3 letters)
|
||||||
("arabic", "Ara"),
|
-- we also give the decodings for postprocessing linearizations, as long as grammars
|
||||||
("bulgarian","Bul"),
|
-- don't support all flags needed; they are used in tests
|
||||||
("catalan", "Cat"),
|
|
||||||
("danish", "Dan"),
|
langsCoding = [
|
||||||
("english", "Eng"),
|
(("arabic", "Ara"),""),
|
||||||
("finnish", "Fin"),
|
(("bulgarian","Bul"),"from_cp1251,to_utf8"),
|
||||||
("french", "Fre"),
|
(("catalan", "Cat"),"to_utf8"),
|
||||||
("hindi", "Hin"),
|
(("danish", "Dan"),"to_utf8"),
|
||||||
("german", "Ger"),
|
(("english", "Eng"),""),
|
||||||
("interlingua","Ina"),
|
(("finnish", "Fin"),"to_utf8"),
|
||||||
("italian", "Ita"),
|
(("french", "Fre"),"to_utf8"),
|
||||||
("norwegian","Nor"),
|
(("hindi", "Hin"),"to_devanagari,to_utf8"),
|
||||||
("russian", "Rus"),
|
(("german", "Ger"),"to_utf8"),
|
||||||
("spanish", "Spa"),
|
(("interlingua","Ina"),""),
|
||||||
("swedish", "Swe"),
|
(("italian", "Ita"),"to_utf8"),
|
||||||
("thai", "Tha")
|
(("norwegian","Nor"),"to_utf8"),
|
||||||
|
(("russian", "Rus"),""),
|
||||||
|
(("spanish", "Spa"),"to_utf8"),
|
||||||
|
(("swedish", "Swe"),"to_utf8"),
|
||||||
|
(("thai", "Tha"),"to_thai,to_utf8")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
-- (directory name, ISO code) pairs of all languages: the keys of the langsCoding association list
langs = map fst langsCoding
|
||||||
|
|
||||||
-- languages for which to compile Lang
|
-- languages for which to compile Lang
|
||||||
langsLang = langs `except` ["Ara"]
|
langsLang = langs `except` ["Ara"]
|
||||||
|
|
||||||
@@ -81,9 +87,11 @@ make xx = do
|
|||||||
unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsPGF] ++
|
unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsPGF] ++
|
||||||
" +RTS -K100M"
|
" +RTS -K100M"
|
||||||
ifxx "test" $ do
|
ifxx "test" $ do
|
||||||
gf treeb $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- optl langsTest]
|
let ls = optl langsTest
|
||||||
|
gf (treeb "Lang" ls) $ unwords [dir ++ "/Lang" ++ la ++ ".gfo" | (_,la) <- ls]
|
||||||
ifxx "demo" $ do
|
ifxx "demo" $ do
|
||||||
gf demos $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- optl langsDemo]
|
let ls = optl langsDemo
|
||||||
|
gf (demos "Demo" ls) $ unwords ["demo/Demo" ++ la ++ ".gf" | (_,la) <- ls]
|
||||||
ifxx "clean" $ do
|
ifxx "clean" $ do
|
||||||
system "rm */*.gfo ../alltenses/*.gfo ../present/*.gfo"
|
system "rm */*.gfo ../alltenses/*.gfo ../present/*.gfo"
|
||||||
ifxx "clone" $ do
|
ifxx "clone" $ do
|
||||||
@@ -104,10 +112,11 @@ gf comm file = do
|
|||||||
putStrLn $ "reading " ++ file
|
putStrLn $ "reading " ++ file
|
||||||
system $ "echo \"" ++ comm ++ "\" | gf3 -s " ++ file
|
system $ "echo \"" ++ comm ++ "\" | gf3 -s " ++ file
|
||||||
|
|
||||||
treeb = "rf -lines -tree -file=" ++ treebankExx ++
|
treeb abstr ls = "rf -lines -tree -file=" ++ treebankExx ++
|
||||||
" | l -treebank | wf -file=" ++ treebankResults
|
" | l -treebank " ++ unlexer abstr ls ++ " | wf -file=" ++ treebankResults
|
||||||
|
|
||||||
demos = "gr -number=100 | l -treebank | ps -to_utf8 -to_html | wf -file=resdemo.html"
|
demos abstr ls = "gr -number=100 | l -treebank " ++ unlexer abstr ls ++
|
||||||
|
" | ps -to_html | wf -file=resdemo.html"
|
||||||
|
|
||||||
lang (lla,la) = lla ++ "/Lang" ++ la ++ ".gf"
|
lang (lla,la) = lla ++ "/Lang" ++ la ++ ".gf"
|
||||||
try (lla,la) = "api/Try" ++ la ++ ".gf"
|
try (lla,la) = "api/Try" ++ la ++ ".gf"
|
||||||
@@ -140,3 +149,11 @@ replaceLang s1 s2 = repl where
|
|||||||
_ -> s
|
_ -> s
|
||||||
lgs = 3 -- length s1
|
lgs = 3 -- length s1
|
||||||
|
|
||||||
|
-- Build the -unlexer flag for the GF linearize command.
-- abstr is the prefix of the concrete syntax names (it is concatenated with the
-- language code), ls is the list of (directory, code) language pairs.
-- Each language with a non-empty entry in langsCoding contributes one
-- "<abstr><code>=<string operations>" item; languages without an entry or with
-- an empty one are left out. The surrounding quotes are emitted
-- backslash-escaped.
unlexer abstr ls =
  concat ["-unlexer=\\\"", unwords codings, "\\\""]
  where
    codings =
      [ abstr ++ la ++ "=" ++ ops
      | key@(_, la) <- ls
      , Just ops <- [lookup key langsCoding]
      , not (null ops)
      ]
|
||||||
|
|
||||||
|
|||||||
@@ -58,10 +58,10 @@ isFlag :: String -> [Option] -> Bool
|
|||||||
isFlag o opts = elem o [x | OFlag x _ <- opts]
|
isFlag o opts = elem o [x | OFlag x _ <- opts]
|
||||||
|
|
||||||
prOpt :: Option -> String
|
prOpt :: Option -> String
|
||||||
prOpt (OOpt i) = i ----
|
-- Print an option: a plain option as its name, a flag as name=value,
-- where the value is rendered with show.
prOpt (OOpt i) = i
prOpt (OFlag f x) = f ++ "=" ++ show x
|
||||||
|
|
||||||
mkOpt :: String -> Option
|
mkOpt :: String -> Option
|
||||||
mkOpt = OOpt
|
mkOpt = OOpt
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -228,11 +228,16 @@ allCommands pgf = Map.fromList [
|
|||||||
"The -lang flag can be used to restrict this to fewer languages.",
|
"The -lang flag can be used to restrict this to fewer languages.",
|
||||||
"A sequence of string operations (see command ps) can be given",
|
"A sequence of string operations (see command ps) can be given",
|
||||||
"as options, and works then like a pipe to the ps command, except",
|
"as options, and works then like a pipe to the ps command, except",
|
||||||
"that it only affect the strings, not e.g. the table labels."
|
"that it only affect the strings, not e.g. the table labels.",
|
||||||
|
"These can be given separately to each language with the unlexer flag",
|
||||||
|
"whose results are prepended to the other lexer flags. The value of the",
|
||||||
|
"unlexer flag is a space-separated list of comma-separated string operation",
|
||||||
|
"sequences; see example."
|
||||||
],
|
],
|
||||||
examples = [
|
examples = [
|
||||||
"l -langs=LangSwe,LangNor no_Utt -- linearize tree to LangSwe and LangNor",
|
"l -langs=LangSwe,LangNor no_Utt -- linearize tree to LangSwe and LangNor",
|
||||||
"gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table"
|
"gr -lang=LangHin -cat=Cl | l -table -to_devanagari -to_utf8 -- hindi table",
|
||||||
|
"l -unlexer=\"LangSwe=to_utf8 LangHin=to_devanagari,to_utf8\" -- different lexers"
|
||||||
],
|
],
|
||||||
exec = \opts -> return . fromStrings . map (optLin opts),
|
exec = \opts -> return . fromStrings . map (optLin opts),
|
||||||
options = [
|
options = [
|
||||||
@@ -243,7 +248,8 @@ allCommands pgf = Map.fromList [
|
|||||||
("treebank","show the tree and tag linearizations with language names")
|
("treebank","show the tree and tag linearizations with language names")
|
||||||
] ++ stringOpOptions,
|
] ++ stringOpOptions,
|
||||||
flags = [
|
flags = [
|
||||||
("lang","the languages of linearization (comma-separated, no spaces)")
|
("lang","the languages of linearization (comma-separated, no spaces)"),
|
||||||
|
("unlexer","set unlexers separately to each language (space-separated)")
|
||||||
]
|
]
|
||||||
}),
|
}),
|
||||||
("ma", emptyCommandInfo {
|
("ma", emptyCommandInfo {
|
||||||
@@ -499,12 +505,20 @@ allCommands pgf = Map.fromList [
|
|||||||
(abstractName pgf ++ ": " ++ showTree t) :
|
(abstractName pgf ++ ": " ++ showTree t) :
|
||||||
[lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
|
[lang ++ ": " ++ linear opts lang t | lang <- optLangs opts]
|
||||||
|
|
||||||
-- logic of coding in unlexing:
|
-- String operations for one language: the entries given for it in the unlexer
-- flag (see getUnlex) come first, followed by all options as printed by prOpt.
unlex opts lang = stringOps (getUnlex opts lang ++ map prOpt opts)
|
||||||
|
|
||||||
|
-- Extract the string operations that the unlexer flag assigns to lang.
-- The flag value is a space-separated list of items of the form
-- Lang=op1,op2,...; items without '=' are ignored. The result is the
-- comma-separated operation list of the first item whose name equals lang,
-- or the empty list if there is none.
getUnlex opts lang = maybe [] (chunks ',') (lookup lang perLanguage)
  where
    perLanguage =
      [ (name, drop 1 rest)  -- drop the '=' itself
      | entry <- words (valStrOpts "unlexer" "" opts)
      , let (name, rest) = break (== '=') entry
      , not (null rest)
      ]
|
||||||
|
|
||||||
|
-- Proposed logic of coding in unlexing:
|
||||||
-- - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used.
|
-- - If lang has no coding flag, or -to_utf8 is not in opts, just opts are used.
|
||||||
-- - If lang has flag coding=utf8, -to_utf8 is ignored.
|
-- - If lang has flag coding=utf8, -to_utf8 is ignored.
|
||||||
-- - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
|
-- - If lang has coding=other, and -to_utf8 is in opts, from_other is applied first.
|
||||||
|
-- THIS DOES NOT WORK UNFORTUNATELY - can't use the grammar flag properly
|
||||||
unlex opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
|
unlexx opts lang = {- trace (unwords optsC) $ -} stringOps optsC where
|
||||||
optsC = case lookFlag pgf lang "coding" of
|
optsC = case lookFlag pgf lang "coding" of
|
||||||
Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
|
Just "utf8" -> filter (/="to_utf8") $ map prOpt opts
|
||||||
Just other | isOpt "to_utf8" opts ->
|
Just other | isOpt "to_utf8" opts ->
|
||||||
@@ -551,12 +565,14 @@ allCommands pgf = Map.fromList [
|
|||||||
stringOpOptions = [
|
stringOpOptions = [
|
||||||
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
|
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
|
||||||
("chars","lexer that makes every non-space character a token"),
|
("chars","lexer that makes every non-space character a token"),
|
||||||
|
("from_cp1251","decode from cp1251 (Cyrillic used in Bulgarian resource)"),
|
||||||
("from_devanagari","from unicode to GF Devanagari transliteration"),
|
("from_devanagari","from unicode to GF Devanagari transliteration"),
|
||||||
("from_thai","from unicode to GF Thai transliteration"),
|
("from_thai","from unicode to GF Thai transliteration"),
|
||||||
("from_utf8","decode from utf8"),
|
("from_utf8","decode from utf8"),
|
||||||
("lextext","text-like lexer"),
|
("lextext","text-like lexer"),
|
||||||
("lexcode","code-like lexer"),
|
("lexcode","code-like lexer"),
|
||||||
("lexmixed","mixture of text and code (code between $...$)"),
|
("lexmixed","mixture of text and code (code between $...$)"),
|
||||||
|
("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
|
||||||
("to_devanagari","from GF Devanagari transliteration to unicode"),
|
("to_devanagari","from GF Devanagari transliteration to unicode"),
|
||||||
("to_html","wrap in a html file with linebreaks"),
|
("to_html","wrap in a html file with linebreaks"),
|
||||||
("to_thai","from GF Thai transliteration to unicode"),
|
("to_thai","from GF Thai transliteration to unicode"),
|
||||||
|
|||||||
Reference in New Issue
Block a user