From 39846cc94cbf708900cdfc8926b036a8d176072d Mon Sep 17 00:00:00 2001 From: aarne Date: Wed, 8 Feb 2006 10:18:00 +0000 Subject: [PATCH] command tb for creating treebanks --- lib/resource-1.0/TODO | 1 + src/GF/Compile/ShellState.hs | 2 +- src/GF/Shell.hs | 6 +++ src/GF/Shell/HelpFile.hs | 7 ++++ src/GF/Shell/PShell.hs | 1 + src/GF/Shell/ShellCommands.hs | 2 + src/GF/UseGrammar/Treebank.hs | 69 +++++++++++++++++++++++++++++++++++ src/HelpFile | 7 ++++ 8 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 src/GF/UseGrammar/Treebank.hs diff --git a/lib/resource-1.0/TODO b/lib/resource-1.0/TODO index a72e7c8f8..ac612ed34 100644 --- a/lib/resource-1.0/TODO +++ b/lib/resource-1.0/TODO @@ -42,3 +42,4 @@ API: text construction All: punctuation +API: apposition, e.g. Det -> CN -> [PN] -> NP diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 1c1d8556e..6c281a926 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -405,7 +405,7 @@ allActiveGrammars :: ShellState -> [StateGrammar] globalOptions = gloptions --allLanguages = map (fst . fst) . concretes -allLanguages = M.allConcreteModules . canModules +allLanguages = map (snd . fst) . actualConcretes allTransfers = map fst . transfers allCategories = map fst . allCatsOf . canModules diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 36dfc5b14..150b756c1 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -27,6 +27,7 @@ import qualified GF.Canon.CMacros as CMacros import qualified GF.Compile.GrammarToCanon as GrammarToCanon import GF.Grammar.Values import GF.UseGrammar.GetTree +import GF.UseGrammar.Treebank import GF.Shell.ShellCommands @@ -289,6 +290,11 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com _ -> Nothing returnArg (ATrms $ generateTrees opts gro mt) sa + CTreeBank -> do + let ts = strees $ s2t $ snd sa + comm = "command" ---- + justOutput opts (mkTreebank opts st comm ts) sa + CShowTreeGraph | oElem emitCode opts -> do -- -o returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa CShowTreeGraph -> do diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index dd03d2515..14c87e621 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -229,6 +229,13 @@ txtHelpFile = "\n examples:" ++ "\n p -lang=Cncdecimal \"123\" | at num2bin | l -- convert dec to bin" ++ "\n" ++ + "\ntb, tree_bank: tb" ++ + "\n Generate a multilingual treebank from a list of trees." ++ + "\n flags:" ++ + "\n -xml wrap the treebank with XML tags" ++ + "\n examples:" ++ + "\n gr -cat=S -number=100 | tb" ++ + "\n" ++ "\ntt, test_tokenizer: tt String" ++ "\n Show the token list sent to the parser when String is parsed." ++ "\n HINT: can be useful when debugging the parser." ++ diff --git a/src/GF/Shell/PShell.hs b/src/GF/Shell/PShell.hs index aba743503..676e54c46 100644 --- a/src/GF/Shell/PShell.hs +++ b/src/GF/Shell/PShell.hs @@ -113,6 +113,7 @@ pCommand ws = case ws of "tt" : s -> aString CTestTokenizer s "cc" : s -> aUnit $ CComputeConcrete $ unwords s "so" : s -> aUnit $ CShowOpers $ unwords s + "tb" : [] -> aUnit CTreeBank "tq" : i:o:[] -> aUnit (CTranslationQuiz (language i) (language o)) "tl":i:o:[] -> aUnit (CTranslationList (language i) (language o)) diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index 48eac25a5..935b0be09 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -48,6 +48,7 @@ data Command = | CTranslate Language Language | CGenerateRandom | CGenerateTrees + | CTreeBank | CPutTerm | CWrapTerm I.Ident | CApplyTransfer (Maybe I.Ident, I.Ident) @@ -182,6 +183,7 @@ optionsOfCommand co = case co of CGenerateRandom -> both "cf prob" "cat lang number depth" CGenerateTrees -> both "metas" "atoms depth alts cat lang number" CPutTerm -> flags "transform number" + CTreeBank -> opts "xml" CWrapTerm _ -> opts "c" CApplyTransfer _ -> flags "lang transfer" CMorphoAnalyse -> both "short" "lang" diff --git a/src/GF/UseGrammar/Treebank.hs b/src/GF/UseGrammar/Treebank.hs new file mode 100644 index 000000000..667c323f6 --- /dev/null +++ b/src/GF/UseGrammar/Treebank.hs @@ -0,0 +1,69 @@ +---------------------------------------------------------------------- +-- | +-- Module : Treebank +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- Generate multilingual treebanks. AR 8\/2\/2006 +-- +-- (c) Aarne Ranta 2006 under GNU GPL +-- +-- Purpose: to generate treebanks. +----------------------------------------------------------------------------- + +module GF.UseGrammar.Treebank (mkTreebank) where + + +import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar,firstStateGrammar,allLanguages,allCategories) +import GF.UseGrammar.Linear (linTree2string) +import GF.UseGrammar.Custom +import GF.Canon.CMacros (noMark) +import GF.Grammar.Grammar (Trm) +import GF.Grammar.MMacros (exp2tree) +import GF.Grammar.Macros (zIdent) +import GF.Grammar.PrGrammar (prt_) +import GF.Grammar.Values (tree2exp) +import GF.Data.Operations +import GF.Infra.Option +import qualified GF.Grammar.Abstract as A + +-- Generate a treebank with a multilingual grammar. AR 8/2/2006 +-- (c) Aarne Ranta 2006 under GNU GPL + +-- | the main function +mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> IO () +mkTreebank opts sh com trees = putInXML opts "treebank" comm(mapM_ mkItem trees) + where + mkItem t = putInXML opts "item" cat (mkTree t >>mapM_ (mkLin t) langs) + mkTree t = putInXML opts "tree" [] (putStrLn $ showTree t) + mkLin t lg = putInXML opts "lin" (lang lg) (putStrLn $ linearize sh lg t) + + langs = [prt_ l | l <- allLanguages sh] + comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr + abstr = "" --- "Abs" ---- + cat = "" --- " cat=" ++ show "S" ---- + lang lg = " lang=" ++ show (prt_ (zIdent lg)) + + +putInXML :: Options -> String -> String -> IO () -> IO () +putInXML opts tag attrs io = do + ifXML $ putStrLn $ tagXML $ tag ++ attrs + io + ifXML $ putStrLn $ tagXML $ '/':tag + where + ifXML c = if oElem showXML opts then c else return () + +tagXML :: String -> String +tagXML s = "<" ++ s ++ ">" + +--- these handy functions are borrowed from EmbedAPI + +linearize mgr lang = + untok . + linTree2string noMark (canModules mgr) (zIdent lang) + where + sgr = stateGrammarOfLang mgr (zIdent lang) + untok = customOrDefault noOptions useUntokenizer customUntokenizer sgr + +showTree t = prt_ $ tree2exp t diff --git a/src/HelpFile b/src/HelpFile index 0ff04b25b..8b54e5585 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -200,6 +200,13 @@ at, apply_transfer: at (Module.Fun | Fun) examples: p -lang=Cncdecimal "123" | at num2bin | l -- convert dec to bin +tb, tree_bank: tb + Generate a multilingual treebank from a list of trees. + flags: + -xml wrap the treebank with XML tags + examples: + gr -cat=S -number=100 | tb + tt, test_tokenizer: tt String Show the token list sent to the parser when String is parsed. HINT: can be useful when debugging the parser.