forked from GitHub/gf-core
compact treebank format for translation systems
This commit is contained in:
@@ -26,7 +26,8 @@ module GF.UseGrammar.Treebank (
|
|||||||
lookupTreebank,
|
lookupTreebank,
|
||||||
assocsTreebank,
|
assocsTreebank,
|
||||||
isWordInTreebank,
|
isWordInTreebank,
|
||||||
printAssoc
|
printAssoc,
|
||||||
|
mkCompactTreebank
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import GF.Compile.ShellState
|
import GF.Compile.ShellState
|
||||||
@@ -47,6 +48,7 @@ import GF.Infra.UseIO
|
|||||||
import qualified GF.Grammar.Abstract as A
|
import qualified GF.Grammar.Abstract as A
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
import qualified Data.Set as S
|
import qualified Data.Set as S
|
||||||
|
import qualified Data.List as L
|
||||||
import Control.Monad (liftM)
|
import Control.Monad (liftM)
|
||||||
|
|
||||||
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
||||||
@@ -91,8 +93,10 @@ uni2multiTreebank la tb =
|
|||||||
|
|
||||||
-- builds a treebank where trees are the keys, and writes a file (opt. XML)
|
-- builds a treebank where trees are the keys, and writes a file (opt. XML)
|
||||||
mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
||||||
mkMultiTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris)
|
mkMultiTreebank opts sh com trees
|
||||||
where
|
| oElem (iOpt "compact") opts = mkCompactTreebank opts sh trees
|
||||||
|
mkMultiTreebank opts sh com trees =
|
||||||
|
putInXML opts "treebank" comm (concatMap mkItem tris) where
|
||||||
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
||||||
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
||||||
mkTree t = putInXML opts "tree" [] (puts $ showTree t)
|
mkTree t = putInXML opts "tree" [] (puts $ showTree t)
|
||||||
@@ -197,6 +201,29 @@ putInXML opts tag attrs io =
|
|||||||
tagXML :: String -> String
|
tagXML :: String -> String
|
||||||
tagXML s = "<" ++ s ++ ">"
|
tagXML s = "<" ++ s ++ ">"
|
||||||
|
|
||||||
|
-- print the treebank in a compact format:
|
||||||
|
-- first a sorted list of all words, referrable by index
|
||||||
|
-- then the linearization of each tree, as sequences of word indices
|
||||||
|
-- this format is usable in embedded translation systems.
|
||||||
|
|
||||||
|
mkCompactTreebank :: Options -> ShellState -> [A.Tree] -> [String]
|
||||||
|
mkCompactTreebank opts sh = printCompactTreebank . mkJustMultiTreebank opts sh
|
||||||
|
|
||||||
|
printCompactTreebank :: MultiTreebank -> [String]
|
||||||
|
printCompactTreebank tb = (unwords ws : "\n" : map lins tb) where
|
||||||
|
ws = L.sort $ L.nub $ concat $ map (concatMap (words . snd) . snd) tb
|
||||||
|
lins (_,ls) = unlines [unwords (map encode (words ws)) | (_,ws) <- ls]
|
||||||
|
encode w = maybe undefined id $ M.lookup w wmap
|
||||||
|
wmap = M.fromAscList $ zip ws (map show [0..])
|
||||||
|
|
||||||
|
-- [(String,[(String,String)])] -- tree,lang,lin
|
||||||
|
mkJustMultiTreebank :: Options -> ShellState -> [A.Tree] -> MultiTreebank
|
||||||
|
mkJustMultiTreebank opts sh ts =
|
||||||
|
[(prt_ t, [(la, lin la t) | la <- langs]) | t <- ts] where
|
||||||
|
langs = map prt_ $ allLanguages sh
|
||||||
|
lin = linearize opts sh
|
||||||
|
|
||||||
|
|
||||||
--- these handy functions are borrowed from EmbedAPI
|
--- these handy functions are borrowed from EmbedAPI
|
||||||
|
|
||||||
linearize opts mgr lang = lin where
|
linearize opts mgr lang = lin where
|
||||||
|
|||||||
Reference in New Issue
Block a user