forked from GitHub/gf-core
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
This commit is contained in:
251
src-3.0/GF/UseGrammar/Treebank.hs
Normal file
251
src-3.0/GF/UseGrammar/Treebank.hs
Normal file
@@ -0,0 +1,251 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Treebank
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- Generate multilingual treebanks. AR 8\/2\/2006
|
||||
--
|
||||
-- (c) Aarne Ranta 2006 under GNU GPL
|
||||
--
|
||||
-- Purpose: to generate treebanks.
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.UseGrammar.Treebank (
|
||||
mkMultiTreebank,
|
||||
mkUniTreebank,
|
||||
multi2uniTreebank,
|
||||
uni2multiTreebank,
|
||||
testMultiTreebank,
|
||||
treesTreebank,
|
||||
getTreebank,
|
||||
getUniTreebank,
|
||||
readUniTreebanks,
|
||||
readMultiTreebank,
|
||||
lookupTreebank,
|
||||
assocsTreebank,
|
||||
isWordInTreebank,
|
||||
printAssoc,
|
||||
mkCompactTreebank
|
||||
) where
|
||||
|
||||
import GF.Compile.ShellState
|
||||
import GF.UseGrammar.Linear -- (linTree2string)
|
||||
import GF.UseGrammar.Custom
|
||||
import GF.UseGrammar.GetTree (string2tree)
|
||||
import GF.Grammar.TypeCheck (annotate)
|
||||
import GF.Canon.CMacros (noMark)
|
||||
import GF.Grammar.Grammar (Trm)
|
||||
import GF.Grammar.MMacros (exp2tree)
|
||||
import GF.Grammar.Macros (zIdent)
|
||||
import GF.Grammar.PrGrammar (prt_,prt)
|
||||
import GF.Grammar.Values (tree2exp)
|
||||
import GF.Data.Operations
|
||||
import GF.Infra.Option
|
||||
import GF.Infra.Ident (Ident)
|
||||
import GF.Infra.UseIO
|
||||
import qualified GF.Grammar.Abstract as A
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.List as L
|
||||
import Control.Monad (liftM)
|
||||
import System.FilePath
|
||||
|
||||
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
||||
-- (c) Aarne Ranta 2006 under GNU GPL
|
||||
|
||||
-- keys are trees; format: XML file
|
||||
type MultiTreebank = [(String,[(String,String)])] -- tree,lang,lin
|
||||
|
||||
-- keys are strings; format: string TAB tree TAB ... TAB tree
|
||||
type UniTreebank = Treebank -- M.Map String [String] -- string,tree
|
||||
|
||||
-- both formats can be read from both kinds of files
|
||||
readUniTreebanks :: FilePath -> IO [(Ident,UniTreebank)]
|
||||
readUniTreebanks file = do
|
||||
s <- readFileIf file
|
||||
return $ if isMultiTreebank s
|
||||
then multi2uniTreebank $ getTreebank $ lines s
|
||||
else
|
||||
let tb = getUniTreebank $ lines s
|
||||
in [(zIdent (dropExtension file),tb)]
|
||||
|
||||
readMultiTreebank :: FilePath -> IO MultiTreebank
|
||||
readMultiTreebank file = do
|
||||
s <- readFileIf file
|
||||
return $ if isMultiTreebank s
|
||||
then getTreebank $ lines s
|
||||
else uni2multiTreebank (zIdent (dropExtension file)) $ getUniTreebank $ lines s
|
||||
|
||||
isMultiTreebank :: String -> Bool
|
||||
isMultiTreebank s = take 10 s == "<treebank>"
|
||||
|
||||
multi2uniTreebank :: MultiTreebank -> [(Ident,UniTreebank)]
|
||||
multi2uniTreebank mt@((_,lls):_) = [(zIdent la, mkTb la) | (la,_) <- lls] where
|
||||
mkTb la = M.fromListWith (++) [(s,[t]) | (t,lls) <- mt, (l,s) <- lls, l==la]
|
||||
multi2uniTreebank [] = []
|
||||
|
||||
uni2multiTreebank :: Ident -> UniTreebank -> MultiTreebank
|
||||
uni2multiTreebank la tb =
|
||||
[(t,[(prt_ la, s)]) | (s,ts) <- assocsTreebank tb, t <- ts]
|
||||
|
||||
-- | the main functions
|
||||
|
||||
-- builds a treebank where trees are the keys, and writes a file (opt. XML)
|
||||
mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
||||
mkMultiTreebank opts sh com trees
|
||||
| oElem (iOpt "compact") opts = mkCompactTreebank opts sh trees
|
||||
mkMultiTreebank opts sh com trees =
|
||||
putInXML opts "treebank" comm (concatMap mkItem tris) where
|
||||
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
||||
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
||||
mkTree t = putInXML opts "tree" [] (puts $ showTree t)
|
||||
mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize opts sh lg t)
|
||||
|
||||
langs = [prt_ l | l <- allLanguages sh]
|
||||
comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr
|
||||
abstr = "" --- "Abs" ----
|
||||
cat i = " number=" ++ show (show i) --- " cat=" ++ show "S" ----
|
||||
lang lg = " lang=" ++ show (prt_ (zIdent lg))
|
||||
tris = zip trees [1..]
|
||||
|
||||
-- builds a unilingual treebank where strings are the keys into an internal treebank
|
||||
|
||||
mkUniTreebank :: Options -> ShellState -> Language -> [A.Tree] -> Treebank
|
||||
mkUniTreebank opts sh lg trees = M.fromListWith (++) [(lin t, [prt_ t]) | t <- trees]
|
||||
where
|
||||
lang = prt_ lg
|
||||
lin t = linearize opts sh lang t
|
||||
|
||||
-- reads a treebank and linearizes its trees again, printing all differences
|
||||
testMultiTreebank :: Options -> ShellState -> String -> Res
|
||||
testMultiTreebank opts sh = putInXML opts "testtreebank" [] .
|
||||
concatMap testOne .
|
||||
getTreebanks . lines
|
||||
where
|
||||
testOne (e,lang,str0) = do
|
||||
let tr = annot gr e
|
||||
let str = linearize opts sh lang tr
|
||||
if str == str0 then ret else putInXML opts "diff" [] $ concat [
|
||||
putInXML opts "tree" [] (puts $ showTree tr),
|
||||
putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0,
|
||||
putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str
|
||||
]
|
||||
gr = firstStateGrammar sh
|
||||
|
||||
-- writes all the trees of the treebank
|
||||
treesTreebank :: Options -> String -> [String]
|
||||
treesTreebank _ = terms . getTreebank . lines where
|
||||
terms ts = [t | (t,_) <- ts]
|
||||
|
||||
-- string vs. IO
|
||||
type Res = [String] -- IO ()
|
||||
puts :: String -> Res
|
||||
puts = return -- putStrLn
|
||||
ret = [] -- return ()
|
||||
--
|
||||
|
||||
-- here strings are keys
|
||||
assocsTreebank :: UniTreebank -> [(String,[String])]
|
||||
assocsTreebank = M.assocs
|
||||
|
||||
isWordInTreebank :: UniTreebank -> String -> Bool
|
||||
isWordInTreebank tb w = S.member w (S.fromList (concatMap words (M.keys tb)))
|
||||
|
||||
printAssoc (s, ts) = s ++ concat ["\t" ++ t | t <- ts]
|
||||
|
||||
getTreebanks :: [String] -> [(String,String,String)]
|
||||
getTreebanks = concatMap grps . getTreebank where
|
||||
grps (t,lls) = [(t,x,y) | (x,y) <- lls]
|
||||
|
||||
getTreebank :: [String] -> MultiTreebank
|
||||
getTreebank ll = case ll of
|
||||
l:ls@(_:_:_) ->
|
||||
let (l1,l2) = getItem ls
|
||||
(tr,lins) = getTree l1
|
||||
lglins = getLins lins
|
||||
in (tr,lglins) : getTreebank l2
|
||||
_ -> []
|
||||
where
|
||||
getItem = span ((/="</item") . take 6)
|
||||
|
||||
getTree (_:ss) =
|
||||
let (t1,t2) = span ((/="</tree") . take 6) ss in (last t1, drop 1 t2)
|
||||
|
||||
getLins (beg:str:end:ss) = (getLang beg, str):getLins ss
|
||||
getLins _ = []
|
||||
|
||||
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
|
||||
|
||||
getUniTreebank :: [String] -> UniTreebank
|
||||
getUniTreebank ls = M.fromListWith (++) [(s, ts) | s:ts <- map chop ls] where
|
||||
chop = chunks '\t'
|
||||
|
||||
lookupTreebank :: Treebank -> String -> [String]
|
||||
lookupTreebank tb s = maybe [] id $ M.lookup s tb
|
||||
|
||||
annot :: StateGrammar -> String -> A.Tree
|
||||
annot gr s = errVal (error "illegal tree") $ do
|
||||
let t = tree2exp $ string2tree gr s
|
||||
annotate (grammar gr) t
|
||||
|
||||
putInXML :: Options -> String -> String -> Res -> Res
|
||||
putInXML opts tag attrs io =
|
||||
(ifXML $ puts $ tagXML $ tag ++ attrs) ++
|
||||
io ++
|
||||
(ifXML $ puts $ tagXML $ '/':tag)
|
||||
where
|
||||
ifXML c = if oElem showXML opts then c else []
|
||||
|
||||
|
||||
tagXML :: String -> String
|
||||
tagXML s = "<" ++ s ++ ">"
|
||||
|
||||
-- print the treebank in a compact format:
|
||||
-- first a sorted list of all words, referrable by index
|
||||
-- then the linearization of each tree, as sequences of word indices
|
||||
-- this format is usable in embedded translation systems.
|
||||
|
||||
mkCompactTreebank :: Options -> ShellState -> [A.Tree] -> [String]
|
||||
mkCompactTreebank opts sh = printCompactTreebank . mkJustMultiTreebank opts sh
|
||||
|
||||
printCompactTreebank :: (MultiTreebank,[String]) -> [String]
|
||||
printCompactTreebank (tb,lgs) = (stat:langs:unwords ws : "\n" : linss) where
|
||||
ws = L.sort $ L.nub $ concat $ map (concatMap (words . snd) . snd) tb
|
||||
|
||||
linss = map (unwords . pad) linss0
|
||||
linss0 = map (map (show . encode) . words) allExs
|
||||
allExs = concat [[snd (ls !! i) | (_,ls) <- tb] | i <- [0..length lgs - 1]]
|
||||
encode w = maybe undefined id $ M.lookup w wmap
|
||||
wmap = M.fromAscList $ zip ws [1..]
|
||||
stat = unwords $ map show [length ws, length lgs, length tb, smax]
|
||||
langs = unwords lgs
|
||||
smax = maximum $ map length linss0
|
||||
pad ws = ws ++ replicate (smax - length ws) "0"
|
||||
|
||||
-- [(String,[(String,String)])] -- tree,lang,lin
|
||||
mkJustMultiTreebank :: Options -> ShellState -> [A.Tree] -> (MultiTreebank,[String])
|
||||
mkJustMultiTreebank opts sh ts =
|
||||
([(prt_ t, [(la, lin la t) | la <- langs]) | t <- ts],langs) where
|
||||
langs = map prt_ $ allLanguages sh
|
||||
lin = linearize opts sh
|
||||
|
||||
|
||||
--- these handy functions are borrowed from EmbedAPI
|
||||
|
||||
linearize opts mgr lang = lin where
|
||||
sgr = stateGrammarOfLangOpt False mgr zlang
|
||||
cgr = canModules mgr
|
||||
zlang = zIdent lang
|
||||
untok = customOrDefault (addOptions opts (stateOptions sgr)) useUntokenizer customUntokenizer sgr
|
||||
lin
|
||||
| oElem showRecord opts = err id id . liftM prt . linearizeNoMark cgr zlang
|
||||
| oElem tableLin opts =
|
||||
err id id . liftM (unlines . map untok . prLinTable True) . allLinTables True cgr zlang
|
||||
| oElem showAll opts =
|
||||
err id id . liftM (unlines . map untok . prLinTable False) . allLinTables False cgr zlang
|
||||
|
||||
| otherwise = untok . linTree2string noMark cgr zlang
|
||||
|
||||
showTree t = prt_ $ tree2exp t
|
||||
Reference in New Issue
Block a user