forked from GitHub/gf-core
143 lines
4.4 KiB
Haskell
143 lines
4.4 KiB
Haskell
----------------------------------------------------------------------
|
|
-- |
|
|
-- Module : Treebank
|
|
-- Maintainer : AR
|
|
-- Stability : (stable)
|
|
-- Portability : (portable)
|
|
--
|
|
-- Generate multilingual treebanks. AR 8\/2\/2006
|
|
--
|
|
-- (c) Aarne Ranta 2006 under GNU GPL
|
|
--
|
|
-- Purpose: to generate treebanks.
|
|
-----------------------------------------------------------------------------
|
|
|
|
module GF.UseGrammar.Treebank (
|
|
mkTreebank,
|
|
testTreebank,
|
|
treesTreebank,
|
|
getTreebank,
|
|
lookupTreebank,
|
|
pre2treebank
|
|
) where
|
|
|
|
import GF.Compile.ShellState
|
|
import GF.UseGrammar.Linear (linTree2string)
|
|
import GF.UseGrammar.Custom
|
|
import GF.UseGrammar.GetTree (string2tree)
|
|
import GF.Grammar.TypeCheck (annotate)
|
|
import GF.Canon.CMacros (noMark)
|
|
import GF.Grammar.Grammar (Trm)
|
|
import GF.Grammar.MMacros (exp2tree)
|
|
import GF.Grammar.Macros (zIdent)
|
|
import GF.Grammar.PrGrammar (prt_)
|
|
import GF.Grammar.Values (tree2exp)
|
|
import GF.Data.Operations
|
|
import GF.Infra.Option
|
|
import qualified GF.Grammar.Abstract as A
|
|
import qualified Data.Map as M
|
|
|
|
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
|
-- (c) Aarne Ranta 2006 under GNU GPL
|
|
|
|
-- | the main functions
|
|
mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
|
mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris)
|
|
where
|
|
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
|
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
|
mkTree t = putInXML opts "tree" [] (puts $ showTree t)
|
|
mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize sh lg t)
|
|
|
|
langs = [prt_ l | l <- allLanguages sh]
|
|
comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr
|
|
abstr = "" --- "Abs" ----
|
|
cat i = " number=" ++ show (show i) --- " cat=" ++ show "S" ----
|
|
lang lg = " lang=" ++ show (prt_ (zIdent lg))
|
|
tris = zip trees [1..]
|
|
|
|
testTreebank :: Options -> ShellState -> String -> Res
|
|
testTreebank opts sh = putInXML opts "testtreebank" [] .
|
|
concatMap testOne .
|
|
getTreebanks . lines
|
|
where
|
|
testOne (e,lang,str0) = do
|
|
let tr = annot gr e
|
|
let str = linearize sh lang tr
|
|
if str == str0 then ret else putInXML opts "diff" [] $ concat [
|
|
putInXML opts "tree" [] (puts $ showTree tr),
|
|
putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0,
|
|
putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str
|
|
]
|
|
gr = firstStateGrammar sh
|
|
|
|
treesTreebank :: Options -> String -> [String]
|
|
treesTreebank _ = terms . getTreebank . lines where
|
|
terms ts = [t | (t,_) <- ts]
|
|
|
|
-- string vs. IO
|
|
type Res = [String] -- IO ()
|
|
puts :: String -> Res
|
|
puts = return -- putStrLn
|
|
ret = [] -- return ()
|
|
--
|
|
|
|
type PreTreebank = [(String,[(String,String)])]
|
|
|
|
getTreebanks :: [String] -> [(String,String,String)]
|
|
getTreebanks = concatMap grps . getTreebank where
|
|
grps (t,lls) = [(t,x,y) | (x,y) <- lls]
|
|
|
|
getTreebank :: [String] -> PreTreebank
|
|
getTreebank ll = case ll of
|
|
l:ls@(_:_:_) ->
|
|
let (l1,l2) = getItem ls
|
|
(tr,lins) = getTree l1
|
|
lglins = getLins lins
|
|
in (tr,lglins) : getTreebank l2
|
|
_ -> []
|
|
where
|
|
getItem = span ((/="</item") . take 6)
|
|
|
|
getTree (_:ss) =
|
|
let (t1,t2) = span ((/="</tree") . take 6) ss in (last t1, drop 1 t2)
|
|
|
|
getLins (beg:str:end:ss) = (getLang beg, str):getLins ss
|
|
getLins _ = []
|
|
|
|
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
|
|
|
|
lookupTreebank :: Treebank -> String -> [(String,String)]
|
|
lookupTreebank tb s = maybe [] id $ M.lookup s tb
|
|
|
|
pre2treebank :: PreTreebank -> Treebank
|
|
pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls]
|
|
|
|
annot :: StateGrammar -> String -> A.Tree
|
|
annot gr s = errVal (error "illegal tree") $ do
|
|
let t = tree2exp $ string2tree gr s
|
|
annotate (grammar gr) t
|
|
|
|
putInXML :: Options -> String -> String -> Res -> Res
|
|
putInXML opts tag attrs io =
|
|
(ifXML $ puts $ tagXML $ tag ++ attrs) ++
|
|
io ++
|
|
(ifXML $ puts $ tagXML $ '/':tag)
|
|
where
|
|
ifXML c = if oElem showXML opts then c else []
|
|
|
|
|
|
tagXML :: String -> String
|
|
tagXML s = "<" ++ s ++ ">"
|
|
|
|
--- these handy functions are borrowed from EmbedAPI
|
|
|
|
linearize mgr lang =
|
|
untok .
|
|
linTree2string noMark (canModules mgr) (zIdent lang)
|
|
where
|
|
sgr = stateGrammarOfLangOpt False mgr (zIdent lang)
|
|
untok = customOrDefault (stateOptions sgr) useUntokenizer customUntokenizer sgr
|
|
|
|
showTree t = prt_ $ tree2exp t
|