testing treebanks

This commit is contained in:
aarne
2006-02-08 21:58:36 +00:00
parent a1e79f3634
commit 676058581d
9 changed files with 116 additions and 28 deletions

View File

@@ -14,6 +14,15 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
<p>
8/2 (AR) The command <tt>tb = tree_bank</tt> for creating and testing against
multilingual treebanks. Example uses:
<pre>
gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file
rf tb.txt | tb -c -- read comparison treebank from file
</pre>
<p>
10/1 (AR) Forbade variable binding inside negation and Kleene star
patterns.

View File

@@ -1,4 +1,4 @@
all:
gslt:
txt2tags gslt-sem-2006.txt
htmls gslt-sem-2006.html

View File

@@ -7,7 +7,7 @@
<P ALIGN="center"><CENTER><H1>Grammars as Software Libraries</H1>
<FONT SIZE="4">
<I>Author: Aarne Ranta &lt;aarne (at) cs.chalmers.se&gt;</I><BR>
Last update: Mon Jan 30 10:53:18 2006
Last update: Wed Feb 8 19:06:27 2006
</FONT></CENTER>
<P>
@@ -64,6 +64,7 @@ Student projects on grammar libraries:
<UL>
<LI>Inger Andersson &amp; Therese Söderberg: Spanish morphology
<LI>Ludmilla Bogavac: Russian morphology
<LI>Karin Cavallin: comparison with Svenska Akademins Grammatik
<LI>Ali El Dada: Arabic morphology and syntax
<LI>Muhammad Humayoun: Urdu morphology
<LI>Michael Pellauer: Estonian morphology
@@ -79,6 +80,21 @@ Technology, also:
<LI>Peter Ljunglöf
</UL>
<P>
Various grammar library contributions from the multilingual Chalmers comminity:
</P>
<UL>
<LI>Koen Claessen, Carlos Gonzalía, Patrik Jansson, Wojciech Mostowski, Karol Ostrovsky,
David Wahlstedt
</UL>
<P>
Resource library patches and suggestions from the WebALT staff:
</P>
<UL>
<LI>Lauri Carlson, Glòria Casanellas, Anni Laine, Wanjiku Ng'ang'a, Jordi Saludes
</UL>
<P>
<!-- NEW -->
</P>

View File

@@ -47,6 +47,7 @@ Staff contributions to grammar libraries:
Student projects on grammar libraries:
- Inger Andersson & Therese Söderberg: Spanish morphology
- Ludmilla Bogavac: Russian morphology
- Karin Cavallin: comparison with Svenska Akademins Grammatik
- Ali El Dada: Arabic morphology and syntax
- Muhammad Humayoun: Urdu morphology
- Michael Pellauer: Estonian morphology
@@ -59,6 +60,15 @@ Technology, also:
- Peter Ljunglöf
Various grammar library contributions from the multilingual Chalmers comminity:
- Koen Claessen, Carlos Gonzalía, Patrik Jansson, Wojciech Mostowski, Karol Ostrovsky,
David Wahlstedt
Resource library patches and suggestions from the WebALT staff:
- Lauri Carlson, Glòria Casanellas, Anni Laine, Wanjiku Ng'ang'a, Jordi Saludes
#NEW
==Software Libraries==

View File

@@ -290,10 +290,13 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
_ -> Nothing
returnArg (ATrms $ generateTrees opts gro mt) sa
CTreeBank | oElem doCompute opts -> do -- -c
let bank = prCommandArg a
returnArg (AString $ unlines $ testTreebank opts st bank) sa
CTreeBank -> do
let ts = strees $ s2t $ snd sa
comm = "command" ----
justOutput opts (mkTreebank opts st comm ts) sa
returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa
CShowTreeGraph | oElem emitCode opts -> do -- -o
returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa

View File

@@ -230,11 +230,14 @@ txtHelpFile =
"\n p -lang=Cncdecimal \"123\" | at num2bin | l -- convert dec to bin" ++
"\n" ++
"\ntb, tree_bank: tb" ++
"\n Generate a multilingual treebank from a list of trees." ++
"\n flags:" ++
"\n -xml wrap the treebank with XML tags" ++
"\n Generate a multilingual treebank from a list of trees (default) or compare" ++
"\n to an existing treebank." ++
"\n options:" ++
"\n -c compare to existing xml-formatted treebank" ++
"\n -xml wrap the treebank (or comparison results) with XML tags" ++
"\n examples:" ++
"\n gr -cat=S -number=100 | tb" ++
"\n gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file" ++
"\n rf tb.txt | tb -c -- read comparison treebank from file" ++
"\n" ++
"\ntt, test_tokenizer: tt String" ++
"\n Show the token list sent to the parser when String is parsed." ++

View File

@@ -183,7 +183,7 @@ optionsOfCommand co = case co of
CGenerateRandom -> both "cf prob" "cat lang number depth"
CGenerateTrees -> both "metas" "atoms depth alts cat lang number"
CPutTerm -> flags "transform number"
CTreeBank -> opts "xml"
CTreeBank -> opts "c xml"
CWrapTerm _ -> opts "c"
CApplyTransfer _ -> flags "lang transfer"
CMorphoAnalyse -> both "short" "lang"

View File

@@ -12,12 +12,13 @@
-- Purpose: to generate treebanks.
-----------------------------------------------------------------------------
module GF.UseGrammar.Treebank (mkTreebank) where
module GF.UseGrammar.Treebank (mkTreebank,testTreebank) where
import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar,firstStateGrammar,allLanguages,allCategories)
import GF.Compile.ShellState
import GF.UseGrammar.Linear (linTree2string)
import GF.UseGrammar.Custom
import GF.UseGrammar.GetTree (string2tree)
import GF.Grammar.TypeCheck (annotate)
import GF.Canon.CMacros (noMark)
import GF.Grammar.Grammar (Trm)
import GF.Grammar.MMacros (exp2tree)
@@ -31,13 +32,14 @@ import qualified GF.Grammar.Abstract as A
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
-- (c) Aarne Ranta 2006 under GNU GPL
-- | the main function
mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> IO ()
mkTreebank opts sh com trees = putInXML opts "treebank" comm(mapM_ mkItem tris)
-- | the main functions
mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris)
where
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
mkTree t = putInXML opts "tree" [] (putStrLn $ showTree t)
mkLin t lg = putInXML opts "lin" (lang lg) (putStrLn $ linearize sh lg t)
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
mkTree t = putInXML opts "tree" [] (puts $ showTree t)
mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize sh lg t)
langs = [prt_ l | l <- allLanguages sh]
comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr
@@ -46,14 +48,56 @@ mkTreebank opts sh com trees = putInXML opts "treebank" comm(mapM_ mkItem tris)
lang lg = " lang=" ++ show (prt_ (zIdent lg))
tris = zip trees [1..]
putInXML :: Options -> String -> String -> IO () -> IO ()
putInXML opts tag attrs io = do
ifXML $ putStrLn $ tagXML $ tag ++ attrs
io
ifXML $ putStrLn $ tagXML $ '/':tag
testTreebank :: Options -> ShellState -> String -> Res
testTreebank opts sh = putInXML opts "diff" [] . concatMap testOne . getTreebank . lines
where
ifXML c = if oElem showXML opts then c else return ()
testOne (e,lang,str) = do
let tr = annot gr e
let str0 = linearize sh lang tr
if str == str0 then ret else putInXML opts "diff" [] $ do
putInXML opts "tree" [] (puts $ showTree tr)
putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0
putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str
gr = firstStateGrammar sh
-- string vs. IO
type Res = [String] -- IO ()
puts :: String -> Res
puts = return -- putStrLn
ret = [] -- return ()
--
getTreebank :: [String] -> [(String,String,String)]
getTreebank ll = case ll of
[] -> []
l:ls ->
let (l1,l2) = getItem ls
(tr,lins) = getTree l1
lglins = getLins lins
in [(tr,lang,str) | (lang,str) <- lglins] ++ getTreebank l2
where
getItem = span ((/="</item") . take 6)
getTree (_:ss) = let (t1,t2) = span ((/="</tree") . take 6) ss in (last t1, drop 1 t2)
getLins (beg:str:end:ss) = (getLang beg, str):getLins ss
getLins _ = []
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
annot :: StateGrammar -> String -> A.Tree
annot gr s = errVal (error "illegal tree") $ do
let t = tree2exp $ string2tree gr s
annotate (grammar gr) t
putInXML :: Options -> String -> String -> Res -> Res
putInXML opts tag attrs io =
(ifXML $ puts $ tagXML $ tag ++ attrs) ++
io ++
(ifXML $ puts $ tagXML $ '/':tag)
where
ifXML c = if oElem showXML opts then c else []
tagXML :: String -> String
tagXML s = "<" ++ s ++ ">"

View File

@@ -201,11 +201,14 @@ at, apply_transfer: at (Module.Fun | Fun)
p -lang=Cncdecimal "123" | at num2bin | l -- convert dec to bin
tb, tree_bank: tb
Generate a multilingual treebank from a list of trees.
flags:
-xml wrap the treebank with XML tags
Generate a multilingual treebank from a list of trees (default) or compare
to an existing treebank.
options:
-c compare to existing xml-formatted treebank
-xml wrap the treebank (or comparison results) with XML tags
examples:
gr -cat=S -number=100 | tb
gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file
rf tb.txt | tb -c -- read comparison treebank from file
tt, test_tokenizer: tt String
Show the token list sent to the parser when String is parsed.