Add MaltParser/CoNLL (.conll) output format to the vd command (flag -output=malt)

This commit is contained in:
aarne
2009-10-20 13:04:33 +00:00
parent 499ac428d3
commit e82b6e02e9
2 changed files with 39 additions and 12 deletions

View File

@@ -569,7 +569,8 @@ allCommands cod env@(pgf, mos) = Map.fromList [
longname = "visualize_dependency", longname = "visualize_dependency",
synopsis = "show word dependency tree graphically", synopsis = "show word dependency tree graphically",
explanation = unlines [ explanation = unlines [
"Prints a dependency tree the .dot format (the graphviz format).", "Prints a dependency tree in the .dot format (the graphviz format, default)",
"or the MaltParser/CoNLL format (flag -output=malt)",
"By default, the last argument is the head of every abstract syntax", "By default, the last argument is the head of every abstract syntax",
"function; moreover, the head depends on the head of the function above.", "function; moreover, the head depends on the head of the function above.",
"The graph can be saved in a file by the wf command as usual.", "The graph can be saved in a file by the wf command as usual.",
@@ -581,21 +582,21 @@ allCommands cod env@(pgf, mos) = Map.fromList [
exec = \opts es -> do exec = \opts es -> do
let debug = isOpt "v" opts let debug = isOpt "v" opts
let file = valStrOpts "file" "" opts let file = valStrOpts "file" "" opts
let outp = valStrOpts "output" "dot" opts
mlab <- case file of mlab <- case file of
"" -> return Nothing "" -> return Nothing
_ -> readFile file >>= return . Just . getDepLabels . lines _ -> readFile file >>= return . Just . getDepLabels . lines
let lang = optLang opts let lang = optLang opts
let grph = if null es then [] else let grphs = unlines $ map (dependencyTree outp debug mlab Nothing pgf lang) es
dependencyTree debug mlab Nothing pgf lang (head es)
if isFlag "view" opts || isFlag "format" opts then do if isFlag "view" opts || isFlag "format" opts then do
let file s = "_grph." ++ s let file s = "_grphd." ++ s
let view = optViewGraph opts ++ " " let view = optViewGraph opts ++ " "
let format = optViewFormat opts let format = optViewFormat opts
writeFile (file "dot") (enc grph) writeFile (file "dot") (enc grphs)
system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++ system $ "dot -T" ++ format ++ " " ++ file "dot" ++ " > " ++ file format ++
" ; " ++ view ++ file format " ; " ++ view ++ file format
return void return void
else return $ fromString grph, else return $ fromString grphs,
examples = [ examples = [
"gr | aw -- generate a tree and show word alignment as graph script", "gr | aw -- generate a tree and show word alignment as graph script",
"gr | vt -view=\"open\" -- generate a tree and display alignment on a Mac" "gr | vt -view=\"open\" -- generate a tree and display alignment on a Mac"
@@ -606,6 +607,7 @@ allCommands cod env@(pgf, mos) = Map.fromList [
flags = [ flags = [
("file","configuration file for labels per fun, format 'fun l1 ... label ... l2'"), ("file","configuration file for labels per fun, format 'fun l1 ... label ... l2'"),
("format","format of the visualization file (default \"png\")"), ("format","format of the visualization file (default \"png\")"),
("output","output format of graph source (default \"dot\")"),
("view","program to open the resulting file (default \"open\")") ("view","program to open the resulting file (default \"open\")")
] ]
}), }),

View File

@@ -60,10 +60,16 @@ prGraph digr ns = concat $ map (++"\n") $ [graph ++ "{\n"] ++ ns ++ ["}"] where
-- dependency trees from Linearize.linearizeMark -- dependency trees from Linearize.linearizeMark
dependencyTree :: Bool -> Maybe Labels -> Maybe String -> PGF -> CId -> Expr -> String dependencyTree :: String -> Bool -> Maybe Labels -> Maybe String -> PGF -> CId -> Expr -> String
dependencyTree debug mlab ms pgf lang exp = prGraph True lin2dep where dependencyTree format debug mlab ms pgf lang exp = case format of
"malt" -> unlines (lin2dep format)
_ -> prGraph True (lin2dep format)
lin2dep = trace (ifd (show sortedNodes ++ show nodeWords)) $ prelude ++ nodes ++ links where
lin2dep format = trace (ifd (show sortedNodes ++ show nodeWords)) $ case format of
"malt" -> map (concat . intersperse "\t") wnodes
_ -> prelude ++ nodes ++ links
ifd s = if debug then s else [] ifd s = if debug then s else []
@@ -78,8 +84,8 @@ dependencyTree debug mlab ms pgf lang exp = prGraph True lin2dep where
nodeWords = (0,((mkCId "",[]),["ROOT"])) : zip [1..] [((f,p),w)| nodeWords = (0,((mkCId "",[]),["ROOT"])) : zip [1..] [((f,p),w)|
((Just f,p),w) <- wlins pot] ((Just f,p),w) <- wlins pot]
links = map mkLink links = map mkLink thelinks
[(word y, x, label tr y x) | thelinks = [(word y, x, label tr y x) |
(_,((f,x),_)) <- tail nodeWords, (_,((f,x),_)) <- tail nodeWords,
let y = dominant x] let y = dominant x]
mkLink (x,y,l) = node x ++ " -> " ++ node y ++ " [label = \"" ++ l ++ "\"] ;" mkLink (x,y,l) = node x ++ " -> " ++ node y ++ " [label = \"" ++ l ++ "\"] ;"
@@ -120,13 +126,32 @@ dependencyTree debug mlab ms pgf lang exp = prGraph True lin2dep where
Just ls | length ls > i -> ifd (showCId f ++ "#" ++ show i ++ "=") ++ ls !! i Just ls | length ls > i -> ifd (showCId f ++ "#" ++ show i ++ "=") ++ ls !! i
_ -> showCId f ++ "#" ++ show i _ -> showCId f ++ "#" ++ show i
-- to generate CoNLL format for MaltParser
nodeMap :: Map.Map [Int] Int
nodeMap = Map.fromList [(p,i) | (i,((_,p),_)) <- nodeWords]
arcMap :: Map.Map [Int] ([Int],String)
arcMap = Map.fromList [(y,(x,l)) | (x,y,l) <- thelinks]
lookDomLab p = case Map.lookup p arcMap of
Just (q,l) -> (maybe 0 id (Map.lookup q nodeMap), if null l then "_" else l)
_ -> (0,unspec)
wnodes = [[show i, unwords ws, showCId fun, pos, pos, morph, show dom, lab, unspec, unspec] |
(i, ((fun,p),ws)) <- tail nodeWords,
let pos = showCId $ lookValCat pgf fun,
let morph = unspec,
let (dom,lab) = lookDomLab p
]
unspec = "_"
type Labels = Map.Map CId [String] type Labels = Map.Map CId [String]
getDepLabels :: [String] -> Labels getDepLabels :: [String] -> Labels
getDepLabels ss = Map.fromList [(mkCId f,ls) | f:ls <- map words ss] getDepLabels ss = Map.fromList [(mkCId f,ls) | f:ls <- map words ss]
-- parse trees from Linearize.linearizeMark -- parse trees from Linearize.linearizeMark
---- nubrec and domins are quadratic, but could be (n log n) ---- nubrec and domins are quadratic, but could be (n log n)