added morphological tags to UD tree output. Tags are give in CncConfiguration, e.g. @N Sg Pl. Default tag is Cat-offset, as defined for each Cat in pgf

This commit is contained in:
Aarne Ranta
2018-12-18 15:59:48 +01:00
parent eb22112178
commit 40e7544a2b

View File

@@ -37,7 +37,7 @@ import qualified Data.Map as Map
--import qualified Data.IntMap as IntMap
import Data.List (intersperse,nub,mapAccumL,find,groupBy,sortBy)
import Data.Ord (comparing)
--import Data.Char (isDigit)
import Data.Char (isDigit)
import Data.Maybe (fromMaybe)
import Text.PrettyPrint
@@ -146,16 +146,16 @@ graphvizDependencyTree format debug mlab mclab pgf lang t =
conll = maybe conll0 (\ls -> fixCoNLL ls conll0) mclab
conll0 = (map.map) render wnodes
nodes = map mkNode leaves
links = map mkLink [(fid, fromMaybe (dep_lbl,nil) (lookup fid deps)) | ((cat,fid,fun),_,w) <- tail leaves]
links = map mkLink [(fid, fromMaybe (dep_lbl,nil) (lookup fid deps)) | ((cat,fid,fun,_),_,w) <- tail leaves]
-- CoNLL format: ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD DEPREL PDEPREL
-- P variants are automatically predicted rather than gold standard
wnodes = [[int i, maltws ws, ppCId fun, ppCId (posCat cat), ppCId cat, unspec, int parent, text lab, unspec, unspec] |
((cat,fid,fun),i,ws) <- tail leaves,
wnodes = [[int i, maltws ws, ppCId fun, ppCId (posCat cat), ppCId cat, int lind, int parent, text lab, unspec, unspec] |
((cat,fid,fun,lind),i,ws) <- tail leaves,
let (lab,parent) = fromMaybe (dep_lbl,0)
(do (lbl,fid) <- lookup fid deps
(_,i,_) <- find (\((_,fid1,_),i,_) -> fid == fid1) leaves
(_,i,_) <- find (\((_,fid1,_,_),i,_) -> fid == fid1) leaves
return (lbl,i))
]
maltws = text . concat . intersperse "+" . words -- no spaces in column 2
@@ -164,7 +164,7 @@ graphvizDependencyTree format debug mlab mclab pgf lang t =
bss = bracketedLinearize pgf lang t
root = (wildCId,nil,wildCId)
root = (wildCId,nil,wildCId,0)
leaves = (root,0,root_lbl) : (groupAndIndexIt 1 . concatMap (getLeaves root)) bss
deps = let (_,(h,deps)) = getDeps 0 [] t []
@@ -183,9 +183,9 @@ graphvizDependencyTree format debug mlab mclab pgf lang t =
getLeaves parent bs =
case bs of
Leaf w -> [(parent,w)]
Bracket cat fid _ fun _ bss -> concatMap (getLeaves (cat,fid,fun)) bss
Bracket cat fid lind fun _ bss -> concatMap (getLeaves (cat,fid,fun,lind)) bss
mkNode ((_,p,_),i,w) =
mkNode ((_,p,_,_),i,w) =
tag p <+> brackets (text "label = " <> doubleQuotes (int i <> char '.' <+> text w)) <+> semi
mkLink (x,(lbl,y)) = tag y <+> text "->" <+> tag x <+> text "[label = " <> doubleQuotes (text lbl) <> text "] ;"
@@ -514,7 +514,7 @@ conll2latex' = dep2latex . conll2dep'
data Dep = Dep {
wordLength :: Int -> Double -- length of word at position int -- was: fixed width, millimetres (>= 20.0)
, tokens :: [(String,String)] -- word, pos (0..)
, tokens :: [(String,(String,String))] -- word, (pos,features) (0..)
, deps :: [((Int,Int),String)] -- from, to, label
, root :: Int -- root word position
}
@@ -554,7 +554,8 @@ dep2latex d =
[Comment (unwords (map fst (tokens d))),
Picture defaultUnit (width,height) (
[Put (wpos rwld i,0) (Text w) | (i,w) <- zip [0..] (map fst (tokens d))] -- words
++ [Put (wpos rwld i,15) (TinyText w) | (i,w) <- zip [0..] (map snd (tokens d))] -- pos tags 15u above bottom
++ [Put (wpos rwld i,15) (TinyText w) | (i,(w,_)) <- zip [0..] (map snd (tokens d))] -- pos tags 15u above bottom
++ [Put (wpos rwld i,-15) (TinyText w) | (i,(_,w)) <- zip [0..] (map snd (tokens d))] -- features 15u below bottom
++ concat [putArc rwld (aheight x y) x y label | ((x,y),label) <- deps d] -- arcs and labels
++ [Put (wpos rwld (root d) + 15,height) (ArrowDown (height-arcbase))]
++ [Put (wpos rwld (root d) + 20,height - 10) (TinyText "ROOT")]
@@ -585,8 +586,8 @@ conll2dep' ls = Dep {
, root = head $ [read x-1 | x:_:_:_:_:_:"0":_ <- ls] ++ [1]
}
where
wld i = maximum (0:[charWidth * fromIntegral (length w) | w <- let (tok,pos) = toks !! i in [tok,pos]])
toks = [(w,c) | _:w:_:c:_ <- ls]
wld i = maximum (0:[charWidth * fromIntegral (length w) | w <- let (tok,(pos,feat)) = toks !! i in [tok,pos,feat]])
toks = [(w,(c,m)) | _:w:_:c:_:m:_ <- ls]
dps = [((read y-1, read x-1),lab) | x:_:_:_:_:_:y:lab:_ <- ls, y /="0"]
--maxdist = maximum [abs (x-y) | ((x,y),_) <- dps]
@@ -751,18 +752,26 @@ ppSVG svg =
-- UseComp {"not"} PART neg head
-- UseComp {*} AUX cop head
type CncLabels = [(String, String -> Maybe (String -> String,String,String))]
-- (fun, word -> (pos,label,target))
-- the pos can remain unchanged, as in the current notation in the article
type CncLabels = [
Either
(String, String -> Maybe (String -> String,String,String))
-- (fun, word -> (pos,label,target))
-- the pos can remain unchanged, as in the current notation in the article
(String,[String])
-- (category, morphological forms)
]
fixCoNLL :: CncLabels -> CoNLL -> CoNLL
fixCoNLL labels conll = map fixc conll where
fixCoNLL cncLabels conll = map fixc conll where
labels = [l | Left l <- cncLabels]
flabels = [r | Right r <- cncLabels]
fixc row = case row of
(i:word:fun:pos:cat:x_:"0":"dep":xs) -> (i:word:fun:pos:cat:x_:"0":"root":xs) --- change the root label from dep to root
(i:word:fun:pos:cat:x_:"0":"dep":xs) -> (i:word:fun:pos:cat:(feat cat x_):"0":"root":xs) --- change the root label from dep to root
(i:word:fun:pos:cat:x_:j:label:xs) -> case look (fun,word) of
Just (pos',label',"head") -> (i:word:fun:pos' pos:cat:x_:j :label':xs)
Just (pos',label',target) -> (i:word:fun:pos' pos:cat:x_: getDep j target:label':xs)
_ -> row
Just (pos',label',"head") -> (i:word:fun:pos' pos:cat:(feat cat x_):j :label':xs)
Just (pos',label',target) -> (i:word:fun:pos' pos:cat:(feat cat x_): getDep j target:label':xs)
_ -> (i:word:fun:pos:cat:(feat cat x_):j:label:xs)
_ -> row
look (fun,word) = case lookup fun labels of
@@ -777,17 +786,36 @@ fixCoNLL labels conll = map fixc conll where
getDep j label = maybe j id $ lookup (label,j) [((label,j),i) | i:word:fun:pos:cat:x_:j:label:xs <- conll]
feat cat x = case lookup cat flabels of
Just tags | all isDigit x && length tags > read x -> tags !! read x
_ -> cat ++ "-" ++ x
getCncDepLabels :: String -> CncLabels
getCncDepLabels =
map merge .
groupBy (\ (x,_) (a,_) -> x == a) .
sortBy (comparing fst) .
concatMap analyse .
filter choose .
lines
getCncDepLabels s = wlabels s ++ flabels s
where
wlabels =
map Left .
map merge .
groupBy (\ (x,_) (a,_) -> x == a) .
sortBy (comparing fst) .
concatMap analyse .
filter chooseW .
lines
flabels =
map Right .
map collectTags .
map words .
filter chooseF .
lines
--- choose is for compatibility with the general notation
choose line = notElem '(' line && elem '{' line --- ignoring non-local (with "(") and abstract (without "{") rules
chooseW line = notElem '(' line &&
elem '{' line &&
--- ignoring non-local (with "(") and abstract (without "{") rules
---- TODO: this means that "(" cannot be a token
not (chooseF line)
chooseF line = take 1 line == "@" --- feature assignments have the form e.g. @N SgNom SgGen ; no spaces inside tags
analyse line = case break (=='{') line of
(beg,_:ws) -> case break (=='}') ws of
@@ -804,7 +832,9 @@ getCncDepLabels =
)
getToks = map unquote . filter (/=",") . toks
toks s = case lex s of [(t,"")] -> [t] ; [(t,cc)] -> t:toks cc ; _ -> []
unquote s = case s of '"':cc@(_:_) | last cc == '"' -> init cc ; _ -> s
unquote s = case s of '"':cc@(_:_) | last cc == '"' -> init cc ; _ -> s
collectTags (w:ws) = (tail w,ws)
printCoNLL :: CoNLL -> String
printCoNLL = unlines . map (concat . intersperse "\t")