mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-31 05:38:55 -06:00
added the GF version of Talbanken which was imported by Malin
This commit is contained in:
106
treebanks/talbanken/Format.hs
Normal file
106
treebanks/talbanken/Format.hs
Normal file
@@ -0,0 +1,106 @@
|
||||
{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-}
|
||||
module Format where
|
||||
import Prelude hiding (words,id)
|
||||
import Text.XML.HXT.Core
|
||||
import Data.Ord
|
||||
import Data.List hiding (words)
|
||||
import Data.Maybe
|
||||
import qualified Data.Map as M
|
||||
import System.Environment
|
||||
import Debug.Trace as Debug
|
||||
import Data.Char
|
||||
import qualified Data.Tree as T
|
||||
import Control.Monad.State
|
||||
|
||||
-- Functions for parsing XML to a format read by the other Haskell files
|
||||
|
||||
data Word = W {id :: Id, word :: String, pos :: Tag}
|
||||
data PhrTag = Ph {idPh :: Id, cat :: Tag, tags :: [(Tag,Id)]}
|
||||
data Sentence = Sent {rootS :: Id, words :: [Word], info :: [PhrTag], ws :: Int}
|
||||
type Tag = String
|
||||
type Id = String
|
||||
|
||||
|
||||
instance XmlPickler [Word] where
|
||||
xpickle = xpWords
|
||||
|
||||
instance XmlPickler Sentence where
|
||||
xpickle = xpSentence
|
||||
|
||||
xpSentences :: PU [Sentence]
|
||||
xpSentences = xpElem "corpus"
|
||||
$ xpWrap (snd, \a -> ((),a))
|
||||
$ xpPair (xpElem "head" $ xpUnit) (xpElem "body" $ xpList $ xpSentence)
|
||||
xpTags :: PU [PhrTag]
|
||||
xpTags = xpList $ xpElem "nt"
|
||||
$ xpWrap (uncurry3 Ph,\p -> (idPh p,cat p,tags p))
|
||||
$ xpTriple (xpAttr "id" xpText) (xpAttr "cat" xpText)
|
||||
(xpList $ xpTagMap)
|
||||
|
||||
xpTagMap :: PU (Tag,String)
|
||||
xpTagMap = xpElem "edge"
|
||||
$ xpPair (xpAttr "label" xpText)
|
||||
(xpAttr "idref" xpText)
|
||||
|
||||
xpSentence :: PU Sentence
|
||||
xpSentence = xpElem "s"
|
||||
$ xpWrap (makeSentence,\s -> (rootS s,words s, info s))
|
||||
$ xpElem "graph"
|
||||
$ xpTriple (xpAttr "root" xpText)
|
||||
( xpElem "terminals" xpWords)
|
||||
( xpElem "nonterminals" xpTags)
|
||||
where makeSentence (r,ws,tgs) = Sent r ws tgs (length ws)
|
||||
|
||||
|
||||
xpWords :: PU [Word]
|
||||
xpWords = xpList $ xpElem "t"
|
||||
$ xpWrap (uncurry3 W,\t -> (id t, word t,pos t))
|
||||
$ xpTriple (xpAttr "id" xpText)
|
||||
(xpAttr "word" xpText)
|
||||
(xpAttr "pos" xpText)
|
||||
|
||||
mainF src =
|
||||
runX (xunpickleDocument xpSentences [withInputEncoding utf8
|
||||
, withRemoveWS yes] src
|
||||
>>> arrIO (putStrLn . unlines . map (show . toTree)))
|
||||
|
||||
runPickle f src =
|
||||
runX (xunpickleDocument xpSentences [withInputEncoding utf8
|
||||
, withRemoveWS yes] src
|
||||
>>> arrIO (return . map f))
|
||||
|
||||
parse = runPickle toStringTree
|
||||
parseIdTree = runPickle toTree
|
||||
|
||||
toStringTree :: Sentence -> (String,T.Tree String)
|
||||
toStringTree = second (fmap snd) . toTree
|
||||
|
||||
toTree :: Sentence -> (String,T.Tree (Id,String))
|
||||
toTree s@(Sent root ws inf _) = (root,toTree' root s)
|
||||
|
||||
toTree' :: String -> Sentence -> T.Tree (Id,String)
|
||||
toTree' nr s@(Sent root ws inf _) =
|
||||
case (lookup' nr ws,lookup'' nr inf) of
|
||||
(Just w,_) -> putWord w
|
||||
(_,Just p) -> putPhrase p
|
||||
_ -> error $ "Error in toTree' "++show nr++" could not be found"
|
||||
where putWord (W i w p) = T.Node (i,p) [T.Node (i,w) []]
|
||||
putPhrase (Ph i c t) = T.Node (i,c)
|
||||
$ map (\(tag,next) -> T.Node (next,tag) [toTree' next s]) t
|
||||
|
||||
lookup' y (w@(W x _ _):xs) | y ==x = Just w
|
||||
| otherwise = lookup' y xs
|
||||
lookup' y [] = Nothing
|
||||
|
||||
lookup'' y (w@(Ph x _ _):xs) | y ==x = Just w
|
||||
| otherwise = lookup'' y xs
|
||||
lookup'' y [] = Nothing
|
||||
|
||||
|
||||
treeToSentence :: [T.Tree String] -> String
|
||||
treeToSentence ts = unwords $ map extractS ts
|
||||
where extractS (T.Node ws []) = ws
|
||||
extractS (T.Node c ts) = unwords $ map extractS ts
|
||||
|
||||
showa :: T.Tree String -> String
|
||||
showa (T.Node root ts) = "("++root++" "++concatMap showa ts++" )"
|
||||
Reference in New Issue
Block a user