added script for reading the Susanne treebank

This commit is contained in:
kr.angelov
2013-12-04 12:11:41 +00:00
parent f5cda1e6a3
commit 106b41a2cb
2 changed files with 57 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where
import Data.Char
type Tag = String
type Id = String
type Word = String
type Lemma = String
data ParseTree
= Phrase Tag [ParseTree]
| Word Id Tag Word Lemma
data ParseTreePos
= Root
| At ParseTreePos Tag [ParseTree]
instance Show ParseTree where
show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]"
show (Word _ tag w _) = "["++tag++" "++w++"]"
readTreebank ls = readLines Root (map words ls)
readLines p [] = []
readLines p ([id,_,tag,w,l,parse]:ls) =
readParse p (Word id tag w l) parse ls
readParse p w [] ls = readLines p ls
readParse p w ('[':cs) ls =
case break (not . isTagChar) cs of
(tag,cs) -> readParse (At p tag []) w cs ls
readParse (At p tag ts) w ('.':cs) ls =
readParse (At p tag (w:ts)) w cs ls
readParse (At p tag ts) w cs ls =
case break (not . isTagChar) cs of
(tag,']':cs) -> let t = Phrase tag (reverse ts)
in case p of
Root -> t : readLines p ls
At p tag ts -> readParse (At p tag (t:ts)) w cs ls
_ -> error cs
isTagChar c =
isLetter c || isDigit c || elem c ":&+-%@=?\"*!"

View File

@@ -0,0 +1,14 @@
import System.Directory
import System.FilePath
import Data.List
import SusanneFormat
main = do
fs <- getDirectoryContents "data"
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
let ts = filter (not . isBreak) (readTreebank (lines (concat txts)))
writeFile "text" (unlines (map show ts))
isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True
isBreak _ = False