mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
added script for reading the Susanne treebank
This commit is contained in:
43
treebanks/susanne/SusanneFormat.hs
Normal file
43
treebanks/susanne/SusanneFormat.hs
Normal file
@@ -0,0 +1,43 @@
|
||||
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where
|
||||
|
||||
import Data.Char
|
||||
|
||||
type Tag = String
|
||||
type Id = String
|
||||
type Word = String
|
||||
type Lemma = String
|
||||
|
||||
data ParseTree
|
||||
= Phrase Tag [ParseTree]
|
||||
| Word Id Tag Word Lemma
|
||||
|
||||
data ParseTreePos
|
||||
= Root
|
||||
| At ParseTreePos Tag [ParseTree]
|
||||
|
||||
instance Show ParseTree where
|
||||
show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]"
|
||||
show (Word _ tag w _) = "["++tag++" "++w++"]"
|
||||
|
||||
readTreebank ls = readLines Root (map words ls)
|
||||
|
||||
readLines p [] = []
|
||||
readLines p ([id,_,tag,w,l,parse]:ls) =
|
||||
readParse p (Word id tag w l) parse ls
|
||||
|
||||
readParse p w [] ls = readLines p ls
|
||||
readParse p w ('[':cs) ls =
|
||||
case break (not . isTagChar) cs of
|
||||
(tag,cs) -> readParse (At p tag []) w cs ls
|
||||
readParse (At p tag ts) w ('.':cs) ls =
|
||||
readParse (At p tag (w:ts)) w cs ls
|
||||
readParse (At p tag ts) w cs ls =
|
||||
case break (not . isTagChar) cs of
|
||||
(tag,']':cs) -> let t = Phrase tag (reverse ts)
|
||||
in case p of
|
||||
Root -> t : readLines p ls
|
||||
At p tag ts -> readParse (At p tag (t:ts)) w cs ls
|
||||
_ -> error cs
|
||||
|
||||
isTagChar c =
|
||||
isLetter c || isDigit c || elem c ":&+-%@=?\"*!"
|
||||
14
treebanks/susanne/convert.hs
Normal file
14
treebanks/susanne/convert.hs
Normal file
@@ -0,0 +1,14 @@
|
||||
import System.Directory
|
||||
import System.FilePath
|
||||
import Data.List
|
||||
|
||||
import SusanneFormat
|
||||
|
||||
main = do
|
||||
fs <- getDirectoryContents "data"
|
||||
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
|
||||
let ts = filter (not . isBreak) (readTreebank (lines (concat txts)))
|
||||
writeFile "text" (unlines (map show ts))
|
||||
|
||||
isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True
|
||||
isBreak _ = False
|
||||
Reference in New Issue
Block a user