A new version of the conversion script for the Susanne corpus; the script currently covers a bit less than half of the data.

This commit is contained in:
krasimir
2015-11-13 13:05:21 +00:00
parent 2a0edb2148
commit ce249c1dc8
4 changed files with 903 additions and 199 deletions

View File

@@ -1,7 +1,7 @@
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
import PGF(CId)
import Data.Char
import qualified Data.Map as Map
type Tag = String
type Mods = String
@@ -14,7 +14,8 @@ type Lemma = String
-- | A node of a parse tree read from the Susanne treebank, or of the
-- abstract tree it is converted to.
--
-- The stale @App CId [ParseTree]@ alternative has been dropped: it clashed
-- with the @App String [ParseTree]@ constructor that replaces it, and the
-- 'Show' instance concatenates the function name as a plain 'String'.
data ParseTree
  = Phrase Tag Mods Fn Index [ParseTree]
    -- ^ Phrase node: tag, modifiers, function label, index and children.
  | Word Id Tag Word Lemma
    -- ^ Leaf word: identifier, tag, surface form and lemma.
  | App String [ParseTree]
    -- ^ Application of a named function to argument subtrees.
  | Lit String
    -- ^ String literal; rendered with 'show', i.e. quoted.
  deriving Eq
data ParseTreePos
@@ -28,14 +29,15 @@ instance Show ParseTree where
| otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
show (Word _ tag w _) = "["++tag++" "++w++"]"
show (App f ts)
| null ts = show f
| otherwise = "("++show f++" "++unwords (map show ts)++")"
| null ts = f
| otherwise = "("++f++" "++unwords (map show ts)++")"
show (Lit s) = show s
-- | Split every input line into its whitespace-separated fields and hand
-- the resulting rows to 'readLines', starting at the 'Root' position.
readTreebank ls = readLines Root fieldsPerLine
  where
    fieldsPerLine = map words ls
-- | Consume the tokenised treebank rows one at a time.
--
-- Each row must have exactly six fields: identifier, an ignored field, tag,
-- word, lemma and the parse-bracket string. The word is normalised with
-- 'readWord' before it is wrapped in a 'Word' leaf, and the bracket string
-- is then interpreted by 'readParse' together with the remaining rows.
--
-- Only the 'readWord' variant of the body is kept; the older body that
-- passed the raw word through was a leftover of the same change.
readLines p [] = []
readLines p ([id,_,tag,w,l,parse]:ls) =
  readParse (Word id tag (readWord w) l) p parse ls
-- Any other row shape is malformed input: fail with a message that shows the
-- offending row instead of an anonymous pattern-match failure.
readLines _ (fields:_) =
  error ("readLines: expected 6 fields but got: " ++ unwords fields)
readParse w p [] ls = readLines p ls
readParse w p ('[':cs) ls =
@@ -81,3 +83,61 @@ readTag w (c:cs) -- phrase tag
readTag w cs = readError w
-- | Abort parsing, reporting the word at which the input could not be read.
--
-- For a 'Word' leaf the error message is exactly the word's identifier, as
-- before. Any other node used to trigger an anonymous pattern-match
-- failure; it now produces a message that shows the offending node.
readError (Word wid _ _ _) = error wid
readError other            = error ("readError: unexpected node " ++ show other)
-- | Normalise a raw word field.
--
-- One leading and one trailing @+@ are removed, then every @\<name\>@
-- entity reference is replaced by the character registered for @name@ in
-- 'entity_names'. Unknown entity names are left in place, angle brackets
-- included.
--
-- The function is total: an empty word, a word consisting only of @+@, and
-- a @<@ without a closing @>@ are all handled instead of crashing.
readWord :: String -> String
readWord w0 = replaceEntities w2
  where
    -- Drop a single leading '+', if there is one.
    w1 = case w0 of
           '+':rest -> rest
           _        -> w0

    -- Drop a single trailing '+'; the emptiness check keeps 'last' safe
    -- when the whole word was just "+".
    w2 | not (null w1) && last w1 == '+' = init w1
       | otherwise                       = w1

    replaceEntities [] = []
    replaceEntities ('<':cs) =
      case break (== '>') cs of
        (e, '>':cs1) ->
          case Map.lookup e entity_names of
            Just c  -> c : replaceEntities cs1
            Nothing -> "<" ++ e ++ ">" ++ replaceEntities cs1
        -- No closing '>': keep the '<' literally and continue scanning.
        _ -> '<' : replaceEntities cs
    replaceEntities (c:cs) = c : replaceEntities cs
-- | Lookup table from entity names (the text between @<@ and @>@ in a word
-- field) to the single character that replaces them.
--
-- The entries for @lsquo@, @prime@ and @rsquo@ had lost their characters,
-- leaving empty character literals. They are restored here as numeric
-- escapes so they survive any further re-encoding of the source file.
entity_names :: Map.Map String Char
entity_names = Map.fromList
  [ ("agr",'α')
  , ("agrave",'à')
  , ("apos",'\'')
  , ("auml",'ä')
  , ("bgr",'β')
  , ("blank",' ')
  , ("ccedil",'ç')
  , ("deg",'°')
  , ("dollar",'$')
  , ("eacute",'é')
  , ("egr",'ε')
  , ("egrave",'è')
  , ("frac12",'½')
  , ("frac14",'¼')
  , ("ggr",'γ')
  , ("hellip",'…')
  , ("hyphen",'-')
  , ("iuml",'ï')
  , ("khgr",'χ')
  , ("ldquo",'“')
  , ("lgr",'λ')
  , ("lsquo",'\8216')   -- U+2018 LEFT SINGLE QUOTATION MARK
  , ("mdash",'—')
  , ("mgr",'μ')
  , ("minus",'-')
  , ("ntilde",'ñ')
  , ("oelig",'œ')
  , ("ouml",'ö')
  , ("para",'¶')
  , ("pgr",'π')
  , ("phgr",'φ')
  , ("prime",'\8242')   -- U+2032 PRIME
  , ("Prime",'″')
  , ("rdquo",'”')
  , ("rgr",'ρ')
  , ("rsquo",'\8217')   -- U+2019 RIGHT SINGLE QUOTATION MARK
  , ("sect",'§')
  , ("sol",'/')
  , ("tggr",'θ')
  ]