Files
gf-core/treebanks/susanne/SusanneFormat.hs

144 lines
4.8 KiB
Haskell
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
import Data.Char
import qualified Data.Map as Map
type Tag = String
type Mods = String
type Fn = String
type Index = Int
type Id = String
type Word = String
type Lemma = String
data ParseTree
= Phrase Tag Mods Fn Index [ParseTree]
| Word Id Tag Word Lemma
| App String [ParseTree]
| Lit String
deriving Eq
data ParseTreePos
= Root
| At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]
instance Show ParseTree where
show (Phrase tag mods fn idx ts)
| tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]"
| fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
| otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
show (Word _ tag w _) = "["++tag++" "++w++"]"
show (App f ts)
| null ts = f
| otherwise = "("++f++" "++unwords (map show ts)++")"
show (Lit s) = show s
readTreebank ls = readLines Root (map words ls)
readLines p [] = []
readLines p ([id,_,tag,w,l,parse]:ls) =
readParse (Word id tag (readWord w) l) p parse ls
readParse w p [] ls = readLines p ls
readParse w p ('[':cs) ls =
case readTag w cs of
(fn,cs) -> readParse w (At p fn []) cs ls
readParse w (At p fn ts) ('.':cs) ls =
readParse w (At p fn (w:ts)) cs ls
readParse w (At p fn ts) cs ls =
case readTag w cs of
(_,']':cs) -> let t = fn (reverse ts)
in case p of
Root -> t : readLines p ls
At p fn ts -> readParse w (At p fn (t:ts)) cs ls
_ -> readError w
readTag w cs@(c1:c2:_) -- word tag on phrase level
| isUpper c1 && isUpper c2 =
case break (\c -> not (isLetter c || isDigit c)) cs of
(tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
(mods,cs) -> case cs of
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c -> case break (not . isDigit) (c:cs) of
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
_ -> (Phrase tag mods "" 0,cs)
readTag w (c:cs) -- phrase tag
| isUpper c = let tag = [c]
in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
(mods,cs) -> case cs of
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c -> case break (not . isDigit) (c:cs) of
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
_ -> (Phrase tag mods "" 0,cs)
| isLower c = let tag = []
mods = []
in case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c = let tag = []
mods = []
in case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [] (read ds),cs)
readTag w cs = readError w
readError (Word id _ _ _) = error id
readWord w0 = replaceEntities w2
where
w1 | head w0 == '+' = tail w0
| otherwise = w0
w2 | last w1 == '+' = init w1
| otherwise = w1
replaceEntities [] = []
replaceEntities ('<':cs) =
let (e,'>':cs1) = break (=='>') cs
in case Map.lookup e entity_names of
Just c -> c : replaceEntities cs1
Nothing -> "<"++e++">"++ replaceEntities cs1
replaceEntities (c: cs) = c : replaceEntities cs
entity_names = Map.fromList
[("agr",'α')
,("agrave",'à')
,("apos",'\'')
,("auml",'ä')
,("bgr",'β')
,("blank",' ')
,("ccedil",'ç')
,("deg",'°')
,("dollar",'$')
,("eacute",'é')
,("egr",'ε')
,("egrave",'è')
,("frac12",'½')
,("frac14",'¼')
,("ggr",'γ')
,("hellip",'…')
,("hyphen",'-')
,("iuml",'ï')
,("khgr",'χ')
,("ldquo",'“')
,("lgr",'λ')
,("lsquo",'')
,("mdash",'—')
,("mgr",'μ')
,("minus",'-')
,("ntilde",'ñ')
,("oelig",'œ')
,("ouml",'ö')
,("para",'¶')
,("pgr",'π')
,("phgr",'φ')
,("prime",'')
,("Prime",'″')
,("rdquo",'”')
,("rgr",'ρ')
,("rsquo",'')
,("sect",'§')
,("sol",'/')
,("tggr",'θ')
]