gf-core/treebanks/susanne/SusanneFormat.hs

module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where

import Data.Char
import qualified Data.Map as Map

type Tag   = String
type Mods  = String
type Fn    = String
type Index = Int
type Id    = String
type Word  = String
type Lemma = String

data ParseTree
 = Phrase Tag Mods Fn Index [ParseTree]
 | Word   Id Tag Word Lemma
 | App String [ParseTree]
 | Lit String
 deriving Eq

data ParseTreePos
 = Root
 | At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]

instance Show ParseTree where
  show (Phrase tag mods fn idx ts)
    | tag == ""            = "["++fn++show idx++" "++unwords (map show ts)++"]"
    | fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
    | otherwise            = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
  show (Word _ tag w _)    = "["++tag++" "++w++"]"
  show (App f ts)
    | null ts              = f
    | otherwise            = "("++f++" "++unwords (map show ts)++")"
  show (Lit s)             = show s

readTreebank ls = readLines Root (map words ls)

readLines p []                        = []
readLines p ([id,_,tag,w,l,parse]:ls) =
  readParse (Word id tag (readWord w) l) p parse ls

readParse w p []       ls = readLines p ls
readParse w p ('[':cs) ls =
  case readTag w cs of
    (fn,cs) -> readParse w (At p fn []) cs ls
readParse w (At p fn ts) ('.':cs) ls =
  readParse w (At p fn (w:ts)) cs ls
readParse w (At p fn ts) cs ls =
  case readTag w cs of
    (_,']':cs) -> let t = fn (reverse ts)
                  in case p of
                       Root       -> t : readLines p ls
                       At p fn ts -> readParse w (At p fn (t:ts)) cs ls
    _          -> readError w

readTag w cs@(c1:c2:_)             -- word tag on phrase level
  | isUpper c1 && isUpper c2 =
      case break (\c -> not (isLetter c || isDigit c)) cs of
        (tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
                      (mods,cs) -> case cs of
                                    (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
                                                                 (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
                                               | isDigit  c -> case break (not . isDigit) (c:cs) of
                                                                 (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
                                    _                       -> (Phrase tag mods "" 0,cs)
readTag w (c:cs)                -- phrase tag
  | isUpper c = let tag = [c]
                in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
                     (mods,cs) -> case cs of
                                    (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
                                                                 (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
                                               | isDigit  c -> case break (not . isDigit) (c:cs) of
                                                                 (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
                                    _                       -> (Phrase tag mods "" 0,cs)
  | isLower c = let tag  = []
                    mods = []
                in case break (not . isDigit) cs of
                     (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
  | isDigit c = let tag  = []
                    mods = []
                in case break (not . isDigit) cs of
                     (ds,cs) -> (Phrase tag mods [] (read ds),cs)
readTag w cs = readError w

readError (Word id _ _ _) = error id

readWord w0 = replaceEntities w2
  where
    w1 | head w0 == '+' = tail w0
       | otherwise      = w0
    w2 | last w1 == '+' = init w1
       | otherwise      = w1

    replaceEntities []       = []
    replaceEntities ('<':cs) =
      let (e,'>':cs1) = break (=='>') cs
      in case Map.lookup e entity_names of
           Just c  -> c : replaceEntities cs1
           Nothing -> "<"++e++">"++ replaceEntities cs1
    replaceEntities (c:  cs) = c : replaceEntities cs

entity_names = Map.fromList
  [("agr",'α')
  ,("agrave",'à')
  ,("apos",'\'')
  ,("auml",'ä')
  ,("bgr",'β')
  ,("blank",' ')
  ,("ccedil",'ç')
  ,("deg",'°')
  ,("dollar",'$')
  ,("eacute",'é')
  ,("egr",'ε')
  ,("egrave",'è')
  ,("frac12",'½')
  ,("frac14",'¼')
  ,("ggr",'γ')
  ,("hellip",'…')
  ,("hyphen",'-')
  ,("iuml",'ï')
  ,("khgr",'χ')
  ,("ldquo",'“')
  ,("lgr",'λ')
  ,("lsquo",'‘')
  ,("mdash",'—')
  ,("mgr",'μ')
  ,("minus",'-')
  ,("ntilde",'ñ')
  ,("oelig",'œ')
  ,("ouml",'ö')
  ,("para",'¶')
  ,("pgr",'π')
  ,("phgr",'φ')
  ,("prime",'′')
  ,("Prime",'″')
  ,("rdquo",'”')
  ,("rgr",'ρ')
  ,("rsquo",'’')
  ,("sect",'§')
  ,("sol",'/')
  ,("tggr",'θ')
  ]