mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-16 00:09:31 -06:00
84 lines
3.6 KiB
Haskell
84 lines
3.6 KiB
Haskell
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
|
|
|
|
import PGF(CId)
|
|
import Data.Char
|
|
|
|
type Tag = String
|
|
type Mods = String
|
|
type Fn = String
|
|
type Index = Int
|
|
type Id = String
|
|
type Word = String
|
|
type Lemma = String
|
|
|
|
data ParseTree
|
|
= Phrase Tag Mods Fn Index [ParseTree]
|
|
| Word Id Tag Word Lemma
|
|
| App CId [ParseTree]
|
|
deriving Eq
|
|
|
|
data ParseTreePos
|
|
= Root
|
|
| At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]
|
|
|
|
instance Show ParseTree where
|
|
show (Phrase tag mods fn idx ts)
|
|
| tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]"
|
|
| fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
|
|
| otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
|
|
show (Word _ tag w _) = "["++tag++" "++w++"]"
|
|
show (App f ts)
|
|
| null ts = show f
|
|
| otherwise = "("++show f++" "++unwords (map show ts)++")"
|
|
|
|
readTreebank ls = readLines Root (map words ls)
|
|
|
|
readLines p [] = []
|
|
readLines p ([id,_,tag,w,l,parse]:ls) =
|
|
readParse (Word id tag w l) p parse ls
|
|
|
|
readParse w p [] ls = readLines p ls
|
|
readParse w p ('[':cs) ls =
|
|
case readTag w cs of
|
|
(fn,cs) -> readParse w (At p fn []) cs ls
|
|
readParse w (At p fn ts) ('.':cs) ls =
|
|
readParse w (At p fn (w:ts)) cs ls
|
|
readParse w (At p fn ts) cs ls =
|
|
case readTag w cs of
|
|
(_,']':cs) -> let t = fn (reverse ts)
|
|
in case p of
|
|
Root -> t : readLines p ls
|
|
At p fn ts -> readParse w (At p fn (t:ts)) cs ls
|
|
_ -> readError w
|
|
|
|
readTag w cs@(c1:c2:_) -- word tag on phrase level
|
|
| isUpper c1 && isUpper c2 =
|
|
case break (\c -> not (isLetter c || isDigit c)) cs of
|
|
(tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
|
|
(mods,cs) -> case cs of
|
|
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
|
|
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
|
| isDigit c -> case break (not . isDigit) (c:cs) of
|
|
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
|
|
_ -> (Phrase tag mods "" 0,cs)
|
|
readTag w (c:cs) -- phrase tag
|
|
| isUpper c = let tag = [c]
|
|
in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
|
|
(mods,cs) -> case cs of
|
|
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
|
|
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
|
| isDigit c -> case break (not . isDigit) (c:cs) of
|
|
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
|
|
_ -> (Phrase tag mods "" 0,cs)
|
|
| isLower c = let tag = []
|
|
mods = []
|
|
in case break (not . isDigit) cs of
|
|
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
|
| isDigit c = let tag = []
|
|
mods = []
|
|
in case break (not . isDigit) cs of
|
|
(ds,cs) -> (Phrase tag mods [] (read ds),cs)
|
|
readTag w cs = readError w
|
|
|
|
readError (Word id _ _ _) = error id
|