From 39fff9318d8c98f8bee0a22bf7024e6082db15c1 Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Thu, 5 Dec 2013 10:05:33 +0000 Subject: [PATCH] more on the Susanne treebank --- treebanks/susanne/Parser.hs | 37 ++++++++++++++ treebanks/susanne/SusanneFormat.hs | 81 ++++++++++++++++++++++-------- treebanks/susanne/convert.hs | 46 +++++++++++++++-- 3 files changed, 140 insertions(+), 24 deletions(-) create mode 100644 treebanks/susanne/Parser.hs diff --git a/treebanks/susanne/Parser.hs b/treebanks/susanne/Parser.hs new file mode 100644 index 000000000..4e87c6a00 --- /dev/null +++ b/treebanks/susanne/Parser.hs @@ -0,0 +1,37 @@ +module Parser where + +import Control.Monad + +import SusanneFormat + +newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)} + +instance Monad P where + return x = P (\ts -> Just (ts, x)) + f >>= g = P (\ts -> case runP f ts of + Nothing -> Nothing + Just (ts,x) -> runP (g x) ts) + +instance MonadPlus P where + mzero = P (\ts -> Nothing) + mplus f g = P (\ts -> mplus (runP f ts) (runP g ts)) + +match tag_spec = P (\ts -> + case ts of + (Phrase tag1 mods1 fn1 _ _:ts) + | tag == tag1 && + all (flip elem mods1) mods && + (null fn || fn == fn1) -> Just (ts,()) + (Word _ tag1 _ _:ts) + | tag == tag1 -> Just (ts,()) + _ -> Nothing) + where + (f,_) = readTag (Word "" undefined undefined undefined) tag_spec + Phrase tag mods fn _ _ = f [] + +many f = + do x <- f + xs <- many f + return (x:xs) + `mplus` + do return [] diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs index 052f95978..3eb3187e2 100644 --- a/treebanks/susanne/SusanneFormat.hs +++ b/treebanks/susanne/SusanneFormat.hs @@ -1,43 +1,82 @@ -module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where +module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where +import PGF(CId) import Data.Char type Tag = String +type Mods = String +type Fn = String +type Index = Int type Id = String type Word = String type Lemma = String data ParseTree - = Phrase Tag [ParseTree] + = Phrase Tag Mods Fn Index [ParseTree] | Word Id Tag Word Lemma + | App CId [ParseTree] data ParseTreePos = Root - | At ParseTreePos Tag [ParseTree] + | At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree] instance Show ParseTree where - show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]" - show (Word _ tag w _) = "["++tag++" "++w++"]" + show (Phrase tag mods fn idx ts) + | tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]" + | fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]" + | otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]" + show (Word _ tag w _) = "["++tag++" "++w++"]" + show (App f ts) + | null ts = show f + | otherwise = "("++show f++" "++unwords (map show ts)++")" readTreebank ls = readLines Root (map words ls) readLines p [] = [] readLines p ([id,_,tag,w,l,parse]:ls) = - readParse p (Word id tag w l) parse ls + readParse (Word id tag w l) p parse ls -readParse p w [] ls = readLines p ls -readParse p w ('[':cs) ls = - case break (not . isTagChar) cs of - (tag,cs) -> readParse (At p tag []) w cs ls -readParse (At p tag ts) w ('.':cs) ls = - readParse (At p tag (w:ts)) w cs ls -readParse (At p tag ts) w cs ls = - case break (not . isTagChar) cs of - (tag,']':cs) -> let t = Phrase tag (reverse ts) - in case p of - Root -> t : readLines p ls - At p tag ts -> readParse (At p tag (t:ts)) w cs ls - _ -> error cs +readParse w p [] ls = readLines p ls +readParse w p ('[':cs) ls = + case readTag w cs of + (fn,cs) -> readParse w (At p fn []) cs ls +readParse w (At p fn ts) ('.':cs) ls = + readParse w (At p fn (w:ts)) cs ls +readParse w (At p fn ts) cs ls = + case readTag w cs of + (_,']':cs) -> let t = fn (reverse ts) + in case p of + Root -> t : readLines p ls + At p fn ts -> readParse w (At p fn (t:ts)) cs ls + _ -> readError w -isTagChar c = - isLetter c || isDigit c || elem c ":&+-%@=?\"*!" +readTag w cs@(c1:c2:_) -- word tag on phrase level + | isUpper c1 && isUpper c2 = + case break (\c -> not (isLetter c || isDigit c)) cs of + (tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of + (mods,cs) -> case cs of + (':':c:cs) | isLetter c -> case break (not . isDigit) cs of + (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs) + | isDigit c -> case break (not . isDigit) (c:cs) of + (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs) + _ -> (Phrase tag mods "" 0,cs) +readTag w (c:cs) -- phrase tag + | isUpper c = let tag = [c] + in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of + (mods,cs) -> case cs of + (':':c:cs) | isLetter c -> case break (not . isDigit) cs of + (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs) + | isDigit c -> case break (not . isDigit) (c:cs) of + (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs) + _ -> (Phrase tag mods "" 0,cs) + | isLower c = let tag = [] + mods = [] + in case break (not . isDigit) cs of + (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs) + | isDigit c = let tag = [] + mods = [] + in case break (not . isDigit) cs of + (ds,cs) -> (Phrase tag mods [] (read ds),cs) +readTag w cs = readError w + +readError (Word id _ _ _) = error id diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs index 91fdc2cf4..dfd2328ca 100644 --- a/treebanks/susanne/convert.hs +++ b/treebanks/susanne/convert.hs @@ -1,14 +1,54 @@ import System.Directory import System.FilePath import Data.List +import Data.Char(toLower) +import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType) import SusanneFormat +Just eng = readLanguage "ParseEng" + main = do + gr <- readPGF "../../ParseEngAbs.pgf" + let morpho = buildMorpho gr eng fs <- getDirectoryContents "data" txts <- (mapM (\f -> readFile ("data" f)) . filter ((/= ".") . take 1)) (sort fs) - let ts = filter (not . isBreak) (readTreebank (lines (concat txts))) + --let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts))) + let ts = readTreebank (lines (concat txts)) writeFile "text" (unlines (map show ts)) -isBreak (Phrase "Oh" [Word _ "YB" "" _]) = True -isBreak _ = False +convert pgf morpho w@(Word _ tag _ lemma) + | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = [] + | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w + | tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w + | tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w + | tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w + | tag == "JB" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w + | tag == "JBo" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w + | tag == "AT" = convertLemma pgf morpho (mkCId "Quant") "s False Sg" w + | tag == "VVDi" = convertLemma pgf morpho (mkCId "V") "s VPast" w + | tag == "VVDt" = convertLemma pgf morpho (mkCId "V2") "s VPast" w + | tag == "VVDv" = convertLemma pgf morpho (mkCId "V") "s VPast" w + | tag == "VVZi" = convertLemma pgf morpho (mkCId "V") "s VPres" w + | tag == "VVZt" = convertLemma pgf morpho (mkCId "V2") "s VPres" w + | tag == "VVZv" = convertLemma pgf morpho (mkCId "V") "s VPres" w + | tag == "PPHS2"= convertLemma pgf morpho (mkCId "Pron") "s (NCase Nom)" w + | tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w + | tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w + | tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w + | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w + | otherwise = [w] +convert pgf morpho (Phrase tag mods fn idx ts) + | tag == "O" = concatMap (convert pgf morpho) ts + | otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)] + +convertLemma pgf morpho cat an0 w@(Word _ tag form _) = + case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of + [f] -> [App f []] + _ -> [w] + where + hasCat pgf f cat = + case functionType pgf f of + Just ty -> case unType ty of + (_,cat1,_) -> cat1 == cat + Nothing -> False