diff --git a/treebanks/susanne/Parser.hs b/treebanks/susanne/Parser.hs index 4e87c6a00..62e362a9f 100644 --- a/treebanks/susanne/Parser.hs +++ b/treebanks/susanne/Parser.hs @@ -1,37 +1,90 @@ module Parser where +import Data.Char import Control.Monad +import PGF(PGF,Morpho,lookupMorpho,functionType,unType) import SusanneFormat -newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)} +newtype P a = P {runP :: PGF -> Morpho -> [ParseTree] -> Maybe ([ParseTree], a)} instance Monad P where - return x = P (\ts -> Just (ts, x)) - f >>= g = P (\ts -> case runP f ts of - Nothing -> Nothing - Just (ts,x) -> runP (g x) ts) + return x = P (\pgf morpho ts -> Just (ts, x)) + f >>= g = P (\pgf morpho ts -> case runP f pgf morpho ts of + Nothing -> Nothing + Just (ts,x) -> runP (g x) pgf morpho ts) instance MonadPlus P where - mzero = P (\ts -> Nothing) - mplus f g = P (\ts -> mplus (runP f ts) (runP g ts)) + mzero = P (\pgf morpho ts -> Nothing) + mplus f g = P (\pgf morpho ts -> mplus (runP f pgf morpho ts) (runP g pgf morpho ts)) -match tag_spec = P (\ts -> +match tag_spec = P (\pgf morpho ts -> case ts of - (Phrase tag1 mods1 fn1 _ _:ts) + (t@(Phrase tag1 mods1 fn1 _ _):ts) | tag == tag1 && all (flip elem mods1) mods && - (null fn || fn == fn1) -> Just (ts,()) - (Word _ tag1 _ _:ts) - | tag == tag1 -> Just (ts,()) + (null fn || fn == fn1) -> Just (ts,t) + (t@(Word _ tag1 _ _):ts) + | tag == tag1 -> Just (ts,t) _ -> Nothing) where (f,_) = readTag (Word "" undefined undefined undefined) tag_spec Phrase tag mods fn _ _ = f [] +many1 f = do + x <- f + xs <- many f + return (x:xs) + many f = do x <- f xs <- many f return (x:xs) `mplus` do return [] + +inside tag_spec p = P (\pgf morpho ts -> + case ts of + (t@(Phrase tag1 mods1 fn1 _ ts'):ts) + | tag == tag1 && + all (flip elem mods1) mods && + (null fn || fn == fn1) -> case runP p pgf morpho ts' of + Just ([],x) -> Just (ts,x) + _ -> Nothing + _ -> Nothing) + where + (f,_) = readTag (Word "" undefined undefined undefined) tag_spec + Phrase tag mods fn _ _ = f [] + +insideOpt tag_spec p = P (\pgf morpho ts -> + case ts of + (t@(Phrase tag1 mods1 fn1 _ ts'):ts) + | tag == tag1 && + all (flip elem mods1) mods && + (null fn || fn == fn1) -> case runP p pgf morpho ts' of + Just ([],x) -> Just (ts,x) + _ -> Just (ts,t) + _ -> Nothing) + where + (f,_) = readTag (Word "" undefined undefined undefined) tag_spec + Phrase tag mods fn _ _ = f [] + +lemma tag cat an0 = P (\pgf morpho ts -> + case ts of + (t@(Word _ tag1 form _):ts) | tag == tag1 -> + case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of + [f] -> Just (ts,App f []) + _ -> Just (ts,t) + _ -> Nothing) + where + hasCat pgf f cat = + case functionType pgf f of + Just ty -> case unType ty of + (_,cat1,_) -> cat1 == cat + Nothing -> False + +opt f = + do x <- f + return (Just x) + `mplus` + do return Nothing diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs index 3eb3187e2..43a685a0f 100644 --- a/treebanks/susanne/SusanneFormat.hs +++ b/treebanks/susanne/SusanneFormat.hs @@ -15,6 +15,7 @@ data ParseTree = Phrase Tag Mods Fn Index [ParseTree] | Word Id Tag Word Lemma | App CId [ParseTree] + deriving Eq data ParseTreePos = Root diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs index dfd2328ca..d25e7296d 100644 --- a/treebanks/susanne/convert.hs +++ b/treebanks/susanne/convert.hs @@ -2,24 +2,34 @@ import System.Directory import System.FilePath import Data.List import Data.Char(toLower) +import Control.Monad +import qualified Data.Map as Map import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType) import SusanneFormat +import Parser +import Idents -Just eng = readLanguage "ParseEng" +Just eng = readLanguage "DictEng" main = do - gr <- readPGF "../../ParseEngAbs.pgf" + gr <- readPGF "DictEngAbs.pgf" let morpho = buildMorpho gr eng fs <- getDirectoryContents "data" txts <- (mapM (\f -> readFile ("data" f)) . filter ((/= ".") . take 1)) (sort fs) - --let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts))) - let ts = readTreebank (lines (concat txts)) - writeFile "text" (unlines (map show ts)) + --let ts' = readTreebank (lines (concat txts)) + --writeFile "text" (unlines (map show ts')) + let (ts',rs') = combineRes (convert gr morpho) (readTreebank (lines (concat txts))) + let rm = Map.fromListWith (++) rs' + writeFile "susanne.gft" (unlines (map show ts')) + writeFile "rules" (unlines (concat [unwords ("-":cat:"->":cats) : map (\t -> " "++show t) rs'' | (cat :-> cats,rs'') <- Map.toList rm])) + +data Rule = Tag :-> [Tag] + deriving (Eq,Ord) convert pgf morpho w@(Word _ tag _ lemma) - | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = [] - | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w + | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = ([],[]) +{- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w | tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w | tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w | tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w @@ -36,19 +46,259 @@ convert pgf morpho w@(Word _ tag _ lemma) | tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w | tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w | tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w - | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w - | otherwise = [w] -convert pgf morpho (Phrase tag mods fn idx ts) - | tag == "O" = concatMap (convert pgf morpho) ts - | otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)] + | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w-} + | otherwise = ([w],[]) +convert pgf morpho t@(Phrase tag mods fn idx ts) + | tag == "O" = (ts',rs') + | tag == "Q" = (ts',rs') + | tag == "S" = case runP pS pgf morpho ts' of + Just ([],x) -> ([x], rs') + _ -> ([Phrase tag mods fn idx ts'], (r,[t]) : rs') + | otherwise = ([Phrase tag mods fn idx ts'], (r,[t]) : rs') + where + (ts',rs') = combineRes (convert pgf morpho) ts + r = tag :-> map getTag ts + + isExtra (Word _ "YIL" _ _) = True + isExtra (Word _ "YIR" _ _) = True + isExtra (Word _ "YTL" _ _) = True + isExtra (Word _ "YTR" _ _) = True + isExtra _ = False + + getTag (Phrase tag mods fn idx ts) = tag++if null fn then "" else ":"++fn + getTag (Word _ tag _ _) = tag convertLemma pgf morpho cat an0 w@(Word _ tag form _) = case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of - [f] -> [App f []] - _ -> [w] + [f] -> ([App f []], []) + _ -> ([w],[]) where hasCat pgf f cat = case functionType pgf f of Just ty -> case unType ty of (_,cat1,_) -> cat1 == cat Nothing -> False + +combineRes f ts = (ts',rs') + where + (x,y) = unzip (map f ts) + ts' = concat x + rs' = concat y + +pS = + do mplus pConj (return ()) + advs <- many pAdS + np <- pSubject + (t,p,vp) <- pVP + return (foldr ($) (App cidUseCl [t,p,App cidPredVP [np, vp]]) advs) + `mplus` + do mplus pConj (return ()) + (t,p,vp) <- pVP + return (App cidImpVP [vp]) + `mplus` + do mplus pConj (return ()) + advs <- many pAdS + t1 <- match "EX" + (t,p,vp) <- pVP + return (foldr ($) (App cidUseCl [t,p,App cidExistNP [t1,vp]]) advs) + +pSubject = + do insideOpt "N:s" pNP + `mplus` + do insideOpt "N:S" pNP + `mplus` + do match "M:s" + `mplus` + do match "M:S" + `mplus` + do match "D:s" + `mplus` + do match "D:S" + +pConj = + do match "CC" + return () + `mplus` + do match "CCB" + return () + +pAdS = + do adv <- pAdv + match "YC" + return (\t -> App cidExtAdvS [adv,t]) + `mplus` + do adv <- pAdv + return (\t -> App cidAdvS [adv,t]) + +pVP = + do adVs <- many pAdV + (t,p,vs) <- pV "VS" + advs <- many pAdv + s <- insideOpt "F:o" + (opt (match "CST") >> pS) + return (t,p,foldr (\adv t -> App cidAdVVP [adv,t]) + (foldl (\t adv -> App cidAdvVP [t, adv]) + (App cidComplVS [vs, s]) + advs) + adVs) + `mplus` + do adVs <- many pAdV + (t,p,vv) <- pV "VV" + advs <- many pAdv + vp <- match "Ti" + return (t,p,foldr (\adv t -> App cidAdVVP [adv,t]) + (foldl (\t adv -> App cidAdvVP [t, adv]) + (App cidComplVV [vv, vp]) + advs) + adVs) + `mplus` + do adVs <- many pAdV + (t,p,v2) <- pV "V2" + o <- pObject + opt (match "YC") -- what is this? + advs <- many pAdv + return (t,p,foldr (\adv t -> App cidAdVVP [adv,t]) + (foldl (\t adv -> App cidAdvVP [t, adv]) + (App cidComplSlash [App cidSlashV2a [v2],o]) + advs) + adVs) + `mplus` + do adVs <- many pAdV + (t,p,v) <- pV "V" + advs <- many pAdv + return (t,p,foldr (\adv t -> App cidAdVVP [adv,t]) + (foldl (\t adv -> App cidAdvVP [t, adv]) + (App cidUseV [v]) + advs) + adVs) + +pV cat = + do inside "V" $ + do v <- lemma "VVDv" (mkCId cat) "s VPast" + return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v) + `mplus` + do v <- lemma "VVDt" (mkCId cat) "s VPast" + return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v) + `mplus` + do v <- lemma "VVZv" (mkCId cat) "s VPres" + return (App cidTTAnt [App cidTPres [],App cidASimul []],App cidPPos [],v) + `mplus` + do match "VHD" + match "VHD" + v <- lemma "VVNv" (mkCId cat) "s VPPart" + return (App cidTTAnt [App cidTPres [],App cidAAnter []],App cidPPos [],v) + `mplus` + do v <- match "V" + return (App (mkCId "XXX") [],App (mkCId "XXX") [],v) + +pAdV = + do insideOpt "R:c" $ + lemma "RRR" (mkCId "AdV") "s" + `mplus` + do match "R:m" + +pObject = + match "P:u" + `mplus` + insideOpt "N:o" pNP + `mplus` + match "N:e" + `mplus` + match "M:e" + `mplus` + match "D:e" + `mplus` + match "P:e" + +pAdv = + do match "N:t" + `mplus` + do match "N:h" + `mplus` + do match "P:p" + `mplus` + do match "P:q" + `mplus` + do match "P:a" + `mplus` + do match "P:t" + `mplus` + do match "P:h" + `mplus` + do match "P:m" + `mplus` + do match "P:r" + `mplus` + do match "R:p" + `mplus` + do match "R:q" + `mplus` + do match "R:a" + `mplus` + do match "R:t" + `mplus` + do match "R:h" + `mplus` + do match "R:m" + `mplus` + do match "R:c" + `mplus` + do match "R:r" + `mplus` + do match "F:p" + `mplus` + do match "F:q" + `mplus` + do match "F:a" + `mplus` + do match "F:t" + `mplus` + do match "F:h" + `mplus` + do match "F:m" + `mplus` + do match "F:r" + `mplus` + do match "W:b" + `mplus` + do match "L:b" + +pNP = do + q <- pQuant + (n,cn) <- pCN + return (App cidDetCN [App cidDetQuant [q,n],cn]) + +pQuant = + do lemma "AT" (mkCId "Quant") "s False Sg" + `mplus` + do match "AT1" + return (App cidIndefArt []) + +pCN = + do np <- insideOpt "N" pNP + (n,cn) <- pCN + return (n,App (mkCId "Appos") [np,cn]) + `mplus` + do a <- lemma "JJ" (mkCId "A") "s (AAdj Posit Nom)" + (n,cn) <- pCN + return (n,App cidAdjCN [App cidPositA [a],cn]) + `mplus` + do (num,n) <- pN + advs <- many pPo + return (num, + foldl (\t adv -> App cidAdvCN [t, adv]) + (App cidUseN [n]) + advs) + +pN = + do n <- lemma "NN1c" (mkCId "N") "s Sg Nom" + return (App cidNumSg [], n) + `mplus` + do n <- lemma "NN1n" (mkCId "N") "s Sg Nom" + return (App cidNumSg [], n) + +pPo = + insideOpt "Po" $ do + p <- match "IO" + np <- insideOpt "N" pNP + return (App cidPrepNP [p,np])