From 6011fbc033b95ed3e703a61cf1162f4d032ad4ca Mon Sep 17 00:00:00 2001 From: krasimir Date: Fri, 13 Nov 2015 13:05:21 +0000 Subject: [PATCH] a new version of the conversion script for the Susanne corpus which covers a bit less than half of the data --- treebanks/susanne/Idents.hs | 148 ++++++ treebanks/susanne/Parser.hs | 59 ++- treebanks/susanne/SusanneFormat.hs | 70 ++- treebanks/susanne/convert.hs | 825 +++++++++++++++++++++++------ 4 files changed, 903 insertions(+), 199 deletions(-) create mode 100644 treebanks/susanne/Idents.hs diff --git a/treebanks/susanne/Idents.hs b/treebanks/susanne/Idents.hs new file mode 100644 index 000000000..14a731f76 --- /dev/null +++ b/treebanks/susanne/Idents.hs @@ -0,0 +1,148 @@ +module Idents where + +import SusanneFormat + +cidASimul = app0 "ASimul" +cidAAnter = app0 "AAnter" +cidPositAdvAdj = app0 "PositAdvAdj" +cidPositAdVAdj = app0 "PositAdVAdj" +cidUseCl = app3 "UseCl" +cidPredVP = app2 "PredVP" +cidSlashVP = app2 "SlashVP" +cidVPSlashPrep = app2 "VPSlashPrep" +cidComplPredVP = app0 "ComplPredVP" +cidAdjCN = app2 "AdjCN" +cidUseN = app1 "UseN" +cidDetQuant = app2 "DetQuant" +cidDetQuantOrd = app3 "DetQuantOrd" +cidNumSg = app0 "NumSg" +cidNumPl = app0 "NumPl" +cidDetCN = app2 "DetCN" +cidIndefArt = app0 "IndefArt" +cidDefArt = app0 "DefArt" +cidUsePN = app1 "UsePN" +cidUseQuantPN = app0 "UseQuantPN" +cidSymbPN = app1 "SymbPN" +cidMkSymb = app1 "MkSymb" +cidUsePron = app1 "UsePron" +cidConjNP = app0 "ConjNP" +cidBaseNP = app0 "BaseNP" +cidConsNP = app0 "ConsNP" +cidConjCN = app0 "ConjCN" +cidBaseCN = app0 "BaseCN" +cidConsCN = app0 "ConsCN" +cidConjAdv = app0 "ConjAdv" +cidBaseAdv = app0 "BaseAdv" +cidConsAdv = app0 "ConsAdv" +cidBaseS = app0 "BaseS" +cidConsS = app0 "ConsS" +cidConjS = app0 "ConjS" +cidMassNP = app1 "MassNP" +cidAdvNP = app2 "AdvNP" +cidTPres = app0 "TPres" +cidTPast = app0 "TPast" +cidTFut = app0 "TFut" +cidTCond = app0 "TCond" +cidTTAnt = app2 "TTAnt" +cidPPos = app0 "PPos" +cidPNeg = app0 "PNeg" +cidComplSlash = app2 "ComplSlash" +cidSlashV2a = app1 "SlashV2a" +cidSlashV2A = app2 "SlashV2A" +cidComplVS = app2 "ComplVS" +cidComplVV = app4 "ComplVV" +cidUseV = app1 "UseV" +cidAdVVP = app2 "AdVVP" +cidAdvVP = app2 "AdvVP" +cidAdvVPSlash = app2 "AdvVPSlash" +cidPrepNP = app2 "PrepNP" +cidto_Prep = app0 "to_Prep" +cidsuch_as_Prep= app0 "such_as_Prep" +cidPastPartAP = app1 "PastPartAP" +cidPassVPSlash = app0 "PassVPSlash" +cidAdvS = app2 "AdvS" +cidPositA = app1 "PositA" +cidIDig = app0 "IDig" +cidIIDig = app0 "IIDig" +cidNumCard = app0 "NumCard" +cidNumDigits = app0 "NumDigits" +cidNumNumeral = app0 "NumNumeral" +cidnum = app0 "num" +cidpot2as3 = app0 "pot2as3" +cidpot1as2 = app0 "pot1as2" +cidpot0as1 = app0 "pot0as1" +cidpot01 = app0 "pot01" +cidpot0 = app0 "pot0" +cidn2 = app0 "n2" +cidn3 = app0 "n3" +cidn4 = app0 "n4" +cidn5 = app0 "n5" +cidn6 = app0 "n6" +cidn7 = app0 "n7" +cidn8 = app0 "n8" +cidn9 = app0 "n9" +cidPossPron = app1 "PossPron" +cidCompAP = app1 "CompAP" +cidCompNP = app1 "CompNP" +cidCompAdv = app1 "CompAdv" +cidCompS = app1 "CompS" +cidCompVP = app1 "CompVP" +cidUseComp = app1 "UseComp" +cidCompoundSgCN= app2 "CompoundSgCN" +cidCompoundPlCN= app2 "CompoundPlCN" +cidDashSgN = app2 "DashSgN" +cidDashPlN = app2 "DashPlN" +cidProgrVP = app0 "ProgrVP" +cidGerundN = app0 "GerundN" +cidGerundAP = app0 "GerundAP" +cidGenNP = app1 "GenNP" +cidPredetNP = app1 "PredetNP" +cidDetNP = app1 "DetNP" +cidAdAP = app2 "AdAP" +cidAdvAP = app2 "AdvAP" +cidPositAdAAdj = app1 "PositAdAAdj" +cideither7or_DConj = app0 "either7or_DConj" +cidboth7and_DConj = app0 "both7and_DConj" +cidor_Conj = app0 "or_Conj" +cidand_Conj = app0 "and_Conj" +cidamp_Conj = app0 "amp_Conj" +cidSlashV2V = app0 "SlashV2V" +cidComplVA = app0 "ComplVA" +cidAdNum = app0 "AdNum" +cidOrdSuperl = app1 "OrdSuperl" +cidno_RP = app0 "no_RP" +cidthat_RP = app0 "that_RP" +cidUseRCl = app3 "UseRCl" +cidRelSlash = app2 "RelSlash" +cidRelNP = app2 "RelNP" +cidRelCN = app2 "RelCN" +cidRelVP = app2 "RelVP" +cidIdRP = app0 "IdRP" +cidmany_Det = app0 "many_Det" +cidImpVP = app1 "ImpVP" +cidExistNP = app2 "ExistNP" +cidExtAdvS = app2 "ExtAdvS" +cidAdvCN = app2 "AdvCN" +cidNameCN = app2 "NameCN" +cidno_Quant = app0 "no_Quant" +cidSubjS = app2 "SubjS" +cidthat_Subj = app0 "that_Subj" +cidanySg_Det = app0 "anySg_Det" +cidanyPl_Det = app0 "anyPl_Det" +cidhave_V2 = app0 "have_V2" +cidby_Prep = app0 "by_Prep" +cidweekdayPunctualAdv = app1 "weekdayPunctualAdv" +cidi_Pron = app0 "i_Pron" +cidyouSg_Pron = app0 "youSg_Pron" +cidhe_Pron = app0 "he_Pron" +cidshe_Pron = app0 "she_Pron" +cidit_Pron = app0 "it_Pron" +cidwe_Pron = app0 "we_Pron" +cidthey_Pron = app0 "they_Pron" +cidUseComparA = app1 "UseComparA" + +app0 f = App f [] +app1 f = \x -> App f [x] +app2 f = \x y -> App f [x,y] +app3 f = \x y z -> App f [x,y,z] +app4 f = \w x y z -> App f [w,x,y,z] diff --git a/treebanks/susanne/Parser.hs b/treebanks/susanne/Parser.hs index 62e362a9f..f34bb3423 100644 --- a/treebanks/susanne/Parser.hs +++ b/treebanks/susanne/Parser.hs @@ -3,29 +3,30 @@ module Parser where import Data.Char import Control.Monad -import PGF(PGF,Morpho,lookupMorpho,functionType,unType) +import PGF2 import SusanneFormat +import Debug.Trace -newtype P a = P {runP :: PGF -> Morpho -> [ParseTree] -> Maybe ([ParseTree], a)} +newtype P a = P {runP :: PGF -> Concr -> [ParseTree] -> Maybe ([ParseTree], a)} instance Monad P where - return x = P (\pgf morpho ts -> Just (ts, x)) - f >>= g = P (\pgf morpho ts -> case runP f pgf morpho ts of + return x = P (\pgf cnc ts -> Just (ts, x)) + f >>= g = P (\pgf cnc ts -> case runP f pgf cnc ts of Nothing -> Nothing - Just (ts,x) -> runP (g x) pgf morpho ts) + Just (ts,x) -> runP (g x) pgf cnc ts) instance MonadPlus P where - mzero = P (\pgf morpho ts -> Nothing) - mplus f g = P (\pgf morpho ts -> mplus (runP f pgf morpho ts) (runP g pgf morpho ts)) + mzero = P (\pgf cnc ts -> Nothing) + mplus f g = P (\pgf cnc ts -> mplus (runP f pgf cnc ts) (runP g pgf cnc ts)) -match tag_spec = P (\pgf morpho ts -> +match convert tag_spec = P (\pgf cnc ts -> case ts of (t@(Phrase tag1 mods1 fn1 _ _):ts) | tag == tag1 && all (flip elem mods1) mods && - (null fn || fn == fn1) -> Just (ts,t) + (null fn || fn == fn1) -> Just (ts,convert pgf cnc t) (t@(Word _ tag1 _ _):ts) - | tag == tag1 -> Just (ts,t) + | tag == tag1 && null mods-> Just (ts,convert pgf cnc t) _ -> Nothing) where (f,_) = readTag (Word "" undefined undefined undefined) tag_spec @@ -43,12 +44,12 @@ many f = `mplus` do return [] -inside tag_spec p = P (\pgf morpho ts -> +inside tag_spec p = P (\pgf cnc ts -> case ts of (t@(Phrase tag1 mods1 fn1 _ ts'):ts) | tag == tag1 && all (flip elem mods1) mods && - (null fn || fn == fn1) -> case runP p pgf morpho ts' of + (null fn || fn == fn1) -> case runP p pgf cnc ts' of Just ([],x) -> Just (ts,x) _ -> Nothing _ -> Nothing) @@ -56,35 +57,45 @@ inside tag_spec p = P (\pgf morpho ts -> (f,_) = readTag (Word "" undefined undefined undefined) tag_spec Phrase tag mods fn _ _ = f [] -insideOpt tag_spec p = P (\pgf morpho ts -> +insideOpt convert tag_spec p = P (\pgf cnc ts -> case ts of (t@(Phrase tag1 mods1 fn1 _ ts'):ts) | tag == tag1 && all (flip elem mods1) mods && - (null fn || fn == fn1) -> case runP p pgf morpho ts' of + (null fn || fn == fn1) -> case runP p pgf cnc ts' of Just ([],x) -> Just (ts,x) - _ -> Just (ts,t) + _ -> Just (ts,convert pgf cnc t) _ -> Nothing) where (f,_) = readTag (Word "" undefined undefined undefined) tag_spec Phrase tag mods fn _ _ = f [] -lemma tag cat an0 = P (\pgf morpho ts -> +lemma tag cat an0 = P (\pgf cnc ts -> case ts of - (t@(Word _ tag1 form _):ts) | tag == tag1 -> - case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of - [f] -> Just (ts,App f []) - _ -> Just (ts,t) - _ -> Nothing) + (t@(Word _ tag1 form _):ts) | tag == tag1 -> case runP (lookupForm cat an0 form) pgf cnc ts of + Nothing -> Just (ts,t) + x -> x + _ -> Nothing) + +lookupForm cat an0 form = P (\pgf cnc ts -> + case [f | (f,an,_) <- lookupMorpho cnc form, hasCat pgf f cat, an == an0] of + [] -> case [f | (f,an,_) <- lookupMorpho cnc (map toLower form), hasCat pgf f cat, an == an0] of + [f] -> Just (ts,App f []) + _ -> Nothing + [f] -> Just (ts,App f []) + _ -> Nothing) where hasCat pgf f cat = case functionType pgf f of - Just ty -> case unType ty of - (_,cat1,_) -> cat1 == cat - Nothing -> False + (DTyp _ cat1 _) -> cat1 == cat opt f = do x <- f return (Just x) `mplus` do return Nothing + +word tag = P (\pgf cnc ts -> + case ts of + ((Word _ tag1 form _):ts) | tag == tag1 -> Just (ts,form) + _ -> Nothing) diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs index 43a685a0f..04c9fbbc8 100644 --- a/treebanks/susanne/SusanneFormat.hs +++ b/treebanks/susanne/SusanneFormat.hs @@ -1,7 +1,7 @@ module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where -import PGF(CId) import Data.Char +import qualified Data.Map as Map type Tag = String type Mods = String @@ -14,7 +14,8 @@ type Lemma = String data ParseTree = Phrase Tag Mods Fn Index [ParseTree] | Word Id Tag Word Lemma - | App CId [ParseTree] + | App String [ParseTree] + | Lit String deriving Eq data ParseTreePos @@ -28,14 +29,15 @@ instance Show ParseTree where | otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]" show (Word _ tag w _) = "["++tag++" "++w++"]" show (App f ts) - | null ts = show f - | otherwise = "("++show f++" "++unwords (map show ts)++")" + | null ts = f + | otherwise = "("++f++" "++unwords (map show ts)++")" + show (Lit s) = show s readTreebank ls = readLines Root (map words ls) readLines p [] = [] readLines p ([id,_,tag,w,l,parse]:ls) = - readParse (Word id tag w l) p parse ls + readParse (Word id tag (readWord w) l) p parse ls readParse w p [] ls = readLines p ls readParse w p ('[':cs) ls = @@ -81,3 +83,61 @@ readTag w (c:cs) -- phrase tag readTag w cs = readError w readError (Word id _ _ _) = error id + +readWord w0 = replaceEntities w2 + where + w1 | head w0 == '+' = tail w0 + | otherwise = w0 + w2 | last w1 == '+' = init w1 + | otherwise = w1 + + replaceEntities [] = [] + replaceEntities ('<':cs) = + let (e,'>':cs1) = break (=='>') cs + in case Map.lookup e entity_names of + Just c -> c : replaceEntities cs1 + Nothing -> "<"++e++">"++ replaceEntities cs1 + replaceEntities (c: cs) = c : replaceEntities cs + +entity_names = Map.fromList + [("agr",'α') + ,("agrave",'à') + ,("apos",'\'') + ,("auml",'ä') + ,("bgr",'β') + ,("blank",' ') + ,("ccedil",'ç') + ,("deg",'°') + ,("dollar",'$') + ,("eacute",'é') + ,("egr",'ε') + ,("egrave",'è') + ,("frac12",'½') + ,("frac14",'¼') + ,("ggr",'γ') + ,("hellip",'…') + ,("hyphen",'-') + ,("iuml",'ï') + ,("khgr",'χ') + ,("ldquo",'“') + ,("lgr",'λ') + ,("lsquo",'‘') + ,("mdash",'—') + ,("mgr",'μ') + ,("minus",'-') + ,("ntilde",'ñ') + ,("oelig",'œ') + ,("ouml",'ö') + ,("para",'¶') + ,("pgr",'π') + ,("phgr",'φ') + ,("prime",'′') + ,("Prime",'″') + ,("rdquo",'”') + ,("rgr",'ρ') + ,("rsquo",'’') + ,("sect",'§') + ,("sol",'/') + ,("tggr",'θ') + ] + diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs index e413dffb7..edeea2825 100644 --- a/treebanks/susanne/convert.hs +++ b/treebanks/susanne/convert.hs @@ -5,92 +5,63 @@ import Data.Char(toLower) import Control.Monad import qualified Data.Map as Map -import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType) +import PGF2 import SusanneFormat import Parser import Idents -Just eng = readLanguage "DictEng" - main = do - gr <- readPGF "DictEngAbs.pgf" - let morpho = buildMorpho gr eng + gr <- readPGF "ParseEngAbs.pgf" + let Just eng = Map.lookup "ParseEng" (languages gr) fs <- getDirectoryContents "data" txts <- (mapM (\f -> readFile ("data" f)) . filter ((/= ".") . take 1)) (sort fs) - --let ts' = readTreebank (lines (concat txts)) - --writeFile "text" (unlines (map show ts')) - let (ts',rs') = combineRes (convert gr morpho) (readTreebank (lines (concat txts))) - let rm = Map.fromListWith (++) rs' - writeFile "susanne.gft" (unlines (map show ts')) - writeFile "rules" (unlines (concat [unwords ("-":cat:"->":cats) : map (\t -> " "++show t) rs'' | (cat :-> cats,rs'') <- Map.toList rm])) + let ts = (map (convert gr eng) . concatMap filterTree) (readTreebank (lines (concat txts))) + writeFile "susanne.gft" (unlines (map show ts)) -data Rule = Tag :-> [Tag] - deriving (Eq,Ord) - -convert pgf morpho w@(Word _ tag _ lemma) - | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = ([],[]) -{- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w - | tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w - | tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w - | tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w - | tag == "JB" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w - | tag == "JBo" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w - | tag == "AT" = convertLemma pgf morpho (mkCId "Quant") "s False Sg" w - | tag == "VVDi" = convertLemma pgf morpho (mkCId "V") "s VPast" w - | tag == "VVDt" = convertLemma pgf morpho (mkCId "V2") "s VPast" w - | tag == "VVDv" = convertLemma pgf morpho (mkCId "V") "s VPast" w - | tag == "VVZi" = convertLemma pgf morpho (mkCId "V") "s VPres" w - | tag == "VVZt" = convertLemma pgf morpho (mkCId "V2") "s VPres" w - | tag == "VVZv" = convertLemma pgf morpho (mkCId "V") "s VPres" w - | tag == "PPHS2"= convertLemma pgf morpho (mkCId "Pron") "s (NCase Nom)" w - | tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w - | tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w - | tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w - | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w-} - | otherwise = ([w],[]) -convert pgf morpho t@(Phrase tag mods fn idx ts) - | tag == "O" = (ts',rs') - | tag == "Q" = (ts',rs') - | tag == "S" = case runP pS pgf morpho ts' of - Just ([],x) -> ([x], rs') - _ -> ([Phrase tag mods fn idx ts'], (r,[t]) : rs') - | otherwise = ([Phrase tag mods fn idx ts'], (r,[t]) : rs') +filterTree w@(Word _ tag _ lemma) + | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = [] + | otherwise = [w] +filterTree (Phrase tag mods fn idx ts) + | tag == "O" = ts' + | tag == "Q" = ts' + | otherwise = [Phrase tag mods fn idx ts'] where - (ts',rs') = combineRes (convert pgf morpho) ts - r = tag :-> map getTag ts + ts' = concatMap filterTree ts - isExtra (Word _ "YIL" _ _) = True - isExtra (Word _ "YIR" _ _) = True - isExtra (Word _ "YTL" _ _) = True - isExtra (Word _ "YTR" _ _) = True - isExtra _ = False - - getTag (Phrase tag mods fn idx ts) = tag++if null fn then "" else ":"++fn - getTag (Word _ tag _ _) = tag - -convertLemma pgf morpho cat an0 w@(Word _ tag form _) = - case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of - [f] -> ([App f []], []) - _ -> ([w],[]) +convert pgf eng t@(Phrase tag mods fn idx ts) + | tag == "S" = case runP pS pgf eng ts of + Just ([],x) -> x + _ -> Phrase tag mods fn idx ts' + | tag == "N" = case runP pNP pgf eng ts of + Just ([],x) -> x + _ -> Phrase tag mods fn idx ts' + | tag == "V" = case runP (pV "V") pgf eng [t] of + Just ([],(_,_,_,_,x)) -> x + _ -> Phrase tag mods fn idx ts' + | tag == "P" = case runP pPP pgf eng ts of + Just ([],x) -> x + _ -> Phrase tag mods fn idx ts' + | tag == "Po"= case runP pPP pgf eng ts of + Just ([],x) -> x + _ -> Phrase tag mods fn idx ts' + | otherwise = Phrase tag mods fn idx ts' where - hasCat pgf f cat = - case functionType pgf f of - Just ty -> case unType ty of - (_,cat1,_) -> cat1 == cat - Nothing -> False - -combineRes f ts = (ts',rs') - where - (x,y) = unzip (map f ts) - ts' = concat x - rs' = concat y + ts' = map (convert pgf eng) ts +convert pgf eng t@(Word _ tag _ lemma) + | take 2 tag == "NN" = case runP pN pgf eng [t] of + Just ([],(_,x)) -> x + _ -> t + | take 1 tag == "J" = case runP pAP pgf eng [t] of + Just ([],x) -> x + _ -> t + | otherwise = t pS = do mplus pConj (return ()) advs <- many pAdS np <- pSubject (t,p,vp) <- pVP - return (foldr ($) (cidUseCl (cidTTAnt t p) (cidPredVP np vp)) advs) + return (foldr ($) (cidUseCl t p (cidPredVP np vp)) advs) `mplus` do mplus pConj (return ()) (t,p,vp) <- pVP @@ -98,44 +69,56 @@ pS = `mplus` do mplus pConj (return ()) advs <- many pAdS - t1 <- match "EX" + t1 <- match convert "EX" (t,p,vp) <- pVP - return (foldr ($) (cidUseCl (cidTTAnt t p) (cidExistNP t1 vp)) advs) + return (foldr ($) (cidUseCl t p (cidExistNP t1 vp)) advs) pSubject = - do insideOpt "N:s" pNP + do insideOpt convert "N:s" pNP `mplus` - do insideOpt "N:S" pNP + do insideOpt convert "N:S" pNP `mplus` - do match "M:s" + do match convert "M:s" `mplus` - do match "M:S" + do match convert "M:S" `mplus` - do match "D:s" + do insideOpt convert "Ds:s" $ do + det <- pDet + return (cidDetNP (det cidNumSg)) `mplus` - do match "D:S" + do insideOpt convert "Dp:s" $ do + det <- pDet + return (cidDetNP (det cidNumPl)) + `mplus` + do insideOpt convert "Ds:S" $ do + det <- pDet + return (cidDetNP (det cidNumSg)) + `mplus` + do insideOpt convert "Dp:S" $ do + det <- pDet + return (cidDetNP (det cidNumPl)) pConj = - do match "CC" + do match convert "CC" return () `mplus` - do match "CCB" + do match convert "CCB" return () pAdS = - do adv <- pAdv - match "YC" + do adv <- pVPMods + match convert "YC" return (\t -> cidExtAdvS adv t) `mplus` - do adv <- pAdv + do adv <- pVPMods return (\t -> cidAdvS adv t) pVP = do adVs <- many pAdV - (t,p,vs) <- pV "VS" - advs <- many pAdv - s <- insideOpt "F:o" - (opt (match "CST") >> pS) + (t,p,voice,apect,vs) <- pV "VS" + advs <- many pVPMods + s <- insideOpt convert "F:o" + (opt (match convert "CST") >> pS) return (t,p,foldr (\adv t -> cidAdVVP adv t) (foldl (\t adv -> cidAdvVP t adv) (cidComplVS vs s) @@ -143,20 +126,22 @@ pVP = adVs) `mplus` do adVs <- many pAdV - (t,p,vv) <- pV "VV" - advs <- many pAdv - vp <- match "Ti" + (t,p,voice,apect,vv) <- pV "VV" + advs <- many pVPMods + (p2,voice,aspect,vp) <- inside "Ti" $ do + match convert "s" + pVPInf return (t,p,foldr (\adv t -> cidAdVVP adv t) (foldl (\t adv -> cidAdvVP t adv) - (cidComplVV vv vp) + (cidComplVV vv cidASimul p2 vp) advs) adVs) `mplus` do adVs <- many pAdV - (t,p,v2) <- pV "V2" + (t,p,voice,apect,v2) <- pV "V2" o <- pObject - opt (match "YC") -- what is this? - advs <- many pAdv + opt (match convert "YC") -- what is this? + advs <- many pVPMods return (t,p,foldr (\adv t -> cidAdVVP adv t) (foldl (\t adv -> cidAdvVP t adv) (cidComplSlash (cidSlashV2a v2) o) @@ -164,141 +149,641 @@ pVP = adVs) `mplus` do adVs <- many pAdV - (t,p,v) <- pV "V" - advs <- many pAdv + (t,p,voice,apect,v) <- pV "V" + advs <- many pVPMods return (t,p,foldr (\adv t -> cidAdVVP adv t) (foldl (\t adv -> cidAdvVP t adv) (cidUseV v) advs) adVs) + `mplus` + do inside "V" (match convert "VBZ") + adVs <- many pAdV + p <- pPol + comp <- pComp + advs <- many pVPMods + return (cidTTAnt cidTPres cidASimul,p,foldr (\adv t -> cidAdVVP adv t) + (foldl (\t adv -> cidAdvVP t adv) + (cidUseComp comp) + advs) + adVs) + +pComp = + do adv <- insideOpt convert "R:e" pAdv + return (cidCompAdv adv) + `mplus` + do np <- insideOpt convert "N:e" pNP + return (cidCompNP np) + `mplus` + do ap <- pAP + return (cidCompAP ap) + +pAdv = + do lemma "RP" "Adv" "s" + +data Voice = Active | Passive +data Aspect = Simple | Progressive pV cat = do inside "V" $ - do v <- lemma "VVDv" (mkCId cat) "s VPast" - return (cidTTAnt cidTPast cidASimul,cidPPos,v) + do v <- pVPres cat + return (cidTTAnt cidTPres cidASimul,cidPPos,Active,Simple,v) `mplus` - do v <- lemma "VVDt" (mkCId cat) "s VPast" - return (cidTTAnt cidTPast cidASimul,cidPPos,v) + do v <- do lemma "VVZi" cat "s VPres" + `mplus` + do lemma "VVZt" cat "s VPres" + `mplus` + do lemma "VVZv" cat "s VPres" + return (cidTTAnt cidTPres cidASimul,cidPPos,Active,Simple,v) `mplus` - do v <- lemma "VVZv" (mkCId cat) "s VPres" - return (cidTTAnt cidTPres cidASimul,cidPPos,v) + do v <- do lemma "VVDi" cat "s VPast" + `mplus` + do lemma "VVDt" cat "s VPast" + `mplus` + do lemma "VVDv" cat "s VPast" + return (cidTTAnt cidTPast cidASimul,cidPPos,Active,Simple,v) `mplus` - do match "VHD" - match "VHD" - v <- lemma "VVNv" (mkCId cat) "s VPPart" - return (cidTTAnt cidTPres cidAAnter,cidPPos,v) + do (match convert "VBM" `mplus` match convert "VBR" `mplus` match convert "VBZ") -- am,are,is + pol <- pPol + (voice,aspect,v) <- pVPart cat + return (cidTTAnt cidTPres cidASimul,pol,voice,aspect,v) + `mplus` + do (match convert "VBDZ" `mplus` match convert "VBDR") -- was,were + pol <- pPol + (voice,aspect,v) <- pVPart cat + return (cidTTAnt cidTPast cidASimul,pol,voice,aspect,v) + `mplus` + do match convert "VH0" -- have + pol <- pPol + (voice,aspect,v) <- do v <- pVPastPart cat + return (Active,Simple,v) + `mplus` + do match convert "VBN" -- been + pVPart cat + return (cidTTAnt cidTPres cidAAnter,pol,voice,aspect,v) + `mplus` + do match convert "VH0" -- have + return (cidTTAnt cidTPres cidAAnter,cidPPos,Active,Simple,cidhave_V2) + `mplus` + do match convert "VHZ" -- has + pol <- pPol + (voice,aspect,v) <- do v <- pVPastPart cat + return (Active,Simple,v) + `mplus` + do match convert "VBN" -- been + pVPart cat + return (cidTTAnt cidTPres cidAAnter,pol,voice,aspect,v) + `mplus` + do match convert "VHZ" -- has + return (cidTTAnt cidTPres cidAAnter,cidPPos,Active,Simple,cidhave_V2) + `mplus` + do match convert "VHD" -- had + pol <- pPol + (voice,aspect,v) <- do v <- pVPastPart cat + return (Active,Simple,v) + `mplus` + do match convert "VBN" -- been + pVPart cat + return (cidTTAnt cidTPast cidAAnter,pol,voice,aspect,v) + `mplus` + do match convert "VHD" -- had + return (cidTTAnt cidTPast cidASimul,cidPPos,Active,Simple,cidhave_V2) + `mplus` + do w <- word "VMo" -- will + guard (w == "will") + pol <- pPol + (voice,apect,vp) <- pVInf cat + return (cidTTAnt cidTFut cidASimul,pol,voice,apect,vp) + `mplus` + do mplus (match convert "VD0") (match convert "VDZ") -- do,does + match convert "XX" + vp <- pVPres cat + return (cidTTAnt cidTPres cidASimul,cidPNeg,Active,Simple,vp) + `mplus` + do match convert "VDD" -- did + match convert "XX" + vp <- pVPres cat + return (cidTTAnt cidTPast cidASimul,cidPNeg,Active,Simple,vp) + + +pVPInf = do + adVs <- many pAdV + (pol,voice,apect,vp) <- do (pol,voice,apect,v2) <- inside "Vi" $ + do pol <- pPol + match convert "TO" + (voice,aspect,v) <- pVInf "V2" + return (pol,voice,aspect,v) + o <- pObject + return (pol,voice,apect,cidComplSlash (cidSlashV2a v2) o) + `mplus` + do (pol,voice,apect,v2a) <- inside "Vi" $ + do pol <- pPol + match convert "TO" + (voice,aspect,v) <- pVInf "V2A" + return (pol,voice,aspect,v) + o <- pObject + ap <- pAP + return (pol,voice,apect,cidComplSlash (cidSlashV2A v2a ap) o) + advs <- many pVPMods + return (pol,voice,apect, + foldr (\adv t -> cidAdVVP adv t) + (foldl (\t adv -> cidAdvVP t adv) + vp + advs) + adVs) + +pVInf cat = + do v <- pVPres cat + return (Active,Simple,v) `mplus` - do v <- match "V" - return (App (mkCId "XXX") [],App (mkCId "XXX") [],v) + do match convert "VB0" -- be + pVPart cat + `mplus` + do v <- match convert "VH0" -- have + return (Active,Simple,v) + +pVPart cat = + do v <- do lemma "VVGi" cat "s VPresPart" + `mplus` + do lemma "VVGt" cat "s VPresPart" + `mplus` + do lemma "VVGv" cat "s VPresPart" + return (Active,Progressive,v) + `mplus` + do v <- pVPastPart cat + return (Passive,Simple,v) + +pVPres cat = + do lemma "VV0i" cat "s VInf" + `mplus` + do lemma "VV0t" cat "s VInf" + `mplus` + do lemma "VV0v" cat "s VInf" + +pVPastPart cat = + do lemma "VVNi" cat "s VPPart" + `mplus` + do lemma "VVNt" cat "s VPPart" + `mplus` + do lemma "VVNv" cat "s VPPart" + +pPol = + do match convert "XX" + return cidPNeg + `mplus` + do return cidPPos pAdV = - do insideOpt "R:c" $ - lemma "RRR" (mkCId "AdV") "s" + do insideOpt convert "R:c" $ + lemma "RRR" "AdV" "s" `mplus` - do match "R:m" + do insideOpt convert "R:m" $ + lemma "RRR" "AdV" "s" pObject = - match "P:u" + match convert "P:u" `mplus` - insideOpt "N:o" pNP + insideOpt convert "N:o" pNP `mplus` - match "N:e" + match convert "N:e" `mplus` - match "M:e" + match convert "M:e" `mplus` - match "D:e" + do insideOpt convert "Ds:e" $ do + det <- pDet + return (cidDetNP (det cidNumSg)) `mplus` - match "P:e" + do insideOpt convert "Dp:e" $ do + det <- pDet + return (cidDetNP (det cidNumPl)) + `mplus` + do insideOpt convert "P:e" $ pPP -pAdv = - do match "N:t" +pVPMods = + do insideOpt convert "N:t" pTimeNPAdv `mplus` - do match "N:h" + do match convert "N:h" `mplus` - do match "P:p" + do insideOpt convert "P:p" $ pPP `mplus` - do match "P:q" + do insideOpt convert "P:q" $ pPP `mplus` - do match "P:a" + do insideOpt convert "Pb:a" $ do + match convert "IIb" + np <- insideOpt convert "N" pNP + return (cidPrepNP cidby_Prep np) `mplus` - do match "P:t" + do insideOpt convert "P:t" $ pPP `mplus` - do match "P:h" + do insideOpt convert "P:h" $ pPP `mplus` - do match "P:m" + do insideOpt convert "P:m" $ pPP `mplus` - do match "P:r" + do insideOpt convert "P:r" $ pPP `mplus` - do match "R:p" + do insideOpt convert "R:p" $ pAdv `mplus` - do match "R:q" + do insideOpt convert "R:q" $ pAdv `mplus` - do match "R:a" + do insideOpt convert "R:a" $ pAdv `mplus` - do match "R:t" + do insideOpt convert "R:t" $ pAdv `mplus` - do match "R:h" + do insideOpt convert "R:h" $ pAdv `mplus` - do match "R:m" + do insideOpt convert "R:m" $ pAdv `mplus` - do match "R:c" + do insideOpt convert "R:c" $ pAdv `mplus` - do match "R:r" + do insideOpt convert "R:r" $ pAdv `mplus` - do match "F:p" + do match convert "F:p" `mplus` - do match "F:q" + do match convert "F:q" `mplus` - do match "F:a" + do match convert "F:a" `mplus` - do match "F:t" + do match convert "F:t" `mplus` - do match "F:h" + do match convert "F:h" `mplus` - do match "F:m" + do match convert "F:m" `mplus` - do match "F:r" + do match convert "F:r" `mplus` - do match "W:b" + do match convert "W:b" `mplus` - do match "L:b" + do match convert "L:b" -pNP = do - q <- pQuant - (n,cn) <- pCN - return (cidDetCN (cidDetQuant q n) cn) +pPP = + do prep <- do lemma "ICS" "Prep" "s" + `mplus` + do lemma "ICSk" "Prep" "s" + `mplus` + do lemma "ICSt" "Prep" "s" + `mplus` + do lemma "ICSx" "Prep" "s" + `mplus` + do lemma "IF" "Prep" "s" + `mplus` + do lemma "II" "Prep" "s" + `mplus` + do lemma "IIa" "Prep" "s" + `mplus` + do lemma "IIb" "Prep" "s" + `mplus` + do lemma "IIg" "Prep" "s" + `mplus` + do lemma "IIp" "Prep" "s" + `mplus` + do lemma "IIt" "Prep" "s" + `mplus` + do lemma "IO" "Prep" "s" + `mplus` + do lemma "IW" "Prep" "s" + `mplus` + do insideOpt convert "II=" $ do + w1 <- word "II21" + w2 <- word "II22" + lookupForm "Prep" "s" (unwords [w1,w2]) + np <- do insideOpt convert "N" pNP + `mplus` + do (mb_num,n) <- pN + case mb_num of + Just num | num == cidNumPl -> return (cidDetCN (cidDetQuant cidIndefArt num) (cidUseN n)) + _ -> return (cidMassNP (cidUseN n)) -- we don't know the number + return (cidPrepNP prep np) + +pNP = + do np <- pBaseNP + match convert "YC" + fr <- insideOpt convert "Fr" pRCl + return (cidRelNP np fr) + `mplus` + do pBaseNP + +pBaseNP = + do det <- pDet + (mb_num,cn) <- pCN + case mb_num of + Just num -> return (cidDetCN (det num) cn) + Nothing -> mzero -- we don't know the number + `mplus` + do pn <- pName + return (cidUsePN pn) + `mplus` + do (mb_num,cn) <- pCN + case mb_num of + Just num | num == cidNumPl -> return (cidDetCN (cidDetQuant cidIndefArt num) cn) + _ -> return (cidMassNP cn) -- we don't know the number + `mplus` + do match convert "PPIS1" + return (cidUsePron cidi_Pron) + `mplus` + do match convert "PPY" + return (cidUsePron cidyouSg_Pron) + `mplus` + do match convert "PPHS1m" + return (cidUsePron cidhe_Pron) + `mplus` + do match convert "PPHS1f" + return (cidUsePron cidshe_Pron) + `mplus` + do match convert "PPH1" + return (cidUsePron cidit_Pron) + `mplus` + do match convert "PPIS2" + return (cidUsePron cidwe_Pron) + `mplus` + do match convert "PPHS2" + return (cidUsePron cidthey_Pron) + `mplus` + do match convert "Nn" + +pDet = + do match convert "DDy" + return (\num -> if num == cidNumSg + then cidanySg_Det + else cidanyPl_Det) + `mplus` + do det <- lemma "DA2" "Det" "s" + return (\num -> det) + `mplus` + do q <- pQuant + ord <- pOrd + return (\num -> cidDetQuantOrd q num ord) + `mplus` + do q <- pQuant + return (\num -> cidDetQuant q num) pQuant = - do lemma "AT" (mkCId "Quant") "s False Sg" + do match convert "AT" + return cidDefArt `mplus` - do match "AT1" + do match convert "AT1" return cidIndefArt + `mplus` + do match convert "ATn" + return cidno_Quant + `mplus` + do match convert "APPGi1" + return (cidPossPron cidi_Pron) + `mplus` + do match convert "APPGy" + return (cidPossPron cidyouSg_Pron) + `mplus` + do match convert "APPGm" + return (cidPossPron cidhe_Pron) + `mplus` + do match convert "APPGf" + return (cidPossPron cidshe_Pron) + `mplus` + do match convert "APPGh1" + return (cidPossPron cidit_Pron) + `mplus` + do match convert "APPGi2" + return (cidPossPron cidwe_Pron) + `mplus` + do match convert "APPGh2" + return (cidPossPron cidthey_Pron) + `mplus` + do lemma "DD1a" "Quant" "s True Sg" + `mplus` + do lemma "DD1i" "Quant" "s True Sg" + `mplus` + do lemma "DD2a" "Quant" "s True Pl" + `mplus` + do lemma "DD2i" "Quant" "s True Pl" + `mplus` + do lemma "DDi" "Quant" "s True Pl" + `mplus` + insideOpt convert "G" pGenitive + +pOrd = + do a <- lemma "JJT" "A" "s (AAdj Superl Nom)" + return (cidOrdSuperl a) + +pGenitive = + do np <- insideOpt convert "N" pNP + match convert "GG" + return (cidGenNP np) pCN = - do np <- insideOpt "N" pNP - (n,cn) <- pCN - return (n,App (mkCId "Appos") [np,cn]) + do pn <- mplus pName (insideOpt convert "Nn" pName) + (mb_num,cn) <- pCN + return (mb_num,cidNameCN pn cn) `mplus` - do a <- lemma "JJ" (mkCId "A") "s (AAdj Posit Nom)" - (n,cn) <- pCN - return (n,cidAdjCN (cidPositA a) cn) + do (mb_num_n,n) <- mplus pN (inside "N" pCN) + (mb_num,cn) <- pCN + case mb_num_n of + Just num | num == cidNumPl -> return (mb_num,cidCompoundPlCN (cidUseN n) cn) + _ -> return (mb_num,cidCompoundSgCN (cidUseN n) cn) -- here we don't really know the number `mplus` - do (num,n) <- pN - advs <- many pPo - return (num, - foldl (\t adv -> cidAdvCN t adv) + do ap <- pAP + (mb_num,cn) <- pCN + return (mb_num,cidAdjCN ap cn) + `mplus` + do t <- match convert "NN1u&" + mods <- many pCNMods + return (Just cidNumSg + ,foldl (\t mod -> mod t) + t + mods) + `mplus` + do (mb_num,n) <- pN + mods <- many pCNMods + return (mb_num, + foldl (\t mod -> mod t) (cidUseN n) - advs) + mods) -pN = - do n <- lemma "NN1c" (mkCId "N") "s Sg Nom" - return (cidNumSg, n) +pAP = + do a <- lemma "JJ" "A" "s (AAdj Posit Nom)" + `mplus` + lemma "JA" "A" "s (AAdj Posit Nom)" + `mplus` + lemma "JB" "A" "s (AAdj Posit Nom)" + `mplus` + lemma "JBo" "A" "s (AAdj Posit Nom)" + `mplus` + lemma "DAy" "A" "s (AAdj Posit Nom)" + `mplus` + lemma "DAz" "A" "s (AAdj Posit Nom)" + return (cidPositA a) `mplus` - do n <- lemma "NN1n" (mkCId "N") "s Sg Nom" - return (cidNumSg, n) + do a <- lemma "JJR" "A" "s (AAdj Compar Nom)" + return (cidUseComparA a) + `mplus` + do vp <- match convert "Tn" + return (cidPastPartAP vp) + `mplus` + do insideOpt convert "J" $ do + adas <- many pAdA + ap <- pAP + mods <- many pAPMods + return (foldl (\t ada -> cidAdAP ada t) + (foldl (\t mod -> cidAdvAP t mod) + ap + mods) + adas) -pPo = - insideOpt "Po" $ do - p <- match "IO" - np <- insideOpt "N" pNP - return (cidPrepNP p np) +pAdA = do + a <- lemma "RR" "A" "s AAdv" + return (cidPositAdAAdj a) + +pAPMods = + do insideOpt convert "P" pPP + +pN = + do n <- lemma "NNn" "N" "s Sg Nom" + return (Nothing, n) + `mplus` + do n <- lemma "NNu" "N" "s Sg Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNux" "N" "s Sg Nom" + return (Nothing, n) + `mplus` + do n <- lemma "NN1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NN1m" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NN1n" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NN1u" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NN1ux" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NN2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNJ1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNJ1n" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNJ2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNLc" "N" "s Sg Nom" + return (Nothing, n) + `mplus` + do n <- lemma "NNL1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNL1cb" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNL1n" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNL2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNS1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNS1n" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNS2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNT1h" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNT1m" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNT1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNT2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do n <- lemma "NNUc" "N" "s Sg Nom" + return (Nothing, n) + `mplus` + do n <- lemma "NNUn" "N" "s Sg Nom" + return (Nothing, n) + `mplus` + do n <- lemma "NNU1c" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNU1n" "N" "s Sg Nom" + return (Just cidNumSg, n) + `mplus` + do n <- lemma "NNU2" "N" "s Pl Nom" + return (Just cidNumPl, n) + `mplus` + do inside "Ns" $ do + (mb_num1,n1) <- pN + match convert "YH" + (mb_num2,n2) <- pN + case mb_num1 of + Just num | num == cidNumPl -> return (mb_num2,cidDashPlN n1 n2) + _ -> return (mb_num2,cidDashSgN n1 n2) -- here we don't really know the number + +pCNMods = + do adv <- insideOpt convert "Po" $ pPP + return (\t -> cidAdvCN t adv) + `mplus` + do adv <- insideOpt convert "P" $ pPP + return (\t -> cidAdvCN t adv) + `mplus` + do adv <- insideOpt convert "Fn" $ do + match convert "CST" + s <- pS + return (cidSubjS cidthat_Subj s) + return (\t -> cidAdvCN t adv) + `mplus` + do fr <- insideOpt convert "Fr" pRCl + return (\t -> cidRelCN t fr) + +pName = + do w1 <- word "NP1s" + w2 <- word "NNL1cb" + return (cidSymbPN (cidMkSymb (Lit (unwords [w1,w2])))) + `mplus` + do w1 <- word "NPM1" + match convert "YH" + w2 <- word "NPM1" + return (cidSymbPN (cidMkSymb (Lit (unwords [w1,"-",w2])))) + `mplus` + do w1 <- msum [word "NP1c", word "NP1f", word "NP1g" + ,word "NP1j", word "NP1m", word "NP1p" + ,word "NP1s", word "NP1t", word "NP1x" + ,word "NP1z", word "NP2c", word "NP2f" + ,word "NP2g", word "NP2j", word "NP2m" + ,word "NP2p", word "NP2s", word "NP2t" + ,word "NP2x", word "NP2z"] + return (cidSymbPN (cidMkSymb (Lit w1))) + +pRCl = + do rp <- pRP + (t,p,vp) <- pVP + opt (match convert "YC") + return (cidUseRCl t p (cidRelVP rp vp)) + `mplus` + do (prep,rp) <- inside "Pq" $ do + prep <- lemma "II" "Prep" "s" + rp <- pRP + return (prep,rp) + np <- pSubject + (t,p,vp) <- pVP + opt (match convert "YC") + return (cidUseRCl t p (cidRelSlash rp (cidSlashVP np (cidVPSlashPrep vp prep)))) + +pRP = + do inside "Dq" (match convert "DDQr") + return cidIdRP + +pTimeNPAdv = do + day <- lemma "NPD1" "Weekday" "s Sg Nom" + return (cidweekdayPunctualAdv day)