partial reconstruction for sentence structures in Susanne

This commit is contained in:
kr.angelov
2013-12-06 15:05:56 +00:00
parent 0d047707fc
commit 1026824060
3 changed files with 330 additions and 26 deletions

View File

@@ -1,37 +1,90 @@
module Parser where module Parser where
import Data.Char
import Control.Monad import Control.Monad
import PGF(PGF,Morpho,lookupMorpho,functionType,unType)
import SusanneFormat import SusanneFormat
newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)} newtype P a = P {runP :: PGF -> Morpho -> [ParseTree] -> Maybe ([ParseTree], a)}
instance Monad P where instance Monad P where
return x = P (\ts -> Just (ts, x)) return x = P (\pgf morpho ts -> Just (ts, x))
f >>= g = P (\ts -> case runP f ts of f >>= g = P (\pgf morpho ts -> case runP f pgf morpho ts of
Nothing -> Nothing Nothing -> Nothing
Just (ts,x) -> runP (g x) ts) Just (ts,x) -> runP (g x) pgf morpho ts)
instance MonadPlus P where instance MonadPlus P where
mzero = P (\ts -> Nothing) mzero = P (\pgf morpho ts -> Nothing)
mplus f g = P (\ts -> mplus (runP f ts) (runP g ts)) mplus f g = P (\pgf morpho ts -> mplus (runP f pgf morpho ts) (runP g pgf morpho ts))
match tag_spec = P (\ts -> match tag_spec = P (\pgf morpho ts ->
case ts of case ts of
(Phrase tag1 mods1 fn1 _ _:ts) (t@(Phrase tag1 mods1 fn1 _ _):ts)
| tag == tag1 && | tag == tag1 &&
all (flip elem mods1) mods && all (flip elem mods1) mods &&
(null fn || fn == fn1) -> Just (ts,()) (null fn || fn == fn1) -> Just (ts,t)
(Word _ tag1 _ _:ts) (t@(Word _ tag1 _ _):ts)
| tag == tag1 -> Just (ts,()) | tag == tag1 -> Just (ts,t)
_ -> Nothing) _ -> Nothing)
where where
(f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec (f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
Phrase tag mods fn _ _ = f [] Phrase tag mods fn _ _ = f []
many1 f = do
x <- f
xs <- many f
return (x:xs)
many f = many f =
do x <- f do x <- f
xs <- many f xs <- many f
return (x:xs) return (x:xs)
`mplus` `mplus`
do return [] do return []
inside tag_spec p = P (\pgf morpho ts ->
case ts of
(t@(Phrase tag1 mods1 fn1 _ ts'):ts)
| tag == tag1 &&
all (flip elem mods1) mods &&
(null fn || fn == fn1) -> case runP p pgf morpho ts' of
Just ([],x) -> Just (ts,x)
_ -> Nothing
_ -> Nothing)
where
(f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
Phrase tag mods fn _ _ = f []
insideOpt tag_spec p = P (\pgf morpho ts ->
case ts of
(t@(Phrase tag1 mods1 fn1 _ ts'):ts)
| tag == tag1 &&
all (flip elem mods1) mods &&
(null fn || fn == fn1) -> case runP p pgf morpho ts' of
Just ([],x) -> Just (ts,x)
_ -> Just (ts,t)
_ -> Nothing)
where
(f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
Phrase tag mods fn _ _ = f []
lemma tag cat an0 = P (\pgf morpho ts ->
case ts of
(t@(Word _ tag1 form _):ts) | tag == tag1 ->
case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
[f] -> Just (ts,App f [])
_ -> Just (ts,t)
_ -> Nothing)
where
hasCat pgf f cat =
case functionType pgf f of
Just ty -> case unType ty of
(_,cat1,_) -> cat1 == cat
Nothing -> False
opt f =
do x <- f
return (Just x)
`mplus`
do return Nothing

View File

@@ -15,6 +15,7 @@ data ParseTree
= Phrase Tag Mods Fn Index [ParseTree] = Phrase Tag Mods Fn Index [ParseTree]
| Word Id Tag Word Lemma | Word Id Tag Word Lemma
| App CId [ParseTree] | App CId [ParseTree]
deriving Eq
data ParseTreePos data ParseTreePos
= Root = Root

View File

@@ -2,24 +2,34 @@ import System.Directory
import System.FilePath import System.FilePath
import Data.List import Data.List
import Data.Char(toLower) import Data.Char(toLower)
import Control.Monad
import qualified Data.Map as Map
import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType) import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType)
import SusanneFormat import SusanneFormat
import Parser
import Idents
Just eng = readLanguage "ParseEng" Just eng = readLanguage "DictEng"
main = do main = do
gr <- readPGF "../../ParseEngAbs.pgf" gr <- readPGF "DictEngAbs.pgf"
let morpho = buildMorpho gr eng let morpho = buildMorpho gr eng
fs <- getDirectoryContents "data" fs <- getDirectoryContents "data"
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs) txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
--let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts))) --let ts' = readTreebank (lines (concat txts))
let ts = readTreebank (lines (concat txts)) --writeFile "text" (unlines (map show ts'))
writeFile "text" (unlines (map show ts)) let (ts',rs') = combineRes (convert gr morpho) (readTreebank (lines (concat txts)))
let rm = Map.fromListWith (++) rs'
writeFile "susanne.gft" (unlines (map show ts'))
writeFile "rules" (unlines (concat [unwords ("-":cat:"->":cats) : map (\t -> " "++show t) rs'' | (cat :-> cats,rs'') <- Map.toList rm]))
data Rule = Tag :-> [Tag]
deriving (Eq,Ord)
convert pgf morpho w@(Word _ tag _ lemma) convert pgf morpho w@(Word _ tag _ lemma)
| elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = [] | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = ([],[])
| tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w {- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w | tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w | tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w
| tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w | tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
@@ -36,19 +46,259 @@ convert pgf morpho w@(Word _ tag _ lemma)
| tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w | tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w
| tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w | tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w
| tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w | tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w
| tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w-}
| otherwise = [w] | otherwise = ([w],[])
convert pgf morpho (Phrase tag mods fn idx ts) convert pgf morpho t@(Phrase tag mods fn idx ts)
| tag == "O" = concatMap (convert pgf morpho) ts | tag == "O" = (ts',rs')
| otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)] | tag == "Q" = (ts',rs')
| tag == "S" = case runP pS pgf morpho ts' of
Just ([],x) -> ([x], rs')
_ -> ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
| otherwise = ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
where
(ts',rs') = combineRes (convert pgf morpho) ts
r = tag :-> map getTag ts
isExtra (Word _ "YIL" _ _) = True
isExtra (Word _ "YIR" _ _) = True
isExtra (Word _ "YTL" _ _) = True
isExtra (Word _ "YTR" _ _) = True
isExtra _ = False
getTag (Phrase tag mods fn idx ts) = tag++if null fn then "" else ":"++fn
getTag (Word _ tag _ _) = tag
convertLemma pgf morpho cat an0 w@(Word _ tag form _) = convertLemma pgf morpho cat an0 w@(Word _ tag form _) =
case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
[f] -> [App f []] [f] -> ([App f []], [])
_ -> [w] _ -> ([w],[])
where where
hasCat pgf f cat = hasCat pgf f cat =
case functionType pgf f of case functionType pgf f of
Just ty -> case unType ty of Just ty -> case unType ty of
(_,cat1,_) -> cat1 == cat (_,cat1,_) -> cat1 == cat
Nothing -> False Nothing -> False
combineRes f ts = (ts',rs')
where
(x,y) = unzip (map f ts)
ts' = concat x
rs' = concat y
pS =
do mplus pConj (return ())
advs <- many pAdS
np <- pSubject
(t,p,vp) <- pVP
return (foldr ($) (App cidUseCl [t,p,App cidPredVP [np, vp]]) advs)
`mplus`
do mplus pConj (return ())
(t,p,vp) <- pVP
return (App cidImpVP [vp])
`mplus`
do mplus pConj (return ())
advs <- many pAdS
t1 <- match "EX"
(t,p,vp) <- pVP
return (foldr ($) (App cidUseCl [t,p,App cidExistNP [t1,vp]]) advs)
pSubject =
do insideOpt "N:s" pNP
`mplus`
do insideOpt "N:S" pNP
`mplus`
do match "M:s"
`mplus`
do match "M:S"
`mplus`
do match "D:s"
`mplus`
do match "D:S"
pConj =
do match "CC"
return ()
`mplus`
do match "CCB"
return ()
pAdS =
do adv <- pAdv
match "YC"
return (\t -> App cidExtAdvS [adv,t])
`mplus`
do adv <- pAdv
return (\t -> App cidAdvS [adv,t])
pVP =
do adVs <- many pAdV
(t,p,vs) <- pV "VS"
advs <- many pAdv
s <- insideOpt "F:o"
(opt (match "CST") >> pS)
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplVS [vs, s])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,vv) <- pV "VV"
advs <- many pAdv
vp <- match "Ti"
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplVV [vv, vp])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,v2) <- pV "V2"
o <- pObject
opt (match "YC") -- what is this?
advs <- many pAdv
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplSlash [App cidSlashV2a [v2],o])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,v) <- pV "V"
advs <- many pAdv
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidUseV [v])
advs)
adVs)
pV cat =
do inside "V" $
do v <- lemma "VVDv" (mkCId cat) "s VPast"
return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
`mplus`
do v <- lemma "VVDt" (mkCId cat) "s VPast"
return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
`mplus`
do v <- lemma "VVZv" (mkCId cat) "s VPres"
return (App cidTTAnt [App cidTPres [],App cidASimul []],App cidPPos [],v)
`mplus`
do match "VHD"
match "VHD"
v <- lemma "VVNv" (mkCId cat) "s VPPart"
return (App cidTTAnt [App cidTPres [],App cidAAnter []],App cidPPos [],v)
`mplus`
do v <- match "V"
return (App (mkCId "XXX") [],App (mkCId "XXX") [],v)
pAdV =
do insideOpt "R:c" $
lemma "RRR" (mkCId "AdV") "s"
`mplus`
do match "R:m"
pObject =
match "P:u"
`mplus`
insideOpt "N:o" pNP
`mplus`
match "N:e"
`mplus`
match "M:e"
`mplus`
match "D:e"
`mplus`
match "P:e"
pAdv =
do match "N:t"
`mplus`
do match "N:h"
`mplus`
do match "P:p"
`mplus`
do match "P:q"
`mplus`
do match "P:a"
`mplus`
do match "P:t"
`mplus`
do match "P:h"
`mplus`
do match "P:m"
`mplus`
do match "P:r"
`mplus`
do match "R:p"
`mplus`
do match "R:q"
`mplus`
do match "R:a"
`mplus`
do match "R:t"
`mplus`
do match "R:h"
`mplus`
do match "R:m"
`mplus`
do match "R:c"
`mplus`
do match "R:r"
`mplus`
do match "F:p"
`mplus`
do match "F:q"
`mplus`
do match "F:a"
`mplus`
do match "F:t"
`mplus`
do match "F:h"
`mplus`
do match "F:m"
`mplus`
do match "F:r"
`mplus`
do match "W:b"
`mplus`
do match "L:b"
pNP = do
q <- pQuant
(n,cn) <- pCN
return (App cidDetCN [App cidDetQuant [q,n],cn])
pQuant =
do lemma "AT" (mkCId "Quant") "s False Sg"
`mplus`
do match "AT1"
return (App cidIndefArt [])
pCN =
do np <- insideOpt "N" pNP
(n,cn) <- pCN
return (n,App (mkCId "Appos") [np,cn])
`mplus`
do a <- lemma "JJ" (mkCId "A") "s (AAdj Posit Nom)"
(n,cn) <- pCN
return (n,App cidAdjCN [App cidPositA [a],cn])
`mplus`
do (num,n) <- pN
advs <- many pPo
return (num,
foldl (\t adv -> App cidAdvCN [t, adv])
(App cidUseN [n])
advs)
pN =
do n <- lemma "NN1c" (mkCId "N") "s Sg Nom"
return (App cidNumSg [], n)
`mplus`
do n <- lemma "NN1n" (mkCId "N") "s Sg Nom"
return (App cidNumSg [], n)
pPo =
insideOpt "Po" $ do
p <- match "IO"
np <- insideOpt "N" pNP
return (App cidPrepNP [p,np])