1
0
forked from GitHub/gf-core
Files
gf-core/treebanks/susanne/convert.hs

305 lines
8.4 KiB
Haskell

import System.Directory
import System.FilePath
import Data.List
import Data.Char(toLower)
import Control.Monad
import qualified Data.Map as Map
import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType)
import SusanneFormat
import Parser
import Idents
Just eng = readLanguage "DictEng"
main = do
gr <- readPGF "DictEngAbs.pgf"
let morpho = buildMorpho gr eng
fs <- getDirectoryContents "data"
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
--let ts' = readTreebank (lines (concat txts))
--writeFile "text" (unlines (map show ts'))
let (ts',rs') = combineRes (convert gr morpho) (readTreebank (lines (concat txts)))
let rm = Map.fromListWith (++) rs'
writeFile "susanne.gft" (unlines (map show ts'))
writeFile "rules" (unlines (concat [unwords ("-":cat:"->":cats) : map (\t -> " "++show t) rs'' | (cat :-> cats,rs'') <- Map.toList rm]))
data Rule = Tag :-> [Tag]
deriving (Eq,Ord)
convert pgf morpho w@(Word _ tag _ lemma)
| elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = ([],[])
{- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w
| tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
| tag == "JB" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
| tag == "JBo" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
| tag == "AT" = convertLemma pgf morpho (mkCId "Quant") "s False Sg" w
| tag == "VVDi" = convertLemma pgf morpho (mkCId "V") "s VPast" w
| tag == "VVDt" = convertLemma pgf morpho (mkCId "V2") "s VPast" w
| tag == "VVDv" = convertLemma pgf morpho (mkCId "V") "s VPast" w
| tag == "VVZi" = convertLemma pgf morpho (mkCId "V") "s VPres" w
| tag == "VVZt" = convertLemma pgf morpho (mkCId "V2") "s VPres" w
| tag == "VVZv" = convertLemma pgf morpho (mkCId "V") "s VPres" w
| tag == "PPHS2"= convertLemma pgf morpho (mkCId "Pron") "s (NCase Nom)" w
| tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w
| tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w
| tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w
| tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w-}
| otherwise = ([w],[])
convert pgf morpho t@(Phrase tag mods fn idx ts)
| tag == "O" = (ts',rs')
| tag == "Q" = (ts',rs')
| tag == "S" = case runP pS pgf morpho ts' of
Just ([],x) -> ([x], rs')
_ -> ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
| otherwise = ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
where
(ts',rs') = combineRes (convert pgf morpho) ts
r = tag :-> map getTag ts
isExtra (Word _ "YIL" _ _) = True
isExtra (Word _ "YIR" _ _) = True
isExtra (Word _ "YTL" _ _) = True
isExtra (Word _ "YTR" _ _) = True
isExtra _ = False
getTag (Phrase tag mods fn idx ts) = tag++if null fn then "" else ":"++fn
getTag (Word _ tag _ _) = tag
convertLemma pgf morpho cat an0 w@(Word _ tag form _) =
case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
[f] -> ([App f []], [])
_ -> ([w],[])
where
hasCat pgf f cat =
case functionType pgf f of
Just ty -> case unType ty of
(_,cat1,_) -> cat1 == cat
Nothing -> False
combineRes f ts = (ts',rs')
where
(x,y) = unzip (map f ts)
ts' = concat x
rs' = concat y
pS =
do mplus pConj (return ())
advs <- many pAdS
np <- pSubject
(t,p,vp) <- pVP
return (foldr ($) (App cidUseCl [t,p,App cidPredVP [np, vp]]) advs)
`mplus`
do mplus pConj (return ())
(t,p,vp) <- pVP
return (App cidImpVP [vp])
`mplus`
do mplus pConj (return ())
advs <- many pAdS
t1 <- match "EX"
(t,p,vp) <- pVP
return (foldr ($) (App cidUseCl [t,p,App cidExistNP [t1,vp]]) advs)
pSubject =
do insideOpt "N:s" pNP
`mplus`
do insideOpt "N:S" pNP
`mplus`
do match "M:s"
`mplus`
do match "M:S"
`mplus`
do match "D:s"
`mplus`
do match "D:S"
pConj =
do match "CC"
return ()
`mplus`
do match "CCB"
return ()
pAdS =
do adv <- pAdv
match "YC"
return (\t -> App cidExtAdvS [adv,t])
`mplus`
do adv <- pAdv
return (\t -> App cidAdvS [adv,t])
pVP =
do adVs <- many pAdV
(t,p,vs) <- pV "VS"
advs <- many pAdv
s <- insideOpt "F:o"
(opt (match "CST") >> pS)
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplVS [vs, s])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,vv) <- pV "VV"
advs <- many pAdv
vp <- match "Ti"
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplVV [vv, vp])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,v2) <- pV "V2"
o <- pObject
opt (match "YC") -- what is this?
advs <- many pAdv
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidComplSlash [App cidSlashV2a [v2],o])
advs)
adVs)
`mplus`
do adVs <- many pAdV
(t,p,v) <- pV "V"
advs <- many pAdv
return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
(foldl (\t adv -> App cidAdvVP [t, adv])
(App cidUseV [v])
advs)
adVs)
pV cat =
do inside "V" $
do v <- lemma "VVDv" (mkCId cat) "s VPast"
return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
`mplus`
do v <- lemma "VVDt" (mkCId cat) "s VPast"
return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
`mplus`
do v <- lemma "VVZv" (mkCId cat) "s VPres"
return (App cidTTAnt [App cidTPres [],App cidASimul []],App cidPPos [],v)
`mplus`
do match "VHD"
match "VHD"
v <- lemma "VVNv" (mkCId cat) "s VPPart"
return (App cidTTAnt [App cidTPres [],App cidAAnter []],App cidPPos [],v)
`mplus`
do v <- match "V"
return (App (mkCId "XXX") [],App (mkCId "XXX") [],v)
pAdV =
do insideOpt "R:c" $
lemma "RRR" (mkCId "AdV") "s"
`mplus`
do match "R:m"
pObject =
match "P:u"
`mplus`
insideOpt "N:o" pNP
`mplus`
match "N:e"
`mplus`
match "M:e"
`mplus`
match "D:e"
`mplus`
match "P:e"
pAdv =
do match "N:t"
`mplus`
do match "N:h"
`mplus`
do match "P:p"
`mplus`
do match "P:q"
`mplus`
do match "P:a"
`mplus`
do match "P:t"
`mplus`
do match "P:h"
`mplus`
do match "P:m"
`mplus`
do match "P:r"
`mplus`
do match "R:p"
`mplus`
do match "R:q"
`mplus`
do match "R:a"
`mplus`
do match "R:t"
`mplus`
do match "R:h"
`mplus`
do match "R:m"
`mplus`
do match "R:c"
`mplus`
do match "R:r"
`mplus`
do match "F:p"
`mplus`
do match "F:q"
`mplus`
do match "F:a"
`mplus`
do match "F:t"
`mplus`
do match "F:h"
`mplus`
do match "F:m"
`mplus`
do match "F:r"
`mplus`
do match "W:b"
`mplus`
do match "L:b"
pNP = do
q <- pQuant
(n,cn) <- pCN
return (App cidDetCN [App cidDetQuant [q,n],cn])
pQuant =
do lemma "AT" (mkCId "Quant") "s False Sg"
`mplus`
do match "AT1"
return (App cidIndefArt [])
pCN =
do np <- insideOpt "N" pNP
(n,cn) <- pCN
return (n,App (mkCId "Appos") [np,cn])
`mplus`
do a <- lemma "JJ" (mkCId "A") "s (AAdj Posit Nom)"
(n,cn) <- pCN
return (n,App cidAdjCN [App cidPositA [a],cn])
`mplus`
do (num,n) <- pN
advs <- many pPo
return (num,
foldl (\t adv -> App cidAdvCN [t, adv])
(App cidUseN [n])
advs)
pN =
do n <- lemma "NN1c" (mkCId "N") "s Sg Nom"
return (App cidNumSg [], n)
`mplus`
do n <- lemma "NN1n" (mkCId "N") "s Sg Nom"
return (App cidNumSg [], n)
pPo =
insideOpt "Po" $ do
p <- match "IO"
np <- insideOpt "N" pNP
return (App cidPrepNP [p,np])