added the GF version of Talbanken which was imported by Malin

This commit is contained in:
kr.angelov
2012-10-01 08:33:43 +00:00
parent 8b1cec5610
commit 475109a40f
7 changed files with 8446 additions and 0 deletions

View File

@@ -0,0 +1,106 @@
{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-}
module Format where
import Prelude hiding (words,id)
import Text.XML.HXT.Core
import Data.Ord
import Data.List hiding (words)
import Data.Maybe
import qualified Data.Map as M
import System.Environment
import Debug.Trace as Debug
import Data.Char
import qualified Data.Tree as T
import Control.Monad.State
-- Functions for parsing XML to a format read by the other Haskell files
data Word = W {id :: Id, word :: String, pos :: Tag}
data PhrTag = Ph {idPh :: Id, cat :: Tag, tags :: [(Tag,Id)]}
data Sentence = Sent {rootS :: Id, words :: [Word], info :: [PhrTag], ws :: Int}
type Tag = String
type Id = String
instance XmlPickler [Word] where
xpickle = xpWords
instance XmlPickler Sentence where
xpickle = xpSentence
xpSentences :: PU [Sentence]
xpSentences = xpElem "corpus"
$ xpWrap (snd, \a -> ((),a))
$ xpPair (xpElem "head" $ xpUnit) (xpElem "body" $ xpList $ xpSentence)
xpTags :: PU [PhrTag]
xpTags = xpList $ xpElem "nt"
$ xpWrap (uncurry3 Ph,\p -> (idPh p,cat p,tags p))
$ xpTriple (xpAttr "id" xpText) (xpAttr "cat" xpText)
(xpList $ xpTagMap)
xpTagMap :: PU (Tag,String)
xpTagMap = xpElem "edge"
$ xpPair (xpAttr "label" xpText)
(xpAttr "idref" xpText)
xpSentence :: PU Sentence
xpSentence = xpElem "s"
$ xpWrap (makeSentence,\s -> (rootS s,words s, info s))
$ xpElem "graph"
$ xpTriple (xpAttr "root" xpText)
( xpElem "terminals" xpWords)
( xpElem "nonterminals" xpTags)
where makeSentence (r,ws,tgs) = Sent r ws tgs (length ws)
xpWords :: PU [Word]
xpWords = xpList $ xpElem "t"
$ xpWrap (uncurry3 W,\t -> (id t, word t,pos t))
$ xpTriple (xpAttr "id" xpText)
(xpAttr "word" xpText)
(xpAttr "pos" xpText)
mainF src =
runX (xunpickleDocument xpSentences [withInputEncoding utf8
, withRemoveWS yes] src
>>> arrIO (putStrLn . unlines . map (show . toTree)))
runPickle f src =
runX (xunpickleDocument xpSentences [withInputEncoding utf8
, withRemoveWS yes] src
>>> arrIO (return . map f))
parse = runPickle toStringTree
parseIdTree = runPickle toTree
toStringTree :: Sentence -> (String,T.Tree String)
toStringTree = second (fmap snd) . toTree
toTree :: Sentence -> (String,T.Tree (Id,String))
toTree s@(Sent root ws inf _) = (root,toTree' root s)
toTree' :: String -> Sentence -> T.Tree (Id,String)
toTree' nr s@(Sent root ws inf _) =
case (lookup' nr ws,lookup'' nr inf) of
(Just w,_) -> putWord w
(_,Just p) -> putPhrase p
_ -> error $ "Error in toTree' "++show nr++" could not be found"
where putWord (W i w p) = T.Node (i,p) [T.Node (i,w) []]
putPhrase (Ph i c t) = T.Node (i,c)
$ map (\(tag,next) -> T.Node (next,tag) [toTree' next s]) t
lookup' y (w@(W x _ _):xs) | y ==x = Just w
| otherwise = lookup' y xs
lookup' y [] = Nothing
lookup'' y (w@(Ph x _ _):xs) | y ==x = Just w
| otherwise = lookup'' y xs
lookup'' y [] = Nothing
treeToSentence :: [T.Tree String] -> String
treeToSentence ts = unwords $ map extractS ts
where extractS (T.Node ws []) = ws
extractS (T.Node c ts) = unwords $ map extractS ts
showa :: T.Tree String -> String
showa (T.Node root ts) = "("++root++" "++concatMap showa ts++" )"

View File

@@ -0,0 +1,182 @@
module Idents where
import PGF
cidASimul = mkCId "ASimul"
cidAAnter = mkCId "AAnter"
cidPositAdvAdj = mkCId "PositAdvAdj"
cidPositAdVAdj = mkCId "PositAdVAdj"
cidUseCl = mkCId "UseCl"
cidUseQCl = mkCId "UseQCl"
cidPredVP = mkCId "PredVP"
cidAdjCN = mkCId "AdjCN"
cidUseN = mkCId "UseN"
cidDetQuant = mkCId "DetQuant"
cidNumSg = mkCId "NumSg"
cidNumPl = mkCId "NumPl"
cidDetCN = mkCId "DetCN"
cidIndefArt = mkCId "IndefArt"
cidDefArt = mkCId "DefArt"
cidUsePN = mkCId "UsePN"
cidUseQuantPN = mkCId "UseQuantPN"
cidSymbPN = mkCId "SymbPN"
cidMkSymb = mkCId "MkSymb"
cidUsePron = mkCId "UsePron"
cidConjNP = mkCId "ConjNP"
cidBaseNP = mkCId "BaseNP"
cidConsNP = mkCId "ConsNP"
cidConjCN = mkCId "ConjCN"
cidBaseCN = mkCId "BaseCN"
cidConsCN = mkCId "ConsCN"
cidMassNP = mkCId "MassNP"
cidAdvNP = mkCId "AdvNP"
cidTPres = mkCId "TPres"
cidTPast = mkCId "TPast"
cidTFut = mkCId "TFut"
cidTFutKommer = mkCId "TFutKommer"
cidTCond = mkCId "TCond"
cidTTAnt = mkCId "TTAnt"
cidPPos = mkCId "PPos"
cidPNeg = mkCId "PNeg"
cidComplSlash = mkCId "ComplSlash"
cidSlashV2a = mkCId "SlashV2a"
cidSlashV2A = mkCId "SlashV2A"
cidComplVS = mkCId "ComplVS"
cidUseV = mkCId "UseV"
cidAdVVP = mkCId "AdVVP"
cidAdvVP = mkCId "AdvVP"
cidAdvVPSlash = mkCId "AdvVPSlash"
cidPrepNP = mkCId "PrepNP"
cidto_Prep = mkCId "to_Prep"
cidsuch_as_Prep= mkCId "such_as_Prep"
cidPastPartAP = mkCId "PastPartAP"
cidPassV2 = mkCId "PassV2"
cidAdvS = mkCId "AdvS"
cidPositA = mkCId "PositA"
cidIDig = mkCId "IDig"
cidIIDig = mkCId "IIDig"
cidNumCard = mkCId "NumCard"
cidNumDigits = mkCId "NumDigits"
cidNumNumeral = mkCId "NumNumeral"
cidnum = mkCId "num"
cidpot2as3 = mkCId "pot2as3"
cidpot1as2 = mkCId "pot1as2"
cidpot0as1 = mkCId "pot0as1"
cidpot01 = mkCId "pot01"
cidpot0 = mkCId "pot0"
cidn7 = mkCId "n7"
cidPossPron = mkCId "PossPron"
cidCompAP = mkCId "CompAP"
cidCompNP = mkCId "CompNP"
cidCompAdv = mkCId "CompAdv"
cidUseComp = mkCId "UseComp"
cidCompoundCN = mkCId "CompoundCN"
cidDashCN = mkCId "DashCN"
cidProgrVP = mkCId "ProgrVP"
cidGerundN = mkCId "GerundN"
cidGenNP = mkCId "GenNP"
cidPredetNP = mkCId "PredetNP"
cidDetNP = mkCId "DetNP"
cidAdAP = mkCId "AdAP"
cidPositAdAAdj = mkCId "PositAdAAdj"
cidBaseAP = mkCId "BaseAP"
cidConjAP = mkCId "ConjAP"
cidAndConj = mkCId "and_Conj"
cidOrConj = mkCId "or_Conj"
cidConsAP = mkCId "ConsAP"
cidQuestVP = mkCId "QuestVP"
cidComplVV = mkCId "ComplVV"
cidComplVA = mkCId "ComplVA"
cidUseCopula = mkCId "UseCopula"
cidPhrUtt = mkCId "PhrUtt"
cidNoPConj = mkCId "NoPConj"
cidNoVoc = mkCId "NoVoc"
cidUttS = mkCId "UttS"
cidUttQS = mkCId "UttQS"
cidUseComparA = mkCId "UseComparA"
cidOrdSuperl = mkCId "OrdSuperl"
cidUttImpPol = mkCId "UttImpPol"
cidImpVP = mkCId "ImpVP"
cidPConjConj = mkCId "PConjConj"
cidUttNP = mkCId "UttNP"
cidGenericCl = mkCId "GenericCl"
cidAdAdv = mkCId "AdAdv"
cidConsAdv = mkCId "ConsAdv"
cidBaseAdv = mkCId "BaseAdv"
cidConjAdv = mkCId "ConjAdv"
cidConsVPS = mkCId "ConsVPS"
cidBaseVPS = mkCId "BaseVPS"
cidConjVPS = mkCId "ConjVPS"
cidConsS = mkCId "ConsS"
cidBaseS = mkCId "BaseS"
cidConjS = mkCId "ConjS"
cidSubjS = mkCId "SubjS"
cidUttAdv = mkCId "UttAdv"
cidApposCN = mkCId "ApposCN"
cidUseRCl = mkCId "UseRCl"
cidImpersCl = mkCId "ImpersCl"
cidReflVP = mkCId "ReflVP"
cidExistNP = mkCId "ExistNP"
cidUseA2 = mkCId "UseA2"
cidComplN2 = mkCId "ComplN2"
cidAdvIAdv = mkCId "AdvIAdv"
cidQuestIAdv = mkCId "QuestIAdv"
cidQuestIComp = mkCId "QuestIComp"
cidQuestSlash = mkCId "QuestSlash"
cidCompIAdv = mkCId "CompIAdv"
cidCompIP = mkCId "CompIP"
cidIdetCN = mkCId "IdetCN"
cidIdetQuant = mkCId "IdetQuant"
cidPrepIP = mkCId "PrepIP"
cidFocObj = mkCId "FocObj"
cidSlashVP = mkCId "SlashVP"
cidAdvSlash = mkCId "AdvSlash"
cidSSubjS = mkCId "SSubjS"
cidRelVP = mkCId "RelVP"
cidIdRP = mkCId "IdRP"
cidRelSlash = mkCId "RelSlash"
cidTopAdv = mkCId "TopAdv"
cidTopAP = mkCId "TopAP"
cidTopObj = mkCId "TopObj"
cidUseTop = mkCId "UseTop"
-- added to Extra
cidDropAttVV = mkCId "ComplBareVV"
cidRelCN = mkCId "RelCN"
cidPassV2' = mkCId "PassV2"
cidVPSlashAP = mkCId "PPartAP"
cidReflCN = mkCId "ReflIdPron"
cidReflIdPron = mkCId "ReflIdPron"
cidPredetAdvF = mkCId "PredetAdvF"
cidCompPronAQ = mkCId "CompPronAQ"
cidQuantPronAQ = mkCId "QuantPronAQ"
--not present in grammar
cidCNNumNP = mkCId "CNNumNP"
-- words
cidhow8much_IAdv = mkCId "how8much_IAdv"
cidBy8agent_Prep = mkCId "by8agent_Prep"
cidD_1 = mkCId "D_1"
cidName = mkCId "john_PN"
cidCan_VV = mkCId "can_VV"
cidMust_VV = mkCId "must_VV"
cidWant_VV = mkCId "want_VV"
cidHave_V2 = mkCId "have_V2"
cidGet_V2 = mkCId "faa_vb_1_1_V2" -- in lexicon, not grammar
cidGet_VV = mkCId "faa_vb_1_1_VV" -- in lexicon, not grammar
cidDo_V2 = mkCId "do_V2"
cidDo_VV = mkCId "do_VV"
cidBecome_V2 = mkCId "become_V2"
cidBecome_VA = mkCId "become_VA"
-- for old translation
cidReflSlash = mkCId "ReflSlash"
cidSlash3V3 = mkCId "Slash3V3"
cidSlash2V3 = mkCId "Slash2V3"
cidSlashV2V = mkCId "Slash2V2"
cidComplVQ = mkCId "ComplVQ"
cidRelNP' = mkCId "RelNP'"

View File

@@ -0,0 +1,193 @@
{-# LANGUAGE FlexibleInstances,
MultiParamTypeClasses,
ScopedTypeVariables,
FlexibleContexts,
UndecidableInstances #-}
module MonadSP ( Rule(..), Grammar, grammar
, P, parse
, cat, word, word2, lemma, inside, insideSuff, transform
, many, many1, opt
, optEat, consume, wordlookup,write
) where
import Data.Tree
import Data.Char
import Data.List
import qualified Data.Map as Map
import Control.Monad
import Control.Monad.State
import Control.Monad.Writer
import PGF hiding (Tree,parse)
infix 1 :->
data Rule m t e = t :-> P t e m e
type Grammar m t e = t -> PGF -> Morpho -> [Tree t] -> m e
instance Show t => Show (Rule m t e) where
show (t :-> x) = show t
grammar :: (MonadWriter [String] m,MonadState s m,Ord t,Show t,Show e)
=> ([e] -> e) -> [Rule m t e] -> Grammar m t e
grammar def rules = gr
where
gr = \tag -> do
let retry = \pgf m ts -> case ts of
[Node w []] -> return (def [])
trs -> def `liftM` sequence [ gr tag pgf m trs'
| Node tag trs' <- trs]
case Map.lookup tag pmap of
Just f -> \pgf m ts -> do
stored <- get
r <- unP f gr pgf m ts
case r of
Just (e,[]) -> return e
Just (e,xs) -> tell ["Rest parse"] >> retry pgf m ts -- use xs here?
Nothing -> put stored >> retry pgf m ts
Nothing -> retry
-- If many rules match, try all of them (mplus)
pmap = Map.fromListWith mplus (map (\(t :-> r) -> (t,r)) rules)
newtype P t e m a = P {unP :: Grammar m t e -> PGF -> Morpho -> [Tree t] -> m (Maybe (a,[Tree t]))}
instance Monad m => Monad (P t e m) where
return x = P $ \gr pgf m ts -> return (Just (x,ts))
f >>= g = P $ \gr pgf m ts -> unP f gr pgf m ts >>= \r -> case r of
Just (x,ts') -> unP (g x) gr pgf m ts'
Nothing -> return Nothing
instance MonadState s m => MonadPlus (P t e m) where
mzero = P $ \gr pgf m ts -> return Nothing
mplus f g = P $ \gr pgf m ts -> do
store <- get
res <- unP f gr pgf m ts
case res of
Just x -> return (Just x)
Nothing -> put store >> unP g gr pgf m ts
instance MonadState s m => MonadState s (P t e m) where
put s = P $ \gr pgf m ts -> put s >> return (Just ((),ts))
get = P $ \gr pgf m ts -> get >>= \s -> return (Just (s,ts))
instance MonadWriter w m => MonadWriter w (P t e m) where
tell w = P $ \gr pgf m ts -> tell w >> return (Just ((),ts))
listen = error "listen not implemented for P"
pass = error "pass not implemented for P"
-- write x = tell [x]
write :: MonadWriter [w] m => w -> P t e m ()
write = tell . return
instance MonadTrans (P t e) where
lift m = P $ \gr p morpho ts -> m >>= \r -> return (Just (r,ts))
parse :: Monad m => Grammar m t e -> PGF -> Morpho -> Tree t -> m e
parse gr pgf morpho (Node tag ts) = gr tag pgf morpho ts
silent m = (m,[])
speak s (m,w) = (m,s:w)
speaks s (m,w) = (m,s++w)
addS s m = (m,s)
add s m = (m,[s])
cat :: (Monad m,Eq t,Show t) => [t] -> P [t] e m e
cat tag = P $ \gr pgf morpho ts ->
case ts of
Node tag1 ts1 : ts | tag `isPrefixOf` tag1
-> gr tag1 pgf morpho ts1 >>= \r -> return (Just (r,ts))
_ -> return Nothing
word :: (Monad m,Show t,Eq t) => [t] -> P [t] e m [t]
word tag = P $ \gr pgf morpho ts -> return $
case ts of
(Node tag1 [Node w []] : ts) | tag `isPrefixOf` tag1
-> Just (w,ts)
_ -> Nothing
word2 :: (Monad m,Eq t) => t -> P t e m t
word2 tag = P $ \gr pgf morpho ts -> return $
case ts of
(Node tag1 [Node tag2 [Node w []]] : ts) | tag == tag1 -> Just (w,ts)
_ -> Nothing
inside, insideSuff :: (MonadWriter [String] m,Eq t,Show t)=> [t] -> P [t] e m a -> P [t] e m a
insideSuff = inside' isSuffixOf
inside = inside' isPrefixOf
inside' :: (MonadWriter [String] m,Eq t,Show t)=>
([t] -> [t] -> Bool) -> [t] -> P [t] e m a -> P [t] e m a
inside' isEq tag f = P $ \gr pgf morpho ts ->
case ts of
Node tag1 ts1 : ts | tag `isEq` tag1 -> do
tell [show tag++" "++show tag1]
unP f gr pgf morpho ts1 >>= \r -> case r of
Just (x,[]) -> return (Just (x,ts))
Just (x,xs) -> tell ["inside fail "++show xs] >> return Nothing
Nothing -> return Nothing
_ -> return Nothing
magicLookup :: String -> String -> String -> Morpho -> PGF -> [Lemma]
magicLookup w cat0 an0 morpho pgf = [ lem
| (lem, an1) <- lookupMorpho morpho (map toLower w)
, let cat1 = maybe "" (showType []) (functionType pgf lem)
, cat0 == cat1 && an0 == an1
]
wordlookup :: MonadWriter [String] m => String -> String -> String -> P String e m CId
wordlookup w cat0 an0 = P $ \gr pgf morpho ts -> do
tell ["wordlookup: " ++ w ++ show ts ++ show cat0]
let wds = magicLookup w cat0 an0 morpho pgf
tell [show wds]
case wds of
(wd:_) -> return $ Just (wd,ts)
[] -> return Nothing
lemma :: MonadWriter [String] m => String -> String -> P String e m CId
lemma cat = liftM head . lemmas cat
lemmas :: MonadWriter [String] m => String -> String -> P String e m [CId]
lemmas cat0 an0 = P $ \gr pgf morpho ts -> do
tell ["lemma: "++show ts++show cat0]
case ts of
Node w [] : ts -> case magicLookup w cat0 an0 morpho pgf of
(id:ids) -> tell ["lemma ok"] >> return (Just (id:ids,ts))
_ -> tell ["no word "++w++cat0++an0] >> return Nothing
_ -> tell ["tried to lemma a tag"] >> return Nothing
transform :: Monad m => ([Tree t] -> [Tree t]) -> P t e m ()
transform f = P $ \gr pgf morpho ts -> return (Just ((),f ts))
many :: MonadState s m => P t e m a -> P t e m [a]
many f = many1 f
`mplus`
return []
many1 :: MonadState s m => P t e m a -> P t e m [a]
many1 f = do x <- f
xs <- many f
return (x:xs)
opt :: MonadState s m => P t e m a -> a -> P t e m a
opt f x = mplus f (return x)
optEat :: MonadState s m => P t e m a -> a -> P t e m a
optEat f x = mplus f (consume >> return x)
consume :: Monad m => P t e m ()
consume = P $ \gr pgf morpho ts ->
case ts of
Node x w:ws -> return (Just ((),ws))
[] -> return (Just ((),[]))

View File

@@ -0,0 +1,40 @@
{-# LANGUAGE TemplateHaskell #-}
module State where
import Structure
import PGF
import Data.Label
data State = State {
_isExist :: Bool
, _iquant :: Bool
, _passive :: Bool
, _sentenceType :: SentenceType
, _vform :: [VPForm]
, _complement :: (VPForm,[Maybe Expr],[Bool])
, _object :: Maybe Expr -- for objects not within the VP 'vilka äpplen äter han'
, _tmp :: Maybe (VForm CId)
, _anter :: Bool
, _pol :: Maybe Bool
, _subj :: Maybe Expr
, _nptype :: NPType
}
$(mkLabels [''State])
startState :: State
startState = State {
_isExist = False
,_passive = False
,_iquant = False
,_vform = []
,_complement = (V,[],[])
,_sentenceType = Dir
,_object = Nothing
,_tmp = Nothing
,_anter = False
,_pol = Nothing
,_subj = Nothing
,_nptype = Normal}

View File

@@ -0,0 +1,27 @@
module Structure where
data SentenceType = Q | Dir | Top
deriving (Show,Eq)
data NPType = Generic | Impers | Normal | Exist
deriving (Show,Eq)
data VPForm = Cop | Sup | VV | VA
| V | V2 | V2A | V2Pass
| Fut | FutKommer
| VS
deriving (Eq,Show)
data VForm a
= VInf | VPart | VSupin | VImp | VTense a
deriving (Show,Eq)
instance Functor VForm where
fmap f VInf = VInf
fmap f VPart = VPart
fmap f VSupin = VSupin
fmap f VImp = VImp
fmap f (VTense t) = VTense (f t)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff