mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
more on the Susanne treebank
This commit is contained in:
37
treebanks/susanne/Parser.hs
Normal file
37
treebanks/susanne/Parser.hs
Normal file
@@ -0,0 +1,37 @@
|
||||
module Parser where
|
||||
|
||||
import Control.Monad
|
||||
|
||||
import SusanneFormat
|
||||
|
||||
newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)}
|
||||
|
||||
instance Monad P where
|
||||
return x = P (\ts -> Just (ts, x))
|
||||
f >>= g = P (\ts -> case runP f ts of
|
||||
Nothing -> Nothing
|
||||
Just (ts,x) -> runP (g x) ts)
|
||||
|
||||
instance MonadPlus P where
|
||||
mzero = P (\ts -> Nothing)
|
||||
mplus f g = P (\ts -> mplus (runP f ts) (runP g ts))
|
||||
|
||||
match tag_spec = P (\ts ->
|
||||
case ts of
|
||||
(Phrase tag1 mods1 fn1 _ _:ts)
|
||||
| tag == tag1 &&
|
||||
all (flip elem mods1) mods &&
|
||||
(null fn || fn == fn1) -> Just (ts,())
|
||||
(Word _ tag1 _ _:ts)
|
||||
| tag == tag1 -> Just (ts,())
|
||||
_ -> Nothing)
|
||||
where
|
||||
(f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
|
||||
Phrase tag mods fn _ _ = f []
|
||||
|
||||
many f =
|
||||
do x <- f
|
||||
xs <- many f
|
||||
return (x:xs)
|
||||
`mplus`
|
||||
do return []
|
||||
@@ -1,43 +1,82 @@
|
||||
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where
|
||||
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
|
||||
|
||||
import PGF(CId)
|
||||
import Data.Char
|
||||
|
||||
type Tag = String
|
||||
type Mods = String
|
||||
type Fn = String
|
||||
type Index = Int
|
||||
type Id = String
|
||||
type Word = String
|
||||
type Lemma = String
|
||||
|
||||
data ParseTree
|
||||
= Phrase Tag [ParseTree]
|
||||
= Phrase Tag Mods Fn Index [ParseTree]
|
||||
| Word Id Tag Word Lemma
|
||||
| App CId [ParseTree]
|
||||
|
||||
data ParseTreePos
|
||||
= Root
|
||||
| At ParseTreePos Tag [ParseTree]
|
||||
| At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]
|
||||
|
||||
instance Show ParseTree where
|
||||
show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]"
|
||||
show (Word _ tag w _) = "["++tag++" "++w++"]"
|
||||
show (Phrase tag mods fn idx ts)
|
||||
| tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]"
|
||||
| fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
|
||||
| otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
|
||||
show (Word _ tag w _) = "["++tag++" "++w++"]"
|
||||
show (App f ts)
|
||||
| null ts = show f
|
||||
| otherwise = "("++show f++" "++unwords (map show ts)++")"
|
||||
|
||||
readTreebank ls = readLines Root (map words ls)
|
||||
|
||||
readLines p [] = []
|
||||
readLines p ([id,_,tag,w,l,parse]:ls) =
|
||||
readParse p (Word id tag w l) parse ls
|
||||
readParse (Word id tag w l) p parse ls
|
||||
|
||||
readParse p w [] ls = readLines p ls
|
||||
readParse p w ('[':cs) ls =
|
||||
case break (not . isTagChar) cs of
|
||||
(tag,cs) -> readParse (At p tag []) w cs ls
|
||||
readParse (At p tag ts) w ('.':cs) ls =
|
||||
readParse (At p tag (w:ts)) w cs ls
|
||||
readParse (At p tag ts) w cs ls =
|
||||
case break (not . isTagChar) cs of
|
||||
(tag,']':cs) -> let t = Phrase tag (reverse ts)
|
||||
in case p of
|
||||
Root -> t : readLines p ls
|
||||
At p tag ts -> readParse (At p tag (t:ts)) w cs ls
|
||||
_ -> error cs
|
||||
readParse w p [] ls = readLines p ls
|
||||
readParse w p ('[':cs) ls =
|
||||
case readTag w cs of
|
||||
(fn,cs) -> readParse w (At p fn []) cs ls
|
||||
readParse w (At p fn ts) ('.':cs) ls =
|
||||
readParse w (At p fn (w:ts)) cs ls
|
||||
readParse w (At p fn ts) cs ls =
|
||||
case readTag w cs of
|
||||
(_,']':cs) -> let t = fn (reverse ts)
|
||||
in case p of
|
||||
Root -> t : readLines p ls
|
||||
At p fn ts -> readParse w (At p fn (t:ts)) cs ls
|
||||
_ -> readError w
|
||||
|
||||
isTagChar c =
|
||||
isLetter c || isDigit c || elem c ":&+-%@=?\"*!"
|
||||
readTag w cs@(c1:c2:_) -- word tag on phrase level
|
||||
| isUpper c1 && isUpper c2 =
|
||||
case break (\c -> not (isLetter c || isDigit c)) cs of
|
||||
(tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
|
||||
(mods,cs) -> case cs of
|
||||
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
|
||||
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
||||
| isDigit c -> case break (not . isDigit) (c:cs) of
|
||||
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
|
||||
_ -> (Phrase tag mods "" 0,cs)
|
||||
readTag w (c:cs) -- phrase tag
|
||||
| isUpper c = let tag = [c]
|
||||
in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
|
||||
(mods,cs) -> case cs of
|
||||
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
|
||||
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
||||
| isDigit c -> case break (not . isDigit) (c:cs) of
|
||||
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
|
||||
_ -> (Phrase tag mods "" 0,cs)
|
||||
| isLower c = let tag = []
|
||||
mods = []
|
||||
in case break (not . isDigit) cs of
|
||||
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
|
||||
| isDigit c = let tag = []
|
||||
mods = []
|
||||
in case break (not . isDigit) cs of
|
||||
(ds,cs) -> (Phrase tag mods [] (read ds),cs)
|
||||
readTag w cs = readError w
|
||||
|
||||
readError (Word id _ _ _) = error id
|
||||
|
||||
@@ -1,14 +1,54 @@
|
||||
import System.Directory
|
||||
import System.FilePath
|
||||
import Data.List
|
||||
import Data.Char(toLower)
|
||||
|
||||
import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType)
|
||||
import SusanneFormat
|
||||
|
||||
Just eng = readLanguage "ParseEng"
|
||||
|
||||
main = do
|
||||
gr <- readPGF "../../ParseEngAbs.pgf"
|
||||
let morpho = buildMorpho gr eng
|
||||
fs <- getDirectoryContents "data"
|
||||
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
|
||||
let ts = filter (not . isBreak) (readTreebank (lines (concat txts)))
|
||||
--let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts)))
|
||||
let ts = readTreebank (lines (concat txts))
|
||||
writeFile "text" (unlines (map show ts))
|
||||
|
||||
isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True
|
||||
isBreak _ = False
|
||||
convert pgf morpho w@(Word _ tag _ lemma)
|
||||
| elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = []
|
||||
| tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
|
||||
| tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
|
||||
| tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w
|
||||
| tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
|
||||
| tag == "JB" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
|
||||
| tag == "JBo" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
|
||||
| tag == "AT" = convertLemma pgf morpho (mkCId "Quant") "s False Sg" w
|
||||
| tag == "VVDi" = convertLemma pgf morpho (mkCId "V") "s VPast" w
|
||||
| tag == "VVDt" = convertLemma pgf morpho (mkCId "V2") "s VPast" w
|
||||
| tag == "VVDv" = convertLemma pgf morpho (mkCId "V") "s VPast" w
|
||||
| tag == "VVZi" = convertLemma pgf morpho (mkCId "V") "s VPres" w
|
||||
| tag == "VVZt" = convertLemma pgf morpho (mkCId "V2") "s VPres" w
|
||||
| tag == "VVZv" = convertLemma pgf morpho (mkCId "V") "s VPres" w
|
||||
| tag == "PPHS2"= convertLemma pgf morpho (mkCId "Pron") "s (NCase Nom)" w
|
||||
| tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w
|
||||
| tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w
|
||||
| tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w
|
||||
| tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w
|
||||
| otherwise = [w]
|
||||
convert pgf morpho (Phrase tag mods fn idx ts)
|
||||
| tag == "O" = concatMap (convert pgf morpho) ts
|
||||
| otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)]
|
||||
|
||||
convertLemma pgf morpho cat an0 w@(Word _ tag form _) =
|
||||
case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
|
||||
[f] -> [App f []]
|
||||
_ -> [w]
|
||||
where
|
||||
hasCat pgf f cat =
|
||||
case functionType pgf f of
|
||||
Just ty -> case unType ty of
|
||||
(_,cat1,_) -> cat1 == cat
|
||||
Nothing -> False
|
||||
|
||||
Reference in New Issue
Block a user