mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-26 03:08:55 -06:00
distinguished uni and multi treebanks
This commit is contained in:
@@ -75,7 +75,7 @@ data ShellState = ShSt {
|
|||||||
transfers :: [(Ident,T.Env)] -- ^ transfer modules
|
transfers :: [(Ident,T.Env)] -- ^ transfer modules
|
||||||
}
|
}
|
||||||
|
|
||||||
type Treebank = Map.Map String [(String,String)] -- lang, tree
|
type Treebank = Map.Map String [String] -- string, trees
|
||||||
|
|
||||||
actualConcretes :: ShellState -> [((Ident,Ident),Bool)]
|
actualConcretes :: ShellState -> [((Ident,Ident),Bool)]
|
||||||
actualConcretes sh = nub [((c,c),b) |
|
actualConcretes sh = nub [((c,c),b) |
|
||||||
@@ -480,9 +480,8 @@ addTransfer :: (Ident,T.Env) -> ShellState -> ShellState
|
|||||||
addTransfer it@(i,_) sh =
|
addTransfer it@(i,_) sh =
|
||||||
sh {transfers = it : filter ((/= i) . fst) (transfers sh)}
|
sh {transfers = it : filter ((/= i) . fst) (transfers sh)}
|
||||||
|
|
||||||
addTreebank :: (Ident,Treebank) -> ShellState -> ShellState
|
addTreebanks :: [(Ident,Treebank)] -> ShellState -> ShellState
|
||||||
addTreebank it@(i,_) sh =
|
addTreebanks its sh = sh {treebanks = its ++ treebanks sh}
|
||||||
sh {treebanks = it : filter ((/= i) . fst) (treebanks sh)}
|
|
||||||
|
|
||||||
findTreebank :: ShellState -> Ident -> Err Treebank
|
findTreebank :: ShellState -> Ident -> Err Treebank
|
||||||
findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh
|
findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh
|
||||||
|
|||||||
@@ -585,7 +585,7 @@ removeAssoc :: Eq a => a -> [(a,b)] -> [(a,b)]
|
|||||||
removeAssoc a = filter ((/=a) . fst)
|
removeAssoc a = filter ((/=a) . fst)
|
||||||
|
|
||||||
-- | chop into separator-separated parts
|
-- | chop into separator-separated parts
|
||||||
chunks :: String -> [String] -> [[String]]
|
chunks :: Eq a => a -> [a] -> [[a]]
|
||||||
chunks sep ws = case span (/= sep) ws of
|
chunks sep ws = case span (/= sep) ws of
|
||||||
(a,_:b) -> a : bs where bs = chunks sep b
|
(a,_:b) -> a : bs where bs = chunks sep b
|
||||||
(a, []) -> if null a then [] else [a]
|
(a, []) -> if null a then [] else [a]
|
||||||
|
|||||||
@@ -182,9 +182,8 @@ execC :: CommandOpt -> ShellIO
|
|||||||
execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of
|
execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of
|
||||||
|
|
||||||
CImport file | oElem (iOpt "treebank") opts -> do
|
CImport file | oElem (iOpt "treebank") opts -> do
|
||||||
ss <- readFileIf file >>= return . lines
|
tbs <- readUniTreebanks file
|
||||||
let tb = pre2treebank $ getTreebank ss
|
changeState (addTreebanks tbs) sa
|
||||||
changeState (addTreebank (I.identC (takeWhile (/='.') file), tb)) sa
|
|
||||||
CImport file | oElem fromExamples opts -> do
|
CImport file | oElem fromExamples opts -> do
|
||||||
es <- liftM nub $ getGFEFiles opts file
|
es <- liftM nub $ getGFEFiles opts file
|
||||||
system $ "gf -examples" +++ unlines es
|
system $ "gf -examples" +++ unlines es
|
||||||
@@ -296,7 +295,7 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
|
|||||||
|
|
||||||
CTreeBank | oElem doCompute opts -> do -- -c
|
CTreeBank | oElem doCompute opts -> do -- -c
|
||||||
let bank = prCommandArg a
|
let bank = prCommandArg a
|
||||||
returnArg (AString $ unlines $ testTreebank opts st bank) sa
|
returnArg (AString $ unlines $ testMultiTreebank opts st bank) sa
|
||||||
CTreeBank | oElem getTrees opts -> do -- -trees
|
CTreeBank | oElem getTrees opts -> do -- -trees
|
||||||
let bank = prCommandArg a
|
let bank = prCommandArg a
|
||||||
tes = map (string2treeErr gro) $ treesTreebank opts bank
|
tes = map (string2treeErr gro) $ treesTreebank opts bank
|
||||||
@@ -305,21 +304,28 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
|
|||||||
CTreeBank -> do
|
CTreeBank -> do
|
||||||
let ts = strees $ s2t $ snd sa
|
let ts = strees $ s2t $ snd sa
|
||||||
comm = "command" ----
|
comm = "command" ----
|
||||||
returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa
|
returnArg (AString $ unlines $ mkMultiTreebank opts st comm ts) sa
|
||||||
|
|
||||||
CLookupTreebank -> do
|
CLookupTreebank -> do
|
||||||
let tbs = treebanks st
|
let tbs = treebanks st
|
||||||
|
let s = prCommandArg a
|
||||||
if null tbs
|
if null tbs
|
||||||
then returnArg (AError "no treebank") sa
|
then returnArg (AError "no treebank") sa
|
||||||
else do
|
else do
|
||||||
let tbi = maybe (fst $ head tbs) I.identC (getOptVal opts (aOpt "treebank"))
|
let tbi = maybe (fst $ head tbs) I.identC (getOptVal opts (aOpt "treebank"))
|
||||||
case lookup tbi tbs of
|
case lookup tbi tbs of
|
||||||
Nothing -> returnArg (AError ("no treebank" +++ prt tbi)) sa
|
Nothing -> returnArg (AError ("no treebank" +++ prt tbi)) sa
|
||||||
Just tb -> do
|
Just tb -> case () of
|
||||||
let s = prCommandArg a
|
_ | oElem (iOpt "strings") opts -> do
|
||||||
let tes = map (string2treeErr gro . snd) $ lookupTreebank tb s
|
returnArg (AString $ unlines $ map fst $ assocsTreebank tb) sa
|
||||||
terms = [t | Ok t <- tes]
|
_ | oElem (iOpt "raw") opts -> do
|
||||||
returnArg (ATrms terms) sa
|
returnArg (AString $ unlines $ lookupTreebank tb s) sa
|
||||||
|
_ | oElem (iOpt "assocs") opts -> do
|
||||||
|
returnArg (AString $ unlines $ map printAssoc $ assocsTreebank tb) sa
|
||||||
|
_ -> do
|
||||||
|
let tes = map (string2treeErr gro) $ lookupTreebank tb s
|
||||||
|
terms = [t | Ok t <- tes]
|
||||||
|
returnArg (ATrms terms) sa
|
||||||
|
|
||||||
CShowTreeGraph | oElem emitCode opts -> do -- -o
|
CShowTreeGraph | oElem emitCode opts -> do -- -o
|
||||||
returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa
|
returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa
|
||||||
|
|||||||
@@ -13,12 +13,19 @@
|
|||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
|
|
||||||
module GF.UseGrammar.Treebank (
|
module GF.UseGrammar.Treebank (
|
||||||
mkTreebank,
|
mkMultiTreebank,
|
||||||
testTreebank,
|
mkUniTreebank,
|
||||||
|
multi2uniTreebank,
|
||||||
|
uni2multiTreebank,
|
||||||
|
testMultiTreebank,
|
||||||
treesTreebank,
|
treesTreebank,
|
||||||
getTreebank,
|
getTreebank,
|
||||||
|
getUniTreebank,
|
||||||
|
readUniTreebanks,
|
||||||
|
readMultiTreebank,
|
||||||
lookupTreebank,
|
lookupTreebank,
|
||||||
pre2treebank
|
assocsTreebank,
|
||||||
|
printAssoc
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import GF.Compile.ShellState
|
import GF.Compile.ShellState
|
||||||
@@ -34,15 +41,54 @@ import GF.Grammar.PrGrammar (prt_)
|
|||||||
import GF.Grammar.Values (tree2exp)
|
import GF.Grammar.Values (tree2exp)
|
||||||
import GF.Data.Operations
|
import GF.Data.Operations
|
||||||
import GF.Infra.Option
|
import GF.Infra.Option
|
||||||
|
import GF.Infra.Ident (Ident)
|
||||||
|
import GF.Infra.UseIO
|
||||||
import qualified GF.Grammar.Abstract as A
|
import qualified GF.Grammar.Abstract as A
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
|
|
||||||
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
|
||||||
-- (c) Aarne Ranta 2006 under GNU GPL
|
-- (c) Aarne Ranta 2006 under GNU GPL
|
||||||
|
|
||||||
|
-- keys are trees; format: XML file
|
||||||
|
type MultiTreebank = [(String,[(String,String)])] -- tree,lang,lin
|
||||||
|
|
||||||
|
-- keys are strings; format: string TAB tree TAB ... TAB tree
|
||||||
|
type UniTreebank = Treebank -- M.Map String [String] -- string,tree
|
||||||
|
|
||||||
|
-- both formats can be read from both kinds of files
|
||||||
|
readUniTreebanks :: FilePath -> IO [(Ident,UniTreebank)]
|
||||||
|
readUniTreebanks file = do
|
||||||
|
s <- readFileIf file
|
||||||
|
return $ if isMultiTreebank s
|
||||||
|
then multi2uniTreebank $ getTreebank $ lines s
|
||||||
|
else
|
||||||
|
let tb = getUniTreebank $ lines s
|
||||||
|
in [(zIdent (unsuffixFile file),tb)]
|
||||||
|
|
||||||
|
readMultiTreebank :: FilePath -> IO MultiTreebank
|
||||||
|
readMultiTreebank file = do
|
||||||
|
s <- readFileIf file
|
||||||
|
return $ if isMultiTreebank s
|
||||||
|
then getTreebank $ lines s
|
||||||
|
else uni2multiTreebank (zIdent (unsuffixFile file)) $ getUniTreebank $ lines s
|
||||||
|
|
||||||
|
isMultiTreebank :: String -> Bool
|
||||||
|
isMultiTreebank s = take 10 s == "<treebank>"
|
||||||
|
|
||||||
|
multi2uniTreebank :: MultiTreebank -> [(Ident,UniTreebank)]
|
||||||
|
multi2uniTreebank mt@((_,lls):_) = [(zIdent la, mkTb la) | (la,_) <- lls] where
|
||||||
|
mkTb la = M.fromListWith (++) [(s,[t]) | (t,lls) <- mt, (l,s) <- lls, l==la]
|
||||||
|
multi2uniTreebank [] = []
|
||||||
|
|
||||||
|
uni2multiTreebank :: Ident -> UniTreebank -> MultiTreebank
|
||||||
|
uni2multiTreebank la tb =
|
||||||
|
[(t,[(prt_ la, s)]) | (s,ts) <- assocsTreebank tb, t <- ts]
|
||||||
|
|
||||||
-- | the main functions
|
-- | the main functions
|
||||||
mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
|
||||||
mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris)
|
-- builds a treebank where trees are the keys, and writes a file (opt. XML)
|
||||||
|
mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
|
||||||
|
mkMultiTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris)
|
||||||
where
|
where
|
||||||
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
|
||||||
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
|
||||||
@@ -56,10 +102,19 @@ mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem t
|
|||||||
lang lg = " lang=" ++ show (prt_ (zIdent lg))
|
lang lg = " lang=" ++ show (prt_ (zIdent lg))
|
||||||
tris = zip trees [1..]
|
tris = zip trees [1..]
|
||||||
|
|
||||||
testTreebank :: Options -> ShellState -> String -> Res
|
-- builds a unilingual treebank where strings are the keys into an internal treebank
|
||||||
testTreebank opts sh = putInXML opts "testtreebank" [] .
|
|
||||||
concatMap testOne .
|
mkUniTreebank :: Options -> ShellState -> Language -> [A.Tree] -> Treebank
|
||||||
getTreebanks . lines
|
mkUniTreebank opts sh lg trees = M.fromListWith (++) [(lin t, [prt_ t]) | t <- trees]
|
||||||
|
where
|
||||||
|
lang = prt_ lg
|
||||||
|
lin t = linearize sh lang t
|
||||||
|
|
||||||
|
-- reads a treebank and linearizes its trees again, printing all differences
|
||||||
|
testMultiTreebank :: Options -> ShellState -> String -> Res
|
||||||
|
testMultiTreebank opts sh = putInXML opts "testtreebank" [] .
|
||||||
|
concatMap testOne .
|
||||||
|
getTreebanks . lines
|
||||||
where
|
where
|
||||||
testOne (e,lang,str0) = do
|
testOne (e,lang,str0) = do
|
||||||
let tr = annot gr e
|
let tr = annot gr e
|
||||||
@@ -71,6 +126,7 @@ testTreebank opts sh = putInXML opts "testtreebank" [] .
|
|||||||
]
|
]
|
||||||
gr = firstStateGrammar sh
|
gr = firstStateGrammar sh
|
||||||
|
|
||||||
|
-- writes all the trees of the treebank
|
||||||
treesTreebank :: Options -> String -> [String]
|
treesTreebank :: Options -> String -> [String]
|
||||||
treesTreebank _ = terms . getTreebank . lines where
|
treesTreebank _ = terms . getTreebank . lines where
|
||||||
terms ts = [t | (t,_) <- ts]
|
terms ts = [t | (t,_) <- ts]
|
||||||
@@ -82,13 +138,17 @@ puts = return -- putStrLn
|
|||||||
ret = [] -- return ()
|
ret = [] -- return ()
|
||||||
--
|
--
|
||||||
|
|
||||||
type PreTreebank = [(String,[(String,String)])]
|
-- here strings are keys
|
||||||
|
assocsTreebank :: UniTreebank -> [(String,[String])]
|
||||||
|
assocsTreebank = M.assocs
|
||||||
|
|
||||||
|
printAssoc (s, ts) = s ++ concat ["\t" ++ t | t <- ts]
|
||||||
|
|
||||||
getTreebanks :: [String] -> [(String,String,String)]
|
getTreebanks :: [String] -> [(String,String,String)]
|
||||||
getTreebanks = concatMap grps . getTreebank where
|
getTreebanks = concatMap grps . getTreebank where
|
||||||
grps (t,lls) = [(t,x,y) | (x,y) <- lls]
|
grps (t,lls) = [(t,x,y) | (x,y) <- lls]
|
||||||
|
|
||||||
getTreebank :: [String] -> PreTreebank
|
getTreebank :: [String] -> MultiTreebank
|
||||||
getTreebank ll = case ll of
|
getTreebank ll = case ll of
|
||||||
l:ls@(_:_:_) ->
|
l:ls@(_:_:_) ->
|
||||||
let (l1,l2) = getItem ls
|
let (l1,l2) = getItem ls
|
||||||
@@ -107,11 +167,12 @@ getTreebank ll = case ll of
|
|||||||
|
|
||||||
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
|
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
|
||||||
|
|
||||||
lookupTreebank :: Treebank -> String -> [(String,String)]
|
getUniTreebank :: [String] -> UniTreebank
|
||||||
lookupTreebank tb s = maybe [] id $ M.lookup s tb
|
getUniTreebank ls = M.fromListWith (++) [(s, ts) | s:ts <- map chop ls] where
|
||||||
|
chop = chunks '\t'
|
||||||
|
|
||||||
pre2treebank :: PreTreebank -> Treebank
|
lookupTreebank :: Treebank -> String -> [String]
|
||||||
pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls]
|
lookupTreebank tb s = maybe [] id $ M.lookup s tb
|
||||||
|
|
||||||
annot :: StateGrammar -> String -> A.Tree
|
annot :: StateGrammar -> String -> A.Tree
|
||||||
annot gr s = errVal (error "illegal tree") $ do
|
annot gr s = errVal (error "illegal tree") $ do
|
||||||
|
|||||||
Reference in New Issue
Block a user