diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 269b9adb1..0718814c5 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -75,7 +75,7 @@ data ShellState = ShSt { transfers :: [(Ident,T.Env)] -- ^ transfer modules } -type Treebank = Map.Map String [(String,String)] -- lang, tree +type Treebank = Map.Map String [String] -- string, trees actualConcretes :: ShellState -> [((Ident,Ident),Bool)] actualConcretes sh = nub [((c,c),b) | @@ -480,9 +480,8 @@ addTransfer :: (Ident,T.Env) -> ShellState -> ShellState addTransfer it@(i,_) sh = sh {transfers = it : filter ((/= i) . fst) (transfers sh)} -addTreebank :: (Ident,Treebank) -> ShellState -> ShellState -addTreebank it@(i,_) sh = - sh {treebanks = it : filter ((/= i) . fst) (treebanks sh)} +addTreebanks :: [(Ident,Treebank)] -> ShellState -> ShellState +addTreebanks its sh = sh {treebanks = its ++ treebanks sh} findTreebank :: ShellState -> Ident -> Err Treebank findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh diff --git a/src/GF/Data/Operations.hs b/src/GF/Data/Operations.hs index f5434486f..ac1ec85bb 100644 --- a/src/GF/Data/Operations.hs +++ b/src/GF/Data/Operations.hs @@ -585,7 +585,7 @@ removeAssoc :: Eq a => a -> [(a,b)] -> [(a,b)] removeAssoc a = filter ((/=a) . fst) -- | chop into separator-separated parts -chunks :: String -> [String] -> [[String]] +chunks :: Eq a => a -> [a] -> [[a]] chunks sep ws = case span (/= sep) ws of (a,_:b) -> a : bs where bs = chunks sep b (a, []) -> if null a then [] else [a] diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index d502b74ce..cde5ff743 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -182,9 +182,8 @@ execC :: CommandOpt -> ShellIO execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of CImport file | oElem (iOpt "treebank") opts -> do - ss <- readFileIf file >>= return . lines - let tb = pre2treebank $ getTreebank ss - changeState (addTreebank (I.identC (takeWhile (/='.') file), tb)) sa + tbs <- readUniTreebanks file + changeState (addTreebanks tbs) sa CImport file | oElem fromExamples opts -> do es <- liftM nub $ getGFEFiles opts file system $ "gf -examples" +++ unlines es @@ -296,7 +295,7 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com CTreeBank | oElem doCompute opts -> do -- -c let bank = prCommandArg a - returnArg (AString $ unlines $ testTreebank opts st bank) sa + returnArg (AString $ unlines $ testMultiTreebank opts st bank) sa CTreeBank | oElem getTrees opts -> do -- -trees let bank = prCommandArg a tes = map (string2treeErr gro) $ treesTreebank opts bank @@ -305,21 +304,28 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com CTreeBank -> do let ts = strees $ s2t $ snd sa comm = "command" ---- - returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa + returnArg (AString $ unlines $ mkMultiTreebank opts st comm ts) sa CLookupTreebank -> do let tbs = treebanks st + let s = prCommandArg a if null tbs then returnArg (AError "no treebank") sa else do let tbi = maybe (fst $ head tbs) I.identC (getOptVal opts (aOpt "treebank")) case lookup tbi tbs of Nothing -> returnArg (AError ("no treebank" +++ prt tbi)) sa - Just tb -> do - let s = prCommandArg a - let tes = map (string2treeErr gro . snd) $ lookupTreebank tb s - terms = [t | Ok t <- tes] - returnArg (ATrms terms) sa + Just tb -> case () of + _ | oElem (iOpt "strings") opts -> do + returnArg (AString $ unlines $ map fst $ assocsTreebank tb) sa + _ | oElem (iOpt "raw") opts -> do + returnArg (AString $ unlines $ lookupTreebank tb s) sa + _ | oElem (iOpt "assocs") opts -> do + returnArg (AString $ unlines $ map printAssoc $ assocsTreebank tb) sa + _ -> do + let tes = map (string2treeErr gro) $ lookupTreebank tb s + terms = [t | Ok t <- tes] + returnArg (ATrms terms) sa CShowTreeGraph | oElem emitCode opts -> do -- -o returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa diff --git a/src/GF/UseGrammar/Treebank.hs b/src/GF/UseGrammar/Treebank.hs index befbae0c0..f1dd5b75b 100644 --- a/src/GF/UseGrammar/Treebank.hs +++ b/src/GF/UseGrammar/Treebank.hs @@ -13,12 +13,19 @@ ----------------------------------------------------------------------------- module GF.UseGrammar.Treebank ( - mkTreebank, - testTreebank, + mkMultiTreebank, + mkUniTreebank, + multi2uniTreebank, + uni2multiTreebank, + testMultiTreebank, treesTreebank, getTreebank, + getUniTreebank, + readUniTreebanks, + readMultiTreebank, lookupTreebank, - pre2treebank + assocsTreebank, + printAssoc ) where import GF.Compile.ShellState @@ -34,15 +41,54 @@ import GF.Grammar.PrGrammar (prt_) import GF.Grammar.Values (tree2exp) import GF.Data.Operations import GF.Infra.Option +import GF.Infra.Ident (Ident) +import GF.Infra.UseIO import qualified GF.Grammar.Abstract as A import qualified Data.Map as M -- Generate a treebank with a multilingual grammar. AR 8/2/2006 -- (c) Aarne Ranta 2006 under GNU GPL +-- keys are trees; format: XML file +type MultiTreebank = [(String,[(String,String)])] -- tree,lang,lin + +-- keys are strings; format: string TAB tree TAB ... TAB tree +type UniTreebank = Treebank -- M.Map String [String] -- string,tree + +-- both formats can be read from both kinds of files +readUniTreebanks :: FilePath -> IO [(Ident,UniTreebank)] +readUniTreebanks file = do + s <- readFileIf file + return $ if isMultiTreebank s + then multi2uniTreebank $ getTreebank $ lines s + else + let tb = getUniTreebank $ lines s + in [(zIdent (unsuffixFile file),tb)] + +readMultiTreebank :: FilePath -> IO MultiTreebank +readMultiTreebank file = do + s <- readFileIf file + return $ if isMultiTreebank s + then getTreebank $ lines s + else uni2multiTreebank (zIdent (unsuffixFile file)) $ getUniTreebank $ lines s + +isMultiTreebank :: String -> Bool +isMultiTreebank s = take 10 s == "" + +multi2uniTreebank :: MultiTreebank -> [(Ident,UniTreebank)] +multi2uniTreebank mt@((_,lls):_) = [(zIdent la, mkTb la) | (la,_) <- lls] where + mkTb la = M.fromListWith (++) [(s,[t]) | (t,lls) <- mt, (l,s) <- lls, l==la] +multi2uniTreebank [] = [] + +uni2multiTreebank :: Ident -> UniTreebank -> MultiTreebank +uni2multiTreebank la tb = + [(t,[(prt_ la, s)]) | (s,ts) <- assocsTreebank tb, t <- ts] + -- | the main functions -mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res -mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris) + +-- builds a treebank where trees are the keys, and writes a file (opt. XML) +mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res +mkMultiTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris) where mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs) -- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs) @@ -56,10 +102,19 @@ mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem t lang lg = " lang=" ++ show (prt_ (zIdent lg)) tris = zip trees [1..] -testTreebank :: Options -> ShellState -> String -> Res -testTreebank opts sh = putInXML opts "testtreebank" [] . - concatMap testOne . - getTreebanks . lines +-- builds a unilingual treebank where strings are the keys into an internal treebank + +mkUniTreebank :: Options -> ShellState -> Language -> [A.Tree] -> Treebank +mkUniTreebank opts sh lg trees = M.fromListWith (++) [(lin t, [prt_ t]) | t <- trees] + where + lang = prt_ lg + lin t = linearize sh lang t + +-- reads a treebank and linearizes its trees again, printing all differences +testMultiTreebank :: Options -> ShellState -> String -> Res +testMultiTreebank opts sh = putInXML opts "testtreebank" [] . + concatMap testOne . + getTreebanks . lines where testOne (e,lang,str0) = do let tr = annot gr e @@ -71,6 +126,7 @@ testTreebank opts sh = putInXML opts "testtreebank" [] . ] gr = firstStateGrammar sh +-- writes all the trees of the treebank treesTreebank :: Options -> String -> [String] treesTreebank _ = terms . getTreebank . lines where terms ts = [t | (t,_) <- ts] @@ -82,13 +138,17 @@ puts = return -- putStrLn ret = [] -- return () -- -type PreTreebank = [(String,[(String,String)])] +-- here strings are keys +assocsTreebank :: UniTreebank -> [(String,[String])] +assocsTreebank = M.assocs + +printAssoc (s, ts) = s ++ concat ["\t" ++ t | t <- ts] getTreebanks :: [String] -> [(String,String,String)] getTreebanks = concatMap grps . getTreebank where grps (t,lls) = [(t,x,y) | (x,y) <- lls] -getTreebank :: [String] -> PreTreebank +getTreebank :: [String] -> MultiTreebank getTreebank ll = case ll of l:ls@(_:_:_) -> let (l1,l2) = getItem ls @@ -107,11 +167,12 @@ getTreebank ll = case ll of getLang = takeWhile (/='"') . tail . dropWhile (/='"') -lookupTreebank :: Treebank -> String -> [(String,String)] -lookupTreebank tb s = maybe [] id $ M.lookup s tb +getUniTreebank :: [String] -> UniTreebank +getUniTreebank ls = M.fromListWith (++) [(s, ts) | s:ts <- map chop ls] where + chop = chunks '\t' -pre2treebank :: PreTreebank -> Treebank -pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls] +lookupTreebank :: Treebank -> String -> [String] +lookupTreebank tb s = maybe [] id $ M.lookup s tb annot :: StateGrammar -> String -> A.Tree annot gr s = errVal (error "illegal tree") $ do