From 32508e37b37a0baad884e6aaa518071b62e1e6d8 Mon Sep 17 00:00:00 2001 From: aarne Date: Thu, 2 Mar 2006 17:19:36 +0000 Subject: [PATCH] treebanks in shell state: i -treebank and lt --- src/GF/CF/CFIdent.hs | 6 ++++-- src/GF/Compile/ShellState.hs | 26 ++++++++++++++++++++------ src/GF/Shell.hs | 13 +++++++++++++ src/GF/Shell/HelpFile.hs | 8 ++++++++ src/GF/Shell/PShell.hs | 1 + src/GF/Shell/ShellCommands.hs | 7 +++++-- src/GF/UseGrammar/Treebank.hs | 20 ++++++++++++++++++-- src/HelpFile | 8 ++++++++ 8 files changed, 77 insertions(+), 12 deletions(-) diff --git a/src/GF/CF/CFIdent.hs b/src/GF/CF/CFIdent.hs index df12be0f8..02ee482c0 100644 --- a/src/GF/CF/CFIdent.hs +++ b/src/GF/CF/CFIdent.hs @@ -238,14 +238,16 @@ compatCF (CFCat (CIQ _ c, l)) (CFCat (CIQ _ c', l')) = c==c' && l==l' -- instead of break wordsLits [] = [] wordsLits (c:cs) | isSpace c = wordsLits (dropWhile isSpace cs) - | c == '\'' || c == '"' + | isQuote c = let (l,rs) = breaks (==c) cs rs' = drop 1 rs in ([c]++l++[c]):wordsLits rs' - | otherwise = let (w,rs) = break isSpace cs + | otherwise = let (w,rs) = break isSpaceQ cs in (c:w):wordsLits rs where breaks c cs = case break c cs of (l@(_:_),d:rs) | last l == '\\' -> let (r,ts) = breaks c rs in (l++[d]++r, ts) v -> v + isQuote c = elem c "\"'" + isSpaceQ c = isSpace c ---- || isQuote c diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 33e20b03b..696b3776e 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -44,6 +44,7 @@ import qualified GF.Conversion.GFC as Cnv import qualified GF.Parsing.GFC as Prs import Data.List (nub,nubBy) +import qualified Data.Map as Map -- AR 11/11/2001 -- 17/6/2003 (for modules) ---- unfinished @@ -61,6 +62,7 @@ data ShellState = ShSt { -- (large, with parameters, no-so overgenerating) pInfos :: [(Ident, Prs.PInfo)], -- ^ parsing information (compiled mcfg&cfg grammars) morphos :: [(Ident,Morpho)], -- ^ morphologies + treebanks :: [(Ident,Treebank)], -- ^ treebanks probss :: [(Ident,Probs)], -- ^ probability distributions gloptions :: Options, -- ^ global options readFiles :: [(FilePath,ModTime)],-- ^ files read @@ -73,6 +75,8 @@ data ShellState = ShSt { transfers :: [(Ident,T.Env)] -- ^ transfer modules } +type Treebank = Map.Map String [(String,String)] -- lang, tree + actualConcretes :: ShellState -> [((Ident,Ident),Bool)] actualConcretes sh = nub [((c,c),b) | Just a <- [abstract sh], @@ -102,6 +106,7 @@ emptyShellState = ShSt { cfgs = [], pInfos = [], morphos = [], + treebanks = [], probss = [], gloptions = noOptions, readFiles = [], @@ -249,6 +254,7 @@ updateShellState opts ign mcnc sh ((_,sgr,gr),rts) = do cfgs = zip concrs cfgs, pInfos = zip concrs pInfos, morphos = zip concrs morphos, + treebanks = treebanks sh, probss = zip concrs probss, gloptions = gloptions sh, --- opts, -- this would be command-line options readFiles = [ft | ft@(f,_) <- readFiles sh, notInrts f] ++ rts, @@ -298,6 +304,7 @@ purgeShellState sh = ShSt { cfgs = cfgs sh, pInfos = pInfos sh, morphos = morphos sh, + treebanks = treebanks sh, probss = probss sh, gloptions = gloptions sh, readFiles = [], @@ -314,17 +321,17 @@ purgeShellState sh = ShSt { acncs = maybe [] singleton abstr ++ map (snd . fst) (actualConcretes sh) changeMain :: Maybe Ident -> ShellState -> Err ShellState -changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) = - return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) +changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) = + return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) changeMain - (Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) = + (Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) = case lookup c (M.modules ms) of Just _ -> do a <- M.abstractOfConcrete ms c let cas = M.allConcretes ms a let cs' = [((c,c),True) | c <- cas] return (ShSt (Just a) (Just c) cs' ms ss cfs old_pis mcfgs cfgs - pinfos mos pbs os rs acs s trs) + pinfos mos tbs pbs os rs acs s trs) _ -> P.prtBad "The state has no concrete syntax named" c -- | form just one state grammar, if unique, from a canonical grammar @@ -472,6 +479,13 @@ addTransfer :: (Ident,T.Env) -> ShellState -> ShellState addTransfer it@(i,_) sh = sh {transfers = it : filter ((/= i) . fst) (transfers sh)} +addTreebank :: (Ident,Treebank) -> ShellState -> ShellState +addTreebank it@(i,_) sh = + sh {treebanks = it : filter ((/= i) . fst) (treebanks sh)} + +findTreebank :: ShellState -> Ident -> Err Treebank +findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh + -- modify state type ShellStateOper = ShellState -> ShellState @@ -496,8 +510,8 @@ changeOptions f sh = sh {gloptions = f (gloptions sh)} changeModTimes :: [(FilePath,ModTime)] -> ShellStateOper --- __________ this is OBSOLETE changeModTimes mfs - (ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff ts ss trs) = - ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff' ts ss trs + (ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff ts ss trs) = + ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff' ts ss trs where ff' = mfs ++ [mf | mf@(f,_) <- ff, notElem f (map fst mfs)] diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 0d5332fb8..39460b2f2 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -181,6 +181,10 @@ execLine put (c@(co, os), arg, cs) (outps,st) = do execC :: CommandOpt -> ShellIO execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of + CImport file | oElem (iOpt "treebank") opts -> do + ss <- readFileIf file >>= return . lines + let tb = pre2treebank $ getTreebank ss + changeState (addTreebank (I.identC (takeWhile (/='.') file), tb)) sa CImport file | oElem fromExamples opts -> do es <- liftM nub $ getGFEFiles opts file system $ "gf -examples" +++ unlines es @@ -303,6 +307,15 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com comm = "command" ---- returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa + CLookupTreebank -> do + case treebanks st of + [] -> returnArg (AError "no treebank") sa + (_,tb):_ -> do + let s = prCommandArg a + let tes = map (string2treeErr gro . snd) $ lookupTreebank tb s + terms = [t | Ok t <- tes] + returnArg (ATrms terms) sa + CShowTreeGraph | oElem emitCode opts -> do -- -o returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa CShowTreeGraph -> do diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index 2f21184f1..d89877a3d 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -66,6 +66,7 @@ txtHelpFile = "\n -o do emit code (default with new grammar format)" ++ "\n -ex preprocess .gfe files if needed" ++ "\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++ + "\n -treebank read a treebank file to memory (xml format)" ++ "\n flags:" ++ "\n -abs set the name used for abstract syntax (with -old option)" ++ "\n -cnc set the name used for concrete syntax (with -old option)" ++ @@ -238,11 +239,18 @@ txtHelpFile = "\n -c compare to existing xml-formatted treebank" ++ "\n -trees return the trees of the treebank" ++ "\n -xml wrap the treebank (or comparison results) with XML tags" ++ + "\n -mem write the treebank in memory instead of a file TODO" ++ "\n examples:" ++ "\n gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file" ++ "\n rf tb.xml | tb -c -- compare-test treebank from file" ++ "\n rf old.xml | tb -trees | tb -xml -- create new treebank from old" ++ "\n" ++ + "\nlt, lookup_treebank: lt String" ++ + "\n Lookup a string in a treebank and return the resulting trees." ++ + "\n Use 'tb' to create a treebank and 'i -treebank' to read it in memory." ++ + "\n flag:" ++ + "\n -treebank use this treebank (instead of the latest introduced one) TODO" ++ + "\n" ++ "\ntt, test_tokenizer: tt String" ++ "\n Show the token list sent to the parser when String is parsed." ++ "\n HINT: can be useful when debugging the parser." ++ diff --git a/src/GF/Shell/PShell.hs b/src/GF/Shell/PShell.hs index 676e54c46..2c501ab9f 100644 --- a/src/GF/Shell/PShell.hs +++ b/src/GF/Shell/PShell.hs @@ -114,6 +114,7 @@ pCommand ws = case ws of "cc" : s -> aUnit $ CComputeConcrete $ unwords s "so" : s -> aUnit $ CShowOpers $ unwords s "tb" : [] -> aUnit CTreeBank + "lt" : s -> aString CLookupTreebank s "tq" : i:o:[] -> aUnit (CTranslationQuiz (language i) (language o)) "tl":i:o:[] -> aUnit (CTranslationList (language i) (language o)) diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index 1351e8784..bf27016a5 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -57,6 +57,8 @@ data Command = | CComputeConcrete String | CShowOpers String + | CLookupTreebank + | CTranslationQuiz Language Language | CTranslationList Language Language | CMorphoQuiz @@ -169,8 +171,9 @@ optionsOfCommand co = case co of CSetFlag -> both "utf8 table struct record all multi" "cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer" - CImport _ -> both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob" - "abs cnc res path optimize conversion cat preproc probs noparse" + CImport _ -> + both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob treebank" + "abs cnc res path optimize conversion cat preproc probs noparse" CRemoveLanguage _ -> none CEmptyState -> none CStripState -> none diff --git a/src/GF/UseGrammar/Treebank.hs b/src/GF/UseGrammar/Treebank.hs index 8f5fd71a7..befbae0c0 100644 --- a/src/GF/UseGrammar/Treebank.hs +++ b/src/GF/UseGrammar/Treebank.hs @@ -12,7 +12,14 @@ -- Purpose: to generate treebanks. ----------------------------------------------------------------------------- -module GF.UseGrammar.Treebank (mkTreebank,testTreebank,treesTreebank) where +module GF.UseGrammar.Treebank ( + mkTreebank, + testTreebank, + treesTreebank, + getTreebank, + lookupTreebank, + pre2treebank + ) where import GF.Compile.ShellState import GF.UseGrammar.Linear (linTree2string) @@ -28,6 +35,7 @@ import GF.Grammar.Values (tree2exp) import GF.Data.Operations import GF.Infra.Option import qualified GF.Grammar.Abstract as A +import qualified Data.Map as M -- Generate a treebank with a multilingual grammar. AR 8/2/2006 -- (c) Aarne Ranta 2006 under GNU GPL @@ -74,11 +82,13 @@ puts = return -- putStrLn ret = [] -- return () -- +type PreTreebank = [(String,[(String,String)])] + getTreebanks :: [String] -> [(String,String,String)] getTreebanks = concatMap grps . getTreebank where grps (t,lls) = [(t,x,y) | (x,y) <- lls] -getTreebank :: [String] -> [(String,[(String,String)])] +getTreebank :: [String] -> PreTreebank getTreebank ll = case ll of l:ls@(_:_:_) -> let (l1,l2) = getItem ls @@ -97,6 +107,12 @@ getTreebank ll = case ll of getLang = takeWhile (/='"') . tail . dropWhile (/='"') +lookupTreebank :: Treebank -> String -> [(String,String)] +lookupTreebank tb s = maybe [] id $ M.lookup s tb + +pre2treebank :: PreTreebank -> Treebank +pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls] + annot :: StateGrammar -> String -> A.Tree annot gr s = errVal (error "illegal tree") $ do let t = tree2exp $ string2tree gr s diff --git a/src/HelpFile b/src/HelpFile index 4c3973d02..aff813043 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -37,6 +37,7 @@ i, import: i File -o do emit code (default with new grammar format) -ex preprocess .gfe files if needed -prob read probabilities from top grammar file (format --# prob Fun Double) + -treebank read a treebank file to memory (xml format) flags: -abs set the name used for abstract syntax (with -old option) -cnc set the name used for concrete syntax (with -old option) @@ -209,11 +210,18 @@ tb, tree_bank: tb -c compare to existing xml-formatted treebank -trees return the trees of the treebank -xml wrap the treebank (or comparison results) with XML tags + -mem write the treebank in memory instead of a file TODO examples: gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file rf tb.xml | tb -c -- compare-test treebank from file rf old.xml | tb -trees | tb -xml -- create new treebank from old +lt, lookup_treebank: lt String + Lookup a string in a treebank and return the resulting trees. + Use 'tb' to create a treebank and 'i -treebank' to read it in memory. + flag: + -treebank use this treebank (instead of the latest introduced one) TODO + tt, test_tokenizer: tt String Show the token list sent to the parser when String is parsed. HINT: can be useful when debugging the parser.