diff --git a/doc/gf-history.html b/doc/gf-history.html index 5639f57ac..193b18a50 100644 --- a/doc/gf-history.html +++ b/doc/gf-history.html @@ -12,6 +12,14 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2 +31/10 (AR) Probabilistic grammars. Probabilities can be used to +weight random generation (gr -prob) and to rank parse +results (p -prob). They are read from a separate file +(flag i -probs=File, format --# prob Fun Double) +or from the top-level grammar file itself (option i -prob). + +

+ 12/10 (AR) Flag -atoms=Int to the command gt = generate_trees takes away all zero-argument functions except Int per category. In this way, it is possible to generate a corpus illustrating each diff --git a/src/GF/API/IOGrammar.hs b/src/GF/API/IOGrammar.hs index 987800e16..f06799da1 100644 --- a/src/GF/API/IOGrammar.hs +++ b/src/GF/API/IOGrammar.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/04/21 16:45:58 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.18 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.19 $ -- -- for reading grammars and terms from strings and files ----------------------------------------------------------------------------- @@ -21,6 +21,7 @@ import GF.Compile.PGrammar import GF.Grammar.TypeCheck import GF.Compile.Compile import GF.Compile.ShellState +import GF.Probabilistic.Probabilistic import GF.Infra.Modules import GF.Infra.ReadFiles (isOldFile) @@ -50,7 +51,9 @@ string2annotTree gr m = annotate gr . string2absTerm (prt m) ---- prt ---string2paramList st = map (renameTrm (lookupConcrete st) . patt2term) . pPattList shellStateFromFiles :: Options -> ShellState -> FilePath -> IOE ShellState -shellStateFromFiles opts st file = case fileSuffix file of +shellStateFromFiles opts st file = do + let top = identC $ justModuleName file + sh <- case fileSuffix file of "gfcm" -> do cenv <- compileOne opts (compileEnvShSt st []) file ioeErr $ updateShellState opts Nothing st cenv @@ -66,10 +69,14 @@ shellStateFromFiles opts st file = case fileSuffix file of then addOptions (options []) opts' -- for old no emit else addOptions (options [emitCode]) opts' grts <- compileModule osb st file - let top = identC $ justModuleName file - mtop = if oElem showOld opts' then Nothing else Just top + let mtop = if oElem showOld opts' then Nothing else Just top ioeErr $ updateShellState opts' mtop st grts - --- liftM (changeModTimes rts) $ grammar2shellState opts gr + if (isSetFlag opts probFile || oElem (iOpt "prob") opts) + then do + probs <- ioeIO $ getProbsFromFile opts file + let lang = maybe top id $ concrete sh --- to work with cf, too + ioeErr $ addProbs (lang,probs) sh + else return sh getShellStateFromFiles :: Options -> FilePath -> IO ShellState getShellStateFromFiles os = diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index e00e2e477..2d87bdf67 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/30 23:44:00 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.49 $ +-- > CVS $Revision: 1.50 $ -- -- (Description of the module) ----------------------------------------------------------------------------- @@ -462,6 +462,13 @@ abstractOfState = maybe emptyAbstractST id . maybeStateAbstract stateIsWord :: StateGrammar -> String -> Bool stateIsWord sg = isKnownWord (stateMorpho sg) +addProbs :: (Ident,Probs) -> ShellState -> Err ShellState +addProbs ip@(lang,probs) + sh@(ShSt x y cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s) = do + let gr = grammarOfLang sh lang + probs' <- checkGrammarProbs gr probs + let pbs' = (lang,probs') : filter ((/= lang) . fst) pbs + return (ShSt x y cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs' os rs acs s) {- diff --git a/src/GF/Probabilistic/Probabilistic.hs b/src/GF/Probabilistic/Probabilistic.hs index 81f9a60d0..daf382790 100644 --- a/src/GF/Probabilistic/Probabilistic.hs +++ b/src/GF/Probabilistic/Probabilistic.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 08:12:18 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.2 $ +-- > CVS $Revision: 1.3 $ -- -- Probabilistic abstract syntax. AR 30\/10\/2005 -- @@ -26,6 +26,7 @@ module GF.Probabilistic.Probabilistic ( ,Probs -- = BinTree Ident Double ,getProbsFromFile -- :: Opts -> IO Probs ,emptyProbs -- :: Probs + ,prProbs -- :: Probs -> String ) where import GF.Canon.GFC @@ -54,8 +55,10 @@ generateRandomTreesProb opts gen gr probs cat = cat' = prt $ snd cat -- | check that probabilities attached to a grammar make sense -checkGrammarProbs :: GFCGrammar -> Probs -> Err () -checkGrammarProbs gr probs = err Bad (const (return ())) $ checkSGrammar $ gr2sgr gr probs +checkGrammarProbs :: GFCGrammar -> Probs -> Err Probs +checkGrammarProbs gr probs = + err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr gr probs where + gr2probs sgr = buildTree [(zIdent f,p) | (_,rs) <- tree2list sgr, ((p,f),_) <- rs] -- | compute the probability of a given tree computeProbTree :: Probs -> Tree -> Double @@ -71,14 +74,14 @@ computeProbTree probs (Tr (N (_,at,_,_,_),ts)) = case at of rankByScore :: Ord n => [(a,n)] -> [(a,n)] rankByScore = sortBy (\ (_,p) (_,q) -> compare q p) -getProbsFromFile :: Options -> IO Probs -getProbsFromFile opts = do - s <- maybe (return "") readFile $ getOptVal opts probFile +getProbsFromFile :: Options -> FilePath -> IO Probs +getProbsFromFile opts file = do + s <- maybe (readFile file) readFile $ getOptVal opts probFile return $ buildTree $ concatMap pProb $ lines s where pProb s = case words s of - "--":f:p:_ | isDouble p -> [(zIdent f, read p)] - f:p:_ | isDouble p -> [(zIdent f, read p)] + "--#":"prob":f:p:_ | isDouble p -> [(zIdent f, read p)] + f:p:_ | isDouble p -> [(zIdent f, read p)] _ -> [] isDouble = all (flip elem ('.':['0'..'9'])) @@ -86,7 +89,11 @@ type Probs = BinTree Ident Double emptyProbs :: Probs emptyProbs = emptyBinTree - + +prProbs :: Probs -> String +prProbs = unlines . map pr . tree2list where + pr (f,p) = prt f ++ "\t" ++ show p + ------------------------------------------ -- translate grammar to simpler form and generated trees back @@ -151,21 +158,14 @@ genTree :: [Double] -> SGrammar -> SCat -> (STree,Int) genTree rs gr = gett rs where gett ds "String" = (SString "foo",1) gett ds "Int" = (SInt 1978,1) - gett ds cat = let + gett ds cat = case look cat of + [] -> (SMeta cat,1) -- if no productions, return ? + fs -> let d:ds2 = ds - (pf,args) = getf d cat + (pf,args) = getf d fs (ts,k) = getts ds2 args in (SApp (pf,ts), k+1) - getf d cat = - let - regs0 = [(p,(pf,args)) | (pf@(p,_),(args,_)) <- look cat] -{- not needed - pstd = 1.0 / genericLength regs - regs = if any (>1.0) (map fst regs0) - then [(pstd,pa) | (_,pa) <- regs0] - else regs0 --} - in hitRegion d regs0 + getf d fs = hitRegion d [(p,(pf,args)) | (pf@(p,_),(args,_)) <- fs] getts ds cats = case cats of c:cs -> let (t, k) = gett ds c diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 03a47a05c..488504c65 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 08:12:18 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.48 $ +-- > CVS $Revision: 1.49 $ -- -- GF shell command interpreter. ----------------------------------------------------------------------------- @@ -222,8 +222,8 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com let p = optParseArgErrMsg opts gro x case p of Ok (ts,msg) - | isSetFlag opts probFile -> do - probs <- getProbsFromFile opts + | oElem (iOpt "prob") opts -> do + let probs = stateProbs gro let tps = rankByScore [(t,computeProbTree probs t) | t <- ts] putStrLnFlush msg mapM_ putStrLnFlush [show p | (t,p) <- tps] @@ -235,17 +235,8 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com let a' = opST2CommandArg (optParseArgErr opts (sgr il)) a returnArg (opTS2CommandArg (optLinearizeTreeVal opts (sgr ol)) a') sa - - CGenerateRandom | isSetFlag opts probFile -> do - probs <- getProbsFromFile opts - let cat = firstAbsCat opts gro - let n = optIntOrN opts flagNumber 1 - gen <- newStdGen - let ts = take n $ generateRandomTreesProb opts gen cgr probs cat - returnArg (ATrms (map (term2tree gro) ts)) sa - - CGenerateRandom | oElem showCF opts -> do - let probs = emptyProbs --- + CGenerateRandom | oElem showCF opts || oElem (iOpt "prob") opts -> do + let probs = stateProbs gro let cat = firstAbsCat opts gro let n = optIntOrN opts flagNumber 1 gen <- newStdGen diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index b139ba647..e2216ce64 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/12 12:38:30 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.17 $ +-- > CVS $Revision: 1.18 $ -- -- Help on shell commands. Generated from HelpFile by 'make help'. -- PLEASE DON'T EDIT THIS FILE. @@ -63,6 +63,7 @@ txtHelpFile = "\n -noemit do not emit code (default with old grammar format)" ++ "\n -o do emit code (default with new grammar format)" ++ "\n -ex preprocess .gfe files if needed" ++ + "\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++ "\n flags:" ++ "\n -abs set the name used for abstract syntax (with -old option)" ++ "\n -cnc set the name used for concrete syntax (with -old option)" ++ @@ -70,6 +71,7 @@ txtHelpFile = "\n -path use the (colon-separated) search path to find modules" ++ "\n -optimize select an optimization to override file-defined flags" ++ "\n -conversion select parsing method (values strict|nondet)" ++ + "\n -probs read probabilities from file (format (--# prob) Fun Double)" ++ "\n examples:" ++ "\n i English.gf -- ordinary import of Concrete" ++ "\n i -retain german/ParadigmsGer.gf -- import of Resource to test" ++ @@ -194,6 +196,7 @@ txtHelpFile = "\n options for batch input:" ++ "\n -lines parse each line of input separately, ignoring empty lines" ++ "\n -all as -lines, but also parse empty lines" ++ + "\n -prob rank results by probability" ++ "\n options for selecting parsing method:" ++ "\n (default)parse using an overgenerating CFG" ++ "\n -cfg parse using a much less overgenerating CFG" ++ @@ -270,6 +273,9 @@ txtHelpFile = "\n Generates a random Tree of a given category. If a Tree" ++ "\n argument is given, the command completes the Tree with values to" ++ "\n the metavariables in the tree. " ++ + "\n options:" ++ + "\n -prob use probabilities (works for nondep types only)" ++ + "\n -cf use a very fast method (works for nondep types only)" ++ "\n flags:" ++ "\n -cat generate in this category" ++ "\n -lang use the abstract syntax of this grammar" ++ @@ -566,6 +572,7 @@ txtHelpFile = "\n *-printer=xml XML: DTD for the pg command, object for st" ++ "\n -printer=old old GF: file readable by GF 1.2" ++ "\n -printer=stat show some statistics of generated GFC" ++ + "\n -printer=probs show probabilities of all functions" ++ "\n -printer=gsl Nuance GSL speech recognition grammar" ++ "\n -printer=jsgf Java Speech Grammar Format" ++ "\n -printer=slf a finite automaton in the HTK SLF format" ++ diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index eac97b22c..c5b0c479e 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 08:12:18 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ -- > CVS $Author: aarne $ --- > CVS $Revision: 1.44 $ +-- > CVS $Revision: 1.45 $ -- -- The datatype of shell commands and the list of their options. ----------------------------------------------------------------------------- @@ -165,18 +165,18 @@ optionsOfCommand co = case co of CSetFlag -> both "utf8 table struct record all multi" "cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer" - CImport _ -> both "old v s src retain nocf nocheckcirc cflexer noemit o ex" - "abs cnc res path optimize conversion cat" + CImport _ -> both "old v s src retain nocf nocheckcirc cflexer noemit o ex prob" + "abs cnc res path optimize conversion cat probs" CRemoveLanguage _ -> none CEmptyState -> none CStripState -> none CTransformGrammar _ -> flags "printer" CConvertLatex _ -> none CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer" - CParse -> both "new newer cfg mcfg n ign raw v lines all" - "cat lang lexer parser number rawtrees probs" + CParse -> both "new newer cfg mcfg n ign raw v lines all prob" + "cat lang lexer parser number rawtrees" CTranslate _ _ -> opts "cat lexer parser" - CGenerateRandom -> flags "cat lang number depth probs" + CGenerateRandom -> both "cf prob" "cat lang number depth" CGenerateTrees -> both "metas" "atoms depth alts cat lang number" CPutTerm -> flags "transform number" CWrapTerm _ -> opts "c" diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 9887a2371..67ed388f8 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 16:48:10 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.79 $ +-- > CVS $Date: 2005/10/31 19:02:35 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.80 $ -- -- A database for customizable GF shell commands. -- @@ -68,6 +68,7 @@ import GF.UseGrammar.Information import GF.API.GrammarToHaskell -----import GrammarToCanon (showCanon, showCanonOpt) -----import qualified GrammarToGFC as GFC +import GF.Probabilistic.Probabilistic (prProbs) -- the cf parsing algorithms import GF.CF.ChartParser -- OBSOLETE @@ -266,6 +267,7 @@ customGrammarPrinter = ,(strCI "words", unwords . stateGrammarWords) ,(strCI "printnames", C.prPrintnamesGrammar . stateGrammarST) ,(strCI "stat", prStatistics . stateGrammarST) + ,(strCI "probs", prProbs . stateProbs) ,(strCI "unpar", prCanon . unparametrizeCanon . stateGrammarST) ,(strCI "subs", prSubtermStat . stateGrammarST) diff --git a/src/HelpFile b/src/HelpFile index c28a9d2fc..573191204 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -34,6 +34,7 @@ i, import: i File -noemit do not emit code (default with old grammar format) -o do emit code (default with new grammar format) -ex preprocess .gfe files if needed + -prob read probabilities from top grammar file (format --# prob Fun Double) flags: -abs set the name used for abstract syntax (with -old option) -cnc set the name used for concrete syntax (with -old option) @@ -41,6 +42,7 @@ i, import: i File -path use the (colon-separated) search path to find modules -optimize select an optimization to override file-defined flags -conversion select parsing method (values strict|nondet) + -probs read probabilities from file (format (--# prob) Fun Double) examples: i English.gf -- ordinary import of Concrete i -retain german/ParadigmsGer.gf -- import of Resource to test @@ -165,6 +167,7 @@ p, parse: p String options for batch input: -lines parse each line of input separately, ignoring empty lines -all as -lines, but also parse empty lines + -prob rank results by probability options for selecting parsing method: (default)parse using an overgenerating CFG -cfg parse using a much less overgenerating CFG @@ -241,6 +244,9 @@ gr, generate_random: gr Tree? Generates a random Tree of a given category. If a Tree argument is given, the command completes the Tree with values to the metavariables in the tree. + options: + -prob use probabilities (works for nondep types only) + -cf use a very fast method (works for nondep types only) flags: -cat generate in this category -lang use the abstract syntax of this grammar @@ -537,6 +543,7 @@ q, quit: q *-printer=xml XML: DTD for the pg command, object for st -printer=old old GF: file readable by GF 1.2 -printer=stat show some statistics of generated GFC + -printer=probs show probabilities of all functions -printer=gsl Nuance GSL speech recognition grammar -printer=jsgf Java Speech Grammar Format -printer=slf a finite automaton in the HTK SLF format