probabilities in ShellState

This commit is contained in:
aarne
2005-10-31 18:02:34 +00:00
parent 08eca90037
commit 20d4485bb6
9 changed files with 87 additions and 58 deletions

View File

@@ -12,6 +12,14 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
</center>
31/10 (AR) Probabilistic grammars. Probabilities can be used to
weight random generation (<tt>gr -prob</tt>) and to rank parse
results (<tt>p -prob</tt>). They are read from a separate file
(flag <tt>i -probs=File</tt>, format <tt>--# prob Fun Double</tt>)
or from the top-level grammar file itself (option <tt>i -prob</tt>).
<p>
12/10 (AR) Flag <tt>-atoms=Int</tt> to the command <tt>gt = generate_trees</tt>
takes away all zero-argument functions except Int per category. In
this way, it is possible to generate a corpus illustrating each

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:45:58 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.18 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.19 $
--
-- for reading grammars and terms from strings and files
-----------------------------------------------------------------------------
@@ -21,6 +21,7 @@ import GF.Compile.PGrammar
import GF.Grammar.TypeCheck
import GF.Compile.Compile
import GF.Compile.ShellState
import GF.Probabilistic.Probabilistic
import GF.Infra.Modules
import GF.Infra.ReadFiles (isOldFile)
@@ -50,7 +51,9 @@ string2annotTree gr m = annotate gr . string2absTerm (prt m) ---- prt
---string2paramList st = map (renameTrm (lookupConcrete st) . patt2term) . pPattList
shellStateFromFiles :: Options -> ShellState -> FilePath -> IOE ShellState
shellStateFromFiles opts st file = case fileSuffix file of
shellStateFromFiles opts st file = do
let top = identC $ justModuleName file
sh <- case fileSuffix file of
"gfcm" -> do
cenv <- compileOne opts (compileEnvShSt st []) file
ioeErr $ updateShellState opts Nothing st cenv
@@ -66,10 +69,14 @@ shellStateFromFiles opts st file = case fileSuffix file of
then addOptions (options []) opts' -- for old no emit
else addOptions (options [emitCode]) opts'
grts <- compileModule osb st file
let top = identC $ justModuleName file
mtop = if oElem showOld opts' then Nothing else Just top
let mtop = if oElem showOld opts' then Nothing else Just top
ioeErr $ updateShellState opts' mtop st grts
--- liftM (changeModTimes rts) $ grammar2shellState opts gr
if (isSetFlag opts probFile || oElem (iOpt "prob") opts)
then do
probs <- ioeIO $ getProbsFromFile opts file
let lang = maybe top id $ concrete sh --- to work with cf, too
ioeErr $ addProbs (lang,probs) sh
else return sh
getShellStateFromFiles :: Options -> FilePath -> IO ShellState
getShellStateFromFiles os =

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/30 23:44:00 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.49 $
-- > CVS $Revision: 1.50 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
@@ -462,6 +462,13 @@ abstractOfState = maybe emptyAbstractST id . maybeStateAbstract
stateIsWord :: StateGrammar -> String -> Bool
stateIsWord sg = isKnownWord (stateMorpho sg)
addProbs :: (Ident,Probs) -> ShellState -> Err ShellState
addProbs ip@(lang,probs)
sh@(ShSt x y cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s) = do
let gr = grammarOfLang sh lang
probs' <- checkGrammarProbs gr probs
let pbs' = (lang,probs') : filter ((/= lang) . fst) pbs
return (ShSt x y cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs' os rs acs s)
{-

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/31 08:12:18 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.2 $
-- > CVS $Revision: 1.3 $
--
-- Probabilistic abstract syntax. AR 30\/10\/2005
--
@@ -26,6 +26,7 @@ module GF.Probabilistic.Probabilistic (
,Probs -- = BinTree Ident Double
,getProbsFromFile -- :: Opts -> IO Probs
,emptyProbs -- :: Probs
,prProbs -- :: Probs -> String
) where
import GF.Canon.GFC
@@ -54,8 +55,10 @@ generateRandomTreesProb opts gen gr probs cat =
cat' = prt $ snd cat
-- | check that probabilities attached to a grammar make sense
checkGrammarProbs :: GFCGrammar -> Probs -> Err ()
checkGrammarProbs gr probs = err Bad (const (return ())) $ checkSGrammar $ gr2sgr gr probs
checkGrammarProbs :: GFCGrammar -> Probs -> Err Probs
checkGrammarProbs gr probs =
err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr gr probs where
gr2probs sgr = buildTree [(zIdent f,p) | (_,rs) <- tree2list sgr, ((p,f),_) <- rs]
-- | compute the probability of a given tree
computeProbTree :: Probs -> Tree -> Double
@@ -71,14 +74,14 @@ computeProbTree probs (Tr (N (_,at,_,_,_),ts)) = case at of
rankByScore :: Ord n => [(a,n)] -> [(a,n)]
rankByScore = sortBy (\ (_,p) (_,q) -> compare q p)
getProbsFromFile :: Options -> IO Probs
getProbsFromFile opts = do
s <- maybe (return "") readFile $ getOptVal opts probFile
getProbsFromFile :: Options -> FilePath -> IO Probs
getProbsFromFile opts file = do
s <- maybe (readFile file) readFile $ getOptVal opts probFile
return $ buildTree $ concatMap pProb $ lines s
where
pProb s = case words s of
"--":f:p:_ | isDouble p -> [(zIdent f, read p)]
f:p:_ | isDouble p -> [(zIdent f, read p)]
"--#":"prob":f:p:_ | isDouble p -> [(zIdent f, read p)]
f:p:_ | isDouble p -> [(zIdent f, read p)]
_ -> []
isDouble = all (flip elem ('.':['0'..'9']))
@@ -86,7 +89,11 @@ type Probs = BinTree Ident Double
emptyProbs :: Probs
emptyProbs = emptyBinTree
prProbs :: Probs -> String
prProbs = unlines . map pr . tree2list where
pr (f,p) = prt f ++ "\t" ++ show p
------------------------------------------
-- translate grammar to simpler form and generated trees back
@@ -151,21 +158,14 @@ genTree :: [Double] -> SGrammar -> SCat -> (STree,Int)
genTree rs gr = gett rs where
gett ds "String" = (SString "foo",1)
gett ds "Int" = (SInt 1978,1)
gett ds cat = let
gett ds cat = case look cat of
[] -> (SMeta cat,1) -- if no productions, return ?
fs -> let
d:ds2 = ds
(pf,args) = getf d cat
(pf,args) = getf d fs
(ts,k) = getts ds2 args
in (SApp (pf,ts), k+1)
getf d cat =
let
regs0 = [(p,(pf,args)) | (pf@(p,_),(args,_)) <- look cat]
{- not needed
pstd = 1.0 / genericLength regs
regs = if any (>1.0) (map fst regs0)
then [(pstd,pa) | (_,pa) <- regs0]
else regs0
-}
in hitRegion d regs0
getf d fs = hitRegion d [(p,(pf,args)) | (pf@(p,_),(args,_)) <- fs]
getts ds cats = case cats of
c:cs -> let
(t, k) = gett ds c

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/31 08:12:18 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.48 $
-- > CVS $Revision: 1.49 $
--
-- GF shell command interpreter.
-----------------------------------------------------------------------------
@@ -222,8 +222,8 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
let p = optParseArgErrMsg opts gro x
case p of
Ok (ts,msg)
| isSetFlag opts probFile -> do
probs <- getProbsFromFile opts
| oElem (iOpt "prob") opts -> do
let probs = stateProbs gro
let tps = rankByScore [(t,computeProbTree probs t) | t <- ts]
putStrLnFlush msg
mapM_ putStrLnFlush [show p | (t,p) <- tps]
@@ -235,17 +235,8 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
let a' = opST2CommandArg (optParseArgErr opts (sgr il)) a
returnArg (opTS2CommandArg (optLinearizeTreeVal opts (sgr ol)) a') sa
CGenerateRandom | isSetFlag opts probFile -> do
probs <- getProbsFromFile opts
let cat = firstAbsCat opts gro
let n = optIntOrN opts flagNumber 1
gen <- newStdGen
let ts = take n $ generateRandomTreesProb opts gen cgr probs cat
returnArg (ATrms (map (term2tree gro) ts)) sa
CGenerateRandom | oElem showCF opts -> do
let probs = emptyProbs ---
CGenerateRandom | oElem showCF opts || oElem (iOpt "prob") opts -> do
let probs = stateProbs gro
let cat = firstAbsCat opts gro
let n = optIntOrN opts flagNumber 1
gen <- newStdGen

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/12 12:38:30 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.17 $
-- > CVS $Revision: 1.18 $
--
-- Help on shell commands. Generated from HelpFile by 'make help'.
-- PLEASE DON'T EDIT THIS FILE.
@@ -63,6 +63,7 @@ txtHelpFile =
"\n -noemit do not emit code (default with old grammar format)" ++
"\n -o do emit code (default with new grammar format)" ++
"\n -ex preprocess .gfe files if needed" ++
"\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++
"\n flags:" ++
"\n -abs set the name used for abstract syntax (with -old option)" ++
"\n -cnc set the name used for concrete syntax (with -old option)" ++
@@ -70,6 +71,7 @@ txtHelpFile =
"\n -path use the (colon-separated) search path to find modules" ++
"\n -optimize select an optimization to override file-defined flags" ++
"\n -conversion select parsing method (values strict|nondet)" ++
"\n -probs read probabilities from file (format (--# prob) Fun Double)" ++
"\n examples:" ++
"\n i English.gf -- ordinary import of Concrete" ++
"\n i -retain german/ParadigmsGer.gf -- import of Resource to test" ++
@@ -194,6 +196,7 @@ txtHelpFile =
"\n options for batch input:" ++
"\n -lines parse each line of input separately, ignoring empty lines" ++
"\n -all as -lines, but also parse empty lines" ++
"\n -prob rank results by probability" ++
"\n options for selecting parsing method:" ++
"\n (default)parse using an overgenerating CFG" ++
"\n -cfg parse using a much less overgenerating CFG" ++
@@ -270,6 +273,9 @@ txtHelpFile =
"\n Generates a random Tree of a given category. If a Tree" ++
"\n argument is given, the command completes the Tree with values to" ++
"\n the metavariables in the tree. " ++
"\n options:" ++
"\n -prob use probabilities (works for nondep types only)" ++
"\n -cf use a very fast method (works for nondep types only)" ++
"\n flags:" ++
"\n -cat generate in this category" ++
"\n -lang use the abstract syntax of this grammar" ++
@@ -566,6 +572,7 @@ txtHelpFile =
"\n *-printer=xml XML: DTD for the pg command, object for st" ++
"\n -printer=old old GF: file readable by GF 1.2" ++
"\n -printer=stat show some statistics of generated GFC" ++
"\n -printer=probs show probabilities of all functions" ++
"\n -printer=gsl Nuance GSL speech recognition grammar" ++
"\n -printer=jsgf Java Speech Grammar Format" ++
"\n -printer=slf a finite automaton in the HTK SLF format" ++

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/31 08:12:18 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.44 $
-- > CVS $Revision: 1.45 $
--
-- The datatype of shell commands and the list of their options.
-----------------------------------------------------------------------------
@@ -165,18 +165,18 @@ optionsOfCommand co = case co of
CSetFlag -> both "utf8 table struct record all multi"
"cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer"
CImport _ -> both "old v s src retain nocf nocheckcirc cflexer noemit o ex"
"abs cnc res path optimize conversion cat"
CImport _ -> both "old v s src retain nocf nocheckcirc cflexer noemit o ex prob"
"abs cnc res path optimize conversion cat probs"
CRemoveLanguage _ -> none
CEmptyState -> none
CStripState -> none
CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
CParse -> both "new newer cfg mcfg n ign raw v lines all"
"cat lang lexer parser number rawtrees probs"
CParse -> both "new newer cfg mcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> flags "cat lang number depth probs"
CGenerateRandom -> both "cf prob" "cat lang number depth"
CGenerateTrees -> both "metas" "atoms depth alts cat lang number"
CPutTerm -> flags "transform number"
CWrapTerm _ -> opts "c"

View File

@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/10/31 16:48:10 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.79 $
-- > CVS $Date: 2005/10/31 19:02:35 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.80 $
--
-- A database for customizable GF shell commands.
--
@@ -68,6 +68,7 @@ import GF.UseGrammar.Information
import GF.API.GrammarToHaskell
-----import GrammarToCanon (showCanon, showCanonOpt)
-----import qualified GrammarToGFC as GFC
import GF.Probabilistic.Probabilistic (prProbs)
-- the cf parsing algorithms
import GF.CF.ChartParser -- OBSOLETE
@@ -266,6 +267,7 @@ customGrammarPrinter =
,(strCI "words", unwords . stateGrammarWords)
,(strCI "printnames", C.prPrintnamesGrammar . stateGrammarST)
,(strCI "stat", prStatistics . stateGrammarST)
,(strCI "probs", prProbs . stateProbs)
,(strCI "unpar", prCanon . unparametrizeCanon . stateGrammarST)
,(strCI "subs", prSubtermStat . stateGrammarST)

View File

@@ -34,6 +34,7 @@ i, import: i File
-noemit do not emit code (default with old grammar format)
-o do emit code (default with new grammar format)
-ex preprocess .gfe files if needed
-prob read probabilities from top grammar file (format --# prob Fun Double)
flags:
-abs set the name used for abstract syntax (with -old option)
-cnc set the name used for concrete syntax (with -old option)
@@ -41,6 +42,7 @@ i, import: i File
-path use the (colon-separated) search path to find modules
-optimize select an optimization to override file-defined flags
-conversion select parsing method (values strict|nondet)
-probs read probabilities from file (format (--# prob) Fun Double)
examples:
i English.gf -- ordinary import of Concrete
i -retain german/ParadigmsGer.gf -- import of Resource to test
@@ -165,6 +167,7 @@ p, parse: p String
options for batch input:
-lines parse each line of input separately, ignoring empty lines
-all as -lines, but also parse empty lines
-prob rank results by probability
options for selecting parsing method:
(default)parse using an overgenerating CFG
-cfg parse using a much less overgenerating CFG
@@ -241,6 +244,9 @@ gr, generate_random: gr Tree?
Generates a random Tree of a given category. If a Tree
argument is given, the command completes the Tree with values to
the metavariables in the tree.
options:
-prob use probabilities (works for nondep types only)
-cf use a very fast method (works for nondep types only)
flags:
-cat generate in this category
-lang use the abstract syntax of this grammar
@@ -537,6 +543,7 @@ q, quit: q
*-printer=xml XML: DTD for the pg command, object for st
-printer=old old GF: file readable by GF 1.2
-printer=stat show some statistics of generated GFC
-printer=probs show probabilities of all functions
-printer=gsl Nuance GSL speech recognition grammar
-printer=jsgf Java Speech Grammar Format
-printer=slf a finite automaton in the HTK SLF format