From 928c84b36cf68b9d590ad2d8cba9e69e3cd3d2d2 Mon Sep 17 00:00:00 2001 From: bringert Date: Tue, 1 Nov 2005 19:09:04 +0000 Subject: [PATCH] Documented SRGS XML generation. Started working on support for probabilities in SRG generation. Added support for probabilities in for SRGS. --- src/GF/Data/Operations.hs | 15 ++++++------ src/GF/Shell/HelpFile.hs | 7 +++--- src/GF/Speech/PrGSL.hs | 16 +++++++------ src/GF/Speech/PrJSGF.hs | 19 +++++++++------- src/GF/Speech/PrSRGS.hs | 34 +++++++++++++++------------ src/GF/Speech/SRG.hs | 43 +++++++++++++++++++++++++++-------- src/GF/Speech/TransformCFG.hs | 16 +++++++++---- src/GF/UseGrammar/Custom.hs | 16 ++++++++----- src/HelpFile | 1 + 9 files changed, 108 insertions(+), 59 deletions(-) diff --git a/src/GF/Data/Operations.hs b/src/GF/Data/Operations.hs index c297bc55a..ce8f90a03 100644 --- a/src/GF/Data/Operations.hs +++ b/src/GF/Data/Operations.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/05/30 18:39:44 $ --- > CVS $Author: aarne $ --- > CVS $Revision: 1.20 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.21 $ -- -- some auxiliary GF operations. AR 19\/6\/1998 -- 6\/2\/2001 -- @@ -100,6 +100,7 @@ data Err a = Ok a | Bad String instance Monad Err where return = Ok + fail = Bad Ok a >>= f = f a Bad s >>= f = Bad s @@ -302,19 +303,19 @@ isInBinTree :: (Ord a) => a -> BinTree a b -> Bool isInBinTree x = err (const False) (const True) . justLookupTree x -- isInBinTree = elemFM -justLookupTree :: (Ord a) => a -> BinTree a b -> Err b +justLookupTree :: (Monad m,Ord a) => a -> BinTree a b -> m b justLookupTree = lookupTree (const []) -lookupTree :: (Ord a) => (a -> String) -> a -> BinTree a b -> Err b +lookupTree :: (Monad m,Ord a) => (a -> String) -> a -> BinTree a b -> m b lookupTree pr x tree = case tree of - NT -> Bad ("no occurrence of element" +++ pr x) + NT -> fail ("no occurrence of element" +++ pr x) BT (a,b) left right | x < a -> lookupTree pr x left | x > a -> lookupTree pr x right | x == a -> return b --lookupTree pr x tree = case lookupFM tree x of -- Just y -> return y --- _ -> Bad ("no occurrence of element" +++ pr x) +-- _ -> fail ("no occurrence of element" +++ pr x) lookupTreeMany :: Ord a => (a -> String) -> [BinTree a b] -> a -> Err b lookupTreeMany pr (t:ts) x = case lookupTree pr x t of diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index e2216ce64..7daac5816 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 19:02:35 $ --- > CVS $Author: aarne $ --- > CVS $Revision: 1.18 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.19 $ -- -- Help on shell commands. Generated from HelpFile by 'make help'. -- PLEASE DON'T EDIT THIS FILE. @@ -575,6 +575,7 @@ txtHelpFile = "\n -printer=probs show probabilities of all functions" ++ "\n -printer=gsl Nuance GSL speech recognition grammar" ++ "\n -printer=jsgf Java Speech Grammar Format" ++ + "\n -printer=srgs_xml SRGS XML format" ++ "\n -printer=slf a finite automaton in the HTK SLF format" ++ "\n -printer=slf_graphviz the same automaton as in SLF, but in Graphviz format" ++ "\n -printer=fa_graphviz a finite automaton with labelled edges" ++ diff --git a/src/GF/Speech/PrGSL.hs b/src/GF/Speech/PrGSL.hs index 4f245a328..34e2620bb 100644 --- a/src/GF/Speech/PrGSL.hs +++ b/src/GF/Speech/PrGSL.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/09/14 15:17:29 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ -- > CVS $Author: bringert $ --- > CVS $Revision: 1.21 $ +-- > CVS $Revision: 1.22 $ -- -- This module prints a CFG as a Nuance GSL 2.0 grammar. -- @@ -26,13 +26,14 @@ import GF.Formalism.Utilities (Symbol(..)) import GF.Conversion.Types import GF.Infra.Print import GF.Infra.Option +import GF.Probabilistic.Probabilistic (Probs) import Data.Char (toUpper,toLower) gslPrinter :: Ident -- ^ Grammar name - -> Options -> CGrammar -> String -gslPrinter name opts cfg = prGSL srg "" - where srg = makeSRG name opts cfg + -> Options -> Maybe Probs -> CGrammar -> String +gslPrinter name opts probs cfg = prGSL srg "" + where srg = makeSRG name opts probs cfg prGSL :: SRG -> ShowS prGSL (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) @@ -46,8 +47,9 @@ prGSL (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) prRule (SRGRule cat origCat rhs) = showString "; " . prtS origCat . nl . prCat cat . sp . wrap "[" (unwordsS (map prAlt rhs)) "]" . nl - prAlt rhs = wrap "(" (unwordsS (map prSymbol rhs')) ")" - where rhs' = rmPunct rhs + -- FIXME: use the probability + prAlt (SRGAlt mp rhs) = wrap "(" (unwordsS (map prSymbol rhs')) ")" + where rhs' = rmPunct rhs prSymbol (Cat c) = prCat c prSymbol (Tok t) = wrap "\"" (showString (showToken t)) "\"" -- GSL requires an upper case letter in category names diff --git a/src/GF/Speech/PrJSGF.hs b/src/GF/Speech/PrJSGF.hs index 5d0b0a211..0eab36828 100644 --- a/src/GF/Speech/PrJSGF.hs +++ b/src/GF/Speech/PrJSGF.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/09/14 15:17:29 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ -- > CVS $Author: bringert $ --- > CVS $Revision: 1.15 $ +-- > CVS $Revision: 1.16 $ -- -- This module prints a CFG as a JSGF grammar. -- @@ -26,12 +26,13 @@ import GF.Formalism.Utilities (Symbol(..)) import GF.Infra.Ident import GF.Infra.Print import GF.Infra.Option +import GF.Probabilistic.Probabilistic (Probs) import GF.Speech.SRG jsgfPrinter :: Ident -- ^ Grammar name - -> Options -> CGrammar -> String -jsgfPrinter name opts cfg = prJSGF srg "" - where srg = makeSRG name opts cfg + -> Options -> Maybe Probs -> CGrammar -> String +jsgfPrinter name opts probs cfg = prJSGF srg "" + where srg = makeSRG name opts probs cfg prJSGF :: SRG -> ShowS prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) @@ -47,9 +48,11 @@ prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) prRule (SRGRule cat origCat rhs) = comments [origCat] . nl . prCat cat . showString " = " . joinS " | " (map prAlt rhs) . nl - prAlt rhs | null rhs' = showString "" - | otherwise = wrap "(" (unwordsS (map prSymbol rhs')) ")" - where rhs' = rmPunct rhs + -- FIXME: use the probability + prAlt (SRGAlt mp rhs) + | null rhs' = showString "" + | otherwise = wrap "(" (unwordsS (map prSymbol rhs')) ")" + where rhs' = rmPunct rhs prSymbol (Cat c) = prCat c prSymbol (Tok t) = wrap "\"" (prtS t) "\"" prCat c = showChar '<' . showString c . showChar '>' diff --git a/src/GF/Speech/PrSRGS.hs b/src/GF/Speech/PrSRGS.hs index dc4e1c126..3604f34cd 100644 --- a/src/GF/Speech/PrSRGS.hs +++ b/src/GF/Speech/PrSRGS.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 16:48:08 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ -- > CVS $Author: bringert $ --- > CVS $Revision: 1.1 $ +-- > CVS $Revision: 1.2 $ -- -- This module prints a CFG as an SRGS XML grammar. -- @@ -26,6 +26,7 @@ import GF.Formalism.Utilities (Symbol(..)) import GF.Conversion.Types import GF.Infra.Print import GF.Infra.Option +import GF.Probabilistic.Probabilistic (Probs) import Data.Char (toUpper,toLower) @@ -35,9 +36,11 @@ data XML = Data String | Tag String [Attr] [XML] | Comment String type Attr = (String,String) srgsXmlPrinter :: Ident -- ^ Grammar name - -> Options -> CGrammar -> String -srgsXmlPrinter name opts cfg = prSrgsXml srg "" - where srg = makeSRG name opts cfg + -> Options + -> Maybe Probs + -> CGrammar -> String +srgsXmlPrinter name opts probs cfg = prSrgsXml srg "" + where srg = makeSRG name opts probs cfg prSrgsXml :: SRG -> ShowS prSrgsXml (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) @@ -50,13 +53,12 @@ prSrgsXml (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) "Generated by GF", "Original start category: " ++ origStart] ++ map ruleToXML rs) - ruleToXML (SRGRule cat origCat rhss) = - rule (prCat cat) (comments ["Category " ++ origCat] ++ [prRhs rhss]) - prRhs rhss = oneOf (map item (map prAlt rhss)) - -- FIXME: don't use one-of if there is only one - prAlt rhs = map prSymbol rhs + ruleToXML (SRGRule cat origCat alts) = + rule (prCat cat) (comments ["Category " ++ origCat] ++ [prRhs alts]) + prRhs rhss = oneOf (map prAlt rhss) + prAlt (SRGAlt p rhs) = item p (map prSymbol rhs) prSymbol (Cat c) = Tag "ruleref" [("uri","#" ++ prCat c)] [] - prSymbol (Tok t) = item [Data (showToken t)] + prSymbol (Tok t) = item Nothing [Data (showToken t)] prCat c = c -- FIXME: escape something? showToken t = t -- FIXME: escape something? @@ -64,9 +66,13 @@ rule :: String -- ^ id -> [XML] -> XML rule i = Tag "rule" [("id",i)] -item :: [XML] -> XML -item [x@(Tag "item" _ _)] = x -item xs = Tag "item" [] xs +item :: Maybe Double -> [XML] -> XML +-- FIXME: what is the weight called? +item mp xs = Tag "item" as cs + where as = maybe [] (\p -> [("weight", show p)]) mp + cs = case xs of + [Tag "item" [] xs'] -> xs' + _ -> xs oneOf :: [XML] -> XML oneOf [x] = x diff --git a/src/GF/Speech/SRG.hs b/src/GF/Speech/SRG.hs index 11d326fff..51c01df01 100644 --- a/src/GF/Speech/SRG.hs +++ b/src/GF/Speech/SRG.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/26 17:14:03 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ -- > CVS $Author: bringert $ --- > CVS $Revision: 1.19 $ +-- > CVS $Revision: 1.20 $ -- -- Representation of, conversion to, and utilities for -- printing of a general Speech Recognition Grammar. @@ -20,6 +20,7 @@ module GF.Speech.SRG where +import GF.Data.Operations import GF.Data.Utilities import GF.Infra.Ident import GF.Formalism.CFG @@ -28,6 +29,7 @@ import GF.Conversion.Types import GF.Infra.Print import GF.Speech.TransformCFG import GF.Infra.Option +import GF.Probabilistic.Probabilistic (Probs) import Data.List import Data.Maybe (fromMaybe) @@ -38,9 +40,15 @@ data SRG = SRG { grammarName :: String -- ^ grammar name , origStartCat :: String -- ^ original start category name , rules :: [SRGRule] } + deriving (Eq,Show) + data SRGRule = SRGRule String String [SRGAlt] -- ^ SRG category name, original category name -- and productions -type SRGAlt = [Symbol String Token] + deriving (Eq,Show) + +-- | maybe a probability, and a list of symbols +data SRGAlt = SRGAlt (Maybe Double) [Symbol String Token] + deriving (Eq,Show) -- | SRG category name and original name type CatName = (String,String) @@ -49,27 +57,42 @@ type CatNames = FiniteMap String String makeSRG :: Ident -- ^ Grammar name -> Options -- ^ Grammar options + -> Maybe Probs -- ^ Probabilities -> CGrammar -- ^ A context-free grammar -> SRG -makeSRG i opts gr = SRG { grammarName = name, - startCat = lookupFM_ names origStart, - origStartCat = origStart, - rules = map (cfgRulesToSRGRule names) cfgRules } +makeSRG i opts probs gr + = SRG { grammarName = name, + startCat = lookupFM_ names origStart, + origStartCat = origStart, + rules = rs } where name = prIdent i origStart = getStartCat opts gr' = removeLeftRecursion $ removeIdenticalRules $ removeEmptyCats $ cfgToCFRules gr (cats,cfgRules) = unzip gr' names = mkCatNames name cats + rs = map (cfgRulesToSRGRule names probs) cfgRules -cfgRulesToSRGRule :: FiniteMap String String -> [CFRule_] -> SRGRule -cfgRulesToSRGRule names rs@(r:_) = SRGRule cat origCat rhs + +-- FIXME: probabilities get larger than 1.0 when new rules are +-- introduced +-- FIXME: merge alternatives with same rhs but different probabilities +cfgRulesToSRGRule :: FiniteMap String String -> Maybe Probs -> [CFRule_] -> SRGRule +cfgRulesToSRGRule names probs rs@(r:_) = SRGRule cat origCat rhs where origCat = lhsCat r cat = lookupFM_ names origCat - rhs = nub $ map (map renameCat . ruleRhs) rs + rhs = nub $ map ruleToAlt rs + ruleToAlt r = SRGAlt (ruleProb probs r) (map renameCat (ruleRhs r)) renameCat (Cat c) = Cat (lookupFM_ names c) renameCat t = t +ruleProb :: Maybe Probs -> CFRule_ -> Maybe Double +ruleProb mp r = mp >>= \probs -> lookupProb probs (ruleFun r) + +-- FIXME: move to GF.Probabilistic.Probabilistic? +lookupProb :: Probs -> Ident -> Maybe Double +lookupProb probs i = lookupTree prIdent i probs + mkCatNames :: String -- ^ Category name prefix -> [String] -- ^ Original category names -> FiniteMap String String -- ^ Maps original names to SRG names diff --git a/src/GF/Speech/TransformCFG.hs b/src/GF/Speech/TransformCFG.hs index d12d06628..39060206b 100644 --- a/src/GF/Speech/TransformCFG.hs +++ b/src/GF/Speech/TransformCFG.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/26 18:47:16 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ -- > CVS $Author: bringert $ --- > CVS $Revision: 1.23 $ +-- > CVS $Revision: 1.24 $ -- -- This module does some useful transformations on CFGs. -- @@ -26,7 +26,8 @@ module GF.Speech.TransformCFG {- (CFRule_, CFRules, import GF.Conversion.Types import GF.Data.Utilities import GF.Formalism.CFG -import GF.Formalism.Utilities (Symbol(..), mapSymbol, filterCats, symbol, NameProfile(..)) +import GF.Formalism.Utilities (Symbol(..), mapSymbol, filterCats, symbol, + NameProfile(..), name2fun) import GF.Infra.Ident import GF.Infra.Option import GF.Infra.Print @@ -75,6 +76,7 @@ removeEmptyCats = fix removeEmptyCats' k' = map (\ (c,xs) -> (c, filter (not . anyUsedBy emptyCats) xs)) keep -- | Remove rules which are identical, not caring about the rule names. +-- FIXME: this messes up probabilities removeIdenticalRules :: CFRules -> CFRules removeIdenticalRules g = [(c,sortNubBy compareCatAndRhs rs) | (c,rs) <- g] where compareCatAndRhs (CFRule c1 ss1 _) (CFRule c2 ss2 _) = @@ -87,7 +89,11 @@ removeLeftRecursion rs = concatMap removeDirectLeftRecursion $ map handleProds r where handleProds (c, r) = (c, concatMap handleProd r) handleProd (CFRule ai (Cat aj:alpha) n) | aj < ai = - -- FIXME: this will give multiple rules with the same name + -- FIXME: for non-recursive categories, this changes + -- the grammar unneccessarily, maybe we can use mutRecCats + -- to make this less invasive + -- FIXME: this will give multiple rules with the same name, + -- which may mess up the probabilities. [CFRule ai (beta ++ alpha) n | CFRule _ beta _ <- lookup' aj rs] handleProd r = [r] @@ -124,6 +130,8 @@ lhsCat (CFRule c _ _) = c ruleRhs :: CFRule c n t -> [Symbol c t] ruleRhs (CFRule _ ss _) = ss +ruleFun :: CFRule_ -> Fun +ruleFun (CFRule _ _ n) = name2fun n -- | Checks if a symbol is a non-terminal of one of the given categories. catElem :: Eq c => Symbol c t -> [c] -> Bool diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 67ed388f8..c13b5c91c 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -5,9 +5,9 @@ -- Stability : (stable) -- Portability : (portable) -- --- > CVS $Date: 2005/10/31 19:02:35 $ --- > CVS $Author: aarne $ --- > CVS $Revision: 1.80 $ +-- > CVS $Date: 2005/11/01 20:09:04 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.81 $ -- -- A database for customizable GF shell commands. -- @@ -240,13 +240,17 @@ customGrammarPrinter = ,(strCI "srg", prSRG . stateCF) ,(strCI "gsl", \s -> let opts = stateOptions s name = cncId s - in gslPrinter name opts $ stateCFG s) + in gslPrinter name opts Nothing $ stateCFG s) ,(strCI "jsgf", \s -> let opts = stateOptions s name = cncId s - in jsgfPrinter name opts $ stateCFG s) + in jsgfPrinter name opts Nothing $ stateCFG s) ,(strCI "srgs_xml", \s -> let opts = stateOptions s name = cncId s - in srgsXmlPrinter name opts $ stateCFG s) + in srgsXmlPrinter name opts Nothing $ stateCFG s) + ,(strCI "srgs_xml_prob", \s -> let opts = stateOptions s + name = cncId s + probs = stateProbs s + in srgsXmlPrinter name opts (Just probs) $ stateCFG s) ,(strCI "slf", \s -> let opts = stateOptions s name = cncId s in slfPrinter name opts $ stateCFG s) diff --git a/src/HelpFile b/src/HelpFile index 573191204..68f23aee1 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -546,6 +546,7 @@ q, quit: q -printer=probs show probabilities of all functions -printer=gsl Nuance GSL speech recognition grammar -printer=jsgf Java Speech Grammar Format + -printer=srgs_xml SRGS XML format -printer=slf a finite automaton in the HTK SLF format -printer=slf_graphviz the same automaton as in SLF, but in Graphviz format -printer=fa_graphviz a finite automaton with labelled edges