First version of SRGS with semantic tags.

This commit is contained in:
bringert
2006-02-01 16:23:14 +00:00
parent 992e212bcc
commit fd0dfd7d4d
9 changed files with 124 additions and 51 deletions

View File

@@ -73,7 +73,12 @@ sortNub = map head . group . sort
-- | Like 'nubBy', but more efficient as it uses sorting internally.
sortNubBy :: (a -> a -> Ordering) -> [a] -> [a]
sortNubBy f = map head . groupBy (compareEq f) . sortBy f
sortNubBy f = map head . sortGroupBy f
-- | Sorts and then groups elements given and ordering of the
-- elements.
sortGroupBy :: (a -> a -> Ordering) -> [a] -> [[a]]
sortGroupBy f = groupBy (compareEq f) . sortBy f
-- | Take the union of a list of lists.
unionAll :: Eq a => [[a]] -> [a]

View File

@@ -598,10 +598,14 @@ txtHelpFile =
"\n -printer=jsgf Java Speech Grammar Format" ++
"\n -printer=srgs_xml SRGS XML format" ++
"\n -printer=srgs_xml_prob SRGS XML format, with weights" ++
"\n -printer=srgs_xml_ms_sem SRGS XML format, with semantic tags for the" ++
"\n Microsoft Speech API." ++
"\n -printer=slf a finite automaton in the HTK SLF format" ++
"\n -printer=slf_graphviz the same automaton as slf, but in Graphviz format" ++
"\n -printer=slf_sub a finite automaton with sub-automata in the HTK SLF format" ++
"\n -printer=slf_sub_graphviz the same automaton as slf_sub, but in Graphviz format" ++
"\n -printer=slf_sub a finite automaton with sub-automata in the " ++
"\n HTK SLF format" ++
"\n -printer=slf_sub_graphviz the same automaton as slf_sub, but in " ++
"\n Graphviz format" ++
"\n -printer=fa_graphviz a finite automaton with labelled edges" ++
"\n -printer=regular a regular grammar in a simple BNF" ++
"\n -printer=unpar a gfc grammar with parameters eliminated" ++

View File

@@ -33,7 +33,7 @@ import Data.Char (toUpper,toLower)
gslPrinter :: Ident -- ^ Grammar name
-> Options -> Maybe Probs -> CGrammar -> String
gslPrinter name opts probs cfg = prGSL srg ""
where srg = makeSRG name opts probs cfg
where srg = makeSimpleSRG name opts probs cfg
prGSL :: SRG -> ShowS
prGSL (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs})
@@ -48,7 +48,7 @@ prGSL (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs})
showString "; " . prtS origCat . nl
. prCat cat . sp . wrap "[" (unwordsS (map prAlt rhs)) "]" . nl
-- FIXME: use the probability
prAlt (SRGAlt mp rhs) = wrap "(" (unwordsS (map prSymbol rhs')) ")"
prAlt (SRGAlt mp _ rhs) = wrap "(" (unwordsS (map prSymbol rhs')) ")"
where rhs' = rmPunct rhs
prSymbol (Cat c) = prCat c
prSymbol (Tok t) = wrap "\"" (showString (showToken t)) "\""

View File

@@ -32,7 +32,7 @@ import GF.Speech.SRG
jsgfPrinter :: Ident -- ^ Grammar name
-> Options -> Maybe Probs -> CGrammar -> String
jsgfPrinter name opts probs cfg = prJSGF srg ""
where srg = makeSRG name opts probs cfg
where srg = makeSimpleSRG name opts probs cfg
prJSGF :: SRG -> ShowS
prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs})
@@ -49,7 +49,7 @@ prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs})
comments [origCat] . nl
. prCat cat . showString " = " . joinS " | " (map prAlt rhs) . nl
-- FIXME: use the probability
prAlt (SRGAlt mp rhs)
prAlt (SRGAlt mp _ rhs)
| null rhs' = showString "<NULL>"
| otherwise = wrap "(" (unwordsS (map prSymbol rhs')) ")"
where rhs' = rmPunct rhs

View File

@@ -22,7 +22,7 @@ import GF.Speech.SRG
import GF.Infra.Ident
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..))
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..), forestName)
import GF.Conversion.Types
import GF.Infra.Print
import GF.Infra.Option
@@ -37,43 +37,74 @@ type Attr = (String,String)
srgsXmlPrinter :: Ident -- ^ Grammar name
-> Options
-> Bool -- ^ Whether to include semantic interpretation
-> Maybe Probs
-> CGrammar -> String
srgsXmlPrinter name opts probs cfg = prSrgsXml srg ""
srgsXmlPrinter name opts sisr probs cfg = prSrgsXml sisr srg ""
where srg = makeSRG name opts probs cfg
prSrgsXml :: SRG -> ShowS
prSrgsXml (SRG{grammarName=name,startCat=start,
prSrgsXml :: Bool -> SRG -> ShowS
prSrgsXml sisr (SRG{grammarName=name,startCat=start,
origStartCat=origStart,grammarLanguage=l,rules=rs})
= header . showsXML xmlGr
where
header = showString "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
root = prCat start
xmlGr = grammar root l (comments
["SRGS XML speech recognition grammar for " ++ name,
"Generated by GF",
"Original start category: " ++ origStart]
xmlGr = grammar root l ([meta "description"
("SRGS XML speech recognition grammar for " ++ name
++ ". " ++ "Original start category: " ++ origStart),
meta "generator" "GF"]
++ map ruleToXML rs)
ruleToXML (SRGRule cat origCat alts) =
rule (prCat cat) (comments ["Category " ++ origCat] ++ [prRhs alts])
prRhs rhss = oneOf (map prAlt rhss)
prAlt (SRGAlt p rhs) = item p (map prSymbol rhs)
prSymbol (Cat c) = Tag "ruleref" [("uri","#" ++ prCat c)] []
prSymbol (Tok t) = item Nothing [Data (showToken t)]
prCat c = c -- FIXME: escape something?
showToken t = t -- FIXME: escape something?
prAlt (SRGAlt p n@(Name _ pr) rhs)
| sisr = prodItem (Just n) p (map (uncurry (symItem pr)) (numberCats 0 rhs))
| otherwise = prodItem Nothing p (map (\s -> symItem [] s 0) rhs)
numberCats _ [] = []
numberCats n (s@(Cat _):ss) = (s,n):numberCats (n+1) ss
numberCats n (s:ss) = (s,n):numberCats n ss
rule :: String -- ^ id
-> [XML] -> XML
rule :: String -> [XML] -> XML
rule i = Tag "rule" [("id",i)]
item :: Maybe Double -> [XML] -> XML
-- FIXME: what is the weight called?
item mp xs = Tag "item" as cs
where as = maybe [] (\p -> [("weight", show p)]) mp
cs = case xs of
[Tag "item" [] xs'] -> xs'
_ -> xs
prodItem :: Maybe Name -> Maybe Double -> [XML] -> XML
prodItem n mp xs = Tag "item" w (t++cs)
where
w = maybe [] (\p -> [("weight", show p)]) mp
t = maybe [] prodTag n
cs = case xs of
[Tag "item" [] xs'] -> xs'
_ -> xs
prodTag :: Name -> [XML]
prodTag (Name f prs) = [Tag "tag" [] [Data (join "; " ts)]]
where
ts = ["$.name=" ++ showFun f] ++
["$.arg" ++ show n ++ "=" ++ argInit (prs!!n)
| n <- [0..length prs-1]]
argInit (Unify _) = metavar
argInit (Constant f) = maybe metavar showFun (forestName f)
showFun = show . prIdent
metavar = show "?"
symItem :: [Profile a] -> Symbol String Token -> Int -> XML
symItem prs (Cat c) x = Tag "item" [] ([Tag "ruleref" [("uri","#" ++ prCat c)] []]++t)
where
t = if null ts then [] else [Tag "tag" [] [Data (join "; " ts)]]
ts = ["$.arg" ++ show n ++ "=$$"
| n <- [0..length prs-1], inProfile x (prs!!n)]
symItem _ (Tok t) _ = Tag "item" [] [Data (showToken t)]
inProfile :: Int -> Profile a -> Bool
inProfile x (Unify xs) = x `elem` xs
inProfile _ (Constant _) = False
prCat :: String -> String
prCat c = c -- FIXME: escape something?
showToken :: Token -> String
showToken t = t -- FIXME: escape something?
oneOf :: [XML] -> XML
oneOf [x] = x
@@ -88,6 +119,9 @@ grammar root l = Tag "grammar" [("xml:lang", l),
("mode","voice"),
("root",root)]
meta :: String -> String -> XML
meta n c = Tag "meta" [("name",n),("content",c)] []
comments :: [String] -> [XML]
comments = map Comment

View File

@@ -18,13 +18,16 @@
-- FIXME: figure out name prefix from grammar name
-----------------------------------------------------------------------------
module GF.Speech.SRG where
module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..),
makeSimpleSRG, makeSRG
, lookupFM_, prtS) where
import GF.Data.Operations
import GF.Data.Utilities
import GF.Infra.Ident
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..))
import GF.Formalism.Utilities (Symbol(..), NameProfile(..)
, Profile, SyntaxForest)
import GF.Conversion.Types
import GF.Infra.Print
import GF.Speech.TransformCFG
@@ -48,8 +51,8 @@ data SRGRule = SRGRule String String [SRGAlt] -- ^ SRG category name, original c
-- and productions
deriving (Eq,Show)
-- | maybe a probability, and a list of symbols
data SRGAlt = SRGAlt (Maybe Double) [Symbol String Token]
-- | maybe a probability, a rule name and a list of symbols
data SRGAlt = SRGAlt (Maybe Double) Name [Symbol String Token]
deriving (Eq,Show)
-- | SRG category name and original name
@@ -57,12 +60,34 @@ type CatName = (String,String)
type CatNames = FiniteMap String String
-- | Create a non-left-recursive SRG.
-- FIXME: the probabilities, names and profiles in the returned
-- grammar may be meaningless.
makeSimpleSRG :: Ident -- ^ Grammar name
-> Options -- ^ Grammar options
-> Maybe Probs -- ^ Probabilities
-> CGrammar -- ^ A context-free grammar
-> SRG
makeSimpleSRG
= makeSRG_ (removeLeftRecursion . removeIdenticalRules . removeEmptyCats)
-- | Create a SRG preserving the names, profiles and probabilities of the
-- input grammar. The returned grammar may be left-recursive.
makeSRG :: Ident -- ^ Grammar name
-> Options -- ^ Grammar options
-> Maybe Probs -- ^ Probabilities
-> CGrammar -- ^ A context-free grammar
-> SRG
makeSRG i opts probs gr
makeSRG = makeSRG_ removeEmptyCats
makeSRG_ :: (CFRules -> CFRules) -- ^ Transformations to apply to the
-- CFG before converting to SRG
-> Ident -- ^ Grammar name
-> Options -- ^ Grammar options
-> Maybe Probs -- ^ Probabilities
-> CGrammar -- ^ A context-free grammar
-> SRG
makeSRG_ f i opts probs gr
= SRG { grammarName = name,
startCat = lookupFM_ names origStart,
origStartCat = origStart,
@@ -72,21 +97,19 @@ makeSRG i opts probs gr
name = prIdent i
origStart = getStartCat opts
l = fromMaybe "en_UK" (getOptVal opts speechLanguage)
gr' = removeLeftRecursion $ removeIdenticalRules $ removeEmptyCats $ cfgToCFRules gr
gr' = f (cfgToCFRules gr)
(cats,cfgRules) = unzip gr'
names = mkCatNames name cats
rs = map (cfgRulesToSRGRule names probs) cfgRules
-- FIXME: probabilities get larger than 1.0 when new rules are
-- introduced
-- FIXME: merge alternatives with same rhs but different probabilities
-- FIXME: merge alternatives with same rhs and profile but different probabilities
cfgRulesToSRGRule :: FiniteMap String String -> Maybe Probs -> [CFRule_] -> SRGRule
cfgRulesToSRGRule names probs rs@(r:_) = SRGRule cat origCat rhs
where origCat = lhsCat r
cat = lookupFM_ names origCat
rhs = nub $ map ruleToAlt rs
ruleToAlt r = SRGAlt (ruleProb probs r) (map renameCat (ruleRhs r))
ruleToAlt r@(CFRule c ss n)
= SRGAlt (ruleProb probs r) n (map renameCat ss)
renameCat (Cat c) = Cat (lookupFM_ names c)
renameCat t = t

View File

@@ -37,6 +37,7 @@ import Control.Monad
import Data.FiniteMap
import Data.List
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Set (Set)
import qualified Data.Set as Set
@@ -77,14 +78,13 @@ removeEmptyCats = fix removeEmptyCats'
emptyCats = filter (nothingOrNull . flip lookup rs) allCats
k' = map (\ (c,xs) -> (c, filter (not . anyUsedBy emptyCats) xs)) keep
-- | Remove rules which are identical, not caring about the rule names.
-- FIXME: this messes up probabilities
-- | Remove rules which have the same rhs.
-- FIXME: this messes up probabilities, names and profiles
removeIdenticalRules :: CFRules -> CFRules
removeIdenticalRules g = [(c,sortNubBy compareCatAndRhs rs) | (c,rs) <- g]
where compareCatAndRhs (CFRule c1 ss1 _) (CFRule c2 ss2 _) =
case c1 `compare` c2 of
EQ -> ss1 `compare` ss2
o -> o
removeIdenticalRules g = [(c,sortNubBy cmpRules rs) | (c,rs) <- g]
where
cmpRules (CFRule c1 ss1 _) (CFRule c2 ss2 _) =
mconcat [c1 `compare` c2, ss1 `compare` ss2]
removeLeftRecursion :: CFRules -> CFRules
removeLeftRecursion rs = concatMap removeDirectLeftRecursion $ map handleProds rs

View File

@@ -249,11 +249,14 @@ customGrammarPrinter =
in jsgfPrinter name opts Nothing $ stateCFG s)
,(strCI "srgs_xml", \s -> let opts = stateOptions s
name = cncId s
in srgsXmlPrinter name opts Nothing $ stateCFG s)
in srgsXmlPrinter name opts False Nothing $ stateCFG s)
,(strCI "srgs_xml_prob", \s -> let opts = stateOptions s
name = cncId s
probs = stateProbs s
in srgsXmlPrinter name opts (Just probs) $ stateCFG s)
in srgsXmlPrinter name opts False (Just probs) $ stateCFG s)
,(strCI "srgs_xml_ms_sem", \s -> let opts = stateOptions s
name = cncId s
in srgsXmlPrinter name opts True Nothing $ stateCFG s)
,(strCI "slf", \s -> let opts = stateOptions s
start = getStartCat opts
name = cncId s

View File

@@ -569,10 +569,14 @@ q, quit: q
-printer=jsgf Java Speech Grammar Format
-printer=srgs_xml SRGS XML format
-printer=srgs_xml_prob SRGS XML format, with weights
-printer=srgs_xml_ms_sem SRGS XML format, with semantic tags for the
Microsoft Speech API.
-printer=slf a finite automaton in the HTK SLF format
-printer=slf_graphviz the same automaton as slf, but in Graphviz format
-printer=slf_sub a finite automaton with sub-automata in the HTK SLF format
-printer=slf_sub_graphviz the same automaton as slf_sub, but in Graphviz format
-printer=slf_sub a finite automaton with sub-automata in the
HTK SLF format
-printer=slf_sub_graphviz the same automaton as slf_sub, but in
Graphviz format
-printer=fa_graphviz a finite automaton with labelled edges
-printer=regular a regular grammar in a simple BNF
-printer=unpar a gfc grammar with parameters eliminated