From 160c6354c05d92c764ddd23fd27006b238ca2625 Mon Sep 17 00:00:00 2001 From: bringert Date: Fri, 15 Dec 2006 16:09:58 +0000 Subject: [PATCH] Towards smaller SRGs when lots of variants are used. --- src/GF/Speech/FiniteState.hs | 35 +++++++++++++++ src/GF/Speech/Graph.hs | 14 +++++- src/GF/Speech/PrJSGF.hs | 32 ++++++++------ src/GF/Speech/PrRegExp.hs | 21 +++++++++ src/GF/Speech/PrSRGS.hs | 71 ++++-------------------------- src/GF/Speech/RegExp.hs | 84 ++++++++++++++++++++++++++++++++++++ src/GF/Speech/SRG.hs | 23 +++++++--- src/GF/UseGrammar/Custom.hs | 4 ++ 8 files changed, 201 insertions(+), 83 deletions(-) create mode 100644 src/GF/Speech/PrRegExp.hs create mode 100644 src/GF/Speech/RegExp.hs diff --git a/src/GF/Speech/FiniteState.hs b/src/GF/Speech/FiniteState.hs index 832fb81d4..7054784c8 100644 --- a/src/GF/Speech/FiniteState.hs +++ b/src/GF/Speech/FiniteState.hs @@ -14,11 +14,17 @@ module GF.Speech.FiniteState (FA(..), State, NFA, DFA, startState, finalStates, states, transitions, + isInternal, newFA, addFinalState, newState, newStates, newTransition, newTransitions, + insertTransitionWith, insertTransitionsWith, mapStates, mapTransitions, + modifyTransitions, + nonLoopTransitionsTo, nonLoopTransitionsFrom, + loops, + removeState, oneFinalState, insertNFA, onGraph, @@ -41,6 +47,7 @@ import qualified GF.Visualization.Graphviz as Dot type State = Int +-- | Type parameters: node id type, state label type, edge label type data FA n a b = FA !(Graph n a b) !n ![n] type NFA a = FA State () (Maybe a) @@ -82,18 +89,46 @@ newTransition f t l = onGraph (newEdge (f,t,l)) newTransitions :: [(n, n, b)] -> FA n a b -> FA n a b newTransitions es = onGraph (newEdges es) +insertTransitionWith :: Eq n => + (b -> b -> b) -> (n, n, b) -> FA n a b -> FA n a b +insertTransitionWith f t = onGraph (insertEdgeWith f t) + +insertTransitionsWith :: Eq n => + (b -> b -> b) -> [(n, n, b)] -> FA n a b -> FA n a b +insertTransitionsWith f ts fa = + foldl' (flip (insertTransitionWith f)) fa ts + mapStates :: (a -> c) -> FA n a b -> FA n c b mapStates f = onGraph (nmap f) mapTransitions :: (b -> c) -> FA n a b -> FA n a c mapTransitions f = onGraph (emap f) +modifyTransitions :: ([(n,n,b)] -> [(n,n,b)]) -> FA n a b -> FA n a b +modifyTransitions f = onGraph (\ (Graph r ns es) -> Graph r ns (f es)) + +removeState :: Ord n => n -> FA n a b -> FA n a b +removeState n = onGraph (removeNode n) + minimize :: Ord a => NFA a -> DFA a minimize = determinize . reverseNFA . dfa2nfa . determinize . reverseNFA unusedNames :: FA n a b -> [n] unusedNames (FA (Graph names _ _) _ _) = names +-- | Gets all incoming transitions to a given state, excluding +-- transtions from the state itself. +nonLoopTransitionsTo :: Eq n => n -> FA n a b -> [(n,b)] +nonLoopTransitionsTo s fa = + [(f,l) | (f,t,l) <- transitions fa, t == s && f /= s] + +nonLoopTransitionsFrom :: Eq n => n -> FA n a b -> [(n,b)] +nonLoopTransitionsFrom s fa = + [(t,l) | (f,t,l) <- transitions fa, f == s && t /= s] + +loops :: Eq n => n -> FA n a b -> [b] +loops s fa = [l | (f,t,l) <- transitions fa, f == s && t == s] + -- | Give new names to all nodes. renameStates :: Ord x => [y] -- ^ Infinite supply of new names -> FA x a b diff --git a/src/GF/Speech/Graph.hs b/src/GF/Speech/Graph.hs index c23c5e384..1a0ebe0c0 100644 --- a/src/GF/Speech/Graph.hs +++ b/src/GF/Speech/Graph.hs @@ -14,7 +14,8 @@ module GF.Speech.Graph ( Graph(..), Node, Edge, NodeInfo , newGraph, nodes, edges , nmap, emap, newNode, newNodes, newEdge, newEdges - , removeNodes + , insertEdgeWith + , removeNode, removeNodes , nodeInfo , getIncoming, getOutgoing, getNodeLabel , inDegree, outDegree @@ -82,6 +83,17 @@ newEdges es g = foldl' (flip newEdge) g es -- lazy version: -- newEdges es' (Graph c ns es) = Graph c ns (es'++es) +insertEdgeWith :: Eq n => + (b -> b -> b) -> Edge n b -> Graph n a b -> Graph n a b +insertEdgeWith f e@(x,y,l) (Graph c ns es) = Graph c ns (h es) + where h [] = [e] + h (e'@(x',y',l'):es') | x' == x && y' == y = (x',y', f l l'):es' + | otherwise = e':h es' + +-- | Remove a node and all edges to and from that node. +removeNode :: Ord n => n -> Graph n a b -> Graph n a b +removeNode n = removeNodes (Set.singleton n) + -- | Remove a set of nodes and all edges to and from those nodes. removeNodes :: Ord n => Set n -> Graph n a b -> Graph n a b removeNodes xs (Graph c ns es) = Graph c ns' es' diff --git a/src/GF/Speech/PrJSGF.hs b/src/GF/Speech/PrJSGF.hs index 6183b9826..f09d454d9 100644 --- a/src/GF/Speech/PrJSGF.hs +++ b/src/GF/Speech/PrJSGF.hs @@ -28,6 +28,7 @@ import GF.Infra.Print import GF.Infra.Option import GF.Probabilistic.Probabilistic (Probs) import GF.Speech.SRG +import GF.Speech.RegExp jsgfPrinter :: Ident -- ^ Grammar name -> String -- ^ Start category @@ -48,20 +49,27 @@ prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) . showString "public
= " . prCat start . showChar ';' . nl . nl prRule (SRGRule cat origCat rhs) = comments [origCat] . nl - . prCat cat . showString " = " . joinS " | " (map prAlt rhs) . nl + . prCat cat . showString " = " . joinS " | " (map prAlt (ebnfSRGAlts rhs)) . nl -- FIXME: use the probability - prAlt (SRGAlt mp _ rhs) - | null rhs' = showString "" - | otherwise = wrap "(" (unwordsS (map prSymbol rhs')) ")" - where rhs' = rmPunct rhs - prSymbol (Cat (c,_)) = prCat c - prSymbol (Tok t) = wrap "\"" (prtS t) "\"" - prCat c = showChar '<' . showString c . showChar '>' + prAlt (EBnfSRGAlt mp _ rhs) = prItem rhs -rmPunct :: [Symbol c Token] -> [Symbol c Token] -rmPunct [] = [] -rmPunct (Tok t:ss) | all isPunct (prt t) = rmPunct ss -rmPunct (s:ss) = s : rmPunct ss +prCat :: SRGCat -> ShowS +prCat c = showChar '<' . showString c . showChar '>' + +prItem :: EBnfSRGItem -> ShowS +prItem = f + where + f (REUnion []) = showString "" + f (REUnion xs) = wrap "(" (joinS " | " (map f xs)) ")" + f (REConcat []) = showString "" + f (REConcat xs) = wrap "(" (unwordsS (map f xs)) ")" + f (RERepeat x) = wrap "(" (f x) ")" . showString "*" + f (RESymbol s) = prSymbol s + +prSymbol :: Symbol SRGNT Token -> ShowS +prSymbol (Cat (c,_)) = prCat c +prSymbol (Tok t) | all isPunct (prt t) = id -- removes punctuation + | otherwise = wrap "\"" (prtS t) "\"" isPunct :: Char -> Bool isPunct c = c `elem` "-_.;.,?!" diff --git a/src/GF/Speech/PrRegExp.hs b/src/GF/Speech/PrRegExp.hs new file mode 100644 index 000000000..c0aadab00 --- /dev/null +++ b/src/GF/Speech/PrRegExp.hs @@ -0,0 +1,21 @@ +---------------------------------------------------------------------- +-- | +-- Module : PrSLF +-- Maintainer : BB +-- Stability : (stable) +-- Portability : (portable) +-- +-- This module prints a grammar as a regular expression. +----------------------------------------------------------------------------- + +module GF.Speech.PrRegExp (regexpPrinter) where + +import GF.Conversion.Types +import GF.Infra.Ident +import GF.Speech.CFGToFiniteState +import GF.Speech.RegExp + + +regexpPrinter :: Ident -- ^ Grammar name + -> String -> CGrammar -> String +regexpPrinter name start cfg = prRE $ dfa2re $ cfgToFA start cfg diff --git a/src/GF/Speech/PrSRGS.hs b/src/GF/Speech/PrSRGS.hs index 9f86c1468..63ca91034 100644 --- a/src/GF/Speech/PrSRGS.hs +++ b/src/GF/Speech/PrSRGS.hs @@ -15,6 +15,7 @@ module GF.Speech.PrSRGS (SISRFormat(..), srgsXmlPrinter) where import GF.Data.Utilities import GF.Data.XML +import GF.Speech.RegExp import GF.Speech.SISR as SISR import GF.Speech.SRG import GF.Infra.Ident @@ -85,9 +86,12 @@ mkProd sisr (EBnfSRGAlt mp n@(Name f prs) rhs) = Tag "item" w (t ++ xs) argInit (Constant f) = maybe "?" prIdent (forestName f) mkItem :: Maybe SISRFormat -> EBnfSRGItem -> XML -mkItem sisr (EBnfOneOf xs) = oneOf (map (mkItem sisr) xs) -mkItem sisr (EBnfSeq xs) = Tag "item" [] (map (mkItem sisr) xs) -mkItem sisr (EBnfSymbol s) = symItem sisr s +mkItem sisr = f + where + f (REUnion xs) = oneOf (map f xs) + f (REConcat xs) = Tag "item" [] (map f xs) + f (RERepeat x) = Tag "item" [("repeat","0-")] [f x] + f (RESymbol s) = symItem sisr s symItem :: Maybe SISRFormat -> Symbol SRGNT Token -> XML symItem sisr (Cat (c,slots)) = Tag "item" [] ([Tag "ruleref" [("uri","#" ++ prCat c)] []]++t) @@ -107,8 +111,7 @@ showToken :: Token -> String showToken t = t oneOf :: [XML] -> XML -oneOf [x] = x -oneOf xs = Tag "one-of" [] xs +oneOf = Tag "one-of" [] grammar :: Maybe SISRFormat -> String -- ^ root @@ -130,61 +133,3 @@ optimizeSRGS = bottomUpXML f where f (Tag "item" [] [x@(Tag "item" [] _)]) = x f (Tag "one-of" [] [x]) = x f x = x - -{- - --- --- * SRGS minimization --- - -minimizeRule :: XML -> XML -minimizeRule (Tag "rule" attrs cs) - = Tag "rule" attrs (map minimizeOneOf cs) - -minimizeOneOf :: XML -> XML -minimizeOneOf (Tag "one-of" attrs cs) - = Tag "item" [] (p++[Tag "one-of" attrs cs']) - where - (pref,cs') = factor cs - p = if null pref then [] else [Tag "one-of" [] pref] -minimizeOneOf x = x - -factor :: [XML] -> ([XML],[XML]) -factor xs = case f of - Just (ps,xs') -> (map it ps, map it xs') - Nothing -> ([],xs) - where - -- FIXME: maybe getting all the longest terminal prefixes - -- is not optimal? - f = cartesianFactor $ map (terminalPrefix . unIt) xs - unIt (Tag "item" [] cs) = cs - it cs = Tag "item" [] cs - -terminalPrefix :: [XML] -> ([XML],[XML]) -terminalPrefix cs = (terms, tags ++ cs'') - where (tags,cs') = span isTag cs - (terms,cs'') = span isTerminalItem cs' - -isTag :: XML -> Bool -isTag (Tag t _ _) = t == "tag" -isTag _ = False - -isTerminalItem :: XML -> Bool -isTerminalItem (Tag "item" [] [Data _]) = True -isTerminalItem _ = False - --- --- * Utilities --- - -allEqual :: Eq a => [a] -> Bool -allEqual [] = True -allEqual (x:xs) = all (x==) xs - -cartesianFactor :: (Ord a, Ord b) => [(a,b)] -> Maybe ([a],[b]) -cartesianFactor xs - | not (null es) && allEqual es = Just (Map.keys m, Set.elems (head es)) - | otherwise = Nothing - where m = Map.fromListWith Set.union [(x,Set.singleton y) | (x,y) <- xs] - es = Map.elems m --} \ No newline at end of file diff --git a/src/GF/Speech/RegExp.hs b/src/GF/Speech/RegExp.hs new file mode 100644 index 000000000..2f4c7cd48 --- /dev/null +++ b/src/GF/Speech/RegExp.hs @@ -0,0 +1,84 @@ +module GF.Speech.RegExp (RE(..), dfa2re, prRE) where + +import Data.List + +import GF.Data.Utilities +import GF.Speech.FiniteState + +data RE a = + REUnion [RE a] -- ^ REUnion [] is null + | REConcat [RE a] -- ^ REConcat [] is epsilon + | RERepeat (RE a) + | RESymbol a + deriving (Eq,Show) + + +dfa2re :: Show a => DFA a -> RE a +dfa2re = finalRE . elimStates . modifyTransitions merge . addLoops + . oneFinalState () epsilonRE . mapTransitions RESymbol + where addLoops fa = newTransitions [(s,s,nullRE) | (s,_) <- states fa] fa + merge es = [(f,t,unionRE ls) + | ((f,t),ls) <- buildMultiMap [((f,t),l) | (f,t,l) <- es]] + +elimStates :: Show a => DFA (RE a) -> DFA (RE a) +elimStates fa = + case [s | (s,_) <- states fa, isInternal fa s] of + [] -> fa + sE:_ -> elimStates $ insertTransitionsWith (\x y -> unionRE [x,y]) ts $ removeState sE fa + where sAs = nonLoopTransitionsTo sE fa + sBs = nonLoopTransitionsFrom sE fa + r2 = unionRE $ loops sE fa + ts = [(sA, sB, r r1 r3) | (sA,r1) <- sAs, (sB,r3) <- sBs] + r r1 r3 = concatRE [r1, repeatRE r2, r3] + +epsilonRE = REConcat [] + +nullRE = REUnion [] + +isNull (REUnion []) = True +isNull _ = False + +isEpsilon (REConcat []) = True +isEpsilon _ = False + +unionRE :: [RE a] -> RE a +unionRE = unionOrId . concatMap toList + where + toList (REUnion xs) = xs + toList x = [x] + unionOrId [r] = r + unionOrId rs = REUnion rs + +concatRE :: [RE a] -> RE a +concatRE xs | any isNull xs = nullRE + | otherwise = case concatMap toList xs of + [r] -> r + rs -> REConcat rs + where + toList (REConcat xs) = xs + toList x = [x] + +repeatRE :: RE a -> RE a +repeatRE x | isNull x || isEpsilon x = epsilonRE + | otherwise = RERepeat x + +finalRE :: DFA (RE a) -> RE a +finalRE fa = concatRE [repeatRE r1, r2, + repeatRE (unionRE [r3, concatRE [r4, repeatRE r1, r2]])] + where + s0 = startState fa + [sF] = finalStates fa + r1 = unionRE $ loops s0 fa + r2 = unionRE $ map snd $ nonLoopTransitionsTo sF fa + r3 = unionRE $ loops sF fa + r4 = unionRE $ map snd $ nonLoopTransitionsFrom sF fa + +-- Debugging + +prRE :: Show a => RE a -> String +prRE (REUnion []) = "" +prRE (REUnion xs) = "(" ++ concat (intersperse " | " (map prRE xs)) ++ ")" +prRE (REConcat xs) = "(" ++ unwords (map prRE xs) ++ ")" +prRE (RERepeat x) = "(" ++ prRE x ++ ")*" +prRE (RESymbol s) = show s + diff --git a/src/GF/Speech/SRG.hs b/src/GF/Speech/SRG.hs index 9082fa1f4..b27c5ad56 100644 --- a/src/GF/Speech/SRG.hs +++ b/src/GF/Speech/SRG.hs @@ -23,7 +23,7 @@ module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..), makeSimpleSRG, makeSRG , lookupFM_, prtS , topDownFilter - , EBnfSRGAlt(..), EBnfSRGItem(..) + , EBnfSRGAlt(..), EBnfSRGItem , ebnfSRGAlts ) where @@ -38,6 +38,8 @@ import GF.Conversion.Types import GF.Infra.Print import GF.Speech.TransformCFG import GF.Speech.Relation +import GF.Speech.FiniteState +import GF.Speech.RegExp import GF.Infra.Option import GF.Probabilistic.Probabilistic (Probs) @@ -173,18 +175,25 @@ allSRGCats SRG { rules = rs } = [c | SRGRule c _ _ <- rs] data EBnfSRGAlt = EBnfSRGAlt (Maybe Double) Name EBnfSRGItem deriving (Eq,Show) -data EBnfSRGItem = - EBnfOneOf [EBnfSRGItem] - | EBnfSeq [EBnfSRGItem] - | EBnfSymbol (Symbol SRGNT Token) - deriving (Eq,Show) +type EBnfSRGItem = RE (Symbol SRGNT Token) + ebnfSRGAlts :: [SRGAlt] -> [EBnfSRGAlt] ebnfSRGAlts alts = [EBnfSRGAlt p n (ebnfSRGItem sss) | ((p,n),sss) <- buildMultiMap [((p,n),ss) | SRGAlt p n ss <- alts]] ebnfSRGItem :: [[Symbol SRGNT Token]] -> EBnfSRGItem -ebnfSRGItem sss = EBnfOneOf (map (EBnfSeq . map EBnfSymbol) sss) +ebnfSRGItem = dfa2re . mkSRGFA + +mkSRGFA :: [[Symbol SRGNT Token]] -> DFA (Symbol SRGNT Token) +mkSRGFA = minimize . dfa2nfa . foldr addString (newFA ()) + +addString :: [a] -> DFA a -> DFA a +addString xs fa = addFinalState (last sts0) $ newTransitions ts fa' + where (fa',ss) = newStates (replicate (length xs) ()) fa + sts0 = startState fa : sts1 + sts1 = map fst ss + ts = zip3 sts0 sts1 xs -- -- * Utilities for building and printing SRGs diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 7e76688d9..92b95756a 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -64,6 +64,7 @@ import GF.Speech.PrJSGF (jsgfPrinter) import qualified GF.Speech.PrSRGS as SRGS import GF.Speech.PrSLF import GF.Speech.PrFA (faGraphvizPrinter,regularPrinter,faCPrinter) +import GF.Speech.PrRegExp (regexpPrinter) import GF.Speech.GrammarToVoiceXML (grammar2vxml) import GF.Data.Zipper @@ -284,6 +285,9 @@ customGrammarPrinter = ,(strCI "fa_c", \opts s -> let start = getStartCatCF opts s name = cncId s in faCPrinter name start $ stateCFG s) + ,(strCI "regexp", \opts s -> let start = getStartCatCF opts s + name = cncId s + in regexpPrinter name start $ stateCFG s) ,(strCI "regular", \_ -> regularPrinter . stateCFG) ,(strCI "plbnf", \_ -> prLBNF True) ,(strCI "lbnf", \_ -> prLBNF False)