From 476080e6db0efc267b9402f77c10377577f33ff5 Mon Sep 17 00:00:00 2001 From: bringert Date: Sun, 7 Jan 2007 20:04:39 +0000 Subject: [PATCH] Generate more compact JSGF by converting to ENBF. --- src/GF/Speech/PrJSGF.hs | 41 ++++++++++++++++++----------------- src/GF/Speech/RegExp.hs | 26 ++++++++++++++++------ src/GF/Speech/SRG.hs | 34 ++++++++++++++--------------- src/GF/Speech/TransformCFG.hs | 2 +- 4 files changed, 58 insertions(+), 45 deletions(-) diff --git a/src/GF/Speech/PrJSGF.hs b/src/GF/Speech/PrJSGF.hs index d1d904dbb..9d6dca598 100644 --- a/src/GF/Speech/PrJSGF.hs +++ b/src/GF/Speech/PrJSGF.hs @@ -55,11 +55,11 @@ prJSGF sisr srg@(SRG{grammarName=name,startCat=start,origStartCat=origStart,rule rule True "MAIN" [prCat start] prRule (SRGRule cat origCat rhs) = comment origCat $$ --- rule False cat (map prAlt (ebnfSRGAlts rhs)) - rule False cat (map prAlt rhs) + rule False cat (map prAlt (ebnfSRGAlts rhs)) +-- rule False cat (map prAlt rhs) -- FIXME: use the probability --- prAlt (EBnfSRGAlt mp n rhs) = tag sisr (profileInitSISR n) . showChar ' '. prItem sisr rhs - prAlt (SRGAlt mp n rhs) = initTag <+> prItem sisr n rhs <+> finalTag + prAlt (EBnfSRGAlt mp n rhs) = sep [initTag, prItem sisr n rhs, finalTag] +-- prAlt (SRGAlt mp n rhs) = initTag <+> prItem sisr n rhs <+> finalTag where initTag | isEmpty t = empty | otherwise = text "" <+> t where t = tag sisr (profileInitSISR n) @@ -74,25 +74,25 @@ catFormId = (++ "_cat") prCat :: SRGCat -> Doc prCat c = char '<' <> text c <> char '>' -{- -prItem :: Maybe SISRFormat -> EBnfSRGItem -> ShowS -prItem sisr = f 1 +prItem :: Maybe SISRFormat -> CFTerm -> EBnfSRGItem -> Doc +prItem sisr t = f 1 where - f _ (REUnion []) = showString "" + f _ (REUnion []) = text "" f p (REUnion xs) - | not (null es) = wrap "[" (f 0 (REUnion nes)) "]" - | otherwise = (if p >= 1 then paren else id) (joinS " | " (map (f 1) xs)) - where (es,nes) = partition (== REConcat []) xs - f _ (REConcat []) = showString "" - f p (REConcat xs) = (if p >= 3 then paren else id) (unwordsS (map (f 2) xs)) - f p (RERepeat x) = f 3 x . showString "*" - f _ (RESymbol s) = prSymbol sisr s --} + | not (null es) = brackets (f 0 (REUnion nes)) + | otherwise = (if p >= 1 then parens else id) (alts (map (f 1) xs)) + where (es,nes) = partition isEpsilon xs + f _ (REConcat []) = text "" + f p (REConcat xs) = (if p >= 3 then parens else id) (hsep (map (f 2) xs)) + f p (RERepeat x) = f 3 x <> char '*' + f _ (RESymbol s) = prSymbol sisr t s +{- prItem :: Maybe SISRFormat -> CFTerm -> [Symbol SRGNT Token] -> Doc prItem _ _ [] = text "" prItem sisr cn ss = paren $ hsep $ map (prSymbol sisr cn) ss where paren = if length ss == 1 then id else parens +-} prSymbol :: Maybe SISRFormat -> CFTerm -> Symbol SRGNT Token -> Doc prSymbol sisr cn (Cat n@(c,_)) = prCat c <+> tag sisr (catSISR cn n) @@ -103,7 +103,7 @@ tag :: Maybe SISRFormat -> (SISRFormat -> SISRTag) -> Doc tag Nothing _ = empty tag (Just fmt) t = case t fmt of [] -> empty - ts -> char '{' <+> text (e $ prSISR ts) <+> char '}' + ts -> char '{' <+> (text (e $ prSISR ts)) <+> char '}' where e [] = [] e ('}':xs) = '\\':'}':e xs e ('\n':xs) = ' ' : e (dropWhile isSpace xs) @@ -115,11 +115,11 @@ isPunct c = c `elem` "-_.;.,?!" comment :: String -> Doc comment s = text "//" <+> text s - +alts :: [Doc] -> Doc +alts = sep . prepunctuate (text "| ") rule :: Bool -> SRGCat -> [Doc] -> Doc -rule pub c xs = p <+> prCat c <+> char '=' - $$ nest 2 (sep (prepunctuate (text "| ") xs) <+> char ';') +rule pub c xs = sep [p <+> prCat c <+> char '=', nest 2 (alts xs) <+> char ';'] where p = if pub then text "public" else empty -- Pretty-printing utilities @@ -133,3 +133,4 @@ prepunctuate p (x:xs) = x : map (p <>) xs ($++$) :: Doc -> Doc -> Doc x $++$ y = x $$ emptyLine $$ y + diff --git a/src/GF/Speech/RegExp.hs b/src/GF/Speech/RegExp.hs index 2f4c7cd48..1eb6efa4d 100644 --- a/src/GF/Speech/RegExp.hs +++ b/src/GF/Speech/RegExp.hs @@ -1,4 +1,9 @@ -module GF.Speech.RegExp (RE(..), dfa2re, prRE) where +module GF.Speech.RegExp (RE(..), + epsilonRE, nullRE, + isEpsilon, isNull, + unionRE, concatRE, seqRE, + repeatRE, + dfa2re, prRE) where import Data.List @@ -10,17 +15,17 @@ data RE a = | REConcat [RE a] -- ^ REConcat [] is epsilon | RERepeat (RE a) | RESymbol a - deriving (Eq,Show) + deriving (Eq,Ord,Show) -dfa2re :: Show a => DFA a -> RE a +dfa2re :: (Show a,Ord a) => DFA a -> RE a dfa2re = finalRE . elimStates . modifyTransitions merge . addLoops . oneFinalState () epsilonRE . mapTransitions RESymbol where addLoops fa = newTransitions [(s,s,nullRE) | (s,_) <- states fa] fa merge es = [(f,t,unionRE ls) | ((f,t),ls) <- buildMultiMap [((f,t),l) | (f,t,l) <- es]] -elimStates :: Show a => DFA (RE a) -> DFA (RE a) +elimStates :: (Show a, Ord a) => DFA (RE a) -> DFA (RE a) elimStates fa = case [s | (s,_) <- states fa, isInternal fa s] of [] -> fa @@ -31,18 +36,22 @@ elimStates fa = ts = [(sA, sB, r r1 r3) | (sA,r1) <- sAs, (sB,r3) <- sBs] r r1 r3 = concatRE [r1, repeatRE r2, r3] +epsilonRE :: RE a epsilonRE = REConcat [] +nullRE :: RE a nullRE = REUnion [] +isNull :: RE a -> Bool isNull (REUnion []) = True isNull _ = False +isEpsilon :: RE a -> Bool isEpsilon (REConcat []) = True isEpsilon _ = False -unionRE :: [RE a] -> RE a -unionRE = unionOrId . concatMap toList +unionRE :: Ord a => [RE a] -> RE a +unionRE = unionOrId . sortNub . concatMap toList where toList (REUnion xs) = xs toList x = [x] @@ -58,11 +67,14 @@ concatRE xs | any isNull xs = nullRE toList (REConcat xs) = xs toList x = [x] +seqRE :: [a] -> RE a +seqRE = concatRE . map RESymbol + repeatRE :: RE a -> RE a repeatRE x | isNull x || isEpsilon x = epsilonRE | otherwise = RERepeat x -finalRE :: DFA (RE a) -> RE a +finalRE :: Ord a => DFA (RE a) -> RE a finalRE fa = concatRE [repeatRE r1, r2, repeatRE (unionRE [r3, concatRE [r4, repeatRE r1, r2]])] where diff --git a/src/GF/Speech/SRG.hs b/src/GF/Speech/SRG.hs index cf74ba66e..cc03cdca6 100644 --- a/src/GF/Speech/SRG.hs +++ b/src/GF/Speech/SRG.hs @@ -23,8 +23,8 @@ module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..), makeSimpleSRG, makeSRG , lookupFM_, prtS , topDownFilter, cfgCatToGFCat, srgTopCats - --, EBnfSRGAlt(..), EBnfSRGItem - --, ebnfSRGAlts + , EBnfSRGAlt(..), EBnfSRGItem + , ebnfSRGAlts ) where import GF.Data.Operations @@ -33,7 +33,7 @@ import GF.Infra.Ident import GF.Formalism.CFG import GF.Formalism.Utilities (Symbol(..), NameProfile(..) , Profile(..), SyntaxForest - , filterCats, mapSymbol) + , filterCats, mapSymbol, symbol) import GF.Conversion.Types import GF.Infra.Print import GF.Speech.TransformCFG @@ -177,8 +177,7 @@ srgTopCats srg = buildMultiMap [(oc, cat) | SRGRule cat origCat _ <- rules srg, -- * Size-optimized EBNF SRGs -- -{- -data EBnfSRGAlt = EBnfSRGAlt (Maybe Double) Name EBnfSRGItem +data EBnfSRGAlt = EBnfSRGAlt (Maybe Double) CFTerm EBnfSRGItem deriving (Eq,Show) type EBnfSRGItem = RE (Symbol SRGNT Token) @@ -186,21 +185,22 @@ type EBnfSRGItem = RE (Symbol SRGNT Token) ebnfSRGAlts :: [SRGAlt] -> [EBnfSRGAlt] ebnfSRGAlts alts = [EBnfSRGAlt p n (ebnfSRGItem sss) - | ((p,n),sss) <- buildMultiMap [((p,n),ss) | SRGAlt p n ss <- alts]] + | ((n,p),sss) <- buildMultiMap [((n,p),ss) | SRGAlt p n ss <- alts]] ebnfSRGItem :: [[Symbol SRGNT Token]] -> EBnfSRGItem -ebnfSRGItem = dfa2re . mkSRGFA +ebnfSRGItem = unionRE . map mergeItems . sortGroupBy (compareBy filterCats) -mkSRGFA :: [[Symbol SRGNT Token]] -> DFA (Symbol SRGNT Token) -mkSRGFA = {- minimize . dfa2nfa . -} foldr addString (newFA ()) - -addString :: [a] -> DFA a -> DFA a -addString xs fa = addFinalState (last sts0) $ newTransitions ts fa' - where (fa',ss) = newStates (replicate (length xs) ()) fa - sts0 = startState fa : sts1 - sts1 = map fst ss - ts = zip3 sts0 sts1 xs --} +-- ^ Merges a list of right-hand sides which all have the same +-- sequence of non-terminals. +mergeItems :: [[Symbol SRGNT Token]] -> EBnfSRGItem +--mergeItems = unionRE . map seqRE +mergeItems [] = nullRE +mergeItems sss | any null rss = t + | otherwise = concatRE [t,seqRE (head cs), mergeItems nss] + where (tss,rss) = unzip $ map (span isToken) sss + t = unionRE (map seqRE tss) + (cs,nss) = unzip $ map (splitAt 1) rss + isToken = symbol (const False) (const True) -- -- * Utilities for building and printing SRGs diff --git a/src/GF/Speech/TransformCFG.hs b/src/GF/Speech/TransformCFG.hs index ed1730a3d..bb6f16243 100644 --- a/src/GF/Speech/TransformCFG.hs +++ b/src/GF/Speech/TransformCFG.hs @@ -60,7 +60,7 @@ data CFTerm | CFVar Int | CFConst String | CFMeta String - deriving (Eq,Show) + deriving (Eq,Ord,Show) type Cat_ = String type CFSymbol_ = Symbol Cat_ Token