From ed1e7f448574662d8b54fe92b3c08ed4942f575d Mon Sep 17 00:00:00 2001 From: bringert Date: Sat, 24 Mar 2007 23:29:38 +0000 Subject: [PATCH] SRG generation: merge categories with identical set of productions. The LC_LR algorithm produces lots of those, especially when there is little inflection. --- src/GF/Speech/SRG.hs | 3 ++- src/GF/Speech/TransformCFG.hs | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/GF/Speech/SRG.hs b/src/GF/Speech/SRG.hs index 8c3c5e02c..054cf62f6 100644 --- a/src/GF/Speech/SRG.hs +++ b/src/GF/Speech/SRG.hs @@ -95,7 +95,8 @@ makeSimpleSRG opt s = probs = stateProbs s l = fmap (replace '_' '-') $ getOptVal opts speechLanguage (cats,cfgRules) = unzip $ preprocess $ cfgToCFRules s - preprocess = removeLeftRecursion origStart + preprocess = mergeIdentical + . removeLeftRecursion origStart . fix (topDownFilter origStart . bottomUpFilter) . removeCycles names = mkCatNames name cats diff --git a/src/GF/Speech/TransformCFG.hs b/src/GF/Speech/TransformCFG.hs index dcb666bf3..93f2a1be9 100644 --- a/src/GF/Speech/TransformCFG.hs +++ b/src/GF/Speech/TransformCFG.hs @@ -112,6 +112,18 @@ topDownFilter start rules = filter ((`Set.member` keep) . fst) rules uses = reflexiveClosure_ (allCats rules) $ transitiveClosure $ mkRel rhsCats keep = allRelated uses start +-- | Merges categories with identical right-hand-sides. +-- FIXME: handle probabilities +mergeIdentical :: CFRules -> CFRules +mergeIdentical g = sortNubBy (compareBy fst) [(substCat c, map subst rs) | (c,rs) <- g] + where + -- maps categories to their replacement + m = Map.fromList [(y,concat (intersperse "+" xs)) | (_,xs) <- buildMultiMap [(rulesKey rs,c) | (c,rs) <- g], y <- xs] + -- build data to compare for each category: a set of name,rhs pairs + rulesKey rs = Set.fromList [(n,r) | CFRule _ r n <- rs] + subst (CFRule c r n) = CFRule (substCat c) (map (mapSymbol substCat id) r) n + substCat c = Map.findWithDefault (error $ "mergeIdentical: " ++ c) c m + -- * Removing left recursion -- The LC_LR algorithm from