SRG generation: merge categories with identical sets of productions. The LC_LR algorithm produces many such categories, especially when there is little inflection.

This commit is contained in:
bringert
2007-03-24 23:29:38 +00:00
parent 436ddd5ebf
commit ed1e7f4485
2 changed files with 14 additions and 1 deletions

View File

@@ -95,7 +95,8 @@ makeSimpleSRG opt s =
probs = stateProbs s
l = fmap (replace '_' '-') $ getOptVal opts speechLanguage
(cats,cfgRules) = unzip $ preprocess $ cfgToCFRules s
preprocess = removeLeftRecursion origStart
preprocess = mergeIdentical
. removeLeftRecursion origStart
. fix (topDownFilter origStart . bottomUpFilter)
. removeCycles
names = mkCatNames name cats

View File

@@ -112,6 +112,18 @@ topDownFilter start rules = filter ((`Set.member` keep) . fst) rules
uses = reflexiveClosure_ (allCats rules) $ transitiveClosure $ mkRel rhsCats
keep = allRelated uses start
-- | Collapses categories whose productions are identical into one category.
-- Two categories are treated as identical when the sets of
-- (rule name, right-hand side) pairs of their rules coincide. Each member
-- of such a group is renamed to the group's member names joined with @+@,
-- both where it is defined and where it is referenced in a right-hand side.
-- FIXME: handle probabilities
mergeIdentical :: CFRules -> CFRules
mergeIdentical rules = sortNubBy (compareBy fst) (map renameEntry rules)
  where
    -- rewrite one grammar entry: its category and every rule it owns
    renameEntry (cat, prods) = (rename cat, map renameRule prods)
    -- rewrite the left-hand side and all category symbols of a single rule
    renameRule (CFRule lhs rhs name) =
      CFRule (rename lhs) (map (mapSymbol rename id) rhs) name
    -- look up the replacement name; a category absent from the table
    -- is reported through 'error'
    rename cat =
      Map.findWithDefault (error $ "mergeIdentical: " ++ cat) cat replacements
    -- replacement table: each category maps to the joined name of its group
    -- (presumably 'buildMultiMap' collects all categories that share a
    -- signature -- confirm against its definition)
    replacements = Map.fromList
      [ (member, mergedName)
      | (_, members) <- buildMultiMap [ (signature prods, cat) | (cat, prods) <- rules ]
      , let mergedName = concat (intersperse "+" members)
      , member <- members
      ]
    -- comparison key for a category: the set of (name, rhs) pairs of its
    -- rules, ignoring the left-hand side
    signature prods = Set.fromList [ (name, rhs) | CFRule _ rhs name <- prods ]
-- * Removing left recursion
-- The LC_LR algorithm from