mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
SRG generation: merge categories with identical sets of productions. The LC_LR algorithm produces lots of those, especially when there is little inflection.
This commit is contained in:
@@ -95,7 +95,8 @@ makeSimpleSRG opt s =
|
||||
probs = stateProbs s
|
||||
l = fmap (replace '_' '-') $ getOptVal opts speechLanguage
|
||||
(cats,cfgRules) = unzip $ preprocess $ cfgToCFRules s
|
||||
preprocess = removeLeftRecursion origStart
|
||||
preprocess = mergeIdentical
|
||||
. removeLeftRecursion origStart
|
||||
. fix (topDownFilter origStart . bottomUpFilter)
|
||||
. removeCycles
|
||||
names = mkCatNames name cats
|
||||
|
||||
@@ -112,6 +112,18 @@ topDownFilter start rules = filter ((`Set.member` keep) . fst) rules
|
||||
uses = reflexiveClosure_ (allCats rules) $ transitiveClosure $ mkRel rhsCats
|
||||
keep = allRelated uses start
|
||||
|
||||
-- | Collapses every group of categories whose rules are identical
-- (same rule names and same right-hand sides) into a single category.
--
-- The merged category is named by joining the names of the group's
-- members with @+@; a category that shares its rules with no other
-- category keeps its own name. Both left-hand sides and the category
-- symbols inside right-hand sides are rewritten. The rewritten entries
-- are then passed through @sortNubBy (compareBy fst)@, which presumably
-- leaves a single entry per merged category.
--
-- FIXME: handle probabilities
mergeIdentical :: CFRules -> CFRules
mergeIdentical grammar = sortNubBy (compareBy fst) (map renameEntry grammar)
  where
    -- rewrite one (category, rules) entry of the grammar
    renameEntry (cat, rules) = (rename cat, map renameRule rules)

    -- rewrite the left-hand side and every category symbol of the
    -- right-hand side; the rule name is kept unchanged
    renameRule (CFRule lhs rhs name) =
      CFRule (rename lhs) (map (mapSymbol rename id) rhs) name

    -- look up the replacement of a category; a category with no entry
    -- in the table (i.e. one that is not a key of the grammar) is an error
    rename cat =
      Map.findWithDefault (error $ "mergeIdentical: " ++ cat) cat renaming

    -- table from each original category to the name of its merged group
    renaming =
      Map.fromList
        [ (member, merged)
        | (_, members) <- buildMultiMap (map fingerprint grammar)
        , let merged = concat (intersperse "+" members)
        , member <- members
        ]

    -- pair a category with the data that decides equality of categories:
    -- the set of (rule name, right-hand side) pairs of its rules
    fingerprint (cat, rules) =
      (Set.fromList [(name, rhs) | CFRule _ rhs name <- rules], cat)
|
||||
|
||||
-- * Removing left recursion
|
||||
|
||||
-- The LC_LR algorithm from
|
||||
|
||||
Reference in New Issue
Block a user