forked from GitHub/gf-core
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
This commit is contained in:
378
src-3.0/GF/Speech/TransformCFG.hs
Normal file
378
src-3.0/GF/Speech/TransformCFG.hs
Normal file
@@ -0,0 +1,378 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : TransformCFG
|
||||
-- Maintainer : BB
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/11/01 20:09:04 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.24 $
|
||||
--
|
||||
-- This module does some useful transformations on CFGs.
|
||||
--
|
||||
-- peb thinks: most of this module should be moved to GF.Conversion...
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Speech.TransformCFG where
|
||||
|
||||
import GF.Canon.CanonToGFCC (canon2gfcc)
|
||||
import qualified GF.GFCC.CId as C
|
||||
import GF.GFCC.Macros (lookType,catSkeleton)
|
||||
import GF.GFCC.DataGFCC (GFCC)
|
||||
import GF.Conversion.Types
|
||||
import GF.CF.PPrCF (prCFCat)
|
||||
import GF.Data.Utilities
|
||||
import GF.Formalism.CFG
|
||||
import GF.Formalism.Utilities (Symbol(..), mapSymbol, filterCats, symbol,
|
||||
NameProfile(..), Profile(..), name2fun, forestName)
|
||||
import GF.Infra.Ident
|
||||
import GF.Infra.Option
|
||||
import GF.Infra.Print
|
||||
import GF.Speech.Relation
|
||||
import GF.Compile.ShellState (StateGrammar, stateCFG, stateGrammarST, startCatStateOpts, stateOptions)
|
||||
|
||||
import Control.Monad
|
||||
import Control.Monad.State (State, get, put, evalState)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map
|
||||
import Data.List
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Monoid (mconcat)
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as Set
|
||||
|
||||
-- not very nice to replace the structured CFCat type with a simple string
|
||||
type CFRule_ = CFRule Cat_ CFTerm Token
|
||||
|
||||
data CFTerm
|
||||
= CFObj Fun [CFTerm] -- ^ an abstract syntax function with arguments
|
||||
| CFAbs Int CFTerm -- ^ A lambda abstraction. The Int is the variable id.
|
||||
| CFApp CFTerm CFTerm -- ^ Application
|
||||
| CFRes Int -- ^ The result of the n:th (0-based) non-terminal
|
||||
| CFVar Int -- ^ A lambda-bound variable
|
||||
| CFMeta String -- ^ A metavariable
|
||||
deriving (Eq,Ord,Show)
|
||||
|
||||
type Cat_ = String
|
||||
type CFSymbol_ = Symbol Cat_ Token
|
||||
|
||||
type CFRules = Map Cat_ (Set CFRule_)
|
||||
|
||||
|
||||
cfgToCFRules :: StateGrammar -> CFRules
|
||||
cfgToCFRules s =
|
||||
groupProds [CFRule (catToString c) (map symb r) (nameToTerm n)
|
||||
| CFRule c r n <- cfg]
|
||||
where cfg = stateCFG s
|
||||
symb = mapSymbol catToString id
|
||||
catToString = prt
|
||||
gfcc = stateGFCC s
|
||||
nameToTerm (Name IW [Unify [n]]) = CFRes n
|
||||
nameToTerm (Name f@(IC c) prs) =
|
||||
CFObj f (zipWith profileToTerm args prs)
|
||||
where (args,_) = catSkeleton $ lookType gfcc (C.CId c)
|
||||
nameToTerm n = error $ "cfgToCFRules.nameToTerm" ++ show n
|
||||
profileToTerm (C.CId t) (Unify []) = CFMeta t
|
||||
profileToTerm _ (Unify xs) = CFRes (last xs) -- FIXME: unify
|
||||
profileToTerm (C.CId t) (Constant f) = maybe (CFMeta t) (\x -> CFObj x []) (forestName f)
|
||||
|
||||
getStartCat :: Options -> StateGrammar -> String
|
||||
getStartCat opts sgr = prCFCat (startCatStateOpts opts' sgr)
|
||||
where opts' = addOptions opts (stateOptions sgr)
|
||||
|
||||
getStartCatCF :: Options -> StateGrammar -> String
|
||||
getStartCatCF opts sgr = getStartCat opts sgr ++ "{}.s"
|
||||
|
||||
stateGFCC :: StateGrammar -> GFCC
|
||||
stateGFCC = canon2gfcc noOptions . stateGrammarST
|
||||
|
||||
-- * Grammar filtering
|
||||
|
||||
-- | Removes all directly and indirectly cyclic productions.
|
||||
-- FIXME: this may be too aggressive, only one production
|
||||
-- needs to be removed to break a given cycle. But which
|
||||
-- one should we pick?
|
||||
-- FIXME: Does not (yet) remove productions which are cyclic
|
||||
-- because of empty productions.
|
||||
removeCycles :: CFRules -> CFRules
|
||||
removeCycles = groupProds . f . allRules
|
||||
where f rs = filter (not . isCycle) rs
|
||||
where alias = transitiveClosure $ mkRel [(c,c') | CFRule c [Cat c'] _ <- rs]
|
||||
isCycle (CFRule c [Cat c'] _) = isRelatedTo alias c' c
|
||||
isCycle _ = False
|
||||
|
||||
-- | Better bottom-up filter that also removes categories which contain no finite
|
||||
-- strings.
|
||||
bottomUpFilter :: CFRules -> CFRules
|
||||
bottomUpFilter gr = fix grow Map.empty
|
||||
where grow g = g `unionCFRules` filterCFRules (all (okSym g) . ruleRhs) gr
|
||||
okSym g = symbol (`elem` allCats g) (const True)
|
||||
|
||||
-- | Removes categories which are not reachable from the start category.
|
||||
topDownFilter :: Cat_ -> CFRules -> CFRules
|
||||
topDownFilter start rules = filterCFRulesCats (isRelatedTo uses start) rules
|
||||
where
|
||||
rhsCats = [ (lhsCat r, c') | r <- allRules rules, c' <- filterCats (ruleRhs r) ]
|
||||
uses = reflexiveClosure_ (allCats rules) $ transitiveClosure $ mkRel rhsCats
|
||||
|
||||
-- | Merges categories with identical right-hand-sides.
|
||||
-- FIXME: handle probabilities
|
||||
mergeIdentical :: CFRules -> CFRules
|
||||
mergeIdentical g = groupProds $ map subst $ allRules g
|
||||
where
|
||||
-- maps categories to their replacement
|
||||
m = Map.fromList [(y,concat (intersperse "+" xs))
|
||||
| (_,xs) <- buildMultiMap [(rulesKey rs,c) | (c,rs) <- Map.toList g], y <- xs]
|
||||
-- build data to compare for each category: a set of name,rhs pairs
|
||||
rulesKey = Set.map (\ (CFRule _ r n) -> (n,r))
|
||||
subst (CFRule c r n) = CFRule (substCat c) (map (mapSymbol substCat id) r) n
|
||||
substCat c = Map.findWithDefault (error $ "mergeIdentical: " ++ c) c m
|
||||
|
||||
-- * Removing left recursion
|
||||
|
||||
-- The LC_LR algorithm from
|
||||
-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
|
||||
removeLeftRecursion :: Cat_ -> CFRules -> CFRules
|
||||
removeLeftRecursion start gr
|
||||
= groupProds $ concat [scheme1, scheme2, scheme3, scheme4]
|
||||
where
|
||||
scheme1 = [CFRule a [x,Cat a_x] n' |
|
||||
a <- retainedLeftRecursive,
|
||||
x <- properLeftCornersOf a,
|
||||
not (isLeftRecursive x),
|
||||
let a_x = mkCat (Cat a) x,
|
||||
-- this is an extension of LC_LR to avoid generating
|
||||
-- A-X categories for which there are no productions:
|
||||
a_x `Set.member` newCats,
|
||||
let n' = symbol (\_ -> CFApp (CFRes 1) (CFRes 0))
|
||||
(\_ -> CFRes 0) x]
|
||||
scheme2 = [CFRule a_x (beta++[Cat a_b]) n' |
|
||||
a <- retainedLeftRecursive,
|
||||
b@(Cat b') <- properLeftCornersOf a,
|
||||
isLeftRecursive b,
|
||||
CFRule _ (x:beta) n <- catRules gr b',
|
||||
let a_x = mkCat (Cat a) x,
|
||||
let a_b = mkCat (Cat a) b,
|
||||
let i = length $ filterCats beta,
|
||||
let n' = symbol (\_ -> CFAbs 1 (CFApp (CFRes i) (shiftTerm n)))
|
||||
(\_ -> CFApp (CFRes i) n) x]
|
||||
scheme3 = [CFRule a_x beta n' |
|
||||
a <- retainedLeftRecursive,
|
||||
x <- properLeftCornersOf a,
|
||||
CFRule _ (x':beta) n <- catRules gr a,
|
||||
x == x',
|
||||
let a_x = mkCat (Cat a) x,
|
||||
let n' = symbol (\_ -> CFAbs 1 (shiftTerm n))
|
||||
(\_ -> n) x]
|
||||
scheme4 = catSetRules gr $ Set.fromList $ filter (not . isLeftRecursive . Cat) cats
|
||||
|
||||
newCats = Set.fromList (map lhsCat (scheme2 ++ scheme3))
|
||||
|
||||
shiftTerm :: CFTerm -> CFTerm
|
||||
shiftTerm (CFObj f ts) = CFObj f (map shiftTerm ts)
|
||||
shiftTerm (CFRes 0) = CFVar 1
|
||||
shiftTerm (CFRes n) = CFRes (n-1)
|
||||
shiftTerm t = t
|
||||
-- note: the rest don't occur in the original grammar
|
||||
|
||||
cats = allCats gr
|
||||
rules = allRules gr
|
||||
|
||||
directLeftCorner = mkRel [(Cat c,t) | CFRule c (t:_) _ <- allRules gr]
|
||||
leftCorner = reflexiveClosure_ (map Cat cats) $ transitiveClosure directLeftCorner
|
||||
properLeftCorner = transitiveClosure directLeftCorner
|
||||
properLeftCornersOf = Set.toList . allRelated properLeftCorner . Cat
|
||||
isProperLeftCornerOf = flip (isRelatedTo properLeftCorner)
|
||||
|
||||
leftRecursive = reflexiveElements properLeftCorner
|
||||
isLeftRecursive = (`Set.member` leftRecursive)
|
||||
|
||||
retained = start `Set.insert`
|
||||
Set.fromList [a | r <- allRules (filterCFRulesCats (not . isLeftRecursive . Cat) gr),
|
||||
Cat a <- ruleRhs r]
|
||||
isRetained = (`Set.member` retained)
|
||||
|
||||
retainedLeftRecursive = filter (isLeftRecursive . Cat) $ Set.toList retained
|
||||
|
||||
mkCat :: CFSymbol_ -> CFSymbol_ -> Cat_
|
||||
mkCat x y = showSymbol x ++ "-" ++ showSymbol y
|
||||
where showSymbol = symbol id show
|
||||
|
||||
{-
|
||||
|
||||
-- Paull's algorithm, see
|
||||
-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
|
||||
removeLeftRecursion :: Cat_ -> CFRules -> CFRules
|
||||
removeLeftRecursion start rs = removeDirectLeftRecursions $ map handleProds rs
|
||||
where
|
||||
handleProds (c, r) = (c, concatMap handleProd r)
|
||||
handleProd (CFRule ai (Cat aj:alpha) n) | aj < ai =
|
||||
-- FIXME: for non-recursive categories, this changes
|
||||
-- the grammar unneccessarily, maybe we can use mutRecCats
|
||||
-- to make this less invasive
|
||||
-- FIXME: this will give multiple rules with the same name,
|
||||
-- which may mess up the probabilities.
|
||||
[CFRule ai (beta ++ alpha) n | CFRule _ beta _ <- lookup' aj rs]
|
||||
handleProd r = [r]
|
||||
|
||||
removeDirectLeftRecursions :: CFRules -> CFRules
|
||||
removeDirectLeftRecursions = concat . flip evalState 0 . mapM removeDirectLeftRecursion
|
||||
|
||||
removeDirectLeftRecursion :: (Cat_,[CFRule_]) -- ^ All productions for a category
|
||||
-> State Int CFRules
|
||||
removeDirectLeftRecursion (a,rs)
|
||||
| null dr = return [(a,rs)]
|
||||
| otherwise =
|
||||
do
|
||||
a' <- fresh a
|
||||
let as = maybeEndWithA' nr
|
||||
is = [CFRule a' (tail r) n | CFRule _ r n <- dr]
|
||||
a's = maybeEndWithA' is
|
||||
-- the not null constraint here avoids creating new
|
||||
-- left recursive (cyclic) rules.
|
||||
maybeEndWithA' xs = xs ++ [CFRule c (r++[Cat a']) n | CFRule c r n <- xs,
|
||||
not (null r)]
|
||||
return [(a, as), (a', a's)]
|
||||
where
|
||||
(dr,nr) = partition isDirectLeftRecursive rs
|
||||
fresh x = do { n <- get; put (n+1); return $ x ++ "-" ++ show n }
|
||||
|
||||
isDirectLeftRecursive :: CFRule_ -> Bool
|
||||
isDirectLeftRecursive (CFRule c (Cat c':_) _) = c == c'
|
||||
isDirectLeftRecursive _ = False
|
||||
|
||||
-}
|
||||
|
||||
-- | Get the sets of mutually recursive non-terminals for a grammar.
|
||||
mutRecCats :: Bool -- ^ If true, all categories will be in some set.
|
||||
-- If false, only recursive categories will be included.
|
||||
-> CFRules -> [Set Cat_]
|
||||
mutRecCats incAll g = equivalenceClasses $ refl $ symmetricSubrelation $ transitiveClosure r
|
||||
where r = mkRel [(c,c') | CFRule c ss _ <- allRules g, Cat c' <- ss]
|
||||
refl = if incAll then reflexiveClosure_ (allCats g) else reflexiveSubrelation
|
||||
|
||||
--
|
||||
-- * Approximate context-free grammars with regular grammars.
|
||||
--
|
||||
|
||||
-- Use the transformation algorithm from \"Regular Approximation of Context-free
|
||||
-- Grammars through Approximation\", Mohri and Nederhof, 2000
|
||||
-- to create an over-generating regular frammar for a context-free
|
||||
-- grammar
|
||||
makeRegular :: CFRules -> CFRules
|
||||
makeRegular g = groupProds $ concatMap trSet (mutRecCats True g)
|
||||
where trSet cs | allXLinear cs rs = rs
|
||||
| otherwise = concatMap handleCat csl
|
||||
where csl = Set.toList cs
|
||||
rs = catSetRules g cs
|
||||
handleCat c = [CFRule c' [] (mkCFTerm (c++"-empty"))] -- introduce A' -> e
|
||||
++ concatMap (makeRightLinearRules c) (catRules g c)
|
||||
where c' = newCat c
|
||||
makeRightLinearRules b' (CFRule c ss n) =
|
||||
case ys of
|
||||
[] -> newRule b' (xs ++ [Cat (newCat c)]) n -- no non-terminals left
|
||||
(Cat b:zs) -> newRule b' (xs ++ [Cat b]) n
|
||||
++ makeRightLinearRules (newCat b) (CFRule c zs n)
|
||||
where (xs,ys) = break (`catElem` cs) ss
|
||||
-- don't add rules on the form A -> A
|
||||
newRule c rhs n | rhs == [Cat c] = []
|
||||
| otherwise = [CFRule c rhs n]
|
||||
newCat c = c ++ "$"
|
||||
|
||||
--
|
||||
-- * CFG rule utilities
|
||||
--
|
||||
|
||||
-- | Group productions by their lhs categories
|
||||
groupProds :: [CFRule_] -> CFRules
|
||||
groupProds = Map.fromListWith Set.union . map (\r -> (lhsCat r,Set.singleton r))
|
||||
|
||||
allRules :: CFRules -> [CFRule_]
|
||||
allRules = concat . map Set.toList . Map.elems
|
||||
|
||||
allRulesGrouped :: CFRules -> [(Cat_,[CFRule_])]
|
||||
allRulesGrouped = Map.toList . Map.map Set.toList
|
||||
|
||||
allCats :: CFRules -> [Cat_]
|
||||
allCats = Map.keys
|
||||
|
||||
catRules :: CFRules -> Cat_ -> [CFRule_]
|
||||
catRules rs c = Set.toList $ Map.findWithDefault Set.empty c rs
|
||||
|
||||
catSetRules :: CFRules -> Set Cat_ -> [CFRule_]
|
||||
catSetRules g cs = allRules $ Map.filterWithKey (\c _ -> c `Set.member` cs) g
|
||||
|
||||
cleanCFRules :: CFRules -> CFRules
|
||||
cleanCFRules = Map.filter (not . Set.null)
|
||||
|
||||
unionCFRules :: CFRules -> CFRules -> CFRules
|
||||
unionCFRules = Map.unionWith Set.union
|
||||
|
||||
filterCFRules :: (CFRule_ -> Bool) -> CFRules -> CFRules
|
||||
filterCFRules p = cleanCFRules . Map.map (Set.filter p)
|
||||
|
||||
filterCFRulesCats :: (Cat_ -> Bool) -> CFRules -> CFRules
|
||||
filterCFRulesCats p = Map.filterWithKey (\c _ -> p c)
|
||||
|
||||
countCats :: CFRules -> Int
|
||||
countCats = Map.size . cleanCFRules
|
||||
|
||||
countRules :: CFRules -> Int
|
||||
countRules = length . allRules
|
||||
|
||||
lhsCat :: CFRule c n t -> c
|
||||
lhsCat (CFRule c _ _) = c
|
||||
|
||||
ruleRhs :: CFRule c n t -> [Symbol c t]
|
||||
ruleRhs (CFRule _ ss _) = ss
|
||||
|
||||
ruleFun :: CFRule_ -> Fun
|
||||
ruleFun (CFRule _ _ t) = f t
|
||||
where f (CFObj n _) = n
|
||||
f (CFApp _ x) = f x
|
||||
f (CFAbs _ x) = f x
|
||||
f _ = IC ""
|
||||
|
||||
-- | Checks if a symbol is a non-terminal of one of the given categories.
|
||||
catElem :: Ord c => Symbol c t -> Set c -> Bool
|
||||
catElem s cs = symbol (`Set.member` cs) (const False) s
|
||||
|
||||
-- | Check if any of the categories used on the right-hand side
|
||||
-- are in the given list of categories.
|
||||
anyUsedBy :: Eq c => [c] -> CFRule c n t -> Bool
|
||||
anyUsedBy cs (CFRule _ ss _) = any (`elem` cs) (filterCats ss)
|
||||
|
||||
mkCFTerm :: String -> CFTerm
|
||||
mkCFTerm n = CFObj (IC n) []
|
||||
|
||||
ruleIsNonRecursive :: Ord c => Set c -> CFRule c n t -> Bool
|
||||
ruleIsNonRecursive cs = noCatsInSet cs . ruleRhs
|
||||
|
||||
noCatsInSet :: Ord c => Set c -> [Symbol c t] -> Bool
|
||||
noCatsInSet cs = not . any (`catElem` cs)
|
||||
|
||||
-- | Check if all the rules are right-linear, or all the rules are
|
||||
-- left-linear, with respect to given categories.
|
||||
allXLinear :: Ord c => Set c -> [CFRule c n t] -> Bool
|
||||
allXLinear cs rs = all (isRightLinear cs) rs || all (isLeftLinear cs) rs
|
||||
|
||||
-- | Checks if a context-free rule is right-linear.
|
||||
isRightLinear :: Ord c =>
|
||||
Set c -- ^ The categories to consider
|
||||
-> CFRule c n t -- ^ The rule to check for right-linearity
|
||||
-> Bool
|
||||
isRightLinear cs = noCatsInSet cs . safeInit . ruleRhs
|
||||
|
||||
-- | Checks if a context-free rule is left-linear.
|
||||
isLeftLinear :: Ord c =>
|
||||
Set c -- ^ The categories to consider
|
||||
-> CFRule c n t -- ^ The rule to check for left-linearity
|
||||
-> Bool
|
||||
isLeftLinear cs = noCatsInSet cs . drop 1 . ruleRhs
|
||||
|
||||
prCFRules :: CFRules -> String
|
||||
prCFRules = unlines . map prRule . allRules
|
||||
where
|
||||
prRule r = lhsCat r ++ " --> " ++ unwords (map prSym (ruleRhs r))
|
||||
prSym = symbol id (\t -> "\""++ t ++"\"")
|
||||
Reference in New Issue
Block a user