Use LCLR algorithm for eliminating left-recursion, with lambda terms in SISR for getting trees right.

This commit is contained in:
bringert
2006-12-20 20:10:15 +00:00
parent c7df9f4167
commit f9621483a0
14 changed files with 541 additions and 308 deletions

View File

@@ -76,7 +76,7 @@ makeRegular g = groupProds $ concatMap trSet (mutRecCats True g)
| otherwise = concatMap handleCat csl
where csl = Set.toList cs
rs = catSetRules g cs
handleCat c = [CFRule c' [] (mkName (c++"-empty"))] -- introduce A' -> e
handleCat c = [CFRule c' [] (mkCFTerm (c++"-empty"))] -- introduce A' -> e
++ concatMap (makeRightLinearRules c) (catRules g c)
where c' = newCat c
makeRightLinearRules b' (CFRule c ss n) =

View File

@@ -22,7 +22,7 @@ module GF.Speech.PrJSGF (jsgfPrinter) where
import GF.Conversion.Types
import GF.Data.Utilities
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..))
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..), filterCats)
import GF.Infra.Ident
import GF.Infra.Print
import GF.Infra.Option
@@ -31,6 +31,7 @@ import GF.Speech.SISR
import GF.Speech.SRG
import GF.Speech.RegExp
import Data.Char
import Data.List
import Debug.Trace
@@ -45,7 +46,7 @@ jsgfPrinter name start opts sisr probs cfg = prJSGF srg sisr ""
prJSGF :: SRG -> Maybe SISRFormat -> ShowS
prJSGF srg@(SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}) sisr
= header . nl
= trace (show srg) $ header . nl
. mainCat . nl
. unlinesS topCatRules . nl
. unlinesS (map prRule rs)
@@ -58,12 +59,17 @@ prJSGF srg@(SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs}
. rule True "MAIN" [prCat start]
prRule (SRGRule cat origCat rhs) =
comment origCat
. rule False cat (map prAlt (ebnfSRGAlts rhs))
-- . rule False cat (map prAlt (ebnfSRGAlts rhs))
. rule False cat (map prAlt rhs)
-- FIXME: use the probability
prAlt (EBnfSRGAlt mp n rhs) = tag sisr (profileInitSISR n) . showChar ' '. prItem sisr rhs
-- prAlt (EBnfSRGAlt mp n rhs) = tag sisr (profileInitSISR n) . showChar ' '. prItem sisr rhs
prAlt (SRGAlt mp n rhs) = initTag . showChar ' '. prItem sisr n rhs . tag sisr (profileFinalSISR n)
where initTag | null (t "") = id
| otherwise = showString "<NULL>" . showChar ' ' . t
where t = tag sisr (profileInitSISR n)
topCatRules = [rule True (catFormId tc) (map (it tc) cs) | (tc,cs) <- srgTopCats srg]
where it i c = prCat c . tag sisr [(EThis :. catFieldId i) := (ERef c)]
where it i c = prCat c . tag sisr (topCatSISR (catFieldId i) c)
catFormId :: String -> String
catFormId = (++ "_cat")
@@ -74,6 +80,7 @@ catFieldId = (++ "_field")
prCat :: SRGCat -> ShowS
prCat c = showChar '<' . showString c . showChar '>'
{-
prItem :: Maybe SISRFormat -> EBnfSRGItem -> ShowS
prItem sisr = f 1
where
@@ -86,16 +93,26 @@ prItem sisr = f 1
f p (REConcat xs) = (if p >= 3 then paren else id) (unwordsS (map (f 2) xs))
f p (RERepeat x) = f 3 x . showString "*"
f _ (RESymbol s) = prSymbol sisr s
-}
prSymbol :: Maybe SISRFormat -> Symbol SRGNT Token -> ShowS
prSymbol sisr (Cat n@(c,_)) = prCat c . tag sisr (catSISR n)
prSymbol _ (Tok t) | all isPunct (prt t) = id -- removes punctuation
| otherwise = prtS t -- FIXME: quote if there is whitespace or odd chars
prItem :: Maybe SISRFormat -> CFTerm -> [Symbol SRGNT Token] -> ShowS
prItem _ _ [] = showString "<NULL>"
prItem sisr cn ss = paren $ unwordsS $ map (prSymbol sisr cn) ss
tag :: Maybe SISRFormat -> [SISRExpr] -> ShowS
prSymbol :: Maybe SISRFormat -> CFTerm -> Symbol SRGNT Token -> ShowS
prSymbol sisr cn (Cat n@(c,_)) = prCat c . tag sisr (catSISR cn n)
prSymbol _ cn (Tok t) | all isPunct (prt t) = id -- removes punctuation
| otherwise = prtS t -- FIXME: quote if there is whitespace or odd chars
tag :: Maybe SISRFormat -> (SISRFormat -> SISRTag) -> ShowS
tag Nothing _ = id
tag _ [] = id
tag (Just fmt) t = showString "{" . showString (prSISR fmt t) . showString "}"
tag (Just fmt) t = case t fmt of
[] -> id
ts -> showString "{" . showString (e $ prSISR ts) . showString "}"
where e [] = []
e ('}':xs) = '\\':'}':e xs
e ('\n':xs) = ' ' : e (dropWhile isSpace xs)
e (x:xs) = x:e xs
isPunct :: Char -> Bool
isPunct c = c `elem` "-_.;.,?!"

View File

@@ -22,7 +22,7 @@ import GF.Infra.Ident
import GF.Today
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..), forestName)
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..), forestName, filterCats)
import GF.Conversion.Types
import GF.Infra.Print
import GF.Infra.Option
@@ -59,17 +59,18 @@ prSrgsXml sisr srg@(SRG{grammarName=name,startCat=start,
++ topCatRules
++ concatMap ruleToXML rs
ruleToXML (SRGRule cat origCat alts) =
comments ["Category " ++ origCat] ++ [rule cat (prRhs $ ebnfSRGAlts alts)]
comments ["Category " ++ origCat] ++ [rule cat (prRhs alts)]
prRhs rhss = [oneOf (map (mkProd sisr) rhss)]
-- externally visible rules for each of the GF categories
topCatRules = [topRule tc [oneOf (map (it tc) cs)] | (tc,cs) <- srgTopCats srg]
where it i c = Tag "item" [] [Tag "ruleref" [("uri","#" ++ c)] [],
tag sisr [(EThis :. catFieldId i) := (ERef c)]]
tag sisr (topCatSISR (catFieldId i) c)]
topRule i is = Tag "rule" [("id",catFormId i),("scope","public")] is
rule :: String -> [XML] -> XML
rule i = Tag "rule" [("id",i)]
{-
mkProd :: Maybe SISRFormat -> EBnfSRGAlt -> XML
mkProd sisr (EBnfSRGAlt mp n rhs) = Tag "item" w (t ++ xs)
where xs = [mkItem sisr rhs]
@@ -83,17 +84,29 @@ mkItem sisr = f
f (REConcat xs) = Tag "item" [] (map f xs)
f (RERepeat x) = Tag "item" [("repeat","0-")] [f x]
f (RESymbol s) = symItem sisr s
-}
symItem :: Maybe SISRFormat -> Symbol SRGNT Token -> XML
symItem sisr (Cat n@(c,_)) =
Tag "item" [] [Tag "ruleref" [("uri","#" ++ c)] [], tag sisr (catSISR n)]
symItem _ (Tok t) = Tag "item" [] [Data (showToken t)]
mkProd :: Maybe SISRFormat -> SRGAlt -> XML
mkProd sisr (SRGAlt mp n rhs) = Tag "item" w (ti ++ xs ++ tf)
where xs = mkItem sisr n rhs
w = maybe [] (\p -> [("weight", show p)]) mp
ti = [tag sisr (profileInitSISR n)]
tf = [tag sisr (profileFinalSISR n)]
tag :: Maybe SISRFormat -> [SISRExpr] -> XML
mkItem :: Maybe SISRFormat -> CFTerm -> [Symbol SRGNT Token] -> [XML]
mkItem sisr cn ss = map (symItem sisr cn) ss
symItem :: Maybe SISRFormat -> CFTerm -> Symbol SRGNT Token -> XML
symItem sisr cn (Cat n@(c,_)) =
Tag "item" [] $ [Tag "ruleref" [("uri","#" ++ c)] [], tag sisr (catSISR cn n)]
symItem _ _ (Tok t) = Tag "item" [] [Data (showToken t)]
tag :: Maybe SISRFormat -> (SISRFormat -> SISRTag) -> XML
tag Nothing _ = Empty
tag _ [] = Empty
tag (Just fmt) ts = Tag "tag" [] [Data (prSISR fmt ts)]
tag (Just fmt) t = case t fmt of
[] -> Empty
ts -> Tag "tag" [] [Data (prSISR ts)]
catFormId :: String -> String
catFormId = (++ "_cat")

View File

@@ -10,8 +10,8 @@
--
-----------------------------------------------------------------------------
module GF.Speech.SISR (SISRFormat(..), SISRExpr(..), prSISR,
profileInitSISR, catSISR) where
module GF.Speech.SISR (SISRFormat(..), SISRTag, prSISR,
topCatSISR, profileInitSISR, catSISR, profileFinalSISR) where
import Data.List
@@ -20,11 +20,11 @@ import GF.Data.Utilities
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..), NameProfile(..), Profile(..), forestName)
import GF.Infra.Ident
import GF.Speech.TransformCFG
import GF.Speech.SRG
infixl 8 :.
infixr 1 :=
import qualified GF.JavaScript.AbsJS as JS
import qualified GF.JavaScript.PrintJS as JS
data SISRFormat =
-- SISR Working draft 1 April 2003
@@ -32,35 +32,57 @@ data SISRFormat =
SISROld
deriving Show
data SISRExpr = SISRExpr := SISRExpr
| EThis
| SISRExpr :. String
| ERef String
| EStr String
| EApp SISRExpr [SISRExpr]
| ENew String [SISRExpr]
deriving Show
type SISRTag = [JS.Expr]
prSISR :: SISRFormat -> [SISRExpr] -> String
prSISR fmt = join "; " . map f
where
f e =
case e of
x := y -> f x ++ "=" ++ f y
EThis -> "$"
x :. y -> f x ++ "." ++ y
ERef y -> "$" ++ y
EStr s -> show s
EApp x ys -> f x ++ "(" ++ concat (intersperse "," (map f ys)) ++ ")"
ENew n ys -> "new " ++ n ++ "(" ++ concat (intersperse "," (map f ys)) ++ ")"
profileInitSISR :: Name -> [SISRExpr]
profileInitSISR (Name f prs) =
[(EThis :. "name") := (EStr (prIdent f))] ++
[(EThis :. ("arg" ++ show n)) := (EStr (argInit (prs!!n)))
| n <- [0..length prs-1]]
where argInit (Unify _) = "?"
argInit (Constant f) = maybe "?" prIdent (forestName f)
prSISR :: SISRTag -> String
prSISR = JS.printTree
catSISR :: SRGNT -> [SISRExpr]
catSISR (c,slots) = [(EThis :. ("arg" ++ show s)) := (ERef c) | s <- slots]
topCatSISR :: String -> String -> SISRFormat -> SISRTag
topCatSISR i c fmt = [field (fmtOut fmt) i `ass` fmtRef fmt c]
profileInitSISR :: CFTerm -> SISRFormat -> SISRTag
profileInitSISR t fmt
| null (usedChildren t) = []
| otherwise = [children `ass` JS.ENew (JS.Ident "Array") []]
usedChildren :: CFTerm -> [Int]
usedChildren (CFObj _ ts) = foldr union [] (map usedChildren ts)
usedChildren (CFAbs _ x) = usedChildren x
usedChildren (CFApp x y) = usedChildren x `union` usedChildren y
usedChildren (CFRes i) = [i]
usedChildren _ = []
catSISR :: CFTerm -> SRGNT -> SISRFormat -> SISRTag
catSISR t (c,i) fmt
| i `elem` usedChildren t =
[JS.EIndex children (JS.EInt (fromIntegral i)) `ass` fmtRef fmt c]
| otherwise = []
profileFinalSISR :: CFTerm -> SISRFormat -> SISRTag
profileFinalSISR term fmt = [fmtOut fmt `ass` f term]
where f (CFObj n ts) =
JS.ESeq $ [ret `ass` JS.ENew (JS.Ident "Object") [],
field ret "name" `ass` JS.EStr (prIdent n)]
++ [field ret ("arg"++show i) `ass` f t
| (i,t) <- zip [0..] ts ]
++ [ret]
where ret = JS.EVar (JS.Ident "ret")
f (CFAbs v x) = JS.EFun [var v] [JS.SReturn (f x)]
f (CFApp x y) = JS.ECall (f x) [f y]
f (CFRes i) = JS.EIndex children (JS.EInt (fromIntegral i))
f (CFVar v) = JS.EVar (var v)
f (CFConst s) = JS.EStr s
fmtOut SISROld = JS.EVar (JS.Ident "$")
fmtRef SISROld c = JS.EVar (JS.Ident ("$" ++ c))
children = JS.EVar (JS.Ident "c")
var v = JS.Ident ("x" ++ show v)
field x y = JS.EMember x (JS.Ident y)
ass = JS.EAssign

View File

@@ -19,12 +19,12 @@
-----------------------------------------------------------------------------
module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..),
SRGCat, SRGNT,
SRGCat, SRGNT, CFTerm,
makeSimpleSRG, makeSRG
, lookupFM_, prtS
, topDownFilter, cfgCatToGFCat, srgTopCats
, EBnfSRGAlt(..), EBnfSRGItem
, ebnfSRGAlts
--, EBnfSRGAlt(..), EBnfSRGItem
--, ebnfSRGAlts
) where
import GF.Data.Operations
@@ -64,13 +64,13 @@ data SRGRule = SRGRule SRGCat String [SRGAlt] -- ^ SRG category name, original c
deriving (Eq,Show)
-- | maybe a probability, a rule name and a list of symbols
data SRGAlt = SRGAlt (Maybe Double) Name [Symbol SRGNT Token]
data SRGAlt = SRGAlt (Maybe Double) CFTerm [Symbol SRGNT Token]
deriving (Eq,Show)
type SRGCat = String
-- | An SRG non-terminal. Category name and slots which it fills in.
type SRGNT = (SRGCat, [Int])
-- | An SRG non-terminal. Category name and its number in the profile.
type SRGNT = (SRGCat, Int)
-- | SRG category name and original name
type CatName = (SRGCat,String)
@@ -129,17 +129,13 @@ cfgRulesToSRGRule names probs rs@(r:_) = SRGRule cat origCat rhs
origCat = lhsCat r
cat = lookupFM_ names origCat
rhs = nub $ map ruleToAlt rs
ruleToAlt r@(CFRule c ss n@(Name _ prs))
ruleToAlt r@(CFRule c ss n)
= SRGAlt (ruleProb probs r) n (mkSRGSymbols 0 ss)
where
mkSRGSymbols _ [] = []
mkSRGSymbols i (Cat c:ss) = Cat (c',slots) : mkSRGSymbols (i+1) ss
where c' = lookupFM_ names c
slots = [x | x <- [0..length prs-1], inProfile i (prs!!x)]
mkSRGSymbols i (Cat c:ss) = Cat (renameCat c,0) : mkSRGSymbols (i+1) ss
mkSRGSymbols i (Tok t:ss) = Tok t : mkSRGSymbols i ss
inProfile :: Int -> Profile a -> Bool
inProfile x (Unify xs) = x `elem` xs
inProfile _ (Constant _) = False
renameCat = lookupFM_ names
ruleProb :: Maybe Probs -> CFRule_ -> Maybe Double
ruleProb mp r = mp >>= \probs -> lookupProb probs (ruleFun r)
@@ -182,6 +178,7 @@ srgTopCats srg = buildMultiMap [(oc, cat) | SRGRule cat origCat _ <- rules srg,
-- * Size-optimized EBNF SRGs
--
{-
data EBnfSRGAlt = EBnfSRGAlt (Maybe Double) Name EBnfSRGItem
deriving (Eq,Show)
@@ -204,6 +201,7 @@ addString xs fa = addFinalState (last sts0) $ newTransitions ts fa'
sts0 = startState fa : sts1
sts1 = map fst ss
ts = zip3 sts0 sts1 xs
-}
--
-- * Utilities for building and printing SRGs

View File

@@ -27,7 +27,7 @@ import GF.Conversion.Types
import GF.Data.Utilities
import GF.Formalism.CFG
import GF.Formalism.Utilities (Symbol(..), mapSymbol, filterCats, symbol,
NameProfile(..), name2fun)
NameProfile(..), Profile(..), name2fun, forestName)
import GF.Infra.Ident
import GF.Infra.Option
import GF.Infra.Print
@@ -44,19 +44,34 @@ import Data.Set (Set)
import qualified Data.Set as Set
-- | not very nice to replace the structured CFCat type with a simple string
type CFRule_ = CFRule Cat_ Name Token
-- not very nice to replace the structured CFCat type with a simple string
type CFRule_ = CFRule Cat_ CFTerm Token
data CFTerm
= CFObj Fun [CFTerm]
| CFAbs Int CFTerm
| CFApp CFTerm CFTerm
| CFRes Int
| CFVar Int
| CFConst String
deriving (Eq,Show)
type Cat_ = String
type CFSymbol_ = Symbol Cat_ Token
type CFRules = [(Cat_,[CFRule_])]
cfgToCFRules :: CGrammar -> CFRules
cfgToCFRules cfg = groupProds [CFRule (catToString c) (map symb r) n | CFRule c r n <- cfg]
cfgToCFRules cfg =
groupProds [CFRule (catToString c) (map symb r) (nameToTerm n)
| CFRule c r n <- cfg]
where symb = mapSymbol catToString id
-- symb (Cat c) = Cat (catToString c)
-- symb (Tok t) = Tok t
catToString = prt
nameToTerm (Name f prs) = CFObj f (map profileToTerm prs)
profileToTerm (Unify []) = CFConst "?"
profileToTerm (Unify xs) = CFRes (last xs) -- FIXME: unify
profileToTerm (Constant f) = CFConst (maybe "?" prIdent (forestName f))
-- | Remove productions which use categories which have no productions
removeEmptyCats :: CFRules -> CFRules
@@ -80,35 +95,44 @@ removeIdenticalRules g = [(c,sortNubBy cmpRules rs) | (c,rs) <- g]
-- * Removing left recursion
{-
-- The LC_LR algorithm from
-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
-- Not used since I haven't figured out how to make proper profiles. /Bjorn
removeLeftRecursion :: Cat_ -> CFRules -> CFRules
removeLeftRecursion start gr
= groupProds $ concat [scheme1, scheme2, scheme3, scheme4]
where
scheme1 = [CFRule a [x,Cat a_x] (Name (IC "phony1") []) |
scheme1 = [CFRule a [x,Cat a_x] n' |
a <- retainedLeftRecursive,
x <- properLeftCornersOf a,
not (isLeftRecursive x),
let a_x = mkCat (Cat a) x]
scheme2 = [CFRule a_x (beta++[Cat a_b]) (Name (IC "phony2") []) |
let a_x = mkCat (Cat a) x,
let n' = symbol (\_ -> CFApp (CFRes 1) (CFRes 0))
(\_ -> CFRes 0) x]
scheme2 = [CFRule a_x (beta++[Cat a_b]) n' |
a <- retainedLeftRecursive,
b@(Cat b') <- properLeftCornersOf a,
isLeftRecursive b,
CFRule _ (x:beta) n <- catRules gr b',
let a_x = mkCat (Cat a) x,
let a_b = mkCat (Cat a) b]
scheme3 = [CFRule a_x beta n | -- FIXME: remove 0 from all profile elements
let a_b = mkCat (Cat a) b,
let i = length $ filterCats beta,
let n' = symbol (\_ -> CFAbs 1 (CFApp (CFRes i) (shiftTerm n)))
(\_ -> CFApp (CFRes i) n) x]
scheme3 = [CFRule a_x beta n' |
a <- retainedLeftRecursive,
x <- properLeftCornersOf a,
CFRule _ (x':beta) n <- catRules gr a,
x == x',
let a_x = mkCat (Cat a) x]
let a_x = mkCat (Cat a) x,
let n' = symbol (\_ -> CFAbs 1 (shiftTerm n))
(\_ -> n) x]
scheme4 = catSetRules gr $ Set.fromList $ filter (not . isLeftRecursive . Cat) cats
shiftTerm :: CFTerm -> CFTerm
shiftTerm (CFObj f ts) = CFObj f (map shiftTerm ts)
shiftTerm (CFRes 0) = CFVar 1
shiftTerm t = t
cats = allCats gr
rules = ungroupProds gr
@@ -121,7 +145,6 @@ removeLeftRecursion start gr
leftRecursive = reflexiveElements properLeftCorner
isLeftRecursive = (`Set.member` leftRecursive)
-- FIXME: include start cat
retained = start `Set.insert`
Set.fromList [a | (c,rs) <- gr, not (isLeftRecursive (Cat c)),
r <- rs, Cat a <- ruleRhs r]
@@ -131,9 +154,9 @@ removeLeftRecursion start gr
mkCat :: CFSymbol_ -> CFSymbol_ -> Cat_
mkCat x y = showSymbol x ++ "-" ++ showSymbol y
where showSymbol = symbol id ("$"++) -- FIXME !!!!!
where showSymbol = symbol id show
-}
{-
-- Paull's algorithm, see
-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
@@ -176,12 +199,13 @@ isDirectLeftRecursive :: CFRule_ -> Bool
isDirectLeftRecursive (CFRule c (Cat c':_) _) = c == c'
isDirectLeftRecursive _ = False
-}
-- * Removing cycles
removeCycles :: CFRules -> CFRules
removeCycles = groupProds . removeCycles_ . ungroupProds
where removeCycles_ rs = [r | r@(CFRule c rhs n) <- rs, rhs /= [Cat c]]
where removeCycles_ rs = [r | r@(CFRule c rhs _) <- rs, rhs /= [Cat c]]
-- | Get the sets of mutually recursive non-terminals for a grammar.
@@ -221,7 +245,11 @@ ruleRhs :: CFRule c n t -> [Symbol c t]
ruleRhs (CFRule _ ss _) = ss
ruleFun :: CFRule_ -> Fun
ruleFun (CFRule _ _ n) = name2fun n
ruleFun (CFRule _ _ t) = f t
where f (CFObj n _) = n
f (CFApp _ x) = f x
f (CFAbs _ x) = f x
f _ = IC ""
-- | Checks if a symbol is a non-terminal of one of the given categories.
catElem :: Symbol Cat_ t -> Set Cat_ -> Bool
@@ -232,7 +260,5 @@ catElem s cs = symbol (`Set.member` cs) (const False) s
anyUsedBy :: Eq c => [c] -> CFRule c n t -> Bool
anyUsedBy cs (CFRule _ ss _) = any (`elem` cs) (filterCats ss)
mkName :: String -> Name
mkName n = Name (IC n) []
mkCFTerm :: String -> CFTerm
mkCFTerm n = CFObj (IC n) []