From 0feb386691bb82e13c3dcc01e27ae33d8865f2ca Mon Sep 17 00:00:00 2001 From: hallgren Date: Wed, 27 Feb 2013 20:59:43 +0000 Subject: [PATCH] Faster regular expression pattern matching in the grammar compiler. The sequence operator (x+y) was implemented by splitting the string to be matched at all positions and trying to match the parts against the two subpatterns. To reduce the number of splits, we now estimate the minimum and maximum length of the string that the subpatterns could match. For common cases, where one of the subpatterns is a string of known length, like in (x+"y") or (x + ("a"|"o"|"u"|"e")+"y"), only one split will be tried. --- .../GF/Compile/Compute/ConcreteNew.hs | 4 +- src/compiler/GF/Grammar/Grammar.hs | 6 ++- src/compiler/GF/Grammar/Macros.hs | 4 ++ src/compiler/GF/Grammar/PatternMatch.hs | 45 ++++++++++++++++--- src/compiler/GF/Grammar/Printer.hs | 1 + 5 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/compiler/GF/Compile/Compute/ConcreteNew.hs b/src/compiler/GF/Compile/Compute/ConcreteNew.hs index 6be113d4b..c853458f8 100644 --- a/src/compiler/GF/Compile/Compute/ConcreteNew.hs +++ b/src/compiler/GF/Compile/Compute/ConcreteNew.hs @@ -8,7 +8,7 @@ module GF.Compile.Compute.ConcreteNew import GF.Grammar hiding (Env, VGen, VApp, VRecType) import GF.Grammar.Lookup(lookupResDefLoc,allParamValues) import GF.Grammar.Predef(cPredef,cErrorType,cTok,cStr) -import GF.Grammar.PatternMatch(matchPattern) +import GF.Grammar.PatternMatch(matchPattern,measurePatt) import GF.Grammar.Lockfield(unlockRecord,lockLabel,isLockLabel,lockRecType) import GF.Compile.Compute.Value hiding (Predefined(..)) import GF.Compile.Compute.Predef(predef,predefName,delta) @@ -320,7 +320,7 @@ valueTable env i cs = TWild _ -> True _ -> False - valueCase (p,t) = do p' <- inlinePattMacro p + valueCase (p,t) = do p' <- measurePatt # inlinePattMacro p let pvs = pattVars p' vt <- value (extend pvs env) t return (p', \ vs -> Bind $ \ bs -> vt (push' p' bs pvs vs)) diff --git a/src/compiler/GF/Grammar/Grammar.hs b/src/compiler/GF/Grammar/Grammar.hs index 218a2bd0b..c59cd809e 100644 --- a/src/compiler/GF/Grammar/Grammar.hs +++ b/src/compiler/GF/Grammar/Grammar.hs @@ -430,6 +430,7 @@ data Term = | Error String -- ^ error values returned by Predef.error deriving (Show, Eq, Ord) +-- | Patterns data Patt = PC Ident [Patt] -- ^ constructor pattern: @C p1 ... pn@ @C@ | PP QIdent [Patt] -- ^ package constructor pattern: @P.C p1 ... pn@ @P.C@ @@ -450,14 +451,17 @@ data Patt = | PNeg Patt -- ^ negated pattern: -p | PAlt Patt Patt -- ^ disjunctive pattern: p1 | p2 | PSeq Patt Patt -- ^ sequence of token parts: p + q + | PMSeq MPatt MPatt -- ^ sequence of token parts: p + q | PRep Patt -- ^ repetition of token part: p* | PChar -- ^ string of length one: ? | PChars [Char] -- ^ character list: ["aeiou"] | PMacro Ident -- #p | PM QIdent -- #m.p - deriving (Show, Eq, Ord) +-- | Measured pattern (paired with the min & max matching length) +type MPatt = ((Int,Int),Patt) + -- | to guide computation and type checking of tables data TInfo = TRaw -- ^ received from parser; can be anything diff --git a/src/compiler/GF/Grammar/Macros.hs b/src/compiler/GF/Grammar/Macros.hs index 97146b197..bd7de5db4 100644 --- a/src/compiler/GF/Grammar/Macros.hs +++ b/src/compiler/GF/Grammar/Macros.hs @@ -483,6 +483,8 @@ composOp co trm = ImplArg t -> liftM ImplArg (co t) _ -> return trm -- covers K, Vr, Cn, Sort, EPatt +composSafePattOp op = runIdentity . composPattOp (return . op) + composPattOp :: Monad m => (Patt -> m Patt) -> Patt -> m Patt composPattOp op patt = case patt of @@ -495,6 +497,7 @@ composPattOp op patt = PNeg p -> liftM PNeg (op p) PAlt p1 p2 -> liftM2 PAlt (op p1) (op p2) PSeq p1 p2 -> liftM2 PSeq (op p1) (op p2) + PMSeq (_,p1) (_,p2) -> liftM2 PSeq (op p1) (op p2) -- information loss PRep p -> liftM PRep (op p) _ -> return patt -- covers cases without subpatterns @@ -545,6 +548,7 @@ collectPattOp op patt = PNeg p -> op p PAlt p1 p2 -> op p1++op p2 PSeq p1 p2 -> op p1++op p2 + PMSeq (_,p1) (_,p2) -> op p1++op p2 PRep p -> op p _ -> [] -- covers cases without subpatterns diff --git a/src/compiler/GF/Grammar/PatternMatch.hs b/src/compiler/GF/Grammar/PatternMatch.hs index 12bd29c8c..8ea388f76 100644 --- a/src/compiler/GF/Grammar/PatternMatch.hs +++ b/src/compiler/GF/Grammar/PatternMatch.hs @@ -14,7 +14,8 @@ module GF.Grammar.PatternMatch (matchPattern, testOvershadow, - findMatch + findMatch, + measurePatt ) where import GF.Data.Operations @@ -117,6 +118,7 @@ tryMatch (p,t) = do _ -> Bad (render (text "no match with negative pattern" <+> ppPatt Unqualified 0 p)) (PSeq p1 p2, ([],K s, [])) -> matchPSeq p1 p2 s + (PMSeq mp1 mp2, ([],K s, [])) -> matchPMSeq mp1 mp2 s (PRep p1, ([],K s, [])) -> checks [ trym (foldr (const (PSeq p1)) (PString "") @@ -129,13 +131,18 @@ tryMatch (p,t) = do _ -> Bad (render (text "no match in case expr for" <+> ppTerm Unqualified 0 t)) -matchPSeq p1 p2 s = - do let min1 = 0 --minLength p1 - min2 = length s -- -minLength p2 - cuts = [splitAt n s | n <- [min1 .. min2]] +matchPMSeq (m1,p1) (m2,p2) s = matchPSeq' m1 p1 m2 p2 s +--matchPSeq p1 p2 s = matchPSeq' (0,maxBound::Int) p1 (0,maxBound::Int) p2 s +matchPSeq p1 p2 s = matchPSeq' (lengthBounds p1) p1 (lengthBounds p2) p2 s + +matchPSeq' b1@(min1,max1) p1 b2@(min2,max2) p2 s = + do let n = length s + lo = min1 `max` (n-max2) + hi = (n-min2) `min` max1 + cuts = [splitAt i s | i <- [lo..hi]] matches <- checks [mapM tryMatch [(p1,K s1),(p2,K s2)] | (s1,s2) <- cuts] return (concat matches) -{- + -- | Estimate the minimal length of the string that a pattern will match minLength p = case p of @@ -147,7 +154,31 @@ minLength p = PAs x p' -> minLength p' PT t p' -> minLength p' _ -> 0 -- safe underestimate --} + +-- | Estimate the maximal length of the string that a pattern will match +maxLength = maybe maxBound id . maxl -- safe overestimate + where + maxl p = + case p of + PString s -> Just (length s) + PSeq p1 p2 -> liftM2 (+) (maxl p1) (maxl p2) + PAlt p1 p2 -> liftM2 max (maxl p1) (maxl p2) + PChar -> Just 1 + PChars _ -> Just 1 + PAs x p' -> maxl p' + PT t p' -> maxl p' + _ -> Nothing -- unknown length + +lengthBounds p = (minLength p,maxLength p) + +mPatt p = (lengthBounds p,measurePatt p) + +measurePatt p = + case p of + PSeq p1 p2 -> PMSeq (mPatt p1) (mPatt p2) + _ -> composSafePattOp measurePatt p + + isInConstantForm :: Term -> Bool isInConstantForm trm = case trm of Cn _ -> True diff --git a/src/compiler/GF/Grammar/Printer.hs b/src/compiler/GF/Grammar/Printer.hs index 276f2c9c2..0d9d41b7b 100644 --- a/src/compiler/GF/Grammar/Printer.hs +++ b/src/compiler/GF/Grammar/Printer.hs @@ -238,6 +238,7 @@ ppCase q (p,e) = ppPatt q 0 p <+> text "=>" <+> ppTerm q 0 e ppPatt q d (PAlt p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '|' <+> ppPatt q 1 p2) ppPatt q d (PSeq p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '+' <+> ppPatt q 1 p2) +ppPatt q d (PMSeq (_,p1) (_,p2)) = prec d 0 (ppPatt q 0 p1 <+> char '+' <+> ppPatt q 1 p2) ppPatt q d (PC f ps) = if null ps then ppIdent f else prec d 1 (ppIdent f <+> hsep (map (ppPatt q 3) ps))