forked from GitHub/gf-core
Faster regular expression pattern matching in the grammar compiler.
The sequence operator (x+y) was implemented by splitting the string to be
matched at all positions and trying to match the parts against the two
subpatterns. To reduce the number of splits, we now estimate the minimum and
maximum length of the string that the subpatterns could match. For common
cases, where one of the subpatterns is a string of known length, like
in (x+"y") or (x + ("a"|"o"|"u"|"e")+"y"), only one split will be tried.
This commit is contained in:
@@ -8,7 +8,7 @@ module GF.Compile.Compute.ConcreteNew
|
|||||||
import GF.Grammar hiding (Env, VGen, VApp, VRecType)
|
import GF.Grammar hiding (Env, VGen, VApp, VRecType)
|
||||||
import GF.Grammar.Lookup(lookupResDefLoc,allParamValues)
|
import GF.Grammar.Lookup(lookupResDefLoc,allParamValues)
|
||||||
import GF.Grammar.Predef(cPredef,cErrorType,cTok,cStr)
|
import GF.Grammar.Predef(cPredef,cErrorType,cTok,cStr)
|
||||||
import GF.Grammar.PatternMatch(matchPattern)
|
import GF.Grammar.PatternMatch(matchPattern,measurePatt)
|
||||||
import GF.Grammar.Lockfield(unlockRecord,lockLabel,isLockLabel,lockRecType)
|
import GF.Grammar.Lockfield(unlockRecord,lockLabel,isLockLabel,lockRecType)
|
||||||
import GF.Compile.Compute.Value hiding (Predefined(..))
|
import GF.Compile.Compute.Value hiding (Predefined(..))
|
||||||
import GF.Compile.Compute.Predef(predef,predefName,delta)
|
import GF.Compile.Compute.Predef(predef,predefName,delta)
|
||||||
@@ -320,7 +320,7 @@ valueTable env i cs =
|
|||||||
TWild _ -> True
|
TWild _ -> True
|
||||||
_ -> False
|
_ -> False
|
||||||
|
|
||||||
valueCase (p,t) = do p' <- inlinePattMacro p
|
valueCase (p,t) = do p' <- measurePatt # inlinePattMacro p
|
||||||
let pvs = pattVars p'
|
let pvs = pattVars p'
|
||||||
vt <- value (extend pvs env) t
|
vt <- value (extend pvs env) t
|
||||||
return (p', \ vs -> Bind $ \ bs -> vt (push' p' bs pvs vs))
|
return (p', \ vs -> Bind $ \ bs -> vt (push' p' bs pvs vs))
|
||||||
|
|||||||
@@ -430,6 +430,7 @@ data Term =
|
|||||||
| Error String -- ^ error values returned by Predef.error
|
| Error String -- ^ error values returned by Predef.error
|
||||||
deriving (Show, Eq, Ord)
|
deriving (Show, Eq, Ord)
|
||||||
|
|
||||||
|
-- | Patterns
|
||||||
data Patt =
|
data Patt =
|
||||||
PC Ident [Patt] -- ^ constructor pattern: @C p1 ... pn@ @C@
|
PC Ident [Patt] -- ^ constructor pattern: @C p1 ... pn@ @C@
|
||||||
| PP QIdent [Patt] -- ^ package constructor pattern: @P.C p1 ... pn@ @P.C@
|
| PP QIdent [Patt] -- ^ package constructor pattern: @P.C p1 ... pn@ @P.C@
|
||||||
@@ -450,14 +451,17 @@ data Patt =
|
|||||||
| PNeg Patt -- ^ negated pattern: -p
|
| PNeg Patt -- ^ negated pattern: -p
|
||||||
| PAlt Patt Patt -- ^ disjunctive pattern: p1 | p2
|
| PAlt Patt Patt -- ^ disjunctive pattern: p1 | p2
|
||||||
| PSeq Patt Patt -- ^ sequence of token parts: p + q
|
| PSeq Patt Patt -- ^ sequence of token parts: p + q
|
||||||
|
| PMSeq MPatt MPatt -- ^ sequence of token parts: p + q
|
||||||
| PRep Patt -- ^ repetition of token part: p*
|
| PRep Patt -- ^ repetition of token part: p*
|
||||||
| PChar -- ^ string of length one: ?
|
| PChar -- ^ string of length one: ?
|
||||||
| PChars [Char] -- ^ character list: ["aeiou"]
|
| PChars [Char] -- ^ character list: ["aeiou"]
|
||||||
| PMacro Ident -- #p
|
| PMacro Ident -- #p
|
||||||
| PM QIdent -- #m.p
|
| PM QIdent -- #m.p
|
||||||
|
|
||||||
deriving (Show, Eq, Ord)
|
deriving (Show, Eq, Ord)
|
||||||
|
|
||||||
|
-- | Measured pattern (paired with the min & max matching length)
|
||||||
|
type MPatt = ((Int,Int),Patt)
|
||||||
|
|
||||||
-- | to guide computation and type checking of tables
|
-- | to guide computation and type checking of tables
|
||||||
data TInfo =
|
data TInfo =
|
||||||
TRaw -- ^ received from parser; can be anything
|
TRaw -- ^ received from parser; can be anything
|
||||||
|
|||||||
@@ -483,6 +483,8 @@ composOp co trm =
|
|||||||
ImplArg t -> liftM ImplArg (co t)
|
ImplArg t -> liftM ImplArg (co t)
|
||||||
_ -> return trm -- covers K, Vr, Cn, Sort, EPatt
|
_ -> return trm -- covers K, Vr, Cn, Sort, EPatt
|
||||||
|
|
||||||
|
composSafePattOp op = runIdentity . composPattOp (return . op)
|
||||||
|
|
||||||
composPattOp :: Monad m => (Patt -> m Patt) -> Patt -> m Patt
|
composPattOp :: Monad m => (Patt -> m Patt) -> Patt -> m Patt
|
||||||
composPattOp op patt =
|
composPattOp op patt =
|
||||||
case patt of
|
case patt of
|
||||||
@@ -495,6 +497,7 @@ composPattOp op patt =
|
|||||||
PNeg p -> liftM PNeg (op p)
|
PNeg p -> liftM PNeg (op p)
|
||||||
PAlt p1 p2 -> liftM2 PAlt (op p1) (op p2)
|
PAlt p1 p2 -> liftM2 PAlt (op p1) (op p2)
|
||||||
PSeq p1 p2 -> liftM2 PSeq (op p1) (op p2)
|
PSeq p1 p2 -> liftM2 PSeq (op p1) (op p2)
|
||||||
|
PMSeq (_,p1) (_,p2) -> liftM2 PSeq (op p1) (op p2) -- information loss
|
||||||
PRep p -> liftM PRep (op p)
|
PRep p -> liftM PRep (op p)
|
||||||
_ -> return patt -- covers cases without subpatterns
|
_ -> return patt -- covers cases without subpatterns
|
||||||
|
|
||||||
@@ -545,6 +548,7 @@ collectPattOp op patt =
|
|||||||
PNeg p -> op p
|
PNeg p -> op p
|
||||||
PAlt p1 p2 -> op p1++op p2
|
PAlt p1 p2 -> op p1++op p2
|
||||||
PSeq p1 p2 -> op p1++op p2
|
PSeq p1 p2 -> op p1++op p2
|
||||||
|
PMSeq (_,p1) (_,p2) -> op p1++op p2
|
||||||
PRep p -> op p
|
PRep p -> op p
|
||||||
_ -> [] -- covers cases without subpatterns
|
_ -> [] -- covers cases without subpatterns
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,8 @@
|
|||||||
|
|
||||||
module GF.Grammar.PatternMatch (matchPattern,
|
module GF.Grammar.PatternMatch (matchPattern,
|
||||||
testOvershadow,
|
testOvershadow,
|
||||||
findMatch
|
findMatch,
|
||||||
|
measurePatt
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import GF.Data.Operations
|
import GF.Data.Operations
|
||||||
@@ -117,6 +118,7 @@ tryMatch (p,t) = do
|
|||||||
_ -> Bad (render (text "no match with negative pattern" <+> ppPatt Unqualified 0 p))
|
_ -> Bad (render (text "no match with negative pattern" <+> ppPatt Unqualified 0 p))
|
||||||
|
|
||||||
(PSeq p1 p2, ([],K s, [])) -> matchPSeq p1 p2 s
|
(PSeq p1 p2, ([],K s, [])) -> matchPSeq p1 p2 s
|
||||||
|
(PMSeq mp1 mp2, ([],K s, [])) -> matchPMSeq mp1 mp2 s
|
||||||
|
|
||||||
(PRep p1, ([],K s, [])) -> checks [
|
(PRep p1, ([],K s, [])) -> checks [
|
||||||
trym (foldr (const (PSeq p1)) (PString "")
|
trym (foldr (const (PSeq p1)) (PString "")
|
||||||
@@ -129,13 +131,18 @@ tryMatch (p,t) = do
|
|||||||
|
|
||||||
_ -> Bad (render (text "no match in case expr for" <+> ppTerm Unqualified 0 t))
|
_ -> Bad (render (text "no match in case expr for" <+> ppTerm Unqualified 0 t))
|
||||||
|
|
||||||
matchPSeq p1 p2 s =
|
matchPMSeq (m1,p1) (m2,p2) s = matchPSeq' m1 p1 m2 p2 s
|
||||||
do let min1 = 0 --minLength p1
|
--matchPSeq p1 p2 s = matchPSeq' (0,maxBound::Int) p1 (0,maxBound::Int) p2 s
|
||||||
min2 = length s -- -minLength p2
|
matchPSeq p1 p2 s = matchPSeq' (lengthBounds p1) p1 (lengthBounds p2) p2 s
|
||||||
cuts = [splitAt n s | n <- [min1 .. min2]]
|
|
||||||
|
matchPSeq' b1@(min1,max1) p1 b2@(min2,max2) p2 s =
|
||||||
|
do let n = length s
|
||||||
|
lo = min1 `max` (n-max2)
|
||||||
|
hi = (n-min2) `min` max1
|
||||||
|
cuts = [splitAt i s | i <- [lo..hi]]
|
||||||
matches <- checks [mapM tryMatch [(p1,K s1),(p2,K s2)] | (s1,s2) <- cuts]
|
matches <- checks [mapM tryMatch [(p1,K s1),(p2,K s2)] | (s1,s2) <- cuts]
|
||||||
return (concat matches)
|
return (concat matches)
|
||||||
{-
|
|
||||||
-- | Estimate the minimal length of the string that a pattern will match
|
-- | Estimate the minimal length of the string that a pattern will match
|
||||||
minLength p =
|
minLength p =
|
||||||
case p of
|
case p of
|
||||||
@@ -147,7 +154,31 @@ minLength p =
|
|||||||
PAs x p' -> minLength p'
|
PAs x p' -> minLength p'
|
||||||
PT t p' -> minLength p'
|
PT t p' -> minLength p'
|
||||||
_ -> 0 -- safe underestimate
|
_ -> 0 -- safe underestimate
|
||||||
-}
|
|
||||||
|
-- | Estimate the maximal length of the string that a pattern will match
|
||||||
|
maxLength = maybe maxBound id . maxl -- safe overestimate
|
||||||
|
where
|
||||||
|
maxl p =
|
||||||
|
case p of
|
||||||
|
PString s -> Just (length s)
|
||||||
|
PSeq p1 p2 -> liftM2 (+) (maxl p1) (maxl p2)
|
||||||
|
PAlt p1 p2 -> liftM2 max (maxl p1) (maxl p2)
|
||||||
|
PChar -> Just 1
|
||||||
|
PChars _ -> Just 1
|
||||||
|
PAs x p' -> maxl p'
|
||||||
|
PT t p' -> maxl p'
|
||||||
|
_ -> Nothing -- unknown length
|
||||||
|
|
||||||
|
lengthBounds p = (minLength p,maxLength p)
|
||||||
|
|
||||||
|
mPatt p = (lengthBounds p,measurePatt p)
|
||||||
|
|
||||||
|
measurePatt p =
|
||||||
|
case p of
|
||||||
|
PSeq p1 p2 -> PMSeq (mPatt p1) (mPatt p2)
|
||||||
|
_ -> composSafePattOp measurePatt p
|
||||||
|
|
||||||
|
|
||||||
isInConstantForm :: Term -> Bool
|
isInConstantForm :: Term -> Bool
|
||||||
isInConstantForm trm = case trm of
|
isInConstantForm trm = case trm of
|
||||||
Cn _ -> True
|
Cn _ -> True
|
||||||
|
|||||||
@@ -238,6 +238,7 @@ ppCase q (p,e) = ppPatt q 0 p <+> text "=>" <+> ppTerm q 0 e
|
|||||||
|
|
||||||
ppPatt q d (PAlt p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '|' <+> ppPatt q 1 p2)
|
ppPatt q d (PAlt p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '|' <+> ppPatt q 1 p2)
|
||||||
ppPatt q d (PSeq p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '+' <+> ppPatt q 1 p2)
|
ppPatt q d (PSeq p1 p2) = prec d 0 (ppPatt q 0 p1 <+> char '+' <+> ppPatt q 1 p2)
|
||||||
|
ppPatt q d (PMSeq (_,p1) (_,p2)) = prec d 0 (ppPatt q 0 p1 <+> char '+' <+> ppPatt q 1 p2)
|
||||||
ppPatt q d (PC f ps) = if null ps
|
ppPatt q d (PC f ps) = if null ps
|
||||||
then ppIdent f
|
then ppIdent f
|
||||||
else prec d 1 (ppIdent f <+> hsep (map (ppPatt q 3) ps))
|
else prec d 1 (ppIdent f <+> hsep (map (ppPatt q 3) ps))
|
||||||
|
|||||||
Reference in New Issue
Block a user