[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: branches/KDE/4.4/kdelibs/kjs
From: Germain Garand <germain () ebooksfrance ! org>
Date: 2010-02-12 16:33:48
Message-ID: 1265992428.086989.15757.nullmailer () svn ! kde ! org
[Download RAW message or body]
SVN commit 1089179 by ggarand:
might want that too... even the dot is using such broken expressions!
----------
automatically merged revision 1088985:
rewrite a very common and extremely inefficient regular expression pattern
in a form that is equivalent but does not require a recursion.
this is several order of magnitude faster in any RE engine and prevents
libpcre from eating the stack.
BUG: 191736
M +78 -14 regexp.cpp
--- branches/KDE/4.4/kdelibs/kjs/regexp.cpp #1089178:1089179
@@ -44,6 +44,8 @@
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
+static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = 0);
+
// JS regexps can contain Unicode escape sequences (\uxxxx) which
// are rather uncommon elsewhere. As our regexp libs don't understand
// them we do the unescaping ourselves internally.
@@ -51,11 +53,13 @@
// expects null termination..
static UString sanitizePattern(const UString &p)
{
- UString newPattern;
+ UString np;
+ bool changed = false;
const char* const nil = "\\x00";
if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
bool escape = false;
+ changed = true;
for (int i = 0; i < p.size(); ++i) {
UChar c = p[i];
if (escape) {
@@ -81,13 +85,13 @@
if (j < 4) {
// sequence was incomplete. treat \u as u which IE always
// and FF sometimes does.
- newPattern.append(UString('u'));
+ np.append(UString('u'));
} else {
c = UChar(u);
switch (u) {
case 0:
// Make sure to encode 0, to avoid terminating the string
- newPattern += UString(nil);
+ np += UString(nil);
break;
case '^':
case '$':
@@ -101,35 +105,87 @@
case '[': case ']':
case '|':
// escape pattern characters have to remain escaped
- newPattern.append(UString('\\'));
+ np.append(UString('\\'));
// intentional fallthrough
default:
- newPattern += UString(&c, 1);
+ np += UString(&c, 1);
break;
}
}
continue;
}
- newPattern += UString('\\');
- newPattern += UString(&c, 1);
+ np += UString('\\');
+ np += UString(&c, 1);
} else {
if (c == '\\')
escape = true;
else if (c == '\0')
- newPattern += UString(nil);
+ np += UString(nil);
else
- newPattern += UString(&c, 1);
+ np += UString(&c, 1);
}
}
- return newPattern;
- } else {
- return p;
}
+ // Rewrite very inefficient RE formulation:
+ // (.|\s)+ is often used instead of the less intuitive, but vastly preferable \
[\w\W]+ + // The first wording needs to recurse at each character matched in \
libPCRE, leading to rapid exhaustion of stack space. + if (p.find(".|\\s)")>=0) {
+ if (np.isEmpty())
+ np = p;
+ bool didRewrite = false;
+ WTF::Vector<int> parenIdx;
+ sanitizePatternExtensions(np, &parenIdx);
+ Vector<int>::const_iterator end = parenIdx.end();
+ int previdx = 0;
+ UString tmp;
+ bool nonCapturing = false;
+ for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
+ int idx = *it;
+ if (np.size() < idx+6)
+ break;
+ if (np[idx+1] == '?' && np[idx+2] == ':') {
+ nonCapturing = true;
+ idx += 3;
+ } else {
+ ++idx;
+ }
+ if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] \
== 's')) + continue;
+ if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) &&
+ // no need to do anything if the pattern is minimal e.g. (.|\s)+?
+ !(np.size() > idx+6 && np[idx+6] == '?')) {
+ didRewrite = true;
+ if (nonCapturing) { // trivial case: (?:.|\s)+ => \
[\w\W]+ + tmp.append(np, previdx, idx-previdx-3);
+ tmp.append("[\\w\\W]");
+ tmp.append(np[idx+5]);
+ } else if (np[idx+5] == '*') { // capture zero of one or more: \
(.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?) + tmp.append(np, previdx, \
idx-previdx-1); + \
tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)"); + } else { \
// capture last of one or more: (.|\s)+ => [\w\W]*([\w\W]) + \
assert(np[idx+5] == '+'); + tmp.append(np, previdx, \
idx-previdx-1); + tmp.append("[\\w\\W]*([\\w\\W])");
+ }
+ } else {
+ tmp.append(np, previdx, idx-previdx+5);
+ }
+ previdx = idx+6;
+ }
+ if (didRewrite) {
+ tmp.append(np, previdx);
+ fprintf(stderr, "Pattern: %s ", np.ascii());
+ fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
+ np = tmp;
+ changed = true;
+ }
+ }
+ return (changed ? np : p);
}
// For now, the only 'extension' to standard we are willing to deal with is
// a non-escaped closing bracket, outside of a character class. e.g. /.*]/
-static bool sanitizePatternExtensions(UString &p)
+static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx)
{
UString newPattern;
@@ -138,6 +194,7 @@
bool escape = false;
int state = StateNominal;
+ int escapedSinceLastParen = 0;
for (int i = 0; i < p.size(); ++i) {
UChar c = p[i];
if (escape) {
@@ -150,13 +207,20 @@
state = StateNominal;
} else if (state == StateNominal) {
v.append(i);
+ ++escapedSinceLastParen;
}
} else if (c == '[') {
if (state == StateOpenBracket) {
v.append(i);
+ ++escapedSinceLastParen;
} else if (state == StateNominal) {
state = StateOpenBracket;
}
+ } else if (c == '(') {
+ if (parenIdx && state == StateNominal) {
+ parenIdx->append(i+escapedSinceLastParen);
+ escapedSinceLastParen = 0;
+ }
}
}
}
@@ -176,7 +240,7 @@
p = newPattern;
return true;
} else {
- return false;
+ return false;
}
}
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic