From kde-commits Fri Feb 12 16:33:48 2010 From: Germain Garand Date: Fri, 12 Feb 2010 16:33:48 +0000 To: kde-commits Subject: branches/KDE/4.4/kdelibs/kjs Message-Id: <1265992428.086989.15757.nullmailer () svn ! kde ! org> X-MARC-Message: https://marc.info/?l=kde-commits&m=126599243605438 SVN commit 1089179 by ggarand: might want that too... even the dot is using such broken expressions! ---------- automatically merged revision 1088985: rewrite a very common and extremely inefficient regular expression pattern in a form that is equivalent but does not require a recursion. this is several order of magnitude faster in any RE engine and prevents libpcre from eating the stack. BUG: 191736 M +78 -14 regexp.cpp --- branches/KDE/4.4/kdelibs/kjs/regexp.cpp #1089178:1089179 @@ -44,6 +44,8 @@ RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; +static bool sanitizePatternExtensions(UString &p, WTF::Vector* parenIdx = 0); + // JS regexps can contain Unicode escape sequences (\uxxxx) which // are rather uncommon elsewhere. As our regexp libs don't understand // them we do the unescaping ourselves internally. @@ -51,11 +53,13 @@ // expects null termination.. static UString sanitizePattern(const UString &p) { - UString newPattern; + UString np; + bool changed = false; const char* const nil = "\\x00"; if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) { bool escape = false; + changed = true; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; if (escape) { @@ -81,13 +85,13 @@ if (j < 4) { // sequence was incomplete. treat \u as u which IE always // and FF sometimes does. - newPattern.append(UString('u')); + np.append(UString('u')); } else { c = UChar(u); switch (u) { case 0: // Make sure to encode 0, to avoid terminating the string - newPattern += UString(nil); + np += UString(nil); break; case '^': case '$': @@ -101,35 +105,87 @@ case '[': case ']': case '|': // escape pattern characters have to remain escaped - newPattern.append(UString('\\')); + np.append(UString('\\')); // intentional fallthrough default: - newPattern += UString(&c, 1); + np += UString(&c, 1); break; } } continue; } - newPattern += UString('\\'); - newPattern += UString(&c, 1); + np += UString('\\'); + np += UString(&c, 1); } else { if (c == '\\') escape = true; else if (c == '\0') - newPattern += UString(nil); + np += UString(nil); else - newPattern += UString(&c, 1); + np += UString(&c, 1); } } - return newPattern; - } else { - return p; } + // Rewrite very inefficient RE formulation: + // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+ + // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space. + if (p.find(".|\\s)")>=0) { + if (np.isEmpty()) + np = p; + bool didRewrite = false; + WTF::Vector parenIdx; + sanitizePatternExtensions(np, &parenIdx); + Vector::const_iterator end = parenIdx.end(); + int previdx = 0; + UString tmp; + bool nonCapturing = false; + for (Vector::const_iterator it = parenIdx.begin(); it != end; ++it) { + int idx = *it; + if (np.size() < idx+6) + break; + if (np[idx+1] == '?' && np[idx+2] == ':') { + nonCapturing = true; + idx += 3; + } else { + ++idx; + } + if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's')) + continue; + if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) && + // no need to do anything if the pattern is minimal e.g. (.|\s)+? + !(np.size() > idx+6 && np[idx+6] == '?')) { + didRewrite = true; + if (nonCapturing) { // trivial case: (?:.|\s)+ => [\w\W]+ + tmp.append(np, previdx, idx-previdx-3); + tmp.append("[\\w\\W]"); + tmp.append(np[idx+5]); + } else if (np[idx+5] == '*') { // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?) + tmp.append(np, previdx, idx-previdx-1); + tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)"); + } else { // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W]) + assert(np[idx+5] == '+'); + tmp.append(np, previdx, idx-previdx-1); + tmp.append("[\\w\\W]*([\\w\\W])"); + } + } else { + tmp.append(np, previdx, idx-previdx+5); + } + previdx = idx+6; + } + if (didRewrite) { + tmp.append(np, previdx); + fprintf(stderr, "Pattern: %s ", np.ascii()); + fprintf(stderr, "was rewritten to: %s\n", tmp.ascii()); + np = tmp; + changed = true; + } + } + return (changed ? np : p); } // For now, the only 'extension' to standard we are willing to deal with is // a non-escaped closing bracket, outside of a character class. e.g. /.*]/ -static bool sanitizePatternExtensions(UString &p) +static bool sanitizePatternExtensions(UString &p, WTF::Vector* parenIdx) { UString newPattern; @@ -138,6 +194,7 @@ bool escape = false; int state = StateNominal; + int escapedSinceLastParen = 0; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; if (escape) { @@ -150,13 +207,20 @@ state = StateNominal; } else if (state == StateNominal) { v.append(i); + ++escapedSinceLastParen; } } else if (c == '[') { if (state == StateOpenBracket) { v.append(i); + ++escapedSinceLastParen; } else if (state == StateNominal) { state = StateOpenBracket; } + } else if (c == '(') { + if (parenIdx && state == StateNominal) { + parenIdx->append(i+escapedSinceLastParen); + escapedSinceLastParen = 0; + } } } } @@ -176,7 +240,7 @@ p = newPattern; return true; } else { - return false; + return false; } }