From kde-commits  Fri Feb 12 16:33:48 2010
From: Germain Garand <germain () ebooksfrance ! org>
Date: Fri, 12 Feb 2010 16:33:48 +0000
To: kde-commits
Subject: branches/KDE/4.4/kdelibs/kjs
Message-Id: <1265992428.086989.15757.nullmailer () svn ! kde ! org>
X-MARC-Message: https://marc.info/?l=kde-commits&m=126599243605438

SVN commit 1089179 by ggarand:

might want that too... even the dot is using such broken expressions!
----------
automatically merged revision 1088985:
rewrite a very common and extremely inefficient regular expression pattern
in a form that is equivalent but does not require a recursion.

this is several order of magnitude faster in any RE engine and prevents
libpcre from eating the stack.

BUG: 191736

 M  +78 -14    regexp.cpp
--- branches/KDE/4.4/kdelibs/kjs/regexp.cpp #1089178:1089179
@@ -44,6 +44,8 @@
 
 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
 
+static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = 0);
+
 // JS regexps can contain Unicode escape sequences (\uxxxx) which
 // are rather uncommon elsewhere. As our regexp libs don't understand
 // them we do the unescaping ourselves internally.
@@ -51,11 +53,13 @@
 // expects null termination..
 static UString sanitizePattern(const UString &p)
 {
-  UString newPattern;
+  UString np;
 
+  bool changed = false;
   const char* const nil = "\\x00";
   if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
     bool escape = false;
+    changed = true;
     for (int i = 0; i < p.size(); ++i) {
       UChar c = p[i];
       if (escape) {
@@ -81,13 +85,13 @@
 	  if (j < 4) {
 	    // sequence was incomplete. treat \u as u which IE always
 	    // and FF sometimes does.
-	     newPattern.append(UString('u'));
+	     np.append(UString('u'));
 	  } else {
             c = UChar(u);
             switch (u) {
             case 0:
 	      // Make sure to encode 0, to avoid terminating the string
-	       newPattern += UString(nil);
+	       np += UString(nil);
 	      break;
             case '^':
             case '$':
@@ -101,35 +105,87 @@
             case '[': case ']':
             case '|':
 	      // escape pattern characters have to remain escaped
-	       newPattern.append(UString('\\'));
+	       np.append(UString('\\'));
 	      // intentional fallthrough
             default:
-	      newPattern += UString(&c, 1);
+	      np += UString(&c, 1);
 	      break;
 	    }
           }
           continue;
         }
-        newPattern += UString('\\');
-        newPattern += UString(&c, 1);
+        np += UString('\\');
+        np += UString(&c, 1);
       } else {
         if (c == '\\')
           escape = true;
         else if (c == '\0')
-          newPattern += UString(nil);
+          np += UString(nil);
         else
-          newPattern += UString(&c, 1);
+          np += UString(&c, 1);
       }
     }
-    return newPattern;
-  } else {
-    return p;
   }
+  // Rewrite very inefficient RE formulation:
+  // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+
+  // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space.
+  if (p.find(".|\\s)")>=0) {
+      if (np.isEmpty())
+          np = p;
+      bool didRewrite = false;
+      WTF::Vector<int> parenIdx;
+      sanitizePatternExtensions(np, &parenIdx);
+      Vector<int>::const_iterator end = parenIdx.end();
+      int previdx = 0;
+      UString tmp;
+      bool nonCapturing = false;
+      for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
+          int idx = *it;
+          if (np.size() < idx+6)
+              break;
+          if (np[idx+1] == '?' && np[idx+2] == ':') {
+              nonCapturing = true;
+              idx += 3;
+          } else {
+             ++idx;
+          }
+          if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's'))
+              continue;
+          if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) &&
+              // no need to do anything if the pattern is minimal e.g. (.|\s)+?
+              !(np.size() > idx+6 && np[idx+6] == '?')) {
+                  didRewrite = true;
+                  if (nonCapturing) {               // trivial case: (?:.|\s)+ => [\w\W]+
+                      tmp.append(np, previdx, idx-previdx-3);
+                      tmp.append("[\\w\\W]");
+                      tmp.append(np[idx+5]);
+                  } else if (np[idx+5] == '*') {    // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?)
+                      tmp.append(np, previdx, idx-previdx-1);
+                      tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)");
+                  } else {                          // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W])
+                      assert(np[idx+5] == '+');
+                      tmp.append(np, previdx, idx-previdx-1);
+                      tmp.append("[\\w\\W]*([\\w\\W])");
+                  }
+          } else {
+              tmp.append(np, previdx, idx-previdx+5);
+          }
+          previdx = idx+6;
+      }
+      if (didRewrite) {
+          tmp.append(np, previdx);
+          fprintf(stderr, "Pattern: %s ", np.ascii());
+          fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
+          np = tmp;
+          changed = true;
+      }
+  }
+  return (changed ? np : p);
 }
 
 // For now, the only 'extension' to standard we are willing to deal with is
 // a non-escaped closing bracket, outside of a character class. e.g. /.*]/
-static bool sanitizePatternExtensions(UString &p)
+static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx)
 {
   UString newPattern;
 
@@ -138,6 +194,7 @@
   bool escape = false;
 
   int state = StateNominal;
+  int escapedSinceLastParen = 0;
   for (int i = 0; i < p.size(); ++i) {
       UChar c = p[i];
       if (escape) {
@@ -150,13 +207,20 @@
                 state = StateNominal;
             } else if (state == StateNominal) {
                 v.append(i);
+                ++escapedSinceLastParen;
             }
         } else if (c == '[') {
             if (state == StateOpenBracket) {
                 v.append(i);
+                ++escapedSinceLastParen;
             } else if (state == StateNominal) {
                 state = StateOpenBracket;
             }
+        } else if (c == '(') {
+            if (parenIdx && state == StateNominal) {
+                parenIdx->append(i+escapedSinceLastParen);
+                escapedSinceLastParen = 0;
+            }
         }
     }
   }
@@ -176,7 +240,7 @@
       p = newPattern;
       return true;
   } else {
-    return false;
+      return false;
   }
 }