[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: branches/KDE/3.5/kdelibs/kjs
From: Maks Orlovich <maksim () kde ! org>
Date: 2006-11-12 19:52:18
Message-ID: 1163361138.876098.8633.nullmailer () svn ! kde ! org
[Download RAW message or body]
SVN commit 604411 by orlovich:
Fix Unicode support in RegExp handling, and also be more robust vs. embedded nulls.
Fixes problems with some cyrillic characters in gmail (#116512)
Makes google calendar somewhat work, though not fully (part #135445)
Also fixes #135246, #101398
BUG:135246
BUG:101398
BUG:116512
CCBUG:135445
M +149 -8 regexp.cpp
M +24 -1 regexp.h
M +2 -0 regexp_object.cpp
M +28 -4 string_object.cpp
--- branches/KDE/3.5/kdelibs/kjs/regexp.cpp #604410:604411
@@ -2,6 +2,8 @@
/*
* This file is part of the KDE libraries
* Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
+ * Copyright (C) 2003,2004 Apple Computer, Inc.
+ * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -22,22 +24,35 @@
#include "regexp.h"
#include "lexer.h"
+#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
using namespace KJS;
+RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
+
RegExp::RegExp(const UString &p, int f)
- : pat(p), flgs(f), m_notEmpty(false), valid(true)
+ : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
{
+ // Determine whether libpcre has unicode support if need be..
+ if (utf8Support == Unknown) {
+ int supported;
+ pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
+ utf8Support = supported ? Supported : Unsupported;
+ }
+
nrSubPatterns = 0; // determined in match() with POSIX regex.
// JS regexps can contain Unicode escape sequences (\uxxxx) which
// are rather uncommon elsewhere. As our regexp libs don't understand
// them we do the unescaping ourselves internally.
+ // Also make sure to expand out any nulls as pcre_compile
+ // expects null termination..
UString intern;
- if (p.find('\\') >= 0) {
+ const char* const nil = "\\x00";
+ if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
bool escape = false;
for (int i = 0; i < p.size(); ++i) {
UChar c = p[i];
@@ -52,7 +67,12 @@
if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
c = Lexer::convertUnicode(c0, c1, c2, c3);
- intern += UString(&c, 1);
+ if (c.unicode() == 0) {
+ // Make sure to encode 0, to avoid terminating the string
+ intern += UString(nil);
+ } else {
+ intern += UString(&c, 1);
+ }
i += 4;
continue;
}
@@ -62,6 +82,8 @@
} else {
if (c == '\\')
escape = true;
+ else if (c == '\0')
+ intern += UString(nil);
else
intern += UString(&c, 1);
}
@@ -81,8 +103,16 @@
if (flgs & Multiline)
pcreflags |= PCRE_MULTILINE;
- pcregex = pcre_compile(intern.ascii(), pcreflags,
+ if (utf8Support == Supported)
+ pcreflags |= PCRE_UTF8;
+
+ // Fill our buffer with an encoded version, whether utf-8, or,
+ // if PCRE is incapable, truncated.
+ prepareMatch(intern);
+
+ pcregex = pcre_compile(buffer, pcreflags,
&perrormsg, &errorOffset, NULL);
+ doneMatch(); // Cleanup buffers
if (!pcregex) {
#ifndef NDEBUG
fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
@@ -128,6 +158,7 @@
RegExp::~RegExp()
{
+ doneMatch(); // Be 100% sure buffers are freed
#ifdef HAVE_PCREPOSIX
if (pcregex)
pcre_free(pcregex);
@@ -137,8 +168,97 @@
#endif
}
+void RegExp::prepareUtf8(const UString& s)
+{
+ // Allocate a buffer big enough to hold all the characters plus \0
+ const int length = s.size();
+ buffer = new char[length * 3 + 1];
+
+ // Also create buffer for positions. We need one extra character in there,
+ // even past the \0 since the non-empty handling may jump one past the end
+ originalPos = new int[length * 3 + 2];
+
+ // Convert to runs of 8-bit characters, and generate indeces
+ // Note that we do NOT combine surrogate pairs here, as
+ // regexps operate on them as separate characters
+ char *p = buffer;
+ int *posOut = originalPos;
+ const UChar *d = s.data();
+ for (int i = 0; i != length; ++i) {
+ unsigned short c = d[i].unicode();
+
+ int sequenceLen;
+ if (c < 0x80) {
+ *p++ = (char)c;
+ sequenceLen = 1;
+ } else if (c < 0x800) {
+ *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+ *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+ sequenceLen = 2;
+ } else {
+ *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+ *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+ *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+ sequenceLen = 3;
+ }
+
+ while (sequenceLen > 0) {
+ *posOut = i;
+ ++posOut;
+ --sequenceLen;
+ }
+ }
+
+ bufferSize = p - buffer;
+
+ *p++ = '\0';
+
+ // Record positions for \0, and the fictional character after that.
+ *posOut = length;
+ *(posOut+1) = length+1;
+}
+
+void RegExp::prepareASCII (const UString& s)
+{
+ originalPos = 0;
+
+ // Best-effort attempt to get something done
+ // when we don't have utf 8 available -- use
+ // truncated version, and pray for the best
+ CString truncated = s.cstring();
+ buffer = new char[truncated.size() + 1];
+ memcpy(buffer, truncated.c_str(), truncated.size());
+ buffer[truncated.size()] = '\0'; // For _compile use
+ bufferSize = truncated.size();
+}
+
+void RegExp::prepareMatch(const UString &s)
+{
+ delete[] originalPos; // Just to be sure..
+ delete[] buffer;
+ if (utf8Support == Supported)
+ prepareUtf8(s);
+ else
+ prepareASCII(s);
+
+#ifndef NDEBUG
+ originalS = s;
+#endif
+}
+
+void RegExp::doneMatch()
+{
+ delete[] originalPos; originalPos = 0;
+ delete[] buffer; buffer = 0;
+}
+
UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
{
+#ifndef NDEBUG
+ assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
+#endif
+ assert(valid);
+
if (i < 0)
i = 0;
if (ovector)
@@ -151,14 +271,28 @@
return UString::null;
#ifdef HAVE_PCREPOSIX
- CString buffer(s.cstring());
- int bufferSize = buffer.size();
int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
if (ovector) *ovector = new int[ovecsize];
if (!pcregex)
return UString::null;
- if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i,
+ int startPos;
+ int nextPos;
+
+ if (utf8Support == Supported) {
+ startPos = i;
+ while (originalPos[startPos] < i)
+ ++startPos;
+
+ nextPos = startPos;
+ while (originalPos[nextPos] < (i + 1))
+ ++nextPos;
+ } else {
+ startPos = i;
+ nextPos = i + 1;
+ }
+
+ if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man \
pcretest ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
{
@@ -172,7 +306,7 @@
fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
#endif
m_notEmpty = 0;
- if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0,
+ if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0,
ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
return UString::null;
}
@@ -181,6 +315,13 @@
}
// Got a match, proceed with it.
+ // But fix up the ovector if need be..
+ if (ovector && originalPos) {
+ for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
+ if ((*ovector)[c] != -1)
+ (*ovector)[c] = originalPos[(*ovector)[c]];
+ }
+ }
if (!ovector)
return UString::null; // don't rely on the return value if you pass ovector==0
--- branches/KDE/3.5/kdelibs/kjs/regexp.h #604410:604411
@@ -51,16 +51,39 @@
// RegExp.exec, so it has to store $1 etc.
// bool test(const UString &s, int i = -1);
unsigned int subPatterns() const { return nrSubPatterns; }
+
+ //These methods should be called around the match of the same string..
+ void prepareMatch(const UString &s);
+ void doneMatch();
private:
const UString pat;
- int flgs;
+ int flgs : 8;
bool m_notEmpty;
bool valid;
+
+ // Cached encoding info...
+ char* buffer;
+ int* originalPos;
+ int bufferSize;
+ void prepareUtf8 (const UString& s);
+ void prepareASCII (const UString& s);
+#ifndef NDEBUG
+ UString originalS; // the original string, used for sanity-checking
+#endif
+
#ifndef HAVE_PCREPOSIX
regex_t preg;
#else
pcre *pcregex;
+
+ enum UTF8SupportState {
+ Unknown,
+ Supported,
+ Unsupported
+ };
+
+ static UTF8SupportState utf8Support;
#endif
unsigned int nrSubPatterns;
--- branches/KDE/3.5/kdelibs/kjs/regexp_object.cpp #604410:604411
@@ -120,7 +120,9 @@
RegExpObjectImp* regExpObj = \
static_cast<RegExpObjectImp*>(exec->lexicalInterpreter()->builtinRegExp().imp()); \
int **ovector = regExpObj->registerRegexp( re, s.value() );
+ re->prepareMatch(s.value());
str = re->match(s.value(), i, 0L, ovector);
+ re->doneMatch();
regExpObj->setSubPatterns(re->subPatterns());
if (id == Test)
--- branches/KDE/3.5/kdelibs/kjs/string_object.cpp #604410:604411
@@ -294,6 +294,7 @@
}
RegExpObjectImp* regExpObj = \
static_cast<RegExpObjectImp*>(exec->interpreter()->builtinRegExp().imp()); int \
**ovector = regExpObj->registerRegexp(reg, s); + reg->prepareMatch(s);
UString mstr = reg->match(s, -1, &pos, ovector);
if (id == Search) {
result = Number(pos);
@@ -316,6 +317,7 @@
result = exec->lexicalInterpreter()->builtinArray().construct(exec, list);
}
}
+ reg->doneMatch();
delete tmpReg;
break;
}
@@ -337,13 +339,17 @@
else
u3 = a1.toString(exec); // 2nd arg is the replacement string
+ UString out;
+
// This is either a loop (if global is set) or a one-way (if not).
+ reg->prepareMatch(s);
do {
int **ovector = regExpObj->registerRegexp( reg, s );
UString mstr = reg->match(s, lastIndex, &pos, ovector);
regExpObj->setSubPatterns(reg->subPatterns());
if (pos == -1)
break;
+
len = mstr.size();
UString rstr;
@@ -381,12 +387,27 @@
Object thisObj = exec->interpreter()->globalObject();
rstr = o1.call( exec, thisObj, l ).toString(exec);
}
- lastIndex = pos + rstr.size();
- s = s.substr(0, pos) + rstr + s.substr(pos + len);
- //fprintf(stderr,"pos=%d,len=%d,lastIndex=%d,s=%s\n",pos,len,lastIndex,s.ascii());
+
+ // Append the stuff we skipped over to get to the match --
+ // that would be [lastIndex, pos) of the original..
+ if (pos != lastIndex)
+ out += s.substr(lastIndex, pos - lastIndex);
+
+ // Append the replacement..
+ out += rstr;
+
+ lastIndex = pos + len; // Skip over the matched stuff...
} while (global);
- result = String(s);
+ // Append the rest of the string to the output...
+ if (lastIndex == 0 && out.size() == 0) // Don't copy stuff if nothing changed
+ out = s;
+ else
+ out += s.substr(lastIndex, s.size() - lastIndex);
+
+ reg->doneMatch();
+
+ result = String(out);
} else { // First arg is a string
u2 = a0.toString(exec);
pos = s.find(u2);
@@ -433,8 +454,10 @@
if (a0.type() == ObjectType && \
Object::dynamicCast(a0).inherits(&RegExpImp::info)) { Object obj0 = \
Object::dynamicCast(a0); RegExp reg(obj0.get(exec,"source").toString(exec));
+ reg.prepareMatch(s);
if (s.isEmpty() && !reg.match(s, 0).isNull()) {
// empty string matched by regexp -> empty array
+ reg.doneMatch();
res.put(exec, lengthPropertyName, Number(0), DontDelete|ReadOnly|DontEnum);
break;
}
@@ -454,6 +477,7 @@
i++;
}
}
+ reg.doneMatch();
} else {
u2 = a0.toString(exec);
if (u2.isEmpty()) {
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic