[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: kdelibs/khtml/misc
From: Waldo Bastian <bastian () kde ! org>
Date: 2005-02-11 17:13:09
Message-ID: 20050211171309.596E91BB41 () office ! kde ! org
[Download RAW message or body]
CVS commit by waba:
Better endianness detection for utf16
M +26 -6 decoder.cpp 1.77
--- kdelibs/khtml/misc/decoder.cpp #1.76:1.77
@@ -208,17 +208,16 @@ QString Decoder::decode(const char *data
// Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
int bufferLength = buffer.length();
- const int maximumBOMLength = 3;
+ const int maximumBOMLength = 10;
if (beginning && bufferLength + len >= maximumBOMLength) {
- if (m_type != UserChosenEncoding) {
+ // If the user has chosen utf16 we still need to auto-detect the endianness
+ if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) {
// Extract the first three bytes.
// Handle the case where some of bytes are already in the buffer.
- // The last byte is always guaranteed to not be in the buffer.
const uchar *udata = (const uchar *)data;
uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;
uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;
- assert(bufferLength < 3);
- uchar c3 = *udata;
+ uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++;
- // Check for the BOM.
+ // Check for the BOM
const char *autoDetectedEncoding;
if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
@@ -226,4 +225,19 @@ QString Decoder::decode(const char *data
} else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
autoDetectedEncoding = "UTF-8";
+ } else if (c1 == 0x00 || c2 == 0x00) {
+ uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++;
+ uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++;
+ uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++;
+ uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++;
+ uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++;
+ uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++;
+ uchar c10 = bufferLength >= 10 ? (uchar)buffer[9] : *udata++;
+ int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
+ int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
+ if ((nul_count_even == 0 && nul_count_odd == 5) ||
+ (nul_count_even == 5 && nul_count_odd == 0))
+ autoDetectedEncoding = "ISO-10646-UCS-2";
+ else
+ autoDetectedEncoding = 0;
} else {
autoDetectedEncoding = 0;
@@ -238,4 +252,10 @@ QString Decoder::decode(const char *data
delete m_decoder;
m_decoder = m_codec->makeDecoder();
+ if (m_codec->mibEnum() == 1000 && c2 == 0x00)
+ {
+ // utf16LE, we need to put the decoder in LE mode
+ char reverseUtf16[3] = {0xFF, 0xFE, 0x00};
+ m_decoder->toUnicode(reverseUtf16, 2);
+ }
}
}
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic