[prev in list] [next in list] [prev in thread] [next in thread]
List: kfm-devel
Subject: [PATCH] better Japanese detection algorithm
From: Takumi ASAKI <asataku () osk3 ! 3web ! ne ! jp>
Date: 2002-02-21 13:31:16
[Download RAW message or body]
Hi!
I'm happy that khtml supports Japanese auto-detection.
But the current code sometimes fails at auto-detection,
because detect_kanji() returns EUC-JP for ASCII strings.
So I suggest changing the detection algorithm.
Here is a patch that switches to a different algorithm.
It works better.
Please review it.
--
Che Che - Bye Bye
From: Takumi ASAKI <asataku@osk3.3web.ne.jp>
URL: http://www3.osk.3web.ne.jp/~asataku/
["kdelibs-3.0-cvs-khtml-auto-detect-japanese-20020219.diff" (text/x-diff)]
diff -ur /home/asaki/src/KDE/kde/kdelibs/khtml/misc/decoder.cpp \
kdelibs/khtml/misc/decoder.cpp
--- /home/asaki/src/KDE/kde/kdelibs/khtml/misc/decoder.cpp Sun Feb 17 14:14:24 2002
+++ kdelibs/khtml/misc/decoder.cpp Tue Feb 19 20:42:19 2002
@@ -41,154 +41,226 @@
#include <kdebug.h>
#include <klocale.h>
-// Kanji detection - start
-//
-// Extracted from libjconv. libjconv is licensed LGPL.
-//
-// Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
-// Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
-// Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
-
-#define _ASCII_ 0
-#define _JIS_ 1
-#define _EUC_ 2
-#define _SJIS_ 3
-#define _EUCORSJIS_ 4
-#define _ESC_ 0x1b
-#define _SS2_ 0x8e
-
-static int detect_kanji(unsigned char *str)
+class KanjiCode
{
- int expected = _ASCII_;
- register int c;
- int c1, c2;
- int euc_c = 0, sjis_c = 0;
- unsigned char *ptr;
-
- if(!str) return (0);
-
- ptr = str;
- while ((c = (int)*ptr)!= '\0')
- {
- if (c == _ESC_)
- {
- if ((c = (int)*(++ptr)) == '\0') break;
- if (c == '$')
- {
- if ((c = (int)*(++ptr)) == '\0') break;
- if (c == 'B' || c == '@') return _JIS_;
- }
- ptr++;
- continue;
- }
-
- if ((c >= 0x81 && c <= 0x8d) || (c >= 0x8f && c <= 0x9f))
- return _SJIS_;
-
- if (c == _SS2_)
- {
- if ((c = (int)*(++ptr)) == '\0') break;
- if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0) ||
- (c >= 0xe0 && c <= 0xfc)) return _SJIS_;
- if (c >= 0xa1 && c <= 0xdf) break;
- ptr++;
- continue;
- }
-
- if (c >= 0xa1 && c <= 0xdf) /* euc or kana */
- {
- if((c = (int)*(++ptr)) == '\0') break;
-
- if (c >= 0xe0 && c <= 0xfe)
- return _EUC_;
- if (c >= 0xa1 && c <= 0xdf)
- {
- expected = _EUCORSJIS_;
- ptr++;
- continue;
- }
-#if 1
- if (c == 0xa0 || (0xe0 <= c && c <= 0xfe))
- return _EUC_;
- else
- {
- expected = _EUCORSJIS_;
- ptr++;
- continue;
- }
-#else
- if (c <= 0x9f) return _SJIS_;
- if (c >= 0xf0 && c <= 0xfe) return _EUC_;
-#endif
-
- if (c >= 0xe0 && c <= 0xef)
- {
- expected = _EUCORSJIS_;
- while (c >= 0x40)
- {
- if (c >= 0x81)
- {
- if (c <= 0x8d || (c >= 0x8f && c <= 0x9f))
- return _SJIS_;
- else if (c >= 0xfd && c <= 0xfe)
- {
- return _EUC_;
- }
- }
- if ((c = (int)*(++ptr)) == '\0') break;
- }
- ptr++;
- continue;
- }
-
- if (c >= 0xe0 && c <= 0xef)
- {
- if ((c = (int)*(++ptr)) == '\0') break;
- if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0))
- return _SJIS_;
- if (c >= 0xfd && c <= 0xfe) return _EUC_;
- if (c >= 0xa1 && c <= 0xfc)
- expected = _EUCORSJIS_;
- }
- }
-#if 1
- if (0xf0 <= c && c <= 0xfe)
- return _EUC_;
-#endif
- ptr++;
- }
+public:
+ enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };
+ static enum Type judge(const char *str);
+ static const int ESC = 0x1b;
+ static const int _SS2_ = 0x8e;
+ static const unsigned char kanji_map_sjis[];
+ static int ISkanji(int code)
+ {
+ if (code >= 0x100)
+ return 0;
+ return (kanji_map_sjis[code & 0xff] & 1);
+ }
- ptr = str;
- c2 = 0;
- while ((c1 = (int)*ptr++) != '\0')
- {
- if (((c2 > 0x80 && c2 < 0xa0) || (c2 >= 0xe0 && c2 < 0xfd)) &&
- ((c1 >= 0x40 && c1 < 0x7f) || (c1 >= 0x80 && c1 < 0xfd)))
- sjis_c++, c1 = *ptr++;
- c2 = c1;
- }
- if (sjis_c == 0)
- expected = _EUC_;
- else
- {
- ptr = str, c2 = 0;
- while ((c1 = (int)*ptr++) != '\0')
- {
- if ((c2 > 0xa0 && c2 < 0xff) &&
- (c1 > 0xa0 && c1 < 0xff))
- euc_c++, c1 = *ptr++;
- c2 = c1;
- }
- if (sjis_c > euc_c)
- expected = _SJIS_;
- else
- expected = _EUC_;
- }
- return expected;
-}
+ static int ISkana(int code)
+ {
+ if (code >= 0x100)
+ return 0;
+ return (kanji_map_sjis[code & 0xff] & 2);
+ }
-// Kanji detection - end
+};
+const unsigned char KanjiCode::kanji_map_sjis[] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
+};
+
+/*
+ * EUC-JP is
+ * [0xa1 - 0xfe][0xa1 - 0xfe]
+ * 0x8e[0xa1 - 0xfe](SS2)
+ * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
+ *
+ * Shift_Jis is
+ * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
+ *
+ * Shift_Jis Hankaku Kana is
+ * [0xa1 - 0xdf]
+ */
+
+/*
+ * KanjiCode::judge() is based on judge_jcode() from jvim
+ * http://hp.vector.co.jp/authors/VA003457/vim/
+ *
+ * Special Thanks to Kenichi Tsuchida
+ */
+
+/*
+ * Maybe we should use QTextCodec::heuristicContentMatch()
+ * But it fails detection.
+ * It's not useful.
+ */
+
+enum KanjiCode::Type KanjiCode::judge(const char *str)
+{
+ enum Type code;
+ int i;
+ int bfr = FALSE; /* Kana Moji */
+ int bfk = 0; /* EUC Kana */
+ int sjis = 0;
+ int euc = 0;
+
+ const unsigned char *ptr = (const unsigned char *) str;
+ size_t size = strlen(str);
+
+ code = ASCII;
+
+ i = 0;
+ while (i < size) {
+ if (ptr[i] == ESC && (size - i >= 3)) {
+ if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
+ || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
+ code = JIS;
+ goto breakBreak;
+ } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
+ || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
+ code = JIS;
+ goto breakBreak;
+ } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
+ code = JIS;
+ i += 3;
+ } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
+ code = JIS;
+ i += 3;
+ } else {
+ i++;
+ }
+ bfr = FALSE;
+ bfk = 0;
+ } else {
+ if (ptr[i] < 0x20) {
+ bfr = FALSE;
+ bfk = 0;
+ /* ?? check kudokuten ?? && ?? hiragana ?? */
+ if ((i >= 2) && (ptr[i - 2] == 0x81)
+ && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
+ code = SJIS;
+ sjis += 100; /* kudokuten */
+ } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
+ && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
+ code = EUC;
+ euc += 100; /* kudokuten */
+ } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
+ sjis += 40; /* hiragana */
+ } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
+ euc += 40; /* hiragana */
+ }
+ } else {
+ /* ?? check hiragana or katakana ?? */
+ if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
+ sjis++; /* hiragana */
+ } else if ((size - i > 1) && (ptr[i] == 0x83)
+ && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
+ sjis++; /* katakana */
+ } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
+ euc++; /* hiragana */
+ } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
+ euc++; /* katakana */
+ }
+ if (bfr) {
+ if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
+ code = SJIS;
+ goto breakBreak;
+ } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
+ code = SJIS;
+ goto breakBreak;
+ } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
+ code = EUC;
+ goto breakBreak;
+ } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
+ code = EUC;
+ goto breakBreak;
+ } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
+ code = SJIS;
+ goto breakBreak;
+ } else if (ptr[i] <= 0x7f) {
+ code = SJIS;
+ goto breakBreak;
+ } else {
+ if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
+ euc++; /* sjis hankaku kana kigo */
+ } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
+ ; /* sjis hankaku kana */
+ } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
+ euc++;
+ } else if (0x8e == ptr[i]) {
+ euc++;
+ } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
+ sjis++;
+ }
+ bfr = FALSE;
+ bfk = 0;
+ }
+ } else if (0x8e == ptr[i]) {
+ if (size - i <= 1) {
+ ;
+ } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
+ /* EUC KANA or SJIS KANJI */
+ if (bfk == 1) {
+ euc += 100;
+ }
+ bfk++;
+ i++;
+ } else {
+ /* SJIS only */
+ code = SJIS;
+ goto breakBreak;
+ }
+ } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
+ /* SJIS only */
+ code = SJIS;
+ if ((size - i >= 1)
+ && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
+ || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
+ goto breakBreak;
+ }
+ } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
+ /* EUC only */
+ code = EUC;
+ if ((size - i >= 1)
+ && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
+ goto breakBreak;
+ }
+ } else if (ptr[i] <= 0x7f) {
+ ;
+ } else {
+ bfr = TRUE;
+ bfk = 0;
+ }
+ }
+ i++;
+ }
+ }
+ if (code == ASCII) {
+ if (sjis > euc) {
+ code = SJIS;
+ } else if (sjis < euc) {
+ code = EUC;
+ }
+ }
+breakBreak:
+ return (code);
+}
Decoder::Decoder()
{
@@ -384,22 +456,32 @@
}
found:
- if (!haveEncoding && KGlobal::locale()->country() == "jp" )
- {
- switch ( detect_kanji( (unsigned char*)buffer.data() ) ) {
- case _JIS_:
- enc = "jis7";
- break;
- case _EUC_:
- enc = "eucjp";
- break;
- case _SJIS_:
- enc = "sjis";
- break;
- }
- if (!enc.isEmpty())
- setEncoding(enc, true);
+ if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
+#ifdef DECODE_DEBUG
+ kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
+#endif
+ switch ( KanjiCode::judge( data ) ) {
+ case KanjiCode::JIS:
+ enc = "jis7";
+ break;
+ case KanjiCode::EUC:
+ enc = "eucjp";
+ break;
+ case KanjiCode::SJIS:
+ enc = "sjis";
+ break;
+ default:
+ enc = NULL;
+ break;
+ }
+#ifdef DECODE_DEBUG
+ kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
+#endif
+ if (!enc.isEmpty()) {
+ setEncoding(enc, true);
+ }
}
+
// if we still haven't found an encoding latin1 will be used...
// this is according to HTML4.0 specs
if (!m_codec)
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic