[prev in list] [next in list] [prev in thread] [next in thread]
List: kfm-devel
Subject: [PATCH] Automatic charset detection for Japanese
From: Waldo Bastian <bastian () kde ! org>
Date: 2002-02-14 7:37:45
[Download RAW message or body]
Attached is a patch, mostly by Masaji Takeyama. It adds automatic charset
detection for the case where no charset is specified. According to the W3C we should
default to Latin-1 in that case, but for Japanese users that is very often wrong.
Cheers,
Waldo
--
bastian@kde.org | SuSE Labs KDE Developer | bastian@suse.com
["kdelibs-13feb2002-khtml-autodetect-jp.patch" (text/x-diff)]
Index: Makefile.am
===================================================================
RCS file: /home/kde/kdelibs/khtml/misc/Makefile.am,v
retrieving revision 1.20
diff -u -r1.20 Makefile.am
--- Makefile.am 2001/12/28 00:07:22 1.20
+++ Makefile.am 2002/02/14 07:36:39
@@ -22,13 +22,13 @@
noinst_LTLIBRARIES = libkhtmlmisc.la
libkhtmlmisc_la_SOURCES = \
decoder.cpp loader.cpp loader_jpeg.cpp \
- htmlhashes.cpp helper.cpp
+ htmlhashes.cpp helper.cpp jconv.c
libkhtmlmisc_la_METASOURCES = AUTO
noinst_HEADERS = \
decoder.h khtmllayout.h loader_jpeg.h loader.h \
- stringit.h htmlhashes.h helper.h
+ stringit.h htmlhashes.h helper.h jconv.h
INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/dcop -I$(top_srcdir)/kio -I$(top_srcdir)/libltdl \
-I$(top_srcdir)/khtml $(all_includes)
Index: decoder.cpp
===================================================================
RCS file: /home/kde/kdelibs/khtml/misc/decoder.cpp,v
retrieving revision 1.50
diff -u -r1.50 decoder.cpp
--- decoder.cpp 2001/07/29 16:26:38 1.50
+++ decoder.cpp 2002/02/14 07:36:40
@@ -30,6 +30,8 @@
using namespace khtml;
#include "htmlhashes.h"
+#include "jconv.h"
+
#include <qregexp.h>
#include <qtextcodec.h>
@@ -38,6 +40,7 @@
#include <ctype.h>
#include <kdebug.h>
+#include <klocale.h>
Decoder::Decoder()
{
@@ -222,6 +225,23 @@
#ifdef DECODE_DEBUG
kdDebug( 6005 ) << "Decoder: no charset found, using latin1. Id=" << id << endl;
#endif
+ if ( KGlobal::locale()->country() == "jp" ) {
+ switch ( detect_kanji( (unsigned char*)buffer.data() ) ) {
+ case _JIS_:
+ enc = "jis7";
+ break;
+ case _EUC_:
+ enc = "eucjp";
+ break;
+ case _SJIS_:
+ enc = "sjis";
+ break;
+ default:
+ enc = "iso8859-1";
+ break;
+ }
+ setEncoding(enc, true);
+ }
goto found;
}
}
Index: jconv.c
===================================================================
RCS file: jconv.c
diff -N jconv.c
--- /dev/null Fri Feb 1 11:53:05 2002
+++ jconv.c Thu Feb 14 08:36:40 2002
@@ -0,0 +1,160 @@
+/*
+ This file is part of the KDE libraries. It is extracted from libjconv.
+
+ Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
+ Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
+ Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "jconv.h"
+
+int detect_kanji(unsigned char *str) /* guess the Japanese encoding of the 0-terminated byte string str */
+{ /* returns _JIS_, _EUC_ or _SJIS_; returns 0 only when str is NULL */
+ int expected = _ASCII_; /* always reassigned by pass 2 before the final return */
+ register int c;
+ int c1, c2; /* pass 2: c1 = current byte, c2 = previous byte */
+ int euc_c = 0, sjis_c = 0; /* number of EUC-looking / SJIS-looking byte pairs */
+ unsigned char *ptr;
+ /* No length argument: every scan below stops at the first 0 byte. */
+ if(!str) return (0);
+ /* Pass 1: return as soon as some byte sequence decides the encoding. */
+ ptr = str;
+ while ((c = (int)*ptr)!= '\0')
+ {
+ if (c == _ESC_) /* ESC '$' 'B' or ESC '$' '@' => _JIS_ */
+ {
+ if ((c = (int)*(++ptr)) == '\0') break;
+ if (c == '$')
+ {
+ if ((c = (int)*(++ptr)) == '\0') break;
+ if (c == 'B' || c == '@') return _JIS_;
+ }
+ ptr++; /* any other escape sequence: skip the byte just examined */
+ continue;
+ }
+ /* A byte in 0x81-0x8d or 0x8f-0x9f is taken as SJIS outright. */
+ if ((c >= 0x81 && c <= 0x8d) || (c >= 0x8f && c <= 0x9f))
+ return _SJIS_;
+ /* _SS2_ (0x8e) is excluded above; the byte that follows it decides. */
+ if (c == _SS2_)
+ {
+ if ((c = (int)*(++ptr)) == '\0') break;
+ if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0) ||
+ (c >= 0xe0 && c <= 0xfc)) return _SJIS_;
+ if (c >= 0xa1 && c <= 0xdf) break; /* still undecided: leave pass 1, go to pass 2 */
+ ptr++;
+ continue;
+ }
+ /* Byte in 0xa1-0xdf: classify by the byte that follows it. */
+ if (c >= 0xa1 && c <= 0xdf) /* euc or kana */
+ {
+ if((c = (int)*(++ptr)) == '\0') break;
+
+ if (c >= 0xe0 && c <= 0xfe)
+ return _EUC_;
+ if (c >= 0xa1 && c <= 0xdf)
+ {
+ expected = _EUCORSJIS_;
+ ptr++;
+ continue;
+ }
+#if 1
+ if (c == 0xa0 || (0xe0 <= c && c <= 0xfe))
+ return _EUC_;
+ else
+ {
+ expected = _EUCORSJIS_;
+ ptr++;
+ continue;
+ }
+#else
+ if (c <= 0x9f) return _SJIS_;
+ if (c >= 0xf0 && c <= 0xfe) return _EUC_;
+#endif
+ /* NOTE(review): with "#if 1" active both arms above return or continue, so the rest of this block is unreachable. */
+ if (c >= 0xe0 && c <= 0xef)
+ {
+ expected = _EUCORSJIS_;
+ while (c >= 0x40)
+ {
+ if (c >= 0x81)
+ {
+ if (c <= 0x8d || (c >= 0x8f && c <= 0x9f))
+ return _SJIS_;
+ else if (c >= 0xfd && c <= 0xfe)
+ {
+ return _EUC_;
+ }
+ }
+ if ((c = (int)*(++ptr)) == '\0') break;
+ }
+ ptr++;
+ continue;
+ }
+
+ if (c >= 0xe0 && c <= 0xef)
+ {
+ if ((c = (int)*(++ptr)) == '\0') break;
+ if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0))
+ return _SJIS_;
+ if (c >= 0xfd && c <= 0xfe) return _EUC_;
+ if (c >= 0xa1 && c <= 0xfc)
+ expected = _EUCORSJIS_;
+ }
+ }
+#if 1
+ if (0xf0 <= c && c <= 0xfe) /* current byte 0xf0-0xfe => _EUC_ */
+ return _EUC_;
+#endif
+ ptr++;
+ }
+ /* Pass 2: nothing decisive was found; rescan from the start counting SJIS-looking pairs. */
+ ptr = str;
+ c2 = 0;
+ while ((c1 = (int)*ptr++) != '\0')
+ {
+ if (((c2 > 0x80 && c2 < 0xa0) || (c2 >= 0xe0 && c2 < 0xfd)) &&
+ ((c1 >= 0x40 && c1 < 0x7f) || (c1 >= 0x80 && c1 < 0xfd)))
+ sjis_c++, c1 = *ptr++; /* NOTE(review): consumes a byte with no 0 check; if it is the terminator the loop reads past it */
+ c2 = c1;
+ }
+ if (sjis_c == 0) /* no SJIS-looking pair at all (this includes pure ASCII input) => _EUC_ */
+ expected = _EUC_;
+ else
+ {
+ ptr = str, c2 = 0; /* rescan, now counting pairs whose two bytes are both in 0xa1-0xfe */
+ while ((c1 = (int)*ptr++) != '\0')
+ {
+ if ((c2 > 0xa0 && c2 < 0xff) &&
+ (c1 > 0xa0 && c1 < 0xff))
+ euc_c++, c1 = *ptr++; /* NOTE(review): same unchecked extra read as in the SJIS count above */
+ c2 = c1;
+ }
+ if (sjis_c > euc_c) /* a tie goes to _EUC_ */
+ expected = _SJIS_;
+ else
+ expected = _EUC_;
+ }
+ return expected;
+}
Index: jconv.h
===================================================================
RCS file: jconv.h
diff -N jconv.h
--- /dev/null Fri Feb 1 11:53:05 2002
+++ jconv.h Thu Feb 14 08:36:41 2002
@@ -0,0 +1,42 @@
+/* jconv.h */
+/*
+ This file is part of the KDE libraries. It is extracted from libjconv.
+
+ Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
+ Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
+ Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+*/
+
+#ifndef __JCONV_H__
+#define __JCONV_H__
+
+#define _ASCII_ 0 /* initial value of detect_kanji's internal guess */
+#define _JIS_ 1 /* detect_kanji result: ESC '$' 'B' or ESC '$' '@' was seen */
+#define _EUC_ 2 /* detect_kanji result: input judged to be EUC */
+#define _SJIS_ 3 /* detect_kanji result: input judged to be SJIS */
+#define _EUCORSJIS_ 4 /* intermediate "undecided" state inside detect_kanji */
+#define _ESC_ 0x1b /* byte that starts the escape sequences detect_kanji checks for _JIS_ */
+#define _SS2_ 0x8e /* byte detect_kanji handles separately; presumably EUC single-shift 2 - confirm */
+
+#ifdef __cplusplus
+extern "C" /* C linkage so C++ callers can link against the C definition */
+#endif
+int detect_kanji(unsigned char *); /* takes a 0-terminated byte string; returns one of the codes above */
+
+#endif
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic