[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kfm-devel
Subject:    [PATCH] Automatic charset detection for Japanese
From:       Waldo Bastian <bastian () kde ! org>
Date:       2002-02-14 7:37:45
[Download RAW message or body]

Attached is a patch, mostly by Masaji Takeyama. It adds automatic charset 
detection for when no charset is specified. According to the W3C we should default 
to Latin-1 in that case, but for Japanese users that is very often wrong.

Cheers,
Waldo
-- 
bastian@kde.org  |   SuSE Labs KDE Developer  |  bastian@suse.com

["kdelibs-13feb2002-khtml-autodetect-jp.patch" (text/x-diff)]

Index: Makefile.am
===================================================================
RCS file: /home/kde/kdelibs/khtml/misc/Makefile.am,v
retrieving revision 1.20
diff -u -r1.20 Makefile.am
--- Makefile.am	2001/12/28 00:07:22	1.20
+++ Makefile.am	2002/02/14 07:36:39
@@ -22,13 +22,13 @@
 noinst_LTLIBRARIES = libkhtmlmisc.la
 libkhtmlmisc_la_SOURCES = \
 	decoder.cpp    loader.cpp loader_jpeg.cpp \
-	htmlhashes.cpp helper.cpp 
+	htmlhashes.cpp helper.cpp jconv.c
 
 libkhtmlmisc_la_METASOURCES = AUTO
 
 noinst_HEADERS = \
 	decoder.h      khtmllayout.h loader_jpeg.h loader.h \
-	stringit.h     htmlhashes.h    helper.h 
+	stringit.h     htmlhashes.h    helper.h jconv.h
 
 INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/dcop -I$(top_srcdir)/kio -I$(top_srcdir)/libltdl \
            -I$(top_srcdir)/khtml $(all_includes)
Index: decoder.cpp
===================================================================
RCS file: /home/kde/kdelibs/khtml/misc/decoder.cpp,v
retrieving revision 1.50
diff -u -r1.50 decoder.cpp
--- decoder.cpp	2001/07/29 16:26:38	1.50
+++ decoder.cpp	2002/02/14 07:36:40
@@ -30,6 +30,8 @@
 using namespace khtml;
 
 #include "htmlhashes.h"
+#include "jconv.h"
+
 #include <qregexp.h>
 #include <qtextcodec.h>
 
@@ -38,6 +40,7 @@
 
 #include <ctype.h>
 #include <kdebug.h>
+#include <klocale.h>
 
 Decoder::Decoder()
 {
@@ -222,6 +225,23 @@
 #ifdef DECODE_DEBUG
 			kdDebug( 6005 ) << "Decoder: no charset found, using latin1. Id=" << id << endl;
 #endif
+			if ( KGlobal::locale()->country() == "jp" ) {
+			    switch ( detect_kanji( (unsigned char*)buffer.data() ) ) {
+			    case _JIS_:
+			        enc = "jis7";
+				break;
+			    case _EUC_:
+				enc = "eucjp";
+                                break;
+			    case _SJIS_:
+                                enc = "sjis";
+                                break;
+			    default:
+                                enc = "iso8859-1";
+				break;
+			    }			
+			    setEncoding(enc, true);
+			}
                         goto found;
                     }
                 }
Index: jconv.c
===================================================================
RCS file: jconv.c
diff -N jconv.c
--- /dev/null	Fri Feb  1 11:53:05 2002
+++ jconv.c	Thu Feb 14 08:36:40 2002
@@ -0,0 +1,160 @@
+/*
+    This file is part of the KDE libraries. It is extracted from libjconv.
+
+    Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
+    Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
+    Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB.  If not, write to
+    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+    Boston, MA 02111-1307, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "jconv.h"
+
+/*
+ * Count the two-byte sequences in the NUL-terminated string ptr that are
+ * plausible Shift_JIS characters: a lead byte in 0x81-0x9f or 0xe0-0xfc
+ * followed by a trail byte in 0x40-0x7e or 0x80-0xfc.
+ */
+static int count_sjis_pairs(const unsigned char *ptr)
+{
+   int count = 0;
+   int c1, c2 = 0;
+
+   while ((c1 = (int)*ptr++) != '\0')
+     {
+	if (((c2 >  0x80 && c2 < 0xa0) || (c2 >= 0xe0 && c2 < 0xfd)) &&
+	    ((c1 >= 0x40 && c1 < 0x7f) || (c1 >= 0x80 && c1 < 0xfd)))
+	  {
+	     count++;
+	     /*
+	      * The byte after the pair becomes the next lead candidate.
+	      * Stop if it is the terminating NUL: consuming it would move
+	      * ptr past the end of the string and the loop would go on
+	      * reading memory outside the buffer.
+	      */
+	     if ((c1 = (int)*ptr) == '\0') break;
+	     ptr++;
+	  }
+	c2 = c1;
+     }
+   return count;
+}
+
+/*
+ * Count the two-byte sequences in the NUL-terminated string ptr whose
+ * bytes both lie in 0xa1-0xfe, the range of EUC-JP double-byte characters.
+ */
+static int count_euc_pairs(const unsigned char *ptr)
+{
+   int count = 0;
+   int c1, c2 = 0;
+
+   while ((c1 = (int)*ptr++) != '\0')
+     {
+	if ((c2 > 0xa0 && c2 < 0xff) && (c1 > 0xa0 && c1 < 0xff))
+	  {
+	     count++;
+	     /*
+	      * Never consume the terminating NUL; otherwise the loop
+	      * would read past the end of the string.
+	      */
+	     if ((c1 = (int)*ptr) == '\0') break;
+	     ptr++;
+	  }
+	c2 = c1;
+     }
+   return count;
+}
+
+/*
+ * Guess which Japanese encoding the NUL-terminated byte string str uses.
+ *
+ * Returns _ASCII_ (0) when str is NULL, otherwise _JIS_, _EUC_ or _SJIS_.
+ * Text without any decisive byte, including plain ASCII, yields _EUC_.
+ *
+ * The string is first scanned for byte sequences that settle the answer
+ * immediately.  If that is inconclusive, the Shift_JIS-looking and the
+ * EUC-JP-looking byte pairs are counted; Shift_JIS is reported only when
+ * it has strictly more pairs.
+ */
+int detect_kanji(unsigned char *str)
+{
+   int c;
+   int euc_c, sjis_c;
+   unsigned char *ptr;
+
+   if (!str) return _ASCII_;
+
+   /* Scan for a sequence that identifies the encoding outright. */
+   ptr = str;
+   while ((c = (int)*ptr) != '\0')
+     {
+	if (c == _ESC_)
+	  {
+	     /* ESC $ B and ESC $ @ switch ISO-2022-JP (JIS) text to kanji. */
+	     if ((c = (int)*(++ptr)) == '\0') break;
+	     if (c == '$')
+	       {
+		  if ((c = (int)*(++ptr)) == '\0') break;
+		  if (c == 'B' || c == '@') return _JIS_;
+	       }
+	     ptr++;
+	     continue;
+	  }
+
+	/* Lead bytes taken as proof of Shift_JIS; 0x8e is handled below. */
+	if ((c >= 0x81 && c <= 0x8d) || (c >= 0x8f && c <= 0x9f))
+	  return _SJIS_;
+
+	if (c == _SS2_)
+	  {
+	     if ((c = (int)*(++ptr)) == '\0') break;
+	     if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0) ||
+		 (c >= 0xe0 && c <= 0xfc)) return _SJIS_;
+	     /* 0xa1-0xdf fits both encodings: decide by counting. */
+	     if (c >= 0xa1 && c <= 0xdf) break;
+	     ptr++;
+	     continue;
+	  }
+
+	if (c >= 0xa1 && c <= 0xdf) /* euc or kana */
+	  {
+	     if ((c = (int)*(++ptr)) == '\0') break;
+	     if (c == 0xa0 || (c >= 0xe0 && c <= 0xfe))
+	       return _EUC_;
+	     /* Every other second byte is inconclusive: skip the pair. */
+	     ptr++;
+	     continue;
+	  }
+
+	if (c >= 0xf0 && c <= 0xfe)
+	  return _EUC_;
+	ptr++;
+     }
+
+   /* No decisive sequence was found: fall back to counting pairs. */
+   sjis_c = count_sjis_pairs(str);
+   if (sjis_c == 0)
+     return _EUC_;
+   euc_c = count_euc_pairs(str);
+   return (sjis_c > euc_c) ? _SJIS_ : _EUC_;
+}
Index: jconv.h
===================================================================
RCS file: jconv.h
diff -N jconv.h
--- /dev/null	Fri Feb  1 11:53:05 2002
+++ jconv.h	Thu Feb 14 08:36:41 2002
@@ -0,0 +1,42 @@
+/* jconv.h */
+/*
+    This file is part of the KDE libraries. It is extracted from libjconv.
+
+    Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
+    Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
+    Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB.  If not, write to
+    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+    Boston, MA 02111-1307, USA.
+
+*/
+
+#ifndef __JCONV_H__
+#define __JCONV_H__
+
+#define _ASCII_         0      /* detect_kanji() returns this only for a NULL argument */
+#define _JIS_           1      /* 7-bit JIS, recognised by its ESC $ B / ESC $ @ sequences */
+#define _EUC_           2      /* EUC-JP */
+#define _SJIS_          3      /* Shift_JIS */
+#define _EUCORSJIS_     4      /* undecided between EUC-JP and Shift_JIS; never returned by detect_kanji() */
+#define _ESC_           0x1b   /* ASCII ESC, first byte of the sequences checked for _JIS_ */
+#define _SS2_           0x8e   /* single-shift-2 byte; detect_kanji() inspects the byte that follows it */
+
+#ifdef __cplusplus
+extern "C"                     /* C linkage so C++ callers can link against jconv.c */
+#endif
+int detect_kanji(unsigned char *);   /* guesses the encoding of a NUL-terminated byte string */
+
+#endif /* __JCONV_H__ */


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic