'[PATCH] better Japanese detection algorithm'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kfm-devel
Subject:    [PATCH] better Japanese detection algorithm
From:       Takumi ASAKI <asataku () osk3 ! 3web ! ne ! jp>
Date:       2002-02-21 13:31:16
[Download RAW message or body]

Hi!

I'm happy that khtml supports Japanese auto-detection.
However, the current code sometimes fails at auto-detection,
because detect_kanji() returns EUC-JP for ASCII strings.

So I suggest changing the detection algorithm.
Here is a patch that switches to a different algorithm.
It works better.

Please review it.

-- 
  Che Che - Bye Bye
        From: Takumi ASAKI <asataku@osk3.3web.ne.jp>
	URL: http://www3.osk.3web.ne.jp/~asataku/

["kdelibs-3.0-cvs-khtml-auto-detect-japanese-20020219.diff" (text/x-diff)]

diff -ur /home/asaki/src/KDE/kde/kdelibs/khtml/misc/decoder.cpp \
                kdelibs/khtml/misc/decoder.cpp
--- /home/asaki/src/KDE/kde/kdelibs/khtml/misc/decoder.cpp	Sun Feb 17 14:14:24 2002
+++ kdelibs/khtml/misc/decoder.cpp	Tue Feb 19 20:42:19 2002
@@ -41,154 +41,226 @@
 #include <kdebug.h>
 #include <klocale.h>
 
-// Kanji detection - start
-//
-// Extracted from libjconv. libjconv is licensed LGPL.
-//
-//  Copyright (C) 1999-2000 Toru Hoshina <t@kondara.org>
-//  Copyright (C) 1999-2000 Shingo Akagaki <dora@kondara.org>
-//  Copyright (C) 1999-2000 Akira Higuchi <a@kondara.org>
-
-#define _ASCII_         0
-#define _JIS_           1
-#define _EUC_           2
-#define _SJIS_          3
-#define _EUCORSJIS_     4
-#define _ESC_           0x1b
-#define _SS2_           0x8e
-
-static int detect_kanji(unsigned char *str)
+class KanjiCode
 {
-   int expected = _ASCII_;
-   register int c;
-   int c1, c2;
-   int euc_c = 0, sjis_c = 0;
-   unsigned char *ptr;
-
-   if(!str) return (0);
-
-   ptr = str;
-   while ((c = (int)*ptr)!= '\0')
-     {
-	if (c == _ESC_)
-	  {
-	     if ((c = (int)*(++ptr)) == '\0') break;
-	     if (c == '$')
-	       {
-		  if ((c = (int)*(++ptr)) == '\0') break;
-		  if (c == 'B' || c == '@') return _JIS_;
-	       }
-	     ptr++;
-	     continue;
-	  }
-
-	if ((c >= 0x81 && c <= 0x8d) || (c >= 0x8f && c <= 0x9f))
-	  return _SJIS_;
-
-	if (c == _SS2_)
-	  {
-	     if ((c = (int)*(++ptr)) == '\0') break;
-	     if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0) || 
-		 (c >= 0xe0 && c <= 0xfc)) return _SJIS_;
-	     if (c >= 0xa1 && c <= 0xdf) break;
-	     ptr++;
-	     continue;
-	  }
-
-	if (c >= 0xa1 && c <= 0xdf) /* euc or kana */
-	  {
-	     if((c = (int)*(++ptr)) == '\0') break;
-      
-	     if (c >= 0xe0 && c <= 0xfe)
-	       return _EUC_;
-	     if (c >= 0xa1 && c <= 0xdf)
-	       {
-		  expected = _EUCORSJIS_;
-		  ptr++;
-		  continue;
-	       }
-#if 1
-	     if (c == 0xa0 || (0xe0 <= c && c <= 0xfe))
-	       return _EUC_;
-	     else
-	       {
-		  expected = _EUCORSJIS_;
-		  ptr++;
-		  continue;
-	       }
-#else
-	     if (c <= 0x9f) return _SJIS_;
-	     if (c >= 0xf0 && c <= 0xfe) return _EUC_;
-#endif
-     
-	     if (c >= 0xe0 && c <= 0xef)
-	       {
-		  expected = _EUCORSJIS_;
-		  while (c >= 0x40)
-		    {
-		       if (c >= 0x81)
-			 {
-			    if (c <= 0x8d || (c >= 0x8f && c <= 0x9f))
-			      return _SJIS_;
-			    else if (c >= 0xfd && c <= 0xfe)
-			      {
-				 return _EUC_;
-			      }
-			 }
-		       if ((c = (int)*(++ptr)) == '\0') break;
-		    }
-		  ptr++;
-		  continue;
-	       }
-
-	     if (c >= 0xe0 && c <= 0xef)
-	       {
-		  if ((c = (int)*(++ptr)) == '\0') break;
-		  if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xa0))
-		    return _SJIS_;
-		  if (c >= 0xfd && c <= 0xfe) return _EUC_;
-		  if (c >= 0xa1 && c <= 0xfc)
-		    expected = _EUCORSJIS_;
-	       }
-	  }
-#if 1
-	if (0xf0 <= c && c <= 0xfe)
-	  return _EUC_;
-#endif
-	ptr++;
-     }
+public:
+    enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };
+    static enum Type judge(const char *str);
+    static const int ESC = 0x1b;
+    static const int _SS2_ = 0x8e;
+    static const unsigned char kanji_map_sjis[];
+    static int ISkanji(int code)	/* nonzero iff bit 0 is set in kanji_map_sjis for this byte (the table sets it for 0x81-0x9f and 0xe0-0xfc) */
+    {
+	if (code >= 0x100)	/* values that do not fit in one byte are rejected */
+		    return 0;
+	return (kanji_map_sjis[code & 0xff] & 1);	/* only the low 8 bits index the table */
+    }
 
-   ptr = str;
-   c2 = 0;
-   while ((c1 = (int)*ptr++) != '\0')
-     {
-	if (((c2 >  0x80 && c2 < 0xa0) || (c2 >= 0xe0 && c2 < 0xfd)) &&
-	    ((c1 >= 0x40 && c1 < 0x7f) || (c1 >= 0x80 && c1 < 0xfd)))
-	  sjis_c++, c1 = *ptr++;
-	c2 = c1;
-     }
-   if (sjis_c == 0)
-     expected = _EUC_;
-   else
-     {
-	ptr = str, c2 = 0;
-	while ((c1 = (int)*ptr++) != '\0')
-	  {
-	     if ((c2 > 0xa0  && c2 < 0xff) &&
-		 (c1 > 0xa0  && c1 < 0xff))
-	       euc_c++, c1 = *ptr++;
-	     c2 = c1;
-	  }
-	if (sjis_c > euc_c)
-	  expected = _SJIS_;
-	else
-	  expected = _EUC_;
-     }
-   return expected;
-}
+    static int ISkana(int code)	/* nonzero (2) iff bit 1 is set in kanji_map_sjis for this byte (the table sets it for 0xa1-0xdf) */
+    {
+	if (code >= 0x100)	/* values that do not fit in one byte are rejected */
+		    return 0;
+	return (kanji_map_sjis[code & 0xff] & 2);	/* only the low 8 bits index the table */
+    }
 
-// Kanji detection - end
+};
 
+const unsigned char KanjiCode::kanji_map_sjis[] =	/* per-byte flags: 1 is tested by ISkanji(), 2 is tested by ISkana() */
+{
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00-0x0f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10-0x1f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x20-0x2f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x30-0x3f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x40-0x4f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x50-0x5f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x60-0x6f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x70-0x7f */
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x80-0x8f: 0x80 is unflagged */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x90-0x9f */
+    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xa0-0xaf: 0xa0 is unflagged */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xb0-0xbf */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xc0-0xcf */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xd0-0xdf */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xe0-0xef */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0	/* 0xf0-0xff: 0xfd-0xff are unflagged */
+};
+
+/*
+ * EUC-JP is
+ *     [0xa1 - 0xfe][0xa1 - 0xfe]
+ *     0x8e[0xa1 - 0xfe](SS2)
+ *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
+ *
+ * Shift_Jis is
+ *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
+ *
+ * Shift_Jis Hankaku Kana is
+ *     [0xa1 - 0xdf]
+ */
+
+/*
+ * KanjiCode::judge() guesses which Japanese encoding the NUL-terminated
+ * string str (non-NULL; strlen() is called on it) uses and returns
+ * ASCII, JIS, EUC or SJIS.  Based on judge_jcode() from jvim
+ *     http://hp.vector.co.jp/authors/VA003457/vim/
+ * Special thanks to Kenichi Tsuchida.
+ *
+ * ESC $ B, ESC ( B, ESC $ @, ESC ( J, or a byte pair accepted by only
+ * one encoding, ends the scan at once; if code is still ASCII at the
+ * end, the larger sjis/euc score selects SJIS or EUC (a tie keeps ASCII).
+ * QTextCodec::heuristicContentMatch() was considered instead, but it
+ * fails to detect the encoding, so it is not used.
+ */
+enum KanjiCode::Type KanjiCode::judge(const char *str)
+{
+    enum Type code;
+    int i;
+    int bfr = FALSE;		/* TRUE when the previous byte was 0x80 or 0xa0-0xfc, i.e. not decisive alone (orig. note: "Kana Moji") */
+    int bfk = 0;		/* number of 0x8e + [0xa1-0xdf] pairs (possible EUC kana) seen since the last reset */
+    int sjis = 0;	/* score in favour of SJIS */
+    int euc = 0;	/* score in favour of EUC */
+
+    const unsigned char *ptr = (const unsigned char *) str;
+    size_t size = strlen(str);
+
+    code = ASCII;
+
+    i = 0;
+    while (i < size) {
+	if (ptr[i] == ESC && (size - i >= 3)) {	/* ESC with at least two more bytes after it */
+	    if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
+	    || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
+		code = JIS;
+		goto breakBreak;
+	    } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
+		    || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
+		code = JIS;
+		goto breakBreak;
+	    } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
+		code = JIS;	/* remembered, but scanning continues */
+		i += 3;
+	    } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
+		code = JIS;	/* remembered, but scanning continues */
+		i += 3;
+	    } else {
+		i++;	/* unrecognised escape: skip only the ESC byte */
+	    }
+	    bfr = FALSE;
+	    bfk = 0;
+	} else {
+	    if (ptr[i] < 0x20) {
+		bfr = FALSE;
+		bfk = 0;
+		/* before a control char: check the two preceding bytes for punctuation or hiragana */
+		if ((i >= 2) && (ptr[i - 2] == 0x81)
+			&& (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
+		    code = SJIS;
+		    sjis += 100;	/* punctuation */
+		} else if ((i >= 2) && (ptr[i - 2] == 0xa1)
+			&& (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
+		    code = EUC;
+		    euc += 100;		/* punctuation */
+		} else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
+		    sjis += 40;		/* hiragana */
+		} else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
+		    euc += 40;	/* hiragana */
+		}
+	    } else {
+		/* check hiragana or katakana */
+		if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
+		    sjis++;	/* hiragana */
+		} else if ((size - i > 1) && (ptr[i] == 0x83)
+			 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
+		    sjis++;	/* katakana */
+		} else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
+		    euc++;	/* hiragana */
+		} else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
+		    euc++;	/* katakana */
+		}
+		if (bfr) {	/* judge ptr[i] together with the previous byte ptr[i - 1] */
+		    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
+			code = SJIS;
+			goto breakBreak;
+		    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= \
ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { +			code = SJIS;
+			goto breakBreak;	/* NOTE(review): the test above skips 0x7e rather than 0x7f - confirm this is intended */
+		    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - \
1] && ptr[i - 1] <= 0xfe)) { +			code = EUC;
+			goto breakBreak;
+		    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= \
ptr[i] && ptr[i] <= 0xfe)) { +			code = EUC;
+			goto breakBreak;
+		    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - \
1])) { +			code = SJIS;
+			goto breakBreak;
+		    } else if (ptr[i] <= 0x7f) {
+			code = SJIS;
+			goto breakBreak;
+		    } else {
+			if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
+			    euc++;	/* SJIS half-width kana symbols */
+			} else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
+			    ;	/* SJIS half-width kana */
+			} else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
+			    euc++;
+			} else if (0x8e == ptr[i]) {
+			    euc++;
+			} else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
+			    sjis++;
+			}
+			bfr = FALSE;
+			bfk = 0;
+		    }
+		} else if (0x8e == ptr[i]) {
+		    if (size - i <= 1) {
+			;	/* 0x8e is the last byte: nothing to pair it with */
+		    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
+			/* EUC KANA or SJIS KANJI */
+			if (bfk == 1) {	/* exactly one such pair was counted before this one */
+			    euc += 100;
+			}
+			bfk++;
+			i++;	/* step over the second byte of the pair */
+		    } else {
+			/* SJIS only */
+			code = SJIS;
+			goto breakBreak;
+		    }
+		} else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
+		    /* SJIS only */
+		    code = SJIS;
+		    if ((size - i >= 1)	/* NOTE(review): always true here because i < size; ptr[i + 1] may be the terminating NUL */
+			    && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
+			    || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
+			goto breakBreak;
+		    }
+		} else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
+		    /* EUC only */
+		    code = EUC;
+		    if ((size - i >= 1)	/* NOTE(review): always true here because i < size; ptr[i + 1] may be the terminating NUL */
+			    && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
+			goto breakBreak;
+		    }
+		} else if (ptr[i] <= 0x7f) {
+		    ;	/* bytes 0x20-0x7f change no state here */
+		} else {
+		    bfr = TRUE;	/* remaining bytes (0x80, 0xa0-0xfc): decide when the next byte is read */
+		    bfk = 0;
+		}
+	    }
+	    i++;
+	}
+    }
+    if (code == ASCII) {	/* nothing was decided during the scan: fall back to the scores */
+	if (sjis > euc) {
+	    code = SJIS;
+	} else if (sjis < euc) {
+	    code = EUC;
+	}
+    }
+breakBreak:
+    return (code);
+}
 
 Decoder::Decoder()
 {
@@ -384,22 +456,32 @@
     }
 
  found:
-    if (!haveEncoding && KGlobal::locale()->country() == "jp" ) 
-    {
-        switch ( detect_kanji( (unsigned char*)buffer.data() ) ) {
-            case _JIS_:
-                enc = "jis7";
-                break;
-	    case _EUC_:
-		enc = "eucjp";
-                break;
-	    case _SJIS_:
-                enc = "sjis";
-                break;
-        }
-        if (!enc.isEmpty())
-            setEncoding(enc, true);
+    if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
+#ifdef DECODE_DEBUG
+	kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
+#endif
+	switch ( KanjiCode::judge( data ) ) {
+	case KanjiCode::JIS:
+	    enc = "jis7";
+	    break;
+	case KanjiCode::EUC:
+	    enc = "eucjp";
+	    break;
+	case KanjiCode::SJIS:
+	    enc = "sjis";
+	    break;
+	default:
+	    enc = NULL;
+	    break;
+	}
+#ifdef DECODE_DEBUG
+	kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
+#endif
+	if (!enc.isEmpty()) {
+	    setEncoding(enc, true);
+	}
     }
+
     // if we still haven't found an encoding latin1 will be used...
     // this is according to HTML4.0 specs
     if (!m_codec)



[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic