'Fwd: Two bytes encoding.'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kmail-devel
Subject:    Fwd: Two bytes encoding.
From:       Michael =?iso-8859-1?q?H=E4ckel?= <Michael () Haeckel ! Net>
Date:       2001-02-23 21:11:53
[Download RAW message or body]

Hi,

in case someone is interested:
This code fixes the writing of mails with Japanese headers. However, it
introduces some new problems for non-Japanese users.
I will most likely commit a reworked version of that code when I have more 
time for it, hopefully next week.

Regards,
Michael Häckel

----------  Forwarded Message  ----------
Subject: Two bytes encoding.
Date: Fri, 23 Feb 2001 15:10:28 +0900
From: Toyohiro <toyohiro@ksmplus.com>
To: Michael Häckel <Michael@haeckel.net>
Cc: toyohiro@ksmplus.com


If I have sent this mail to you twice, I am VERY sorry.

Hi Michael,

On Thu, 8 Feb 2001 15:27:54 +0100 , you wrote:
> properly, I look into that. Also there is still a bug with two byte
> encodings
> or iso-2022-jp not cut correctely in headers when the header is too long
> for one line, but in the rest of KMail it should work properly now.

If you have time, could you please test this patch?
Please also tell me if you find any other encoding-related problems.

This is a test patch for mail headers in two-byte encodings,
and for joining a mail header that spans more than one line into one line.

Once KDE 2.1 has been released, I think I will apply this patch
for Japanese users.

There are two files.
 (1)  Patch name is encode.diff.
 (2)  Source file name is test.cpp

   $cd Sources_Directory/kdenetwork
   $patch -p0 < encode.diff

   $cd Sources_Directory/kdenetwork/kmail

   Please copy the file test.cpp into the Sources_Directory/kdenetwork/kmail directory.
   $cp test.cpp Sources_Directory/kdenetwork/kmail
   $make

test.cpp can also be compiled as a standalone executable:
   $g++ -I/usr/X11R6/include -I$QTDIR/include \
    -I$KDEDIR/include -L/usr/X11R6/lib \
    -L$QTDIR/lib -L$KDEDIR/lib \
    -lqt -lkdecore -lmimelib -lX11 -lXext -lICE -lSM -o tst test.cpp


Best Regards,
 Toyohiro <toyohiro@ksmplus.com>

-------------------------------------------------------

["encode.diff" (text/plain)]

diff -ur kmail.orig/kmmessage.cpp kmail/kmmessage.cpp

--- kmail.orig/kmmessage.cpp	Tue Feb  6 10:01:40 2001
+++ kmail/kmmessage.cpp	Thu Feb 22 18:09:23 2001
@@ -1424,6 +1424,9 @@
   else
     result = decodeRFC2047String(header.FieldBody((const char*)aName).
                     AsString().c_str());
+
+//  Mail header of more than one line to do one line.
+  result=result.replace(QRegExp("\n[ \t]"),"");
   return result;
 }
 
diff -ur kmail.orig/kmmsgbase.cpp kmail/kmmsgbase.cpp
--- kmail.orig/kmmsgbase.cpp	Tue Feb 13 15:37:41 2001
+++ kmail/kmmsgbase.cpp	Fri Feb 23 11:51:52 2001
@@ -404,6 +404,8 @@
 
 
 //-----------------------------------------------------------------------------
+#define TEST_MAIL_HEADER
+#include "test.cpp"
 const char especials[18] = "()<>@,;:\"/[]?.= \033";
 
 const QString KMMsgBase::encodeRFC2047String(const QString& _str,
@@ -413,6 +415,8 @@
   QString cset;
   if (charset.isEmpty()) cset = KGlobal::locale()->charset();
     else cset = charset;
+  if( isTwoByteChar(_str,cset ) )                   // test toyohiro
+    return(TestencodeRFC2047String(_str,cset)); // test toyohiro
   QTextCodec *codec = codecForName(cset);
   QCString latin;
   if (charset == "us-ascii") latin = toUsAscii(_str);
@@ -430,7 +434,8 @@
     while (cr < latinLen)
     {
       if (latin[cr] == 32) start = cr + 1;
-      if (latin[cr] < 32) break;
+//      if (latin[cr] < 32) break;
+      if(latin[cr] == 27 || latin[cr] < 0) break;
       cr++;
     }
     if (cr < latinLen)
@@ -503,7 +508,8 @@
   bool quote;
   while (*l)
   {
-    if (*l < 32) break;
+//    if (*l < 32) break;
+    if( *l < 0 || *l == 27 ) break; // 27 is ESC character
     l++;
   }
   if (!*l) return latin;

["test.cpp" (text/plain)]

// Strings convert to base64 , quoted printable.
//     Toyohiro ASUKAI <toyohiro@ksmplus.com>
//
// Countries whose charsets use two bytes per character (Republic of China
// (Taiwan), Japan, Korea, ...) use base64-encoded strings in mail headers,
// because a base64-encoded string is shorter than the quoted-printable
// encoding of the same text.
//
// For example.
// The same string converted to base64 and to quoted-printable:
// =?ISO-2022-JP?B?GyRCJDMkcyRLJEEkTxsoQg==?=
// =?ISO-2022-JP?Q?=1B=24=42=24=33=24=73=24=4B=24=41=24=4F=1B=28=42?=
//
// Compile: this file can be built as a standalone executable.
// g++ -I/usr/X11R6/include -I$QTDIR/include 
// -I$KDEDIR/include -L/usr/X11R6/lib 
// -L$QTDIR/lib -L$KDEDIR/lib 
// -lqt -lkdecore -lmimelib -lX11 -lXext -lICE -lSM -o x x.cpp
//
// OS : Debian GNU/Linux 2.2
// QT : Version 2.2.4
// GCC: Version 2.95.2

#ifndef TEST_MAIL_HEADER
#include <stdio.h>
#include <qstring.h>
#include <qtextcodec.h>
#include <qregexp.h>
#include <kglobal.h>
#include <klocale.h>
#include <mimelib/string.h>
#include <mimelib/utility.h>
#endif

#ifndef TEST_MAIL_HEADER
 const QString QEncodeStr( const QString &aStr ) // encode RFC2047 string
 {
   // Quoted-printable encode aStr with mimelib's DwEncodeQuotedPrintable().
   // A null QString is treated like the empty string.
   QString source(aStr);
   if (source.isNull())
     source = "";

   DwString rawInput(source.data(), source.length());
   DwString encoded;
   DwEncodeQuotedPrintable(rawInput, encoded);

   QString qpText;
   qpText = encoded.c_str();
   return qpText;
 }
#else
#define QEncodeStr KMMsgBase::encodeQuotedPrintable
#endif

 const QString BEncodeStr( const QString &aStr ) // encode Base64 string
 {
 // Base64-encodes aStr and returns the encoded text without a trailing '\n'.
 //
 // In the standalone build (TEST_MAIL_HEADER undefined) the encoding is done
 // directly with mimelib's DwEncodeBase64(); a null input is treated as "".
 // When included into KMail it delegates to KMMsgBase::encodeBase64().
#ifndef TEST_MAIL_HEADER
   QString bStr = aStr;
   if (aStr.isNull())
    bStr = "";

   DwString dwsrc(bStr.data(), bStr.length());
   DwString dwdest;
   QString result;

   DwEncodeBase64(dwsrc, dwdest);
   result = dwdest.c_str();
#else
   QString result = KMMsgBase::encodeBase64(aStr);
#endif
 //  printf("enc1[%s][%d]\n",result.data(),result.length());
   // Strip the line terminator only when it is really there. Chopping the
   // last character unconditionally would corrupt the encoded word if the
   // encoder did not append '\n', and length()-1 wraps around (unsigned)
   // when the result is empty.
   if( result.right(1) == "\n" )
     result.truncate(result.length()-1);
   return result;
 }

// Does single character consist of 1 byte or 2 bytes ?
// Reports whether encoding aStr in the given charset needs a different number
// of bytes than aStr has characters.
//
// Returns TRUE when the byte count (up to the first NUL) differs from
// aStr.length(), FALSE when they are equal or when no codec is known for
// charset.
int isTwoByteChar( const QString &aStr, const QString& charset)
{
  int multiByte = FALSE; // unknown charset is treated as one byte per character

  QTextCodec* codec = QTextCodec::codecForName(charset.lower());
  if( codec )
  {
    // One byte per character means the encoded length equals the char count.
    if( strlen((const char *)codec->fromUnicode(aStr)) != aStr.length() )
      multiByte = TRUE;
  }
  return multiByte;
}

// Test implementation of RFC 2047-style header encoding with line folding.
//
// Runs of non-US-ASCII characters in _str are emitted as encoded words of the
// form "=?CHARSET?B?...?=" (base64, used when isTwoByteChar() returns TRUE)
// or "=?CHARSET?Q?...?=" (quoted-printable otherwise). US-ASCII characters are
// appended as they are. Output is collected per line in a buffer, and lines
// are joined with "\n ".
//
// _str    : header text to encode; returned unchanged when empty.
// charset : charset name; when empty, KGlobal::locale()->charset() is used,
//           and when no codec is found for the name, "iso-8859-1" is used.
// subjMsg : when FALSE, whitespace is simplified and ',' and "<...>" are
//           treated as places to start a new line; when TRUE, US-ASCII
//           characters are copied without that handling.
//
// Returns the encoded text. As a side effect the input and the result are
// printed to stdout with printf().
const QString TestencodeRFC2047String ( const QString& _str ,
                              const QString& charset ,bool subjMsg=FALSE)
{
  QString cset;

  if (_str.isEmpty()) return _str;
  // Fall back to the locale charset when the caller did not name one.
  if (charset.isEmpty())
    cset = KGlobal::locale()->charset();
  else
    cset = charset;
  QTextCodec *codec = QTextCodec::codecForName(cset.lower());
  if( !codec )
  {
     // Unknown charset name: encode as iso-8859-1 instead.
     cset = "iso-8859-1";
     codec = QTextCodec::codecForName(cset.lower());
  }

  QString result="";   // completed lines
  QString tmp="";      // line currently being built
  QString ENCSTR;      // "?B?" or "?Q?"
  uint i,newline=FALSE; // newline: request to flush tmp after this character
  int j,chsetoffset,TwoByte;

  QString aStr = _str;

  aStr = aStr.replace(QRegExp("\n"),""); // strip '\n'. To one line.

  if( !subjMsg ) // Subject Header or Other Header
    aStr = aStr.simplifyWhiteSpace(); // strip white space.

  uint limit = 70; // limit length of encoding strings.

// =? + charset + ?[BQ]? + EncodeStrings + ?=
// 2  + charset.length + 3 + EncodeStrings + 2
  int offset = 2+cset.length()+3+0+2;

  // Note: the check is made on the original _str, not on the cleaned aStr.
  if( isTwoByteChar(_str,cset) )
  {
// for single character by 2 bytes or 1 byte.
// Minimum encoding string length for single character by 2 bytes of 8Bit.
// Base64
//    for single character by 2 bytes. 0xa4 0xa2 -> pKI= (4bytes)
//    for single character by 1 byte. 0xe8 -> 5A== (4 bytes)
//
      chsetoffset = 4;

// ISO-2022-JP charsets is an exception.
// Minimum encoding string length for ISO-2022-JP ( ESC $ B x x  ESC ( B ).
// Base64
//    ESC $ B $ " ESC ( B -> GyRCJCIbKEI= (12 bytes)

      if( cset.lower() == "iso-2022-jp" )
        chsetoffset = 15; // with a few byte reserve.

      ENCSTR="?B?";
      TwoByte=1;
  }
  else
  {
// Minimum encoding string length for single character by 2 bytes of 8Bit.
// Quoted-Printable
//    for single character by 2 bytes. 0xe8 0xe8 -> =E8=E8 ( 6 bytes )
//    for single character by 1 byte. 0xe8 -> =E8 (3 bytes)

      chsetoffset = 6;

// ISO-2022-JP charsets is an exception.
// Minimum encoding string length for ISO-2022-JP ( ESC $ B x x  ESC ( B ).
// Quoted-Printable
//    ESC $ B ? ? ESC ( B -> =1B=24=42=??=??=1B=28=42 (24 bytes)

      if( cset.lower() == "iso-2022-jp" )
        chsetoffset = 24;
 
      ENCSTR="?Q?";
      TwoByte=0;
  }

  // Debug output of the cleaned input and its length.
  printf("[%s][%d]\n",aStr.local8Bit().data(),aStr.length());

  QString cc,cc2,encstr;
  // Walk the input one character at a time; i is also moved by the branches
  // below when they consume more than one character.
  for( i = 0 ; i < aStr.length() ; ++i )
  {
     cc = aStr.mid(i,1);
     if( cc.at(0).unicode() >= 128 ) // Is it us-ascii character ?
     { // None US-ASCII
       if( limit < tmp.length()+offset+chsetoffset )
       {
         // No room left on this line for even the smallest encoded word:
         // close the line and step back so the loop's ++i revisits this
         // character on the fresh line.
         --i;
         result += tmp + "\n ";
         tmp="";
       }
       else
       {
         // Extend the run [i, j] character by character, re-encoding the run
         // each time, until it exceeds the limit or a US-ASCII character /
         // the end of the string is reached.
         for( j=i+1 ;;++j)
         {
           cc2 = aStr.mid(j,1);
           if( cc2.at(0).unicode() >=128 )
           { // None US-ASCII
              // Encode characters i..j inclusive.
              if( TwoByte )
               encstr = BEncodeStr(codec->fromUnicode(aStr.mid(i,j-i)+cc2));
              else
               encstr = QEncodeStr(codec->fromUnicode(aStr.mid(i,j-i)+cc2));
              if( limit < (tmp.length()+encstr.length()+offset) )
              {
                // The line is over the limit: emit the encoded word and
                // continue after character j.
                tmp += "=?"+cset.upper()+ENCSTR+encstr+"?=";
                 i = j;
                 break;
              }
           }
           else
           { // US-ASCII or Null (End of QString)
              // Encode characters i..j-1; character j is handled by the
              // outer loop.
              if( TwoByte )
               encstr = BEncodeStr(codec->fromUnicode(aStr.mid(i,j-i))); 
              else
               encstr = QEncodeStr(codec->fromUnicode(aStr.mid(i,j-i))); 
              tmp += "=?"+cset.upper()+ENCSTR+encstr+"?=";
              i = j-1;
              break;
           }
         }
       }
     }
     else
     { // US-ASCII
       if( !subjMsg )
       {
         if( cc == "," )
         {
           // A comma after some content ends the current line.
           if( tmp.length() )
              newline = TRUE;
           tmp += cc ;
         }
         else if( cc == "<" ) // Mail address check. but
         {                    // I have a doubt whether this is right.
            j = aStr.find(">",i); // search ending character of mail address.
// printf("aaa[%d][%d][%s]\n",i,j,aStr.local8Bit().data());
           if( j > 0 )
            {
                // ncc holds the (up to) two characters following '>'.
                QString ncc=aStr.mid(j+1,2);
// printf("bbb[%d][%s]\n",i,ncc.local8Bit().data());
                if( ncc == " ," )
                {
// printf("ccc[%s]\n",QString(aStr.mid(i,j-i+1)+ncc).data());
                   // Copy "<...>" plus the following " ," and end the line.
                   tmp += aStr.mid(i,j-i+1)+ncc;
                   i = j+2;
                   newline = TRUE;
                }
                else if( ncc.mid(0,1) == "," || ncc.mid(0,1) == " " )
                {
                   // Copy "<...>" plus the single ',' or ' ' and end the line.
                   tmp += aStr.mid(i,j-i+1)+ncc.mid(0,1);
                   i = j+1;
                   newline = TRUE;
                }
                else
                {
                   // Copy "<...>" only; no line break is requested.
                   tmp += aStr.mid(i,j-i+1);
                   i = j;
                }

            }
            else
              tmp += cc;
         }
         else
         {
           tmp +=cc;
         }
       }
       else
       {
          tmp += cc;
       }
//  printf("U[%s]\n",cc.data());
     }
     // Flush the pending line when it is over the limit or a break was
     // requested; "\n " is added unless i is at (or past) the last character.
     if( (limit < tmp.length()) || newline )
     {
//  printf("flush[%s]\n",tmp.data());
         result += tmp;
         if( i < aStr.length()-1 )
         {
            result += "\n ";
         }
         tmp=""; newline = FALSE;
     }
  }
  // Append whatever is still pending in tmp.
  if( ! result.length() )
    result = tmp;
  else
    result += tmp;

  // Debug output of the final encoded text.
  printf("Result[%s]\n\n",result.data());
  return result;
}
#ifndef TEST_MAIL_HEADER
#define EncodeString TestencodeRFC2047String
// Standalone test driver (only built when TEST_MAIL_HEADER is not defined).
// Runs EncodeString() over a set of sample headers; the encoder itself prints
// its input and result, so the return values are not inspected here.
//
// NOTE(review): the non-ASCII literals in the JAPANESE section are meant to be
// euc-jp bytes (see the original "include euc-jp strings" remarks) and may be
// mis-decoded in this copy of the file — restore them from the original
// attachment before enabling JAPANESE.
int main()
{
  QString charset;
  QString aStr;
  QString Res;
  QTextCodec *codec;

  charset="iso-8859-1";
  codec = QTextCodec::codecForName(charset.lower());
  aStr = codec->toUnicode("\344");
  printf("%s", isTwoByteChar(aStr,charset) ? "2 bytes char" : "1 byte char");
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode("Michael H\344ckel <Michael@Haeckel.Net>");
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode("Sa\342d KADHI <Saad.KADHI@abc.com>");
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode("Fay\347al SAILE<faycal@abc.com>");
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode("J\351r\364me MARTIN<jmartin@abc.com>");
  printf("%s", isTwoByteChar(aStr,charset) ? "2 bytes char" : "1 byte char");
  Res = EncodeString( aStr , charset );

  // Several addresses in one header, with irregular spacing around commas.
  QString bStr = "Michael H\344ckel <Michael@Haeckel.Net>, ";
   bStr += "J\351r\364me MARTIN<jmartin@abc.com> , ";
   bStr += "     Sa\342d KADHI <Saad.KADHI@abc.com>     ,    ";
   bStr += "Fay\347al SAILE<faycal@abc.com>,"; 
   bStr += "J\351r\364me MARTIN<jmartin@abc.com>";
  aStr = codec->toUnicode(bStr);
  printf("%s", isTwoByteChar(aStr,charset) ? "2 bytes char" : "1 byte char");
  Res = EncodeString( aStr , charset );

// #define JAPANESE
#ifdef JAPANESE
  charset="euc-jp";
  codec = QTextCodec::codecForName(charset.lower());
  aStr = codec->toUnicode(
  "丐" // include euc-jp strings
  );
  printf("%s", isTwoByteChar(aStr,charset) ? "2 bytes char" : "1 byte char");
  Res = EncodeString( aStr , charset );

  charset="iso-2022-jp";
  codec = QTextCodec::codecForName(charset.lower());
  aStr = codec->toUnicode(
  "123456789 \n\033$B$\"$$$&$($*\033(B 111111" // include iso-2022-jp strings
  );
  Res = EncodeString( aStr , charset );

  // From here on the literals are decoded as euc-jp while charset (the
  // target encoding passed to EncodeString) stays "iso-2022-jp".
  codec = QTextCodec::codecForName("euc-jp");
  aStr = codec->toUnicode(
  "123456789 \n丐中丹尹云幻仆幻仆 111111" // include euc-jp strings
  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆 <asukai@deskpro.ksmplus.com>"
  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆    <asukai@deskpro.ksmplus.com>   , 仇氏卞切反<asukai@zz.ksmplus.com>" // include euc-jp strings
  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "  幻仆幻仆   <asukai@deskpro.ksmplus.com>    ,    仇氏卞切反  \
<asukai@zz.ksmplus.com>" // include euc-jp strings
  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆 <asukai@deskpro.ksmplus.com> , 仇氏卞切反<asukai@zz.ksmplus.com>"
  ); // include euc-jp strings
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆 <asukai@deskpro.ksmplus.com>, 仇氏卞切反<asukai@zz.ksmplus.com>"
  ); // include euc-jp strings
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆 <asukai@deskpro.ksmplus.com>,仇氏卞切反<asukai@zz.ksmplus.com>"
  ); // include euc-jp strings
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "幻仆幻仆 <asukai@deskpro.ksmplus.com> 仇氏卞切反<asukai@zz.ksmplus.com>"
  ); // include euc-jp strings
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "1234567890123456789 <asukai@ksmplus.com>"
  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
  "1234567890123456789 <asukai@ksmplus.com>, askai@aaa.net, bbbb@net,"
  );
  Res = EncodeString( aStr , charset );

  // Same input again, this time encoded as a subject (subjMsg = TRUE).
  aStr = codec->toUnicode(
  "1234567890123456789 <asukai@ksmplus.com>, askai@aaa.net, bbbb@net,"
  );
  Res = EncodeString( aStr , charset , TRUE);

  aStr = codec->toUnicode(
  "aaaa@abc.com   ,  bbbb@@abc.com ,  cccc@abc.com,dddd@abc.com ,eeeee@abc.com \
ffffff@abc.com,   gggggg@abc.com ,  ,  , hhhhhh@abc.com"  );
  Res = EncodeString( aStr , charset );

  aStr = codec->toUnicode(
"#CPMS# 左件仿奶件奶矛件玄＞凶分及立伊瓦失丞’申件打＝巨件玄伉□濮遛熬尥醱〞"
  ); // include euc-jp strings
  Res = EncodeString( aStr , charset , TRUE);
#endif
#if 0
  // Interactive mode: encode each line read from stdin. fgets() is used
  // instead of gets() so that ibuf cannot be overrun; the '\n' that fgets()
  // keeps is removed by the encoder.
  char ibuf[1024];
  while(fgets(ibuf, sizeof(ibuf), stdin))
  {
     aStr = QString::fromLocal8Bit(ibuf);
     Res = EncodeString( aStr , charset );
     Res = EncodeString( aStr , charset , TRUE);
  }
#endif
  return 0;
}
#endif


_______________________________________________
Kmail Developers mailing list
Kmail@master.kde.org
http://master.kde.org/mailman/listinfo/kmail


[prev in list] [next in list] [prev in thread] [next in thread]