[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    [okular] generators/txt: Drop detectEncoding method and just feed more data to KEncodingProber
From:       Albert Astals Cid <aacid () kde ! org>
Date:       2013-02-28 22:58:20
Message-ID: 20130228225820.789DFA604F () git ! kde ! org
[Download RAW message or body]

Git commit b5abd493ab80f642388b1ae055e7f06cfa1c59d5 by Albert Astals Cid, on behalf of Azat Khuzhin.
Committed on 28/02/2013 at 23:57.
Pushed by aacid into branch 'master'.

Drop detectEncoding method and just feed more data to KEncodingProber

REVIEW: 109119

M  +18   -15   generators/txt/document.cpp
M  +0    -2    generators/txt/document.h

http://commits.kde.org/okular/b5abd493ab80f642388b1ae055e7f06cfa1c59d5

diff --git a/generators/txt/document.cpp b/generators/txt/document.cpp
index badb647..29aa2db 100644
--- a/generators/txt/document.cpp
+++ b/generators/txt/document.cpp
@@ -40,29 +40,32 @@ Document::~Document()
 {
 }
 
-QByteArray Document::detectEncoding( const QByteArray &array )
+QString Document::toUnicode( const QByteArray &array )
 {
-    // TODO: see to "katetextloader.h"
+    QByteArray encoding;
     KEncodingProber prober(KEncodingProber::Universal);
-    prober.feed(array);
-    if (!prober.confidence() > 0.5)
+    int charsFeeded = 0;
+    int chunkSize = 3000; // ~= number of symbols in page.
+
+    // Try to detect encoding.
+    while ( encoding.isEmpty() && charsFeeded < array.size() )
     {
-        kDebug() << "Can't detect charset";
-        return QByteArray();
-    }
+        prober.feed( array.mid( charsFeeded, chunkSize ) );
+        charsFeeded += chunkSize;
 
-#ifdef TXT_DEBUG
-    kDebug() << "Detected" << prober.encoding() << "encoding";
-#endif
-    return prober.encoding();
-}
+        if (prober.confidence() >= 0.5)
+        {
+            encoding = prober.encoding();
+            break;
+        }
+    }
 
-QString Document::toUnicode( const QByteArray &array )
-{
-    const QByteArray encoding = detectEncoding( array );
     if ( encoding.isEmpty() )
     {
         return QString();
     }
+
+    kDebug() << "Detected" << prober.encoding() << "encoding"
+             << "based on" << charsFeeded << "chars";
     return QTextCodec::codecForName( encoding )->toUnicode( array );
 }
diff --git a/generators/txt/document.h b/generators/txt/document.h
index 08babea..bbf2923 100644
--- a/generators/txt/document.h
+++ b/generators/txt/document.h
@@ -19,8 +19,6 @@ namespace Txt
             ~Document();
 
         private:
-            // TODO: write a better detecter, based on some number of chunks
-            QByteArray detectEncoding( const QByteArray &array );
             QString toUnicode( const QByteArray &array );
     };
 }
[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic