From kde-commits Thu Feb 28 22:58:20 2013 From: Albert Astals Cid Date: Thu, 28 Feb 2013 22:58:20 +0000 To: kde-commits Subject: [okular] generators/txt: Drop detectEncoding method and just feed more data to KEncodingProber Message-Id: <20130228225820.789DFA604F () git ! kde ! org> X-MARC-Message: https://marc.info/?l=kde-commits&m=136209230829192 Git commit b5abd493ab80f642388b1ae055e7f06cfa1c59d5 by Albert Astals Cid, on behalf of Azat Khuzhin. Committed on 28/02/2013 at 23:57. Pushed by aacid into branch 'master'. Drop detectEncoding method and just feed more data to KEncodingProber REVIEW: 109119 M +18 -15 generators/txt/document.cpp M +0 -2 generators/txt/document.h http://commits.kde.org/okular/b5abd493ab80f642388b1ae055e7f06cfa1c59d5 diff --git a/generators/txt/document.cpp b/generators/txt/document.cpp index badb647..29aa2db 100644 --- a/generators/txt/document.cpp +++ b/generators/txt/document.cpp @@ -40,29 +40,32 @@ Document::~Document() { } -QByteArray Document::detectEncoding( const QByteArray &array ) +QString Document::toUnicode( const QByteArray &array ) { - // TODO: see to "katetextloader.h" + QByteArray encoding; KEncodingProber prober(KEncodingProber::Universal); - prober.feed(array); - if (!prober.confidence() > 0.5) + int charsFeeded = 0; + int chunkSize = 3000; // ~= number of symbols in page. + + // Try to detect encoding. 
+ while ( encoding.isEmpty() && charsFeeded < array.size() ) { - kDebug() << "Can't detect charset"; - return QByteArray(); - } + prober.feed( array.mid( charsFeeded, chunkSize ) ); + charsFeeded += chunkSize; -#ifdef TXT_DEBUG - kDebug() << "Detected" << prober.encoding() << "encoding"; -#endif - return prober.encoding(); -} + if (prober.confidence() >= 0.5) + { + encoding = prober.encoding(); + break; + } + } -QString Document::toUnicode( const QByteArray &array ) -{ - const QByteArray encoding = detectEncoding( array ); if ( encoding.isEmpty() ) { return QString(); } + + kDebug() << "Detected" << prober.encoding() << "encoding" + << "based on" << charsFeeded << "chars"; return QTextCodec::codecForName( encoding )->toUnicode( array ); } diff --git a/generators/txt/document.h b/generators/txt/document.h index 08babea..bbf2923 100644 --- a/generators/txt/document.h +++ b/generators/txt/document.h @@ -19,8 +19,6 @@ namespace Txt ~Document(); private: - // TODO: write a better detecter, based on some number of chunks - QByteArray detectEncoding( const QByteArray &array ); QString toUnicode( const QByteArray &array ); }; }