[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    [kbibtex] src: Extract text from PDF files for full-text search
From:       Thomas Fischer <null () kde ! org>
Date:       2017-03-31 21:56:22
Message-ID: E1cu4Ws-0004Se-Iz () code ! kde ! org
[Download RAW message or body]

Git commit b16ed40df5c972042db6e885504af1a9f87540c2 by Thomas Fischer.
Committed on 31/03/2017 at 21:41.
Pushed by thomasfischer into branch 'master'.

Extract text from PDF files for full-text search

Re-enables old Poppler/Qt4 code to make use of Poppler/Qt5.
As extracting text from PDF files blocked the application,
text extraction has been refactored to make use of
QtConcurrent, i.e. text extraction will run in one or more
parallel threads independent of the main/GUI thread.
Search/filter results will not update once text extraction
is complete, but any new search will make use of the extracted text.

M  +0    -2    src/gui/file/sortfilterfilemodel.cpp
M  +45   -30   src/io/fileinfo.cpp
M  +3    -0    src/io/fileinfo.h

https://commits.kde.org/kbibtex/b16ed40df5c972042db6e885504af1a9f87540c2

diff --git a/src/gui/file/sortfilterfilemodel.cpp \
b/src/gui/file/sortfilterfilemodel.cpp index f738357d..d52aa8f2 100644
--- a/src/gui/file/sortfilterfilemodel.cpp
+++ b/src/gui/file/sortfilterfilemodel.cpp
@@ -199,8 +199,6 @@ bool SortFilterFileModel::filterAcceptsRow(int source_row, const \
QModelIndex &so  const auto entryUrlList = FileInfo::entryUrls(entry, \
fileSourceModel()->bibliographyFile()->property(File::Url, QUrl()).toUrl(), \
FileInfo::TestExistenceYes);  for (const QUrl &url : entryUrlList) {
                 if (url.isLocalFile() && \
                url.fileName().endsWith(QStringLiteral(".pdf"))) {
-                    // FIXME if you have a large collection of PDF files and the \
                text version
-                    // has not been generated yet, this will freeze KBibTeX for some \
                time
                     const QString text = \
FileInfo::pdfToText(url.url(QUrl::PreferLocalFile));  int i = 0;
                     for (QStringList::ConstIterator itsl = \
m_filterQuery.terms.constBegin(); itsl != m_filterQuery.terms.constEnd(); ++itsl, \
                ++i)
diff --git a/src/io/fileinfo.cpp b/src/io/fileinfo.cpp
index 41ded3ab..c8e567fb 100644
--- a/src/io/fileinfo.cpp
+++ b/src/io/fileinfo.cpp
@@ -17,12 +17,14 @@
 
 #include "fileinfo.h"
 
-// FIXME #include <poppler-qt4.h>
+#include <poppler-qt5.h>
 
 #include <QFileInfo>
 #include <QDir>
 #include <QTextStream>
 #include <QStandardPaths>
+#include <QDir>
+#include <QtConcurrent/QtConcurrent>
 
 #include <KSharedConfig>
 #include <KConfigGroup>
@@ -273,45 +275,58 @@ QString FileInfo::pdfToText(const QString &pdfFilename)
     static const QRegExp invalidChars("[^-a-z0-9_]", Qt::CaseInsensitive);
     QString textFilename = \
QString(pdfFilename).remove(invalidChars).append(QStringLiteral(".txt")).prepend(QStandardPaths::writableLocation(QStandardPaths::CacheLocation) \
+ QLatin1Char('/') + "pdftotext/");  
-    /// Initialize return value
-    QString text;
-
     /// First, check if there is a cache text file
     if (QFileInfo::exists(textFilename)) {
         /// Load text from cache file
         QFile f(textFilename);
         if (f.open(QFile::ReadOnly)) {
             QTextStream ts(&f);
-            text = ts.readAll();
+            const QString text = ts.readAll();
             f.close();
+            return text;
         }
-    }
+    } else
+        /// No cache file exists, so run text extraction in another thread
+        QtConcurrent::run(extractPDFTextToCache, pdfFilename, textFilename);
 
-    /// Either no cache text file existed or could not load text from it
-    if (text.isEmpty()) {
-        /// Load PDF file through Poppler
-        // FIXME
-        /*
-        Poppler::Document *doc = Poppler::Document::load(pdfFilename);
-        if (doc != NULL) {
-            /// Build text by appending each page's text
-            text = QStringLiteral("");
-            for (int i = 0; i < doc->numPages(); ++i)
-                text.append(doc->page(i)->text(QRect())).append(QStringLiteral("\n\n"));
                
-            delete doc;
-
-            /// Save text in cache file
-            QFile f(textFilename);
-            if (f.open(QFile::WriteOnly)) {
-                QTextStream ts(&f);
-                ts << text;
-                f.close();
-            }
-        }
-        */
-    }
+    return QString();
+}
+
+void FileInfo::extractPDFTextToCache(const QString &pdfFilename, const QString \
&cacheFilename) { +    /// In case of multiple calls, skip text extraction if cache \
file already exists +    if (QFile(cacheFilename).exists()) return;
 
-    return text;
+    QString text;
+    QStringList msgList;
+
+    /// Load PDF file through Poppler
+    Poppler::Document *doc = Poppler::Document::load(pdfFilename);
+    if (doc != nullptr) {
+        static const int maxPages = 64;
+        /// Build text by appending each page's text
+        for (int i = 0; i < qMin(maxPages, doc->numPages()); ++i)
+            text.append(doc->page(i)->text(QRect())).append(QStringLiteral("\n\n"));
+        if (doc->numPages() > maxPages)
+            msgList << QString(QStringLiteral("### Skipped %1 pages as PDF file \
contained too many pages (limit is %2 pages) ###")).arg(doc->numPages() - \
maxPages).arg(maxPages); +        delete doc;
+    } else
+        msgList << QStringLiteral("### Skipped as file could not be opened as PDF \
file ###"); +
+    /// Save text in cache file
+    QFile f(cacheFilename);
+    if (f.open(QFile::WriteOnly)) {
+        static const int maxCharacters = 1 << 18;
+        QTextStream ts(&f);
+        ts << text.left(maxCharacters); ///< keep only the first 2^18 many \
characters +
+        if (text.length() > maxCharacters)
+            msgList << QString(QStringLiteral("### Text too long, skipping %1 \
characters ###")).arg(text.length() - maxCharacters); +        /// Write all messages \
(warnings) to end of text file +        for (const QString &msg : const_cast<const \
QStringList &>(msgList)) +            ts << endl << msg;
+
+        f.close();
+    }
 }
 
 QString FileInfo::doiUrlPrefix()
diff --git a/src/io/fileinfo.h b/src/io/fileinfo.h
index 7128f2b7..3a88eaae 100644
--- a/src/io/fileinfo.h
+++ b/src/io/fileinfo.h
@@ -98,6 +98,9 @@ public:
 
 protected:
     FileInfo();
+
+private:
+    static void extractPDFTextToCache(const QString &pdfFilename, const QString \
&cacheFilename);  };
 
 #endif // KBIBTEX_IO_FILEINFO_H


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic