'[kbibtex/kbibtex/0.7] src: Parallelize text extraction from PDF files'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    [kbibtex/kbibtex/0.7] src: Parallelize text extraction from PDF files
From:       Thomas Fischer <null () kde ! org>
Date:       2017-03-31 21:56:05
Message-ID: E1cu4Wb-0004F6-DQ () code ! kde ! org
[Download RAW message or body]

Git commit 2f348ba4f482d6d9ac8b7b27535cfcf96818469c by Thomas Fischer.
Committed on 31/03/2017 at 21:37.
Pushed by thomasfischer into branch 'kbibtex/0.7'.

Parallelize text extraction from PDF files

As extracting text from PDF files blocked the application,
text extraction has been refactored to make use of
QtConcurrent, i.e. text extraction will run in one or multiple
parallel threads independent of the main/GUI thread.
Search/filter results will not update once text extraction
is complete, but any new search will make use of it.

M  +0    -2    src/gui/file/filemodel.cpp
M  +44   -27   src/io/fileinfo.cpp
M  +4    -1    src/io/fileinfo.h

https://commits.kde.org/kbibtex/2f348ba4f482d6d9ac8b7b27535cfcf96818469c

diff --git a/src/gui/file/filemodel.cpp b/src/gui/file/filemodel.cpp
index 57270f09..97db8bac 100644
--- a/src/gui/file/filemodel.cpp
+++ b/src/gui/file/filemodel.cpp
@@ -208,8 +208,6 @@ bool SortFilterFileModel::filterAcceptsRow(int source_row, const \
                QModelIndex &so
         if (m_filterQuery.searchPDFfiles && m_filterQuery.field.isEmpty()) ///< not \
filtering for any specific field  foreach(const KUrl &url, \
FileInfo::entryUrls(entry.data(), \
fileSourceModel()->bibliographyFile()->property(File::Url, QUrl()).toUrl(), \
                FileInfo::TestExistenceYes)) {
             if (url.isLocalFile() && url.fileName().endsWith(QLatin1String(".pdf"))) \
                {
-                // FIXME if you have a large collection of PDF files and the text \
                version
-                // has not been generated yet, this will freeze KBibTeX for some \
                time
                 const QString text = FileInfo::pdfToText(url.pathOrUrl());
                 int i = 0;
                 for (QStringList::ConstIterator itsl = \
m_filterQuery.terms.constBegin(); itsl != m_filterQuery.terms.constEnd(); ++itsl, \
                ++i)
diff --git a/src/io/fileinfo.cpp b/src/io/fileinfo.cpp
index 1f63d240..0b11c11b 100644
--- a/src/io/fileinfo.cpp
+++ b/src/io/fileinfo.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
- *   Copyright (C) 2004-2014 by Thomas Fischer <fischer@unix-ag.uni-kl.de> *
+ *   Copyright (C) 2004-2017 by Thomas Fischer <fischer@unix-ag.uni-kl.de> *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -22,6 +22,7 @@
 #include <QFileInfo>
 #include <QDir>
 #include <QTextStream>
+#include <QtConcurrentRun>
 
 #include <KSharedConfig>
 #include <KConfigGroup>
@@ -267,42 +268,58 @@ QString FileInfo::pdfToText(const QString &pdfFilename)
     static const QRegExp invalidChars("[^-a-z0-9_]", Qt::CaseInsensitive);
     QString textFilename = \
QString(pdfFilename).remove(invalidChars).append(QLatin1String(".txt")).prepend(KStandardDirs::locateLocal("cache", \
"pdftotext/"));  
-    /// Initialize return value
-    QString text;
-
     /// First, check if there is a cache text file
     if (QFileInfo(textFilename).exists()) {
         /// Load text from cache file
         QFile f(textFilename);
         if (f.open(QFile::ReadOnly)) {
             QTextStream ts(&f);
-            text = ts.readAll();
+            const QString text = ts.readAll();
             f.close();
+            return text;
         }
-    }
+    } else
+        /// No cache text exists, so run text extraction in another thread
+        QtConcurrent::run(extractPDFTextToCache, pdfFilename, textFilename);
 
-    /// Either no cache text file existed or could not load text from it
-    if (text.isEmpty()) {
-        /// Load PDF file through Poppler
-        Poppler::Document *doc = Poppler::Document::load(pdfFilename);
-        if (doc != NULL) {
-            /// Build text by appending each page's text
-            text = QLatin1String("");
-            for (int i = 0; i < doc->numPages(); ++i)
-                text.append(doc->page(i)->text(QRect())).append(QLatin1String("\n\n"));
                
-            delete doc;
-
-            /// Save text in cache file
-            QFile f(textFilename);
-            if (f.open(QFile::WriteOnly)) {
-                QTextStream ts(&f);
-                ts << text;
-                f.close();
-            }
-        }
-    }
+    return QString();
+}
+
+void FileInfo::extractPDFTextToCache(const QString &pdfFilename, const QString \
&cacheFilename) { +    /// In case of multiple calls, skip text extraction if cache \
file already exists +    if (QFile(cacheFilename).exists()) return;
 
-    return text;
+    QString text;
+    QStringList msgList;
+
+    /// Load PDF file through Poppler
+    Poppler::Document *doc = Poppler::Document::load(pdfFilename);
+    if (doc != NULL) {
+        static const int maxPages = 64;
+        /// Build text by appending each page's text
+        for (int i = 0; i < qMin(maxPages, doc->numPages()); ++i)
+            text.append(doc->page(i)->text(QRect())).append(QLatin1String("\n\n"));
+        if (doc->numPages() > maxPages)
+            msgList << QString(QLatin1String("### Skipped %1 pages as PDF file \
contained too many pages (limit is %2 pages) ###")).arg(doc->numPages() - \
maxPages).arg(maxPages); +        delete doc;
+    } else
+        msgList << QLatin1String("### Skipped as file could not be opened as PDF \
file ###"); +
+    /// Save text in cache file
+    QFile f(cacheFilename);
+    if (f.open(QFile::WriteOnly)) {
+        static const int maxCharacters = 1 << 18;
+        QTextStream ts(&f);
+        ts << text.left(maxCharacters); ///< keep only the first 2^18 many \
characters +
+        if (text.length() > maxCharacters)
+            msgList << QString(QLatin1String("### Text too long, skipping %1 \
characters ###")).arg(text.length() - maxCharacters); +        /// Write all messages \
(warnings) to end of text file +        foreach(const QString &msg, msgList)
+           ts << endl << msg;
+
+        f.close();
+    }
 }
 
 QString FileInfo::doiUrlPrefix()
diff --git a/src/io/fileinfo.h b/src/io/fileinfo.h
index 27c817c9..ca7a6926 100644
--- a/src/io/fileinfo.h
+++ b/src/io/fileinfo.h
@@ -1,5 +1,5 @@
 /***************************************************************************
- *   Copyright (C) 2004-2014 by Thomas Fischer <fischer@unix-ag.uni-kl.de> *
+ *   Copyright (C) 2004-2017 by Thomas Fischer <fischer@unix-ag.uni-kl.de> *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -97,6 +97,9 @@ public:
 
 protected:
     FileInfo();
+
+private:
+    static void extractPDFTextToCache(const QString &pdfFilename, const QString \
&cacheFilename);  };
 
 #endif // KBIBTEX_IO_FILEINFO_H


[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic