Git commit 2f348ba4f482d6d9ac8b7b27535cfcf96818469c by Thomas Fischer.
Committed on 31/03/2017 at 21:37.
Pushed by thomasfischer into branch 'kbibtex/0.7'.

Parallelize text extraction from PDF files

As extracting text from PDF files blocked the application, text
extraction has been refactored to make use of QtConcurrent, i.e.
text extraction will run in one or multiple parallel threads
independent of main/GUI thread.
Search/filter results will not update once text extracting is
complete, but any new search will make use of it.

M  +0    -2    src/gui/file/filemodel.cpp
M  +44   -27   src/io/fileinfo.cpp
M  +4    -1    src/io/fileinfo.h

https://commits.kde.org/kbibtex/2f348ba4f482d6d9ac8b7b27535cfcf96818469c

diff --git a/src/gui/file/filemodel.cpp b/src/gui/file/filemodel.cpp
index 57270f09..97db8bac 100644
--- a/src/gui/file/filemodel.cpp
+++ b/src/gui/file/filemodel.cpp
@@ -208,8 +208,6 @@ bool SortFilterFileModel::filterAcceptsRow(int source_row, const QModelIndex &so
         if (m_filterQuery.searchPDFfiles && m_filterQuery.field.isEmpty()) ///< not filtering for any specific field
             foreach(const KUrl &url, FileInfo::entryUrls(entry.data(), fileSourceModel()->bibliographyFile()->property(File::Url, QUrl()).toUrl(), FileInfo::TestExistenceYes)) {
                 if (url.isLocalFile() && url.fileName().endsWith(QLatin1String(".pdf"))) {
-                    // FIXME if you have a large collection of PDF files and the text version
-                    // has not been generated yet, this will freeze KBibTeX for some time
                     const QString text = FileInfo::pdfToText(url.pathOrUrl());
                     int i = 0;
                     for (QStringList::ConstIterator itsl = m_filterQuery.terms.constBegin(); itsl != m_filterQuery.terms.constEnd(); ++itsl, ++i)
diff --git a/src/io/fileinfo.cpp b/src/io/fileinfo.cpp
index 1f63d240..0b11c11b 100644
--- a/src/io/fileinfo.cpp
+++ b/src/io/fileinfo.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
- *   Copyright (C) 2004-2014 by Thomas Fischer                             *
+ *   Copyright (C) 2004-2017 by Thomas Fischer                             *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -267,42 +268,58 @@ QString FileInfo::pdfToText(const QString &pdfFilename)
     static const QRegExp invalidChars("[^-a-z0-9_]", Qt::CaseInsensitive);
     QString textFilename = QString(pdfFilename).remove(invalidChars).append(QLatin1String(".txt")).prepend(KStandardDirs::locateLocal("cache", "pdftotext/"));
 
-    /// Initialize return value
-    QString text;
-
     /// First, check if there is a cache text file
     if (QFileInfo(textFilename).exists()) {
         /// Load text from cache file
         QFile f(textFilename);
         if (f.open(QFile::ReadOnly)) {
             QTextStream ts(&f);
-            text = ts.readAll();
+            const QString text = ts.readAll();
             f.close();
+            return text;
         }
-    }
+    } else
+        /// No cache text exists, so run text extraction in another thread
+        QtConcurrent::run(extractPDFTextToCache, pdfFilename, textFilename);
 
-    /// Either no cache text file existed or could not load text from it
-    if (text.isEmpty()) {
-        /// Load PDF file through Poppler
-        Poppler::Document *doc = Poppler::Document::load(pdfFilename);
-        if (doc != NULL) {
-            /// Build text by appending each page's text
-            text = QLatin1String("");
-            for (int i = 0; i < doc->numPages(); ++i)
-                text.append(doc->page(i)->text(QRect())).append(QLatin1String("\n\n"));
-            delete doc;
-
-            /// Save text in cache file
-            QFile f(textFilename);
-            if (f.open(QFile::WriteOnly)) {
-                QTextStream ts(&f);
-                ts << text;
-                f.close();
-            }
-        }
-    }
+    return QString();
+}
+
+void FileInfo::extractPDFTextToCache(const QString &pdfFilename, const QString &cacheFilename) {
+    /// In case of multiple calls, skip text extraction if cache file already exists
+    if (QFile(cacheFilename).exists()) return;
 
-    return text;
+    QString text;
+    QStringList msgList;
+
+    /// Load PDF file through Poppler
+    Poppler::Document *doc = Poppler::Document::load(pdfFilename);
+    if (doc != NULL) {
+        static const int maxPages = 64;
+        /// Build text by appending each page's text
+        for (int i = 0; i < qMin(maxPages, doc->numPages()); ++i)
+            text.append(doc->page(i)->text(QRect())).append(QLatin1String("\n\n"));
+        if (doc->numPages() > maxPages)
+            msgList << QString(QLatin1String("### Skipped %1 pages as PDF file contained too many pages (limit is %2 pages) ###")).arg(doc->numPages() - maxPages).arg(maxPages);
+        delete doc;
+    } else
+        msgList << QLatin1String("### Skipped as file could not be opened as PDF file ###");
+
+    /// Save text in cache file
+    QFile f(cacheFilename);
+    if (f.open(QFile::WriteOnly)) {
+        static const int maxCharacters = 1 << 18;
+        QTextStream ts(&f);
+        ts << text.left(maxCharacters); ///< keep only the first 2^18 many characters
+
+        if (text.length() > maxCharacters)
+            msgList << QString(QLatin1String("### Text too long, skipping %1 characters ###")).arg(text.length() - maxCharacters);
+        /// Write all messages (warnings) to end of text file
+        foreach(const QString &msg, msgList)
+            ts << endl << msg;
+
+        f.close();
+    }
 }
 
 QString FileInfo::doiUrlPrefix()
diff --git a/src/io/fileinfo.h b/src/io/fileinfo.h
index 27c817c9..ca7a6926 100644
--- a/src/io/fileinfo.h
+++ b/src/io/fileinfo.h
@@ -1,5 +1,5 @@
 /***************************************************************************
- *   Copyright (C) 2004-2014 by Thomas Fischer                             *
+ *   Copyright (C) 2004-2017 by Thomas Fischer                             *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -97,6 +97,9 @@ public:
 
 protected:
     FileInfo();
+
+private:
+    static void extractPDFTextToCache(const QString &pdfFilename, const QString &cacheFilename);
 };
 
 #endif // KBIBTEX_IO_FILEINFO_H