[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: Re: KDE/kdebase/runtime/nepomuk/services/strigi
From: Vishesh Handa <handa.vish () gmail ! com>
Date: 2010-08-11 14:57:27
Message-ID: AANLkTinpbbTu330KZVZaP0oeGWab21mmfrjQ5zq-60BH () mail ! gmail ! com
[Download RAW message or body]
Hey Sebastian
On Wed, Aug 11, 2010 at 7:48 PM, Sebastian Trueg <sebastian@trueg.de> wrote:
> SVN commit 1162155 by trueg:
>
> 1. Remove all index graphs that contain resources without a nie:url (junk
> from buggy runs)
>
Suppose the Akonadi people want to index an email attachment. That
attachment might or might not have a nie:url. If it doesn't, then this would
remove the indexed data.
Or are you going to keep it mandatory for each indexed file to have a
nie:url?
- Vishesh Handa
> 2. Remove graphs in batches to avoid the iterator breakage that comes with
> removing iterated items.
>
> M +1 -0 CMakeLists.txt
> M +58 -31 indexscheduler.cpp
> M +1 -0 indexscheduler.h
>
>
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/CMakeLists.txt
> #1162154:1162155
> @@ -60,6 +60,7 @@
> ${KDE4_KIO_LIBS}
> ${KDE4_SOLID_LIBS}
> ${KDE4_KIDLETIME_LIBS}
> + ${NEPOMUK_QUERY_LIBRARIES}
> ${NEPOMUK_LIBRARIES}
> ${SOPRANO_LIBRARIES}
> )
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.cpp
> #1162154:1162155
> @@ -23,6 +23,7 @@
> #include "indexscheduler.h"
> #include "strigiserviceconfig.h"
> #include "nepomukindexer.h"
> +#include "util.h"
> #include "nfo.h"
> #include "nie.h"
>
> @@ -42,11 +43,15 @@
> #include <Nepomuk/Resource>
> #include <Nepomuk/ResourceManager>
> #include <Nepomuk/Variant>
> +#include <Nepomuk/Query/Query>
> +#include <Nepomuk/Query/ComparisonTerm>
> +#include <Nepomuk/Query/ResourceTerm>
>
> #include <Soprano/Model>
> #include <Soprano/QueryResultIterator>
> #include <Soprano/NodeIterator>
> #include <Soprano/Node>
> +#include <Soprano/Vocabulary/RDF>
> #include <Soprano/Vocabulary/Xesam>
>
> #include <map>
> @@ -544,7 +549,7 @@
> // We query all files that should not be in the store
> // This for example excludes all filex:/ URLs.
> //
> - QString query = QString::fromLatin1( "select distinct ?g ?url where {
> "
> + QString query = QString::fromLatin1( "select distinct ?g where { "
> "?r %1 ?url . "
> "?g <
> http://www.strigi.org/fields#indexGraphFor> ?r . "
>
> "FILTER(REGEX(STR(?url),'^file:/')) . "
> @@ -552,21 +557,10 @@
> .arg( Soprano::Node::resourceToN3(
> Nepomuk::Vocabulary::NIE::url() ),
> folderFilter );
> kDebug() << query;
> + if ( !removeAllGraphsFromQuery( query ) )
> + return;
>
> - Soprano::QueryResultIterator it =
> ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> - while ( it.next() ) {
>
> - // wait for resume or stop (or simply continue)
> - if ( !waitForContinue() ) {
> - break;
> - }
> -
> - const Soprano::Node& g = it[0];
> - kDebug() << "REMOVING" << it["url"].uri();
> - ResourceManager::instance()->mainModel()->removeContext( g );
> - }
> -
> -
> //
> // Build filter query for all exclude filters
> //
> @@ -584,7 +578,7 @@
> else if( !includeExcludeFilters.isEmpty() )
> filters = QString::fromLatin1("FILTER(%1) .").arg(
> includeExcludeFilters );
>
> - query = QString::fromLatin1( "select distinct ?g ?url where { "
> + query = QString::fromLatin1( "select distinct ?g where { "
> "?r %1 ?url . "
> "?r %2 ?fn . "
> "?g <
> http://www.strigi.org/fields#indexGraphFor> ?r . "
> @@ -594,20 +588,10 @@
> Soprano::Node::resourceToN3(
> Nepomuk::Vocabulary::NFO::fileName() ),
> filters );
> kDebug() << query;
> - it = ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> - while ( it.next() ) {
> + if ( !removeAllGraphsFromQuery( query ) )
> + return;
>
> - // wait for resume or stop (or simply continue)
> - if ( !waitForContinue() ) {
> - break;
> - }
>
> - const Soprano::Node& g = it[0];
> - kDebug() << "REMOVING" << it["url"].uri();
> - ResourceManager::instance()->mainModel()->removeContext( g );
> - }
> -
> -
> //
> // Remove all old data from Xesam-times. While we leave out the data
> created by libnepomuk
> // there is no problem since libnepomuk still uses backwards compatible
> queries and we use
> @@ -620,19 +604,62 @@
> "{ graph ?g { ?r2 %1 ?u2 . } } "
> "}" )
> .arg( Soprano::Node::resourceToN3(
> Soprano::Vocabulary::Xesam::url() ) );
> - it = ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> - while ( it.next() ) {
> + kDebug() << query;
> + if ( !removeAllGraphsFromQuery( query ) )
> + return;
>
> +
> + //
> + // Remove data which is useless but still around from before. This
> could happen due to some buggy version of
> + // the indexer or the filewatch service or even some application
> messing up the data.
> + // We look for indexed files that do not have a nie:url defined and
> thus, will never be catched by any of the
> + // other queries.
> + //
> + query = Query::Query(
> + Strigi::Ontology::indexGraphFor() == (
> Soprano::Vocabulary::RDF::type() == Query::ResourceTerm(
> Nepomuk::Vocabulary::NFO::FileDataObject() ) &&
> + !(
> Nepomuk::Vocabulary::NIE::url() == Query::Term() ) )
> + ).toSparqlQuery(Query::Query::NoResultRestrictions);
> + kDebug() << query;
> + removeAllGraphsFromQuery( query );
> +}
> +
> +
> +
> +/**
> + * Runs the query using a limit until all graphs have been deleted. This
> is not done
> + * in one big loop to avoid the problems with messed up iterators when one
> of the iterated
> + * item is deleted.
> + */
> +bool Nepomuk::IndexScheduler::removeAllGraphsFromQuery( const QString&
> query )
> +{
> + while ( 1 ) {
> + // get the next batch of graphs
> + QList<Soprano::Node> graphs
> + = ResourceManager::instance()->mainModel()->executeQuery(
> query + QLatin1String( " LIMIT 200" ),
> +
> Soprano::Query::QueryLanguageSparql ).iterateBindings( 0 ).allNodes();
> +
> + // remove all graphs in the batch
> + Q_FOREACH( const Soprano::Node& graph, graphs ) {
> +
> // wait for resume or stop (or simply continue)
> if ( !waitForContinue() ) {
> - break;
> + return false;
> }
>
> - ResourceManager::instance()->mainModel()->removeContext( it[0] );
> + ResourceManager::instance()->mainModel()->removeContext( graph
> );
> }
> +
> + // we are done when the last graphs are queried
> + if ( graphs.count() < 200 ) {
> + return true;
> }
> + }
>
> + // make gcc shut up
> + return true;
> +}
>
> +
> QDebug Nepomuk::operator<<( QDebug dbg, IndexScheduler::IndexingSpeed
> speed )
> {
> dbg << ( int )speed;
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.h
> #1162154:1162155
> @@ -206,6 +206,7 @@
> * to index anymore.
> */
> void removeOldAndUnwantedEntries();
> + bool removeAllGraphsFromQuery( const QString& query_ );
>
> bool m_suspended;
> bool m_stopped;
>
[Attachment #3 (text/html)]
Hey Sebastian<br><br><div class="gmail_quote">On Wed, Aug 11, 2010 at 7:48 PM, \
Sebastian Trueg <span dir="ltr"><<a \
href="mailto:sebastian@trueg.de">sebastian@trueg.de</a>></span> \
wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px \
#ccc solid;padding-left:1ex;"> SVN commit 1162155 by trueg:<br>
<br>
1. Remove all index graphs that contain resources without a nie:url (junk from buggy \
runs)<br></blockquote><div><br></div><div>Suppose the Akonadi people want to Index an \
email attachment. That attachment might or might not have a nie:url. If it \
doesn't then this would remove the indexed data. </div> <div><br></div><div>Or \
are you going to keep it mandatory for each indexed file to have a nie:url? \
</div><div><br></div><div>- Vishesh Handa</div><div> </div><blockquote \
class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc \
solid;padding-left:1ex;">
2. Remove graphs in batches to avoid the iterator breakage that comes with removing \
iterated items.<br> <br>
M +1 -0 CMakeLists.txt<br>
M +58 -31 indexscheduler.cpp<br>
M +1 -0 indexscheduler.h<br>
<br>
<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/CMakeLists.txt \
#1162154:1162155<br> @@ -60,6 +60,7 @@<br>
${KDE4_KIO_LIBS}<br>
${KDE4_SOLID_LIBS}<br>
${KDE4_KIDLETIME_LIBS}<br>
+ ${NEPOMUK_QUERY_LIBRARIES}<br>
${NEPOMUK_LIBRARIES}<br>
${SOPRANO_LIBRARIES}<br>
)<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.cpp \
#1162154:1162155<br> @@ -23,6 +23,7 @@<br>
#include "indexscheduler.h"<br>
#include "strigiserviceconfig.h"<br>
#include "nepomukindexer.h"<br>
+#include "util.h"<br>
#include "nfo.h"<br>
#include "nie.h"<br>
<br>
@@ -42,11 +43,15 @@<br>
#include <Nepomuk/Resource><br>
#include <Nepomuk/ResourceManager><br>
#include <Nepomuk/Variant><br>
+#include <Nepomuk/Query/Query><br>
+#include <Nepomuk/Query/ComparisonTerm><br>
+#include <Nepomuk/Query/ResourceTerm><br>
<br>
#include <Soprano/Model><br>
#include <Soprano/QueryResultIterator><br>
#include <Soprano/NodeIterator><br>
#include <Soprano/Node><br>
+#include <Soprano/Vocabulary/RDF><br>
#include <Soprano/Vocabulary/Xesam><br>
<br>
#include <map><br>
@@ -544,7 +549,7 @@<br>
// We query all files that should not be in the store<br>
// This for example excludes all filex:/ URLs.<br>
//<br>
- QString query = QString::fromLatin1( "select distinct ?g ?url where { \
"<br> + QString query = QString::fromLatin1( "select distinct ?g where { \
"<br> "?r %1 ?url . "<br>
"?g <<a \
href="http://www.strigi.org/fields#indexGraphFor" \
target="_blank">http://www.strigi.org/fields#indexGraphFor</a>> ?r \
. "<br>
\
"FILTER(REGEX(STR(?url),'^file:/')) . "<br> @@ -552,21 +557,10 \
@@<br>
.arg( Soprano::Node::resourceToN3( \
Nepomuk::Vocabulary::NIE::url() ),<br> folderFilter );<br>
kDebug() << query;<br>
+ if ( !removeAllGraphsFromQuery( query ) )<br>
+ return;<br>
<br>
- Soprano::QueryResultIterator it = \
ResourceManager::instance()->mainModel()->executeQuery( query, \
Soprano::Query::QueryLanguageSparql );<br>
- while ( it.next() ) {<br>
<br>
- // wait for resume or stop (or simply continue)<br>
- if ( !waitForContinue() ) {<br>
- break;<br>
- }<br>
-<br>
- const Soprano::Node& g = it[0];<br>
- kDebug() << "REMOVING" << \
it["url"].uri();<br>
- ResourceManager::instance()->mainModel()->removeContext( g );<br>
- }<br>
-<br>
-<br>
//<br>
// Build filter query for all exclude filters<br>
//<br>
@@ -584,7 +578,7 @@<br>
else if( !includeExcludeFilters.isEmpty() )<br>
filters = QString::fromLatin1("FILTER(%1) .").arg( \
includeExcludeFilters );<br> <br>
- query = QString::fromLatin1( "select distinct ?g ?url where { "<br>
+ query = QString::fromLatin1( "select distinct ?g where { "<br>
"?r %1 ?url . "<br>
"?r %2 ?fn . "<br>
"?g <<a \
href="http://www.strigi.org/fields#indexGraphFor" \
target="_blank">http://www.strigi.org/fields#indexGraphFor</a>> ?r . "<br> @@ \
-594,20 +588,10 @@<br>
Soprano::Node::resourceToN3( Nepomuk::Vocabulary::NFO::fileName() \
),<br> filters );<br>
kDebug() << query;<br>
- it = ResourceManager::instance()->mainModel()->executeQuery( query, \
Soprano::Query::QueryLanguageSparql );<br>
- while ( it.next() ) {<br>
+ if ( !removeAllGraphsFromQuery( query ) )<br>
+ return;<br>
<br>
- // wait for resume or stop (or simply continue)<br>
- if ( !waitForContinue() ) {<br>
- break;<br>
- }<br>
<br>
- const Soprano::Node& g = it[0];<br>
- kDebug() << "REMOVING" << \
it["url"].uri();<br>
- ResourceManager::instance()->mainModel()->removeContext( g );<br>
- }<br>
-<br>
-<br>
//<br>
// Remove all old data from Xesam-times. While we leave out the data created by \
libnepomuk<br>
// there is no problem since libnepomuk still uses backwards compatible queries \
and we use<br> @@ -620,19 +604,62 @@<br>
"{ graph ?g { ?r2 %1 ?u2 . } } "<br>
"}" )<br>
.arg( Soprano::Node::resourceToN3( Soprano::Vocabulary::Xesam::url() ) \
);<br>
- it = ResourceManager::instance()->mainModel()->executeQuery( query, \
Soprano::Query::QueryLanguageSparql );<br>
- while ( it.next() ) {<br>
+ kDebug() << query;<br>
+ if ( !removeAllGraphsFromQuery( query ) )<br>
+ return;<br>
<br>
+<br>
+ //<br>
+ // Remove data which is useless but still around from before. This could happen \
due to some buggy version of<br> + // the indexer or the filewatch service or even \
some application messing up the data.<br> + // We look for indexed files that do \
not have a nie:url defined and thus, will never be catched by any of the<br> + // \
other queries.<br> + //<br>
+ query = Query::Query(<br>
+ Strigi::Ontology::indexGraphFor() == ( Soprano::Vocabulary::RDF::type() == \
Query::ResourceTerm( Nepomuk::Vocabulary::NFO::FileDataObject() ) &&<br> + \
!( Nepomuk::Vocabulary::NIE::url() == Query::Term() ) )<br> + \
).toSparqlQuery(Query::Query::NoResultRestrictions);<br> + kDebug() << \
query;<br> + removeAllGraphsFromQuery( query );<br>
+}<br>
+<br>
+<br>
+<br>
+/**<br>
+ * Runs the query using a limit until all graphs have been deleted. This is not \
done<br> + * in one big loop to avoid the problems with messed up iterators when one \
of the iterated<br> + * item is deleted.<br>
+ */<br>
+bool Nepomuk::IndexScheduler::removeAllGraphsFromQuery( const QString& query \
)<br> +{<br>
+ while ( 1 ) {<br>
+ // get the next batch of graphs<br>
+ QList<Soprano::Node> graphs<br>
+ = ResourceManager::instance()->mainModel()->executeQuery( query + \
QLatin1String( " LIMIT 200" ),<br> + \
Soprano::Query::QueryLanguageSparql ).iterateBindings( 0 ).allNodes();<br> +<br>
+ // remove all graphs in the batch<br>
+ Q_FOREACH( const Soprano::Node& graph, graphs ) {<br>
+<br>
// wait for resume or stop (or simply continue)<br>
if ( !waitForContinue() ) {<br>
- break;<br>
+ return false;<br>
}<br>
<br>
- ResourceManager::instance()->mainModel()->removeContext( it[0] );<br>
+ ResourceManager::instance()->mainModel()->removeContext( graph \
);<br> }<br>
+<br>
+ // we are done when the last graphs are queried<br>
+ if ( graphs.count() < 200 ) {<br>
+ return true;<br>
}<br>
+ }<br>
<br>
+ // make gcc shut up<br>
+ return true;<br>
+}<br>
<br>
+<br>
QDebug Nepomuk::operator<<( QDebug dbg, IndexScheduler::IndexingSpeed speed \
)<br> {<br>
dbg << ( int )speed;<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.h \
#1162154:1162155<br> @@ -206,6 +206,7 @@<br>
* to index anymore.<br>
*/<br>
void removeOldAndUnwantedEntries();<br>
+ bool removeAllGraphsFromQuery( const QString& query_ );<br>
<br>
bool m_suspended;<br>
bool m_stopped;<br>
</blockquote></div><br>
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic