[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    Re: KDE/kdebase/runtime/nepomuk/services/strigi
From:       Vishesh Handa <handa.vish () gmail ! com>
Date:       2010-08-11 14:57:27
Message-ID: AANLkTinpbbTu330KZVZaP0oeGWab21mmfrjQ5zq-60BH () mail ! gmail ! com
[Download RAW message or body]

Hey Sebastian

On Wed, Aug 11, 2010 at 7:48 PM, Sebastian Trueg <sebastian@trueg.de> wrote:

> SVN commit 1162155 by trueg:
>
> 1. Remove all index graphs that contain resources without a nie:url (junk
> from buggy runs)
>

Suppose the Akonadi people want to index an email attachment. That
attachment might or might not have a nie:url. If it doesn't, then this
change would remove its indexed data.

Or are you going to keep it mandatory for each indexed file to have a
nie:url?

- Vishesh Handa


> 2. Remove graphs in batches to avoid the iterator breakage that comes with
> removing iterated items.
>
>  M  +1 -0      CMakeLists.txt
>  M  +58 -31    indexscheduler.cpp
>  M  +1 -0      indexscheduler.h
>
>
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/CMakeLists.txt
> #1162154:1162155
> @@ -60,6 +60,7 @@
>   ${KDE4_KIO_LIBS}
>   ${KDE4_SOLID_LIBS}
>   ${KDE4_KIDLETIME_LIBS}
> +  ${NEPOMUK_QUERY_LIBRARIES}
>   ${NEPOMUK_LIBRARIES}
>   ${SOPRANO_LIBRARIES}
>   )
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.cpp
> #1162154:1162155
> @@ -23,6 +23,7 @@
>  #include "indexscheduler.h"
>  #include "strigiserviceconfig.h"
>  #include "nepomukindexer.h"
> +#include "util.h"
>  #include "nfo.h"
>  #include "nie.h"
>
> @@ -42,11 +43,15 @@
>  #include <Nepomuk/Resource>
>  #include <Nepomuk/ResourceManager>
>  #include <Nepomuk/Variant>
> +#include <Nepomuk/Query/Query>
> +#include <Nepomuk/Query/ComparisonTerm>
> +#include <Nepomuk/Query/ResourceTerm>
>
>  #include <Soprano/Model>
>  #include <Soprano/QueryResultIterator>
>  #include <Soprano/NodeIterator>
>  #include <Soprano/Node>
> +#include <Soprano/Vocabulary/RDF>
>  #include <Soprano/Vocabulary/Xesam>
>
>  #include <map>
> @@ -544,7 +549,7 @@
>     // We query all files that should not be in the store
>     // This for example excludes all filex:/ URLs.
>     //
> -    QString query = QString::fromLatin1( "select distinct ?g ?url where {
> "
> +    QString query = QString::fromLatin1( "select distinct ?g where { "
>                                          "?r %1 ?url . "
>                                          "?g <
> http://www.strigi.org/fields#indexGraphFor> ?r . "
>
>  "FILTER(REGEX(STR(?url),'^file:/')) . "
> @@ -552,21 +557,10 @@
>                     .arg( Soprano::Node::resourceToN3(
> Nepomuk::Vocabulary::NIE::url() ),
>                           folderFilter );
>     kDebug() << query;
> +    if ( !removeAllGraphsFromQuery( query ) )
> +        return;
>
> -    Soprano::QueryResultIterator it =
> ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> -    while ( it.next() ) {
>
> -        // wait for resume or stop (or simply continue)
> -        if ( !waitForContinue() ) {
> -            break;
> -        }
> -
> -        const Soprano::Node& g = it[0];
> -        kDebug() << "REMOVING" << it["url"].uri();
> -        ResourceManager::instance()->mainModel()->removeContext( g );
> -    }
> -
> -
>     //
>     // Build filter query for all exclude filters
>     //
> @@ -584,7 +578,7 @@
>     else if( !includeExcludeFilters.isEmpty() )
>         filters = QString::fromLatin1("FILTER(%1) .").arg(
> includeExcludeFilters );
>
> -    query = QString::fromLatin1( "select distinct ?g ?url where { "
> +    query = QString::fromLatin1( "select distinct ?g where { "
>                                  "?r %1 ?url . "
>                                  "?r %2 ?fn . "
>                                  "?g <
> http://www.strigi.org/fields#indexGraphFor> ?r . "
> @@ -594,20 +588,10 @@
>                   Soprano::Node::resourceToN3(
> Nepomuk::Vocabulary::NFO::fileName() ),
>                   filters );
>     kDebug() << query;
> -    it = ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> -    while ( it.next() ) {
> +    if ( !removeAllGraphsFromQuery( query ) )
> +        return;
>
> -        // wait for resume or stop (or simply continue)
> -        if ( !waitForContinue() ) {
> -            break;
> -        }
>
> -        const Soprano::Node& g = it[0];
> -        kDebug() << "REMOVING" << it["url"].uri();
> -        ResourceManager::instance()->mainModel()->removeContext( g );
> -    }
> -
> -
>     //
>     // Remove all old data from Xesam-times. While we leave out the data
> created by libnepomuk
>     // there is no problem since libnepomuk still uses backwards compatible
> queries and we use
> @@ -620,19 +604,62 @@
>                                  "{ graph ?g { ?r2 %1 ?u2 . } } "
>                                  "}" )
>             .arg( Soprano::Node::resourceToN3(
> Soprano::Vocabulary::Xesam::url() ) );
> -    it = ResourceManager::instance()->mainModel()->executeQuery( query,
> Soprano::Query::QueryLanguageSparql );
> -    while ( it.next() ) {
> +    kDebug() << query;
> +    if ( !removeAllGraphsFromQuery( query ) )
> +        return;
>
> +
> +    //
> +    // Remove data which is useless but still around from before. This
> could happen due to some buggy version of
> +    // the indexer or the filewatch service or even some application
> messing up the data.
> +    // We look for indexed files that do not have a nie:url defined and
> thus, will never be catched by any of the
> +    // other queries.
> +    //
> +    query = Query::Query(
> +        Strigi::Ontology::indexGraphFor() == (
> Soprano::Vocabulary::RDF::type() == Query::ResourceTerm(
> Nepomuk::Vocabulary::NFO::FileDataObject() ) &&
> +                                               !(
> Nepomuk::Vocabulary::NIE::url() == Query::Term() ) )
> +        ).toSparqlQuery(Query::Query::NoResultRestrictions);
> +    kDebug() << query;
> +    removeAllGraphsFromQuery( query );
> +}
> +
> +
> +
> +/**
> + * Runs the query using a limit until all graphs have been deleted. This
> is not done
> + * in one big loop to avoid the problems with messed up iterators when one
> of the iterated
> + * item is deleted.
> + */
> +bool Nepomuk::IndexScheduler::removeAllGraphsFromQuery( const QString&
> query )
> +{
> +    while ( 1 ) {
> +        // get the next batch of graphs
> +        QList<Soprano::Node> graphs
> +            = ResourceManager::instance()->mainModel()->executeQuery(
> query + QLatin1String( " LIMIT 200" ),
> +
>  Soprano::Query::QueryLanguageSparql ).iterateBindings( 0 ).allNodes();
> +
> +        // remove all graphs in the batch
> +        Q_FOREACH( const Soprano::Node& graph, graphs ) {
> +
>         // wait for resume or stop (or simply continue)
>         if ( !waitForContinue() ) {
> -            break;
> +                return false;
>         }
>
> -        ResourceManager::instance()->mainModel()->removeContext( it[0] );
> +            ResourceManager::instance()->mainModel()->removeContext( graph
> );
>     }
> +
> +        // we are done when the last graphs are queried
> +        if ( graphs.count() < 200 ) {
> +            return true;
>  }
> +    }
>
> +    // make gcc shut up
> +    return true;
> +}
>
> +
>  QDebug Nepomuk::operator<<( QDebug dbg, IndexScheduler::IndexingSpeed
> speed )
>  {
>     dbg << ( int )speed;
> --- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.h
> #1162154:1162155
> @@ -206,6 +206,7 @@
>          * to index anymore.
>          */
>         void removeOldAndUnwantedEntries();
> +        bool removeAllGraphsFromQuery( const QString& query_ );
>
>         bool m_suspended;
>         bool m_stopped;
>

[Attachment #3 (text/html)]

Hey Sebastian<br><br><div class="gmail_quote">On Wed, Aug 11, 2010 at 7:48 PM, \
Sebastian Trueg <span dir="ltr">&lt;<a \
href="mailto:sebastian@trueg.de">sebastian@trueg.de</a>&gt;</span> \
wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px \
#ccc solid;padding-left:1ex;"> SVN commit 1162155 by trueg:<br>
<br>
1. Remove all index graphs that contain resources without a nie:url (junk from buggy \
runs)<br></blockquote><div><br></div><div>Suppose the Akonadi people want to Index an \
email attachment. That attachment might or might not have a nie:url. If it \
doesn&#39;t then this would remove the indexed data. </div> <div><br></div><div>Or \
are you going to keep it mandatory for each indexed file to have a nie:url? \
</div><div><br></div><div>- Vishesh Handa</div><div> </div><blockquote \
class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc \
solid;padding-left:1ex;">

2. Remove graphs in batches to avoid the iterator breakage that comes with removing \
iterated items.<br> <br>
 M  +1 -0      CMakeLists.txt<br>
 M  +58 -31    indexscheduler.cpp<br>
 M  +1 -0      indexscheduler.h<br>
<br>
<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/CMakeLists.txt \
#1162154:1162155<br> @@ -60,6 +60,7 @@<br>
   ${KDE4_KIO_LIBS}<br>
   ${KDE4_SOLID_LIBS}<br>
   ${KDE4_KIDLETIME_LIBS}<br>
+  ${NEPOMUK_QUERY_LIBRARIES}<br>
   ${NEPOMUK_LIBRARIES}<br>
   ${SOPRANO_LIBRARIES}<br>
   )<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.cpp \
#1162154:1162155<br> @@ -23,6 +23,7 @@<br>
 #include &quot;indexscheduler.h&quot;<br>
 #include &quot;strigiserviceconfig.h&quot;<br>
 #include &quot;nepomukindexer.h&quot;<br>
+#include &quot;util.h&quot;<br>
 #include &quot;nfo.h&quot;<br>
 #include &quot;nie.h&quot;<br>
<br>
@@ -42,11 +43,15 @@<br>
 #include &lt;Nepomuk/Resource&gt;<br>
 #include &lt;Nepomuk/ResourceManager&gt;<br>
 #include &lt;Nepomuk/Variant&gt;<br>
+#include &lt;Nepomuk/Query/Query&gt;<br>
+#include &lt;Nepomuk/Query/ComparisonTerm&gt;<br>
+#include &lt;Nepomuk/Query/ResourceTerm&gt;<br>
<br>
 #include &lt;Soprano/Model&gt;<br>
 #include &lt;Soprano/QueryResultIterator&gt;<br>
 #include &lt;Soprano/NodeIterator&gt;<br>
 #include &lt;Soprano/Node&gt;<br>
+#include &lt;Soprano/Vocabulary/RDF&gt;<br>
 #include &lt;Soprano/Vocabulary/Xesam&gt;<br>
<br>
 #include &lt;map&gt;<br>
@@ -544,7 +549,7 @@<br>
     // We query all files that should not be in the store<br>
     // This for example excludes all filex:/ URLs.<br>
     //<br>
-    QString query = QString::fromLatin1( &quot;select distinct ?g ?url where { \
&quot;<br> +    QString query = QString::fromLatin1( &quot;select distinct ?g where { \
&quot;<br>  &quot;?r %1 ?url . &quot;<br>
                                          &quot;?g &lt;<a \
href="http://www.strigi.org/fields#indexGraphFor" \
                target="_blank">http://www.strigi.org/fields#indexGraphFor</a>&gt; ?r \
                . &quot;<br>
                                          \
&quot;FILTER(REGEX(STR(?url),&#39;^file:/&#39;)) . &quot;<br> @@ -552,21 +557,10 \
                @@<br>
                     .arg( Soprano::Node::resourceToN3( \
Nepomuk::Vocabulary::NIE::url() ),<br>  folderFilter );<br>
     kDebug() &lt;&lt; query;<br>
+    if ( !removeAllGraphsFromQuery( query ) )<br>
+        return;<br>
<br>
-    Soprano::QueryResultIterator it = \
ResourceManager::instance()-&gt;mainModel()-&gt;executeQuery( query, \
                Soprano::Query::QueryLanguageSparql );<br>
-    while ( it.next() ) {<br>
<br>
-        // wait for resume or stop (or simply continue)<br>
-        if ( !waitForContinue() ) {<br>
-            break;<br>
-        }<br>
-<br>
-        const Soprano::Node&amp; g = it[0];<br>
-        kDebug() &lt;&lt; &quot;REMOVING&quot; &lt;&lt; \
                it[&quot;url&quot;].uri();<br>
-        ResourceManager::instance()-&gt;mainModel()-&gt;removeContext( g );<br>
-    }<br>
-<br>
-<br>
     //<br>
     // Build filter query for all exclude filters<br>
     //<br>
@@ -584,7 +578,7 @@<br>
     else if( !includeExcludeFilters.isEmpty() )<br>
         filters = QString::fromLatin1(&quot;FILTER(%1) .&quot;).arg( \
includeExcludeFilters );<br> <br>
-    query = QString::fromLatin1( &quot;select distinct ?g ?url where { &quot;<br>
+    query = QString::fromLatin1( &quot;select distinct ?g where { &quot;<br>
                                  &quot;?r %1 ?url . &quot;<br>
                                  &quot;?r %2 ?fn . &quot;<br>
                                  &quot;?g &lt;<a \
href="http://www.strigi.org/fields#indexGraphFor" \
target="_blank">http://www.strigi.org/fields#indexGraphFor</a>&gt; ?r . &quot;<br> @@ \
                -594,20 +588,10 @@<br>
                   Soprano::Node::resourceToN3( Nepomuk::Vocabulary::NFO::fileName() \
),<br>  filters );<br>
     kDebug() &lt;&lt; query;<br>
-    it = ResourceManager::instance()-&gt;mainModel()-&gt;executeQuery( query, \
                Soprano::Query::QueryLanguageSparql );<br>
-    while ( it.next() ) {<br>
+    if ( !removeAllGraphsFromQuery( query ) )<br>
+        return;<br>
<br>
-        // wait for resume or stop (or simply continue)<br>
-        if ( !waitForContinue() ) {<br>
-            break;<br>
-        }<br>
<br>
-        const Soprano::Node&amp; g = it[0];<br>
-        kDebug() &lt;&lt; &quot;REMOVING&quot; &lt;&lt; \
                it[&quot;url&quot;].uri();<br>
-        ResourceManager::instance()-&gt;mainModel()-&gt;removeContext( g );<br>
-    }<br>
-<br>
-<br>
     //<br>
     // Remove all old data from Xesam-times. While we leave out the data created by \
                libnepomuk<br>
     // there is no problem since libnepomuk still uses backwards compatible queries \
and we use<br> @@ -620,19 +604,62 @@<br>
                                  &quot;{ graph ?g { ?r2 %1 ?u2 . } } &quot;<br>
                                  &quot;}&quot; )<br>
             .arg( Soprano::Node::resourceToN3( Soprano::Vocabulary::Xesam::url() ) \
                );<br>
-    it = ResourceManager::instance()-&gt;mainModel()-&gt;executeQuery( query, \
                Soprano::Query::QueryLanguageSparql );<br>
-    while ( it.next() ) {<br>
+    kDebug() &lt;&lt; query;<br>
+    if ( !removeAllGraphsFromQuery( query ) )<br>
+        return;<br>
<br>
+<br>
+    //<br>
+    // Remove data which is useless but still around from before. This could happen \
due to some buggy version of<br> +    // the indexer or the filewatch service or even \
some application messing up the data.<br> +    // We look for indexed files that do \
not have a nie:url defined and thus, will never be catched by any of the<br> +    // \
other queries.<br> +    //<br>
+    query = Query::Query(<br>
+        Strigi::Ontology::indexGraphFor() == ( Soprano::Vocabulary::RDF::type() == \
Query::ResourceTerm( Nepomuk::Vocabulary::NFO::FileDataObject() ) &amp;&amp;<br> +    \
!( Nepomuk::Vocabulary::NIE::url() == Query::Term() ) )<br> +        \
).toSparqlQuery(Query::Query::NoResultRestrictions);<br> +    kDebug() &lt;&lt; \
query;<br> +    removeAllGraphsFromQuery( query );<br>
+}<br>
+<br>
+<br>
+<br>
+/**<br>
+ * Runs the query using a limit until all graphs have been deleted. This is not \
done<br> + * in one big loop to avoid the problems with messed up iterators when one \
of the iterated<br> + * item is deleted.<br>
+ */<br>
+bool Nepomuk::IndexScheduler::removeAllGraphsFromQuery( const QString&amp; query \
)<br> +{<br>
+    while ( 1 ) {<br>
+        // get the next batch of graphs<br>
+        QList&lt;Soprano::Node&gt; graphs<br>
+            = ResourceManager::instance()-&gt;mainModel()-&gt;executeQuery( query + \
QLatin1String( &quot; LIMIT 200&quot; ),<br> +                                        \
Soprano::Query::QueryLanguageSparql ).iterateBindings( 0 ).allNodes();<br> +<br>
+        // remove all graphs in the batch<br>
+        Q_FOREACH( const Soprano::Node&amp; graph, graphs ) {<br>
+<br>
         // wait for resume or stop (or simply continue)<br>
         if ( !waitForContinue() ) {<br>
-            break;<br>
+                return false;<br>
         }<br>
<br>
-        ResourceManager::instance()-&gt;mainModel()-&gt;removeContext( it[0] );<br>
+            ResourceManager::instance()-&gt;mainModel()-&gt;removeContext( graph \
);<br>  }<br>
+<br>
+        // we are done when the last graphs are queried<br>
+        if ( graphs.count() &lt; 200 ) {<br>
+            return true;<br>
 }<br>
+    }<br>
<br>
+    // make gcc shut up<br>
+    return true;<br>
+}<br>
<br>
+<br>
 QDebug Nepomuk::operator&lt;&lt;( QDebug dbg, IndexScheduler::IndexingSpeed speed \
)<br>  {<br>
     dbg &lt;&lt; ( int )speed;<br>
--- trunk/KDE/kdebase/runtime/nepomuk/services/strigi/indexscheduler.h \
#1162154:1162155<br> @@ -206,6 +206,7 @@<br>
          * to index anymore.<br>
          */<br>
         void removeOldAndUnwantedEntries();<br>
+        bool removeAllGraphsFromQuery( const QString&amp; query_ );<br>
<br>
         bool m_suspended;<br>
         bool m_stopped;<br>
</blockquote></div><br>



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic