[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    KDE/kdebase/runtime/nepomuk/strigibackend
From:       Vishesh Handa <handa.vish () gmail ! com>
Date:       2010-07-14 12:33:11
Message-ID: 20100714123311.A2248AC736 () svn ! kde ! org
[Download RAW message or body]

SVN commit 1149834 by vhanda:

Created the NepomukIndexFeeder. It takes all the statements from the StrigiAnalyzer, \
checks for duplicates, and creates new resources accordingly. This gives us the added \
benefit that all resources are connected to the same additional resources \
instead of duplicates being created.

Please refer to the Nepomuk mailing list (StrigiFeeder thread, 13-14 July 2010) for a more \
in-depth discussion of the problem.


 M  +1 -0      CMakeLists.txt  
 A             nepomukindexfeeder.cpp   [License: GPL (v2+)]
 A             nepomukindexfeeder.h   [License: GPL (v2+)]
 M  +76 -112   nepomukindexwriter.cpp  


--- trunk/KDE/kdebase/runtime/nepomuk/strigibackend/CMakeLists.txt #1149833:1149834
@@ -11,6 +11,7 @@
   nepomukindexmanager.cpp
   nepomukindexreader.cpp
   nepomukindexwriter.cpp
+  nepomukindexfeeder.cpp
   util.cpp
 )
 
--- trunk/KDE/kdebase/runtime/nepomuk/strigibackend/nepomukindexwriter.cpp \
#1149833:1149834 @@ -1,5 +1,6 @@
 /*
   Copyright (C) 2007-2010 Sebastian Trueg <trueg@kde.org>
+  Copyright (C) 2010 Vishesh Handa <handa.vish@gmail.com>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
@@ -22,6 +23,7 @@
 #include "nfo.h"
 #include "nie.h"
 #include "nrl.h"
+#include "nepomukindexfeeder.h"
 
 #include <Soprano/Soprano>
 #include <Soprano/Vocabulary/RDF>
@@ -118,6 +120,25 @@
         return uri;
     }
 
+    class RegisteredFieldData
+    {
+    public:
+        RegisteredFieldData( const QUrl& prop, QVariant::Type t )
+        : property( prop ),
+        dataType( t ),
+        isRdfType( prop == Vocabulary::RDF::type() ) {
+        }
+
+        /// The actual property URI
+        QUrl property;
+
+        /// the literal range of the property (if applicable)
+        QVariant::Type dataType;
+
+        /// caching QUrl comparison
+        bool isRdfType;
+    };
+
     /**
      * Data objects that are used to store information relative to one
      * indexing run.
@@ -128,7 +149,7 @@
         FileMetaData( const Strigi::AnalysisResult* idx );
 
         /// stores basic data including the nie:url and the nrl:GraphMetadata in \p \
                model
-        void storeBasicData( Soprano::Model* model );
+        void storeBasicData( Nepomuk::NepomukIndexFeeder* feeder );
 
         /// map a blank node to a resource
         QUrl mapNode( const std::string& s );
@@ -142,39 +163,14 @@
         /// The file info - saved to prevent multiple stats
         QFileInfo fileInfo;
 
-        /// The URI of the graph that contains all indexed statements
-        QUrl context;
-
         /// a buffer for all plain-text content generated by strigi
         std::string content;
 
     private:
         /// The Strigi result
         const Strigi::AnalysisResult* m_analysisResult;
-
-        /// mapping from blank nodes used in addTriplet to our urns
-        QMap<std::string, QUrl> m_blankNodeMap;
     };
 
-    class RegisteredFieldData
-    {
-    public:
-        RegisteredFieldData( const QUrl& prop, QVariant::Type t )
-            : property( prop ),
-              dataType( t ),
-              isRdfType( prop == Vocabulary::RDF::type() ) {
-        }
-
-        /// The actual property URI
-        QUrl property;
-
-        /// the literal range of the property (if applicable)
-        QVariant::Type dataType;
-
-        /// caching QUrl comparison
-        bool isRdfType;
-    };
-
     FileMetaData::FileMetaData( const Strigi::AnalysisResult* idx )
         : m_analysisResult( idx )
     {
@@ -185,94 +181,68 @@
         // this will automatically find previous uses of the file in question
         // with backwards compatibility
         resourceUri = Nepomuk::Resource( fileUrl ).resourceUri();
-
-        // use a new random context URI
-        context = Nepomuk::ResourceManager::instance()->generateUniqueUri( "ctx" );
     }
 
-    QUrl FileMetaData::mapNode( const std::string& s )
+    void FileMetaData::storeBasicData( Nepomuk::NepomukIndexFeeder * feeder )
     {
-        if ( s[0] == ':' ) {
-            if( m_blankNodeMap.contains( s ) ) {
-                return m_blankNodeMap[s];
-            }
-            else {
-                QUrl urn = Nepomuk::ResourceManager::instance()->generateUniqueUri( \
                QString() );
-                m_blankNodeMap.insert( s, urn );
-                return urn;
-            }
-        }
-        // special case to properly handle nie:isPartOf relations created for \
                containers
-        else if ( s == m_analysisResult->path() ) {
-            return resourceUri;
-        }
-        else {
-            return QUrl::fromEncoded( s.c_str() );
-        }
-    }
+        feeder->addStatement( resourceUri, Nepomuk::Vocabulary::NIE::url(), fileUrl \
);  
-    void FileMetaData::storeBasicData( Soprano::Model* model )
-    {
-        model->addStatement( resourceUri, Nepomuk::Vocabulary::NIE::url(), fileUrl, \
                context );
-
         // Strigi only indexes files and extractors mostly (if at all) store the \
                nie:DataObject type (i.e. the contents)
         // Thus, here we go the easy way and mark each indexed file as a \
                nfo:FileDataObject.
-        model->addStatement( resourceUri,
+        feeder->addStatement( resourceUri,
                              Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NFO::FileDataObject(),
-                             context );
+                              Nepomuk::Vocabulary::NFO::FileDataObject() );
         if ( fileInfo.isDir() ) {
-            model->addStatement( resourceUri,
+            feeder->addStatement( resourceUri,
                                  Vocabulary::RDF::type(),
-                                 Nepomuk::Vocabulary::NFO::Folder(),
-                                 context );
+                                  Nepomuk::Vocabulary::NFO::Folder() );
         }
-
-
-        // create the provedance data for the data graph
-        // TODO: add more data at some point when it becomes of interest
-        QUrl metaDataContext = \
                Nepomuk::ResourceManager::instance()->generateUniqueUri( "ctx" );
-        model->addStatement( context,
-                             Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NRL::DiscardableInstanceBase(),
-                             metaDataContext );
-        model->addStatement( context,
-                             Vocabulary::NAO::created(),
-                             LiteralValue( QDateTime::currentDateTime() ),
-                             metaDataContext );
-        model->addStatement( context,
-                             Strigi::Ontology::indexGraphFor(),
-                             resourceUri,
-                             metaDataContext );
-        model->addStatement( metaDataContext,
-                             Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NRL::GraphMetadata(),
-                             metaDataContext );
-        model->addStatement( metaDataContext,
-                             Nepomuk::Vocabulary::NRL::coreGraphMetadataFor(),
-                             context,
-                             metaDataContext );
     }
 
     FileMetaData* fileDataForResult( const Strigi::AnalysisResult* idx )
     {
         return static_cast<FileMetaData*>( idx->writerData() );
     }
+
+    /**
+     * Creates a Blank or Resource Node based on the contents of the string \
provided. +     * If the string is of the form ':identifier', a Blank node is \
created. +     * Otherwise a Resource Node is returned.
+     */
+    Soprano::Node createBlankOrResourceNode( const std::string & str ) {
+        QString identifier = QString::fromUtf8( str.c_str() );
+
+        if( !identifier.isEmpty() && identifier[0] == ':' ) {
+            identifier.remove( 0, 1 );
+            return Soprano::Node::createBlankNode( identifier );
 }
 
+        //Not a blank node
+        return Soprano::Node( QUrl(identifier) );
+    }
+}
 
+
 class Strigi::NepomukIndexWriter::Private
 {
 public:
-    Private()
+    Private( Soprano::Model * model )
+        : repository( model )
     {
         literalTypes[FieldRegister::stringType] = QVariant::String;
         literalTypes[FieldRegister::floatType] = QVariant::Double;
         literalTypes[FieldRegister::integerType] = QVariant::Int;
         literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
         literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi \
encodes datetime as unsigned integer, i.e. addValue( ..., uint ) +
+        feeder = new Nepomuk::NepomukIndexFeeder( model );
     }
 
+    ~Private()
+    {
+        delete feeder;
+    }
+
     QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
         // it looks as if the typeUri can contain arbitrary values, URIs or stuff \
                like "string"
         QHash<std::string, QVariant::Type>::const_iterator it = \
literalTypes.constFind( strigiType.typeUri() ); @@ -310,6 +280,8 @@
 
     QStack<const Strigi::AnalysisResult*> currentResultStack;
 
+    Nepomuk::NepomukIndexFeeder* feeder;
+
 private:
     QHash<std::string, QVariant::Type> literalTypes;
 };
@@ -318,8 +290,7 @@
 Strigi::NepomukIndexWriter::NepomukIndexWriter( Soprano::Model* model )
     : Strigi::IndexWriter()
 {
-    d = new Private;
-    d->repository = model;
+    d = new Private( model );
     Util::storeStrigiMiniOntology( d->repository );
 }
 
@@ -387,8 +358,11 @@
     if ( data->resourceUri.isEmpty() )
         data->resourceUri = Nepomuk::ResourceManager::instance()->generateUniqueUri( \
QString() );  
+    // Initialize the feeder to accept statements
+    d->feeder->begin( data->resourceUri );
+
     // store initial data to make sure newly created URIs are reused directly by \
                libnepomuk
-    data->storeBasicData( d->repository );
+    data->storeBasicData( d->feeder );
 
     // remember the file data
     idx->setWriterData( data );
@@ -419,7 +393,7 @@
         RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( \
field->writerData() );  
         // the statement we will create, we will determine the object below
-        Soprano::Statement statement( md->resourceUri, rfd->property, \
Soprano::Node(), md->context ); +        Soprano::Statement statement( \
md->resourceUri, rfd->property, Soprano::Node() );  
         //
         // Strigi uses rdf:type improperly since it stores the value as a string. We \
have to @@ -461,12 +435,12 @@
             if ( value[0] == ':' ) {
                 Nepomuk::Types::Property property( rfd->property );
                 if ( property.range().isValid() ) {
-                    statement.setObject( md->mapNode( value ) );
+                    statement.setObject( createBlankOrResourceNode( value ) );
                 }
             }
         }
 
-        d->repository->addStatement( statement );
+        d->feeder->addStatement( statement );
     }
 }
 
@@ -504,10 +478,7 @@
         val = QDateTime::fromTime_t( value );
     }
 
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            val,
-                                            md->context) );
+    d->feeder->addStatement( md->resourceUri, rfd->property, val);
 }
 
 
@@ -522,10 +493,7 @@
     FileMetaData* md = fileDataForResult( idx );
     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( \
field->writerData() );  
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            LiteralValue( value ),
-                                            md->context) );
+    d->repository->addStatement( md->resourceUri, rfd->property, LiteralValue( value \
) );  }
 
 
@@ -540,10 +508,7 @@
     FileMetaData* md = fileDataForResult( idx );
     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( \
field->writerData() );  
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            LiteralValue( value ),
-                                            md->context) );
+    d->repository->addStatement( md->resourceUri, rfd->property, LiteralValue( value \
) );  }
 
 
@@ -555,17 +520,17 @@
         return;
     }
 
-    FileMetaData* md = fileDataForResult( d->currentResultStack.top() );
+    //FileMetaData* md = fileDataForResult( d->currentResultStack.top() );
 
-    QUrl subject = md->mapNode( s );
-    Nepomuk::Types::Property property( md->mapNode( p ) );
+    Soprano::Node subject( createBlankOrResourceNode( s ) );
+    Nepomuk::Types::Property property( QUrl( QString::fromUtf8(p.c_str()) ) ); // \
Was mapped earlier  Soprano::Node object;
     if ( property.range().isValid() )
-        object = md->mapNode( o );
+        object = Soprano::Node( createBlankOrResourceNode( o ) );
     else
         object = Soprano::LiteralValue::fromString( QString::fromUtf8( o.c_str() ), \
property.literalRangeType().dataTypeUri() );  
-    d->repository->addStatement( subject, property.uri(), object, md->context );
+    d->feeder->addStatement( subject, property.uri(), object );
 }
 
 
@@ -582,14 +547,13 @@
 
     // store the full text of the file
     if ( md->content.length() > 0 ) {
-        d->repository->addStatement( Statement( md->resourceUri,
+        d->feeder->addStatement( md->resourceUri,
                                                 \
                Nepomuk::Vocabulary::NIE::plainTextContent(),
-                                                LiteralValue( QString::fromUtf8( \
                md->content.c_str() ) ),
-                                                md->context ) );
-        if ( d->repository->lastError() )
-            kDebug() << "Failed to add" << md->resourceUri << "as text" << \
QString::fromUtf8( md->content.c_str() ); +                                 \
LiteralValue( QString::fromUtf8( md->content.c_str() ) ) );  }
 
+    d->feeder->end();
+
     // cleanup
     delete md;
     idx->setWriterData( 0 );


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic