[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    playground/base/nepomuk-kde/scribo/plugins/pimo
From:       Sebastian Trueg <sebastian () trueg ! de>
Date:       2010-11-22 13:42:44
Message-ID: 20101122134244.94A55AC8A0 () svn ! kde ! org
[Download RAW message or body]

SVN commit 1199714 by trueg:

Replaced the simple query-based token detection with the one
written by Mikhail Kotelnikov, which uses a token tree of all
named entities.
This is faster and also handles tokens which contain strings.
On the other hand, it does not handle substring matching yet,
and it keeps an in-memory tree of all named entities.


 M  +3 -0      CMakeLists.txt  
 M  +77 -75    pimotextmatchplugin.cpp  
 M  +5 -6      pimotextmatchplugin.h  
 AM            tokendetector.cpp   [License: LGPL]
 AM            tokendetector.h   [License: LGPL]
 AM            tokennode.cpp   [License: LGPL]
 AM            tokennode.h   [License: LGPL]
 AM            tokentree.cpp   [License: LGPL]
 AM            tokentree.h   [License: LGPL]


--- trunk/playground/base/nepomuk-kde/scribo/plugins/pimo/CMakeLists.txt \
#1199713:1199714 @@ -9,6 +9,9 @@
 
 set(pimotextmatchplugin_SRCS
   pimotextmatchplugin.cpp
+  tokentree.cpp
+  tokennode.cpp
+  tokendetector.cpp
   )
 
 kde4_add_plugin(scribo_pimotextmatchplugin ${pimotextmatchplugin_SRCS})
--- trunk/playground/base/nepomuk-kde/scribo/plugins/pimo/pimotextmatchplugin.cpp \
#1199713:1199714 @@ -22,6 +22,8 @@
 #include "pimo.h"
 #include "textoccurrence.h"
 #include "entity.h"
+#include "tokentree.h"
+#include "tokendetector.h"
 
 #include <Soprano/Model>
 #include <Soprano/QueryResultIterator>
@@ -42,119 +44,119 @@
 #include <KPluginFactory>
 #include <KDebug>
 
+#include <QtCore/QThread>
 
-namespace {
-    const int s_minLength = 3;
-}
+class PimoTextMatchPlugin::WorkThread : public QThread
+{
+public:
+    WorkThread( QObject* parent );
+    ~WorkThread();
 
-PimoTextMatchPlugin::PimoTextMatchPlugin( QObject* parent, const QVariantList& )
-    : TextMatchPlugin( parent )
+    void start(const QString& text);
+    void run();
+
+private:
+    void buildTokenTree();
+    QString m_text;
+    TokenTree* m_tokenTree;
+};
+
+PimoTextMatchPlugin::WorkThread::WorkThread( QObject* parent )
+    : QThread(parent),
+      m_tokenTree(0)
 {
-    m_stopWords << QLatin1String( "and" )
-                << QLatin1String( "or" )
-                << QLatin1String( "the" )
-                << QLatin1String( "that" )
-                << QLatin1String( "this" )
-                << QLatin1String( "there" )
-                << QLatin1String( "for" )
-                << QLatin1String( "not" )
-                << QLatin1String( "are" )
-                << QLatin1String( "but" )
-                << QLatin1String( "into" )
-                << QLatin1String( "with" );
 }
 
-
-PimoTextMatchPlugin::~PimoTextMatchPlugin()
+PimoTextMatchPlugin::WorkThread::~WorkThread()
 {
+    delete m_tokenTree;
 }
 
-
-void PimoTextMatchPlugin::doGetPossibleMatches( const QString& text )
+void PimoTextMatchPlugin::WorkThread::start(const QString &text)
 {
     m_text = text;
-    m_pos = 0;
-    scanText();
+    QThread::start();
 }
 
+void PimoTextMatchPlugin::WorkThread::run()
+{
+    buildTokenTree();
+    TokenDetector* detector = new TokenDetector(m_tokenTree);
+    connect(detector, SIGNAL(tokenFound(int,int,QVariant)),
+            parent(), SLOT(slotTokenFound(int,int,QVariant)),
+            Qt::QueuedConnection);
+    foreach(const QChar& ch, m_text) {
+        detector->update(ch);
+    }
+    detector->finish();
+    delete detector;
+}
 
-void PimoTextMatchPlugin::scanText()
+void PimoTextMatchPlugin::WorkThread::buildTokenTree()
 {
-    // extract next word
-    int pos = m_text.indexOf( QRegExp( "\\W" ), m_pos );
-    if ( pos != -1 ) {
-        QString word = m_text.mid( m_pos, pos-m_pos ).simplified();
-        queryWord( word );
+    if(!m_tokenTree) {
+        m_tokenTree = new TokenTree();
+        // populate tree
+        Nepomuk::Query::Query query(
+                    Nepomuk::Query::ResourceTypeTerm( \
Nepomuk::Vocabulary::PIMO::Thing() ) || +                    \
Nepomuk::Query::ResourceTypeTerm( Soprano::Vocabulary::NAO::Tag() ) +                 \
); +        query.addRequestProperty(Nepomuk::Query::Query::RequestProperty(Soprano::Vocabulary::NAO::prefLabel(), \
false));  
-        // scan for next word without blocking
-        m_pos = pos+1;
-        QMetaObject::invokeMethod( this, "scanText", Qt::QueuedConnection );
+        kDebug() << query.toSparqlQuery();
+
+        Soprano::QueryResultIterator it
+                = Nepomuk::ResourceManager::instance()->mainModel()->executeQuery( \
query.toSparqlQuery(), +                                                              \
Soprano::Query::QueryLanguageSparql ); +        while ( it.next() ) {
+            const QUrl res( it[0].uri() );
+            const QString label( it[1].toString() );
+            m_tokenTree->add(label, QVariant::fromValue(res));
     }
-    else {
-        if ( m_text.length() > m_pos ) {
-            QString word = m_text.mid( m_pos );
-            queryWord( word );
         }
-        emitFinished();
     }
+
+
+
+PimoTextMatchPlugin::PimoTextMatchPlugin( QObject* parent, const QVariantList& )
+    : TextMatchPlugin( parent ),
+      m_workThread(0)
+{
 }
 
 
-namespace {
-double calculateRankTheDumbWay(const QString& queryString, const QString& name)
+PimoTextMatchPlugin::~PimoTextMatchPlugin()
 {
-    return 1.0 - double(name.length() - queryString.length()) / \
double(name.length());  }
-}
 
-bool PimoTextMatchPlugin::queryWord( const QString& word )
+
+void PimoTextMatchPlugin::doGetPossibleMatches( const QString& text )
 {
-    if ( word.length() < s_minLength ) {
-//        kDebug() << word << "too short";
-        return false;
+    if(!m_workThread) {
+        m_workThread = new WorkThread(this);
+        connect(m_workThread, SIGNAL(finished()),
+                this, SLOT(emitFinished()));
     }
-    else if ( m_stopWords.contains( word.toLower() ) ) {
-        return false;
+    m_workThread->start(text);
     }
 
-//    kDebug() << "checking word" << word;
 
-    //
-    // We search quite a lot of words. Thus, we restrict ourselves to pimo things \
                and tags and
-    // only check their prefLabel.
-    //
-    Nepomuk::Query::Query query =
-        Nepomuk::Query::Query(
-            Nepomuk::Query::AndTerm(
-                Nepomuk::Query::OrTerm(
-                    Nepomuk::Query::ResourceTypeTerm( \
                Nepomuk::Vocabulary::PIMO::Thing() ),
-                    Nepomuk::Query::ResourceTypeTerm( \
                Soprano::Vocabulary::NAO::Tag() ) ),
-                Nepomuk::Query::ComparisonTerm( \
                Soprano::Vocabulary::NAO::prefLabel(),
-                                                Nepomuk::Query::LiteralTerm( word ) \
                ) ) );
-    query.setLimit( 5 );
+void PimoTextMatchPlugin::slotTokenFound(int pos, int endPos, const QVariant& value)
+{
+    kDebug() << pos << endPos << value;
 
-    kDebug() << query.toSparqlQuery();
-
-    Soprano::QueryResultIterator it
-        = Nepomuk::ResourceManager::instance()->mainModel()->executeQuery( \
                query.toSparqlQuery(),
-                                                                           \
                Soprano::Query::QueryLanguageSparql );
-    while ( it.next() ) {
-        Nepomuk::Resource res( it[0].uri() );
+    Nepomuk::Resource res( value.toUrl() );
         Scribo::Entity entity( res.genericLabel(), res.resourceType(), \
Soprano::Graph(), res );  
         Scribo::TextOccurrence oc;
-        oc.setStartPos( m_pos );
-        oc.setLength( word.length() );
-        oc.setRelevance( calculateRankTheDumbWay(word, entity.label()) );
+    oc.setStartPos( pos );
+    oc.setLength( endPos-pos+1 );
+    oc.setRelevance( 1.0 ); // TokenTree only produces perfect matches for now!
         entity.addOccurrence( oc );
 
         addNewMatch( entity );
     }
 
-    return true;
-}
-
-
 SCRIBO_EXPORT_TEXTMATCH_PLUGIN( PimoTextMatchPlugin, "scribo_pimotextmatchplugin" )
 
 #include "pimotextmatchplugin.moc"
--- trunk/playground/base/nepomuk-kde/scribo/plugins/pimo/pimotextmatchplugin.h \
#1199713:1199714 @@ -1,6 +1,6 @@
 /*
  * This file is part of the Nepomuk KDE project.
- * Copyright (c) 2009 Sebastian Trueg <trueg@kde.org>
+ * Copyright (c) 2009-2010 Sebastian Trueg <trueg@kde.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
@@ -26,6 +26,7 @@
 #include <QtCore/QVariant>
 #include <QtCore/QStringList>
 
+
 /**
  * TextMatchPlugin that simply looks through the text, looking for each word
  * longer than N chars in the nao:prefLabel of each thing in the Nepomuk db.
@@ -42,13 +43,11 @@
     void doGetPossibleMatches( const QString& text );
 
 private Q_SLOTS:
-    void scanText();
-    bool queryWord( const QString& word );
+    void slotTokenFound(int pos, int endPos, const QVariant &value);
 
 private:
-    QString m_text;
-    int m_pos;
-    QStringList m_stopWords;
+    class WorkThread;
+    WorkThread* m_workThread;
 };
 
 #endif


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic