[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    branches/KDE/4.6/kdelibs/nepomuk/query
From:       Sebastian Trueg <sebastian () trueg ! de>
Date:       2011-01-20 16:02:49
Message-ID: 20110120160249.862E3AC8B8 () svn ! kde ! org
[Download RAW message or body]

SVN commit 1215973 by trueg:

Backport:
* Fixed the handling of quotes and keywords such as "AND", "OR", and "NOT" in LiteralTerm.
  Now correct bif:contains or regex filters are created for values.
* Made the query parser merge LiteralTerms into a single one to improve query performance.
  While merging two LiteralTerms into one does not yield the exact same query (when merged,
  both literal tokens need to appear in the same property value, while with separate LiteralTerms
  the tokens can appear in different properties), it should cover close to all typical use cases
  while increasing the performance significantly and getting rid of the nasty "Virtuoso goes
  crazy when I use KRunner" bug.

CCBUG: 246678


 M  +91 -48    literalterm.cpp  
 M  +40 -5     queryparser.cpp  


--- branches/KDE/4.6/kdelibs/nepomuk/query/literalterm.cpp #1215972:1215973
@@ -73,41 +73,6 @@
 
 
 namespace {
-QString prepareQueryText( const QString& text )
-{
-    //
-    // we try to be a little smart about creating the query text
-    // by following a few simple rules:
-    //
-    // 1. enclose everything in quotes to be safe
-    // 2. quotes in search terms are not handled. replace them with spaces
-    // 3. replace double quotes with single quotes
-    // [4. wildcards can only be used if they are preceeded by at least 4 chars]
-    //
-
-    QString s = text.simplified();
-    if( s.isEmpty() )
-        return s;
-
-    // strip quotes
-    if( s[0] == '"' || s[0] == '\'' ) {
-        s = s.mid(1);
-    }
-    if( !s.isEmpty() &&
-        ( s[s.length()-1] == '"' || s[s.length()-1] == '\'' ) ) {
-        s.truncate(s.length()-1);
-    }
-
-    // replace quotes with spaces
-    s.replace( '"', ' ' );
-    s.replace( '\'', ' ' );
-
-    // add quotes
-    s = '\'' + s + '\'';
-
-    return s;
-}
-
 QString prepareRegexText( const QString& text )
 {
     QString filterRxStr = QRegExp::escape( text );
@@ -121,29 +86,107 @@
 
 QString Nepomuk::Query::LiteralTermPrivate::createContainsPattern( const QString& varName, \
const QString& text, Nepomuk::Query::QueryBuilderData* qbd )  {
-    const int i = text.indexOf( QRegExp(QLatin1String("[\\?\\*]")) );
+    // each token with a negation flag
+    QList<QPair<QString, bool> > containsTokens;
+    QList<QPair<QString, bool> > regexTokens;
 
+    // we only support AND xor OR, not both at the same time
+    bool isUnion = false;
+
+    // gather all the tokens
+    bool inQuotes = false;
+    QString currentToken;
+    bool nextIsNegated = false;
+    int i = 0;
+    while( i < text.length() ) {
+        const QChar& c = text[i];
+        bool tokenEnd = false;
+
+        if( c == QChar('"') || c == QChar('\'') ) {
+            inQuotes = !inQuotes;
+            tokenEnd = !inQuotes;
+        }
+        else if( c.isSpace() && !inQuotes ) {
+            tokenEnd = true;
+        }
+        else {
+            currentToken.append(c);
+        }
+
+        if( i == text.count()-1 ) {
+            tokenEnd = true;
+        }
+
+        if( tokenEnd && !currentToken.isEmpty() ) {
     //
+            // Handle the three special tokens supported in Virtuoso's full text search engine we support (there is also "near" which we do not handle yet)
+            //
+            if( currentToken.toLower() == QLatin1String("and") ) {
+                isUnion = false;
+            }
+            else if( currentToken.toLower() == QLatin1String("or") ) {
+                isUnion = true;
+            }
+            else if( currentToken.toLower() == QLatin1String("not") ) {
+                nextIsNegated = true;
+            }
+            else {
+                QPair<QString, bool> currentTokenPair = qMakePair( currentToken, nextIsNegated \
); +
+                //
     // Virtuoso needs four leading chars when using wildcards. Thus, if there is less (this includes 0) we fall back to the slower regex filter
     //
-    if( i < 0 || i > 3 ) {
-        const QString finalText = prepareQueryText( text );
+                const QStringList subTokens = currentToken.split( QLatin1Char(' '), \
QString::SkipEmptyParts ); +                bool needsRegex = false;
+                Q_FOREACH( const QString& subToken, subTokens ) {
+                    const int i = subToken.indexOf( QRegExp(QLatin1String("[\\?\\*]")) );
+                    if( i >= 0 && i < 4 ) {
+                        needsRegex = true;
+                        break;
+                    }
+                }
+                if( !needsRegex ) {
+                    containsTokens << currentTokenPair;
+                }
+                else {
+                    regexTokens << currentTokenPair;
+                }
+            }
 
-        QString scoringPattern;
-        if( qbd->query()->m_fullTextScoringEnabled ) {
-            scoringPattern = QString::fromLatin1("OPTION (score %1) \
").arg(qbd->createScoringVariable()); +            nextIsNegated = false;
+            currentToken.clear();
         }
-        qbd->addFullTextSearchTerm( varName, finalText );
 
-        return QString::fromLatin1( "%1 bif:contains \"%2\" %3. " )
+        ++i;
+    }
+
+    // convert the tokens into SPARQL filters
+    QStringList filters;
+    QStringList containsFilterTokens;
+    for( int i = 0; i < containsTokens.count(); ++i ) {
+        QString containsFilterToken;
+        if( containsTokens[i].second )
+            containsFilterToken += QLatin1String("NOT ");
+        containsFilterToken += QString::fromLatin1("'%1'").arg(containsTokens[i].first);
+        containsFilterTokens << containsFilterToken;
+    }
+    if( !containsFilterTokens.isEmpty() ) {
+        filters << QString::fromLatin1("bif:contains(%1, \"%2\")")
                 .arg( varName,
-                     finalText,
-                     scoringPattern );
+                         containsFilterTokens.join( isUnion ? QLatin1String(" OR ") : \
QLatin1String(" AND ")) );  }
-    else {
-        return QString::fromLatin1( "FILTER(REGEX(%1, \"%2\")) . " )
-                .arg( varName, prepareRegexText(text) );
+    QStringList regexFilters;
+    for( int i = 0; i < regexTokens.count(); ++i ) {
+        QString regexFilter;
+        if( regexTokens[i].second )
+            regexFilter += QLatin1Char('!');
+        regexFilter += QString::fromLatin1( "REGEX(%1, \"%2\")" )
+                .arg( varName,
+                      prepareRegexText(regexTokens[i].first) );
+        filters << regexFilter;
     }
+
+    return QString( QLatin1String("FILTER(") + filters.join( isUnion ? QLatin1String(" || ") : \
QLatin1String(" && ") ) + QLatin1String(") . ") );  }
 
 
--- branches/KDE/4.6/kdelibs/nepomuk/query/queryparser.cpp #1215972:1215973
@@ -130,9 +130,14 @@
         }
     }
 
-    Soprano::LiteralValue createLiteral( const QString& s_, bool globbing ) {
-        bool hadQuotes = false;
-        QString s = stripQuotes( s_, &hadQuotes );
+    Soprano::LiteralValue createLiteral( const QString& s, bool globbing ) {
+        // no globbing if we have quotes or if there already is a wildcard
+        if ( s[0] == QLatin1Char('\'') ||
+             s[0] == QLatin1Char('\"') ) {
+            return s;
+        }
+
+        // at this point we should have a string without spaces in it
         bool b = false;
         int i = s.toInt( &b );
         if ( b )
@@ -144,7 +149,7 @@
         //
         // we can only do query term globbing for strings longer than 3 chars
         //
-        if( !hadQuotes && globbing && s.length() > 3 && !s.endsWith('*') && !s.endsWith('?') )
+        if( globbing && s.length() > 3 && !s.endsWith('*') && !s.endsWith('?') )
             return QString(s + '*');
         else
             return s;
@@ -250,6 +255,36 @@
                                                Nepomuk::Query::ComparisonTerm::Regexp );
     }
 
+    /**
+     * Merging literal terms is an optimization which is based on the assumption that most
+     * users want to search for the full text terms they enter in the value of the same
+     * property.
+     * Merging two literals "foo" and "bar" into one term "foo AND bar" effectively
+     * changes the result set (the former allows "foo" to occur in a property value
+     * different from "bar" while the latter forces them to occur in the same one),
+     * but the resulting query is much faster.
+     */
+    Nepomuk::Query::Term mergeLiteralTerms( const Nepomuk::Query::Term& term )
+    {
+        if( term.isAndTerm() ) {
+            AndTerm mergedTerm;
+            QStringList fullTextTerms;
+            Q_FOREACH( const Term& st, term.toAndTerm().subTerms() ) {
+                if( st.isLiteralTerm() ) {
+                    fullTextTerms << st.toLiteralTerm().value().toString();
+                }
+                else {
+                    mergedTerm.addSubTerm( st );
+                }
+            }
+            mergedTerm.addSubTerm( LiteralTerm( fullTextTerms.join( QString::fromLatin1(" AND \
") ) ) ); +            return mergedTerm.optimized();
+        }
+        else {
+            return term;
+        }
+    }
+
 #ifndef Q_CC_MSVC
 #warning Make the parser handle different data, time, and datetime encodings as well as \
suffixes like MB or GB  #endif
@@ -620,7 +655,7 @@
         final.setTerm( t );
     }
 
-    final.setTerm( resolveFields( final.term(), this ) );
+    final.setTerm( mergeLiteralTerms( resolveFields( final.term(), this ) ) );
     return final;
 }
 


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic