[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    branches/KDE/4.6/kdelibs/nepomuk/query
From:       Sebastian Trueg <sebastian () trueg ! de>
Date:       2011-01-20 16:02:49
Message-ID: 20110120160249.862E3AC8B8 () svn ! kde ! org
[Download RAW message or body]

SVN commit 1215973 by trueg:

Backport:
* Fixed the handling of quotes and keywords such as "AND", "OR", and "NOT" in \
LiteralTerm. Correct bif:contains or regex filters are now created for values.
* Made the query parser merge LiteralTerms into a single one to improve query \
performance. Merging two LiteralTerms into one does not yield exactly the same \
query: when merged, both literal tokens need to appear in the same property value, \
while with separate LiteralTerms the tokens can appear in different properties. \
However, it should cover nearly all typical use cases while increasing the \
performance significantly and getting rid of the nasty "Virtuoso goes crazy when I \
use KRunner" bug.

CCBUG: 246678


 M  +91 -48    literalterm.cpp  
 M  +40 -5     queryparser.cpp  


--- branches/KDE/4.6/kdelibs/nepomuk/query/literalterm.cpp #1215972:1215973
@@ -73,41 +73,6 @@
 
 
 namespace {
-QString prepareQueryText( const QString& text )
-{
-    //
-    // we try to be a little smart about creating the query text
-    // by following a few simple rules:
-    //
-    // 1. enclose everything in quotes to be safe
-    // 2. quotes in search terms are not handled. replace them with spaces
-    // 3. replace double quotes with single quotes
-    // [4. wildcards can only be used if they are preceeded by at least 4 chars]
-    //
-
-    QString s = text.simplified();
-    if( s.isEmpty() )
-        return s;
-
-    // strip quotes
-    if( s[0] == '"' || s[0] == '\'' ) {
-        s = s.mid(1);
-    }
-    if( !s.isEmpty() &&
-        ( s[s.length()-1] == '"' || s[s.length()-1] == '\'' ) ) {
-        s.truncate(s.length()-1);
-    }
-
-    // replace quotes with spaces
-    s.replace( '"', ' ' );
-    s.replace( '\'', ' ' );
-
-    // add quotes
-    s = '\'' + s + '\'';
-
-    return s;
-}
-
 QString prepareRegexText( const QString& text )
 {
     QString filterRxStr = QRegExp::escape( text );
@@ -121,29 +86,107 @@
 
 QString Nepomuk::Query::LiteralTermPrivate::createContainsPattern( const QString& \
varName, const QString& text, Nepomuk::Query::QueryBuilderData* qbd )  {
-    const int i = text.indexOf( QRegExp(QLatin1String("[\\?\\*]")) );
+    // each token with a negation flag
+    QList<QPair<QString, bool> > containsTokens;
+    QList<QPair<QString, bool> > regexTokens;
 
+    // we only support AND xor OR, not both at the same time
+    bool isUnion = false;
+
+    // gather all the tokens
+    bool inQuotes = false;
+    QString currentToken;
+    bool nextIsNegated = false;
+    int i = 0;
+    while( i < text.length() ) {
+        const QChar& c = text[i];
+        bool tokenEnd = false;
+
+        if( c == QChar('"') || c == QChar('\'') ) {
+            inQuotes = !inQuotes;
+            tokenEnd = !inQuotes;
+        }
+        else if( c.isSpace() && !inQuotes ) {
+            tokenEnd = true;
+        }
+        else {
+            currentToken.append(c);
+        }
+
+        if( i == text.count()-1 ) {
+            tokenEnd = true;
+        }
+
+        if( tokenEnd && !currentToken.isEmpty() ) {
     //
+            // Handle the three special tokens supported in Virtuoso's full text \
search engine we support (there is also "near" which we do not handle yet) +          \
// +            if( currentToken.toLower() == QLatin1String("and") ) {
+                isUnion = false;
+            }
+            else if( currentToken.toLower() == QLatin1String("or") ) {
+                isUnion = true;
+            }
+            else if( currentToken.toLower() == QLatin1String("not") ) {
+                nextIsNegated = true;
+            }
+            else {
+                QPair<QString, bool> currentTokenPair = qMakePair( currentToken, \
nextIsNegated ); +
+                //
     // Virtuoso needs four leading chars when using wildcards. Thus, if there is \
less (this includes 0) we fall back to the slower regex filter  //
-    if( i < 0 || i > 3 ) {
-        const QString finalText = prepareQueryText( text );
+                const QStringList subTokens = currentToken.split( QLatin1Char(' '), \
QString::SkipEmptyParts ); +                bool needsRegex = false;
+                Q_FOREACH( const QString& subToken, subTokens ) {
+                    const int i = subToken.indexOf( \
QRegExp(QLatin1String("[\\?\\*]")) ); +                    if( i >= 0 && i < 4 ) {
+                        needsRegex = true;
+                        break;
+                    }
+                }
+                if( !needsRegex ) {
+                    containsTokens << currentTokenPair;
+                }
+                else {
+                    regexTokens << currentTokenPair;
+                }
+            }
 
-        QString scoringPattern;
-        if( qbd->query()->m_fullTextScoringEnabled ) {
-            scoringPattern = QString::fromLatin1("OPTION (score %1) \
").arg(qbd->createScoringVariable()); +            nextIsNegated = false;
+            currentToken.clear();
         }
-        qbd->addFullTextSearchTerm( varName, finalText );
 
-        return QString::fromLatin1( "%1 bif:contains \"%2\" %3. " )
+        ++i;
+    }
+
+    // convert the tokens into SPARQL filters
+    QStringList filters;
+    QStringList containsFilterTokens;
+    for( int i = 0; i < containsTokens.count(); ++i ) {
+        QString containsFilterToken;
+        if( containsTokens[i].second )
+            containsFilterToken += QLatin1String("NOT ");
+        containsFilterToken += \
QString::fromLatin1("'%1'").arg(containsTokens[i].first); +        \
containsFilterTokens << containsFilterToken; +    }
+    if( !containsFilterTokens.isEmpty() ) {
+        filters << QString::fromLatin1("bif:contains(%1, \"%2\")")
                 .arg( varName,
-                     finalText,
-                     scoringPattern );
+                         containsFilterTokens.join( isUnion ? QLatin1String(" OR ") \
: QLatin1String(" AND ")) );  }
-    else {
-        return QString::fromLatin1( "FILTER(REGEX(%1, \"%2\")) . " )
-                .arg( varName, prepareRegexText(text) );
+    QStringList regexFilters;
+    for( int i = 0; i < regexTokens.count(); ++i ) {
+        QString regexFilter;
+        if( regexTokens[i].second )
+            regexFilter += QLatin1Char('!');
+        regexFilter += QString::fromLatin1( "REGEX(%1, \"%2\")" )
+                .arg( varName,
+                      prepareRegexText(regexTokens[i].first) );
+        filters << regexFilter;
     }
+
+    return QString( QLatin1String("FILTER(") + filters.join( isUnion ? \
QLatin1String(" || ") : QLatin1String(" && ") ) + QLatin1String(") . ") );  }
 
 
--- branches/KDE/4.6/kdelibs/nepomuk/query/queryparser.cpp #1215972:1215973
@@ -130,9 +130,14 @@
         }
     }
 
-    Soprano::LiteralValue createLiteral( const QString& s_, bool globbing ) {
-        bool hadQuotes = false;
-        QString s = stripQuotes( s_, &hadQuotes );
+    Soprano::LiteralValue createLiteral( const QString& s, bool globbing ) {
+        // no globbing if we have quotes or if there already is a wildcard
+        if ( s[0] == QLatin1Char('\'') ||
+             s[0] == QLatin1Char('\"') ) {
+            return s;
+        }
+
+        // at this point we should have a string without spaces in it
         bool b = false;
         int i = s.toInt( &b );
         if ( b )
@@ -144,7 +149,7 @@
         //
         // we can only do query term globbing for strings longer than 3 chars
         //
-        if( !hadQuotes && globbing && s.length() > 3 && !s.endsWith('*') && \
!s.endsWith('?') ) +        if( globbing && s.length() > 3 && !s.endsWith('*') && \
!s.endsWith('?') )  return QString(s + '*');
         else
             return s;
@@ -250,6 +255,36 @@
                                                \
Nepomuk::Query::ComparisonTerm::Regexp );  }
 
+    /**
+     * Merging literal terms is an optimization which is based on the assumption \
that most +     * users want to search for the full text terms they enter in the \
value of the same +     * property.
+     * Since merging two literals "foo" and "bar" into one term "foo AND bar" \
effectively +     * changes the result set (the former allows that "foo" occurs in a \
property value +     * different from "bar" while the latter forces them to occur in \
the same.) +     * But the resulting query is much faster.
+     */
+    Nepomuk::Query::Term mergeLiteralTerms( const Nepomuk::Query::Term& term )
+    {
+        if( term.isAndTerm() ) {
+            AndTerm mergedTerm;
+            QStringList fullTextTerms;
+            Q_FOREACH( const Term& st, term.toAndTerm().subTerms() ) {
+                if( st.isLiteralTerm() ) {
+                    fullTextTerms << st.toLiteralTerm().value().toString();
+                }
+                else {
+                    mergedTerm.addSubTerm( st );
+                }
+            }
+            mergedTerm.addSubTerm( LiteralTerm( fullTextTerms.join( \
QString::fromLatin1(" AND ") ) ) ); +            return mergedTerm.optimized();
+        }
+        else {
+            return term;
+        }
+    }
+
 #ifndef Q_CC_MSVC
 #warning Make the parser handle different data, time, and datetime encodings as well \
as suffixes like MB or GB  #endif
@@ -620,7 +655,7 @@
         final.setTerm( t );
     }
 
-    final.setTerm( resolveFields( final.term(), this ) );
+    final.setTerm( mergeLiteralTerms( resolveFields( final.term(), this ) ) );
     return final;
 }
 


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic