[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    branches/KDE/4.6/kdelibs/nepomuk/query
From:       Sebastian Trueg <sebastian () trueg ! de>
Date:       2011-01-20 16:02:49
Message-ID: 20110120160249.862E3AC8B8 () svn ! kde ! org
[Download RAW message or body]

SVN commit 1215973 by trueg:

Backport:
* Fixed the handling of quotes and keywords such as "AND", "OR", and "NOT"
  in LiteralTerm. Correct bif:contains or regex filters are now created for
  the values.
* Made the query parser merge LiteralTerms into a single one to improve
  query performance. Merging two LiteralTerms into one does not yield
  exactly the same query: when merged, both literal tokens need to appear in
  the same property value, whereas with separate LiteralTerms the tokens can
  appear in different properties. However, it should cover close to all
  typical use cases while increasing the performance significantly and
  getting rid of the nasty "Virtuoso goes crazy when I use KRunner" bug.

CCBUG: 246678


 M  +91 -48    literalterm.cpp  
 M  +40 -5     queryparser.cpp  


--- branches/KDE/4.6/kdelibs/nepomuk/query/literalterm.cpp #1215972:1215973
@@ -73,41 +73,6 @@
 
 
 namespace {
-QString prepareQueryText( const QString& text )
-{
-    //
-    // we try to be a little smart about creating the query text
-    // by following a few simple rules:
-    //
-    // 1. enclose everything in quotes to be safe
-    // 2. quotes in search terms are not handled. replace them with spaces
-    // 3. replace double quotes with single quotes
-    // [4. wildcards can only be used if they are preceeded by at least 4 \
                chars]
-    //
-
-    QString s = text.simplified();
-    if( s.isEmpty() )
-        return s;
-
-    // strip quotes
-    if( s[0] == '"' || s[0] == '\'' ) {
-        s = s.mid(1);
-    }
-    if( !s.isEmpty() &&
-        ( s[s.length()-1] == '"' || s[s.length()-1] == '\'' ) ) {
-        s.truncate(s.length()-1);
-    }
-
-    // replace quotes with spaces
-    s.replace( '"', ' ' );
-    s.replace( '\'', ' ' );
-
-    // add quotes
-    s = '\'' + s + '\'';
-
-    return s;
-}
-
 QString prepareRegexText( const QString& text )
 {
     QString filterRxStr = QRegExp::escape( text );
@@ -121,29 +86,107 @@
 
 QString Nepomuk::Query::LiteralTermPrivate::createContainsPattern( const \
QString& varName, const QString& text, Nepomuk::Query::QueryBuilderData* \
qbd )  {
-    const int i = text.indexOf( QRegExp(QLatin1String("[\\?\\*]")) );
+    // each token with a negation flag
+    QList<QPair<QString, bool> > containsTokens;
+    QList<QPair<QString, bool> > regexTokens;
 
+    // we only support AND xor OR, not both at the same time
+    bool isUnion = false;
+
+    // gather all the tokens
+    bool inQuotes = false;
+    QString currentToken;
+    bool nextIsNegated = false;
+    int i = 0;
+    while( i < text.length() ) {
+        const QChar& c = text[i];
+        bool tokenEnd = false;
+
+        if( c == QChar('"') || c == QChar('\'') ) {
+            inQuotes = !inQuotes;
+            tokenEnd = !inQuotes;
+        }
+        else if( c.isSpace() && !inQuotes ) {
+            tokenEnd = true;
+        }
+        else {
+            currentToken.append(c);
+        }
+
+        if( i == text.count()-1 ) {
+            tokenEnd = true;
+        }
+
+        if( tokenEnd && !currentToken.isEmpty() ) {
     //
+            // Handle the three special tokens supported in Virtuoso's \
full text search engine we support (there is also "near" which we do not \
handle yet) +            //
+            if( currentToken.toLower() == QLatin1String("and") ) {
+                isUnion = false;
+            }
+            else if( currentToken.toLower() == QLatin1String("or") ) {
+                isUnion = true;
+            }
+            else if( currentToken.toLower() == QLatin1String("not") ) {
+                nextIsNegated = true;
+            }
+            else {
+                QPair<QString, bool> currentTokenPair = qMakePair( \
currentToken, nextIsNegated ); +
+                //
     // Virtuoso needs four leading chars when using wildcards. Thus, if \
there is less (this includes 0) we fall back to the slower regex filter  //
-    if( i < 0 || i > 3 ) {
-        const QString finalText = prepareQueryText( text );
+                const QStringList subTokens = currentToken.split( \
QLatin1Char(' '), QString::SkipEmptyParts ); +                bool \
needsRegex = false; +                Q_FOREACH( const QString& subToken, \
subTokens ) { +                    const int i = subToken.indexOf( \
QRegExp(QLatin1String("[\\?\\*]")) ); +                    if( i >= 0 && i \
< 4 ) { +                        needsRegex = true;
+                        break;
+                    }
+                }
+                if( !needsRegex ) {
+                    containsTokens << currentTokenPair;
+                }
+                else {
+                    regexTokens << currentTokenPair;
+                }
+            }
 
-        QString scoringPattern;
-        if( qbd->query()->m_fullTextScoringEnabled ) {
-            scoringPattern = QString::fromLatin1("OPTION (score %1) \
").arg(qbd->createScoringVariable()); +            nextIsNegated = false;
+            currentToken.clear();
         }
-        qbd->addFullTextSearchTerm( varName, finalText );
 
-        return QString::fromLatin1( "%1 bif:contains \"%2\" %3. " )
+        ++i;
+    }
+
+    // convert the tokens into SPARQL filters
+    QStringList filters;
+    QStringList containsFilterTokens;
+    for( int i = 0; i < containsTokens.count(); ++i ) {
+        QString containsFilterToken;
+        if( containsTokens[i].second )
+            containsFilterToken += QLatin1String("NOT ");
+        containsFilterToken += \
QString::fromLatin1("'%1'").arg(containsTokens[i].first); +        \
containsFilterTokens << containsFilterToken; +    }
+    if( !containsFilterTokens.isEmpty() ) {
+        filters << QString::fromLatin1("bif:contains(%1, \"%2\")")
                 .arg( varName,
-                     finalText,
-                     scoringPattern );
+                         containsFilterTokens.join( isUnion ? \
QLatin1String(" OR ") : QLatin1String(" AND ")) );  }
-    else {
-        return QString::fromLatin1( "FILTER(REGEX(%1, \"%2\")) . " )
-                .arg( varName, prepareRegexText(text) );
+    QStringList regexFilters;
+    for( int i = 0; i < regexTokens.count(); ++i ) {
+        QString regexFilter;
+        if( regexTokens[i].second )
+            regexFilter += QLatin1Char('!');
+        regexFilter += QString::fromLatin1( "REGEX(%1, \"%2\")" )
+                .arg( varName,
+                      prepareRegexText(regexTokens[i].first) );
+        filters << regexFilter;
     }
+
+    return QString( QLatin1String("FILTER(") + filters.join( isUnion ? \
QLatin1String(" || ") : QLatin1String(" && ") ) + QLatin1String(") . ") );  \
}  
 
--- branches/KDE/4.6/kdelibs/nepomuk/query/queryparser.cpp #1215972:1215973
@@ -130,9 +130,14 @@
         }
     }
 
-    Soprano::LiteralValue createLiteral( const QString& s_, bool globbing \
                ) {
-        bool hadQuotes = false;
-        QString s = stripQuotes( s_, &hadQuotes );
+    Soprano::LiteralValue createLiteral( const QString& s, bool globbing ) \
{ +        // no globbing if we have quotes or if there already is a \
wildcard +        if ( s[0] == QLatin1Char('\'') ||
+             s[0] == QLatin1Char('\"') ) {
+            return s;
+        }
+
+        // at this point we should have a string without spaces in it
         bool b = false;
         int i = s.toInt( &b );
         if ( b )
@@ -144,7 +149,7 @@
         //
         // we can only do query term globbing for strings longer than 3 \
chars  //
-        if( !hadQuotes && globbing && s.length() > 3 && !s.endsWith('*') \
&& !s.endsWith('?') ) +        if( globbing && s.length() > 3 && \
!s.endsWith('*') && !s.endsWith('?') )  return QString(s + '*');
         else
             return s;
@@ -250,6 +255,36 @@
                                                \
Nepomuk::Query::ComparisonTerm::Regexp );  }
 
+    /**
+     * Merging literal terms is an optimization which is based on the \
assumption that most +     * users want to search for the full text terms \
they enter in the value of the same +     * property.
+     * Since merging two literals "foo" and "bar" into one term "foo AND \
bar" effectively +     * changes the result set (the former allows that \
"foo" occurs in a property value +     * different from "bar" while the \
latter forces them to occur in the same.) +     * But the resulting query \
is much faster. +     */
+    Nepomuk::Query::Term mergeLiteralTerms( const Nepomuk::Query::Term& \
term ) +    {
+        if( term.isAndTerm() ) {
+            AndTerm mergedTerm;
+            QStringList fullTextTerms;
+            Q_FOREACH( const Term& st, term.toAndTerm().subTerms() ) {
+                if( st.isLiteralTerm() ) {
+                    fullTextTerms << \
st.toLiteralTerm().value().toString(); +                }
+                else {
+                    mergedTerm.addSubTerm( st );
+                }
+            }
+            mergedTerm.addSubTerm( LiteralTerm( fullTextTerms.join( \
QString::fromLatin1(" AND ") ) ) ); +            return \
mergedTerm.optimized(); +        }
+        else {
+            return term;
+        }
+    }
+
 #ifndef Q_CC_MSVC
 #warning Make the parser handle different data, time, and datetime \
encodings as well as suffixes like MB or GB  #endif
@@ -620,7 +655,7 @@
         final.setTerm( t );
     }
 
-    final.setTerm( resolveFields( final.term(), this ) );
+    final.setTerm( mergeLiteralTerms( resolveFields( final.term(), this ) \
) );  return final;
 }
 


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic