This is a multi-part message in MIME format. --------------040503040006010700040105 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit In the meantime I have a different patch for kdelibs which should at least solve the issue of the KRunner producing queries that let Virtuoso go berserk. Please test. Cheers, Sebastian On 01/03/2011 05:15 PM, Sebastian Trüg wrote: > I have a heavy patch indeed but it also requires an updated (still > unreleased) Virtuoso which fixes a bug. > I will let you know more soon. > > Cheers, > Sebastian > > On 01/03/2011 03:58 PM, Will Stephenson wrote: >> On Monday 03 January 2011 10:48:57 Sebastian Trüg wrote: >>> I am on that one. These come from krunner.... >> >> Glad to hear it. I'm catching up after an offline Xmas but if you need any >> more info or patch testing just let me know. >> >> Will >> _______________________________________________ >> Nepomuk mailing list >> Nepomuk@kde.org >> https://mail.kde.org/mailman/listinfo/nepomuk >> > _______________________________________________ > Nepomuk mailing list > Nepomuk@kde.org > https://mail.kde.org/mailman/listinfo/nepomuk > --------------040503040006010700040105 Content-Type: text/plain; name="1.diff" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="1.diff" commit ea85b495f1a99aa604ebb3bc17912e7406d0387d Author: Sebastian Trueg Date: Fri Jan 7 21:16:40 2011 +0100 * Fixed the handling of quotes and keywords such as "AND", "OR", and "NOT" in LiteralTerm. Now correct bif:contains or regex filters are created for values. * Made the query parser merge LiteralTerms into a single one to improve query performance. 
While merging two LiteralTerms into one does not yield the exact same query (when merged both literal tokens need to appear in the same property value while with separate LiteralTerms the tokens can appear in different properties) it should cover close to all typical use cases while increasing the performance significantly and getting rid of the nasty "Virtuoso goes crazy when I use KRunner" bug. diff --git a/nepomuk/query/literalterm.cpp b/nepomuk/query/literalterm.cpp index 4623e82..f8173ac 100644 --- a/nepomuk/query/literalterm.cpp +++ b/nepomuk/query/literalterm.cpp @@ -75,41 +75,6 @@ QString Nepomuk::Query::LiteralTermPrivate::toSparqlGraphPattern( const QString& namespace { -QString prepareQueryText( const QString& text ) -{ - // - // we try to be a little smart about creating the query text - // by following a few simple rules: - // - // 1. enclose everything in quotes to be safe - // 2. quotes in search terms are not handled. replace them with spaces - // 3. replace double quotes with single quotes - // [4. 
wildcards can only be used if they are preceeded by at least 4 chars] - // - - QString s = text.simplified(); - if( s.isEmpty() ) - return s; - - // strip quotes - if( s[0] == '"' || s[0] == '\'' ) { - s = s.mid(1); - } - if( !s.isEmpty() && - ( s[s.length()-1] == '"' || s[s.length()-1] == '\'' ) ) { - s.truncate(s.length()-1); - } - - // replace quotes with spaces - s.replace( '"', ' ' ); - s.replace( '\'', ' ' ); - - // add quotes - s = '\'' + s + '\''; - - return s; -} - QString prepareRegexText( const QString& text ) { QString filterRxStr = QRegExp::escape( text ); @@ -123,29 +88,107 @@ QString prepareRegexText( const QString& text ) QString Nepomuk::Query::LiteralTermPrivate::createContainsPattern( const QString& varName, const QString& text, Nepomuk::Query::QueryBuilderData* qbd ) { - const int i = text.indexOf( QRegExp(QLatin1String("[\\?\\*]")) ); + // each token with a negation flag + QList<QPair<QString, bool> > containsTokens; + QList<QPair<QString, bool> > regexTokens; + + // we only support AND xor OR, not both at the same time + bool isUnion = false; + + // gather all the tokens + bool inQuotes = false; + QString currentToken; + bool nextIsNegated = false; + int i = 0; + while( i < text.length() ) { + const QChar& c = text[i]; + bool tokenEnd = false; + + if( c == QChar('"') || c == QChar('\'') ) { + inQuotes = !inQuotes; + tokenEnd = !inQuotes; + } + else if( c.isSpace() && !inQuotes ) { + tokenEnd = true; + } + else { + currentToken.append(c); + } - // - // Virtuoso needs four leading chars when using wildcards. 
Thus, if there is less (this includes 0) we fall back to the slower regex filter - // - if( i < 0 || i > 3 ) { - const QString finalText = prepareQueryText( text ); + if( i == text.count()-1 ) { + tokenEnd = true; + } - QString scoringPattern; - if( qbd->query()->m_fullTextScoringEnabled ) { - scoringPattern = QString::fromLatin1("OPTION (score %1) ").arg(qbd->createScoringVariable()); + if( tokenEnd && !currentToken.isEmpty() ) { + // + // Handle the three special tokens supported in Virtuoso's full text search engine we support (there is also "near" which we do not handle yet) + // + if( currentToken.toLower() == QLatin1String("and") ) { + isUnion = false; + } + else if( currentToken.toLower() == QLatin1String("or") ) { + isUnion = true; + } + else if( currentToken.toLower() == QLatin1String("not") ) { + nextIsNegated = true; + } + else { + QPair<QString, bool> currentTokenPair = qMakePair( currentToken, nextIsNegated ); + + // + // Virtuoso needs four leading chars when using wildcards. Thus, if there is less (this includes 0) we fall back to the slower regex filter + // + const QStringList subTokens = currentToken.split( QLatin1Char(' '), QString::SkipEmptyParts ); + bool needsRegex = false; + Q_FOREACH( const QString& subToken, subTokens ) { + const int i = subToken.indexOf( QRegExp(QLatin1String("[\\?\\*]")) ); + if( i >= 0 && i < 4 ) { + needsRegex = true; + break; + } + } + if( !needsRegex ) { + containsTokens << currentTokenPair; + } + else { + regexTokens << currentTokenPair; + } + } + + nextIsNegated = false; + currentToken.clear(); } - qbd->addFullTextSearchTerm( varName, finalText ); - return QString::fromLatin1( "%1 bif:contains \"%2\" %3. " ) - .arg( varName, - finalText, - scoringPattern ); + ++i; } - else { - return QString::fromLatin1( "FILTER(REGEX(%1, \"%2\")) . 
" ) - .arg( varName, prepareRegexText(text) ); + + // convert the tokens into SPARQL filters + QStringList filters; + QStringList containsFilterTokens; + for( int i = 0; i < containsTokens.count(); ++i ) { + QString containsFilterToken; + if( containsTokens[i].second ) + containsFilterToken += QLatin1String("NOT "); + containsFilterToken += QString::fromLatin1("'%1'").arg(containsTokens[i].first); + containsFilterTokens << containsFilterToken; } + if( !containsFilterTokens.isEmpty() ) { + filters << QString::fromLatin1("bif:contains(%1, \"%2\")") + .arg( varName, + containsFilterTokens.join( isUnion ? QLatin1String(" OR ") : QLatin1String(" AND ")) ); + } + QStringList regexFilters; + for( int i = 0; i < regexTokens.count(); ++i ) { + QString regexFilter; + if( regexTokens[i].second ) + regexFilter += QLatin1Char('!'); + regexFilter += QString::fromLatin1( "REGEX(%1, \"%2\")" ) + .arg( varName, + prepareRegexText(regexTokens[i].first) ); + filters << regexFilter; + } + + return QString( QLatin1String("FILTER(") + filters.join( isUnion ? QLatin1String(" || ") : QLatin1String(" && ") ) + QLatin1String(") . 
") ); } diff --git a/nepomuk/query/queryparser.cpp b/nepomuk/query/queryparser.cpp index 3b793d4..656714d 100644 --- a/nepomuk/query/queryparser.cpp +++ b/nepomuk/query/queryparser.cpp @@ -130,9 +130,14 @@ namespace { } } - Soprano::LiteralValue createLiteral( const QString& s_, bool globbing ) { - bool hadQuotes = false; - QString s = stripQuotes( s_, &hadQuotes ); + Soprano::LiteralValue createLiteral( const QString& s, bool globbing ) { + // no globbing if we have quotes or if there already is a wildcard + if ( s[0] == QLatin1Char('\'') || + s[0] == QLatin1Char('\"') ) { + return s; + } + + // at this point we should have a string without spaces in it bool b = false; int i = s.toInt( &b ); if ( b ) @@ -144,7 +149,7 @@ namespace { // // we can only do query term globbing for strings longer than 3 chars // - if( !hadQuotes && globbing && s.length() > 3 && !s.endsWith('*') && !s.endsWith('?') ) + if( globbing && s.length() > 3 && !s.endsWith('*') && !s.endsWith('?') ) return QString(s + '*'); else return s; @@ -250,6 +255,36 @@ namespace { Nepomuk::Query::ComparisonTerm::Regexp ); } + /** + * Merging literal terms is an optimization which is based on the assumption that most + * users want to search for the full text terms they enter in the value of the same + * property. + * Since merging two literals "foo" and "bar" into one term "foo AND bar" effectively + * changes the result set (the former allows that "foo" occurs in a property value + * different from "bar" while the latter forces them to occur in the same.) + * But the resulting query is much faster. 
+ */ + Nepomuk::Query::Term mergeLiteralTerms( const Nepomuk::Query::Term& term ) + { + if( term.isAndTerm() ) { + AndTerm mergedTerm; + QStringList fullTextTerms; + Q_FOREACH( const Term& st, term.toAndTerm().subTerms() ) { + if( st.isLiteralTerm() ) { + fullTextTerms << st.toLiteralTerm().value().toString(); + } + else { + mergedTerm.addSubTerm( st ); + } + } + mergedTerm.addSubTerm( LiteralTerm( QString( QLatin1String("'") + fullTextTerms.join( QString::fromLatin1("' AND '") ) + QLatin1String("'") ) ) ); + return mergedTerm.optimized(); + } + else { + return term; + } + } + #ifndef Q_CC_MSVC #warning Make the parser handle different data, time, and datetime encodings as well as suffixes like MB or GB #endif @@ -612,7 +647,7 @@ Nepomuk::Query::Query Nepomuk::Query::QueryParser::parse( const QString& query, final.setTerm( t ); } - final.setTerm( resolveFields( final.term(), this ) ); + final.setTerm( mergeLiteralTerms( resolveFields( final.term(), this ) ) ); return final; } --------------040503040006010700040105 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Disposition: inline _______________________________________________ Nepomuk mailing list Nepomuk@kde.org https://mail.kde.org/mailman/listinfo/nepomuk --------------040503040006010700040105--