[prev in list] [next in list] [prev in thread] [next in thread]
List: slide-user
Subject: Re: How to set StandardAnalyzer Stop Words
From: Dorel bruno <bd.ais40 () wanadoo ! fr>
Date: 2007-03-26 7:55:11
Message-ID: 46077C5F.5070507 () wanadoo ! fr
[Download RAW message or body]
aslam bari a écrit :
Hi Aslam, here is my patch code to configure stop words. You have to
compile Slide using this LuceneContentIndexer (make a diff against your
current
version first to pick up the changes), and then you can set up your
stop-word file in Domain.xml.
I proposed this change and several others to the dev team, but I never
got any serious answer; I think they don't care about this kind of problem.
If you implement it successfully, you should propose it to the dev team;
maybe you will be luckier than I am.
Regards
B DOREL
> Hi,
> Yes i am interested. But plz let me know how can i set this in Slide and How I can \
> do this for English words.
>
> ----- Original Message ----
> From: Dorel bruno <bd.ais40@wanadoo.fr>
> To: Slide Users Mailing List <slide-user@jakarta.apache.org>
> Sent: Friday, 23 March, 2007 9:27:42 PM
> Subject: Re: How to set StandardAnalyzer Stop Words
>
>
> Ven Helsing a écrit :
>
> > Hello all,
> > I want to use StandardAnalyzer for Lucene content indexing, and I don't
> > want it to stop (ignore) common words, which StandardAnalyzer does by default.
> > That is, I want to use StandardAnalyzer's constructor with an empty set.
> >
> > How to do so?
> > Thanks...
> >
> >
> We proposed a patch to the dev team, but to no
> avail! If you are interested, we have made a patch to configure stop
> words (we use French stop words)
>
> B DOREL
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: slide-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: slide-user-help@jakarta.apache.org
>
>
>
> __________________________________________________________
> Yahoo! India Answers: Share what you know. Learn something new
> http://in.answers.yahoo.com/
> ---------------------------------------------------------------------------------------
> Orange vous informe que cet e-mail a ete controle par l'anti-virus mail.
> Aucun virus connu a ce jour par nos services n'a ete detecte.
>
>
>
["LuceneContentIndexer.java" (text/plain)]
/*
* $Header: /home/cvspublic/jakarta-slide/src/stores/org/apache/slide/index/lucene/LuceneContentIndexer.java,v \
1.3 2005/04/04 13:55:13 luetzkendorf Exp $
* $Revision: 1.3 $
* $Date: 2005/04/04 13:55:13 $
*
* ====================================================================
*
* Copyright 1999-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* V1.0 : NA/EADS, 29/08/05
* Adaptation: removing an entry from the index no longer checks
* for the presence of an extractor (based on the URI), because this
* check cannot be made on 'displayname' (the property cannot be retrieved)
* V1.1 : NA/EADS, 28/02/06
* FFT 2006/EADS/0656: read the new optional parameter
* 'analyzer-stopwords' from the Domain.xml configuration file
* and create the StandardAnalyzer with this parameter.
* If there is no stop-words file, the StandardAnalyzer uses
* the default English stop words
* V1.2 : JM/EADS, 18/10/06
* FFT 2006/EADS/0973: pass the Uri rather than the content
*/
package org.apache.slide.index.lucene;
import java.io.File;
import java.io.IOException;
import java.util.Hashtable;
import javax.transaction.xa.XAException;
import javax.transaction.xa.Xid;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.slide.common.NamespaceAccessToken;
import org.apache.slide.common.ServiceInitializationFailedException;
import org.apache.slide.common.ServiceParameterErrorException;
import org.apache.slide.common.ServiceParameterMissingException;
import org.apache.slide.common.Uri;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.event.DomainEvent;
import org.apache.slide.event.EventDispatcher;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.search.IndexException;
/**
* IndexStore implementation for indexing content based on Jakarta Lucene.
*/
public class LuceneContentIndexer extends AbstractLuceneIndexer
{
private static final String ANALYZER_PARAM = "analyzer";
private String analyzerClassName;
private static final String ANALYZER_STOPWORDS_PARAM = "analyzer-stopwords";
private File analyzerStopWordsFile;
public void initialize(NamespaceAccessToken token)
throws ServiceInitializationFailedException
{
super.initialize(token);
try {
indexConfiguration.initDefaultConfiguration();
indexConfiguration.setContentAnalyzer(
createAnalyzer(this.analyzerClassName));
this.index = new Index(indexConfiguration, getLogger(),
"content " + this.scope);
if (this.index.needsInitialization()) {
DomainEvent.NAMESPACE_INITIALIZED.setEnabled(true);
EventDispatcher.getInstance().addEventListener(
new IndexInitializer(this.scope, IndexInitializer.CONTENT, \
getLogger())); }
}
catch (IndexException e) {
throw new ServiceInitializationFailedException(this, e);
}
}
public void setParameters(Hashtable parameters)
throws ServiceParameterErrorException,
ServiceParameterMissingException
{
super.setParameters(parameters);
// Récupération du fichier de StopWords
analyzerClassName = (String)parameters.get(ANALYZER_PARAM);
analyzerStopWordsFile =
new File((String)parameters.get(ANALYZER_STOPWORDS_PARAM));
// Contrôle de validité du fichier
if (!analyzerStopWordsFile.exists()
|| analyzerStopWordsFile.isDirectory()
|| ! analyzerStopWordsFile.canRead()) {
analyzerStopWordsFile = null;
}
}
/**
* This implementation just calls the super implementation and catches
* all exceptions to ensure that content indexing never makes a commit failing.
*/
public void commit(Xid xid, boolean onePhase) throws XAException
{
try {
super.commit(xid, onePhase);
} catch (XAException e) {
error("Error while committing to content index ({0})", e);
}
}
/*
* @see org.apache.slide.search.Indexer#createIndex(org.apache.slide.common.Uri, \
org.apache.slide.content.NodeRevisionDescriptor, \
org.apache.slide.content.NodeRevisionContent)
*/
public void createIndex(Uri uri, NodeRevisionDescriptor revisionDescriptor,
NodeRevisionContent revisionContent) throws IndexException
{
if (isIncluded(uri.toString())) {
if (ExtractorManager.getInstance().hasContentExtractor(
uri.getNamespace().getName(), uri.toString(), \
revisionDescriptor)) {
TransactionalIndexResource indexResource = getCurrentTxn();
indexResource.addIndexJob(uri, revisionDescriptor, true);
}
}
}
/*
* @see org.apache.slide.search.Indexer#updateIndex(org.apache.slide.common.Uri, \
org.apache.slide.content.NodeRevisionDescriptor, \
org.apache.slide.content.NodeRevisionContent)
*/
public void updateIndex(Uri uri, NodeRevisionDescriptor revisionDescriptor,
NodeRevisionContent revisionContent) throws IndexException
{
if (isIncluded(uri.toString())) {
if (ExtractorManager.getInstance().hasContentExtractor(
uri.getNamespace().getName(), uri.toString(), \
revisionDescriptor)) {
TransactionalIndexResource indexResource = getCurrentTxn();
indexResource.addUpdateJob(uri, revisionDescriptor, true);
}
}
}
/*
* @see org.apache.slide.search.Indexer#dropIndex(org.apache.slide.common.Uri, \
org.apache.slide.content.NodeRevisionNumber)
*/
public void dropIndex(Uri uri, NodeRevisionNumber number)
throws IndexException
{
if (isIncluded(uri.toString())) {
// if (ExtractorManager.getInstance().hasContentExtractor(
// uri.getNamespace().getName(), uri.toString(), null))
// {
TransactionalIndexResource indexResource = getCurrentTxn();
indexResource.addRemoveJob(uri, number);
// }
}
}
protected Analyzer createAnalyzer(String clsName)
throws ServiceInitializationFailedException
{
Analyzer analyzer;
if (clsName == null || clsName.length() == 0) {
analyzer = new SimpleAnalyzer();
} else {
try {
if (clsName.indexOf("StandardAnalyzer") > -1) {
// StandardAnalyzer
if (analyzerStopWordsFile != null) {
// utilisation des Stop-Words spécifiés dans un fichier
analyzer = new StandardAnalyzer(analyzerStopWordsFile);
} else {
// utilisation des Stop-Words par défaut
analyzer = new StandardAnalyzer();
}
} else {
// Tout autre Analyzer
Class analyzerClazz = Class.forName(clsName);
analyzer = (Analyzer)analyzerClazz.newInstance();
}
} catch (ClassNotFoundException e) {
error("Error while instantiating analyzer {1} {2}",
clsName, e.getMessage());
throw new ServiceInitializationFailedException(this, e);
} catch (InstantiationException e) {
error("Error while instantiating analyzer {1} {2}",
clsName, e.getMessage());
throw new ServiceInitializationFailedException(this, e);
} catch (IllegalAccessException e) {
error("Error while instantiating analyzer {1} {2}",
clsName, e.getMessage());
throw new ServiceInitializationFailedException(this, e);
} catch (IOException e) {
error("Error while instantiating analyzer {1} {2}",
clsName, e.getMessage());
throw new ServiceInitializationFailedException(this, e);
}
}
info("using analyzer: {0}", analyzer.getClass().getName());
return analyzer;
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: slide-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-user-help@jakarta.apache.org
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic