'Re: synonyms'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       lucene-user
Subject:    Re: synonyms
From:       "Christian Kaufhold" <Christian.Kaufhold () bsb-muenchen ! de>
Date:       2017-07-25 7:51:25
Message-ID: 5977149D020000D000069D74 () gwia ! bsb-muenchen ! de
[Download RAW message or body]

[Attachment #2 (multipart/alternative)]


Yep, you hit the point. 

Thank you so much!

Output is now

#Hits: 1 : Hochschule
#Hits: 3 : Hello
#Hits: 1 : people
#Hits: 1 : universität
  term: hello
  term: hochschule
  term: people
  term: universität
  term: world

> > > Alan Woodward <alan@flax.co.uk> 07/25/17 9:14 AM >>>
You have a LowerCaseFilter before your SynonymFilter, which means that the entries \
in your SynonymMap need to be all lowercase or they won't be matched.

Alan Woodward
www.flax.co.uk


> On 25 Jul 2017, at 07:52, Christian Kaufhold <Christian.Kaufhold@bsb-muenchen.de> \
> wrote: 
> Hi,
> 
> I am not able to add synonyms to the lucene index.
> I condensed my problem into the following class which is based on a Hello World \
> example. The idea behind the code was to add a document with universität and the \
> synonym 'Hochschule' (highschool) so that lucene finds universität when I query \
> Hochschule. But it doesn't and I checked the index contents with a term iterator.
> Hochschule is not added to the index:
> 
> 
> package test;
> 
> 
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> import org.apache.lucene.analysis.core.LowerCaseFilter;
> import org.apache.lucene.analysis.standard.ClassicTokenizer;
> import org.apache.lucene.analysis.standard.StandardFilter;
> import org.apache.lucene.analysis.synonym.SynonymFilter;
> import org.apache.lucene.analysis.synonym.SynonymMap;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.*;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.CharsRef;
> import org.apache.lucene.util.Version;
> 
> import java.io.IOException;
> import java.io.Reader;
> 
> public class LuceneHelloWorld {
> 
> public static void main(String[] args) throws Exception {
> 
> Analyzer analyzer = getAnalyzer();
> Directory directory = new RAMDirectory();
> IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
> 
> IndexWriter writer = new IndexWriter(directory, config);
> addDoc( writer, "people", "Hello Universität" );
> addDoc( writer, "world", "Hello World" );
> addDoc( writer, "people", "Hello people" );
> writer.close();
> 
> IndexReader reader = DirectoryReader.open(directory);
> IndexSearcher searcher = new IndexSearcher (reader);
> QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
> 
> test( parser, searcher, "Hochschule");
> test( parser, searcher, "Hello");
> test( parser, searcher, "people");
> test( parser, searcher, "universität");
> 
> printIndexTerms( reader);
> 
> }
> 
> public static void addDoc( IndexWriter writer, String title, String content) throws \
> Exception { Document document = new Document ();
> document.add(new TextField("title", title, Field.Store.YES));
> document.add(new TextField("content", content, Field.Store.YES));
> writer.addDocument(document);
> }
> 
> public static Analyzer getAnalyzer(){
> //return new StandardAnalyzer(Version.LUCENE_48);
> //return new SynonymAnalyzer();
> //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
> return new Analyzer() {
> @Override
> protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
> // TODO Auto-generated method stub
> Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
> TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
> filter = new LowerCaseFilter(Version.LUCENE_48,filter);
> SynonymMap mySynonymMap = null;
> try {
> //mySynonymMap = buildSynonym();
> SynonymMap.Builder builder = new SynonymMap.Builder(true);
> //loadSynonyms(builder);
> builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
> builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
> mySynonymMap = builder.build();
> } catch (IOException e) {
> // TODO Auto-generated catch block
> e.printStackTrace();
> }
> filter = new SynonymFilter(filter, mySynonymMap, false);
> return new TokenStreamComponents(source, filter);
> }
> };
> }
> 
> static void test( QueryParser parser, IndexSearcher searcher, String queryStr) \
> throws Exception { Query query = parser.parse(queryStr);
> TopDocs results = searcher.search(query, 10);
> System.out.println(  "#Hits: " + results.totalHits + " : " + queryStr);
> 
> }
> 
> public static void printIndexTerms( IndexReader reader) throws Exception {
> Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
> TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
> BytesRef byteRef;
> while ( (byteRef = iterator.next())!=null){
> String term = byteRef.utf8ToString();
> System.out.println("  term: " + term);
> }
> }
> }
> 
> output:
> 
> #Hits: 0 : Hochschule
> #Hits: 3 : Hello
> #Hits: 1 : people
> #Hits: 1 : universität
> term: hello
> term: people
> term: universität
> term: world
> 
> 
> thanks in advance
> christian
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
> 


[Attachment #5 (text/html)]

<html><head><meta http-equiv=Content-Type content="text/html; charset=UTF-8"><META \
name="Author" content="Novell GroupWise WebAccess"><style type="text/css">  body p 
{ 
	margin: 0px; 
}
</style></head><body style='font-family: Helvetica, Arial, sans-serif; font-size: \
13px; '>Yep, you hit the point. <br><br>Thank you so much!<br><br>Output is \
now<br><br>#Hits: 1 : Hochschule<br>#Hits: 3 : Hello<br>#Hits: 1 : people<br>#Hits: 1 \
: universität<br>&nbsp; term: hello<br>&nbsp; term: hochschule<br>&nbsp; term: \
people<br>&nbsp; term: universität<br>&nbsp; term: world<br><br>&gt;&gt;&gt; \
Alan&nbsp;Woodward&nbsp;&lt;alan@flax.co.uk&gt; 07/25/17 9:14 AM &gt;&gt;&gt;<br>You \
have a LowercaseFilter before your SynonymFilter, which means that the entities in \
your SynonymMap need to be all lowercase or they won't be matched.<br><br>Alan \
Woodward<br>www.flax.co.uk<br><br><br>&gt; On 25 Jul 2017, at 07:52, Christian \
Kaufhold &lt;Christian.Kaufhold@bsb-muenchen.de&gt; wrote:<br>&gt; <br>&gt; \
Hi,<br>&gt; <br>&gt; I am not able to add synonyms to the lucene index.<br>&gt; I \
condensed my problem into the following class which is based on a Hello World \
example.<br>&gt; The idea behind the code was to add a document with universität and \
the synonym 'Hochschule' (highschool)<br>&gt; so that lucene finds universität wenn \
I query Hochschule.<br>&gt; But it doesn't and I checked the index contents with a \
term iterator.<br>&gt; Hochschule is not added to the index:<br>&gt; <br>&gt; \
<br>&gt; package test;<br>&gt; <br>&gt; <br>&gt; import \
org.apache.lucene.analysis.Analyzer;<br>&gt; import \
org.apache.lucene.analysis.TokenStream;<br>&gt; import \
org.apache.lucene.analysis.Tokenizer;<br>&gt; import \
org.apache.lucene.analysis.core.LowerCaseFilter;<br>&gt; import \
org.apache.lucene.analysis.standard.ClassicTokenizer;<br>&gt; import \
org.apache.lucene.analysis.standard.StandardFilter;<br>&gt; import \
org.apache.lucene.analysis.synonym.SynonymFilter;<br>&gt; import \
org.apache.lucene.analysis.synonym.SynonymMap;<br>&gt; import \
org.apache.lucene.document.Document;<br>&gt; import \
org.apache.lucene.document.Field;<br>&gt; import \
org.apache.lucene.document.TextField;<br>&gt; import \
org.apache.lucene.index.*;<br>&gt; import \
org.apache.lucene.queryparser.classic.QueryParser;<br>&gt; import \
org.apache.lucene.search.IndexSearcher;<br>&gt; import \
org.apache.lucene.search.Query;<br>&gt; import \
org.apache.lucene.search.TopDocs;<br>&gt; import \
org.apache.lucene.store.Directory;<br>&gt; import \
org.apache.lucene.store.RAMDirectory;<br>&gt; import \
org.apache.lucene.util.BytesRef;<br>&gt; import \
org.apache.lucene.util.CharsRef;<br>&gt; import \
org.apache.lucene.util.Version;<br>&gt; <br>&gt; import java.io.IOException;<br>&gt; \
import java.io.Reader;<br>&gt; <br>&gt; public class LuceneHelloWorld {<br>&gt; \
<br>&gt;    public static void main(String[] args) throws Exception {<br>&gt; \
<br>&gt;        Analyzer analyzer = getAnalyzer();<br>&gt;        Directory directory \
= new RAMDirectory();<br>&gt;        IndexWriterConfig config = new \
IndexWriterConfig( Version.LUCENE_48, analyzer);<br>&gt; <br>&gt;        IndexWriter \
writer = new IndexWriter(directory, config);<br>&gt;        addDoc( writer, "people", \
"Hello Universität" );<br>&gt;        addDoc( writer, "world", "Hello World" \
);<br>&gt;        addDoc( writer, "people", "Hello people" );<br>&gt;        \
writer.close();<br>&gt; <br>&gt;        IndexReader reader = \
DirectoryReader.open(directory);<br>&gt;        IndexSearcher searcher = new \
IndexSearcher (reader);<br>&gt;        QueryParser parser = new QueryParser ( \
Version.LUCENE_48, "content", analyzer);<br>&gt; <br>&gt;        test( parser, \
searcher, "Hochschule");<br>&gt;        test( parser, searcher, "Hello");<br>&gt;     \
test( parser, searcher, "people");<br>&gt;        test( parser, searcher, \
"universität");<br>&gt; <br>&gt;        printIndexTerms( reader);<br>&gt; <br>&gt;   \
}<br>&gt; <br>&gt;    public static void addDoc( IndexWriter writer, String title, \
String content) throws Exception {<br>&gt;        Document document = new Document \
();<br>&gt;        document.add(new TextField("title", title, \
Field.Store.YES));<br>&gt;        document.add(new TextField("content", content, \
Field.Store.YES));<br>&gt;        writer.addDocument(document);<br>&gt;    }<br>&gt; \
<br>&gt;    public static Analyzer getAnalyzer(){<br>&gt;        //return new \
StandardAnalyzer(Version.LUCENE_48);<br>&gt;        //return new \
SynonymAnalyzer();<br>&gt;        //return new \
SynonymFromStandardAnalyzer(Version.LUCENE_48);<br>&gt;        return new Analyzer() \
{<br>&gt;            @Override<br>&gt;            protected TokenStreamComponents \
createComponents(String fieldName, Reader reader) {<br>&gt;                // TODO \
Auto-generated method stub<br>&gt;                Tokenizer source = new \
ClassicTokenizer(Version.LUCENE_48, reader);<br>&gt;                TokenStream \
filter = new StandardFilter(Version.LUCENE_48, source);<br>&gt;                filter \
= new LowerCaseFilter(Version.LUCENE_48,filter);<br>&gt;                SynonymMap \
mySynonymMap = null;<br>&gt;                try {<br>&gt;                    \
//mySynonymMap = buildSynonym();<br>&gt;                    SynonymMap.Builder \
builder = new SynonymMap.Builder(true);<br>&gt;                    \
//loadSynonyms(builder);<br>&gt;                    builder.add(new \
CharsRef("Hochschule"), new CharsRef("Universität"), true);<br>&gt;                  \
builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);<br>&gt;  \
mySynonymMap = builder.build();<br>&gt;                } catch (IOException e) \
{<br>&gt;                    // TODO Auto-generated catch bl               filter = \
new SynonymFilter(filter, mySynonymMap, false);<br>&gt;                return new \
TokenStreamComponents(source, filter);<br>&gt;            }<br>&gt;        };<br>&gt; \
}<br>&gt; <br>&gt;    static void test( QueryParser parser, IndexSearcher searcher, \
String queryStr) throws Exception {<br>&gt;        Query query = \
parser.parse(queryStr);<br>&gt;        TopDocs results = searcher.search(query, \
10);<br>&gt;        System.out.println(  "#Hits: " + results.totalHits + " : " + \
queryStr);<br>&gt; <br>&gt;    }<br>&gt; <br>&gt;    public static void \
printIndexTerms( IndexReader reader) throws Exception {<br>&gt;        Terms terms = \
SlowCompositeReaderWrapper.wrap(reader).terms("content");<br>&gt;        TermsEnum \
iterator =terms.iterator(TermsEnum.EMPTY);<br>&gt;        BytesRef byteRef;<br>&gt;   \
while ( (byteRef = iterator.next())!=null){<br>&gt;            String term = \
byteRef.utf8ToString();<br>&gt;            System.out.println("  term: " + \
term);<br>&gt;        }<br>&gt;    }<br>&gt; }<br>&gt; <br>&gt; output:<br>&gt; \
<br>&gt; #Hits: 0 : Hochschule<br>&gt; #Hits: 3 : Hello<br>&gt; #Hits: 1 : \
people<br>&gt; #Hits: 1 : universität<br>&gt;  term: hello<br>&gt;  term: \
people<br>&gt;  term: universität<br>&gt;  term: world<br>&gt; <br>&gt; <br>&gt; \
thanks in advance<br>&gt; christian<br>&gt; <br>&gt; <br>&gt; \
---------------------------------------------------------------------<br>&gt; To \
unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org<br>&gt; For additional \
commands, e-mail: java-user-help@lucene.apache.org<br>&gt; <br><br></body></html>



---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic