[prev in list] [next in list] [prev in thread] [next in thread] 

List:       solr-dev
Subject:    more generic lucene-morfologik integration
From:       Michal Hlavac <hlavki () hlavki ! eu>
Date:       2013-12-03 12:07:28
Message-ID: 2559310.0YUvc6vo1E () hlavki
[Download RAW message or body]

Hi,

I have a small proposal for the Lucene Morfologik module. The current module is tightly coupled to the Polish DICTIONARY enumeration,
but other people (like me) can build their own dictionaries as FSAs and use them with Lucene.
You can find the proposal in the attachment, along with an example of its use in an analyzer (SlovakLemmaAnalyzer).
It takes the dictionary property as a String resource from the classpath rather than as an enumeration.
One consequence is that the dictionary variable must be set explicitly in MorfologikFilterFactory (there is no default value).

thanks, m.

["morfologik.zip" (morfologik.zip)]

PK
dCmorfologik/UT	NwRwRuxdPKdC-4$morfologik/MorfologikLemmatizer.javaUT	ZwRwRuxdUO0~S
 R!!uCZ'IcgCP4IC`<KΓ ha<,Tȸ
$fg6k-Fp \ \
ɮOKaGrpM۩&f$Ԇ݈	yz/nF߭/4o~jpx81\CZ:lڀjS~l*A\<Y.$6irBr \
THnmݖ\X]-qL<p +kdI<-]<zd8ȋXւ$SZ[/|Cu$& \
K	ó蔴3&6R3)F<uWѬ6+PZ/hڗO`uHH@ %Ma/-BHw.T^s!c
D=oֱZDHO[]|
^eӥV_37S4uAwͲR|r9z>^BmwkI+͝AW)sqƈO-nV*)fEC*ljFTnhWWOGCQ]X$6@i룓 \
mzRNO~DgEmp[_˰M@8V?K?1]šM( \
5(k44u4hZ_ǏPwQ]EU	ꀖއ_!:/.&pߥ徳r⛪OQ3͡PK \
dChe0morfologik/MorphosyntacticTagsAttributeImpl.javaUT	MwRwRuxdVMsH \
+z9	q5qZjزȦ|DKL<hgFr\.4L~^ˣCzN&pɧB"G06=0Vec9b뵈fF\>^zgI.+qd~+, \
)Rxi4Dnt@(ELU rF] l>L*tUeJb \
>BFPbZ(U!saR男Y<FVfΝ$wM-  YSmt: \
߄Y.7~mp;^ht؁)JօBA|< f0YO?+vmfL5 \
>jYŋJłjUbC ç*\=}N@ZS"޽@	
q$2ȷ9|ߏm{M?Mٔ`<}?'OC@ rTd:qfH) \
ϮTf2t^rAY'ڵt<VG.FɵAOe_hQZc=|^=|ipck2&TrpYcq;EFI>uvn6ihfB׎)l.r&%&
 Dԩ\1Q/<?AI!(r~/MJM4AVk4}%c P#a#,yͻV
ii-}O~Dʍ<d%LKhB4N{.5KB $@&ҕ V0|WJe \
:")x>]+YQ{7sb8PZ!ɏoCAlb1m1%5Ҏ&+eJo>g+x1jr<FabjPTmdP{ХR"PO(1RhdQ8 \
</Z!DiC* \
dT45{TӓB'~!RE\CqSdJJՍYrPLoCPz#A_iV>wM%7 \
8fp%JtjϿmJɟs=_^Xsy,g#J!oͰ._֊;ڤ \
l=XC:7	b)nQkr9'K(`J o5
ک~uV1G$%Wm܇~c CdhPKdCr{SF \
morfologik/MorfologikFilter.javaUT	MwRwRuxdYmo8_1{rbipuԛ6Xc}"v(hYԒT\n͐L˲۽$P \
އ?gZ$}Xhnv <%1ŊY,gVg|fZR<;ya"P00aý,!sF.+9IJ"s \
ļ495ύ[GXC*cBaV$ȬTϰ@Q,Mf7ŗL"_bre@nrJ1'ZYz7} \
zOCZK͙{ l4Pj_^4Z`y j/^zr3H) \
ͦ1{P98`Xjbg\koPXV%lflC	IG+6 /{ \
}IL]*r8Ma<=GGB>g?=~a68'{|0pu/C8"LAmUJ	T(OB$Z,򅫜j-4U)Z[O/R4 \
V^X,d<~h,ýwY|-7wmkOL>^dOl=ș݊FfTm/B~[Զy,d)i6K0np \
O6}&g"mME:X,s6{8,acqUXIĈdƖvHVP("R>5`9O*qMу,#V}ʤ9$]J͙ \
\K`@4' l
RsH2mz<ahNP]A~P`
n ت[Jj?sBBjmu鄦fn I{B,U?D1y9Q]AK|!ϤT
}l3Fbmh{fg3?I)Ŷ1;Rd6ݴ:F5)uB=`yX+\a|G'\ǞXyQ!hoФËh&%(s	vgrU \
*. ϛZpV[d-"tGt$򮏰./!)mz042I.e M++hT[,U \
u\+!zu.#=&ɐa%7wΌmdPWzKrb<3?_UX%8ƾ|	(mu"~kq \
ͳQ	ue]ǘ9c-ɫ+#d!w)RD9QXbXN(*	S	wAieA}` Jn* \
̩A:hhn..aWFAz42 \
$xA7X?eu.$OcAh#QQ]A!=thbaVAAAqT_/ͪ>ZKkP8ᵪ@u \
IJ0U::U]{aItɍnW \
sKkeo7w?-AI,Kc{&˃E~>]Ȍ^5!vpgп{fY0#	$n#n"#z΂B,`Dzp^ѩ6N, \
[008Kk}`jYgoK.Pw [i__P
"Kϔ
o,|GGF$LXHpKcPF&]u[߷km4MAmTɏ%n"sY \
+]K|9*,RʈnW`k%Dg \
ϳ5'Lx:guEaxHdpjbVD{غ~Xw-x/\; +1UziVJntW~? \
Δի2dgJtu&BKr"~+K@ \
].q'82lkT>}7[A7־꡷RFNeה}噰k9WOE \
;}(Cp}w4inæ#(-uK}Aa0X](:;:!`~E^pǜ;@*#' \
]{KKAX^'2HA:pWa.P>n؍~Bo!g(LIWMwj"q(6ˇ"3[O] \
;kPKdC6E+58,morfologik/MorphosyntacticTagsAttribute.javaUT	MwRwRux \
dmTMoLWT#EPQTPYq#gfCovqbr^ə*5I+䃨3A;th0FW>4"kzL	\K \
aꡰUCXhJ5p>/S Gt` "HkSL j 	>(jY^@4B|_D
;
;塳*eZhP^ZRxۃS.:Sm0vR,Z|X罍!KF8*
T݈ zA6TմZ	#qAy v	j|
D B.uw]w1;?o)Uq3UsW{
E=hI[ҪE
LMJ'M}	~>ÌQ"Y_ ؄yϋeq ߗ>6˛wp^}Xn=-`OՇK@xud*(% \
oQJIfP=:sҢkzX2V is_LO&Dk]b/nK0䑖i(/f)ٽ*Y0AпDx
 C*!߆Vj1G~9Smڔ~ hq@^R묌~I \
xdNj/}*mѼ}SFhJO'?'},/%V"@"tD87v~MyL:kwpbb<OhZk]F:0Wb \
{8v)HGiH&)uvlCtƏwBjIE_< Zpx%_ \
_PKdCHw6/'morfologik/MorfologikFilterFactory.javaUT	XwRwRuxdVQs8~WlBK=Mʴ{{]d'ɡl@~i
 >%ː,]a(%L21܄󜅹%Ԓ?ZU^'N
f`B!QfaJ1˕v:@AIJ|^Z
R#(	D?CXpqS9Q5+dWZGX2B3\!Ѹd:rIq˕͊J.ST*2Dt3(oTۭ	APl@KR^y!8)6* \
>j53_	Es0K=+kwQ^w{L >d9KY\
Jٜrl6o|bm삩w4w@.E*hc8q8']e<0.wp=܌gFOxr$(~+NZikn
 LT\n	t}R ιqj(sɜE-KExˊaGY9RWzӃ(	d*5㏸w0n}WTfXTvOޏ+#šZli \
                ̘A@K\ E68,~_%Y;GgaP(z,S%/V?޾
.a}EQ;.uuq2~Yq{]uUMzwNUӧ^wf$freς3#b~ᩊCCNIu_;]+G;**~_ \
}|i\$Eȝdtkhx}䌥&dzdx_6WvU=.u61:ɛsY'24}+AP c
eC3k5#v([io$efG$ԏ{^͞6EY:HH7 wTlĤt<
AoN⻇m5gC@p<TviKbQB/DC~ʹcPnRO{`}K\A?aMدu^zcJᶷ3'$#ă[.ѶAW/ \
}b..񧐛a^ML=w<vH<~lṮǖߦ+h \
t>	WxeD5H\ځV{2FRbAp~>sRm(HN_KcA#*Lިt4X7Xjy4WSghasP \
SD4٣ɨ@=ֶՑz5n[PK'dC8rL~ \
"morfologik/MorfologikAnalyzer.javaUT	iwRwRuxdUMsF+8-P@ \
qZҬG3l-NItI=./.!'Ke{xOHD`l	)0Ru#;"e)(˧`z1,)a \
ހ/fyMw "<Z Ka4@hh.9KbrS{zڌ \
rX.z~{Tt!Dv>AFDJ.-HMʶ9խVh]!Ӭyŵy4磩pwOj1v8=.P;	VZJIMvAx<$fŋf0Y?| \
t:i73/ۮ9ZZxQQWPJ|bg	mO9Mgкi~&4g1!|x \
I?/l-+].rAw0[< / \
:<)NLn;)u-PUL&4kއljIiu`i,oBU574 \
]uafϷŮj`{~?Rx-k/UFY?nOo@ z4 }cGA7  \
]ȟ}*ЏhÏD50PcCYUoH(A y!MyԩoWeVxdIu*} \
'Gx8TK@tS\nQoFIWĞmt7Mu]F!IVM-hn<J}I<O3g߇pB&EoD0[کIj6ցiR`
 1sNFpNyS.v(͉/{c]73n-DҿuH \
o!}Q֊r*pt<i]"neJr3*k>iQImyז;-ViNT'ސ \
=d֔$@H{f3HYi\dճV ˗Y`i<eYH^H]F/q"p6!}iܝO;t &W \
]Tmrϟ4wlR'2yXq3_PK \
dCAmorfologik/UTNwRuxdPKdC-4$ \
Emorfologik/MorfologikLemmatizer.javaUTZwRuxdPKdChe0 \
morfologik/MorphosyntacticTagsAttributeImpl.javaUTMwRuxdPKdCr{SF \
morfologik/MorfologikFilter.javaUTMwRuxdPKdC6E+58, \
morfologik/MorphosyntacticTagsAttribute.javaUTMwRuxdPKdCHw6/ \
'morfologik/MorfologikFilterFactory.javaUTXwRuxdPK'dC8rL~
 "Amorfologik/MorfologikAnalyzer.javaUTiwRuxdPK


["SlovakLemmaAnalyzer.java" (SlovakLemmaAnalyzer.java)]

package org.apache.lucene.analysis.sk;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.lemma.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Slovak language that lemmatizes tokens with {@link MorfologikFilter}.
 * <p>
 * Supports an external list of stopwords (words that will not be indexed at all). A default set of stopwords
 * is used unless an alternative list is specified.
 * </p>
 *
 * <a name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating SlovakLemmaAnalyzer:
 * <ul>
 * <li>As of 3.1, tokens are lemmatized with {@link MorfologikFilter}; for older versions the lemmatization
 * step (and the optional {@link SetKeywordMarkerFilter} in front of it) is skipped.
 * <li>The same version is also handed to {@link StandardTokenizer}, {@link StandardFilter},
 * {@link LowerCaseFilter} and {@link StopFilter}.
 * </ul>
 */
public final class SlovakLemmaAnalyzer extends StopwordAnalyzerBase {

    /**
     * File containing default Slovak stopwords.
     */
    public final static String DEFAULT_STOPWORD_FILE = "stop-words.txt";

    /** Terms that are marked as keywords before the lemmatization step; always an unmodifiable copy. */
    private final CharArraySet stemExclusionSet;

    /** Dictionary whose resource name is passed to {@link MorfologikFilter}. */
    private final Dictionary dictionary;

    /**
     * Dictionaries selectable for this analyzer. Each constant carries the resource name that is passed to
     * {@link MorfologikFilter}.
     */
    public enum Dictionary {

        DEFAULT("sk"), MLTEAST("mlteast-sk");

        // NOTE(review): presumably resolved against the classpath by MorfologikFilter - confirm there.
        private final String resource;

        private Dictionary(String resource) {
            this.resource = resource;
        }

        /**
         * Returns the dictionary resource name.
         *
         * @return resource name of this dictionary
         */
        public String getResource() {
            return resource;
        }
    }

    /**
     * Returns an unmodifiable instance of the default stop words set.
     *
     * @return default stop words set.
     */
    public static CharArraySet getDefaultStopSet() {
        return SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /**
     * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the static final
     * set the first time.
     */
    private static class DefaultSetHolder {

        static final CharArraySet DEFAULT_STOP_SET = getStopSet();

        /**
         * Reads {@link #DEFAULT_STOPWORD_FILE} (UTF-8, lines starting with "#" are comments) relative to
         * {@link SlovakLemmaAnalyzer}.
         *
         * @return the loaded stop word set
         * @throws RuntimeException if the stopword file cannot be read; the {@link IOException} is attached
         * as the cause
         */
        private static CharArraySet getStopSet() {
            try {
                return WordlistLoader.getWordSet(IOUtils.getDecodingReader(SlovakLemmaAnalyzer.class,
                        DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
            } catch (IOException ex) {
                // default set should always be present as it is part of the
                // distribution (JAR); keep the cause so a broken packaging can be diagnosed
                throw new RuntimeException("Unable to load default stopword set", ex);
            }
        }
    }

    /**
     * Builds an analyzer with the {@link Dictionary#DEFAULT} dictionary and the default stop words:
     * {@link #getDefaultStopSet}.
     *
     * @param matchVersion lucene compatibility version
     */
    public SlovakLemmaAnalyzer(Version matchVersion) {
        this(matchVersion, Dictionary.DEFAULT, SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET);
    }

    /**
     * Builds an analyzer with the given dictionary and the default stop words: {@link #getDefaultStopSet}.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary) {
        this(matchVersion, dictionary, SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET);
    }

    /**
     * Builds an analyzer with the given stop words and an empty stem exclusion set.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     * @param stopwords a stopword set
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, CharArraySet stopwords) {
        this(matchVersion, dictionary, stopwords, CharArraySet.EMPTY_SET);
    }

    /**
     * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided this
     * analyzer will add a {@link SetKeywordMarkerFilter} before lemmatization.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource; it is dereferenced when components are created for
     * versions on or after 3.1, so it must not be {@code null} in that case
     * @param stopwords a stopword set
     * @param stemExclusionSet a set of terms not to be lemmatized
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, CharArraySet stopwords,
            CharArraySet stemExclusionSet) {
        super(matchVersion, stopwords);
        this.dictionary = dictionary;
        // defensive, read-only copy so later changes to the caller's set do not affect this analyzer
        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
                matchVersion, stemExclusionSet));
    }

    /**
     * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all the text
     * in the provided {@link Reader}.
     *
     * @param fieldName name of the field being analyzed (not used by this implementation)
     * @param reader source of the text to analyze
     * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
     * {@link StandardTokenizer} filtered with
     * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, {@link MorfologikFilter} (only
     * if version is >= LUCENE_31) and finally {@link ASCIIFoldingFilter}. If a version is >= LUCENE_31 and
     * a non-empty stem exclusion set is provided via
     * {@link #SlovakLemmaAnalyzer(Version, Dictionary, CharArraySet, CharArraySet)} a
     * {@link SetKeywordMarkerFilter} is added before {@link MorfologikFilter}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (matchVersion.onOrAfter(Version.LUCENE_31)) {
            if (!this.stemExclusionSet.isEmpty()) {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new MorfologikFilter(result, dictionary.getResource(), matchVersion);
        }
        // folding is applied last, i.e. on the output of the lemmatization step
        result = new ASCIIFoldingFilter(result);
        return new TokenStreamComponents(source, result);
    }
}



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org

[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic