[prev in list] [next in list] [prev in thread] [next in thread] 

List:       solr-dev
Subject:    more generic lucene-morfologik integration
From:       Michal Hlavac <hlavki () hlavki ! eu>
Date:       2013-12-03 12:07:28
Message-ID: 2559310.0YUvc6vo1E () hlavki
[Download RAW message or body]

Hi,

I have a small proposal for the morfologik lucene module. The current module is tightly coupled with \
the Polish DICTIONARY enumeration, but other people (like me) can build their own dictionaries as FSAs and \
use them with lucene. You can find the proposal in the attachment, along with an example usage in an analyzer \
(SlovakLemmaAnalyzer). It uses the dictionary property as a String resource from the classpath, not an \
enumeration. One change is that the dictionary variable must be set in MorfologikFilterFactory (no \
default value).

thanks, m.


["morfologik.zip" (morfologik.zip)]

PK
dCmorfologik/UT	NwRwRuxdPKdC-4$morfologik/MorfologikLemmatizer.javaUT	ZwRwRuxdUO0~S
 R!!uCZ'IcgCP4IC`<KΓ ha<,Tȸ
$fg6k-Fp \ \
ɮOKaGrpM۩&f$Ԇ݈	yz/nF߭/4o~jpx81\CZ:lڀjS~l*A\<Y.$6irBr \
THnmݖ\X]-qL<p +kdI<-]<zd8ȋXւ$SZ[/|Cu$& \
K	ó蔴3&6R3)F<uWѬ6+PZ/hڗO`uHH@ \
%Ma/-BHw.T^s!c D=oֱZDHO[]|
^eӥV_37S4uAwͲR|r9z>^BmwkI+͝AW)sqƈO-nV*)fEC*ljFTnhWWOGCQ]X$6@i룓 \
mzRNO~DgEmp[_˰M@8V?K?1]šM( \
5(k44u4hZ_ǏPwQ]EU	ꀖއ_!:/.&pߥ徳r⛪OQ3͡ \
PKdChe0morfologik/MorphosyntacticTagsAttributeImpl.javaUT	MwRwRux \
dVMsH+z9	q5qZjزȦ|DKL<hgFr\.4L~^ˣCzN&pɧB"G06=0Vec9b뵈fF\>^zgI.+qd~+, \
)Rxi4Dnt@(ELU rF] l>L*tUeJb \
>BFPbZ(U!saR男Y<FVfΝ$wM-  YSmt: \
߄Y.7~mp;^ht؁)JօBA|< f0YO?+vmfL5 \
>jYŋJłjUbC ç*\=}N@ZS"޽@	
q$2ȷ9|ߏm{M?Mٔ`<}?'OC@ rTd:qfH) \
ϮTf2t^rAY'ڵt<VG.FɵAOe_hQZc=|^=|ipck2&TrpYcq;EFI>uvn6ihfB׎)l.r&%&
 Dԩ\1Q/<?AI!(r~/MJM4AVk4}%c P#a#,yͻV
ii-}O~Dʍ<d%LKhB4N{.5KB $@&ҕ V0|WJe \
:")x>]+YQ{7sb8PZ!ɏoCAlb1m1%5Ҏ&+eJo>g+x1jr<FabjPTmdP{ХR"PO(1RhdQ8 \
</Z!DiC* \
dT45{TӓB'~!RE\CqSdJJՍYrPLoCPz#A_iV>wM%7 \
8fp%JtjϿmJɟs=_^Xsy,g#J!oͰ._֊;ڤ \
l=XC:7	b)nQkr9'K(`J o5
ک~uV1G$%Wm܇~c CdhPKdCr{SF \
morfologik/MorfologikFilter.javaUT	MwRwRuxdYmo8_1{rbipuԛ6Xc}"v(hYԒT\n͐L˲۽$P \
އ?gZ$}Xhnv <%1ŊY,gVg|fZR<;ya"P00aý,!sF.+ \
9IJ"sļ495ύ[GXC*cBaV$ȬTϰ@Q,Mf7ŗL"_bre@nrJ1'ZYz7} \
zOCZK͙{ l4Pj_^4Z`y j/^zr3H) \
ͦ1{P98`Xjbg\koPXV%lflC	IG+6 /{ \
}IL]*r8Ma<=GGB>g?=~a68'{|0pu/C8"LAmUJ	T(OB$Z,򅫜j-4U)Z[O/R4 \
V^X,d<~h,ýwY|-7wmkOL>^dOl=ș݊FfTm/B~[Զy,d) \
i6K0npO6}&g"mME:X,s6{8,acqUXIĈdƖvHVP("R>5`9O*qMу,#V}ʤ9$]J͙ \
\K`@4' l
RsH2mz<ahNP]A~P`
n ت[Jj?sBBjmu鄦fn I{B,U?D1y9Q]AK|!ϤT
}l3Fbmh{fg3?I)Ŷ1;Rd6ݴ:F5)uB=`yX+\a|G'\ǞXyQ!hoФËh&%(s	vgrU \
*. ϛZpV[d-"tGt$򮏰./!)mz042I.e \
M++hT[,U u\+!zu.#=&ɐa%7wΌmdPWzKrb<3?_UX%8ƾ|	(mu"~kq
 ͳQ	ue]ǘ9c-ɫ+#d!w)RD9QXbXN(*	S	wAieA}` \
Jn* ̩A:hhn..aWFAz42 \
$xA7X?eu.$OcAh#QQ]A!=thbaVAAAqT_/ͪ>ZKkP8ᵪ@u \
IJ0U::U]{aItɍnW \
sKkeo7w?-AI,Kc{&˃E~>]Ȍ^5!vpgп{fY0#	$n#n"#z΂B,`Dzp^ѩ6N, \
[008Kk}`jYgoK.Pw [i__P
"Kϔ
o,|GGF$LXHpKcPF&]u[߷km4MAmTɏ%n"sY \
+]K|9*,RʈnW`k%Dg \
ϳ5'Lx:guEaxHdpjbVD{غ~Xw-x/\; +1UziVJntW~? \
Δի2dgJtu&BKr"~+K@ \
].q'82lkT>}7[A7־꡷RFNeה}噰k9WOE \
;}(Cp}w4inæ#(-uK}Aa0X](:;:!`~E^pǜ;@*#' \
]{KKAX^'2HA:pWa.P>n؍~Bo!g(LIWMwj"q(6ˇ"3[O] \
;kPKdC6E+58,morfologik/MorphosyntacticTagsAttribute.javaUT	MwR \
wRuxdmTMoLWT#EPQTPYq#gfCovqbr^ə*5I+䃨3A;th0FW>4"kzL	\K \
aꡰUCXhJ5p>/S Gt` "HkSL j 	>(jY^@4B|_D
;
;塳*eZhP^ZRxۃS.:Sm0vR,Z|X罍!KF8*
T݈ zA6TմZ	#qAy v	j|
D B.uw]w1;?o)Uq3UsW{
E=hI[ҪE
LMJ'M}	~>ÌQ"Y_ ؄yϋeq \
ߗ>6˛wp^}Xn=-`OՇK@xud*(% oQJIfP=:sҢkzX2V \
is_LO&Dk]b/nK0䑖i(/f)ٽ*Y0AпDx  \
C*!߆Vj1G~9Smڔ~ hq@^R묌~I \
xdNj/}*mѼ}SFhJO'?'},/%V"@"tD87v~MyL:kwpbb<OhZk]F:0Wb \
{8v)HGiH&)uvlCtƏwBjIE_< Zpx%_ \
_PKdCHw6/'morfologik/MorfologikFilterFactory.javaUT	XwRwRuxdVQs8~WlBK=Mʴ{{]d'ɡl@~i
 >%ː,]a(%L21܄󜅹%Ԓ?ZU^'N
f`B!QfaJ1˕v:@AIJ|^Z
R#(	D?CXpqS9Q5+dWZGX2B3\!Ѹd:rIq˕͊J.ST*2Dt3(oTۭ	APl@KR^y!8)6* \
>j53_	Es0K=+kwQ^w{L >d9KY\
Jٜrl6o|bm삩w4w@.E*hc8q8']e<0.wp=܌gFOxr$(~+NZikn
 LT\n	t}R ιqj(sɜE-KExˊaGY9RWzӃ(	d*5㏸w0n}WTfXTvOޏ+#šZli \
                ̘A@K\ E68,~_%Y;GgaP(z,S%/V?޾
.a}EQ;.uuq2~Yq{]uUMzwNUӧ^wf$freς3#b~ᩊCCNIu_;]+G;**~_ \
}|i\$Eȝdtkhx}䌥&dzdx_6WvU=.u61:ɛsY'24}+AP c
eC3k5#v([io$efG$ԏ{^͞6EY:HH7 wTlĤt<
AoN⻇m5gC@p<TviKbQB/DC~ʹcPnRO{`}K\A?aMدu^zcJᶷ3'$#ă[.ѶAW/ \
}b..񧐛a^ML=w<vH<~lṮǖߦ+h \
t>	WxeD5H\ځV{2FRbAp~>sRm(HN_KcA#*Lިt4X7Xjy4WSghasP \
SD4٣ɨ@=ֶՑz5n[PK'dC8rL~ \
"morfologik/MorfologikAnalyzer.javaUT	iwRwRuxdUMsF+8-P@ \
qZҬG3l-NItI=./.!'Ke{xOHD`l	)0Ru#;"e)(˧`z1,)a
 ހ/fyMw "<Z Ka4@hh.9KbrS{zڌ \
rX.z~{Tt!Dv>AFDJ.-HMʶ9խVh]!Ӭyŵy4磩pwOj1v8=.P;	VZJIMvAx<$fŋf0Y?| \
t:i73/ۮ9ZZxQQWPJ|bg	mO9Mgкi~&4g1!|x \
I?/l-+].rAw0[< / \
:<)NLn;)u-PUL&4kއljIiu`i,oBU574 \
]uafϷŮj`{~?Rx-k/UFY?nOo@ z4 }cGA7  \
]ȟ}*ЏhÏD50PcCYUoH(A y!MyԩoWeVxdIu*} \
'Gx8TK@tS\nQoFIWĞmt7Mu]F!IVM-hn<J}I<O3g߇pB&EoD0[کIj6ցiR`
 1sNFpNyS.v(͉/{c]73n-DҿuH \
o!}Q֊r*pt<i]"neJr3*k>iQImyז;-ViNT'ސ \
=d֔$@H{f3HYi\dճV ˗Y`i<eYH^H]F/q"p6!}iܝO;t &W \
]Tmrϟ4wlR'2yXq3_PK \
dCAmorfologik/UTNwRuxdPKdC- \
4$Emorfologik/MorfologikLemmatizer.javaUTZwRuxdPK \
dChe0morfologik/MorphosyntacticTagsAttributeImpl.javaUTMwRuxdPKdCr{SF \
morfologik/MorfologikFilter.javaUTMwRuxdPKdC6E+5 \
8,morfologik/MorphosyntacticTagsAttribute.javaUTMwRuxdPK \
dCHw6/'morfologik/MorfologikFilterFactory.javaUTXwRuxdPK'dC8rL~
 "Amorfologik/MorfologikAnalyzer.javaUTiwRuxdPK



["SlovakLemmaAnalyzer.java" (SlovakLemmaAnalyzer.java)]

package org.apache.lucene.analysis.sk;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.lemma.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Slovak language.
 * <p>
 * Supports an external list of stopwords (words that will not be indexed at all). A default set of stopwords
 * is used unless an alternative list is specified.
 * </p>
 *
 * <a name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating SlovakLemmaAnalyzer:
 * <ul>
 * <li>As of 3.1, words are lemmatized with {@link MorfologikFilter}
 * <li>As of 2.9, StopFilter preserves position increments
 * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
 * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public final class SlovakLemmaAnalyzer extends StopwordAnalyzerBase {

    /**
     * File containing default Slovak stopwords.
     */
    public static final String DEFAULT_STOPWORD_FILE = "stop-words.txt";

    /** Terms that are marked as keywords before the lemmatization step (see {@link #createComponents}). */
    private final CharArraySet stemExclusionSet;

    /** Dictionary whose resource name is handed to {@link MorfologikFilter}. */
    private final Dictionary dictionary;

    /**
     * Dictionaries this analyzer can be configured with. Each constant carries the resource name
     * that is passed to {@link MorfologikFilter}.
     */
    public enum Dictionary {

        DEFAULT("sk"), MLTEAST("mlteast-sk");

        private final String resource;

        private Dictionary(String resource) {
            this.resource = resource;
        }

        /**
         * Returns the resource name associated with this dictionary.
         *
         * @return dictionary resource name
         */
        public String getResource() {
            return resource;
        }
    }

    /**
     * Returns an unmodifiable instance of the default stop words set.
     *
     * @return default stop words set.
     */
    public static CharArraySet getDefaultStopSet() {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /**
     * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the static final
     * set the first time.
     */
    private static class DefaultSetHolder {

        static final CharArraySet DEFAULT_STOP_SET = getStopSet();

        /**
         * Reads {@link #DEFAULT_STOPWORD_FILE} (UTF-8, lines starting with "#" are comments) relative to
         * {@link SlovakLemmaAnalyzer}.
         *
         * @return the loaded stop word set
         * @throws RuntimeException if the stopword file cannot be read; the original
         * {@link IOException} is attached as the cause
         */
        private static CharArraySet getStopSet() {
            try {
                return WordlistLoader.getWordSet(IOUtils.getDecodingReader(SlovakLemmaAnalyzer.class,
                        DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
            } catch (IOException ex) {
                // default set should always be present as it is part of the
                // distribution (JAR); keep the cause so a packaging problem can be diagnosed
                throw new RuntimeException("Unable to load default stopword set", ex);
            }
        }
    }

    /**
     * Builds an analyzer with the {@link Dictionary#DEFAULT} dictionary and the default stop words:
     * {@link #getDefaultStopSet}.
     *
     * @param matchVersion lucene compatibility version
     */
    public SlovakLemmaAnalyzer(Version matchVersion) {
        this(matchVersion, Dictionary.DEFAULT, DefaultSetHolder.DEFAULT_STOP_SET);
    }

    /**
     * Builds an analyzer with the given dictionary and the default stop words:
     * {@link #getDefaultStopSet}.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary) {
        this(matchVersion, dictionary, DefaultSetHolder.DEFAULT_STOP_SET);
    }

    /**
     * Builds an analyzer with the given stop words and an empty stem exclusion set.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     * @param stopwords a stopword set
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, CharArraySet stopwords) {
        this(matchVersion, dictionary, stopwords, CharArraySet.EMPTY_SET);
    }

    /**
     * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided this
     * analyzer will add a {@link SetKeywordMarkerFilter} before lemmatization.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     * @param stopwords a stopword set
     * @param stemExclusionSet a set of terms not to be stemmed
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, CharArraySet stopwords,
            CharArraySet stemExclusionSet) {
        super(matchVersion, stopwords);
        this.dictionary = dictionary;
        // Defensive, read-only copy so later changes to the caller's set do not affect this analyzer.
        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
                matchVersion, stemExclusionSet));
    }

    /**
     * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all the text
     * in the provided {@link Reader}.
     *
     * @param fieldName name of the field being analyzed (not used by this implementation)
     * @param reader source of the text to tokenize
     * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
     * {@link StandardTokenizer} filtered with
     * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, {@link MorfologikFilter} (only
     * if version is LUCENE_31 or later) and finally {@link ASCIIFoldingFilter}. If the version is
     * LUCENE_31 or later and a non-empty stem exclusion set was provided via
     * {@link #SlovakLemmaAnalyzer(Version, Dictionary, CharArraySet, CharArraySet)}, a
     * {@link SetKeywordMarkerFilter} is added before {@link MorfologikFilter}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (matchVersion.onOrAfter(Version.LUCENE_31)) {
            if (!this.stemExclusionSet.isEmpty()) {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new MorfologikFilter(result, dictionary.getResource(), matchVersion);
        }
        // Folding runs last, after the dictionary lookup.
        result = new ASCIIFoldingFilter(result);
        return new TokenStreamComponents(source, result);
    }
}



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org

[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic