[prev in list] [next in list] [prev in thread] [next in thread] 

List:       solr-dev
Subject:    more generic lucene-morfologik integration
From:       Michal Hlavac <hlavki () hlavki ! eu>
Date:       2013-12-03 12:07:28
Message-ID: 2559310.0YUvc6vo1E () hlavki
[Download RAW message or body]

Hi,

I have a small proposal for the morfologik lucene module. The current module is tightly \
coupled with the Polish DICTIONARY enumeration, but other people (like me) can build their own \
dictionaries into an FSA and use them with lucene.  You can find the proposal in the attachment, and \
also an example usage in an analyzer (SlovakLemmaAnalyzer). It uses the dictionary property as \
a String resource from the classpath, not an enumeration. One change is that the dictionary \
variable must be set in MorfologikFilterFactory (no default value).

thanks, m.


["morfologik.zip" (morfologik.zip)]

PK
dCmorfologik/UT	NwRwRuxdPKdC- \
4$morfologik/MorfologikLemmatizer.javaUT	ZwRwRuxdUO0~S
 R!!uCZ'IcgCP4IC`<KΓ ha<,Tȸ
$fg6k-Fp \ \
ɮOKaGrpM۩&f$Ԇ݈	yz/nF߭/4o~jpx81\CZ:lڀjS~l*A\<Y.$6irBr \
THnmݖ\X]-qL<p +kdI<-]<zd8ȋXւ$SZ[/|Cu$& \
K	ó蔴3&6R3)F<uWѬ6+PZ/hڗO`uHH@ \
%Ma/-BHw.T^s!c D=oֱZDHO[]|
^eӥV_37S4uAwͲR|r9z>^BmwkI+͝AW)sqƈO-nV*)fEC*ljFTnhWWOGCQ]X$6@i룓 \
mzRNO~DgEmp[_˰M@8V?K?1]šM( \
5(k44u4hZ_ǏPwQ]EU	ꀖއ_!:/.&pߥ徳r⛪ \
OQ3͡PKdChe0morfologik/MorphosyntacticTagsAttributeImpl.j \
avaUT	MwRwRuxdVMsH+z9	q5qZjزȦ|DKL<hgFr\.4L \
~^ˣCzN&pɧB"G06=0Vec9b뵈fF\>^zgI.+qd~+, \
)Rxi4Dnt@(ELU rF] l>L*tUeJb \
>BFPbZ(U!saR男Y<FVfΝ$wM-  YSmt: \
߄Y.7~mp;^ht؁)JօBA|< f0YO?+vmfL5 \
>jYŋJłjUbC ç*\=}N@ZS"޽@	
q$2ȷ9|ߏm{M?Mٔ`<}?'OC@ rTd:qfH) \
ϮTf2t^rAY'ڵt<VG.FɵAOe_hQZc=|^=|ipck2&TrpYcq;EFI>uvn6ihfB׎)l.r&%&
 Dԩ\1Q/<?AI!(r~/MJM4AVk4}%c \
P#a#,yͻV ii-}O~Dʍ<d%LKhB4N{.5KB $@&ҕ \
V0|WJe :")x>]+YQ{7sb8PZ!ɏoCAlb1m1%5Ҏ&+eJo>g+x1jr<FabjPTmdP{ХR"PO(1RhdQ8 \
</Z!DiC* \
dT45{TӓB'~!RE\CqSdJJՍYrPLoCPz#A_iV>wM%7
 8fp%JtjϿmJɟs=_^Xsy,g#J!oͰ._֊;ڤ \
l=XC:7	b)nQkr9'K(`J o5
ک~uV1G$%Wm܇~c CdhPKdCr{SF \
morfologik/MorfologikFilter.javaUT	MwRwRuxdYmo8_1{rbipuԛ6Xc}"v(hYԒT\n͐L˲۽$P \
އ?gZ$}Xhnv <%1ŊY,gVg|fZR<;ya"P00aý, \
!sF.+9IJ"sļ495ύ[GXC*cBaV$ȬTϰ@Q,Mf7ŗL"_bre@nrJ1'ZYz7} \
zOCZK͙{ l4Pj_^4Z`y j/^zr3H) \
ͦ1{P98`Xjbg\koPXV%lflC	IG+6 /{ \
}IL]*r8Ma<=GGB>g?=~a68'{|0pu/C8"LAmUJ	T(OB$Z,򅫜j-4U)Z[O/R4 \
V^X,d<~h,ýwY|-7wmkOL>^dOl=ș݊FfTm/B~[ \
Զy,d)i6K0npO6}&g"mME:X,s6{8,acqUXIĈdƖvHVP("R>5`9O*qMу,#V}ʤ9$]J͙ \
\K`@4' l
RsH2mz<ahNP]A~P`
n ت[Jj?sBBjmu鄦fn I{B,U?D1y9Q]AK|!ϤT
}l3Fbmh{fg3?I)Ŷ1;Rd6ݴ:F5)uB=`yX+\a|G'\ǞXyQ!hoФËh&%(s	vgrU \
*. ϛZpV[d-"tGt$򮏰./!)mz042I.e
 M++hT[,U u\+!zu.#=&ɐa%7wΌmdPWzKrb<3?_UX%8ƾ|	(mu"~kq
 ͳQ	ue]ǘ9c-ɫ+#d!w)RD9QXbXN(*	S	wAieA}` \
Jn* ̩A:hhn..aWFAz42 \
$xA7X?eu.$OcAh#QQ]A!=thbaVAAAqT_/ͪ>ZKkP8ᵪ@u \
IJ0U::U]{aItɍnW \
sKkeo7w?-AI,Kc{&˃E~>]Ȍ^5!vpgп{fY0#	$n#n"#z΂B,`Dzp^ѩ6N, \
[008Kk}`jYgoK.Pw [i__P
"Kϔ
o,|GGF$LXHpKcPF&]u[߷km4MAmTɏ%n"sY \
+]K|9*,RʈnW`k%Dg \
ϳ5'Lx:guEaxHdpjbVD{غ~Xw-x/\; \
+1UziVJntW~? Δի2dgJtu&BKr"~+K@ \
].q'82lkT>}7[A7־꡷RFNeה}噰k9WOE \
;}(Cp}w4inæ#(-uK}Aa0X](:;:!`~E^pǜ;@*#' \
]{KKAX^'2HA:pWa.P>n؍~Bo!g(LIWMwj"q(6ˇ"3[O] \
;kPKdC6E+58,morfologik/MorphosyntacticTagsAttribute.javaU \
T	MwRwRuxdmTMoLWT#EPQTPYq#gfCovqbr^ə*5I+䃨3A;th0FW>4"kzL	\K \
aꡰUCXhJ5p>/S Gt` "HkSL j 	>(jY^@4B|_D
;
;塳*eZhP^ZRxۃS.:Sm0vR,Z|X罍!KF8*
T݈ zA6TմZ	#qAy v	j|
D B.uw]w1;?o)Uq3UsW{
E=hI[ҪE
LMJ'M}	~>ÌQ"Y_ ؄yϋeq \
ߗ>6˛wp^}Xn=-`OՇK@xud*(% \
oQJIfP=:sҢkzX2V \
is_LO&Dk]b/nK0䑖i(/f)ٽ*Y0AпDx  \
C*!߆Vj1G~9Smڔ~ hq@^R묌~I \
xdNj/}*mѼ}SFhJO'?'},/%V"@"tD87v~MyL:kwpbb<OhZk]F:0Wb \
{8v)HGiH&)uvlCtƏwBjIE_< Zpx%_ \
_PKdCHw6/'morfologik/MorfologikFilterFactory.javaUT	XwRwRuxdVQs8~WlBK=Mʴ{{]d'ɡl@~i
 >%ː,]a(%L21܄󜅹%Ԓ?ZU^'N
f`B!QfaJ1˕v:@AIJ|^Z
R#(	D?CXpqS9Q5+dWZGX2B3\!Ѹd:rIq˕͊J.ST*2Dt3(oTۭ	APl@KR^y!8)6* \
>j53_	Es0K=+kwQ^w{L >d9KY\
Jٜrl6o|bm삩w4w@.E*hc8q8']e<0.wp=܌gFOxr$(~+NZikn
 LT\n	t}R ιqj(sɜE-KExˊaGY9RWzӃ(	d*5㏸w0n}WTfXTvOޏ+#šZli \
                ̘A@K\ E68,~_%Y;GgaP(z,S%/V?޾
.a}EQ;.uuq2~Yq{]uUMzwNUӧ^wf$freς3#b~ᩊCCNIu_;]+G;**~_ \
}|i\$Eȝdtkhx}䌥&dzdx_6WvU=.u61:ɛsY'24}+AP \
c eC3k5#v([io$efG$ԏ{^͞6EY:HH7 \
wTlĤt< AoN⻇m5gC@p<TviKbQB/DC~ʹcPnRO{`}K\A?aMدu^zcJᶷ3'$#ă[.ѶAW/ \
}b..񧐛a^ML=w<vH<~lṮǖߦ+h \
t>	WxeD5H\ځV{2FRbAp~>sRm(HN_KcA#*Lިt4X7Xjy4WSghasP \
SD4٣ɨ@=ֶՑz5n[PK'dC8rL~ \
"morfologik/MorfologikAnalyzer.javaUT	iwRwRuxdUMsF+8-P@
 qZҬG3l-NItI=./.!'Ke{xOHD`l	)0Ru#;"e)(˧`z1,)a
 ހ/fyMw "<Z Ka4@hh.9KbrS{zڌ \
rX.z~{Tt!Dv>AFDJ.-HMʶ9խVh]!Ӭyŵy4磩pwOj1v8=.P;	VZJIMvAx<$fŋf0Y?| \
t:i73/ۮ9ZZxQQWPJ|bg	mO9Mgкi~&4g1!|x \
I?/l-+].rAw0[< / \
:<)NLn;)u-PUL&4kއljIiu`i,oBU574 \
]uafϷŮj`{~?Rx-k/UFY?nOo@ z4 }cGA7  \
]ȟ}*ЏhÏD50PcCYUoH(A y!MyԩoWeVxdIu*} \
'Gx8TK@tS\nQoFIWĞmt7Mu]F!IVM-hn<J}I<O3g߇pB&EoD0[کIj6ցiR`
 1sNFpNyS.v(͉/{c]73n-DҿuH \
o!}Q֊r*pt<i]"neJr3*k>iQImyז;-ViNT'ސ \
=d֔$@H{f3HYi\dճV \
˗Y`i<eYH^H]F/q"p6!}iܝO;t &W \
]Tmrϟ4wlR'2yXq3_PK \
dCAmorfologik/UTNwRuxdPK \
dC-4$Emorfologik/MorfologikLemmatizer.javaUTZwRux \
dPKdChe0morfologik/MorphosyntacticTagsAttributeImpl.javaUTMwRuxdPKdCr{SF \
morfologik/MorfologikFilter.javaUTMwRuxdPK \
dC6E+58,morfologik/MorphosyntacticTagsAttribute.javaUTMwR \
uxdPKdCHw6/'morfologik/MorfologikFilterFactory.javaUTXwRuxdPK'dC8rL~
 "Amorfologik/MorfologikAnalyzer.javaUTiwRuxdPK



["SlovakLemmaAnalyzer.java" (SlovakLemmaAnalyzer.java)]

package org.apache.lucene.analysis.sk;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.lemma.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Slovak language.
 * <p>
 * Supports an external list of stopwords (words that will not be indexed at all). A \
                default set of stopwords
 * is used unless an alternative list is specified.
 * </p>
 *
 * <a name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating \
                CzechAnalyzer:
 * <ul>
 * <li>As of 3.1, words are stemmed with {@link SlovakStemFilter}
 * <li>As of 2.9, StopFilter preserves position increments
 * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
 * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public final class SlovakLemmaAnalyzer extends StopwordAnalyzerBase {

    /**
     * File containing default Slovak stopwords.
     */
    public final static String DEFAULT_STOPWORD_FILE = "stop-words.txt";
    private final CharArraySet stemExclusionSet;
    private final Dictionary dictionary;

    public enum Dictionary {

        DEFAULT("sk"), MLTEAST("mlteast-sk");

        private final String resource;

        private Dictionary(String resource) {
            this.resource = resource;
        }

        public String getResource() {
            return resource;
        }
    }

    /**
     * Returns an unmodifiable instance of the default stop words set.
     *
     * @return default stop words set.
     */
    public static CharArraySet getDefaultStopSet() {
        return SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /**
     * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class \
                accesses the static final
     * set the first time.;
     */
    private static class DefaultSetHolder {

        static final CharArraySet DEFAULT_STOP_SET = getStopSet();

        private static CharArraySet getStopSet() {
            try {
                return \
                WordlistLoader.getWordSet(IOUtils.getDecodingReader(SlovakLemmaAnalyzer.class,
                
                        DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", \
Version.LUCENE_CURRENT);  } catch (IOException ex) {
                // default set should always be present as it is part of the
                // distribution (JAR)
                throw new RuntimeException("Unable to load default stopword set");
            }
        }
    }

    /**
     * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
     *
     * @param matchVersion
     */
    public SlovakLemmaAnalyzer(Version matchVersion) {
        this(matchVersion, Dictionary.DEFAULT, \
SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET);  }

    /**
     * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
     *
     * @param matchVersion
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary) {
        this(matchVersion, dictionary, \
SlovakLemmaAnalyzer.DefaultSetHolder.DEFAULT_STOP_SET);  }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     * @param stopwords a stopword set
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, \
CharArraySet stopwords) {  this(matchVersion, dictionary, stopwords, \
CharArraySet.EMPTY_SET);  }

    /**
     * Builds an analyzer with the given stop words. If a non-empty stem exclusion \
                set is provided this
     * analyzer will add a {@link KeywordMarkerFilter} before stemming.
     *
     * @param matchVersion lucene compatibility version
     * @param dictionary dictionary resource
     * @param stopwords a stopword set
     * @param stemExclusionSet a set of terms not to be stemmed
     */
    public SlovakLemmaAnalyzer(Version matchVersion, Dictionary dictionary, \
CharArraySet stopwords,  CharArraySet stemExclusionSet) {
        super(matchVersion, stopwords);
        this.dictionary = dictionary;
        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
                matchVersion, stemExclusionSet));
    }

    /**
     * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used \
                to tokenize all the text
     * in the provided {@link Reader}.
     *
     * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} \
                built from a
     * {@link StandardTokenizer} filtered with
     * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} , and \
                {@link CzechStemFilter} (only
     * if version is >= LUCENE_31). If a version is >= LUCENE_31 and a stem exclusion \
                set is provided via
     * {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a {@link \
                KeywordMarkerFilter} is added
     * before {@link CzechStemFilter}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (matchVersion.onOrAfter(Version.LUCENE_31)) {
            if (!this.stemExclusionSet.isEmpty()) {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new MorfologikFilter(result, dictionary.getResource(), \
matchVersion);  }
        result = new ASCIIFoldingFilter(result);
        return new TokenStreamComponents(source, result);
    }
}



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org

[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic