r21739 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r21738 | r21739 | r21740 >
Date: 00:25, 1 May 2007
Author: rainman
Status: old
Tags:
Comment:
Added more analyzers from the Snowball project. The Snowball Porter stemmer is 2x slower than Lucene's port, but gives more accurate results. Added support for CJK. Refactored the analyzers code.
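
In practice the refactoring collapses the old two-argument analyzer lookup into a single language code, with FilterFactory doing the per-language resolution. A sketch of the before/after call pattern, using only names that appear in this revision's diff (not a new API):

    // Before r21739: callers resolved stemmer and custom-filter classes
    // themselves and passed them to the analyzer factory:
    //   Analyzer analyzer = Analyzers.getSearcherAnalyzer(
    //           Analyzers.getStemmerForLanguage("en"),
    //           Analyzers.getCustomFilterForLanguage("en"));

    // After r21739: a language code is enough; FilterFactory resolves the
    // Snowball stemmer and any custom filter internally.
    Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
    WikiQueryParser parser = new WikiQueryParser("contents", "main",
            analyzer, NamespacePolicy.LEAVE);
    Query q = parser.parseTwoPass("beans everyone", null);
    // q now contains stemmed aliases such as contents:bean^0.5 alongside
    // contents:beans (see the test expectations below).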
Modified paths:
  • /trunk/lucene-search-2.0/.classpath (modified) (history)
  • /trunk/lucene-search-2.0/build.xml (modified) (history)
  • /trunk/lucene-search-2.0/lib/snowball.jar (added) (history)
  • /trunk/lucene-search-2.0/mwsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/mwsearch.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/AliasFilter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)

Diff

Index: trunk/lucene-search-2.0/.classpath
@@ -11,5 +11,6 @@
1212 <classpathentry kind="lib" path="lib/xmlrpc-server-3.0.jar"/>
1313 <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
1414 <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/>
 15+ <classpathentry kind="lib" path="lib/snowball.jar"/>
1516 <classpathentry kind="output" path="bin"/>
1617 </classpath>
Index: trunk/lucene-search-2.0/lib/snowball.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: trunk/lucene-search-2.0/lib/snowball.jar
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -183,7 +183,7 @@
184184 // ==================================
185185 // Tests with actual params :)
186186 // ==================================
187 - Analyzer analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("en"),Analyzers.getCustomFilterForLanguage("en"));
 187+ Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
188188 parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
189189 q = parser.parseTwoPass("beans everyone",null);
190190 assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString());
@@ -234,7 +234,7 @@
235235 assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString());
236236
237237 // Localization tests
238 - analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("sr"),Analyzers.getCustomFilterForLanguage("sr"));
 238+ analyzer = Analyzers.getSearcherAnalyzer("sr");
239239 parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
240240
241241 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
@@ -243,7 +243,7 @@
244244 q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE);
245245 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString());
246246
247 - analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("th"),Analyzers.getCustomFilterForLanguage("th"));
 247+ analyzer = Analyzers.getSearcherAnalyzer("th");
248248 parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
249249
250250 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -74,12 +74,9 @@
7575 text = "{{IPstack|name = Hundai}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
7676 showTokens(text);
7777
78 - //if(true)
79 - // return;
80 -
8178 ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test");
8279 ArrayList<TestArticle> articles = ap.getArticles();
83 - timeTest(articles);
 80+ //timeTest(articles);
8481 }
8582
8683 static void timeTest(ArrayList<TestArticle> articles) throws IOException{
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -20,6 +20,7 @@
2121 import org.apache.lucene.index.IndexWriter;
2222 import org.apache.lucene.index.Term;
2323 import org.wikimedia.lsearch.analyzers.Analyzers;
 24+import org.wikimedia.lsearch.analyzers.FilterFactory;
2425 import org.wikimedia.lsearch.beans.Article;
2526 import org.wikimedia.lsearch.beans.IndexReportCard;
2627 import org.wikimedia.lsearch.config.GlobalConfiguration;
@@ -146,15 +147,14 @@
147148 if(maxFieldLength!=0)
148149 writer.setMaxFieldLength(maxFieldLength);
149150
150 - Class language = Analyzers.getStemmerForLanguage(langCode);
151 - Class customFilter = Analyzers.getCustomFilterForLanguage(langCode);
 151+ FilterFactory filters = new FilterFactory(langCode);
152152
153153 for(IndexUpdateRecord rec : records){
154154 if(rec.doAdd()){
155155 if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec))
156156 continue; // don't add if delete/add are paired operations
157157 IndexReportCard card = getReportCard(rec);
158 - Object[] ret = makeDocumentAndAnalyzer(rec,language,customFilter);
 158+ Object[] ret = makeDocumentAndAnalyzer(rec,filters);
159159 Document doc = (Document) ret[0];
160160 Analyzer analyzer = (Analyzer) ret[1];
161161 try {
@@ -264,7 +264,7 @@
265265 * @param languageAnalyzer
266266 * @return array { document, analyzer }
267267 */
268 - protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, Class languageAnalyzer, Class customFilter){
 268+ protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, FilterFactory filters){
269269 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
270270 Document doc = new Document();
271271 Article article = rec.getArticle();
@@ -288,7 +288,7 @@
289289 if(article.isRedirect())
290290 text=""; // for redirects index only the title
291291
292 - perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,languageAnalyzer,customFilter,global.getLanguage(rec.getIndexId().getDBname()));
 292+ perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,filters);
293293
294294 return new Object[] { doc, perFieldAnalyzer };
295295 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/AliasFilter.java
@@ -32,17 +32,12 @@
3333 *
3434 * @param language
3535 */
36 - public AliasFilter(Constructor language, TokenStream input, TokenStream duplicate){
 36+ public AliasFilter(FilterFactory filters, TokenStream input, TokenStream duplicate){
3737 this.input = input;
3838 stemmer = null;
3939 last = null;
40 - if(language != null){
41 - try {
42 - stemmer = (TokenStream) language.newInstance(new Object[] {duplicate});
43 - } catch (Exception e){
44 - log.error("Error making a tokenizer with a constructor "+language.getName());
45 - }
46 - }
 40+ if(filters.hasStemmer())
 41+ stemmer = filters.makeStemmer(duplicate);
4742 }
4843
4944 @Override
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java
@@ -14,8 +14,8 @@
1515 public class QueryLanguageAnalyzer extends LanguageAnalyzer {
1616 static org.apache.log4j.Logger log = Logger.getLogger(QueryLanguageAnalyzer.class);
1717
18 - public QueryLanguageAnalyzer(Class languageClass, Class customFilter){
19 - super(languageClass,null,customFilter);
 18+ public QueryLanguageAnalyzer(FilterFactory filters){
 19+ super(filters,null);
2020 }
2121
2222 /**
@@ -33,6 +33,12 @@
3434 log.error("Invalid usage of QueryLanguageAnalyzer.tokenStream(String,Reader). Use tokenStream(String,String). Probably bug in the software. ");
3535 return null;
3636 }
 37+
 38+ @Override
 39+ public String toString() {
 40+ return "QueryLanguageAnalyzer for "+filters.getLanguage();
 41+ }
3742
3843
 44+
3945 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -69,12 +69,12 @@
7070
7171 /** Analyzer for titles, for most languages just a plain
7272 * wiki tokenizer (lowercase, unicode normalization), but
73 - * no steeming or aliases.
 73+ * no stemming or aliases.
7474 * @param language
7575 * @return
7676 */
77 - public static Analyzer getTitleAnalyzer(Class language, Class customFilter){
78 - return new QueryLanguageAnalyzer(language,customFilter);
 77+ public static Analyzer getTitleAnalyzer(FilterFactory filters){
 78+ return new QueryLanguageAnalyzer(filters);
7979 }
8080
8181 /**
@@ -89,20 +89,20 @@
9090 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
9191 * @return
9292 */
93 - public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, Class languageFilter, Class customFilter, String langCode) {
 93+ public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, FilterFactory filters) {
9494 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
9595 // parse wiki-text to get categories
96 - WikiTokenizer tokenizer = new WikiTokenizer(text,langCode);
 96+ WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage());
9797 tokenizer.tokenize();
9898 ArrayList<String> categories = tokenizer.getCategories();
9999
100100 perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
101101 perFieldAnalyzer.addAnalyzer("contents",
102 - new LanguageAnalyzer(languageFilter,tokenizer,customFilter));
 102+ new LanguageAnalyzer(filters,tokenizer));
103103 perFieldAnalyzer.addAnalyzer("category",
104104 new CategoryAnalyzer(categories));
105105 perFieldAnalyzer.addAnalyzer("title",
106 - getTitleAnalyzer(null,customFilter));
 106+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
107107
108108 return perFieldAnalyzer;
109109 }
@@ -110,22 +110,28 @@
111111 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
112112 if(global == null)
113113 global = GlobalConfiguration.getInstance();
114 - String langCode = global.getLanguage(iid.getDBname());
115 - return getSearcherAnalyzer(getStemmerForLanguage(langCode),getCustomFilterForLanguage(langCode));
 114+ return getSearcherAnalyzer(global.getLanguage(iid.getDBname()));
 115+
116116 }
117117
 118+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){
 119+ return getSearcherAnalyzer(new FilterFactory(langCode));
 120+ }
 121+
118122 /**
119123 * Analyzer for search queries. Can be reused to parse many queries.
120124 *
121125 * @param text
122126 * @return
123127 */
124 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(Class languageFilter, Class customFilter) {
 128+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters) {
125129 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
126130
127 - perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(null,customFilter));
 131+ perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters));
128132 perFieldAnalyzer.addAnalyzer("contents",
129 - new QueryLanguageAnalyzer(languageFilter,customFilter));
 133+ new QueryLanguageAnalyzer(filters));
 134+ perFieldAnalyzer.addAnalyzer("title",
 135+ getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
130136
131137 return perFieldAnalyzer;
132138 }
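
For reviewers tracing the per-field wiring, the searcher-side wrapper after this hunk routes fields as follows (a comment-only summary of the hunk above, assuming Lucene 2.0 PerFieldAnalyzerWrapper semantics, where the constructor argument is the default analyzer):

    // Field -> analyzer routing in getSearcherAnalyzer(FilterFactory):
    //   (default)  -> getTitleAnalyzer(filters)           wiki tokenization
    //                                                     with the language's filters
    //   "contents" -> new QueryLanguageAnalyzer(filters)  custom filter plus
    //                                                     stemmed aliases
    //   "title"    -> getTitleAnalyzer(filters.getNoStemmerFilterFactory())
    //                                                     same, but never stemmed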
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java
@@ -42,23 +42,12 @@
4343 }
4444 static org.apache.log4j.Logger log = Logger.getLogger(LanguageAnalyzer.class);
4545 protected WikiTokenizer wikitokenizer = null;
46 - protected Constructor language = null;
47 - protected Constructor customFilter = null;
 46+ protected FilterFactory filters;
4847
4948 /** Make a new analyzer that process input as: wikitokenizer -> customFilter -> languageStemmer */
50 - public LanguageAnalyzer(Class languageStemmer, WikiTokenizer wikitokenizer, Class customFilter){
 49+ public LanguageAnalyzer(FilterFactory filters, WikiTokenizer wikitokenizer){
5150 this.wikitokenizer = wikitokenizer;
52 - try{
53 - if(languageStemmer != null)
54 - language = languageStemmer.getConstructor(TokenStream.class);
55 - if(customFilter != null)
56 - this.customFilter = customFilter.getConstructor(TokenStream.class);
57 -
58 - } catch (SecurityException e) {
59 - log.error("The constructor that takes TokenStream is hidden. Class: "+language.getClass().getCanonicalName());
60 - } catch (NoSuchMethodException e) {
61 - log.error("The constructor that takes TokenStream is missing.Class: "+language.getClass().getCanonicalName());
62 - }
 51+ this.filters = filters;
6352 }
6453
6554 /**
@@ -69,10 +58,10 @@
7059 public TokenStream tokenStream(String fieldName, Reader reader) {
7160 wikitokenizer.resetIterator();
7261 ArrayList<Token> tokens = wikitokenizer.getTokens();
73 - if(customFilter != null)
 62+ if(filters.hasCustomFilter())
7463 tokens = applyCustomFilter(tokens);
7564
76 - return new AliasFilter(language,
 65+ return new AliasFilter(filters,
7766 new ArrayTokens(tokens), new ArrayTokens(tokens));
7867 }
7968
@@ -80,9 +69,9 @@
8170 * stop words, or in Thai to tokenize words properly.
8271 */
8372 protected ArrayList<Token> applyCustomFilter(ArrayList<Token> tokens) {
84 - if(customFilter != null){
 73+ if(filters.hasCustomFilter()){
8574 try {
86 - TokenStream ts = (TokenStream) customFilter.newInstance(new Object[] {new ArrayTokens(tokens)});
 75+ TokenStream ts = filters.makeCustomFilter(new ArrayTokens(tokens));
8776 ArrayList<Token> filtered = new ArrayList<Token>();
8877 Token t;
8978 while((t = ts.next())!=null)
@@ -90,9 +79,14 @@
9180
9281 return filtered;
9382 } catch (Exception e){
94 - log.error("Error applying custom filter "+customFilter.getName());
 83+ log.error("Error applying custom filter for "+filters.getLanguage());
9584 }
9685 }
9786 return tokens;
9887 }
 88+
 89+ @Override
 90+ public String toString() {
 91+ return "LanguageAnalyzer for "+filters.getLanguage();
 92+ }
9993 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java
@@ -0,0 +1,61 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import java.util.LinkedList;
 5+
 6+import org.apache.lucene.analysis.Token;
 7+import org.apache.lucene.analysis.TokenFilter;
 8+import org.apache.lucene.analysis.TokenStream;
 9+
 10+/**
 11+ * Simple CJK (Chinese Japanese Korean) token filter. One CJK symbol per token.
 12+ */
 13+
 14+public final class CJKFilter extends TokenFilter {
 15+ LinkedList<Token> buffer = new LinkedList<Token>();
 16+
 17+ public CJKFilter(TokenStream input) {
 18+ super(input);
 19+ }
 20+
 21+ @Override
 22+ public Token next() throws java.io.IOException {
 23+ if(buffer.size()!=0)
 24+ return buffer.removeFirst();
 25+
 26+ Token token = input.next();
 27+ if(token == null)
 28+ return null;
 29+
 30+ String text = token.termText();
 31+
 32+ int i,offset,len,c;
 33+ for(i=0,offset=0,len=0;i<text.length();i++){
 34+ c = text.codePointAt(i);
 35+ if(isCJKChar(c)){
 36+ if(len != 0)
 37+ buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
 38+ offset = i+1;
 39+ len = 0;
 40+ buffer.add(new Token(text.substring(i,i+1),token.startOffset()+i,token.startOffset()+i+1));
 41+ } else
 42+ len++;
 43+ }
 44+ if(len != 0 && len != text.length())
 45+ buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
 46+
 47+ if(buffer.size() == 0)
 48+ return token;
 49+ else
 50+ return buffer.removeFirst();
 51+ }
 52+
 53+ public final boolean isCJKChar(int c){
 54+ return (c >= 0x3040 && c <= 0x318f) ||
 55+ (c >= 0x3300 && c <= 0x337f) ||
 56+ (c >= 0x3400 && c <= 0x3d2d) ||
 57+ (c >= 0x4e00 && c <= 0x9fff) ||
 58+ (c >= 0xf900 && c <= 0xfaff) ||
 59+ (c >= 0xac00 && c <= 0xd7af);
 60+ }
 61+
 62+}
\ No newline at end of file
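
The new filter emits one token per CJK code point and keeps any intervening non-CJK runs as their own tokens, with start/end offsets adjusted accordingly. A quick illustrative harness (the WhitespaceTokenizer input and the sample string are assumptions of this note, not part of the revision; the Token API is the Lucene 2.0 one used above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    TokenStream ts = new CJKFilter(
            new WhitespaceTokenizer(new StringReader("abc漢字de")));
    Token t;
    while ((t = ts.next()) != null)
        System.out.println(t.termText()
                + " [" + t.startOffset() + "," + t.endOffset() + ")");
    // Prints, one CJK symbol per token:
    //   abc [0,3)
    //   漢 [3,4)
    //   字 [4,5)
    //   de [5,7)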
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -0,0 +1,133 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+import org.apache.lucene.analysis.PorterStemFilter;
 5+import org.apache.lucene.analysis.TokenFilter;
 6+import org.apache.lucene.analysis.TokenStream;
 7+import org.apache.lucene.analysis.de.GermanStemFilter;
 8+import org.apache.lucene.analysis.snowball.SnowballFilter;
 9+import org.apache.lucene.analysis.th.ThaiWordFilter;
 10+
 11+/**
 12+ * Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer.
 13+ *
 14+ * @author rainman
 15+ *
 16+ */
 17+public class FilterFactory {
 18+ protected String lang;
 19+ protected String snowballName = null;
 20+ protected boolean useStemmer,useCustomFilter;
 21+ protected Class stemmer = null;
 22+ protected Class customFilter = null;
 23+
 24+ protected FilterFactory noStemmerFilterFactory;
 25+
 26+ public FilterFactory(String lang){
 27+ this.lang = lang;
 28+ init();
 29+ noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useCustomFilter,null,customFilter);
 30+ }
 31+
 32+ public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class stemmer, Class customFilter) {
 33+ this.lang = lang;
 34+ this.snowballName = snowballName;
 35+ this.useStemmer = useStemmer;
 36+ this.useCustomFilter = useCustomFilter;
 37+ this.stemmer = stemmer;
 38+ this.customFilter = customFilter;
 39+ }
 40+
 41+ public FilterFactory getNoStemmerFilterFactory() {
 42+ return noStemmerFilterFactory;
 43+ }
 44+
 45+ protected void init(){
 46+ if(lang == null)
 47+ lang = "en";
 48+
 49+ // figure out stemmer
 50+ useStemmer = true;
 51+ if(lang.equals("en"))
 52+ snowballName = "English";
 53+ // stemmer = PorterStemFilter.class; -- 2x faster but less accurate
 54+ else if(lang.equals("da"))
 55+ snowballName = "Danish";
 56+ else if(lang.equals("nl"))
 57+ snowballName = "Dutch";
 58+ else if(lang.equals("fi"))
 59+ snowballName = "Finnish";
 60+ else if(lang.equals("de"))
 61+ snowballName = "German";
 62+ else if(lang.equals("it"))
 63+ snowballName = "Italian";
 64+ else if(lang.equals("no"))
 65+ snowballName = "Norwegian";
 66+ else if(lang.equals("pt"))
 67+ snowballName = "Portuguese";
 68+ else if(lang.equals("ru"))
 69+ snowballName = "Russian";
 70+ else if(lang.equals("es"))
 71+ snowballName = "Spanish";
 72+ else if(lang.equals("sv"))
 73+ snowballName = "Swedish";
 74+ else if(lang.equals("eo"))
 75+ stemmer = EsperantoStemFilter.class;
 76+ else
 77+ useStemmer = false;
 78+
 79+ // figure out custom filter
 80+ useCustomFilter = true;
 81+ if(lang.equals("th"))
 82+ customFilter = ThaiWordFilter.class;
 83+ else if(lang.equals("sr"))
 84+ customFilter = SerbianFilter.class;
 85+ else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") ||
 86+ lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue"))
 87+ customFilter = CJKFilter.class;
 88+ else
 89+ useCustomFilter = false;
 90+
 91+ }
 92+
 93+ public TokenFilter makeStemmer(TokenStream in){
 94+ if(!useStemmer)
 95+ return null;
 96+ else if(snowballName != null)
 97+ return new SnowballFilter(in,snowballName);
 98+ else if(stemmer != null){
 99+ try {
 100+ return (TokenFilter) stemmer.getConstructor(TokenStream.class).newInstance(in);
 101+ } catch (Exception e) {
 102+ e.printStackTrace();
 103+ }
 104+ }
 105+
 106+ return null;
 107+ }
 108+
 109+ public TokenFilter makeCustomFilter(TokenStream in){
 110+ if(!useCustomFilter)
 111+ return null;
 112+ else if(customFilter != null){
 113+ try {
 114+ return (TokenFilter) customFilter.getConstructor(TokenStream.class).newInstance(in);
 115+ } catch (Exception e) {
 116+ e.printStackTrace();
 117+ }
 118+ }
 119+
 120+ return null;
 121+ }
 122+
 123+ public boolean hasStemmer(){
 124+ return useStemmer;
 125+ }
 126+
 127+ public boolean hasCustomFilter(){
 128+ return useCustomFilter;
 129+ }
 130+
 131+ public String getLanguage(){
 132+ return lang;
 133+ }
 134+}
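
Taken together, the factory gives each language an optional stemmer and an optional custom filter, replacing the reflection-heavy Class/Constructor plumbing removed above. A short usage sketch against the API in this hunk (someTokens is a placeholder TokenStream, not from the revision):

    FilterFactory en = new FilterFactory("en");  // Snowball "English" stemmer,
                                                 // no custom filter
    FilterFactory th = new FilterFactory("th");  // no stemmer, ThaiWordFilter
                                                 // as custom filter

    TokenFilter stem = en.makeStemmer(someTokens);       // SnowballFilter(in, "English")
    TokenFilter thai = th.makeCustomFilter(someTokens);  // ThaiWordFilter(in) via reflection

    // Title fields must not be stemmed, so Analyzers asks for the pre-built
    // no-stemmer variant; custom-filter settings are preserved:
    FilterFactory titleEn = en.getNoStemmerFilterFactory();
    // titleEn.hasStemmer() == false, titleEn.hasCustomFilter() unchanged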
Index: trunk/lucene-search-2.0/mwsearch-global.conf
@@ -8,14 +8,14 @@
99 # language <languageCode>
1010 # databases can be writen as {url}, where url contains list of dbs
1111 [Database]
12 -wikilucene : (mainsplit) (language,en)
 12+wikilucene : (single) (language,en)
1313
1414 # Search groups
1515 # Index parts of a split index are always taken from the node's group
1616 # host : db1.part db2.part
1717 # Mulitple hosts can search multiple dbs (N-N mapping)
1818 [Search-Group]
19 -oblak : wikilucene.mainpart wikilucene.restpart
 19+oblak : wikilucene
2020
2121 # Index nodes
2222 # host: db1.part db2.part
Index: trunk/lucene-search-2.0/mwsearch.conf
@@ -6,7 +6,7 @@
77
88 # URL to global configuration, this is the shared main config file, it can
99 # be on a NFS partition or available somewhere on the network
10 -MWConfig.global=http://localhost/wiki-lucene/lucene-search/mwsearch-global.conf
 10+MWConfig.global=http://localhost/lucene-search-2.0/mwsearch-global.conf
1111
1212 ################################################
1313 # Search node related configuration
@@ -59,7 +59,7 @@
6060 RecentUpdateDaemon.hostspareTcpPort=8112
6161
6262 # Log configuration
63 -#Logging.logconfig=/etc/mwsearch.log4j
 63+Logging.logconfig=/etc/mwsearch.log4j
6464
6565 # Set debug to true to diagnose problems with log4j configuration
6666 Logging.debug=false
Index: trunk/lucene-search-2.0/build.xml
@@ -30,7 +30,7 @@
3131 <jar destfile="${basedir}/MWSearch.jar">
3232 <manifest>
3333 <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/>
34 - <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar"/>
 34+ <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar"/>
3535 </manifest>
3636 <zipfileset dir="${bin}" prefix="">
3737 <include name="org/**"/>