Index: trunk/lucene-search-2.0/.classpath |
— | — | @@ -11,5 +11,6 @@ |
12 | 12 | <classpathentry kind="lib" path="lib/xmlrpc-server-3.0.jar"/> |
13 | 13 | <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> |
14 | 14 | <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/> |
| 15 | + <classpathentry kind="lib" path="lib/snowball.jar"/> |
15 | 16 | <classpathentry kind="output" path="bin"/> |
16 | 17 | </classpath> |
Index: trunk/lucene-search-2.0/lib/snowball.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/lucene-search-2.0/lib/snowball.jar |
___________________________________________________________________ |
Added: svn:mime-type |
| | + application/octet-stream |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -183,7 +183,7 @@ |
184 | 184 | // ================================== |
185 | 185 | // Tests with actual params :) |
186 | 186 | // ================================== |
187 | | - Analyzer analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("en"),Analyzers.getCustomFilterForLanguage("en")); |
| 187 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer("en"); |
188 | 188 | parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE); |
189 | 189 | q = parser.parseTwoPass("beans everyone",null); |
190 | 190 | assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString()); |
— | — | @@ -234,7 +234,7 @@ |
235 | 235 | assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString()); |
236 | 236 | |
237 | 237 | // Localization tests |
238 | | - analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("sr"),Analyzers.getCustomFilterForLanguage("sr")); |
| 238 | + analyzer = Analyzers.getSearcherAnalyzer("sr"); |
239 | 239 | parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE); |
240 | 240 | |
241 | 241 | q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE); |
— | — | @@ -243,7 +243,7 @@ |
244 | 244 | q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE); |
245 | 245 | assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString()); |
246 | 246 | |
247 | | - analyzer = Analyzers.getSearcherAnalyzer(Analyzers.getStemmerForLanguage("th"),Analyzers.getCustomFilterForLanguage("th")); |
| 247 | + analyzer = Analyzers.getSearcherAnalyzer("th"); |
248 | 248 | parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE); |
249 | 249 | |
250 | 250 | q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -74,12 +74,9 @@ |
75 | 75 | text = "{{IPstack|name = Hundai}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages."; |
76 | 76 | showTokens(text); |
77 | 77 | |
78 | | - //if(true) |
79 | | - // return; |
80 | | - |
81 | 78 | ArticlesParser ap = new ArticlesParser("./test-data/indexing-articles.test"); |
82 | 79 | ArrayList<TestArticle> articles = ap.getArticles(); |
83 | | - timeTest(articles); |
| 80 | + //timeTest(articles); |
84 | 81 | } |
85 | 82 | |
86 | 83 | static void timeTest(ArrayList<TestArticle> articles) throws IOException{ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | import org.apache.lucene.index.IndexWriter; |
22 | 22 | import org.apache.lucene.index.Term; |
23 | 23 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 24 | +import org.wikimedia.lsearch.analyzers.FilterFactory; |
24 | 25 | import org.wikimedia.lsearch.beans.Article; |
25 | 26 | import org.wikimedia.lsearch.beans.IndexReportCard; |
26 | 27 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
— | — | @@ -146,15 +147,14 @@ |
147 | 148 | if(maxFieldLength!=0) |
148 | 149 | writer.setMaxFieldLength(maxFieldLength); |
149 | 150 | |
150 | | - Class language = Analyzers.getStemmerForLanguage(langCode); |
151 | | - Class customFilter = Analyzers.getCustomFilterForLanguage(langCode); |
| 151 | + FilterFactory filters = new FilterFactory(langCode); |
152 | 152 | |
153 | 153 | for(IndexUpdateRecord rec : records){ |
154 | 154 | if(rec.doAdd()){ |
155 | 155 | if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec)) |
156 | 156 | continue; // don't add if delete/add are paired operations |
157 | 157 | IndexReportCard card = getReportCard(rec); |
158 | | - Object[] ret = makeDocumentAndAnalyzer(rec,language,customFilter); |
| 158 | + Object[] ret = makeDocumentAndAnalyzer(rec,filters); |
159 | 159 | Document doc = (Document) ret[0]; |
160 | 160 | Analyzer analyzer = (Analyzer) ret[1]; |
161 | 161 | try { |
— | — | @@ -264,7 +264,7 @@ |
265 | 265 | * @param languageAnalyzer |
266 | 266 | * @return array { document, analyzer } |
267 | 267 | */ |
268 | | - protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, Class languageAnalyzer, Class customFilter){ |
| 268 | + protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, FilterFactory filters){ |
269 | 269 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
270 | 270 | Document doc = new Document(); |
271 | 271 | Article article = rec.getArticle(); |
— | — | @@ -288,7 +288,7 @@ |
289 | 289 | if(article.isRedirect()) |
290 | 290 | text=""; // for redirects index only the title |
291 | 291 | |
292 | | - perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,languageAnalyzer,customFilter,global.getLanguage(rec.getIndexId().getDBname())); |
| 292 | + perFieldAnalyzer = Analyzers.getIndexerAnalyzer(text,filters); |
293 | 293 | |
294 | 294 | return new Object[] { doc, perFieldAnalyzer }; |
295 | 295 | } |
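The hunks above replace the two reflection handles (stemmer `Class`, custom filter `Class`) with a single `FilterFactory` threaded through document construction. A minimal sketch of the resulting call flow, assuming `rec`, `langCode` and `writer` are in scope as in the surrounding method; the `try` block's body is not shown in this hunk, and `IndexWriter.addDocument(Document, Analyzer)` is simply the stock Lucene 2.0 call such a loop would make:

```java
// One factory per update batch resolves stemmer + custom filter from the language code.
FilterFactory filters = new FilterFactory(langCode);
Object[] ret = makeDocumentAndAnalyzer(rec, filters);  // { Document, Analyzer }
Document doc = (Document) ret[0];
Analyzer analyzer = (Analyzer) ret[1];
writer.addDocument(doc, analyzer);                     // per-document analyzer, Lucene 2.0 API
```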
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/AliasFilter.java |
— | — | @@ -32,17 +32,12 @@ |
33 | 33 | * |
34 | 34 | * @param language |
35 | 35 | */ |
36 | | - public AliasFilter(Constructor language, TokenStream input, TokenStream duplicate){ |
| 36 | + public AliasFilter(FilterFactory filters, TokenStream input, TokenStream duplicate){ |
37 | 37 | this.input = input; |
38 | 38 | stemmer = null; |
39 | 39 | last = null; |
40 | | - if(language != null){ |
41 | | - try { |
42 | | - stemmer = (TokenStream) language.newInstance(new Object[] {duplicate}); |
43 | | - } catch (Exception e){ |
44 | | - log.error("Error making a tokenizer with a constructor "+language.getName()); |
45 | | - } |
46 | | - } |
| 40 | + if(filters.hasStemmer()) |
| 41 | + stemmer = filters.makeStemmer(duplicate); |
47 | 42 | } |
48 | 43 | |
49 | 44 | @Override |
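The constructor now asks the factory for a ready-made stemmer instead of invoking a reflected `Constructor` and catching its failures. A sketch of how it is driven (mirroring `LanguageAnalyzer.tokenStream` later in this patch; `ArrayTokens` is the project's existing list-backed `TokenStream`):

```java
// The primary stream passes through unchanged; the duplicate feeds the stemmer,
// so stemmed forms can be emitted as aliases of the original tokens.
TokenStream ts = new AliasFilter(new FilterFactory("en"),
        new ArrayTokens(tokens), new ArrayTokens(tokens));
```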
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java |
— | — | @@ -14,8 +14,8 @@ |
15 | 15 | public class QueryLanguageAnalyzer extends LanguageAnalyzer { |
16 | 16 | static org.apache.log4j.Logger log = Logger.getLogger(QueryLanguageAnalyzer.class); |
17 | 17 | |
18 | | - public QueryLanguageAnalyzer(Class languageClass, Class customFilter){ |
19 | | - super(languageClass,null,customFilter); |
| 18 | + public QueryLanguageAnalyzer(FilterFactory filters){ |
| 19 | + super(filters,null); |
20 | 20 | } |
21 | 21 | |
22 | 22 | /** |
— | — | @@ -33,6 +33,12 @@ |
34 | 34 | log.error("Invalid usage of QueryLanguageAnalyzer.tokenStream(String,Reader). Use tokenStream(String,String). Probably a bug in the software."); |
35 | 35 | return null; |
36 | 36 | } |
| 37 | + |
| 38 | + @Override |
| 39 | + public String toString() { |
| 40 | + return "QueryLanguageAnalyzer for "+filters.getLanguage(); |
| 41 | + } |
37 | 42 | |
38 | 43 | |
| 44 | + |
39 | 45 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -69,12 +69,12 @@ |
70 | 70 | |
71 | 71 | /** Analyzer for titles, for most languages just a plain |
72 | 72 | * wiki tokenizer (lowercase, unicode normalization), but |
73 | | - * no steeming or aliases. |
| 73 | + * no stemming or aliases. |
74 | 74 | * @param language |
75 | 75 | * @return |
76 | 76 | */ |
77 | | - public static Analyzer getTitleAnalyzer(Class language, Class customFilter){ |
78 | | - return new QueryLanguageAnalyzer(language,customFilter); |
| 77 | + public static Analyzer getTitleAnalyzer(FilterFactory filters){ |
| 78 | + return new QueryLanguageAnalyzer(filters); |
79 | 79 | } |
80 | 80 | |
81 | 81 | /** |
— | — | @@ -89,20 +89,20 @@ |
90 | 90 | * @param languageAnalyzer language filter class (e.g. PorterStemFilter) |
91 | 91 | * @return |
92 | 92 | */ |
93 | | - public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, Class languageFilter, Class customFilter, String langCode) { |
| 93 | + public static PerFieldAnalyzerWrapper getIndexerAnalyzer(String text, FilterFactory filters) { |
94 | 94 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
95 | 95 | // parse wiki-text to get categories |
96 | | - WikiTokenizer tokenizer = new WikiTokenizer(text,langCode); |
| 96 | + WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage()); |
97 | 97 | tokenizer.tokenize(); |
98 | 98 | ArrayList<String> categories = tokenizer.getCategories(); |
99 | 99 | |
100 | 100 | perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer()); |
101 | 101 | perFieldAnalyzer.addAnalyzer("contents", |
102 | | - new LanguageAnalyzer(languageFilter,tokenizer,customFilter)); |
| 102 | + new LanguageAnalyzer(filters,tokenizer)); |
103 | 103 | perFieldAnalyzer.addAnalyzer("category", |
104 | 104 | new CategoryAnalyzer(categories)); |
105 | 105 | perFieldAnalyzer.addAnalyzer("title", |
106 | | - getTitleAnalyzer(null,customFilter)); |
| 106 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
107 | 107 | |
108 | 108 | return perFieldAnalyzer; |
109 | 109 | } |
— | — | @@ -110,22 +110,28 @@ |
111 | 111 | public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){ |
112 | 112 | if(global == null) |
113 | 113 | global = GlobalConfiguration.getInstance(); |
114 | | - String langCode = global.getLanguage(iid.getDBname()); |
115 | | - return getSearcherAnalyzer(getStemmerForLanguage(langCode),getCustomFilterForLanguage(langCode)); |
| 114 | + return getSearcherAnalyzer(global.getLanguage(iid.getDBname())); |
| 115 | + |
116 | 116 | } |
117 | 117 | |
| 118 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){ |
| 119 | + return getSearcherAnalyzer(new FilterFactory(langCode)); |
| 120 | + } |
| 121 | + |
118 | 122 | /** |
119 | 123 | * Analyzer for search queries. Can be reused to parse many queries. |
120 | 124 | * |
121 | 125 | * @param text |
122 | 126 | * @return |
123 | 127 | */ |
124 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(Class languageFilter, Class customFilter) { |
| 128 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters) { |
125 | 129 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
126 | 130 | |
127 | | - perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(null,customFilter)); |
| 131 | + perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters)); |
128 | 132 | perFieldAnalyzer.addAnalyzer("contents", |
129 | | - new QueryLanguageAnalyzer(languageFilter,customFilter)); |
| 133 | + new QueryLanguageAnalyzer(filters)); |
| 134 | + perFieldAnalyzer.addAnalyzer("title", |
| 135 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
130 | 136 | |
131 | 137 | return perFieldAnalyzer; |
132 | 138 | } |
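Taken together, these hunks reduce every call site to a single language-code entry point: the wrapper's default and `"title"` analyzers come from `getTitleAnalyzer` (the `"title"` entry via `getNoStemmerFilterFactory()`), while `"contents"` gets the stemming `QueryLanguageAnalyzer`. A sketch mirroring the `WikiQueryParserTest` changes above; imports for `WikiQueryParser` and `NamespacePolicy` are omitted because their packages are not visible in this diff:

```java
// One language code replaces the old (stemmer Class, custom filter Class) pair.
PerFieldAnalyzerWrapper analyzer = Analyzers.getSearcherAnalyzer("en");
WikiQueryParser parser = new WikiQueryParser("contents", "main", analyzer,
        NamespacePolicy.LEAVE);
Query q = parser.parseTwoPass("beans everyone", null);
// q.toString() == "(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))
//                  (+title:beans^2.0 +title:everyone^2.0)"   (per the test above)
```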
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/LanguageAnalyzer.java |
— | — | @@ -42,23 +42,12 @@ |
43 | 43 | } |
44 | 44 | static org.apache.log4j.Logger log = Logger.getLogger(LanguageAnalyzer.class); |
45 | 45 | protected WikiTokenizer wikitokenizer = null; |
46 | | - protected Constructor language = null; |
47 | | - protected Constructor customFilter = null; |
| 46 | + protected FilterFactory filters; |
48 | 47 | |
49 | 48 | /** Make a new analyzer that processes input as: wikitokenizer -> customFilter -> languageStemmer */ |
50 | | - public LanguageAnalyzer(Class languageStemmer, WikiTokenizer wikitokenizer, Class customFilter){ |
| 49 | + public LanguageAnalyzer(FilterFactory filters, WikiTokenizer wikitokenizer){ |
51 | 50 | this.wikitokenizer = wikitokenizer; |
52 | | - try{ |
53 | | - if(languageStemmer != null) |
54 | | - language = languageStemmer.getConstructor(TokenStream.class); |
55 | | - if(customFilter != null) |
56 | | - this.customFilter = customFilter.getConstructor(TokenStream.class); |
57 | | - |
58 | | - } catch (SecurityException e) { |
59 | | - log.error("The constructor that takes TokenStream is hidden. Class: "+language.getClass().getCanonicalName()); |
60 | | - } catch (NoSuchMethodException e) { |
61 | | - log.error("The constructor that takes TokenStream is missing.Class: "+language.getClass().getCanonicalName()); |
62 | | - } |
| 51 | + this.filters = filters; |
63 | 52 | } |
64 | 53 | |
65 | 54 | /** |
— | — | @@ -69,10 +58,10 @@ |
70 | 59 | public TokenStream tokenStream(String fieldName, Reader reader) { |
71 | 60 | wikitokenizer.resetIterator(); |
72 | 61 | ArrayList<Token> tokens = wikitokenizer.getTokens(); |
73 | | - if(customFilter != null) |
| 62 | + if(filters.hasCustomFilter()) |
74 | 63 | tokens = applyCustomFilter(tokens); |
75 | 64 | |
76 | | - return new AliasFilter(language, |
| 65 | + return new AliasFilter(filters, |
77 | 66 | new ArrayTokens(tokens), new ArrayTokens(tokens)); |
78 | 67 | } |
79 | 68 | |
— | — | @@ -80,9 +69,9 @@ |
81 | 70 | * stop words, or in Thai to tokenize words properly. |
82 | 71 | */ |
83 | 72 | protected ArrayList<Token> applyCustomFilter(ArrayList<Token> tokens) { |
84 | | - if(customFilter != null){ |
| 73 | + if(filters.hasCustomFilter()){ |
85 | 74 | try { |
86 | | - TokenStream ts = (TokenStream) customFilter.newInstance(new Object[] {new ArrayTokens(tokens)}); |
| 75 | + TokenStream ts = filters.makeCustomFilter(new ArrayTokens(tokens)); |
87 | 76 | ArrayList<Token> filtered = new ArrayList<Token>(); |
88 | 77 | Token t; |
89 | 78 | while((t = ts.next())!=null) |
— | — | @@ -90,9 +79,14 @@ |
91 | 80 | |
92 | 81 | return filtered; |
93 | 82 | } catch (Exception e){ |
94 | | - log.error("Error applying custom filter "+customFilter.getName()); |
| 83 | + log.error("Error applying custom filter for "+filters.getLanguage()); |
95 | 84 | } |
96 | 85 | } |
97 | 86 | return tokens; |
98 | 87 | } |
| 88 | + |
| 89 | + @Override |
| 90 | + public String toString() { |
| 91 | + return "LanguageAnalyzer for "+filters.getLanguage(); |
| 92 | + } |
99 | 93 | } |
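For reference, a sketch of the chain this analyzer now assembles, assuming a pre-tokenized `WikiTokenizer` exactly as `getIndexerAnalyzer` builds one (the `Reader` argument is ignored; the stream is replayed from the tokenizer's token list):

```java
WikiTokenizer wt = new WikiTokenizer("'''[[Hypertext]]''' transfer protocol", "en");
wt.tokenize();
Analyzer a = new LanguageAnalyzer(new FilterFactory("en"), wt);
// wikitokenizer -> custom filter (if the language has one) -> AliasFilter(stemmer)
TokenStream ts = a.tokenStream("contents", null);
```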
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CJKFilter.java |
— | — | @@ -0,0 +1,61 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import java.util.LinkedList; |
| 5 | + |
| 6 | +import org.apache.lucene.analysis.Token; |
| 7 | +import org.apache.lucene.analysis.TokenFilter; |
| 8 | +import org.apache.lucene.analysis.TokenStream; |
| 9 | + |
| 10 | +/** |
| 11 | + * Simple CJK (Chinese, Japanese, Korean) token filter. Emits one token per CJK character. |
| 12 | + */ |
| 13 | + |
| 14 | +public final class CJKFilter extends TokenFilter { |
| 15 | + LinkedList<Token> buffer = new LinkedList<Token>(); |
| 16 | + |
| 17 | + public CJKFilter(TokenStream input) { |
| 18 | + super(input); |
| 19 | + } |
| 20 | + |
| 21 | + @Override |
| 22 | + public Token next() throws java.io.IOException { |
| 23 | + if(buffer.size()!=0) |
| 24 | + return buffer.removeFirst(); |
| 25 | + |
| 26 | + Token token = input.next(); |
| 27 | + if(token == null) |
| 28 | + return null; |
| 29 | + |
| 30 | + String text = token.termText(); |
| 31 | + |
| 32 | + int i,offset,len,c; |
| 33 | + for(i=0,offset=0,len=0;i<text.length();i++){ |
| 34 | + c = text.codePointAt(i); |
| 35 | + if(isCJKChar(c)){ |
| 36 | + if(len != 0) |
| 37 | + buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len)); |
| 38 | + offset = i+1; |
| 39 | + len = 0; |
| 40 | + buffer.add(new Token(text.substring(i,i+1),token.startOffset()+i,token.startOffset()+i+1)); |
| 41 | + } else |
| 42 | + len++; |
| 43 | + } |
| 44 | + if(len != 0 && len != text.length()) |
| 45 | + buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len)); |
| 46 | + |
| 47 | + if(buffer.size() == 0) |
| 48 | + return token; |
| 49 | + else |
| 50 | + return buffer.removeFirst(); |
| 51 | + } |
| 52 | + |
| 53 | + public final boolean isCJKChar(int c){ |
| 54 | + return (c >= 0x3040 && c <= 0x318f) || |
| 55 | + (c >= 0x3300 && c <= 0x337f) || |
| 56 | + (c >= 0x3400 && c <= 0x3d2d) || |
| 57 | + (c >= 0x4e00 && c <= 0x9fff) || |
| 58 | + (c >= 0xf900 && c <= 0xfaff) || |
| 59 | + (c >= 0xac00 && c <= 0xd7af); |
| 60 | + } |
| 61 | + |
| 62 | +} |
\ No newline at end of file |
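The filter splits each incoming token at CJK code points, emitting every CJK character as its own token while keeping non-CJK runs intact. A self-contained sketch (a hypothetical demo class, not part of the patch; `WhitespaceTokenizer` and the `Token` API are stock Lucene 2.0):

```java
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.wikimedia.lsearch.analyzers.CJKFilter;

public class CJKFilterDemo {
    public static void main(String[] args) throws Exception {
        TokenStream ts = new CJKFilter(
                new WhitespaceTokenizer(new StringReader("abc漢字def")));
        Token t;
        while ((t = ts.next()) != null)
            System.out.println(t.termText());
        // prints: abc, 漢, 字, def -- offsets stay relative to the source token
    }
}
```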
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -0,0 +1,133 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +import org.apache.lucene.analysis.PorterStemFilter; |
| 5 | +import org.apache.lucene.analysis.TokenFilter; |
| 6 | +import org.apache.lucene.analysis.TokenStream; |
| 7 | +import org.apache.lucene.analysis.de.GermanStemFilter; |
| 8 | +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| 9 | +import org.apache.lucene.analysis.th.ThaiWordFilter; |
| 10 | + |
| 11 | +/** |
| 12 | + * Make a language-dependent pair of filters. The custom filter is to be applied before the stemmer. |
| 13 | + * |
| 14 | + * @author rainman |
| 15 | + * |
| 16 | + */ |
| 17 | +public class FilterFactory { |
| 18 | + protected String lang; |
| 19 | + protected String snowballName = null; |
| 20 | + protected boolean useStemmer,useCustomFilter; |
| 21 | + protected Class stemmer = null; |
| 22 | + protected Class customFilter = null; |
| 23 | + |
| 24 | + protected FilterFactory noStemmerFilterFactory; |
| 25 | + |
| 26 | + public FilterFactory(String lang){ |
| 27 | + this.lang = lang; |
| 28 | + init(); |
| 29 | + noStemmerFilterFactory = new FilterFactory(lang,snowballName,false,useCustomFilter,null,customFilter); |
| 30 | + } |
| 31 | + |
| 32 | + public FilterFactory(String lang, String snowballName, boolean useStemmer, boolean useCustomFilter, Class stemmer, Class customFilter) { |
| 33 | + this.lang = lang; |
| 34 | + this.snowballName = snowballName; |
| 35 | + this.useStemmer = useStemmer; |
| 36 | + this.useCustomFilter = useCustomFilter; |
| 37 | + this.stemmer = stemmer; |
| 38 | + this.customFilter = customFilter; |
| 39 | + } |
| 40 | + |
| 41 | + public FilterFactory getNoStemmerFilterFactory() { |
| 42 | + return noStemmerFilterFactory; |
| 43 | + } |
| 44 | + |
| 45 | + protected void init(){ |
| 46 | + if(lang == null) |
| 47 | + lang = "en"; |
| 48 | + |
| 49 | + // figure out stemmer |
| 50 | + useStemmer = true; |
| 51 | + if(lang.equals("en")) |
| 52 | + snowballName = "English"; |
| 53 | + // stemmer = PorterStemFilter.class; -- 2x faster but less accurate |
| 54 | + else if(lang.equals("da")) |
| 55 | + snowballName = "Danish"; |
| 56 | + else if(lang.equals("nl")) |
| 57 | + snowballName = "Dutch"; |
| 58 | + else if(lang.equals("fi")) |
| 59 | + snowballName = "Finnish"; |
| 60 | + else if(lang.equals("de")) |
| 61 | + snowballName = "German"; |
| 62 | + else if(lang.equals("it")) |
| 63 | + snowballName = "Italian"; |
| 64 | + else if(lang.equals("no")) |
| 65 | + snowballName = "Norwegian"; |
| 66 | + else if(lang.equals("pt")) |
| 67 | + snowballName = "Portuguese"; |
| 68 | + else if(lang.equals("ru")) |
| 69 | + snowballName = "Russian"; |
| 70 | + else if(lang.equals("es")) |
| 71 | + snowballName = "Spanish"; |
| 72 | + else if(lang.equals("sv")) |
| 73 | + snowballName = "Swedish"; |
| 74 | + else if(lang.equals("eo")) |
| 75 | + stemmer = EsperantoStemFilter.class; |
| 76 | + else |
| 77 | + useStemmer = false; |
| 78 | + |
| 79 | + // figure out custom filter |
| 80 | + useCustomFilter = true; |
| 81 | + if(lang.equals("th")) |
| 82 | + customFilter = ThaiWordFilter.class; |
| 83 | + else if(lang.equals("sr")) |
| 84 | + customFilter = SerbianFilter.class; |
| 85 | + else if(lang.equals("zh") || lang.equals("cjk") || lang.equals("ja") || |
| 86 | + lang.equals("ko") || lang.equals("zh-classical") || lang.equals("zh-yue")) |
| 87 | + customFilter = CJKFilter.class; |
| 88 | + else |
| 89 | + useCustomFilter = false; |
| 90 | + |
| 91 | + } |
| 92 | + |
| 93 | + public TokenFilter makeStemmer(TokenStream in){ |
| 94 | + if(!useStemmer) |
| 95 | + return null; |
| 96 | + else if(snowballName != null) |
| 97 | + return new SnowballFilter(in,snowballName); |
| 98 | + else if(stemmer != null){ |
| 99 | + try { |
| 100 | + return (TokenFilter) stemmer.getConstructor(TokenStream.class).newInstance(in); |
| 101 | + } catch (Exception e) { |
| 102 | + e.printStackTrace(); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + return null; |
| 107 | + } |
| 108 | + |
| 109 | + public TokenFilter makeCustomFilter(TokenStream in){ |
| 110 | + if(!useCustomFilter) |
| 111 | + return null; |
| 112 | + else if(customFilter != null){ |
| 113 | + try { |
| 114 | + return (TokenFilter) customFilter.getConstructor(TokenStream.class).newInstance(in); |
| 115 | + } catch (Exception e) { |
| 116 | + e.printStackTrace(); |
| 117 | + } |
| 118 | + } |
| 119 | + |
| 120 | + return null; |
| 121 | + } |
| 122 | + |
| 123 | + public boolean hasStemmer(){ |
| 124 | + return useStemmer; |
| 125 | + } |
| 126 | + |
| 127 | + public boolean hasCustomFilter(){ |
| 128 | + return useCustomFilter; |
| 129 | + } |
| 130 | + |
| 131 | + public String getLanguage(){ |
| 132 | + return lang; |
| 133 | + } |
| 134 | +} |
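This factory centralizes what was previously split between `Analyzers.getStemmerForLanguage`, `getCustomFilterForLanguage`, and per-call reflection. A self-contained sketch of what it resolves per language (a hypothetical demo class, not part of the patch; the snowball stemmers need the `lib/snowball.jar` added by this changeset):

```java
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.wikimedia.lsearch.analyzers.FilterFactory;

public class FilterFactoryDemo {
    public static void main(String[] args) {
        FilterFactory en = new FilterFactory("en");  // snowball "English" stemmer
        FilterFactory sr = new FilterFactory("sr");  // SerbianFilter, no stemmer
        FilterFactory xx = new FilterFactory("xx");  // unknown code: neither

        System.out.println(en.hasStemmer() + " " + en.hasCustomFilter()); // true false
        System.out.println(sr.hasStemmer() + " " + sr.hasCustomFilter()); // false true
        System.out.println(xx.hasStemmer() + " " + xx.hasCustomFilter()); // false false

        // Wraps the input in a SnowballFilter("English"); returns null for languages
        // without a stemmer, which is why callers check hasStemmer() first.
        TokenStream stemmed = en.makeStemmer(
                new WhitespaceTokenizer(new StringReader("beans")));
    }
}
```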
Index: trunk/lucene-search-2.0/mwsearch-global.conf |
— | — | @@ -8,14 +8,14 @@ |
9 | 9 | # language <languageCode> |
10 | 10 | # databases can be written as {url}, where url contains a list of dbs |
11 | 11 | [Database] |
12 | | -wikilucene : (mainsplit) (language,en) |
| 12 | +wikilucene : (single) (language,en) |
13 | 13 | |
14 | 14 | # Search groups |
15 | 15 | # Index parts of a split index are always taken from the node's group |
16 | 16 | # host : db1.part db2.part |
17 | 17 | # Multiple hosts can search multiple dbs (N-N mapping) |
18 | 18 | [Search-Group] |
19 | | -oblak : wikilucene.mainpart wikilucene.restpart |
| 19 | +oblak : wikilucene |
20 | 20 | |
21 | 21 | # Index nodes |
22 | 22 | # host: db1.part db2.part |
Index: trunk/lucene-search-2.0/mwsearch.conf |
— | — | @@ -6,7 +6,7 @@ |
7 | 7 | |
8 | 8 | # URL to global configuration, this is the shared main config file, it can |
9 | 9 | # be on a NFS partition or available somewhere on the network |
10 | | -MWConfig.global=http://localhost/wiki-lucene/lucene-search/mwsearch-global.conf |
| 10 | +MWConfig.global=http://localhost/lucene-search-2.0/mwsearch-global.conf |
11 | 11 | |
12 | 12 | ################################################ |
13 | 13 | # Search node related configuration |
— | — | @@ -59,7 +59,7 @@ |
60 | 60 | RecentUpdateDaemon.hostspareTcpPort=8112 |
61 | 61 | |
62 | 62 | # Log configuration |
63 | | -#Logging.logconfig=/etc/mwsearch.log4j |
| 63 | +Logging.logconfig=/etc/mwsearch.log4j |
64 | 64 | |
65 | 65 | # Set debug to true to diagnose problems with log4j configuration |
66 | 66 | Logging.debug=false |
Index: trunk/lucene-search-2.0/build.xml |
— | — | @@ -30,7 +30,7 @@ |
31 | 31 | <jar destfile="${basedir}/MWSearch.jar"> |
32 | 32 | <manifest> |
33 | 33 | <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/> |
34 | | - <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar"/> |
| 34 | + <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar"/> |
35 | 35 | </manifest> |
36 | 36 | <zipfileset dir="${bin}" prefix=""> |
37 | 37 | <include name="org/**"/> |