Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java |
— | — | @@ -231,6 +231,8 @@ |
232 | 232 | terms = new WordTerms("./lib/dict/german.txt.gz"); |
233 | 233 | else if(lang.equals("fr")) |
234 | 234 | terms = new WordTerms("./lib/dict/french.txt.gz"); |
| 235 | + else if(lang.equals("sample")) |
| 236 | + terms = new SampleTerms(); |
235 | 237 | else |
236 | 238 | terms = new WordTerms("./test-data/words-wikilucene.ngram.gz"); |
237 | 239 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -8,6 +8,7 @@ |
9 | 9 | import org.apache.lucene.analysis.Analyzer; |
10 | 10 | import org.apache.lucene.document.Document; |
11 | 11 | import org.apache.lucene.index.IndexWriter; |
| 12 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
12 | 13 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
13 | 14 | import org.wikimedia.lsearch.beans.Article; |
14 | 15 | import org.wikimedia.lsearch.beans.IndexReportCard; |
— | — | @@ -27,7 +28,7 @@ |
28 | 29 | static Logger log = Logger.getLogger(SimpleIndexWriter.class); |
29 | 30 | protected IndexId iid; |
30 | 31 | protected HashMap<String,IndexWriter> indexes; |
31 | | - protected FilterFactory filters; |
| 32 | + protected FieldBuilder builder; |
32 | 33 | protected Boolean optimize; |
33 | 34 | protected Integer mergeFactor, maxBufDocs; |
34 | 35 | protected boolean newIndex; |
— | — | @@ -39,8 +40,9 @@ |
40 | 41 | this.mergeFactor = mergeFactor; |
41 | 42 | this.maxBufDocs = maxBufDocs; |
42 | 43 | this.newIndex = newIndex; |
43 | | - langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
44 | | - filters = new FilterFactory(langCode); |
| 44 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 45 | + langCode = global.getLanguage(iid.getDBname()); |
| 46 | + builder = new FieldBuilder(langCode,global.exactCaseIndex(iid.getDBname())); |
45 | 47 | indexes = new HashMap<String,IndexWriter>(); |
46 | 48 | // open all relevant indexes |
47 | 49 | if(iid.isSingle()) |
— | — | @@ -106,7 +108,7 @@ |
107 | 109 | IndexWriter writer = indexes.get(target.toString()); |
108 | 110 | if(writer == null) |
109 | 111 | return; |
110 | | - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters,iid); |
| 112 | + Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid); |
111 | 113 | Document doc = (Document) ret[0]; |
112 | 114 | Analyzer analyzer = (Analyzer) ret[1]; |
113 | 115 | try { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -134,7 +134,6 @@ |
135 | 135 | System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end)); |
136 | 136 | System.out.println("Total time: "+formatTime(finalEnd-start)); |
137 | 137 | } |
138 | | - |
139 | 138 | // make snapshot if needed |
140 | 139 | if(makeSnapshot || snapshotDb){ |
141 | 140 | IndexId iid = IndexId.get(dbname); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -69,6 +69,8 @@ |
70 | 70 | protected String[] databaseSuffixes = null; |
71 | 71 | /** Databases ending in suffix will use additional keyword scores */ |
72 | 72 | protected String[] keywordScoringSuffixes = null; |
| 73 | + /** Databases ending in suffix will have 2 indexes, one with lowercased words, and one with exact case words */ |
| 74 | + protected String[] exactCaseSuffix = null; |
73 | 75 | |
74 | 76 | protected Properties globalProperties = null; |
75 | 77 | |
— | — | @@ -290,6 +292,7 @@ |
291 | 293 | // get some predifined global properties |
292 | 294 | this.databaseSuffixes = getArrayProperty("Database.suffix"); |
293 | 295 | this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix"); |
| 296 | + this.exactCaseSuffix = getArrayProperty("ExactCase.suffix"); |
294 | 297 | if(line == null) |
295 | 298 | break; |
296 | 299 | // else: line points to beginning of next section |
— | — | @@ -457,6 +460,7 @@ |
458 | 461 | mySearch, |
459 | 462 | oairepo); |
460 | 463 | indexIdPool.put(dbrole,iid); |
| 464 | + |
461 | 465 | } |
462 | 466 | if(indexIdPool.get(dbname).isNssplit()) |
463 | 467 | indexIdPool.get(dbname).rebuildNsMap(indexIdPool); |
— | — | @@ -831,17 +835,12 @@ |
832 | 836 | return namespacePrefixAll; |
833 | 837 | } |
834 | 838 | |
835 | | - /** Returns if keyword scoring should be used for this db, using |
836 | | - * the suffixes from the global configuration |
837 | | - * |
838 | | - * @param dbname |
839 | | - * @return |
840 | | - */ |
841 | | - public boolean useKeywordScoring(String dbname){ |
842 | | - if(keywordScoringSuffixes == null) |
| 839 | + /** Check whether dbname has one of the suffixes */ |
| 840 | + protected boolean checkSuffix(String[] suffixes, String dbname){ |
| 841 | + if(suffixes == null) |
843 | 842 | return false; |
844 | 843 | else{ |
845 | | - for (String suffix : keywordScoringSuffixes) { |
| 844 | + for (String suffix : suffixes) { |
846 | 845 | if (dbname.endsWith(suffix)) |
847 | 846 | return true; |
848 | 847 | } |
— | — | @@ -849,6 +848,25 @@ |
850 | 849 | return false; |
851 | 850 | } |
852 | 851 | |
| 852 | + /** Returns if keyword scoring should be used for this db, using |
| 853 | + * the suffixes from the global configuration |
| 854 | + * |
| 855 | + * @param dbname |
| 856 | + * @return |
| 857 | + */ |
| 858 | + public boolean useKeywordScoring(String dbname){ |
| 859 | + return checkSuffix(keywordScoringSuffixes,dbname); |
| 860 | + } |
853 | 861 | |
| 862 | + /** |
| 863 | + * Returns whether this dbname is assigned an additional exact-case index. |
| 864 | + * |
| 865 | + * @param dbname |
| 866 | + * @return |
| 867 | + */ |
| 868 | + public boolean exactCaseIndex(String dbname){ |
| 869 | + return checkSuffix(exactCaseSuffix,dbname); |
| 870 | + } |
| 871 | + |
854 | 872 | |
855 | 873 | } |
\ No newline at end of file |
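
For illustration, a minimal sketch of how the new exactCaseIndex() check resolves, assuming a hypothetical configuration in which ExactCase.suffix yields the single suffix "wiki" (the database names below are made up):

    GlobalConfiguration global = GlobalConfiguration.getInstance();
    // exactCaseIndex() delegates to checkSuffix(exactCaseSuffix, dbname) and returns
    // true only when dbname ends with one of the configured suffixes.
    boolean withExact = global.exactCaseIndex("dewiki");       // true under the assumed config
    boolean plainOnly = global.exactCaseIndex("dewiktionary"); // false under the assumed config
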
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | import org.apache.lucene.search.Searcher; |
21 | 21 | import org.apache.lucene.search.TopDocs; |
22 | 22 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 23 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
23 | 24 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
24 | 25 | import org.wikimedia.lsearch.beans.ResultSet; |
25 | 26 | import org.wikimedia.lsearch.beans.SearchResults; |
— | — | @@ -41,20 +42,28 @@ |
42 | 43 | |
43 | 44 | protected final int maxlines = 1000; |
44 | 45 | protected final int maxoffset = 10000; |
| 46 | + protected static GlobalConfiguration global = null; |
45 | 47 | |
| 48 | + public SearchEngine(){ |
| 49 | + if(global == null) |
| 50 | + global = GlobalConfiguration.getInstance(); |
| 51 | + } |
| 52 | + |
46 | 53 | /** Main search method, call this from the search frontend */ |
47 | 54 | public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) { |
48 | 55 | |
49 | 56 | if (what.equals("titlematch")) { |
50 | 57 | // TODO: return searchTitles(searchterm); |
51 | 58 | } else if (what.equals("search") || what.equals("explain")) { |
52 | | - int offset = 0, limit = 100; |
| 59 | + int offset = 0, limit = 100; boolean exactCase = false; |
53 | 60 | if (query.containsKey("offset")) |
54 | 61 | offset = Math.max(Integer.parseInt((String)query.get("offset")), 0); |
55 | 62 | if (query.containsKey("limit")) |
56 | 63 | limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines); |
| 64 | + if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact")) |
| 65 | + exactCase = true; |
57 | 66 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
58 | | - SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain")); |
| 67 | + SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase); |
59 | 68 | if(res!=null && res.isRetry()){ |
60 | 69 | int retries = 0; |
61 | 70 | if(iid.isSplit() || iid.isNssplit()){ |
— | — | @@ -63,7 +72,7 @@ |
64 | 73 | retries = 1; |
65 | 74 | |
66 | 75 | while(retries > 0 && res.isRetry()){ |
67 | | - res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain")); |
| 76 | + res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase); |
68 | 77 | retries--; |
69 | 78 | } |
70 | 79 | if(res.isRetry()) |
— | — | @@ -108,11 +117,12 @@ |
109 | 118 | * Search on iid, with query searchterm. View results from offset to offset+limit, using |
110 | 119 | * the default namespaces filter |
111 | 120 | */ |
112 | | - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain){ |
113 | | - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid); |
| 121 | + public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase){ |
| 122 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
114 | 123 | if(nsDefault == null || nsDefault.cardinality() == 0) |
115 | 124 | nsDefault = new NamespaceFilter("0"); // default to main namespace |
116 | | - WikiQueryParser parser = new WikiQueryParser("contents",nsDefault,analyzer,WikiQueryParser.NamespacePolicy.IGNORE); |
| 125 | + FieldNameFactory ff = new FieldNameFactory(exactCase); |
| 126 | + WikiQueryParser parser = new WikiQueryParser(ff.contents(),nsDefault,analyzer,ff,WikiQueryParser.NamespacePolicy.IGNORE); |
117 | 127 | HashSet<NamespaceFilter> fields = parser.getFieldNamespaces(searchterm); |
118 | 128 | NamespaceFilterWrapper nsfw = null; |
119 | 129 | Query q = null; |
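
A sketch of how a caller opts into the exact-case path through the query map handled above; the "case", "offset" and "limit" keys come from the hunk, while the database name and search term are placeholders:

    IndexId iid = IndexId.get("dewiki");          // hypothetical database
    HashMap query = new HashMap();
    query.put("case", "exact");                   // honored only if the db has an exact-case index
    query.put("offset", "0");
    query.put("limit", "20");
    SearchResults res = new SearchEngine().search(iid, "search", "Gödel", query);
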
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.Query; |
12 | 12 | import org.apache.lucene.search.TermQuery; |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 14 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
14 | 15 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
15 | 16 | import org.wikimedia.lsearch.benchmark.Terms; |
16 | 17 | import org.wikimedia.lsearch.benchmark.WordTerms; |
— | — | @@ -60,7 +61,8 @@ |
61 | 62 | |
62 | 63 | /** Warmup index using some number of simple searches */ |
63 | 64 | protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) { |
64 | | - WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE); |
| 65 | + FieldNameFactory fields = new FieldNameFactory(); |
| 66 | + WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE); |
65 | 67 | Terms terms = getTermsForLang(global.getLanguage(iid.getDBname())); |
66 | 68 | |
67 | 69 | try{ |
— | — | @@ -116,7 +118,8 @@ |
117 | 119 | /** Just run one complex query and rebuild the main namespace filter */ |
118 | 120 | public static void simpleWarmup(IndexSearcherMul is, IndexId iid){ |
119 | 121 | try{ |
120 | | - WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE); |
| 122 | + FieldNameFactory fields = new FieldNameFactory(); |
| 123 | + WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE); |
121 | 124 | Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
122 | 125 | is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0"))); |
123 | 126 | } catch (IOException e) { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java |
— | — | @@ -23,6 +23,7 @@ |
24 | 24 | import org.apache.lucene.search.highlight.TextFragment; |
25 | 25 | import org.wikimedia.lsearch.analyzers.Analyzers; |
26 | 26 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 27 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
27 | 28 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
28 | 29 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
29 | 30 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
— | — | @@ -121,10 +122,13 @@ |
122 | 123 | } |
123 | 124 | |
124 | 125 | // highlight all articles and return results |
125 | | - String lang = GlobalConfiguration.getInstance().getLanguage(dbname); |
126 | | - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid); |
127 | | - WikiQueryParser parser = new WikiQueryParser("contents", |
128 | | - new NamespaceFilter("0"),analyzer,WikiQueryParser.NamespacePolicy.IGNORE); |
| 126 | + GlobalConfiguration global = GlobalConfiguration.getInstance(); |
| 127 | + boolean exactCase = global.exactCaseIndex(iid.getDBname()); |
| 128 | + String lang = global.getLanguage(dbname); |
| 129 | + Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase); |
| 130 | + FieldNameFactory fields = new FieldNameFactory(exactCase); |
| 131 | + WikiQueryParser parser = new WikiQueryParser(fields.contents(), |
| 132 | + new NamespaceFilter("0"),analyzer,fields,WikiQueryParser.NamespacePolicy.IGNORE); |
129 | 133 | Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
130 | 134 | Scorer scorer = new QueryScorer(q); |
131 | 135 | SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"searchmatch\">","</span>"); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -28,6 +28,8 @@ |
29 | 29 | import org.apache.lucene.store.FSDirectory; |
30 | 30 | import org.wikimedia.lsearch.analyzers.Analyzers; |
31 | 31 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 32 | +import org.wikimedia.lsearch.analyzers.FieldBuilder; |
| 33 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
32 | 34 | import org.wikimedia.lsearch.analyzers.FilterFactory; |
33 | 35 | import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer; |
34 | 36 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
— | — | @@ -66,6 +68,7 @@ |
67 | 69 | protected IndexWriter writer; |
68 | 70 | protected boolean rewrite; |
69 | 71 | protected String langCode; |
| 72 | + protected boolean exactCase; |
70 | 73 | |
71 | 74 | protected HashSet<IndexUpdateRecord> nonDeleteDocuments; |
72 | 75 | |
— | — | @@ -81,10 +84,11 @@ |
82 | 85 | * @param analyzer |
83 | 86 | * @param rewrite - if true, will create new index |
84 | 87 | */ |
85 | | - SimpleIndexModifier(IndexId iid, String langCode, boolean rewrite){ |
| 88 | + SimpleIndexModifier(IndexId iid, String langCode, boolean rewrite, boolean exactCase){ |
86 | 89 | this.iid = iid; |
87 | 90 | this.rewrite = rewrite; |
88 | 91 | this.langCode = langCode; |
| 92 | + this.exactCase = exactCase; |
89 | 93 | reportQueue = new Hashtable<IndexUpdateRecord,IndexReportCard>(); |
90 | 94 | } |
91 | 95 | |
— | — | @@ -175,16 +179,16 @@ |
176 | 180 | writer.setUseCompoundFile(true); |
177 | 181 | writer.setMaxFieldLength(MAX_FIELD_LENGTH); |
178 | 182 | |
179 | | - FilterFactory filters = new FilterFactory(langCode); |
| 183 | + FieldBuilder builder = new FieldBuilder(langCode,exactCase); |
180 | 184 | |
181 | 185 | for(IndexUpdateRecord rec : records){ |
182 | 186 | if(rec.doAdd()){ |
183 | 187 | if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec)) |
184 | 188 | continue; // don't add if delete/add are paired operations |
185 | 189 | if(!checkPreconditions(rec)) |
186 | | - continue; // article shoouldn't be added for some (heuristic) reason |
| 190 | + continue; // article shouldn't be added for some reason |
187 | 191 | IndexReportCard card = getReportCard(rec); |
188 | | - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters,iid); |
| 192 | + Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid); |
189 | 193 | Document doc = (Document) ret[0]; |
190 | 194 | Analyzer analyzer = (Analyzer) ret[1]; |
191 | 195 | try { |
— | — | @@ -244,7 +248,7 @@ |
245 | 249 | |
246 | 250 | /** |
247 | 251 | * Generate the articles transient characterstics needed only for indexing, |
248 | | - * i.e. list of redirect keywords and Page Rank. |
| 252 | + * i.e. list of redirect keywords and article rank. |
249 | 253 | * |
250 | 254 | * @param article |
251 | 255 | */ |
— | — | @@ -345,7 +349,7 @@ |
346 | 350 | long now = System.currentTimeMillis(); |
347 | 351 | log.info("Starting update of "+updateRecords.size()+" records on "+iid+", started at "+now); |
348 | 352 | |
349 | | - SimpleIndexModifier modifier = new SimpleIndexModifier(iid,global.getLanguage(iid.getDBname()),false); |
| 353 | + SimpleIndexModifier modifier = new SimpleIndexModifier(iid,global.getLanguage(iid.getDBname()),false,global.exactCaseIndex(iid.getDBname())); |
350 | 354 | |
351 | 355 | Transaction trans = new Transaction(iid); |
352 | 356 | trans.begin(); |
— | — | @@ -398,60 +402,66 @@ |
399 | 403 | * @param languageAnalyzer |
400 | 404 | * @return array { document, analyzer } |
401 | 405 | */ |
402 | | - public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters, IndexId iid){ |
| 406 | + public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid){ |
403 | 407 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
404 | 408 | WikiTokenizer tokenizer = null; |
405 | 409 | Document doc = new Document(); |
406 | 410 | |
407 | 411 | // tranform record so that unnecessary stuff is deleted, e.g. some redirects |
408 | | - transformArticleForIndexing(article); |
| 412 | + transformArticleForIndexing(article); |
409 | 413 | |
410 | 414 | // This will be used to look up and replace entries on index updates. |
411 | 415 | doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
412 | | - |
| 416 | + |
413 | 417 | // These fields are returned with results |
414 | 418 | doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
415 | 419 | |
416 | | - // boost document title with it's article rank |
417 | | - Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED); |
418 | | - //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
419 | | - float rankBoost = calculateArticleRank(article.getRank()); |
420 | | - title.setBoost(rankBoost); |
421 | | - doc.add(title); |
422 | | - |
423 | | - Field stemtitle = new Field("stemtitle", article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED); |
424 | | - //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
425 | | - stemtitle.setBoost(rankBoost); |
426 | | - doc.add(stemtitle); |
427 | | - |
428 | | - // put the best redirects as alternative titles |
429 | | - makeAltTitles(doc,"alttitle",article); |
430 | | - |
431 | | - // add titles of redirects, generated from analyzer |
432 | | - makeKeywordField(doc,"redirect",rankBoost); |
433 | | - |
434 | | - if(checkKeywordPreconditions(article,iid)) |
435 | | - // most significat words in the text, gets extra score, from analyzer |
436 | | - makeKeywordField(doc,"keyword",rankBoost); |
437 | | - |
438 | | - // the next fields are generated using wikitokenizer |
439 | | - doc.add(new Field("contents", "", |
440 | | - Field.Store.NO, Field.Index.TOKENIZED)); |
441 | | - |
442 | 420 | // each token is one category (category names themself are not tokenized) |
443 | 421 | doc.add(new Field("category", "", |
444 | 422 | Field.Store.NO, Field.Index.TOKENIZED)); |
| 423 | + |
| 424 | + for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
| 425 | + FieldNameFactory fields = bs.getFields(); |
| 426 | + // boost document title with its article rank |
| 427 | + Field title = new Field(fields.title(), article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED); |
| 428 | + //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
| 429 | + float rankBoost = calculateArticleRank(article.getRank()); |
| 430 | + title.setBoost(rankBoost); |
| 431 | + doc.add(title); |
445 | 432 | |
| 433 | + Field stemtitle = new Field(fields.stemtitle(), article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED); |
| 434 | + //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size())); |
| 435 | + stemtitle.setBoost(rankBoost); |
| 436 | + doc.add(stemtitle); |
| 437 | + |
| 438 | + // put the best redirects as alternative titles |
| 439 | + makeAltTitles(doc,fields.alttitle(),article); |
| 440 | + |
| 441 | + // add titles of redirects, generated from analyzer |
| 442 | + makeKeywordField(doc,fields.redirect(),rankBoost); |
| 443 | + |
| 444 | + if(checkKeywordPreconditions(article,iid)) |
| 445 | + // most significant words in the text get extra score, from analyzer |
| 446 | + makeKeywordField(doc,fields.keyword(),rankBoost); |
| 447 | + |
| 448 | + // the next fields are generated using wikitokenizer |
| 449 | + doc.add(new Field(fields.contents(), "", |
| 450 | + Field.Store.NO, Field.Index.TOKENIZED)); |
| 451 | + |
| 452 | + // set boost for keyword field |
| 453 | + // tokenizer = (WikiTokenizer) ret[1]; |
| 454 | + // keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size())); |
| 455 | + } |
| 456 | + // make analyzer |
| 457 | + if(article.getTitle().equalsIgnoreCase("wiki")){ |
| 458 | + int b =10; |
| 459 | + b++; |
| 460 | + } |
446 | 461 | String text = article.getContents(); |
447 | | - if(article.isRedirect()) |
448 | | - text=""; // for redirects index only the title |
449 | | - Object[] ret = Analyzers.getIndexerAnalyzer(text,filters,article.getRedirectKeywords()); |
| 462 | + Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords()); |
450 | 463 | perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0]; |
| 464 | + |
451 | 465 | |
452 | | - // set boost for keyword field |
453 | | - // tokenizer = (WikiTokenizer) ret[1]; |
454 | | - // keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size())); |
455 | | - |
456 | 466 | return new Object[] { doc, perFieldAnalyzer }; |
457 | 467 | } |
458 | 468 | |
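
With the loop over builder sets above, an article indexed for an exact-case database gets both a lowercased and an _exact variant of every analyzed field. A rough sketch of the call as the index writers now make it (article and iid are assumed to be in scope, "de" is an assumed language code):

    FieldBuilder builder = new FieldBuilder("de", true);   // true: build the exact-case set too
    Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(article, builder, iid);
    Document doc = (Document) ret[0];        // carries e.g. both "title" and "title_exact"
    Analyzer analyzer = (Analyzer) ret[1];   // per-field analyzer covering both field sets
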
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -36,8 +36,7 @@ |
37 | 37 | return f; |
38 | 38 | } |
39 | 39 | } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){ |
40 | | - //float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
41 | | - float f = (float) (1.0 / numTokens); |
| 40 | + float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
42 | 41 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
43 | 42 | return f; |
44 | 43 | } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){ |
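
The restored title length norm is steeper than the one it replaces: 1/(numTokens * sqrt(numTokens)) instead of 1/numTokens, so longer titles are penalized more heavily. A quick numeric check, purely illustrative:

    int numTokens = 4;
    float oldNorm = (float) (1.0 / numTokens);                          // 0.25
    float newNorm = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); // 0.125
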
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -35,12 +35,12 @@ |
36 | 36 | * |
37 | 37 | * @param str |
38 | 38 | */ |
39 | | - public WikiTokenizer(String str){ |
40 | | - this(str,null); |
| 39 | + public WikiTokenizer(String str, boolean exactCase){ |
| 40 | + this(str,null,exactCase); |
41 | 41 | } |
42 | 42 | |
43 | | - public WikiTokenizer(String str, String lang){ |
44 | | - parser = new FastWikiTokenizerEngine(str,lang); |
| 43 | + public WikiTokenizer(String str, String lang, boolean exactCase){ |
| 44 | + parser = new FastWikiTokenizerEngine(str,lang,exactCase); |
45 | 45 | this.input = null; |
46 | 46 | } |
47 | 47 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -27,7 +27,9 @@ |
28 | 28 | */ |
29 | 29 | public class FastWikiTokenizerEngine { |
30 | 30 | private static final int MAX_WORD_LEN = 255; |
31 | | - private final char[] buffer = new char[MAX_WORD_LEN+1]; |
| 31 | + private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel |
| 32 | + private final char[] aliasBuffer = new char[MAX_WORD_LEN]; // buffer for aliases, e.g. goedel |
| 33 | + private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for decomposed text, e.g. godel |
32 | 34 | private static final int IO_BUFFER_SIZE = 1024; |
33 | 35 | private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
34 | 36 | private char[] text; |
— | — | @@ -37,6 +39,7 @@ |
38 | 40 | protected ArrayList<String> categories; |
39 | 41 | protected HashMap<String,String> interwikis; |
40 | 42 | protected HashSet<String> keywords; |
| 43 | + private int decompLength=0, aliasLength=0; |
41 | 44 | private int length = 0; // length of token |
42 | 45 | private int start = 0; // start position of token |
43 | 46 | private int cur = 0; // current position in the input string |
— | — | @@ -68,6 +71,8 @@ |
69 | 72 | private static Hashtable<String,HashSet<String>> categoryLocalized = new Hashtable<String,HashSet<String>>(); |
70 | 73 | private static HashSet<String> interwiki; |
71 | 74 | |
| 75 | + /** if true, words won't be lowercased */ |
| 76 | + private boolean exactCase = false; |
72 | 77 | private UnicodeDecomposer decomposer; |
73 | 78 | |
74 | 79 | enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD, |
— | — | @@ -104,14 +109,15 @@ |
105 | 110 | } |
106 | 111 | } |
107 | 112 | |
108 | | - public FastWikiTokenizerEngine(String text){ |
109 | | - this(text,null); |
| 113 | + public FastWikiTokenizerEngine(String text, boolean exactCase){ |
| 114 | + this(text,null,exactCase); |
110 | 115 | } |
111 | 116 | |
112 | | - public FastWikiTokenizerEngine(String text, String lang){ |
| 117 | + public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){ |
113 | 118 | this.text = text.toCharArray(); |
114 | 119 | this.textString = text; |
115 | 120 | this.language = lang; |
| 121 | + this.exactCase = exactCase; |
116 | 122 | textLength = text.length(); |
117 | 123 | init(); |
118 | 124 | } |
— | — | @@ -125,23 +131,112 @@ |
126 | 132 | return decomposer.decompose(c); |
127 | 133 | } |
128 | 134 | |
| 135 | + /** Add transliteration to token alias, create alias if it doesn't exist */ |
| 136 | + private final void addToTokenAlias(String transliteration) { |
| 137 | + if(aliasLength == 0){ |
| 138 | + System.arraycopy(decompBuffer,0,aliasBuffer,0,decompLength); |
| 139 | + aliasLength = decompLength; |
| 140 | + } |
| 141 | + for(char cc : transliteration.toCharArray()) |
| 142 | + if(aliasLength < aliasBuffer.length) |
| 143 | + aliasBuffer[aliasLength++] = cc; |
| 144 | + } |
| 145 | + |
129 | 146 | /** |
130 | 147 | * This function is called at word boundaries, it is used to |
131 | 148 | * make a new token and add it to token stream |
| 149 | + * |
| 150 | + * Does unicode decomposition, and will make alias token with |
| 151 | + * alternative transliterations (e.g. ö -> oe) |
132 | 152 | */ |
133 | 153 | private final void addToken(){ |
134 | 154 | if(length!=0){ |
135 | 155 | if(numberToken && (buffer[length-1]=='.' ||buffer[length-1]==',')) |
136 | 156 | length--; // strip trailing . and , in numbers |
137 | | - tokens.add(new Token( |
138 | | - new String(buffer, 0, length), start, start + length)); |
| 157 | + // decompose token, maintain alias if needed |
| 158 | + decompLength = 0; |
| 159 | + aliasLength = 0; |
| 160 | + boolean addToAlias; |
| 161 | + for(int i=0;i<length;i++){ |
| 162 | + addToAlias = true; |
| 163 | + if( ! exactCase ) |
| 164 | + cl = Character.toLowerCase(buffer[i]); |
| 165 | + else{ |
| 166 | + cl = buffer[i]; |
| 167 | + // check additional (uppercase) character aliases |
| 168 | + if(cl == 'Ä' ){ |
| 169 | + addToTokenAlias("Ae"); |
| 170 | + addToAlias = false; |
| 171 | + } else if(cl == 'Ö'){ |
| 172 | + addToTokenAlias("Oe"); |
| 173 | + addToAlias = false; |
| 174 | + } else if(cl == 'Ü'){ |
| 175 | + addToTokenAlias("Ue"); |
| 176 | + addToAlias = false; |
| 177 | + } else if(cl == 'Ñ'){ |
| 178 | + addToTokenAlias("Nh"); |
| 179 | + addToAlias = false; |
| 180 | + } else if(cl == 'Å'){ |
| 181 | + addToTokenAlias("Aa"); |
| 182 | + addToAlias = false; |
| 183 | + } |
| 184 | + } |
| 185 | + // special alias transliterations ä -> ae, etc ... |
| 186 | + if(cl == 'ä' ){ |
| 187 | + addToTokenAlias("ae"); |
| 188 | + addToAlias = false; |
| 189 | + } else if(cl == 'ö'){ |
| 190 | + addToTokenAlias("oe"); |
| 191 | + addToAlias = false; |
| 192 | + } else if(cl == 'ü'){ |
| 193 | + addToTokenAlias("ue"); |
| 194 | + addToAlias = false; |
| 195 | + } else if(cl == 'ß'){ |
| 196 | + addToTokenAlias("ss"); |
| 197 | + addToAlias = false; |
| 198 | + } else if(cl == 'ñ'){ |
| 199 | + addToTokenAlias("nh"); |
| 200 | + addToAlias = false; |
| 201 | + } else if(cl == 'å'){ |
| 202 | + addToTokenAlias("aa"); |
| 203 | + addToAlias = false; |
| 204 | + } |
| 205 | + |
| 206 | + decomp = decompose(cl); |
| 207 | + // no decomposition |
| 208 | + if(decomp == null){ |
| 209 | + if(decompLength<decompBuffer.length) |
| 210 | + decompBuffer[decompLength++] = cl; |
| 211 | + if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length) |
| 212 | + aliasBuffer[aliasLength++] = cl; |
| 213 | + } else{ |
| 214 | + for(decompi = 0; decompi < decomp.length; decompi++){ |
| 215 | + if(decompLength<decompBuffer.length) |
| 216 | + decompBuffer[decompLength++] = decomp[decompi]; |
| 217 | + if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length) |
| 218 | + aliasBuffer[aliasLength++] = decomp[decompi]; |
| 219 | + } |
| 220 | + } |
| 221 | + } |
| 222 | + // add decomposed token to stream |
| 223 | + if(decompLength!=0) |
| 224 | + tokens.add(new Token( |
| 225 | + new String(decompBuffer, 0, decompLength), start, start + length)); |
| 226 | + // add alias (if any) token to stream |
| 227 | + if(aliasLength!=0){ |
| 228 | + Token t = new Token( |
| 229 | + new String(aliasBuffer, 0, aliasLength), start, start + length); |
| 230 | + t.setPositionIncrement(0); |
| 231 | + t.setType("transliteration"); |
| 232 | + tokens.add(t); |
| 233 | + } |
139 | 234 | length = 0; |
140 | 235 | numberToken = false; |
141 | 236 | if(templateLevel == 0) |
142 | 237 | keywordTokens++; |
143 | 238 | } |
144 | 239 | } |
145 | | - |
| 240 | + |
146 | 241 | /** |
147 | 242 | * Tries to add the current letter (variable c) to the |
148 | 243 | * buffer, if it's not a letter, new token is created |
— | — | @@ -156,19 +251,9 @@ |
157 | 252 | if(length == 0) |
158 | 253 | start = cur; |
159 | 254 | |
160 | | - cl = Character.toLowerCase(c); |
161 | | - decomp = decompose(cl); |
162 | | - if(decomp == null){ |
163 | | - if(length<buffer.length) |
164 | | - buffer[length++] = cl; |
165 | | - } |
166 | | - else{ |
167 | | - for(decompi = 0; decompi < decomp.length; decompi++){ |
168 | | - if(length<buffer.length) |
169 | | - buffer[length++] = decomp[decompi]; |
170 | | - } |
171 | | - } |
172 | | - // add digits |
| 255 | + if(length < buffer.length) |
| 256 | + buffer[length++] = c; |
| 257 | + // add digits |
173 | 258 | } else if(Character.isDigit(c)){ |
174 | 259 | if(length == 0) |
175 | 260 | start = cur; |
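
A sketch of what the new addToken() emits for a single word with a transliterable character, assuming UnicodeDecomposer maps 'ö' to 'o'; the expected output is inferred from the hunk above rather than copied from a test run:

    FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine("gödel", false); // lowercased index
    for (Token t : parser.parse())
        System.out.println(t.termText() + "  type=" + t.type()
                + "  posIncr=" + t.getPositionIncrement());
    // expected:
    //   godel    type=word             posIncr=1
    //   goedel   type=transliteration  posIncr=0
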
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -83,6 +83,8 @@ |
84 | 84 | public static float ALT_TITLE_ALIAS_BOOST = 0.4f; |
85 | 85 | public static float KEYWORD_BOOST = 0.02f; |
86 | 86 | |
| 87 | + public static boolean ADD_STEM_TITLE = true; |
| 88 | + |
87 | 89 | /** Policies in treating field names: |
88 | 90 | * |
89 | 91 | * LEAVE - don't mess with field rewriting |
— | — | @@ -103,14 +105,11 @@ |
104 | 106 | private NamespacePolicy namespacePolicy; |
105 | 107 | protected NamespaceFilter defaultNamespaceFilter; |
106 | 108 | protected static GlobalConfiguration global=null; |
| 109 | + protected FieldNameFactory fields; |
107 | 110 | |
108 | 111 | /** default value for boolean queries */ |
109 | 112 | public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST; |
110 | 113 | |
111 | | - private UnicodeDecomposer decomposer; |
112 | | - private char[] decomp; // unicode decomposition letters |
113 | | - private int decompi; |
114 | | - |
115 | 114 | /** Init namespace queries */ |
116 | 115 | protected void initNamespaces(){ |
117 | 116 | if(namespaceQueries != null) |
— | — | @@ -131,8 +130,8 @@ |
132 | 131 | * @param field default field name |
133 | 132 | * @param analyzer |
134 | 133 | */ |
135 | | - public WikiQueryParser(String field, Analyzer analyzer){ |
136 | | - this(field,(NamespaceFilter)null,analyzer,NamespacePolicy.LEAVE); |
| 134 | + public WikiQueryParser(String field, Analyzer analyzer, FieldNameFactory fields){ |
| 135 | + this(field,(NamespaceFilter)null,analyzer,fields,NamespacePolicy.LEAVE); |
137 | 136 | } |
138 | 137 | |
139 | 138 | /** |
— | — | @@ -143,14 +142,14 @@ |
144 | 143 | * @param analyzer |
145 | 144 | * @param nsPolicy |
146 | 145 | */ |
147 | | - public WikiQueryParser(String field, String namespace, Analyzer analyzer, NamespacePolicy nsPolicy){ |
148 | | - this(field,new NamespaceFilter(namespace),analyzer,nsPolicy); |
| 146 | + public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){ |
| 147 | + this(field,new NamespaceFilter(namespace),analyzer,fields,nsPolicy); |
149 | 148 | } |
150 | 149 | |
151 | | - public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, NamespacePolicy nsPolicy){ |
| 150 | + public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){ |
152 | 151 | defaultField = field; |
153 | 152 | this.analyzer = analyzer; |
154 | | - decomposer = UnicodeDecomposer.getInstance(); |
| 153 | + this.fields = fields; |
155 | 154 | tokens = new ArrayList<Token>(); |
156 | 155 | this.namespacePolicy = nsPolicy; |
157 | 156 | disableTitleAliases = true; |
— | — | @@ -284,8 +283,6 @@ |
285 | 284 | /** |
286 | 285 | * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>) |
287 | 286 | * |
288 | | - * Similar to <code>FastWikiTokenizerEngine</code>, automatically |
289 | | - * normalizes (strip accents) and lowercases the words |
290 | 287 | * @return type of the token in buffer |
291 | 288 | */ |
292 | 289 | private TokenType fetchToken(){ |
— | — | @@ -298,14 +295,7 @@ |
299 | 296 | |
300 | 297 | // pluses and minuses, underscores can be within words, *,? are for wildcard queries |
301 | 298 | if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*' || ch=='?'){ |
302 | | - // unicode normalization -> delete accents |
303 | | - decomp = decomposer.decompose(ch); |
304 | | - if(decomp == null) |
305 | | - buffer[length++] = ch; |
306 | | - else{ |
307 | | - for(decompi = 0; decompi < decomp.length; decompi++) |
308 | | - buffer[length++] = decomp[decompi]; |
309 | | - } |
| 299 | + buffer[length++] = ch; |
310 | 300 | } else{ |
311 | 301 | cur--; // position before the nonletter character |
312 | 302 | break; |
— | — | @@ -373,11 +363,11 @@ |
374 | 364 | cur = prev_cur; |
375 | 365 | } |
376 | 366 | |
377 | | - /** make <code>tokenStream</code> from lowercased <code>buffer</code> via analyzer */ |
| 367 | + /** make <code>tokenStream</code> from <code>buffer</code> via analyzer */ |
378 | 368 | private void analyzeBuffer(){ |
379 | 369 | String analysisField = defaultField; |
380 | 370 | tokenStream = analyzer.tokenStream(analysisField, |
381 | | - new String(buffer,0,length).toLowerCase()); |
| 371 | + new String(buffer,0,length)); |
382 | 372 | |
383 | 373 | Token token; |
384 | 374 | tokens.clear(); |
— | — | @@ -404,15 +394,15 @@ |
405 | 395 | /** Make a lucene term from string */ |
406 | 396 | private Term makeTerm(String t){ |
407 | 397 | if(field == null) |
408 | | - return new Term(defaultField,t.toLowerCase()); |
| 398 | + return new Term(defaultField,t); |
409 | 399 | else if(!field.equals("incategory") && |
410 | 400 | (namespacePolicy == NamespacePolicy.IGNORE || |
411 | 401 | namespacePolicy == NamespacePolicy.REWRITE)) |
412 | | - return new Term(defaultField,t.toLowerCase()); |
| 402 | + return new Term(defaultField,t); |
413 | 403 | else if(field.equals("incategory")) |
414 | | - return new Term("category",t.toLowerCase()); |
| 404 | + return new Term("category",t); |
415 | 405 | else |
416 | | - return new Term(field,t.toLowerCase()); |
| 406 | + return new Term(field,t); |
417 | 407 | } |
418 | 408 | |
419 | 409 | /** Parses a phrase query (i.e. between ""), the cur |
— | — | @@ -673,7 +663,7 @@ |
674 | 664 | // check for wildcard seaches, they are also not analyzed/stemmed, only for titles |
675 | 665 | // wildcard signs are allowed only at the end of the word, minimum one letter word |
676 | 666 | if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?') && |
677 | | - defaultField.equals("title")){ |
| 667 | + defaultField.equals(fields.title())){ |
678 | 668 | Query ret = new WildcardQuery(makeTerm()); |
679 | 669 | ret.setBoost(defaultBoost); |
680 | 670 | return ret; |
— | — | @@ -706,6 +696,21 @@ |
707 | 697 | t = new TermQuery(makeTerm(token)); |
708 | 698 | t.setBoost(defaultAliasBoost*defaultBoost); |
709 | 699 | cur.add(t,aliasOccur); |
| 700 | + } else if (token.type().equals("transliteration")){ |
| 701 | + // if not in nested query make one |
| 702 | + if(cur == bq && (i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0){ |
| 703 | + t = new TermQuery(makeTerm(token)); |
| 704 | + t.setBoost(defaultBoost); |
| 705 | + cur = new BooleanQuery(); |
| 706 | + cur.add(t,BooleanClause.Occur.SHOULD); |
| 707 | + bq.add(cur,boolDefault); |
| 708 | + continue; |
| 709 | + } else{ |
| 710 | + // alternative transliteration |
| 711 | + t = new TermQuery(makeTerm(token)); |
| 712 | + t.setBoost(defaultBoost); |
| 713 | + cur.add(t,aliasOccur); |
| 714 | + } |
710 | 715 | } |
711 | 716 | if( cur != bq) // returned from nested query |
712 | 717 | cur = bq; |
— | — | @@ -715,7 +720,7 @@ |
716 | 721 | if(tokens.size() > 2 && (i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0){ |
717 | 722 | // make nested query. this is needed when single word is tokenized |
718 | 723 | // into many words of which they all have aliases |
719 | | - // e.g. anti-hero => anti stemmed:anti hero stemmed:hero |
| 724 | + // e.g. anti-hero => anti hero |
720 | 725 | cur = new BooleanQuery(); |
721 | 726 | cur.add(t,BooleanClause.Occur.SHOULD); |
722 | 727 | bq.add(cur,boolDefault); |
— | — | @@ -776,7 +781,7 @@ |
777 | 782 | Term term = tq.getTerm(); |
778 | 783 | if(term.field().equals(defaultField)){ |
779 | 784 | TermQuery tq2 = new TermQuery( |
780 | | - new Term("title",term.text())); |
| 785 | + new Term(fields.title(),term.text())); |
781 | 786 | tq2.setBoost(tq.getBoost()*TITLE_BOOST); |
782 | 787 | |
783 | 788 | return tq2; |
— | — | @@ -792,7 +797,7 @@ |
793 | 798 | Term[] terms = pq.getTerms(); |
794 | 799 | if(terms.length > 0 && terms[0].field().equals(defaultField)){ |
795 | 800 | for(int j=0;j<terms.length;j++){ |
796 | | - pq2.add(new Term("title",terms[j].text())); |
| 801 | + pq2.add(new Term(fields.title(),terms[j].text())); |
797 | 802 | } |
798 | 803 | pq2.setBoost(pq.getBoost()*TITLE_BOOST); |
799 | 804 | |
— | — | @@ -999,7 +1004,8 @@ |
1000 | 1005 | snq.setBoost(boost); |
1001 | 1006 | spans.add(snq); |
1002 | 1007 | } |
1003 | | - } |
| 1008 | + } else // nested boolean or wildcard query |
| 1009 | + return null; |
1004 | 1010 | } |
1005 | 1011 | // create the queries |
1006 | 1012 | Query cat = null; |
— | — | @@ -1059,7 +1065,7 @@ |
1060 | 1066 | defaultBoost = ALT_TITLE_BOOST; |
1061 | 1067 | defaultAliasBoost = ALT_TITLE_ALIAS_BOOST; |
1062 | 1068 | for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){ |
1063 | | - defaultField = "alttitle"+i; |
| 1069 | + defaultField = fields.alttitle()+i; |
1064 | 1070 | Query q = parseRaw(queryText); |
1065 | 1071 | if(q != null) |
1066 | 1072 | bq.add(q,BooleanClause.Occur.SHOULD); |
— | — | @@ -1069,10 +1075,11 @@ |
1070 | 1076 | defaultBoost = olfDefaultBoost; |
1071 | 1077 | defaultAliasBoost = ALIAS_BOOST; |
1072 | 1078 | |
1073 | | - Query qs = multiplySpans(qt,0,"redirect",REDIRECT_BOOST); |
| 1079 | + BooleanQuery qs = multiplySpans(qt,0,fields.redirect(),REDIRECT_BOOST); |
1074 | 1080 | // merge queries |
1075 | 1081 | if(qs != null){ |
1076 | | - bq.add(qs,BooleanClause.Occur.SHOULD); |
| 1082 | + for(BooleanClause bc : qs.getClauses()) |
| 1083 | + bq.add(bc); |
1077 | 1084 | } |
1078 | 1085 | if(bq.getClauses() == null || bq.getClauses().length==0) |
1079 | 1086 | return null; |
— | — | @@ -1085,15 +1092,18 @@ |
1086 | 1093 | protected Query makeTitleQuery(String queryText) { |
1087 | 1094 | String contentField = defaultField; |
1088 | 1095 | float olfDefaultBoost = defaultBoost; |
1089 | | - defaultField = "title"; // now parse the title part |
| 1096 | + defaultField = fields.title(); // now parse the title part |
1090 | 1097 | defaultBoost = TITLE_BOOST; |
1091 | 1098 | defaultAliasBoost = TITLE_ALIAS_BOOST; |
1092 | 1099 | Query qt = parseRaw(queryText); |
| 1100 | + Query qs = null; |
1093 | 1101 | // stemmed title |
1094 | | - defaultField = "stemtitle"; |
1095 | | - defaultBoost = STEM_TITLE_BOOST; |
1096 | | - defaultAliasBoost = STEM_TITLE_ALIAS_BOOST; |
1097 | | - Query qs = parseRaw(queryText); |
| 1102 | + if(ADD_STEM_TITLE){ |
| 1103 | + defaultField = fields.stemtitle(); |
| 1104 | + defaultBoost = STEM_TITLE_BOOST; |
| 1105 | + defaultAliasBoost = STEM_TITLE_ALIAS_BOOST; |
| 1106 | + qs = parseRaw(queryText); |
| 1107 | + } |
1098 | 1108 | // pop stack |
1099 | 1109 | defaultField = contentField; |
1100 | 1110 | defaultBoost = olfDefaultBoost; |
— | — | @@ -1138,7 +1148,7 @@ |
1139 | 1149 | Query nostem = null; |
1140 | 1150 | if(makeRedirect || makeKeywords){ |
1141 | 1151 | String contentField = defaultField; |
1142 | | - defaultField = "keyword"; // this field is never stemmed |
| 1152 | + defaultField = fields.keyword(); // this field is never stemmed |
1143 | 1153 | nostem = parseRaw(queryText); |
1144 | 1154 | defaultField = contentField; |
1145 | 1155 | } |
— | — | @@ -1151,7 +1161,7 @@ |
1152 | 1162 | } |
1153 | 1163 | // keyword pass |
1154 | 1164 | if(makeKeywords && nostem!=null){ |
1155 | | - Query qk = multiplySpans(nostem,0,"keyword",KEYWORD_BOOST); |
| 1165 | + Query qk = multiplySpans(nostem,0,fields.keyword(),KEYWORD_BOOST); |
1156 | 1166 | if(qk != null) |
1157 | 1167 | bq.add(qk,BooleanClause.Occur.SHOULD); |
1158 | 1168 | } |
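
On the query side, the transliteration tokens produced by the tokenizer are now folded in as zero-position-increment aliases, much like stemmed aliases. A sketch of constructing the parser with the new FieldNameFactory argument and parsing a term that carries such an alias (the language code and database name are placeholders):

    FieldNameFactory fields = new FieldNameFactory();        // plain, lowercased field names
    Analyzer analyzer = Analyzers.getSearcherAnalyzer("de");  // assumed language code
    WikiQueryParser parser = new WikiQueryParser(fields.contents(), "0",
            analyzer, fields, WikiQueryParser.NamespacePolicy.IGNORE);
    Query q = parser.parseFourPass("gödel", WikiQueryParser.NamespacePolicy.IGNORE, "dewiki");
    // "goedel" (type "transliteration") becomes an extra SHOULD clause next to "godel"
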
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import org.apache.lucene.analysis.nl.DutchStemFilter; |
14 | 14 | import org.apache.lucene.analysis.ru.RussianStemFilter; |
15 | 15 | import org.apache.lucene.analysis.th.ThaiWordFilter; |
| 16 | +import org.apache.lucene.search.FieldSortedHitQueue; |
16 | 17 | import org.wikimedia.lsearch.config.GlobalConfiguration; |
17 | 18 | import org.wikimedia.lsearch.config.IndexId; |
18 | 19 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
— | — | @@ -34,8 +35,8 @@ |
35 | 36 | * @param language |
36 | 37 | * @return |
37 | 38 | */ |
38 | | - public static Analyzer getTitleAnalyzer(FilterFactory filters){ |
39 | | - return new QueryLanguageAnalyzer(filters); |
| 39 | + public static Analyzer getTitleAnalyzer(FilterFactory filters, boolean exactCase){ |
| 40 | + return new QueryLanguageAnalyzer(filters,exactCase); |
40 | 41 | } |
41 | 42 | |
42 | 43 | /** |
— | — | @@ -50,29 +51,40 @@ |
51 | 52 | * @param languageAnalyzer language filter class (e.g. PorterStemFilter) |
52 | 53 | * @return {PerFieldAnalyzerWrapper,WikiTokenizer} |
53 | 54 | */ |
54 | | - public static Object[] getIndexerAnalyzer(String text, FilterFactory filters, ArrayList<String> redirects) { |
55 | | - PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
| 55 | + public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects) { |
| 56 | + PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer()); |
| 57 | + WikiTokenizer tokenizer = null; |
| 58 | + for(FieldBuilder.BuilderSet bs : builder.getBuilders()){ |
| 59 | + tokenizer = addFieldsForIndexing(perFieldAnalyzer,text,bs.getFilters(),bs.getFields(),redirects,bs.isExactCase()); |
| 60 | + } |
| 61 | + return new Object[] {perFieldAnalyzer,tokenizer}; |
| 62 | + } |
| 63 | + |
| 64 | + /** |
| 65 | + * Add some fields to indexer's analyzer. |
| 66 | + * |
| 67 | + */ |
| 68 | + public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text, FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, boolean exactCase) { |
56 | 69 | // parse wiki-text to get categories |
57 | | - WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage()); |
| 70 | + WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase); |
58 | 71 | tokenizer.tokenize(); |
59 | 72 | ArrayList<String> categories = tokenizer.getCategories(); |
60 | 73 | |
61 | | - perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer()); |
62 | | - perFieldAnalyzer.addAnalyzer("contents", |
| 74 | + perFieldAnalyzer.addAnalyzer(fields.contents(), |
63 | 75 | new LanguageAnalyzer(filters,tokenizer)); |
64 | 76 | perFieldAnalyzer.addAnalyzer("category", |
65 | 77 | new CategoryAnalyzer(categories)); |
66 | | - perFieldAnalyzer.addAnalyzer("title", |
67 | | - getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
68 | | - perFieldAnalyzer.addAnalyzer("stemtitle", |
69 | | - getTitleAnalyzer(filters)); |
70 | | - setAltTitleAnalyzer(perFieldAnalyzer,"alttitle", |
71 | | - getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
72 | | - setKeywordAnalyzer(perFieldAnalyzer,"redirect", |
73 | | - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),"redirect")); |
74 | | - setKeywordAnalyzer(perFieldAnalyzer,"keyword", |
75 | | - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),"keyword")); |
76 | | - return new Object[] {perFieldAnalyzer,tokenizer}; |
| 78 | + perFieldAnalyzer.addAnalyzer(fields.title(), |
| 79 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
| 80 | + perFieldAnalyzer.addAnalyzer(fields.stemtitle(), |
| 81 | + getTitleAnalyzer(filters,exactCase)); |
| 82 | + setAltTitleAnalyzer(perFieldAnalyzer,fields.alttitle(), |
| 83 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
| 84 | + setKeywordAnalyzer(perFieldAnalyzer,fields.redirect(), |
| 85 | + new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),fields.redirect(),exactCase)); |
| 86 | + setKeywordAnalyzer(perFieldAnalyzer,fields.keyword(), |
| 87 | + new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),fields.keyword(),exactCase)); |
| 88 | + return tokenizer; |
77 | 89 | } |
78 | 90 | |
79 | 91 | protected static void setAltTitleAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, Analyzer analyzer) { |
— | — | @@ -87,37 +99,42 @@ |
88 | 100 | } |
89 | 101 | } |
90 | 102 | |
91 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){ |
| 103 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){ |
92 | 104 | if(global == null) |
93 | 105 | global = GlobalConfiguration.getInstance(); |
94 | | - return getSearcherAnalyzer(global.getLanguage(iid.getDBname())); |
| 106 | + return getSearcherAnalyzer(global.getLanguage(iid.getDBname()),exactCase); |
95 | 107 | |
96 | 108 | } |
97 | 109 | |
98 | 110 | public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){ |
99 | | - return getSearcherAnalyzer(new FilterFactory(langCode)); |
| 111 | + return getSearcherAnalyzer(langCode,false); |
100 | 112 | } |
101 | 113 | |
| 114 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode, boolean exactCase){ |
| 115 | + return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase)); |
| 116 | + } |
| 117 | + |
102 | 118 | /** |
103 | 119 | * Analyzer for search queries. Can be reused to parse many queries. |
104 | 120 | * |
105 | 121 | * @param text |
106 | 122 | * @return |
107 | 123 | */ |
108 | | - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters) { |
| 124 | + public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters, FieldNameFactory fields) { |
109 | 125 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
| 126 | + boolean exactCase = fields.isExactCase(); |
110 | 127 | |
111 | | - perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters)); |
112 | | - perFieldAnalyzer.addAnalyzer("contents", |
113 | | - new QueryLanguageAnalyzer(filters)); |
114 | | - perFieldAnalyzer.addAnalyzer("title", |
115 | | - getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
116 | | - perFieldAnalyzer.addAnalyzer("stemtitle", |
117 | | - getTitleAnalyzer(filters)); |
118 | | - setAltTitleAnalyzer(perFieldAnalyzer,"alttitle", |
119 | | - getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
120 | | - perFieldAnalyzer.addAnalyzer("keyword", |
121 | | - getTitleAnalyzer(filters.getNoStemmerFilterFactory())); |
| 128 | + perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters,exactCase)); |
| 129 | + perFieldAnalyzer.addAnalyzer(fields.contents(), |
| 130 | + new QueryLanguageAnalyzer(filters,exactCase)); |
| 131 | + perFieldAnalyzer.addAnalyzer(fields.title(), |
| 132 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
| 133 | + perFieldAnalyzer.addAnalyzer(fields.stemtitle(), |
| 134 | + getTitleAnalyzer(filters,exactCase)); |
| 135 | + setAltTitleAnalyzer(perFieldAnalyzer,fields.alttitle(), |
| 136 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
| 137 | + perFieldAnalyzer.addAnalyzer(fields.keyword(), |
| 138 | + getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase)); |
122 | 139 | |
123 | 140 | return perFieldAnalyzer; |
124 | 141 | } |
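
getIndexerAnalyzer() now takes a FieldBuilder and registers one analyzer set per builder, so an exact-case database gets analyzers for both naming schemes from a single call. A sketch, with the wiki text, language code and redirect list as placeholders:

    FieldBuilder builder = new FieldBuilder("de", true);    // assumed language code, exact case on
    ArrayList<String> redirects = new ArrayList<String>();  // redirect keywords, if any
    Object[] ret = Analyzers.getIndexerAnalyzer(wikiText, builder, redirects);
    PerFieldAnalyzerWrapper analyzer = (PerFieldAnalyzerWrapper) ret[0];
    // analyzer covers contents, title, stemtitle, ... and their *_exact counterparts
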
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java |
— | — | @@ -0,0 +1,66 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +/** |
| 5 | + * Generate field names for the index. |
| 6 | + * |
| 7 | + * @author rainman |
| 8 | + * |
| 9 | + */ |
| 10 | +public class FieldNameFactory { |
| 11 | + public static final boolean EXACT_CASE = true; |
| 12 | + protected boolean exactCase; |
| 13 | + |
| 14 | + public FieldNameFactory(){ |
| 15 | + this.exactCase = false; |
| 16 | + } |
| 17 | + |
| 18 | + public FieldNameFactory(boolean exactCase){ |
| 19 | + this.exactCase = exactCase; |
| 20 | + } |
| 21 | + |
| 22 | + public String contents(){ |
| 23 | + if(exactCase) |
| 24 | + return "contents_exact"; |
| 25 | + else |
| 26 | + return "contents"; |
| 27 | + } |
| 28 | + |
| 29 | + public String title(){ |
| 30 | + if(exactCase) |
| 31 | + return "title_exact"; |
| 32 | + else |
| 33 | + return "title"; |
| 34 | + } |
| 35 | + |
| 36 | + public String stemtitle(){ |
| 37 | + if(exactCase) |
| 38 | + return "stemtitle_exact"; |
| 39 | + else |
| 40 | + return "stemtitle"; |
| 41 | + } |
| 42 | + |
| 43 | + public String alttitle(){ |
| 44 | + if(exactCase) |
| 45 | + return "alttitle_exact"; |
| 46 | + else |
| 47 | + return "alttitle"; |
| 48 | + } |
| 49 | + |
| 50 | + public String redirect(){ |
| 51 | + if(exactCase) |
| 52 | + return "redirect_exact"; |
| 53 | + else |
| 54 | + return "redirect"; |
| 55 | + } |
| 56 | + |
| 57 | + public String keyword(){ |
| 58 | + if(exactCase) |
| 59 | + return "keyword_exact"; |
| 60 | + else |
| 61 | + return "keyword"; |
| 62 | + } |
| 63 | + |
| 64 | + public boolean isExactCase() { |
| 65 | + return exactCase; |
| 66 | + } |
| 67 | +} |
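
A short sketch of the two naming schemes this factory produces; everything here follows directly from the new class:

    FieldNameFactory plain = new FieldNameFactory();
    FieldNameFactory exact = new FieldNameFactory(FieldNameFactory.EXACT_CASE);
    plain.contents();    // "contents"
    plain.title();       // "title"
    exact.contents();    // "contents_exact"
    exact.title();       // "title_exact"
    exact.isExactCase(); // true
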
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java |
— | — | @@ -20,7 +20,7 @@ |
21 | 21 | protected Class stemmer = null; |
22 | 22 | protected Class customFilter = null; |
23 | 23 | |
24 | | - protected FilterFactory noStemmerFilterFactory; |
| 24 | + protected FilterFactory noStemmerFilterFactory=null; |
25 | 25 | |
26 | 26 | public FilterFactory(String lang){ |
27 | 27 | this.lang = lang; |
— | — | @@ -38,7 +38,10 @@ |
39 | 39 | } |
40 | 40 | |
41 | 41 | public FilterFactory getNoStemmerFilterFactory() { |
42 | | - return noStemmerFilterFactory; |
| 42 | + if(noStemmerFilterFactory == null) |
| 43 | + return this; |
| 44 | + else |
| 45 | + return noStemmerFilterFactory; |
43 | 46 | } |
44 | 47 | |
45 | 48 | protected void init(){ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -36,23 +36,23 @@ |
37 | 37 | /** positional increment between different redirects */ |
38 | 38 | public static final int TOKEN_GAP = 201; |
39 | 39 | |
40 | | - public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix){ |
| 40 | + public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix, boolean exactCase){ |
41 | 41 | ArrayList<String> k = new ArrayList<String>(); |
42 | 42 | if(keywords != null) |
43 | 43 | k.addAll(keywords); |
44 | | - init(k,filters,prefix); |
| 44 | + init(k,filters,prefix,exactCase); |
45 | 45 | } |
46 | | - public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix){ |
47 | | - init(keywords,filters,prefix); |
| 46 | + public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase){ |
| 47 | + init(keywords,filters,prefix,exactCase); |
48 | 48 | } |
49 | 49 | |
50 | | - protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix) { |
| 50 | + protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase) { |
51 | 51 | this.prefix = prefix; |
52 | 52 | tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS]; |
53 | 53 | if(keywords == null){ |
54 | 54 | // init empty token streams |
55 | 55 | for(int i=0; i< KEYWORD_LEVELS; i++){ |
56 | | - tokensBySize[i] = new KeywordsTokenStream(null,filters); |
| 56 | + tokensBySize[i] = new KeywordsTokenStream(null,filters,exactCase); |
57 | 57 | } |
58 | 58 | return; |
59 | 59 | } |
— | — | @@ -61,7 +61,7 @@ |
62 | 62 | keywordsBySize.add(new ArrayList<String>()); |
63 | 63 | // arange keywords into a list by token number |
64 | 64 | for(String k : keywords){ |
65 | | - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k).parse(); |
| 65 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,exactCase).parse(); |
66 | 66 | if(parsed.size() == 0) |
67 | 67 | continue; |
68 | 68 | else if(parsed.size() < KEYWORD_LEVELS) |
— | — | @@ -70,7 +70,7 @@ |
71 | 71 | keywordsBySize.get(KEYWORD_LEVELS-1).add(k); |
72 | 72 | } |
73 | 73 | for(int i=0; i< KEYWORD_LEVELS; i++){ |
74 | | - tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters); |
| 74 | + tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters,exactCase); |
75 | 75 | } |
76 | 76 | } |
77 | 77 | |
— | — | @@ -96,8 +96,8 @@ |
97 | 97 | protected String keyword; |
98 | 98 | protected TokenStream tokens; |
99 | 99 | |
100 | | - public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters){ |
101 | | - this.analyzer = new QueryLanguageAnalyzer(filters); |
| 100 | + public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters, boolean exactCase){ |
| 101 | + this.analyzer = new QueryLanguageAnalyzer(filters,exactCase); |
102 | 102 | this.keywords = keywords; |
103 | 103 | this.index = 0; |
104 | 104 | this.keyword = null; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java |
— | — | @@ -13,9 +13,11 @@ |
14 | 14 | */ |
15 | 15 | public class QueryLanguageAnalyzer extends LanguageAnalyzer { |
16 | 16 | static org.apache.log4j.Logger log = Logger.getLogger(QueryLanguageAnalyzer.class); |
| 17 | + protected boolean exactCase; |
17 | 18 | |
18 | | - public QueryLanguageAnalyzer(FilterFactory filters){ |
| 19 | + public QueryLanguageAnalyzer(FilterFactory filters, boolean exactCase){ |
19 | 20 | super(filters,null); |
| 21 | + this.exactCase = exactCase; |
20 | 22 | } |
21 | 23 | |
22 | 24 | /** |
— | — | @@ -23,7 +25,7 @@ |
24 | 26 | */ |
25 | 27 | @Override |
26 | 28 | public TokenStream tokenStream(String fieldName, String text) { |
27 | | - wikitokenizer = new WikiTokenizer(text); |
| 29 | + wikitokenizer = new WikiTokenizer(text,exactCase); |
28 | 30 | return super.tokenStream(fieldName,(Reader)null); |
29 | 31 | } |
30 | 32 | |
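QueryLanguageAnalyzer now stores the same flag and hands it to WikiTokenizer when a token stream is requested. A short usage sketch against the constructor and tokenStream(String, String) signature shown above; the language code, field name, and query text are illustrative.

    import org.apache.lucene.analysis.TokenStream;
    import org.wikimedia.lsearch.analyzers.FilterFactory;
    import org.wikimedia.lsearch.analyzers.QueryLanguageAnalyzer;

    // Sketch only: the boolean selects whether WikiTokenizer is built
    // with exactCase=true or with the default (lowercasing) behavior.
    public class QueryAnalyzerSketch {
        public static void main(String[] args) {
            FilterFactory filters = new FilterFactory("en");
            QueryLanguageAnalyzer lowercased = new QueryLanguageAnalyzer(filters, false);
            QueryLanguageAnalyzer exact = new QueryLanguageAnalyzer(filters, true);
            TokenStream ts = lowercased.tokenStream("contents", "Some Query Text");
            TokenStream tsExact = exact.tokenStream("contents", "Some Query Text");
        }
    }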
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java |
— | — | @@ -46,7 +46,7 @@ |
47 | 47 | |
48 | 48 | @Override |
49 | 49 | public TokenStream tokenStream(String fieldName, Reader reader) { |
50 | | - return new LowerCaseFilter(new ArrayTokenStream(categories)); |
| 50 | + return new ArrayTokenStream(categories); |
51 | 51 | } |
52 | 52 | |
53 | 53 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java |
— | — | @@ -0,0 +1,53 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +/** |
| 5 | + * Aggregate class for FilterFactory and FieldNameFactory. This class |
| 6 | + * holds what is needed to build the various fields of the index: |
| 7 | + * the field names to be used and the filters that are to be applied. |
| 8 | + * |
| 9 | + * @author rainman |
| 10 | + * |
| 11 | + */ |
| 12 | +public class FieldBuilder { |
| 13 | + public class BuilderSet{ |
| 14 | + FilterFactory filters; |
| 15 | + FieldNameFactory fields; |
| 16 | + public BuilderSet(FilterFactory filters, FieldNameFactory fields) { |
| 17 | + this.filters = filters; |
| 18 | + this.fields = fields; |
| 19 | + } |
| 20 | + public FieldNameFactory getFields() { |
| 21 | + return fields; |
| 22 | + } |
| 23 | + public FilterFactory getFilters() { |
| 24 | + return filters; |
| 25 | + } |
| 26 | + public boolean isExactCase() { |
| 27 | + return fields.isExactCase(); |
| 28 | + } |
| 29 | + } |
| 30 | + |
| 31 | + protected BuilderSet[] builders = new BuilderSet[2]; |
| 32 | + |
| 33 | + public FieldBuilder(String lang, boolean exactCase){ |
| 34 | + if(exactCase){ |
| 35 | + builders = new BuilderSet[2]; |
| 36 | + // additional exact case factory |
| 37 | + builders[1] = new BuilderSet( |
| 38 | + new FilterFactory(lang).getNoStemmerFilterFactory(), |
| 39 | + new FieldNameFactory(FieldNameFactory.EXACT_CASE)); |
| 40 | + } else |
| 41 | + builders = new BuilderSet[1]; |
| 42 | + // default factory, lowercases all data |
| 43 | + builders[0] = new BuilderSet( |
| 44 | + new FilterFactory(lang), |
| 45 | + new FieldNameFactory()); |
| 46 | + |
| 47 | + } |
| 48 | + |
| 49 | + public BuilderSet[] getBuilders() { |
| 50 | + return builders; |
| 51 | + } |
| 52 | + |
| 53 | + |
| 54 | +} |
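FieldBuilder, added above, always provides a default lowercased BuilderSet and, when exactCase is true, a second set that pairs a no-stemmer FilterFactory with exact-case field names. A minimal consumer sketch, assuming only the accessors defined in the new class; the loop body is illustrative.

    import org.wikimedia.lsearch.analyzers.FieldBuilder;
    import org.wikimedia.lsearch.analyzers.FieldNameFactory;
    import org.wikimedia.lsearch.analyzers.FilterFactory;

    // Sketch only: iterate the builder sets, one per index variant.
    public class FieldBuilderSketch {
        public static void main(String[] args) {
            FieldBuilder builder = new FieldBuilder("en", true); // exact-case wiki
            for (FieldBuilder.BuilderSet b : builder.getBuilders()) {
                FilterFactory filters = b.getFilters();   // filters for this variant
                FieldNameFactory fields = b.getFields();  // matching field names
                System.out.println("exactCase=" + b.isExactCase());
            }
        }
    }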
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -197,7 +197,11 @@ |
198 | 198 | int begin = line.indexOf("[["); |
199 | 199 | int end = line.indexOf("]]"); |
200 | 200 | if(begin != -1 && end != -1 && end > begin){ |
201 | | - return text.substring(begin+2,end); |
| 201 | + String redirectText = text.substring(begin+2,end); |
| 202 | + int fragment = redirectText.lastIndexOf('#'); |
| 203 | + if(fragment != -1) |
| 204 | + redirectText = redirectText.substring(0,fragment); |
| 205 | + return redirectText; |
202 | 206 | } |
203 | 207 | } |
204 | 208 | return null; |
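The Localization change above trims a trailing "#section" fragment from the redirect target, so a redirect pointing at a section resolves to the plain page title. The same trimming logic, isolated into a standalone sketch; the helper class is not part of the patch.

    // Sketch only: mirrors the fragment stripping added to Localization.
    public class RedirectFragmentSketch {
        static String stripFragment(String redirectText) {
            int fragment = redirectText.lastIndexOf('#');
            if (fragment != -1)
                redirectText = redirectText.substring(0, fragment);
            return redirectText;
        }
        public static void main(String[] args) {
            // prints "Hypertext Transfer Protocol"
            System.out.println(stripFragment("Hypertext Transfer Protocol#History"));
        }
    }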
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java |
— | — | @@ -42,7 +42,7 @@ |
43 | 43 | @Override |
44 | 44 | public String toString() { |
45 | 45 | try { |
46 | | - return new String(str,0,str.length,"utf-8")+", count="+links; |
| 46 | + return new String(str,0,str.length,"utf-8"); |
47 | 47 | } catch (UnsupportedEncodingException e) { |
48 | 48 | return ""; |
49 | 49 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.apache.lucene.search.BooleanQuery; |
12 | 12 | import org.apache.lucene.search.Query; |
13 | 13 | import org.wikimedia.lsearch.analyzers.Analyzers; |
| 14 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
14 | 15 | import org.wikimedia.lsearch.analyzers.WikiQueryParser; |
15 | 16 | import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy; |
16 | 17 | import org.wikimedia.lsearch.config.Configuration; |
— | — | @@ -36,8 +37,10 @@ |
37 | 38 | WikiQueryParser.ALT_TITLE_BOOST = 6; |
38 | 39 | WikiQueryParser.KEYWORD_BOOST = 0.05f; |
39 | 40 | WikiIndexModifier.ALT_TITLES = 3; |
| 41 | + WikiQueryParser.ADD_STEM_TITLE=false; |
| 42 | + FieldNameFactory ff = new FieldNameFactory(); |
40 | 43 | try{ |
41 | | - WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer()); |
| 44 | + WikiQueryParser parser = new WikiQueryParser(ff.contents(),new SimpleAnalyzer(),ff); |
42 | 45 | Query q; |
43 | 46 | HashSet<String> fields; |
44 | 47 | |
— | — | @@ -78,7 +81,7 @@ |
79 | 82 | assertEquals("+category:help +category:pleh",q.toString()); |
80 | 83 | |
81 | 84 | q = parser.parseRaw("šđčćždzñ"); |
82 | | - assertEquals("contents:sđcczdzn",q.toString()); |
| 85 | + assertEquals("contents:šđčćždzñ",q.toString()); |
83 | 86 | |
84 | 87 | q = parser.parseRaw("help:making breakfast incategory:food"); |
85 | 88 | assertEquals("+help:making +help:breakfast +category:food",q.toString()); |
— | — | @@ -112,11 +115,11 @@ |
113 | 116 | assertTrue(fields.contains("contents")); |
114 | 117 | |
115 | 118 | // namespace policies |
116 | | - parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE); |
| 119 | + parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.IGNORE); |
117 | 120 | q = parser.parseRaw("help:making breakfast incategory:food"); |
118 | 121 | assertEquals("+contents:making +contents:breakfast +category:food",q.toString()); |
119 | 122 | |
120 | | - parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE); |
| 123 | + parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE); |
121 | 124 | q = parser.parseRaw("help:making breakfast incategory:food"); |
122 | 125 | assertEquals("+namespace:12 +(+contents:making +contents:breakfast +category:food)",q.toString()); |
123 | 126 | |
— | — | @@ -138,7 +141,7 @@ |
139 | 142 | |
140 | 143 | // ====== English Analyzer ======== |
141 | 144 | |
142 | | - parser = new WikiQueryParser("contents","0",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE); |
| 145 | + parser = new WikiQueryParser(ff.contents(),"0",new EnglishAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE); |
143 | 146 | q = parser.parseRaw("main_talk:laziness"); |
144 | 147 | assertEquals("+namespace:1 +(contents:laziness contents:lazi^0.5)",q.toString()); |
145 | 148 | |
— | — | @@ -154,7 +157,7 @@ |
155 | 158 | q = parser.parse("(help:making something incategory:blah) OR (rest incategory:crest)"); |
156 | 159 | assertEquals("(+namespace:12 +(+(+(contents:making contents:make^0.5) title:making^2.0) +(+(contents:something contents:someth^0.5) title:something^2.0) +category:blah)) (+namespace:0 +(+(+contents:rest +category:crest) title:rest^2.0))",q.toString()); |
157 | 160 | |
158 | | - parser = new WikiQueryParser("contents",new EnglishAnalyzer()); |
| 161 | + parser = new WikiQueryParser(ff.contents(),new EnglishAnalyzer(),ff); |
159 | 162 | |
160 | 163 | q = parser.parseRaw("laziness"); |
161 | 164 | assertEquals("contents:laziness contents:lazi^0.5",q.toString()); |
— | — | @@ -169,7 +172,7 @@ |
170 | 173 | assertEquals("+(+(contents:beans contents:bean^0.5) +category:food) +(+contents:orchid +category:\"some flowers\")",q.toString()); |
171 | 174 | |
172 | 175 | q = parser.parseRaw("(Beans AND incategory:FOod) (orchID AND incategory:\"some FLOWERS\")"); |
173 | | - assertEquals("+(+(contents:beans contents:bean^0.5) +category:food) +(+contents:orchid +category:\"some flowers\")",q.toString()); |
| 176 | + assertEquals("+(+(contents:beans contents:bean^0.5) +category:FOod) +(+contents:orchid +category:\"some FLOWERS\")",q.toString()); |
174 | 177 | |
175 | 178 | q = parser.parse("(beans AND incategory:food) (orchid AND incategory:\"some flowers\")"); |
176 | 179 | assertEquals("+(+(+(contents:beans contents:bean^0.5) title:beans^2.0) +category:food) +(+(+contents:orchid +category:\"some flowers\") title:orchid^2.0)",q.toString()); |
— | — | @@ -204,7 +207,7 @@ |
205 | 208 | // Tests with actual params :) |
206 | 209 | // ================================== |
207 | 210 | Analyzer analyzer = Analyzers.getSearcherAnalyzer("en"); |
208 | | - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE); |
| 211 | + parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
209 | 212 | q = parser.parseTwoPass("beans everyone",null); |
210 | 213 | assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString()); |
211 | 214 | |
— | — | @@ -306,6 +309,16 @@ |
307 | 310 | |
308 | 311 | q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true); |
309 | 312 | assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString()); |
| 313 | + |
| 314 | + q = parser.parseFourPass("Israeli-Palestinian conflict",NamespacePolicy.IGNORE,true); |
| 315 | + assertEquals("(+(+(contents:israeli contents:isra^0.5) +contents:palestinian) +contents:conflict) (+(+title:israeli^2.0 +title:palestinian^2.0) +title:conflict^2.0) ((+(+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0) +alttitle1:conflict^6.0) (+(+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0) +alttitle2:conflict^6.0) (+(+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0) +alttitle3:conflict^6.0))",q.toString()); |
| 316 | + |
| 317 | + // alternative transliterations |
| 318 | + q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true); |
| 319 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) +(contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString()); |
| 320 | + |
| 321 | + q = parser.parseFourPass("Something for Gödel",NamespacePolicy.IGNORE,true); |
| 322 | + assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString()); |
310 | 323 | |
311 | 324 | // Test field extraction |
312 | 325 | HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja"); |
— | — | @@ -316,16 +329,16 @@ |
317 | 330 | |
318 | 331 | // Localization tests |
319 | 332 | analyzer = Analyzers.getSearcherAnalyzer("sr"); |
320 | | - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE); |
| 333 | + parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
321 | 334 | |
322 | 335 | q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE); |
323 | | - assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli) +(title:на^2.0 title:na) +(title:википедију^2.0 title:vikipediju))",q.toString()); |
| 336 | + assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli^0.4) +(title:на^2.0 title:na^0.4) +(title:википедију^2.0 title:vikipediju^0.4))",q.toString()); |
324 | 337 | |
325 | 338 | q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE); |
326 | 339 | assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString()); |
327 | 340 | |
328 | 341 | analyzer = Analyzers.getSearcherAnalyzer("th"); |
329 | | - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE); |
| 342 | + parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE); |
330 | 343 | |
331 | 344 | q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE); |
332 | 345 | assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^2.0 +title:ไทย^2.0)",q.toString()); |
— | — | @@ -335,7 +348,7 @@ |
336 | 349 | |
337 | 350 | // Backward compatibility for complex filters |
338 | 351 | analyzer = Analyzers.getSearcherAnalyzer("en"); |
339 | | - parser = new WikiQueryParser("contents","0,1,4,12",analyzer,NamespacePolicy.IGNORE); |
| 352 | + parser = new WikiQueryParser(ff.contents(),"0,1,4,12",analyzer,ff,NamespacePolicy.IGNORE); |
340 | 353 | |
341 | 354 | q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE); |
342 | 355 | assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString()); |
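The updated tests show the new WikiQueryParser construction: the default field name and a FieldNameFactory are passed in, so the parser can target either the lowercased or the exact-case field set. A minimal sketch following the signatures exercised in the test; the language code and query text are illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.Query;
    import org.wikimedia.lsearch.analyzers.Analyzers;
    import org.wikimedia.lsearch.analyzers.FieldNameFactory;
    import org.wikimedia.lsearch.analyzers.WikiQueryParser;

    // Sketch only: default (lowercased) field names, English searcher analyzer.
    public class QueryParserSketch {
        public static void main(String[] args) throws Exception {
            FieldNameFactory ff = new FieldNameFactory();
            Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
            WikiQueryParser parser = new WikiQueryParser(ff.contents(), "0", analyzer, ff,
                    WikiQueryParser.NamespacePolicy.LEAVE);
            Query q = parser.parseTwoPass("beans everyone", null);
            System.out.println(q.toString());
        }
    }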
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -15,7 +15,7 @@ |
16 | 16 | |
17 | 17 | public class FastWikiTokenizerTest { |
18 | 18 | public static void displayTokensForParser(String text) { |
19 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr"); |
| 19 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr",false); |
20 | 20 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
21 | 21 | for (int i = 0; i < tokens.length; i++) { |
22 | 22 | Token token = tokens[i]; |
— | — | @@ -62,6 +62,8 @@ |
63 | 63 | public static void main(String args[]) throws IOException{ |
64 | 64 | String text = "(ant) and some"; |
65 | 65 | showTokens(text); |
| 66 | + text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire"; |
| 67 | + showTokens(text); |
66 | 68 | text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]"; |
67 | 69 | showTokens(text); |
68 | 70 | text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages."; |
— | — | @@ -110,7 +112,7 @@ |
111 | 113 | for(int i=0;i<2000;i++){ |
112 | 114 | for(TestArticle article : articles){ |
113 | 115 | String text = article.content; |
114 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text); |
| 116 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,false); |
115 | 117 | parser.parse(); |
116 | 118 | } |
117 | 119 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java |
— | — | @@ -58,6 +58,6 @@ |
59 | 59 | if(streams.get(fieldName) != null) |
60 | 60 | return streams.get(fieldName); |
61 | 61 | |
62 | | - return new AliasPorterStemFilter(new WikiTokenizer(text)); |
| 62 | + return new AliasPorterStemFilter(new WikiTokenizer(text,false)); |
63 | 63 | } |
64 | 64 | } |
Index: trunk/lucene-search-2.0/build.xml |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | <jar destfile="${basedir}/${jar.name}"> |
34 | 34 | <manifest> |
35 | 35 | <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/> |
36 | | - <attribute name="Class-Path" value="${jar.name} lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar"/> |
| 36 | + <attribute name="Class-Path" value="${jar.name} lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar lib/mysql-connector-java-3.0.17-ga-bin.jar"/> |
37 | 37 | </manifest> |
38 | 38 | <zipfileset dir="${bin}" prefix=""> |
39 | 39 | <include name="org/**"/> |
— | — | @@ -57,7 +57,8 @@ |
58 | 58 | <zipfileset src="lib/lucene-core-2.0.1-dev.jar" /> |
59 | 59 | <zipfileset src="lib/lucene-analyzers.jar" /> |
60 | 60 | <zipfileset src="lib/snowball.jar" /> |
61 | | - <zipfileset src="lib/mwdumper.jar" /> |
| 61 | + <zipfileset src="lib/mwdumper.jar" /> |
| 62 | + <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" /> |
62 | 63 | </jar> |
63 | 64 | </target> |
64 | 65 | |
Index: trunk/lucene-search-2.0/webinterface/searchForm.html |
— | — | @@ -60,6 +60,7 @@ |
61 | 61 | <option value="nowiki">nowiki</option> |
62 | 62 | <option value="srwiki">srwiki</option> |
63 | 63 | <option value="enwiktionary">enwiktionary</option> |
| 64 | + <option value="enwiktionary-exact">enwiktionary-exact</option> |
64 | 65 | <!-- <option value="wikilucene">wikilucene</option> |
65 | 66 | <option value="wikidev">wikidev</option> --> |
66 | 67 | </select> |
Index: trunk/lucene-search-2.0/webinterface/lsweb.py |
— | — | @@ -5,7 +5,7 @@ |
6 | 6 | from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer |
7 | 7 | from urllib2 import URLError, HTTPError |
8 | 8 | |
9 | | -search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv80:8123' } |
| 9 | +search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv79:8123' } |
10 | 10 | |
11 | 11 | canon_namespaces = { 0 : '', 1: 'Talk', 2: 'User', 3: 'User_talk', |
12 | 12 | 4 : 'Project', 5 : 'Project_talk', 6 : 'Image', 7 : 'Image_talk', |
— | — | @@ -66,6 +66,7 @@ |
67 | 67 | limit = 20 |
68 | 68 | offset = 0 |
69 | 69 | namespaces = [] |
| 70 | + case = "ignore" |
70 | 71 | |
71 | 72 | # parameters |
72 | 73 | for key,val in params.iteritems(): |
— | — | @@ -87,9 +88,13 @@ |
88 | 89 | else: |
89 | 90 | host = search_host['<default>'] |
90 | 91 | |
| 92 | + if dbname.endswith("-exact"): |
| 93 | + case = "exact" |
| 94 | + dbname = dbname[0:-6] |
| 95 | + |
91 | 96 | # make search url for ls2 |
92 | 97 | search_url = 'http://%s/search/%s/%s' % (host,dbname,urllib.quote(rewritten.encode('utf-8'))) |
93 | | - search_params = urllib.urlencode({'limit' : limit, 'offset' : offset, 'namespaces' : ','.join(namespaces)}, True) |
| 98 | + search_params = urllib.urlencode({'limit' : limit, 'offset' : offset, 'namespaces' : ','.join(namespaces), "case" : case}, True) |
94 | 99 | |
95 | 100 | # process search results |
96 | 101 | try: |
— | — | @@ -98,6 +103,7 @@ |
99 | 104 | lasthit = min(offset+limit,numhits) |
100 | 105 | # html headers |
101 | 106 | self.send_response(200) |
| 107 | + self.send_header('Cache-Control','no-cache') |
102 | 108 | self.send_header('Content-type','text/html') |
103 | 109 | self.end_headers() |
104 | 110 | self.wfile.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>') |
— | — | @@ -154,6 +160,7 @@ |
155 | 161 | search_form = f.read() |
156 | 162 | f.close() |
157 | 163 | self.send_response(200) |
| 164 | + self.send_header('Cache-Control','no-cache') |
158 | 165 | self.send_header('Content-type','text/html') |
159 | 166 | self.end_headers() |
160 | 167 | self.wfile.write(search_form) |
Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -40,6 +40,10 @@ |
41 | 41 | # dbnames that end with the suffix will use additional keyword scores |
42 | 42 | KeywordScoring.suffix=wiki wikilucene wikidev |
43 | 43 | |
| 44 | +# suffix for databases that should also have an exact-case index built |
| 45 | +# note: this will also turn off stemming! |
| 46 | +ExactCase.suffix=wiktionary wikilucene |
| 47 | + |
44 | 48 | # Put here you custom namespace prefixes |
45 | 49 | # Syntax: <prefix_name> : <coma separated list of namespaces> |
46 | 50 | # <all> is a special keyword meaning all namespaces |
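ExactCase.suffix follows the same dbname-suffix convention as the other *.suffix options: a database such as enwiktionary matches the wiktionary suffix and therefore gets the additional exact-case index, with stemming turned off for it. Below is a hypothetical sketch of that suffix check; this is not the actual GlobalConfiguration code, only an illustration of the matching rule assumed here.

    // Sketch only: hypothetical matching of a dbname against ExactCase.suffix values.
    public class SuffixMatchSketch {
        static boolean matchesSuffix(String dbname, String[] suffixes) {
            if (suffixes == null)
                return false;
            for (String suffix : suffixes) {
                if (dbname.equals(suffix) || dbname.endsWith(suffix))
                    return true;
            }
            return false;
        }
        public static void main(String[] args) {
            String[] exactCaseSuffixes = { "wiktionary", "wikilucene" };
            System.out.println(matchesSuffix("enwiktionary", exactCaseSuffixes)); // true
            System.out.println(matchesSuffix("enwiki", exactCaseSuffixes));       // false
        }
    }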