r23013 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r23012 | r23013 | r23014 >
Date: 16:21, 15 June 2007
Author: rainman
Status: old
Tags:
Comment:
Some minor stuff:
* Handle common transliterations, e.g. ö -> oe, ü -> ue, etc.
* Support for additional exact-case fields (for Wiktionary)
Modified paths:
  • /trunk/lucene-search-2.0/build.xml (modified) (history)
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)
  • /trunk/lucene-search-2.0/webinterface/lsweb.py (modified) (history)
  • /trunk/lucene-search-2.0/webinterface/searchForm.html (modified) (history)

Diff

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -231,6 +231,8 @@
232232 terms = new WordTerms("./lib/dict/german.txt.gz");
233233 else if(lang.equals("fr"))
234234 terms = new WordTerms("./lib/dict/french.txt.gz");
 235+ else if(lang.equals("sample"))
 236+ terms = new SampleTerms();
235237 else
236238 terms = new WordTerms("./test-data/words-wikilucene.ngram.gz");
237239
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -8,6 +8,7 @@
99 import org.apache.lucene.analysis.Analyzer;
1010 import org.apache.lucene.document.Document;
1111 import org.apache.lucene.index.IndexWriter;
 12+import org.wikimedia.lsearch.analyzers.FieldBuilder;
1213 import org.wikimedia.lsearch.analyzers.FilterFactory;
1314 import org.wikimedia.lsearch.beans.Article;
1415 import org.wikimedia.lsearch.beans.IndexReportCard;
@@ -27,7 +28,7 @@
2829 static Logger log = Logger.getLogger(SimpleIndexWriter.class);
2930 protected IndexId iid;
3031 protected HashMap<String,IndexWriter> indexes;
31 - protected FilterFactory filters;
 32+ protected FieldBuilder builder;
3233 protected Boolean optimize;
3334 protected Integer mergeFactor, maxBufDocs;
3435 protected boolean newIndex;
@@ -39,8 +40,9 @@
4041 this.mergeFactor = mergeFactor;
4142 this.maxBufDocs = maxBufDocs;
4243 this.newIndex = newIndex;
43 - langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
44 - filters = new FilterFactory(langCode);
 44+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 45+ langCode = global.getLanguage(iid.getDBname());
 46+ builder = new FieldBuilder(langCode,global.exactCaseIndex(iid.getDBname()));
4547 indexes = new HashMap<String,IndexWriter>();
4648 // open all relevant indexes
4749 if(iid.isSingle())
@@ -106,7 +108,7 @@
107109 IndexWriter writer = indexes.get(target.toString());
108110 if(writer == null)
109111 return;
110 - Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters,iid);
 112+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,builder,iid);
111113 Document doc = (Document) ret[0];
112114 Analyzer analyzer = (Analyzer) ret[1];
113115 try {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -134,7 +134,6 @@
135135 System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
136136 System.out.println("Total time: "+formatTime(finalEnd-start));
137137 }
138 -
139138 // make snapshot if needed
140139 if(makeSnapshot || snapshotDb){
141140 IndexId iid = IndexId.get(dbname);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -69,6 +69,8 @@
7070 protected String[] databaseSuffixes = null;
7171 /** Databases ending in suffix will use additional keyword scores */
7272 protected String[] keywordScoringSuffixes = null;
 73+ /** Databases ending in suffix will have 2 indexes, one with lowercased words, and one with exact case words */
 74+ protected String[] exactCaseSuffix = null;
7375
7476 protected Properties globalProperties = null;
7577
@@ -290,6 +292,7 @@
291293 // get some predifined global properties
292294 this.databaseSuffixes = getArrayProperty("Database.suffix");
293295 this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
 296+ this.exactCaseSuffix = getArrayProperty("ExactCase.suffix");
294297 if(line == null)
295298 break;
296299 // else: line points to beginning of next section
@@ -457,6 +460,7 @@
458461 mySearch,
459462 oairepo);
460463 indexIdPool.put(dbrole,iid);
 464+
461465 }
462466 if(indexIdPool.get(dbname).isNssplit())
463467 indexIdPool.get(dbname).rebuildNsMap(indexIdPool);
@@ -831,17 +835,12 @@
832836 return namespacePrefixAll;
833837 }
834838
835 - /** Returns if keyword scoring should be used for this db, using
836 - * the suffixes from the global configuration
837 - *
838 - * @param dbname
839 - * @return
840 - */
841 - public boolean useKeywordScoring(String dbname){
842 - if(keywordScoringSuffixes == null)
 839+ /** Check wether dbname has some of the suffixes */
 840+ protected boolean checkSuffix(String[] suffixes, String dbname){
 841+ if(suffixes == null)
843842 return false;
844843 else{
845 - for (String suffix : keywordScoringSuffixes) {
 844+ for (String suffix : suffixes) {
846845 if (dbname.endsWith(suffix))
847846 return true;
848847 }
@@ -849,6 +848,25 @@
850849 return false;
851850 }
852851
 852+ /** Returns if keyword scoring should be used for this db, using
 853+ * the suffixes from the global configuration
 854+ *
 855+ * @param dbname
 856+ * @return
 857+ */
 858+ public boolean useKeywordScoring(String dbname){
 859+ return checkSuffix(keywordScoringSuffixes,dbname);
 860+ }
853861
 862+ /**
 863+ * If this dbname is assigned an exact-case additional index.
 864+ *
 865+ * @param dbname
 866+ * @return
 867+ */
 868+ public boolean exactCaseIndex(String dbname){
 869+ return checkSuffix(exactCaseSuffix,dbname);
 870+ }
 871+
854872
855873 }
\ No newline at end of file
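
For context, a minimal sketch of how the new knob is wired (the property key ExactCase.suffix comes from the hunk above; the suffix value is a hypothetical example for lsearch-global.conf):

    # lsearch-global.conf
    ExactCase.suffix=wiktionary

    GlobalConfiguration global = GlobalConfiguration.getInstance();
    global.exactCaseIndex("enwiktionary"); // true: dbname ends in "wiktionary"
    global.exactCaseIndex("enwiki");       // false: lowercased index only

checkSuffix() now backs both useKeywordScoring() and exactCaseIndex(), so any further suffix-keyed flags can share the same matching code.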
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -19,6 +19,7 @@
2020 import org.apache.lucene.search.Searcher;
2121 import org.apache.lucene.search.TopDocs;
2222 import org.wikimedia.lsearch.analyzers.Analyzers;
 23+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
2324 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
2425 import org.wikimedia.lsearch.beans.ResultSet;
2526 import org.wikimedia.lsearch.beans.SearchResults;
@@ -41,20 +42,28 @@
4243
4344 protected final int maxlines = 1000;
4445 protected final int maxoffset = 10000;
 46+ protected static GlobalConfiguration global = null;
4547
 48+ public SearchEngine(){
 49+ if(global == null)
 50+ global = GlobalConfiguration.getInstance();
 51+ }
 52+
4653 /** Main search method, call this from the search frontend */
4754 public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) {
4855
4956 if (what.equals("titlematch")) {
5057 // TODO: return searchTitles(searchterm);
5158 } else if (what.equals("search") || what.equals("explain")) {
52 - int offset = 0, limit = 100;
 59+ int offset = 0, limit = 100; boolean exactCase = false;
5360 if (query.containsKey("offset"))
5461 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
5562 if (query.containsKey("limit"))
5663 limit = Math.min(Integer.parseInt((String)query.get("limit")), maxlines);
 64+ if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
 65+ exactCase = true;
5766 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
58 - SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"));
 67+ SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
5968 if(res!=null && res.isRetry()){
6069 int retries = 0;
6170 if(iid.isSplit() || iid.isNssplit()){
@@ -63,7 +72,7 @@
6473 retries = 1;
6574
6675 while(retries > 0 && res.isRetry()){
67 - res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"));
 76+ res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"), exactCase);
6877 retries--;
6978 }
7079 if(res.isRetry())
@@ -108,11 +117,12 @@
109118 * Search on iid, with query searchterm. View results from offset to offset+limit, using
110119 * the default namespaces filter
111120 */
112 - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain){
113 - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
 121+ public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault, boolean explain, boolean exactCase){
 122+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
114123 if(nsDefault == null || nsDefault.cardinality() == 0)
115124 nsDefault = new NamespaceFilter("0"); // default to main namespace
116 - WikiQueryParser parser = new WikiQueryParser("contents",nsDefault,analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
 125+ FieldNameFactory ff = new FieldNameFactory(exactCase);
 126+ WikiQueryParser parser = new WikiQueryParser(ff.contents(),nsDefault,analyzer,ff,WikiQueryParser.NamespacePolicy.IGNORE);
117127 HashSet<NamespaceFilter> fields = parser.getFieldNamespaces(searchterm);
118128 NamespaceFilterWrapper nsfw = null;
119129 Query q = null;
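
The switch is driven by a new "case" key in the query hash, honoured only when the target database actually has an exact-case index. A hypothetical frontend request (the URL shape is an assumption based on the lsweb.py frontend, which this commit also touches; only the case=exact parameter comes from the code above):

    search/enwiktionary/G%C3%B6del?limit=20&case=exact

For every other database the parameter is silently ignored and the lowercased index is searched as before.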
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -10,6 +10,7 @@
1111 import org.apache.lucene.search.Query;
1212 import org.apache.lucene.search.TermQuery;
1313 import org.wikimedia.lsearch.analyzers.Analyzers;
 14+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
1415 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
1516 import org.wikimedia.lsearch.benchmark.Terms;
1617 import org.wikimedia.lsearch.benchmark.WordTerms;
@@ -60,7 +61,8 @@
6162
6263 /** Warmup index using some number of simple searches */
6364 protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
64 - WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 65+ FieldNameFactory fields = new FieldNameFactory();
 66+ WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE);
6567 Terms terms = getTermsForLang(global.getLanguage(iid.getDBname()));
6668
6769 try{
@@ -116,7 +118,8 @@
117119 /** Just run one complex query and rebuild the main namespace filter */
118120 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
119121 try{
120 - WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 122+ FieldNameFactory fields = new FieldNameFactory();
 123+ WikiQueryParser parser = new WikiQueryParser(fields.contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),fields,WikiQueryParser.NamespacePolicy.IGNORE);
121124 Query q = parser.parseFourPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
122125 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
123126 } catch (IOException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/highlight/HighlightDaemon.java
@@ -23,6 +23,7 @@
2424 import org.apache.lucene.search.highlight.TextFragment;
2525 import org.wikimedia.lsearch.analyzers.Analyzers;
2626 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 27+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
2728 import org.wikimedia.lsearch.analyzers.FilterFactory;
2829 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
2930 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
@@ -121,10 +122,13 @@
122123 }
123124
124125 // highlight all articles and return results
125 - String lang = GlobalConfiguration.getInstance().getLanguage(dbname);
126 - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
127 - WikiQueryParser parser = new WikiQueryParser("contents",
128 - new NamespaceFilter("0"),analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
 126+ GlobalConfiguration global = GlobalConfiguration.getInstance();
 127+ boolean exactCase = global.exactCaseIndex(iid.getDBname());
 128+ String lang = global.getLanguage(dbname);
 129+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
 130+ FieldNameFactory fields = new FieldNameFactory(exactCase);
 131+ WikiQueryParser parser = new WikiQueryParser(fields.contents(),
 132+ new NamespaceFilter("0"),analyzer,fields,WikiQueryParser.NamespacePolicy.IGNORE);
129133 Query q = parser.parseFourPass(query,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
130134 Scorer scorer = new QueryScorer(q);
131135 SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"searchmatch\">","</span>");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -28,6 +28,8 @@
2929 import org.apache.lucene.store.FSDirectory;
3030 import org.wikimedia.lsearch.analyzers.Analyzers;
3131 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 32+import org.wikimedia.lsearch.analyzers.FieldBuilder;
 33+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
3234 import org.wikimedia.lsearch.analyzers.FilterFactory;
3335 import org.wikimedia.lsearch.analyzers.KeywordsAnalyzer;
3436 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
@@ -66,6 +68,7 @@
6769 protected IndexWriter writer;
6870 protected boolean rewrite;
6971 protected String langCode;
 72+ protected boolean exactCase;
7073
7174 protected HashSet<IndexUpdateRecord> nonDeleteDocuments;
7275
@@ -81,10 +84,11 @@
8285 * @param analyzer
8386 * @param rewrite - if true, will create new index
8487 */
85 - SimpleIndexModifier(IndexId iid, String langCode, boolean rewrite){
 88+ SimpleIndexModifier(IndexId iid, String langCode, boolean rewrite, boolean exactCase){
8689 this.iid = iid;
8790 this.rewrite = rewrite;
8891 this.langCode = langCode;
 92+ this.exactCase = exactCase;
8993 reportQueue = new Hashtable<IndexUpdateRecord,IndexReportCard>();
9094 }
9195
@@ -175,16 +179,16 @@
176180 writer.setUseCompoundFile(true);
177181 writer.setMaxFieldLength(MAX_FIELD_LENGTH);
178182
179 - FilterFactory filters = new FilterFactory(langCode);
 183+ FieldBuilder builder = new FieldBuilder(langCode,exactCase);
180184
181185 for(IndexUpdateRecord rec : records){
182186 if(rec.doAdd()){
183187 if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec))
184188 continue; // don't add if delete/add are paired operations
185189 if(!checkPreconditions(rec))
186 - continue; // article shoouldn't be added for some (heuristic) reason
 190+ continue; // article shouldn't be added for some reason
187191 IndexReportCard card = getReportCard(rec);
188 - Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters,iid);
 192+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),builder,iid);
189193 Document doc = (Document) ret[0];
190194 Analyzer analyzer = (Analyzer) ret[1];
191195 try {
@@ -244,7 +248,7 @@
245249
246250 /**
247251 * Generate the articles transient characterstics needed only for indexing,
248 - * i.e. list of redirect keywords and Page Rank.
 252+ * i.e. list of redirect keywords and article rank.
249253 *
250254 * @param article
251255 */
@@ -345,7 +349,7 @@
346350 long now = System.currentTimeMillis();
347351 log.info("Starting update of "+updateRecords.size()+" records on "+iid+", started at "+now);
348352
349 - SimpleIndexModifier modifier = new SimpleIndexModifier(iid,global.getLanguage(iid.getDBname()),false);
 353+ SimpleIndexModifier modifier = new SimpleIndexModifier(iid,global.getLanguage(iid.getDBname()),false,global.exactCaseIndex(iid.getDBname()));
350354
351355 Transaction trans = new Transaction(iid);
352356 trans.begin();
@@ -398,60 +402,66 @@
399403 * @param languageAnalyzer
400404 * @return array { document, analyzer }
401405 */
402 - public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters, IndexId iid){
 406+ public static Object[] makeDocumentAndAnalyzer(Article article, FieldBuilder builder, IndexId iid){
403407 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
404408 WikiTokenizer tokenizer = null;
405409 Document doc = new Document();
406410
407411 // tranform record so that unnecessary stuff is deleted, e.g. some redirects
408 - transformArticleForIndexing(article);
 412+ transformArticleForIndexing(article);
409413
410414 // This will be used to look up and replace entries on index updates.
411415 doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
412 -
 416+
413417 // These fields are returned with results
414418 doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
415419
416 - // boost document title with it's article rank
417 - Field title = new Field("title", article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
418 - //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
419 - float rankBoost = calculateArticleRank(article.getRank());
420 - title.setBoost(rankBoost);
421 - doc.add(title);
422 -
423 - Field stemtitle = new Field("stemtitle", article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED);
424 - //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
425 - stemtitle.setBoost(rankBoost);
426 - doc.add(stemtitle);
427 -
428 - // put the best redirects as alternative titles
429 - makeAltTitles(doc,"alttitle",article);
430 -
431 - // add titles of redirects, generated from analyzer
432 - makeKeywordField(doc,"redirect",rankBoost);
433 -
434 - if(checkKeywordPreconditions(article,iid))
435 - // most significat words in the text, gets extra score, from analyzer
436 - makeKeywordField(doc,"keyword",rankBoost);
437 -
438 - // the next fields are generated using wikitokenizer
439 - doc.add(new Field("contents", "",
440 - Field.Store.NO, Field.Index.TOKENIZED));
441 -
442420 // each token is one category (category names themself are not tokenized)
443421 doc.add(new Field("category", "",
444422 Field.Store.NO, Field.Index.TOKENIZED));
 423+
 424+ for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
 425+ FieldNameFactory fields = bs.getFields();
 426+ // boost document title with it's article rank
 427+ Field title = new Field(fields.title(), article.getTitle(),Field.Store.YES, Field.Index.TOKENIZED);
 428+ //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 429+ float rankBoost = calculateArticleRank(article.getRank());
 430+ title.setBoost(rankBoost);
 431+ doc.add(title);
445432
 433+ Field stemtitle = new Field(fields.stemtitle(), article.getTitle(),Field.Store.NO, Field.Index.TOKENIZED);
 434+ //log.info(article.getNamespace()+":"+article.getTitle()+" has rank "+article.getRank()+" and redirect: "+((article.getRedirects()==null)? "" : article.getRedirects().size()));
 435+ stemtitle.setBoost(rankBoost);
 436+ doc.add(stemtitle);
 437+
 438+ // put the best redirects as alternative titles
 439+ makeAltTitles(doc,fields.alttitle(),article);
 440+
 441+ // add titles of redirects, generated from analyzer
 442+ makeKeywordField(doc,fields.redirect(),rankBoost);
 443+
 444+ if(checkKeywordPreconditions(article,iid))
 445+ // most significat words in the text, gets extra score, from analyzer
 446+ makeKeywordField(doc,fields.keyword(),rankBoost);
 447+
 448+ // the next fields are generated using wikitokenizer
 449+ doc.add(new Field(fields.contents(), "",
 450+ Field.Store.NO, Field.Index.TOKENIZED));
 451+
 452+ // set boost for keyword field
 453+ // tokenizer = (WikiTokenizer) ret[1];
 454+ // keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size()));
 455+ }
 456+ // make analyzer
 457+ if(article.getTitle().equalsIgnoreCase("wiki")){
 458+ int b =10;
 459+ b++;
 460+ }
446461 String text = article.getContents();
447 - if(article.isRedirect())
448 - text=""; // for redirects index only the title
449 - Object[] ret = Analyzers.getIndexerAnalyzer(text,filters,article.getRedirectKeywords());
 462+ Object[] ret = Analyzers.getIndexerAnalyzer(text,builder,article.getRedirectKeywords());
450463 perFieldAnalyzer = (PerFieldAnalyzerWrapper) ret[0];
 464+
451465
452 - // set boost for keyword field
453 - // tokenizer = (WikiTokenizer) ret[1];
454 - // keyword.setBoost(calculateKeywordsBoost(tokenizer.getTokens().size()));
455 -
456466 return new Object[] { doc, perFieldAnalyzer };
457467 }
458468
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -36,8 +36,7 @@
3737 return f;
3838 }
3939 } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){
40 - //float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
41 - float f = (float) (1.0 / numTokens);
 40+ float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
4241 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4342 return f;
4443 } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){
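
This reinstates the steeper of the two commented variants for title fields: for a four-token title the length norm becomes 1/(sqrt(4)*4) = 0.125 rather than the linear 1/4 = 0.25, so long titles are penalized superlinearly relative to short ones.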
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -35,12 +35,12 @@
3636 *
3737 * @param str
3838 */
39 - public WikiTokenizer(String str){
40 - this(str,null);
 39+ public WikiTokenizer(String str, boolean exactCase){
 40+ this(str,null,exactCase);
4141 }
4242
43 - public WikiTokenizer(String str, String lang){
44 - parser = new FastWikiTokenizerEngine(str,lang);
 43+ public WikiTokenizer(String str, String lang, boolean exactCase){
 44+ parser = new FastWikiTokenizerEngine(str,lang,exactCase);
4545 this.input = null;
4646 }
4747
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -27,7 +27,9 @@
2828 */
2929 public class FastWikiTokenizerEngine {
3030 private static final int MAX_WORD_LEN = 255;
31 - private final char[] buffer = new char[MAX_WORD_LEN+1];
 31+ private final char[] buffer = new char[MAX_WORD_LEN]; // buffer of text, e.g. gödel
 32+ private final char[] aliasBuffer = new char[MAX_WORD_LEN]; // buffer for aliases, e.g. goedel
 33+ private final char[] decompBuffer = new char[MAX_WORD_LEN]; // buffer for dedomposed text e.g. godel
3234 private static final int IO_BUFFER_SIZE = 1024;
3335 private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
3436 private char[] text;
@@ -37,6 +39,7 @@
3840 protected ArrayList<String> categories;
3941 protected HashMap<String,String> interwikis;
4042 protected HashSet<String> keywords;
 43+ private int decompLength=0, aliasLength=0;
4144 private int length = 0; // length of token
4245 private int start = 0; // start position of token
4346 private int cur = 0; // current position in the input string
@@ -68,6 +71,8 @@
6972 private static Hashtable<String,HashSet<String>> categoryLocalized = new Hashtable<String,HashSet<String>>();
7073 private static HashSet<String> interwiki;
7174
 75+ /** if true, words won't be lowercased */
 76+ private boolean exactCase = false;
7277 private UnicodeDecomposer decomposer;
7378
7479 enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD,
@@ -104,14 +109,15 @@
105110 }
106111 }
107112
108 - public FastWikiTokenizerEngine(String text){
109 - this(text,null);
 113+ public FastWikiTokenizerEngine(String text, boolean exactCase){
 114+ this(text,null,exactCase);
110115 }
111116
112 - public FastWikiTokenizerEngine(String text, String lang){
 117+ public FastWikiTokenizerEngine(String text, String lang, boolean exactCase){
113118 this.text = text.toCharArray();
114119 this.textString = text;
115120 this.language = lang;
 121+ this.exactCase = exactCase;
116122 textLength = text.length();
117123 init();
118124 }
@@ -125,23 +131,112 @@
126132 return decomposer.decompose(c);
127133 }
128134
 135+ /** Add transliteration to token alias, create alias if it doesn't exist */
 136+ private final void addToTokenAlias(String transliteration) {
 137+ if(aliasLength == 0){
 138+ System.arraycopy(decompBuffer,0,aliasBuffer,0,decompLength);
 139+ aliasLength = decompLength;
 140+ }
 141+ for(char cc : transliteration.toCharArray())
 142+ if(aliasLength < aliasBuffer.length)
 143+ aliasBuffer[aliasLength++] = cc;
 144+ }
 145+
129146 /**
130147 * This function is called at word boundaries, it is used to
131148 * make a new token and add it to token stream
 149+ *
 150+ * Does unicode decomposition, and will make alias token with
 151+ * alternative transliterations (e.g. ö -> oe)
132152 */
133153 private final void addToken(){
134154 if(length!=0){
135155 if(numberToken && (buffer[length-1]=='.' ||buffer[length-1]==','))
136156 length--; // strip trailing . and , in numbers
137 - tokens.add(new Token(
138 - new String(buffer, 0, length), start, start + length));
 157+ // decompose token, maintain alias if needed
 158+ decompLength = 0;
 159+ aliasLength = 0;
 160+ boolean addToAlias;
 161+ for(int i=0;i<length;i++){
 162+ addToAlias = true;
 163+ if( ! exactCase )
 164+ cl = Character.toLowerCase(buffer[i]);
 165+ else{
 166+ cl = buffer[i];
 167+ // check additional (uppercase) character aliases
 168+ if(cl == 'Ä' ){
 169+ addToTokenAlias("Ae");
 170+ addToAlias = false;
 171+ } else if(cl == 'Ö'){
 172+ addToTokenAlias("Oe");
 173+ addToAlias = false;
 174+ } else if(cl == 'Ü'){
 175+ addToTokenAlias("Ue");
 176+ addToAlias = false;
 177+ } else if(cl == 'Ñ'){
 178+ addToTokenAlias("Nh");
 179+ addToAlias = false;
 180+ } else if(cl == 'Å'){
 181+ addToTokenAlias("Aa");
 182+ addToAlias = false;
 183+ }
 184+ }
 185+ // special alias transliterations ä -> ae, etc ...
 186+ if(cl == 'ä' ){
 187+ addToTokenAlias("ae");
 188+ addToAlias = false;
 189+ } else if(cl == 'ö'){
 190+ addToTokenAlias("oe");
 191+ addToAlias = false;
 192+ } else if(cl == 'ü'){
 193+ addToTokenAlias("ue");
 194+ addToAlias = false;
 195+ } else if(cl == 'ß'){
 196+ addToTokenAlias("ss");
 197+ addToAlias = false;
 198+ } else if(cl == 'ñ'){
 199+ addToTokenAlias("nh");
 200+ addToAlias = false;
 201+ } else if(cl == 'å'){
 202+ addToTokenAlias("aa");
 203+ addToAlias = false;
 204+ }
 205+
 206+ decomp = decompose(cl);
 207+ // no decomposition
 208+ if(decomp == null){
 209+ if(decompLength<decompBuffer.length)
 210+ decompBuffer[decompLength++] = cl;
 211+ if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length)
 212+ aliasBuffer[aliasLength++] = cl;
 213+ } else{
 214+ for(decompi = 0; decompi < decomp.length; decompi++){
 215+ if(decompLength<decompBuffer.length)
 216+ decompBuffer[decompLength++] = decomp[decompi];
 217+ if(addToAlias && aliasLength!=0 && aliasLength<aliasBuffer.length)
 218+ aliasBuffer[aliasLength++] = decomp[decompi];
 219+ }
 220+ }
 221+ }
 222+ // add decomposed token to stream
 223+ if(decompLength!=0)
 224+ tokens.add(new Token(
 225+ new String(decompBuffer, 0, decompLength), start, start + length));
 226+ // add alias (if any) token to stream
 227+ if(aliasLength!=0){
 228+ Token t = new Token(
 229+ new String(aliasBuffer, 0, aliasLength), start, start + length);
 230+ t.setPositionIncrement(0);
 231+ t.setType("transliteration");
 232+ tokens.add(t);
 233+ }
139234 length = 0;
140235 numberToken = false;
141236 if(templateLevel == 0)
142237 keywordTokens++;
143238 }
144239 }
145 -
 240+
146241 /**
147242 * Tries to add the current letter (variable c) to the
148243 * buffer, if it's not a letter, new token is created
@@ -156,19 +251,9 @@
157252 if(length == 0)
158253 start = cur;
159254
160 - cl = Character.toLowerCase(c);
161 - decomp = decompose(cl);
162 - if(decomp == null){
163 - if(length<buffer.length)
164 - buffer[length++] = cl;
165 - }
166 - else{
167 - for(decompi = 0; decompi < decomp.length; decompi++){
168 - if(length<buffer.length)
169 - buffer[length++] = decomp[decompi];
170 - }
171 - }
172 - // add digits
 255+ if(length < buffer.length)
 256+ buffer[length++] = c;
 257+ // add digits
173258 } else if(Character.isDigit(c)){
174259 if(length == 0)
175260 start = cur;
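
A sketch of the token stream this produces (the class and parse() method are from this diff; the exact output is an expectation, not a test from this commit):

    ArrayList<Token> tokens = new FastWikiTokenizerEngine("Gödel", false).parse();
    // tokens.get(0): "godel"  (lowercased, decomposed: ö -> o)
    // tokens.get(1): "goedel" (alias: ö -> oe), positionIncrement = 0,
    //                token type "transliteration"

Because the alias sits at position increment 0, the query parser (see the WikiQueryParser hunk below) can treat it as an alternative spelling of the same word rather than a separate term.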
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -83,6 +83,8 @@
8484 public static float ALT_TITLE_ALIAS_BOOST = 0.4f;
8585 public static float KEYWORD_BOOST = 0.02f;
8686
 87+ public static boolean ADD_STEM_TITLE = true;
 88+
8789 /** Policies in treating field names:
8890 *
8991 * LEAVE - don't mess with field rewriting
@@ -103,14 +105,11 @@
104106 private NamespacePolicy namespacePolicy;
105107 protected NamespaceFilter defaultNamespaceFilter;
106108 protected static GlobalConfiguration global=null;
 109+ protected FieldNameFactory fields;
107110
108111 /** default value for boolean queries */
109112 public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST;
110113
111 - private UnicodeDecomposer decomposer;
112 - private char[] decomp; // unicode decomposition letters
113 - private int decompi;
114 -
115114 /** Init namespace queries */
116115 protected void initNamespaces(){
117116 if(namespaceQueries != null)
@@ -131,8 +130,8 @@
132131 * @param field default field name
133132 * @param analyzer
134133 */
135 - public WikiQueryParser(String field, Analyzer analyzer){
136 - this(field,(NamespaceFilter)null,analyzer,NamespacePolicy.LEAVE);
 134+ public WikiQueryParser(String field, Analyzer analyzer, FieldNameFactory fields){
 135+ this(field,(NamespaceFilter)null,analyzer,fields,NamespacePolicy.LEAVE);
137136 }
138137
139138 /**
@@ -143,14 +142,14 @@
144143 * @param analyzer
145144 * @param nsPolicy
146145 */
147 - public WikiQueryParser(String field, String namespace, Analyzer analyzer, NamespacePolicy nsPolicy){
148 - this(field,new NamespaceFilter(namespace),analyzer,nsPolicy);
 146+ public WikiQueryParser(String field, String namespace, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){
 147+ this(field,new NamespaceFilter(namespace),analyzer,fields,nsPolicy);
149148 }
150149
151 - public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, NamespacePolicy nsPolicy){
 150+ public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, FieldNameFactory fields, NamespacePolicy nsPolicy){
152151 defaultField = field;
153152 this.analyzer = analyzer;
154 - decomposer = UnicodeDecomposer.getInstance();
 153+ this.fields = fields;
155154 tokens = new ArrayList<Token>();
156155 this.namespacePolicy = nsPolicy;
157156 disableTitleAliases = true;
@@ -284,8 +283,6 @@
285284 /**
286285 * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>)
287286 *
288 - * Similar to <code>FastWikiTokenizerEngine</code>, automatically
289 - * normalizes (strip accents) and lowercases the words
290287 * @return type of the token in buffer
291288 */
292289 private TokenType fetchToken(){
@@ -298,14 +295,7 @@
299296
300297 // pluses and minuses, underscores can be within words, *,? are for wildcard queries
301298 if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*' || ch=='?'){
302 - // unicode normalization -> delete accents
303 - decomp = decomposer.decompose(ch);
304 - if(decomp == null)
305 - buffer[length++] = ch;
306 - else{
307 - for(decompi = 0; decompi < decomp.length; decompi++)
308 - buffer[length++] = decomp[decompi];
309 - }
 299+ buffer[length++] = ch;
310300 } else{
311301 cur--; // position before the nonletter character
312302 break;
@@ -373,11 +363,11 @@
374364 cur = prev_cur;
375365 }
376366
377 - /** make <code>tokenStream</code> from lowercased <code>buffer</code> via analyzer */
 367+ /** make <code>tokenStream</code> from <code>buffer</code> via analyzer */
378368 private void analyzeBuffer(){
379369 String analysisField = defaultField;
380370 tokenStream = analyzer.tokenStream(analysisField,
381 - new String(buffer,0,length).toLowerCase());
 371+ new String(buffer,0,length));
382372
383373 Token token;
384374 tokens.clear();
@@ -404,15 +394,15 @@
405395 /** Make a lucene term from string */
406396 private Term makeTerm(String t){
407397 if(field == null)
408 - return new Term(defaultField,t.toLowerCase());
 398+ return new Term(defaultField,t);
409399 else if(!field.equals("incategory") &&
410400 (namespacePolicy == NamespacePolicy.IGNORE ||
411401 namespacePolicy == NamespacePolicy.REWRITE))
412 - return new Term(defaultField,t.toLowerCase());
 402+ return new Term(defaultField,t);
413403 else if(field.equals("incategory"))
414 - return new Term("category",t.toLowerCase());
 404+ return new Term("category",t);
415405 else
416 - return new Term(field,t.toLowerCase());
 406+ return new Term(field,t);
417407 }
418408
419409 /** Parses a phrase query (i.e. between ""), the cur
@@ -673,7 +663,7 @@
674664 // check for wildcard seaches, they are also not analyzed/stemmed, only for titles
675665 // wildcard signs are allowed only at the end of the word, minimum one letter word
676666 if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?') &&
677 - defaultField.equals("title")){
 667+ defaultField.equals(fields.title())){
678668 Query ret = new WildcardQuery(makeTerm());
679669 ret.setBoost(defaultBoost);
680670 return ret;
@@ -706,6 +696,21 @@
707697 t = new TermQuery(makeTerm(token));
708698 t.setBoost(defaultAliasBoost*defaultBoost);
709699 cur.add(t,aliasOccur);
 700+ } else if (token.type().equals("transliteration")){
 701+ // if not in nested query make one
 702+ if(cur == bq && (i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0){
 703+ t = new TermQuery(makeTerm(token));
 704+ t.setBoost(defaultBoost);
 705+ cur = new BooleanQuery();
 706+ cur.add(t,BooleanClause.Occur.SHOULD);
 707+ bq.add(cur,boolDefault);
 708+ continue;
 709+ } else{
 710+ // alternative transliteration
 711+ t = new TermQuery(makeTerm(token));
 712+ t.setBoost(defaultBoost);
 713+ cur.add(t,aliasOccur);
 714+ }
710715 }
711716 if( cur != bq) // returned from nested query
712717 cur = bq;
@@ -715,7 +720,7 @@
716721 if(tokens.size() > 2 && (i+1) < tokens.size() && tokens.get(i+1).getPositionIncrement()==0){
717722 // make nested query. this is needed when single word is tokenized
718723 // into many words of which they all have aliases
719 - // e.g. anti-hero => anti stemmed:anti hero stemmed:hero
 724+ // e.g. anti-hero => anti hero
720725 cur = new BooleanQuery();
721726 cur.add(t,BooleanClause.Occur.SHOULD);
722727 bq.add(cur,boolDefault);
@@ -776,7 +781,7 @@
777782 Term term = tq.getTerm();
778783 if(term.field().equals(defaultField)){
779784 TermQuery tq2 = new TermQuery(
780 - new Term("title",term.text()));
 785+ new Term(fields.title(),term.text()));
781786 tq2.setBoost(tq.getBoost()*TITLE_BOOST);
782787
783788 return tq2;
@@ -792,7 +797,7 @@
793798 Term[] terms = pq.getTerms();
794799 if(terms.length > 0 && terms[0].field().equals(defaultField)){
795800 for(int j=0;j<terms.length;j++){
796 - pq2.add(new Term("title",terms[j].text()));
 801+ pq2.add(new Term(fields.title(),terms[j].text()));
797802 }
798803 pq2.setBoost(pq.getBoost()*TITLE_BOOST);
799804
@@ -999,7 +1004,8 @@
10001005 snq.setBoost(boost);
10011006 spans.add(snq);
10021007 }
1003 - }
 1008+ } else // nested boolean or wildcard query
 1009+ return null;
10041010 }
10051011 // create the queries
10061012 Query cat = null;
@@ -1059,7 +1065,7 @@
10601066 defaultBoost = ALT_TITLE_BOOST;
10611067 defaultAliasBoost = ALT_TITLE_ALIAS_BOOST;
10621068 for(int i=1;i<=WikiIndexModifier.ALT_TITLES;i++){
1063 - defaultField = "alttitle"+i;
 1069+ defaultField = fields.alttitle()+i;
10641070 Query q = parseRaw(queryText);
10651071 if(q != null)
10661072 bq.add(q,BooleanClause.Occur.SHOULD);
@@ -1069,10 +1075,11 @@
10701076 defaultBoost = olfDefaultBoost;
10711077 defaultAliasBoost = ALIAS_BOOST;
10721078
1073 - Query qs = multiplySpans(qt,0,"redirect",REDIRECT_BOOST);
 1079+ BooleanQuery qs = multiplySpans(qt,0,fields.redirect(),REDIRECT_BOOST);
10741080 // merge queries
10751081 if(qs != null){
1076 - bq.add(qs,BooleanClause.Occur.SHOULD);
 1082+ for(BooleanClause bc : qs.getClauses())
 1083+ bq.add(bc);
10771084 }
10781085 if(bq.getClauses() == null || bq.getClauses().length==0)
10791086 return null;
@@ -1085,15 +1092,18 @@
10861093 protected Query makeTitleQuery(String queryText) {
10871094 String contentField = defaultField;
10881095 float olfDefaultBoost = defaultBoost;
1089 - defaultField = "title"; // now parse the title part
 1096+ defaultField = fields.title(); // now parse the title part
10901097 defaultBoost = TITLE_BOOST;
10911098 defaultAliasBoost = TITLE_ALIAS_BOOST;
10921099 Query qt = parseRaw(queryText);
 1100+ Query qs = null;
10931101 // stemmed title
1094 - defaultField = "stemtitle";
1095 - defaultBoost = STEM_TITLE_BOOST;
1096 - defaultAliasBoost = STEM_TITLE_ALIAS_BOOST;
1097 - Query qs = parseRaw(queryText);
 1102+ if(ADD_STEM_TITLE){
 1103+ defaultField = fields.stemtitle();
 1104+ defaultBoost = STEM_TITLE_BOOST;
 1105+ defaultAliasBoost = STEM_TITLE_ALIAS_BOOST;
 1106+ qs = parseRaw(queryText);
 1107+ }
10981108 // pop stack
10991109 defaultField = contentField;
11001110 defaultBoost = olfDefaultBoost;
@@ -1138,7 +1148,7 @@
11391149 Query nostem = null;
11401150 if(makeRedirect || makeKeywords){
11411151 String contentField = defaultField;
1142 - defaultField = "keyword"; // this field is never stemmed
 1152+ defaultField = fields.keyword(); // this field is never stemmed
11431153 nostem = parseRaw(queryText);
11441154 defaultField = contentField;
11451155 }
@@ -1151,7 +1161,7 @@
11521162 }
11531163 // keyword pass
11541164 if(makeKeywords && nostem!=null){
1155 - Query qk = multiplySpans(nostem,0,"keyword",KEYWORD_BOOST);
 1165+ Query qk = multiplySpans(nostem,0,fields.keyword(),KEYWORD_BOOST);
11561166 if(qk != null)
11571167 bq.add(qk,BooleanClause.Occur.SHOULD);
11581168 }
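
End to end, in the style of the test file below (the expected string is an inference from the transliteration branch above, not an assertion from this commit):

    FieldNameFactory ff = new FieldNameFactory();
    Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
    WikiQueryParser parser = new WikiQueryParser(ff.contents(), "0", analyzer, ff,
            WikiQueryParser.NamespacePolicy.IGNORE);
    Query q = parser.parseRaw("gödel");
    // expected: contents:godel contents:goedel
    // the transliteration alias is added as a SHOULD clause

Note that fetchToken() no longer lowercases or decomposes by itself; all case and accent handling now lives in the analyzer, which is what makes exact-case parsing possible with the same code path.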
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/Analyzers.java
@@ -12,6 +12,7 @@
1313 import org.apache.lucene.analysis.nl.DutchStemFilter;
1414 import org.apache.lucene.analysis.ru.RussianStemFilter;
1515 import org.apache.lucene.analysis.th.ThaiWordFilter;
 16+import org.apache.lucene.search.FieldSortedHitQueue;
1617 import org.wikimedia.lsearch.config.GlobalConfiguration;
1718 import org.wikimedia.lsearch.config.IndexId;
1819 import org.wikimedia.lsearch.index.WikiIndexModifier;
@@ -34,8 +35,8 @@
3536 * @param language
3637 * @return
3738 */
38 - public static Analyzer getTitleAnalyzer(FilterFactory filters){
39 - return new QueryLanguageAnalyzer(filters);
 39+ public static Analyzer getTitleAnalyzer(FilterFactory filters, boolean exactCase){
 40+ return new QueryLanguageAnalyzer(filters,exactCase);
4041 }
4142
4243 /**
@@ -50,29 +51,40 @@
5152 * @param languageAnalyzer language filter class (e.g. PorterStemFilter)
5253 * @return {PerFieldAnalyzerWrapper,WikiTokenizer}
5354 */
54 - public static Object[] getIndexerAnalyzer(String text, FilterFactory filters, ArrayList<String> redirects) {
55 - PerFieldAnalyzerWrapper perFieldAnalyzer = null;
 55+ public static Object[] getIndexerAnalyzer(String text, FieldBuilder builder, ArrayList<String> redirects) {
 56+ PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
 57+ WikiTokenizer tokenizer = null;
 58+ for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
 59+ tokenizer = addFieldsForIndexing(perFieldAnalyzer,text,bs.getFilters(),bs.getFields(),redirects,bs.isExactCase());
 60+ }
 61+ return new Object[] {perFieldAnalyzer,tokenizer};
 62+ }
 63+
 64+ /**
 65+ * Add some fields to indexer's analyzer.
 66+ *
 67+ */
 68+ public static WikiTokenizer addFieldsForIndexing(PerFieldAnalyzerWrapper perFieldAnalyzer, String text, FilterFactory filters, FieldNameFactory fields, ArrayList<String> redirects, boolean exactCase) {
5669 // parse wiki-text to get categories
57 - WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage());
 70+ WikiTokenizer tokenizer = new WikiTokenizer(text,filters.getLanguage(),exactCase);
5871 tokenizer.tokenize();
5972 ArrayList<String> categories = tokenizer.getCategories();
6073
61 - perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
62 - perFieldAnalyzer.addAnalyzer("contents",
 74+ perFieldAnalyzer.addAnalyzer(fields.contents(),
6375 new LanguageAnalyzer(filters,tokenizer));
6476 perFieldAnalyzer.addAnalyzer("category",
6577 new CategoryAnalyzer(categories));
66 - perFieldAnalyzer.addAnalyzer("title",
67 - getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
68 - perFieldAnalyzer.addAnalyzer("stemtitle",
69 - getTitleAnalyzer(filters));
70 - setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
71 - getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
72 - setKeywordAnalyzer(perFieldAnalyzer,"redirect",
73 - new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),"redirect"));
74 - setKeywordAnalyzer(perFieldAnalyzer,"keyword",
75 - new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),"keyword"));
76 - return new Object[] {perFieldAnalyzer,tokenizer};
 78+ perFieldAnalyzer.addAnalyzer(fields.title(),
 79+ getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
 80+ perFieldAnalyzer.addAnalyzer(fields.stemtitle(),
 81+ getTitleAnalyzer(filters,exactCase));
 82+ setAltTitleAnalyzer(perFieldAnalyzer,fields.alttitle(),
 83+ getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
 84+ setKeywordAnalyzer(perFieldAnalyzer,fields.redirect(),
 85+ new KeywordsAnalyzer(redirects,filters.getNoStemmerFilterFactory(),fields.redirect(),exactCase));
 86+ setKeywordAnalyzer(perFieldAnalyzer,fields.keyword(),
 87+ new KeywordsAnalyzer(tokenizer.getKeywords(),filters.getNoStemmerFilterFactory(),fields.keyword(),exactCase));
 88+ return tokenizer;
7789 }
7890
7991 protected static void setAltTitleAnalyzer(PerFieldAnalyzerWrapper perFieldAnalyzer, String prefix, Analyzer analyzer) {
@@ -87,37 +99,42 @@
88100 }
89101 }
90102
91 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid){
 103+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(IndexId iid, boolean exactCase){
92104 if(global == null)
93105 global = GlobalConfiguration.getInstance();
94 - return getSearcherAnalyzer(global.getLanguage(iid.getDBname()));
 106+ return getSearcherAnalyzer(global.getLanguage(iid.getDBname()),exactCase);
95107
96108 }
97109
98110 public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode){
99 - return getSearcherAnalyzer(new FilterFactory(langCode));
 111+ return getSearcherAnalyzer(langCode,false);
100112 }
101113
 114+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(String langCode, boolean exactCase){
 115+ return getSearcherAnalyzer(new FilterFactory(langCode),new FieldNameFactory(exactCase));
 116+ }
 117+
102118 /**
103119 * Analyzer for search queries. Can be reused to parse many queries.
104120 *
105121 * @param text
106122 * @return
107123 */
108 - public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters) {
 124+ public static PerFieldAnalyzerWrapper getSearcherAnalyzer(FilterFactory filters, FieldNameFactory fields) {
109125 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
 126+ boolean exactCase = fields.isExactCase();
110127
111 - perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters));
112 - perFieldAnalyzer.addAnalyzer("contents",
113 - new QueryLanguageAnalyzer(filters));
114 - perFieldAnalyzer.addAnalyzer("title",
115 - getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
116 - perFieldAnalyzer.addAnalyzer("stemtitle",
117 - getTitleAnalyzer(filters));
118 - setAltTitleAnalyzer(perFieldAnalyzer,"alttitle",
119 - getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
120 - perFieldAnalyzer.addAnalyzer("keyword",
121 - getTitleAnalyzer(filters.getNoStemmerFilterFactory()));
 128+ perFieldAnalyzer = new PerFieldAnalyzerWrapper(getTitleAnalyzer(filters,exactCase));
 129+ perFieldAnalyzer.addAnalyzer(fields.contents(),
 130+ new QueryLanguageAnalyzer(filters,exactCase));
 131+ perFieldAnalyzer.addAnalyzer(fields.title(),
 132+ getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
 133+ perFieldAnalyzer.addAnalyzer(fields.stemtitle(),
 134+ getTitleAnalyzer(filters,exactCase));
 135+ setAltTitleAnalyzer(perFieldAnalyzer,fields.alttitle(),
 136+ getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
 137+ perFieldAnalyzer.addAnalyzer(fields.keyword(),
 138+ getTitleAnalyzer(filters.getNoStemmerFilterFactory(),exactCase));
122139
123140 return perFieldAnalyzer;
124141 }
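
The searcher side mirrors the indexer: the one-argument getSearcherAnalyzer(langCode) keeps its old lowercased behaviour, while the new exactCase variants bind the *_exact field names. A minimal sketch:

    // lowercased search (default) vs. exact-case search over the same wiki
    PerFieldAnalyzerWrapper lower = Analyzers.getSearcherAnalyzer("de");
    PerFieldAnalyzerWrapper exact = Analyzers.getSearcherAnalyzer("de", true);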
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldNameFactory.java
@@ -0,0 +1,66 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+/**
 5+ * Generate field names for the index.
 6+ *
 7+ * @author rainman
 8+ *
 9+ */
 10+public class FieldNameFactory {
 11+ public static final boolean EXACT_CASE = true;
 12+ protected boolean exactCase;
 13+
 14+ public FieldNameFactory(){
 15+ this.exactCase = false;
 16+ }
 17+
 18+ public FieldNameFactory(boolean exactCase){
 19+ this.exactCase = exactCase;
 20+ }
 21+
 22+ public String contents(){
 23+ if(exactCase)
 24+ return "contents_exact";
 25+ else
 26+ return "contents";
 27+ }
 28+
 29+ public String title(){
 30+ if(exactCase)
 31+ return "title_exact";
 32+ else
 33+ return "title";
 34+ }
 35+
 36+ public String stemtitle(){
 37+ if(exactCase)
 38+ return "stemtitle_exact";
 39+ else
 40+ return "stemtitle";
 41+ }
 42+
 43+ public String alttitle(){
 44+ if(exactCase)
 45+ return "alttitle_exact";
 46+ else
 47+ return "alttitle";
 48+ }
 49+
 50+ public String redirect(){
 51+ if(exactCase)
 52+ return "redirect_exact";
 53+ else
 54+ return "redirect";
 55+ }
 56+
 57+ public String keyword(){
 58+ if(exactCase)
 59+ return "keyword_exact";
 60+ else
 61+ return "keyword";
 62+ }
 63+
 64+ public boolean isExactCase() {
 65+ return exactCase;
 66+ }
 67+}
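
A quick illustration of the mapping this factory provides (all names from the class above):

    FieldNameFactory plain = new FieldNameFactory();
    FieldNameFactory exact = new FieldNameFactory(FieldNameFactory.EXACT_CASE);
    plain.title();    // "title"
    exact.title();    // "title_exact"
    exact.contents(); // "contents_exact"

Passing this factory around instead of hard-coded field strings is what lets the same parsing and indexing code serve both index variants.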
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FilterFactory.java
@@ -20,7 +20,7 @@
2121 protected Class stemmer = null;
2222 protected Class customFilter = null;
2323
24 - protected FilterFactory noStemmerFilterFactory;
 24+ protected FilterFactory noStemmerFilterFactory=null;
2525
2626 public FilterFactory(String lang){
2727 this.lang = lang;
@@ -38,7 +38,10 @@
3939 }
4040
4141 public FilterFactory getNoStemmerFilterFactory() {
42 - return noStemmerFilterFactory;
 42+ if(noStemmerFilterFactory == null)
 43+ return this;
 44+ else
 45+ return noStemmerFilterFactory;
4346 }
4447
4548 protected void init(){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -36,23 +36,23 @@
3737 /** positional increment between different redirects */
3838 public static final int TOKEN_GAP = 201;
3939
40 - public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix){
 40+ public KeywordsAnalyzer(HashSet<String> keywords, FilterFactory filters, String prefix, boolean exactCase){
4141 ArrayList<String> k = new ArrayList<String>();
4242 if(keywords != null)
4343 k.addAll(keywords);
44 - init(k,filters,prefix);
 44+ init(k,filters,prefix,exactCase);
4545 }
46 - public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix){
47 - init(keywords,filters,prefix);
 46+ public KeywordsAnalyzer(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase){
 47+ init(keywords,filters,prefix,exactCase);
4848 }
4949
50 - protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix) {
 50+ protected void init(ArrayList<String> keywords, FilterFactory filters, String prefix, boolean exactCase) {
5151 this.prefix = prefix;
5252 tokensBySize = new KeywordsTokenStream[KEYWORD_LEVELS];
5353 if(keywords == null){
5454 // init empty token streams
5555 for(int i=0; i< KEYWORD_LEVELS; i++){
56 - tokensBySize[i] = new KeywordsTokenStream(null,filters);
 56+ tokensBySize[i] = new KeywordsTokenStream(null,filters,exactCase);
5757 }
5858 return;
5959 }
@@ -61,7 +61,7 @@
6262 keywordsBySize.add(new ArrayList<String>());
6363 // arange keywords into a list by token number
6464 for(String k : keywords){
65 - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k).parse();
 65+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,exactCase).parse();
6666 if(parsed.size() == 0)
6767 continue;
6868 else if(parsed.size() < KEYWORD_LEVELS)
@@ -70,7 +70,7 @@
7171 keywordsBySize.get(KEYWORD_LEVELS-1).add(k);
7272 }
7373 for(int i=0; i< KEYWORD_LEVELS; i++){
74 - tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters);
 74+ tokensBySize[i] = new KeywordsTokenStream(keywordsBySize.get(i),filters,exactCase);
7575 }
7676 }
7777
@@ -96,8 +96,8 @@
9797 protected String keyword;
9898 protected TokenStream tokens;
9999
100 - public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters){
101 - this.analyzer = new QueryLanguageAnalyzer(filters);
 100+ public KeywordsTokenStream(ArrayList<String> keywords, FilterFactory filters, boolean exactCase){
 101+ this.analyzer = new QueryLanguageAnalyzer(filters,exactCase);
102102 this.keywords = keywords;
103103 this.index = 0;
104104 this.keyword = null;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/QueryLanguageAnalyzer.java
@@ -13,9 +13,11 @@
1414 */
1515 public class QueryLanguageAnalyzer extends LanguageAnalyzer {
1616 static org.apache.log4j.Logger log = Logger.getLogger(QueryLanguageAnalyzer.class);
 17+ protected boolean exactCase;
1718
18 - public QueryLanguageAnalyzer(FilterFactory filters){
 19+ public QueryLanguageAnalyzer(FilterFactory filters, boolean exactCase){
1920 super(filters,null);
 21+ this.exactCase = exactCase;
2022 }
2123
2224 /**
@@ -23,7 +25,7 @@
2426 */
2527 @Override
2628 public TokenStream tokenStream(String fieldName, String text) {
27 - wikitokenizer = new WikiTokenizer(text);
 29+ wikitokenizer = new WikiTokenizer(text,exactCase);
2830 return super.tokenStream(fieldName,(Reader)null);
2931 }
3032
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/CategoryAnalyzer.java
@@ -46,7 +46,7 @@
4747
4848 @Override
4949 public TokenStream tokenStream(String fieldName, Reader reader) {
50 - return new LowerCaseFilter(new ArrayTokenStream(categories));
 50+ return new ArrayTokenStream(categories);
5151 }
5252
5353 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FieldBuilder.java
@@ -0,0 +1,53 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+/**
 5+ * Agregate class for FilterFactory and FieldNameFactory. This class
 6+ * contains methods used to build various fields of the index,
 7+ * it contains field names to be used, filter that are to be applied...
 8+ *
 9+ * @author rainman
 10+ *
 11+ */
 12+public class FieldBuilder {
 13+ public class BuilderSet{
 14+ FilterFactory filters;
 15+ FieldNameFactory fields;
 16+ public BuilderSet(FilterFactory filters, FieldNameFactory fields) {
 17+ this.filters = filters;
 18+ this.fields = fields;
 19+ }
 20+ public FieldNameFactory getFields() {
 21+ return fields;
 22+ }
 23+ public FilterFactory getFilters() {
 24+ return filters;
 25+ }
 26+ public boolean isExactCase() {
 27+ return fields.isExactCase();
 28+ }
 29+ }
 30+
 31+ protected BuilderSet[] builders = new BuilderSet[2];
 32+
 33+ public FieldBuilder(String lang, boolean exactCase){
 34+ if(exactCase){
 35+ builders = new BuilderSet[2];
 36+ // additional exact case factory
 37+ builders[1] = new BuilderSet(
 38+ new FilterFactory(lang).getNoStemmerFilterFactory(),
 39+ new FieldNameFactory(FieldNameFactory.EXACT_CASE));
 40+ } else
 41+ builders = new BuilderSet[1];
 42+ // default factory, lowercase all data
 43+ builders[0] = new BuilderSet(
 44+ new FilterFactory(lang),
 45+ new FieldNameFactory());
 46+
 47+ }
 48+
 49+ public BuilderSet[] getBuilders() {
 50+ return builders;
 51+ }
 52+
 53+
 54+}
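
Typical consumption, as in getIndexerAnalyzer() earlier in this diff:

    FieldBuilder builder = new FieldBuilder("de", true); // exact-case wiki
    for(FieldBuilder.BuilderSet bs : builder.getBuilders()){
        bs.getFilters(); // builders[0]: stemming filters; builders[1]: no-stemmer
        bs.getFields();  // builders[0]: plain names;      builders[1]: *_exact
    }

One reading note: only the array allocation sits in the if/else; builders[0] is assigned unconditionally afterwards, so despite the indentation the lowercased builder is always present.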
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -197,7 +197,11 @@
198198 int begin = line.indexOf("[[");
199199 int end = line.indexOf("]]");
200200 if(begin != -1 && end != -1 && end > begin){
201 - return text.substring(begin+2,end);
 201+ String redirectText = text.substring(begin+2,end);
 202+ int fragment = redirectText.lastIndexOf('#');
 203+ if(fragment != -1)
 204+ redirectText = redirectText.substring(0,fragment);
 205+ return redirectText;
202206 }
203207 }
204208 return null;
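
The effect, for a hypothetical redirect target with a section fragment:

    // line: #REDIRECT [[Claude Shannon#Information theory]]
    // before this change: "Claude Shannon#Information theory"
    // after:              "Claude Shannon"

Dropping the fragment makes section redirects resolve to the plain page title, which is what the redirect-keyword handling keys on.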
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/CompactArticleLinks.java
@@ -42,7 +42,7 @@
4343 @Override
4444 public String toString() {
4545 try {
46 - return new String(str,0,str.length,"utf-8")+", count="+links;
 46+ return new String(str,0,str.length,"utf-8");
4747 } catch (UnsupportedEncodingException e) {
4848 return "";
4949 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -10,6 +10,7 @@
1111 import org.apache.lucene.search.BooleanQuery;
1212 import org.apache.lucene.search.Query;
1313 import org.wikimedia.lsearch.analyzers.Analyzers;
 14+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
1415 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
1516 import org.wikimedia.lsearch.analyzers.WikiQueryParser.NamespacePolicy;
1617 import org.wikimedia.lsearch.config.Configuration;
@@ -36,8 +37,10 @@
3738 WikiQueryParser.ALT_TITLE_BOOST = 6;
3839 WikiQueryParser.KEYWORD_BOOST = 0.05f;
3940 WikiIndexModifier.ALT_TITLES = 3;
 41+ WikiQueryParser.ADD_STEM_TITLE=false;
 42+ FieldNameFactory ff = new FieldNameFactory();
4043 try{
41 - WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer());
 44+ WikiQueryParser parser = new WikiQueryParser(ff.contents(),new SimpleAnalyzer(),ff);
4245 Query q;
4346 HashSet<String> fields;
4447
@@ -78,7 +81,7 @@
7982 assertEquals("+category:help +category:pleh",q.toString());
8083
8184 q = parser.parseRaw("šđčćždzñ");
82 - assertEquals("contents:sđcczdzn",q.toString());
 85+ assertEquals("contents:šđčćždzñ",q.toString());
8386
8487 q = parser.parseRaw("help:making breakfast incategory:food");
8588 assertEquals("+help:making +help:breakfast +category:food",q.toString());
@@ -112,11 +115,11 @@
113116 assertTrue(fields.contains("contents"));
114117
115118 // namespace policies
116 - parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE);
 119+ parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.IGNORE);
117120 q = parser.parseRaw("help:making breakfast incategory:food");
118121 assertEquals("+contents:making +contents:breakfast +category:food",q.toString());
119122
120 - parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 123+ parser = new WikiQueryParser(ff.contents(),"0",new SimpleAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE);
121124 q = parser.parseRaw("help:making breakfast incategory:food");
122125 assertEquals("+namespace:12 +(+contents:making +contents:breakfast +category:food)",q.toString());
123126
@@ -138,7 +141,7 @@
139142
140143 // ====== English Analyzer ========
141144
142 - parser = new WikiQueryParser("contents","0",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 145+ parser = new WikiQueryParser(ff.contents(),"0",new EnglishAnalyzer(), ff, WikiQueryParser.NamespacePolicy.REWRITE);
143146 q = parser.parseRaw("main_talk:laziness");
144147 assertEquals("+namespace:1 +(contents:laziness contents:lazi^0.5)",q.toString());
145148
@@ -154,7 +157,7 @@
155158 q = parser.parse("(help:making something incategory:blah) OR (rest incategory:crest)");
156159 assertEquals("(+namespace:12 +(+(+(contents:making contents:make^0.5) title:making^2.0) +(+(contents:something contents:someth^0.5) title:something^2.0) +category:blah)) (+namespace:0 +(+(+contents:rest +category:crest) title:rest^2.0))",q.toString());
157160
158 - parser = new WikiQueryParser("contents",new EnglishAnalyzer());
 161+ parser = new WikiQueryParser(ff.contents(),new EnglishAnalyzer(),ff);
159162
160163 q = parser.parseRaw("laziness");
161164 assertEquals("contents:laziness contents:lazi^0.5",q.toString());
@@ -169,7 +172,7 @@
170173 assertEquals("+(+(contents:beans contents:bean^0.5) +category:food) +(+contents:orchid +category:\"some flowers\")",q.toString());
171174
172175 q = parser.parseRaw("(Beans AND incategory:FOod) (orchID AND incategory:\"some FLOWERS\")");
173 - assertEquals("+(+(contents:beans contents:bean^0.5) +category:food) +(+contents:orchid +category:\"some flowers\")",q.toString());
 176+ assertEquals("+(+(contents:beans contents:bean^0.5) +category:FOod) +(+contents:orchid +category:\"some FLOWERS\")",q.toString());
174177
175178 q = parser.parse("(beans AND incategory:food) (orchid AND incategory:\"some flowers\")");
176179 assertEquals("+(+(+(contents:beans contents:bean^0.5) title:beans^2.0) +category:food) +(+(+contents:orchid +category:\"some flowers\") title:orchid^2.0)",q.toString());
@@ -204,7 +207,7 @@
205208 // Tests with actual params :)
206209 // ==================================
207210 Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
208 - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
 211+ parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE);
209212 q = parser.parseTwoPass("beans everyone",null);
210213 assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString());
211214
@@ -306,6 +309,16 @@
307310
308311 q = parser.parseFourPass("[0,1,2]:beans everyone [0]:mainly",NamespacePolicy.REWRITE,true);
309312 assertEquals("((+(namespace:0 namespace:1 namespace:2) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+namespace:0 +(contents:mainly contents:main^0.5))) ((+(namespace:0 namespace:1 namespace:2) +(+title:beans^2.0 +title:everyone^2.0)) (+namespace:0 +title:mainly^2.0)) (((+(namespace:0 namespace:1 namespace:2) +(+alttitle1:beans^6.0 +alttitle1:everyone^6.0)) (+namespace:0 +alttitle1:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle2:beans^6.0 +alttitle2:everyone^6.0)) (+namespace:0 +alttitle2:mainly^6.0)) ((+(namespace:0 namespace:1 namespace:2) +(+alttitle3:beans^6.0 +alttitle3:everyone^6.0)) (+namespace:0 +alttitle3:mainly^6.0)))",q.toString());
 313+
 314+ q = parser.parseFourPass("Israeli-Palestinian conflict",NamespacePolicy.IGNORE,true);
 315+ assertEquals("(+(+(contents:israeli contents:isra^0.5) +contents:palestinian) +contents:conflict) (+(+title:israeli^2.0 +title:palestinian^2.0) +title:conflict^2.0) ((+(+alttitle1:israeli^6.0 +alttitle1:palestinian^6.0) +alttitle1:conflict^6.0) (+(+alttitle2:israeli^6.0 +alttitle2:palestinian^6.0) +alttitle2:conflict^6.0) (+(+alttitle3:israeli^6.0 +alttitle3:palestinian^6.0) +alttitle3:conflict^6.0))",q.toString());
 316+
 317+ // alternative transliterations
 318+ q = parser.parseFourPass("Something for Gödels",NamespacePolicy.IGNORE,true);
 319+ assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(+(contents:godels contents:godel^0.5) +(contents:goedels contents:goedel^0.5))) (+title:something^2.0 +title:for^2.0 +(title:godels^2.0 title:goedels^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godels^6.0 alttitle1:goedels^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godels^6.0 alttitle2:goedels^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godels^6.0 alttitle3:goedels^6.0)))",q.toString());
 320+
 321+ q = parser.parseFourPass("Something for Gödel",NamespacePolicy.IGNORE,true);
 322+ assertEquals("(+(contents:something contents:someth^0.5) +contents:for +(contents:godel contents:goedel)) (+title:something^2.0 +title:for^2.0 +(title:godel^2.0 title:goedel^2.0)) ((+alttitle1:something^6.0 +alttitle1:for^6.0 +(alttitle1:godel^6.0 alttitle1:goedel^6.0)) (+alttitle2:something^6.0 +alttitle2:for^6.0 +(alttitle2:godel^6.0 alttitle2:goedel^6.0)) (+alttitle3:something^6.0 +alttitle3:for^6.0 +(alttitle3:godel^6.0 alttitle3:goedel^6.0)))",q.toString());
310323
311324 // Test field extraction
312325 HashSet<NamespaceFilter> fs = parser.getFieldNamespaces("main:something [1]:else all:oh []:nja");
@@ -316,16 +329,16 @@
317330
318331 // Localization tests
319332 analyzer = Analyzers.getSearcherAnalyzer("sr");
320 - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
 333+ parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE);
321334
322335 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
323 - assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli) +(title:на^2.0 title:na) +(title:википедију^2.0 title:vikipediju))",q.toString());
 336+ assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli^0.4) +(title:на^2.0 title:na^0.4) +(title:википедију^2.0 title:vikipediju^0.4))",q.toString());
324337
325338 q = parser.parseTwoPass("all:dobrodošli na šđčćž",NamespacePolicy.IGNORE);
326339 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString());
327340
328341 analyzer = Analyzers.getSearcherAnalyzer("th");
329 - parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
 342+ parser = new WikiQueryParser(ff.contents(),"0",analyzer,ff,NamespacePolicy.LEAVE);
330343
331344 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
332345 assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^2.0 +title:ไทย^2.0)",q.toString());
@@ -335,7 +348,7 @@
336349
337350 // Backward compatibility for complex filters
338351 analyzer = Analyzers.getSearcherAnalyzer("en");
339 - parser = new WikiQueryParser("contents","0,1,4,12",analyzer,NamespacePolicy.IGNORE);
 352+ parser = new WikiQueryParser(ff.contents(),"0,1,4,12",analyzer,ff,NamespacePolicy.IGNORE);
340353
341354 q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE);
342355 assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString());
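
Two behavior changes are pinned down by the updated assertions: category phrases now keep their original case (category:FOod survives parseRaw), and umlaut-style characters expand to their common transliterations, so Gödel matches both godel and goedel. The committed expansion lives in FastWikiTokenizerEngine/FilterFactory and is not part of this hunk; below is a hedged sketch of the standard Lucene 2.0 technique the assertions imply, queueing the "oe" alternative at position increment 0 so the query parser turns same-position tokens into an OR clause. The filter name and the single-character handling are assumptions, not the committed code:

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public class UmlautAlternativeFilter extends TokenFilter {
    private Token pending; // queued transliterated alternative

    public UmlautAlternativeFilter(TokenStream input) { super(input); }

    public Token next() throws IOException {
        if (pending != null) { Token t = pending; pending = null; return t; }
        Token t = input.next();
        if (t == null) return null;
        String text = t.termText();
        if (text.indexOf('ö') != -1) {
            // Queue "goedels" next to "gödels"; a later folding step
            // (ö -> o) would produce the plain "godels" form.
            Token alt = new Token(text.replace("ö", "oe"),
                    t.startOffset(), t.endOffset());
            alt.setPositionIncrement(0); // same position as the original
            pending = alt;
        }
        return t;
    }
}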
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -15,7 +15,7 @@
1616
1717 public class FastWikiTokenizerTest {
1818 public static void displayTokensForParser(String text) {
19 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr");
 19+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,"sr",false);
2020 Token[] tokens = parser.parse().toArray(new Token[] {});
2121 for (int i = 0; i < tokens.length; i++) {
2222 Token token = tokens[i];
@@ -62,6 +62,8 @@
6363 public static void main(String args[]) throws IOException{
6464 String text = "(ant) and some";
6565 showTokens(text);
 66+ text = " ä, ö, ü; for instance, Ø ÓóÒò Goedel for Gödel; čakšire";
 67+ showTokens(text);
6668 text = "[[Category:Blah Blah?!|Caption]], and [[:Category:Link to category]]";
6769 showTokens(text);
6870 text = "{{IPstack}} '''[[Hypertext]] Transfer [[communications protocol|Protocol]]''' ('''HTTP''') is a method used to transfer or convey information on the [[World Wide Web]]. Its original purpose was to provide a way to publish and retrieve [[HTML]] pages.";
@@ -110,7 +112,7 @@
111113 for(int i=0;i<2000;i++){
112114 for(TestArticle article : articles){
113115 String text = article.content;
114 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text);
 116+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,false);
115117 parser.parse();
116118 }
117119 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java
@@ -58,6 +58,6 @@
5959 if(streams.get(fieldName) != null)
6060 return streams.get(fieldName);
6161
62 - return new AliasPorterStemFilter(new WikiTokenizer(text));
 62+ return new AliasPorterStemFilter(new WikiTokenizer(text,false));
6363 }
6464 }
Index: trunk/lucene-search-2.0/build.xml
@@ -32,7 +32,7 @@
3333 <jar destfile="${basedir}/${jar.name}">
3434 <manifest>
3535 <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/>
36 - <attribute name="Class-Path" value="${jar.name} lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar"/>
 36+ <attribute name="Class-Path" value="${jar.name} lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar lib/mysql-connector-java-3.0.17-ga-bin.jar"/>
3737 </manifest>
3838 <zipfileset dir="${bin}" prefix="">
3939 <include name="org/**"/>
@@ -57,7 +57,8 @@
5858 <zipfileset src="lib/lucene-core-2.0.1-dev.jar" />
5959 <zipfileset src="lib/lucene-analyzers.jar" />
6060 <zipfileset src="lib/snowball.jar" />
61 - <zipfileset src="lib/mwdumper.jar" />
 61+ <zipfileset src="lib/mwdumper.jar" />
 62+ <zipfileset src="lib/mysql-connector-java-3.0.17-ga-bin.jar" />
6263 </jar>
6364 </target>
6465
Index: trunk/lucene-search-2.0/webinterface/searchForm.html
@@ -60,6 +60,7 @@
6161 <option value="nowiki">nowiki</option>
6262 <option value="srwiki">srwiki</option>
6363 <option value="enwiktionary">enwiktionary</option>
 64+ <option value="enwiktionary-exact">enwiktionary-exact</option>
6465 <!-- <option value="wikilucene">wikilucene</option>
6566 <option value="wikidev">wikidev</option> -->
6667 </select>
Index: trunk/lucene-search-2.0/webinterface/lsweb.py
@@ -5,7 +5,7 @@
66 from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
77 from urllib2 import URLError, HTTPError
88
9 -search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv80:8123' }
 9+search_host = { 'enwiki' : "srv79:8123", '<default>': 'srv79:8123' }
1010
1111 canon_namespaces = { 0 : '', 1: 'Talk', 2: 'User', 3: 'User_talk',
1212 4 : 'Project', 5 : 'Project_talk', 6 : 'Image', 7 : 'Image_talk',
@@ -66,6 +66,7 @@
6767 limit = 20
6868 offset = 0
6969 namespaces = []
 70+ case = "ignore"
7071
7172 # parameters
7273 for key,val in params.iteritems():
@@ -87,9 +88,13 @@
8889 else:
8990 host = search_host['<default>']
9091
 92+ if dbname.endswith("-exact"):
 93+ case = "exact"
 94+ dbname = dbname[0:-6]
 95+
9196 # make search url for ls2
9297 search_url = 'http://%s/search/%s/%s' % (host,dbname,urllib.quote(rewritten.encode('utf-8')))
93 - search_params = urllib.urlencode({'limit' : limit, 'offset' : offset, 'namespaces' : ','.join(namespaces)}, True)
 98+ search_params = urllib.urlencode({'limit' : limit, 'offset' : offset, 'namespaces' : ','.join(namespaces), "case" : case}, True)
9499
95100 # process search results
96101 try:
@@ -98,6 +103,7 @@
99104 lasthit = min(offset+limit,numhits)
100105 # html headers
101106 self.send_response(200)
 107+ self.send_header('Cache-Control','no-cache')
102108 self.send_header('Content-type','text/html')
103109 self.end_headers()
104110 self.wfile.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>')
@@ -154,6 +160,7 @@
155161 search_form = f.read()
156162 f.close()
157163 self.send_response(200)
 164+ self.send_header('Cache-Control','no-cache')
158165 self.send_header('Content-type','text/html')
159166 self.end_headers()
160167 self.wfile.write(search_form)
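
The web interface overloads the database selector: a trailing -exact picks the exact-case index of the underlying database and is forwarded to the daemon as case=exact (the added Cache-Control: no-cache headers just keep stale results out of the browser). On the Java side, the case parameter presumably selects the matching field names; a hedged sketch using only the FieldNameFactory API visible in this revision, with the parameter plumbing in SearchEngine.java assumed:

import org.wikimedia.lsearch.analyzers.FieldNameFactory;

public class CaseParamSketch {
    // Hypothetical helper: map the HTTP "case" parameter to a field-name set.
    static FieldNameFactory fieldsFor(String caseParam) {
        return "exact".equals(caseParam)
                ? new FieldNameFactory(FieldNameFactory.EXACT_CASE)
                : new FieldNameFactory();
    }

    public static void main(String[] args) {
        // ff.contents() then names the contents field of the chosen index.
        FieldNameFactory ff = fieldsFor("exact");
        System.out.println(ff.contents());
    }
}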
Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -40,6 +40,10 @@
4141 # dbnames that end with the suffix will use additional keywords scores
4242 KeywordScoring.suffix=wiki wikilucene wikidev
4343
 44+# suffix for databases that should also have an exact-case index built
 45+# note: this will also turn off stemming!
 46+ExactCase.suffix=wiktionary wikilucene
 47+
4448 # Put here you custom namespace prefixes
4549 # Syntax: <prefix_name> : <coma separated list of namespaces>
4650 # <all> is a special keyword meaning all namespaces
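
ExactCase.suffix follows the same convention as KeywordScoring.suffix above: any dbname ending in one of the listed strings (e.g. enwiktionary, wikilucene) also gets an exact-case index built, with stemming turned off for it. A guess at the matching rule, since GlobalConfiguration.java is modified in this revision but its diff is not shown:

public class ExactCaseSuffixSketch {
    static boolean hasExactCaseIndex(String dbname, String[] suffixes) {
        for (String suffix : suffixes)
            if (dbname.endsWith(suffix))
                return true;
        return false;
    }

    public static void main(String[] args) {
        String[] suffixes = { "wiktionary", "wikilucene" };
        System.out.println(hasExactCaseIndex("enwiktionary", suffixes)); // true
    }
}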
