r26986 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r26985 | r26986 | r26987 >
Date: 22:45, 27 October 2007
Author: rainman
Status: old
Tags:
Comment:
* Wildcard queries: can now do both suffix and prefix queries, and
work over split indexes
* TokenizerOptions: a bag of options for the tokenizer; fixes a bug
with aggregate fields and token gaps
* (Lucene: fixed the weight for DisjunctionMaxQuery, which was storing
the non-serializable searcher)
Modified paths:
  • /branches/lucene-search-2.1/lib/lucene-core-2.2.0.jar (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/AggregateAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ReusableLanguageAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/Transaction.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/WikiSearcher.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java (added) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/StringUtils.java (added) (history)

Diff

Index: branches/lucene-search-2.1/lib/lucene-core-2.2.0.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/StringUtils.java
@@ -0,0 +1,12 @@
 2+package org.wikimedia.lsearch.util;
 3+
 4+public class StringUtils {
 5+ /** reverse a string */
 6+ public static String reverseString(String str){
 7+ int len = str.length();
 8+ char[] buf = new char[len];
 9+ for(int i=0;i<len;i++)
 10+ buf[i] = str.charAt(len-i-1);
 11+ return new String(buf,0,len);
 12+ }
 13+}
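
A note on the new helper: reverseString() is factored out of WikiIndexModifier (see its diff below) so the title indexer and the new Wildcards class can share it. Per Wildcards.getLocalTerms() below, a trailing-wildcard pattern is reversed and matched against the reverse_title field, and the hits are reversed back. A minimal sketch of that round trip (pattern and term are made up for illustration):

    import org.wikimedia.lsearch.util.StringUtils;

    public class ReverseDemo {
        public static void main(String[] args) {
            // "wiki*" is reversed to "*ikiw" and run against reverse_title;
            // a hit such as "aidepikiw" reverses back to the real title term.
            System.out.println(StringUtils.reverseString("wiki*"));     // *ikiw
            System.out.println(StringUtils.reverseString("aidepikiw")); // wikipedia
        }
    }
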
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java
@@ -29,6 +29,7 @@
3030 import org.apache.log4j.Logger;
3131 import org.apache.lucene.analysis.Analyzer;
3232 import org.apache.lucene.analysis.TokenStream;
 33+import org.wikimedia.lsearch.analyzers.TokenizerOptions;
3334 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
3435 import org.wikimedia.lsearch.config.IndexId;
3536
@@ -59,6 +60,6 @@
6061 if(streams.get(fieldName) != null)
6162 return streams.get(fieldName);
6263
63 - return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false));
 64+ return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),new TokenizerOptions(false)));
6465 }
6566 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -11,13 +11,14 @@
1212 import org.apache.lucene.analysis.Token;
1313 import org.apache.lucene.analysis.TokenStream;
1414 import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
 15+import org.wikimedia.lsearch.analyzers.TokenizerOptions;
1516 import org.wikimedia.lsearch.config.Configuration;
1617 import org.wikimedia.lsearch.config.IndexId;
1718 import org.wikimedia.lsearch.index.WikiIndexModifier;
1819
1920 public class FastWikiTokenizerTest {
2021 public static void displayTokensForParser(String text) {
21 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
 22+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false));
2223 Token[] tokens = parser.parse().toArray(new Token[] {});
2324 for (int i = 0; i < tokens.length; i++) {
2425 Token token = tokens[i];
@@ -125,7 +126,7 @@
126127 for(int i=0;i<2000;i++){
127128 for(TestArticle article : articles){
128129 String text = article.content;
129 - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false);
 130+ FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false));
130131 parser.parse();
131132 }
132133 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -5,6 +5,7 @@
66 import java.rmi.RemoteException;
77 import java.rmi.registry.LocateRegistry;
88 import java.rmi.registry.Registry;
 9+import java.util.ArrayList;
910 import java.util.Arrays;
1011 import java.util.Collection;
1112 import java.util.Hashtable;
@@ -20,6 +21,7 @@
2122 import org.wikimedia.lsearch.index.IndexUpdateRecord;
2223 import org.wikimedia.lsearch.search.NamespaceFilterWrapper;
2324 import org.wikimedia.lsearch.search.SearcherCache;
 25+import org.wikimedia.lsearch.search.Wildcards;
2426
2527 /**
2628 * Invokes procedures on a remote RMIMessenger.
@@ -217,4 +219,14 @@
218220 return -1;
219221 }
220222 }
 223+
 224+ public ArrayList<String> getTerms(String host, String dbrole, String wildcard, boolean exactCase) throws RemoteException {
 225+ try{
 226+ RMIMessenger r = messengerFromCache(host);
 227+ return r.getTerms(dbrole,wildcard,exactCase);
 228+ } catch(Exception e){
 229+ e.printStackTrace();
 230+ return new ArrayList<String>();
 231+ }
 232+ }
221233 }
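
The new getTerms() call asks the host serving a given index part for matching terms, and degrades to an empty list if the host cannot be reached. A minimal usage sketch (the host and dbrole names are hypothetical):

    import java.util.ArrayList;
    import org.wikimedia.lsearch.interoperability.RMIMessengerClient;

    public class GetTermsDemo {
        public static void main(String[] args) throws Exception {
            RMIMessengerClient client = new RMIMessengerClient();
            // "search1" and "enwiki.nspart1" are made-up example names
            ArrayList<String> terms =
                client.getTerms("search1", "enwiki.nspart1", "wiki*", false);
            System.out.println(terms.size() + " matching terms");
        }
    }
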
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java
@@ -2,6 +2,7 @@
33
44 import java.rmi.Remote;
55 import java.rmi.RemoteException;
 6+import java.util.ArrayList;
67
78 import org.apache.lucene.search.Query;
89 import org.wikimedia.lsearch.beans.IndexReportCard;
@@ -95,4 +96,16 @@
9697 * @throws RemoteException
9798 */
9899 public Boolean isSuccessfulFlush(String dbname) throws RemoteException;
 100+
 101+ /**
 102+ * Wildcard matcher.
 103+ * Requests all terms from title and reverse_title that match the wildcard pattern.
 104+ *
 105+ * @param dbrole - part of index, e.g. enwiki.nspart1
 106+ * @param wildcard - wildcard pattern with * and ?
 107+ * @param exactCase - whether the pattern uses exact capitalization
 108+ * @return terms matching the pattern
 109+ * @throws RemoteException
 110+ */
 111+ public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException;
99112 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
@@ -1,6 +1,8 @@
22 package org.wikimedia.lsearch.interoperability;
33
 4+import java.io.IOException;
45 import java.rmi.RemoteException;
 6+import java.util.ArrayList;
57 import java.util.Arrays;
68
79 import org.apache.log4j.Logger;
@@ -17,6 +19,8 @@
1820 import org.wikimedia.lsearch.search.NamespaceFilterWrapper;
1921 import org.wikimedia.lsearch.search.NetworkStatusThread;
2022 import org.wikimedia.lsearch.search.SearchEngine;
 23+import org.wikimedia.lsearch.search.SearcherCache;
 24+import org.wikimedia.lsearch.search.Wildcards;
2125
2226 /** Local implementation for {@link RMIMessenger} */
2327 public class RMIMessengerImpl implements RMIMessenger {
@@ -86,6 +90,14 @@
8791 return new SearchEngine().searchPart(IndexId.get(dbrole),searchterm,query,filter,offset,limit,explain);
8892 }
8993
 94+ public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException {
 95+ try{
 96+ return Wildcards.getLocalTerms(IndexId.get(dbrole),wildcard,exactCase);
 97+ } catch(IOException e){
 98+ throw new RemoteException("IOException on "+dbrole,e);
 99+ }
 100+ }
 101+
90102 // inherit javadoc
91103 public int getIndexerQueueSize() throws RemoteException {
92104 return IndexThread.getQueueSize();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -156,13 +156,10 @@
157157 ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
158158 res.setNumHits(col.size());
159159 res.setSuccess(true);
160 - // TODO: this is extremely slow
161 - //Links links = Links.openForRead(lin,lin.getSearchPath());
162160 for(int i=offset;i<offset+limit && i<col.size();i++){
163161 RelatedTitle rt = col.get(i);
164162 Title t = rt.getRelated();
165163 ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
166 - //rs.addContext(links.getContext(t.getKey(),key));
167164 res.addResult(rs);
168165 }
169166 } else{
@@ -305,7 +302,7 @@
306303
307304 WikiSearcher searcher = null;
308305 try {
309 - q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll);
 306+ //q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll);
310307
311308 TopDocs hits=null;
312309 // see if we can search only part of the index
@@ -331,7 +328,11 @@
332329 log.error("Error contacting searcher for "+piid);
333330 return res;
334331 }
335 - RMIMessengerClient messenger = new RMIMessengerClient();
 332+ // query
 333+ Wildcards wildcards = new Wildcards(piid,host,exactCase);
 334+ q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards);
 335+
 336+ RMIMessengerClient messenger = new RMIMessengerClient();
336337 res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host);
337338 if(sug != null){
338339 SuggestQuery sq = sug.suggest(searchterm,parser,res);
@@ -360,6 +361,10 @@
361362 searcher = new WikiSearcher(iid);
362363 // normal search
363364 try{
 365+ // query
 366+ Wildcards wildcards = new Wildcards(searcher.getAllHosts(),exactCase);
 367+ q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards);
 368+
364369 hits = searcher.search(q,nsfw,offset+limit);
365370 res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
366371 if(sug != null){
@@ -406,7 +411,7 @@
407412 }
408413 }
409414
410 - protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException {
 415+ protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll, Wildcards wildcards) throws ParseException {
411416 Query q = null;
412417 if(raw){
413418 // do minimal parsing, make a raw query
@@ -414,11 +419,11 @@
415420 q = parser.parseRaw(searchterm);
416421 } else if(nsfw == null){
417422 if(searchAll)
418 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 423+ q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards);
419424 else
420 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname());
 425+ q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,wildcards);
421426 } else{
422 - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname());
 427+ q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards);
423428 log.info("Using NamespaceFilterWrapper "+nsfw);
424429 }
425430 return q;
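
The reshuffling above is the point of this change: wildcard expansion has to ask the hosts that actually serve the index for matching terms, so the query can only be parsed once the searcher and its host mapping are known. Hence parseQuery() gains a Wildcards parameter and is now called after host resolution, as in this excerpt from the diff:

    // build Wildcards once hosts are known, then parse the query
    Wildcards wildcards = new Wildcards(searcher.getAllHosts(), exactCase);
    Query q = parser.parseWithWildcards(searchterm,
            WikiQueryParser.NamespacePolicy.REWRITE, wildcards);
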
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java
@@ -42,10 +42,11 @@
4343 protected byte[] lengthNoStopWords = null;
4444 protected float[] boost = null;
4545 protected IndexReader reader = null;
 46+ protected String field;
4647
4748 protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{
4849 this.reader = reader;
49 - String field = fieldBase+"_meta";
 50+ field = fieldBase+"_meta";
5051 Collection fields = reader.getFieldNames(FieldOption.ALL);
5152 if(!fields.contains(field))
5253 return; // index doesn't have ranking info
@@ -118,10 +119,10 @@
119120 int end = (docid == index.length-1)? length.length : index[docid+1];
120121 if(position >= end-start)
121122 try {
122 - throw new ArrayIndexOutOfBoundsException("Requested position "+position+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1));
 123+ throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1));
123124 } catch (IOException e) {
124125 e.printStackTrace();
125 - throw new ArrayIndexOutOfBoundsException("Requested position "+position+" unavailable");
 126+ throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" unavailable");
126127 }
127128 return start+position;
128129 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java
@@ -0,0 +1,161 @@
 2+package org.wikimedia.lsearch.search;
 3+
 4+import java.io.IOException;
 5+import java.rmi.RemoteException;
 6+import java.util.ArrayList;
 7+import java.util.Collection;
 8+import java.util.HashMap;
 9+import java.util.HashSet;
 10+import java.util.Map.Entry;
 11+
 12+import org.apache.log4j.Logger;
 13+import org.apache.lucene.index.IndexReader;
 14+import org.apache.lucene.index.Term;
 15+import org.apache.lucene.search.DisjunctionMaxQuery;
 16+import org.apache.lucene.search.Query;
 17+import org.apache.lucene.search.TermQuery;
 18+import org.apache.lucene.search.WildcardTermEnum;
 19+import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 20+import org.wikimedia.lsearch.config.IndexId;
 21+import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 22+import org.wikimedia.lsearch.util.StringUtils;
 23+
 24+/**
 25+ * Wildcard-search related functions
 26+ * @author rainman
 27+ *
 28+ */
 29+public class Wildcards {
 30+ protected static Logger log = Logger.getLogger(Wildcards.class);
 31+ public static final int MAX_TERMS = 1024;
 32+ protected static SearcherCache searcherCache = null;
 33+ protected enum WildcardType { PREFIX, SUFFIX, INVALID };
 34+
 35+ /** wildcard pattern -> terms */
 36+ protected HashMap<String,HashSet<String>> wildcardCache = new HashMap<String,HashSet<String>>();
 37+ /** iid -> host */
 38+ protected HashMap<String,String> hosts = new HashMap<String,String>();
 39+
 40+ protected RMIMessengerClient client = null;
 41+ protected boolean exactCase;
 42+
 43+ public Wildcards(IndexId iid, String host, boolean exactCase){
 44+ hosts.put(iid.toString(),host);
 45+ this.exactCase = exactCase;
 46+ }
 47+
 48+ public Wildcards(HashMap<String,String> iidHostMapping, boolean exactCase){
 49+ hosts.putAll(iidHostMapping);
 50+ this.exactCase = exactCase;
 51+ }
 52+
 53+ /**
 54+ * Make a DisjunctionMaxQuery from the expanded wildcard
 55+ *
 56+ * @param wildcard
 57+ * @param field
 58+ * @return null if there is no match, or on error
 59+ */
 60+ public Query makeQuery(String wildcard, String field){
 61+ if(client == null)
 62+ client = new RMIMessengerClient();
 63+
 64+ HashSet<String> terms = wildcardCache.get(wildcard);
 65+ if(terms == null){
 66+ terms = new HashSet<String>();
 67+ for(Entry<String,String> e : hosts.entrySet()){
 68+ try {
 69+ terms.addAll(client.getTerms(e.getValue(),e.getKey(),wildcard,exactCase));
 70+ } catch (RemoteException e1) {
 71+ e1.printStackTrace();
 72+ log.warn("Cannot get terms for "+wildcard+" on host "+e.getValue()+" for "+e.getKey());
 73+ }
 74+ }
 75+ wildcardCache.put(wildcard,terms);
 76+ log.info("Using "+terms.size()+" terms for pattern="+wildcard);
 77+ }
 78+
 79+ if(terms.size() == 0)
 80+ return null; // no match or error
 81+
 82+ return makeQuery(terms,field);
 83+ }
 84+
 85+ /** Construct a DisjunctionMaxQuery from the terms */
 86+ protected Query makeQuery(HashSet<String> terms, String field){
 87+ if(terms.size() > MAX_TERMS){
 88+ HashSet<String> temp = new HashSet<String>();
 89+ int count = 0;
 90+ for(String t : terms){
 91+ if(count >= MAX_TERMS)
 92+ break;
 93+ temp.add(t);
 94+ count++;
 95+ }
 96+ terms = temp;
 97+ }
 98+ DisjunctionMaxQuery q = new DisjunctionMaxQuery(0);
 99+ for(String t : terms){
 100+ q.add(new TermQuery(new Term(field,t)));
 101+ }
 102+ return q;
 103+ }
 104+
 105+ protected static WildcardType getType(String wildcard){
 106+ if(wildcard == null || wildcard.equals(""))
 107+ return WildcardType.INVALID;
 108+ boolean pre = wildcard.startsWith("*") || wildcard.startsWith("?");
 109+ boolean suff = wildcard.endsWith("*") || wildcard.endsWith("?");
 110+ if(pre && !suff)
 111+ return WildcardType.PREFIX;
 112+ else if(suff && !pre)
 113+ return WildcardType.SUFFIX;
 114+ else
 115+ return WildcardType.INVALID;
 116+ }
 117+
 118+ public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException {
 119+ if(searcherCache == null)
 120+ searcherCache = SearcherCache.getInstance();
 121+ ArrayList<String> ret = new ArrayList<String>();
 122+ // check type of wildcard
 123+ WildcardType type = getType(wildcard);
 124+ if(type == WildcardType.INVALID)
 125+ return ret;
 126+ // check searcher
 127+ IndexSearcherMul searcher = searcherCache.getLocalSearcher(iid);
 128+ if(searcher == null)
 129+ throw new IOException(iid+" not a local index, or index not available");
 130+
 131+ // get field
 132+ IndexReader reader = searcher.getIndexReader();
 133+ String field = null;
 134+ Term wildcardTerm = null;
 135+ FieldNameFactory fields = new FieldNameFactory(exactCase);
 136+ if(type == WildcardType.PREFIX){
 137+ field = fields.title();
 138+ wildcardTerm = new Term(field,wildcard);
 139+ } else{
 140+ field = fields.reverse_title();
 141+ wildcardTerm = new Term(field,StringUtils.reverseString(wildcard));
 142+ }
 143+
 144+ // get terms
 145+ Term t;
 146+ WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm);
 147+ while((t = te.term()) != null){
 148+ if(type == WildcardType.SUFFIX)
 149+ ret.add(StringUtils.reverseString(t.text()));
 150+ else
 151+ ret.add(t.text());
 152+
 153+ if(!te.next())
 154+ break;
 155+ if(ret.size() >= MAX_TERMS)
 156+ break;
 157+ }
 158+
 159+ return ret;
 160+ }
 161+
 162+}
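
Worth noting in makeQuery(): the expansion is capped at MAX_TERMS (1024), cached per pattern in wildcardCache, and built as a DisjunctionMaxQuery with a tie-breaker of 0, so a document scores by its best-matching expanded term rather than summing all variants. A minimal sketch of what makeQuery("wiki*", "title") might build, assuming the (made-up) matching terms "wiki", "wikipedia" and "wikitext":

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.DisjunctionMaxQuery;
    import org.apache.lucene.search.TermQuery;

    public class ExpansionSketch {
        public static void main(String[] args) {
            DisjunctionMaxQuery q = new DisjunctionMaxQuery(0f);
            q.add(new TermQuery(new Term("title", "wiki")));
            q.add(new TermQuery(new Term("title", "wikipedia")));
            q.add(new TermQuery(new Term("title", "wikitext")));
            System.out.println(q); // prints the expanded disjunction
        }
    }
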
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/WikiSearcher.java
@@ -4,7 +4,9 @@
55 import java.util.ArrayList;
66 import java.util.Arrays;
77 import java.util.Collection;
 8+import java.util.HashMap;
89 import java.util.Hashtable;
 10+import java.util.Map.Entry;
911
1012 import org.apache.log4j.Logger;
1113 import org.apache.lucene.document.Document;
@@ -108,6 +110,15 @@
109111 else
110112 return cache.getSearchableHost(s);
111113 }
 114+
 115+ /** Get a map iid -> host of all parts in this searcher */
 116+ public HashMap<String,String> getAllHosts(){
 117+ HashMap<String,String> ret = new HashMap<String,String>();
 118+ for(Entry<String,Searchable> e : searcherParts.entrySet()){
 119+ ret.put(e.getKey(),cache.getSearchableHost(e.getValue()));
 120+ }
 121+ return ret;
 122+ }
112123
113124 @Override
114125 public void close() throws IOException {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -40,6 +40,7 @@
4141 import org.wikimedia.lsearch.analyzers.LanguageAnalyzer;
4242 import org.wikimedia.lsearch.analyzers.RelatedAnalyzer;
4343 import org.wikimedia.lsearch.analyzers.StopWords;
 44+import org.wikimedia.lsearch.analyzers.TokenizerOptions;
4445 import org.wikimedia.lsearch.analyzers.WikiTokenizer;
4546 import org.wikimedia.lsearch.beans.Article;
4647 import org.wikimedia.lsearch.beans.IndexReportCard;
@@ -53,6 +54,7 @@
5455 import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
5556 import org.wikimedia.lsearch.util.Localization;
5657 import org.wikimedia.lsearch.util.MathFunc;
 58+import org.wikimedia.lsearch.util.StringUtils;
5759
5860 /**
5961 * IndexModifier for batch update of local lucene index.
@@ -481,7 +483,7 @@
482484 FilterFactory filters = bs.getFilters();
483485
484486 // tokenize the article to fill in pre-analyzed fields
485 - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,bs.isExactCase());
 487+ WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,new TokenizerOptions(bs.isExactCase()));
486488 tokenizer.tokenize();
487489
488490 // title
@@ -510,7 +512,7 @@
511513 }
512514
513515 // reverse title for wildcard searches
514 - Field rtitle = new Field(fields.reverse_title(), reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED);
 516+ Field rtitle = new Field(fields.reverse_title(), StringUtils.reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED);
515517 rtitle.setBoost(rankBoost);
516518 doc.add(rtitle);
517519
@@ -518,20 +520,11 @@
519521 return doc;
520522 }
521523
522 - /** reverse a string */
523 - public static String reverseString(String str){
524 - int len = str.length();
525 - char[] buf = new char[len];
526 - for(int i=0;i<len;i++)
527 - buf[i] = str.charAt(len-i-1);
528 - return new String(buf,0,len);
529 - }
530 -
531524 /** add related aggregate field */
532525 protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){
533526 ArrayList<Aggregate> items = new ArrayList<Aggregate>();
534527 for(RelatedTitle rt : article.getRelated()){
535 - items.add(new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords));
 528+ addToItems(items,new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords));
536529 }
537530 makeAggregate(doc,prefix,items);
538531 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/Transaction.java
@@ -10,6 +10,7 @@
1111 import org.wikimedia.lsearch.config.Configuration;
1212 import org.wikimedia.lsearch.config.IndexId;
1313 import org.wikimedia.lsearch.util.Command;
 14+import org.wikimedia.lsearch.util.FSUtils;
1415
1516 /**
1617 * Simple transaction support for indexing. Wrap index operations by
@@ -58,20 +59,19 @@
5960 // start new transaction
6061 backup.getParentFile().mkdirs();
6162 try{
62 - if( exec("/bin/cp -lr "+iid.getIndexPath()+" "+backup.getAbsolutePath()) == 0){
63 - Properties prop = new Properties();
64 - // write out the status file
65 - prop.setProperty("status","started at "+System.currentTimeMillis());
66 - FileOutputStream fileos = new FileOutputStream(info,false);
67 - prop.store(fileos,"");
68 - fileos.close();
69 - // all is good, set transaction flag
70 - inTransaction = true;
71 - log.info("Transaction on index "+iid+" started");
72 - } else
73 - log.warn("Making a transaction copy for "+iid+" failed.");
 63+ // make a copy
 64+ FSUtils.createHardLinkRecursive(iid.getIndexPath(),backup.getAbsolutePath());
 65+ Properties prop = new Properties();
 66+ // write out the status file
 67+ prop.setProperty("status","started at "+System.currentTimeMillis());
 68+ FileOutputStream fileos = new FileOutputStream(info,false);
 69+ prop.store(fileos,"");
 70+ fileos.close();
 71+ // all is good, set transaction flag
 72+ inTransaction = true;
 73+ log.info("Transaction on index "+iid+" started");
7474 } catch(Exception e){
75 - log.warn("Error while initializing transaction: "+e.getMessage());
 75+ log.error("Error while initializing transaction: "+e.getMessage());
7676 }
7777 }
7878
@@ -82,11 +82,11 @@
8383 // cleanup before starting new transaction
8484 try{
8585 if(trans.exists())
86 - exec("/bin/rm -rf "+trans.getAbsolutePath());
 86+ FSUtils.deleteRecursive(trans.getAbsoluteFile());
8787 if(info.exists())
88 - exec("/bin/rm -rf "+info.getAbsolutePath());
 88+ FSUtils.deleteRecursive(info.getAbsoluteFile());
8989 } catch(Exception e){
90 - log.warn("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage());
 90+ log.error("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage());
9191 }
9292
9393 }
@@ -122,15 +122,14 @@
123123 try{
124124 if(index.exists()) // clear locks before recovering
125125 WikiIndexModifier.unlockIndex(iid.getIndexPath());
126 - if( exec("/bin/rm -rf "+iid.getIndexPath()) == 0 ){
127 - if( exec("/bin/mv "+backup.getAbsolutePath()+" "+iid.getIndexPath()) == 0 ){
128 - log.info("Successfully recovered index for "+iid);
129 - } else
130 - log.warn("Recovery of "+iid+" failed: cannot move "+backup.getAbsolutePath());
131 - } else
132 - log.warn("Recovery of "+iid+" failed: cannot delete "+iid.getIndexPath());
 126+
 127+ // delete old indexpath
 128+ FSUtils.deleteRecursive(new File(iid.getIndexPath()));
 129+
 130+ FSUtils.createHardLinkRecursive(backup.getAbsolutePath(),iid.getIndexPath());
 131+ FSUtils.deleteRecursive(backup.getAbsoluteFile()); // cleanup
133132 } catch(Exception e){
134 - log.warn("Recovery of index "+iid+" failed with error "+e.getMessage());
 133+ log.error("Recovery of index "+iid+" failed with error "+e.getMessage());
135134 }
136135 }
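
The shell-outs to /bin/cp -lr and /bin/rm -rf are gone in favor of FSUtils.createHardLinkRecursive() and FSUtils.deleteRecursive(). Hard-linking keeps the transaction backup cheap, since Lucene index files are write-once. FSUtils itself is not part of this diff; a minimal sketch of what a recursive hard-link copy can look like on Java 5 (an illustration only, not the actual FSUtils implementation):

    import java.io.File;
    import java.io.IOException;

    public class HardLinkCopy {
        /** Hard-link src into dst recursively; sketch only, not FSUtils. */
        public static void linkRecursive(File src, File dst)
                throws IOException, InterruptedException {
            if (src.isDirectory()) {
                dst.mkdirs();
                File[] children = src.listFiles();
                if (children != null)
                    for (File f : children)
                        linkRecursive(f, new File(dst, f.getName()));
            } else {
                // Java 5 has no hard-link API, so link file-by-file via ln(1)
                Process p = Runtime.getRuntime().exec(new String[]{
                        "ln", src.getAbsolutePath(), dst.getAbsolutePath() });
                if (p.waitFor() != 0)
                    throw new IOException("ln failed for " + src);
            }
        }
    }
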
137136
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ReusableLanguageAnalyzer.java
@@ -6,18 +6,19 @@
77 import org.apache.lucene.analysis.TokenStream;
88
99 /**
10 - * Reusable language analyzer. Can be used to tokenize arbitrary text.
 10+ * Reusable language analyzer. Should be used to tokenize queries and
 11+ * other non-wikitext input. Has template relocation, etc. turned off.
1112 *
1213 * @author rainman
1314 *
1415 */
1516 public class ReusableLanguageAnalyzer extends LanguageAnalyzer {
1617 static org.apache.log4j.Logger log = Logger.getLogger(ReusableLanguageAnalyzer.class);
17 - protected boolean exactCase;
 18+ protected TokenizerOptions options;
1819
1920 public ReusableLanguageAnalyzer(FilterFactory filters, boolean exactCase){
2021 super(filters,null);
21 - this.exactCase = exactCase;
 22+ this.options = new TokenizerOptions.NoRelocation(exactCase);
2223 }
2324
2425 /**
@@ -25,7 +26,7 @@
2627 */
2728 @Override
2829 public TokenStream tokenStream(String fieldName, String text) {
29 - wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase);
 30+ wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),options);
3031 return super.tokenStream(fieldName,(Reader)null);
3132 }
3233
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java
@@ -64,9 +64,10 @@
6565 ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>();
6666 for(int i=0;i<KEYWORD_LEVELS;i++)
6767 keywordsBySize.add(new ArrayList<String>());
 68+ TokenizerOptions options = new TokenizerOptions(exactCase);
6869 // arrange keywords into a list by token number
6970 for(String k : keywords){
70 - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse();
 71+ ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse();
7172 if(parsed.size() == 0)
7273 continue;
7374 else if(parsed.size() < KEYWORD_LEVELS)
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java
@@ -0,0 +1,26 @@
 2+package org.wikimedia.lsearch.analyzers;
 3+
 4+/**
 5+ * FastWikiTokenizerEngine options
 6+ *
 7+ * @author rainman
 8+ *
 9+ */
 10+public class TokenizerOptions {
 11+ /** whether capitalization should be preserved */
 12+ boolean exactCase = false;
 13+ /** whether templates should be relocated, etc.; makes sense only if a whole
 14+ * article is parsed (and not a query or part of an article) */
 15+ boolean relocationParsing = true;
 16+
 17+ public TokenizerOptions(boolean exactCase){
 18+ this.exactCase = exactCase;
 19+ }
 20+
 21+ public static class NoRelocation extends TokenizerOptions {
 22+ public NoRelocation(boolean exactCase){
 23+ super(exactCase);
 24+ this.relocationParsing = false;
 25+ }
 26+ }
 27+}
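
TokenizerOptions usage in brief: plain TokenizerOptions keeps relocation parsing on for whole articles, while the NoRelocation subclass turns it off for queries and other non-wikitext input (as ReusableLanguageAnalyzer above now does). A minimal sketch, assuming the global Configuration is loaded as in the test classes:

    import org.wikimedia.lsearch.analyzers.TokenizerOptions;
    import org.wikimedia.lsearch.analyzers.WikiTokenizer;
    import org.wikimedia.lsearch.config.IndexId;

    public class OptionsDemo {
        public static void main(String[] args) {
            // relocation on: for whole articles at index time
            TokenizerOptions forArticles = new TokenizerOptions(false);
            // relocation off: for queries and other non-wikitext input
            TokenizerOptions forQueries = new TokenizerOptions.NoRelocation(false);
            WikiTokenizer t = new WikiTokenizer("[[elephant]]s are large",
                    IndexId.get("enwiki"), forArticles);
            t.tokenize();
        }
    }
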
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java
@@ -20,7 +20,8 @@
2121
2222 /** Construct from arbitrary text that will be tokenized */
2323 public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){
24 - tokens = new FastWikiTokenizerEngine(text,iid,exactCase).parse();
 24+ TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase);
 25+ tokens = new FastWikiTokenizerEngine(text,iid,options).parse();
2526 this.boost = boost;
2627 noStopWordsLength = 0;
2728 for(Token t : tokens){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/AggregateAnalyzer.java
@@ -24,7 +24,7 @@
2525 if(item >= items.size())
2626 return null;
2727 Aggregate ag = items.get(item);
28 - if(token >= ag.length()){
 28+ if(token >= ag.length() || token >= TOKEN_GAP-1){
2929 gap = true;
3030 do{
3131 // find next nonempty item
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java
@@ -38,8 +38,8 @@
3939 * @param str
4040 */
4141
42 - public WikiTokenizer(String str, IndexId iid, boolean exactCase){
43 - parser = new FastWikiTokenizerEngine(str,iid,exactCase);
 42+ public WikiTokenizer(String str, IndexId iid, TokenizerOptions options){
 43+ parser = new FastWikiTokenizerEngine(str,iid,options);
4444 this.input = null;
4545 }
4646
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -56,6 +56,8 @@
5757 private int templateLevel = 0; // level of nestedness of templates
5858 private int gap = 1;
5959 private ArrayList<Token> nonContentTokens; // tokens from the beginning of the article that are not content, but templates, images, etc..
 60+ private ArrayList<Token> references; // text between <ref></ref> tags goes to the end
 61+ private boolean inRef = false, firstRef = false; // whether we are within a ref tag
6062
6163 private int prefixLen = 0;
6264 private final char[] prefixBuf = new char[MAX_WORD_LEN];
@@ -77,6 +79,8 @@
7880 public static int BULLETIN_GAP = 10;
7981 /** Gap between sentences */
8082 public static int SENTENCE_GAP = 2;
 83+ /** Gap between references */
 84+ public static int REFERENCE_GAP = 20;
8185
8286 /** language code */
8387 private String language;
@@ -87,9 +91,8 @@
8892 private static Hashtable<String,HashSet<String>> categoryLocalized = new Hashtable<String,HashSet<String>>();
8993 private static HashSet<String> interwiki;
9094
91 - /** if true, words won't be lowercased */
92 - private boolean exactCase = false;
93 - private UnicodeDecomposer decomposer;
 95+ private UnicodeDecomposer decomposer;
 96+ private TokenizerOptions options;
9497
9598 enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD,
9699 LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS,
@@ -108,6 +111,9 @@
109112 numberToken = false;
110113 headingText = new ArrayList<String>();
111114 nonContentTokens = new ArrayList<Token>();
 115+ inRef = false;
 116+ firstRef = false;
 117+ references = new ArrayList<Token>();
112118 }
113119
114120 /** Note: this will read only 1024 bytes of reader, it's
@@ -127,12 +133,12 @@
128134 }
129135 }
130136
131 - public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){
 137+ public FastWikiTokenizerEngine(String text, IndexId iid, TokenizerOptions options){
132138 this.text = text.toCharArray();
133139 this.textString = text;
134140 this.language = iid.getLangCode();
135141 this.iid = iid;
136 - this.exactCase = exactCase;
 142+ this.options = options;
137143 textLength = text.length();
138144 init();
139145 }
@@ -175,7 +181,7 @@
176182 boolean addDecomposed = false;
177183 for(int i=0;i<length;i++){
178184 addToAlias = true;
179 - if( ! exactCase )
 185+ if( ! options.exactCase )
180186 cl = Character.toLowerCase(buffer[i]);
181187 else{
182188 cl = buffer[i];
@@ -303,7 +309,19 @@
304310 * @param t
305311 */
306312 private final void addToTokens(Token t){
307 - if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){
 313+ if(!options.relocationParsing){
 314+ tokens.add(t);
 315+ return;
 316+ }
 317+ // and now, relocation parsing:
 318+ if(inRef){
 319+ if(firstRef){ // delimit whole references from each other
 320+ firstRef = false;
 321+ t.setPositionIncrement(REFERENCE_GAP);
 322+ }
 323+ references.add(t);
 324+ return;
 325+ } else if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){
308326 nonContentTokens.add(t);
309327 return;
310328 } else if(t.getPositionIncrement() == FIRST_SECTION_GAP){
@@ -478,11 +496,34 @@
479497 // check
480498 if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){
481499 headings++;
482 - headingText.add(new String(text,cur+start,endOfLine-(cur+start+end)));
 500+ headingText.add(deleteRefs(new String(text,cur+start,endOfLine-(cur+start+end))));
483501 }
484502 }
485503 }
486504
 505+ /** Delete <ref></ref> text from a string */
 506+ protected String deleteRefs(String str){
 507+ int start;
 508+ while((start = str.indexOf("<ref>")) != -1){
 509+ int end = str.indexOf("</ref>",start+1);
 510+ if(end == -1)
 511+ break;
 512+ str = str.substring(0,start)+((end+6<str.length())? str.substring(end+6) : "");
 513+ }
 514+ return str;
 515+ }
 516+
 517+ /** Check if a string is matched starting from the current position */
 518+ protected boolean matchesString(String target){
 519+ if(cur + target.length() >= textLength)
 520+ return false;
 521+ for(lookup=cur,lc=0;lc<target.length();lookup++,lc++){
 522+ if(target.charAt(lc) != Character.toLowerCase(text[lookup]))
 523+ return false;
 524+ }
 525+ return true;
 526+ }
 527+
487528 /**
488529 * Parse Wiki text, and produce an arraylist of tokens.
489530 * Also fills the lists categories and interwikis.
@@ -517,14 +558,16 @@
518559 case '=':
519560 addToken();
520561 checkHeadings();
521 - if(headings == 1)
522 - gap = FIRST_SECTION_GAP;
523 - else if(headings > 1)
524 - gap = SECTION_GAP;
 562+ if(options.relocationParsing){
 563+ if(headings == 1)
 564+ gap = FIRST_SECTION_GAP;
 565+ else if(headings > 1)
 566+ gap = SECTION_GAP;
 567+ }
525568 continue;
526569 case '\n':
527570 addToken();
528 - if(cur + 1 < textLength){
 571+ if(options.relocationParsing && cur + 1 < textLength){
529572 switch(text[cur+1]){
530573 case '\n': gap = PARAGRAPH_GAP; break;
531574 case '*': case ':': case '#': gap = BULLETIN_GAP; break;
@@ -539,11 +582,19 @@
540583 case ':':
541584 case ';':
542585 addToken();
543 - if(gap == 1)
 586+ if(options.relocationParsing && gap == 1)
544587 gap = SENTENCE_GAP;
545588 continue;
546589 case '<':
547590 addToken();
 591+ if(matchesString("<ref>")){
 592+ inRef = true;
 593+ firstRef = true;
 594+ }
 595+ if(matchesString("</ref>")){
 596+ inRef = false;
 597+ gap = 1;
 598+ }
548599 state = ParserState.IGNORE;
549600 ignoreEnd = '>';
550601 continue;
@@ -817,6 +868,12 @@
818869 }
819870 nonContentTokens.clear();
820871 }
 872+ // add references to end
 873+ if(references.size() != 0){
 874+ for(Token tt : references){
 875+ tokens.add(tt);
 876+ }
 877+ }
821878 return tokens;
822879 }
823880
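
The new inRef/references machinery relocates everything between <ref> and </ref> to the end of the token stream, with REFERENCE_GAP separating it from the body, so footnote text no longer interrupts phrase matches in article prose. A minimal sketch of the effect, mirroring FastWikiTokenizerTest and assuming a configured enwiki index (the sample text is made up):

    import org.apache.lucene.analysis.Token;
    import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine;
    import org.wikimedia.lsearch.analyzers.TokenizerOptions;
    import org.wikimedia.lsearch.config.IndexId;

    public class RefRelocationDemo {
        public static void main(String[] args) {
            String text = "Elephants are large.<ref>Smith 2001</ref> They live in Africa.";
            FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(
                    text, IndexId.get("enwiki"), new TokenizerOptions(false));
            // body tokens print first; "smith" and "2001" come last, with a
            // position increment of REFERENCE_GAP on the first reference token
            for (Token t : parser.parse())
                System.out.println(t.termText() + " +" + t.getPositionIncrement());
        }
    }
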
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -42,6 +42,7 @@
4343 import org.wikimedia.lsearch.search.NamespaceFilter;
4444 import org.wikimedia.lsearch.search.RankField;
4545 import org.wikimedia.lsearch.search.RankValue;
 46+import org.wikimedia.lsearch.search.Wildcards;
4647 import org.wikimedia.lsearch.search.RankField.RankFieldSource;
4748 import org.wikimedia.lsearch.util.UnicodeDecomposer;
4849
@@ -79,7 +80,7 @@
8081 private float defaultBoost = 1;
8182 private float defaultAliasBoost = ALIAS_BOOST;
8283 protected enum TokenType {WORD, FIELD, AND, OR, EOF };
83 -
 84+
8485 private TokenStream tokenStream;
8586 private ArrayList<Token> tokens; // tokens from analysis
8687
@@ -140,6 +141,7 @@
141142 protected FieldBuilder.BuilderSet builder;
142143 protected FieldNameFactory fields;
143144 protected HashSet<String> stopWords;
 145+ protected Wildcards wildcards = null;
144146
145147 /** default value for boolean queries */
146148 public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST;
@@ -620,7 +622,7 @@
621623 continue;
622624
623625 // terms, fields
624 - if(Character.isLetterOrDigit(c) || c == '['){
 626+ if(Character.isLetterOrDigit(c) || c == '[' || c=='*' || c=='?'){
625627 // check for generic namespace prefixes, e.g. [0,1]:
626628 if(c == '['){
627629 if(fetchGenericPrefix())
@@ -780,6 +782,33 @@
781783 return query;
782784 }
783785
 786+ /** return true if the buffer holds a valid wildcard pattern */
 787+ private boolean bufferIsWildCard(){
 788+ if(length < 1)
 789+ return false;
 790+ boolean wild = false;
 791+ int index = -1;
 792+ for(int i=0;i<length;i++){
 793+ if(buffer[i] == '*' || buffer[i] == '?'){
 794+ wild = true;
 795+ index = i;
 796+ break;
 797+ }
 798+ }
 799+ // check if it's a valid wildcard
 800+ if(wild){
 801+ if((buffer[0] == '*' || buffer[0] == '?') && (buffer[length-1]=='*' || buffer[length-1]=='?'))
 802+ return false; // don't support patterns like *a*
 803+ if(index == length-1 && buffer[index]=='?')
 804+ return false; // probably just an ordinary question mark
 805+ for(int i=0;i<length;i++){
 806+ if(Character.isLetterOrDigit(buffer[i]))
 807+ return true; // +card :P
 808+ }
 809+ }
 810+ return false;
 811+ }
 812+
784813 /**
785814 * Constructs either a termquery or a boolean query depending on
786815 * analysis of the fetched token. A single "word" might be analyzed
@@ -798,11 +827,16 @@
799828
800829 // check for wildcard searches, they are also not analyzed/stemmed, only for titles
801830 // wildcard signs are allowed only at the end of the word, minimum one letter word
802 - if(length>1 && Character.isLetter(buffer[0]) && buffer[length-1]=='*' &&
803 - defaultField.equals(fields.title())){
804 - Query ret = new WildcardQuery(makeTerm());
805 - ret.setBoost(defaultBoost);
806 - return ret;
 831+ if(length>1 && wildcards != null && bufferIsWildCard()){
 832+ Term term = makeTerm();
 833+ Query ret = wildcards.makeQuery(term.text(),term.field());
 834+ if(ret != null){
 835+ ret.setBoost(defaultBoost);
 836+ return ret;
 837+ } else{
 838+ // something is wrong, try making normal query
 839+ return new TermQuery(term);
 840+ }
807841 }
808842
809843 if(toplevelOccur == BooleanClause.Occur.MUST_NOT)
@@ -1432,15 +1466,31 @@
14331467 /** Make the main phrase query, finds exact phrases, and sloppy phrases without stop words */
14341468 public Query makeMainPhrase(ArrayList<String> words, String field, int slop, float boost, Query stemtitle, Query related, HashSet<String> preStopWords){
14351469 RankValue val = new RankValue();
1436 - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related),
1437 - new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords);
 1470+ boolean allStopWords = true;
14381471 for(String w : words){
1439 - pq.add(new Term(field,w));
 1472+ if(!preStopWords.contains(w)){
 1473+ allStopWords = false;
 1474+ break;
 1475+ }
14401476 }
1441 - pq.setSlop(slop);
1442 - pq.setBoost(boost);
1443 - return pq;
1444 -
 1477+ if(allStopWords){
 1478+ CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.ContentsExactOptions(val,stemtitle,related));
 1479+ for(String w : words){
 1480+ pq.add(new Term(field,w));
 1481+ }
 1482+ pq.setSlop(slop);
 1483+ pq.setBoost(boost);
 1484+ return pq;
 1485+ } else{
 1486+ CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related),
 1487+ new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords);
 1488+ for(String w : words){
 1489+ pq.add(new Term(field,w));
 1490+ }
 1491+ pq.setSlop(slop);
 1492+ pq.setBoost(boost);
 1493+ return pq;
 1494+ }
14451495 }
14461496
14471497 /** make single phrase for related field */
@@ -1616,14 +1666,31 @@
16171667 /** Make the phrase that will match redirects, etc.. */
16181668 public Query makeAlttitlePhrase(ArrayList<String> words, String field, int slop, float boost, HashSet<String> preStopWords){
16191669 AggregatePhraseInfo ap = new AggregatePhraseInfo();
1620 - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap),
1621 - new QueryOptions.AlttitleExactOptions(ap),preStopWords);
 1670+ boolean allStopWords = true;
16221671 for(String w : words){
1623 - pq.add(new Term(field,w));
 1672+ if(!preStopWords.contains(w)){
 1673+ allStopWords = false;
 1674+ break;
 1675+ }
16241676 }
1625 - pq.setSlop(slop);
1626 - pq.setBoost(boost);
1627 - return pq;
 1677+ if(allStopWords){
 1678+ CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.AlttitleExactOptions(ap));
 1679+ for(String w : words){
 1680+ pq.add(new Term(field,w));
 1681+ }
 1682+ pq.setSlop(slop);
 1683+ pq.setBoost(boost);
 1684+ return pq;
 1685+ } else{
 1686+ CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap),
 1687+ new QueryOptions.AlttitleExactOptions(ap),preStopWords);
 1688+ for(String w : words){
 1689+ pq.add(new Term(field,w));
 1690+ }
 1691+ pq.setSlop(slop);
 1692+ pq.setBoost(boost);
 1693+ return pq;
 1694+ }
16281695
16291696 }
16301697
@@ -1650,7 +1717,8 @@
16511718 * @return
16521719 */
16531720 @SuppressWarnings("unchecked")
1654 - protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){
 1721+ protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords, Wildcards wildcards){
 1722+ this.wildcards = wildcards;
16551723 queryText = quoteCJK(queryText);
16561724 if(policy != null)
16571725 this.namespacePolicy = policy;
@@ -1669,6 +1737,9 @@
16701738 BooleanQuery bq = new BooleanQuery(true);
16711739 bq.add(qc,BooleanClause.Occur.SHOULD);
16721740 bq.add(qt,BooleanClause.Occur.SHOULD);
 1741+
 1742+ if(words.size() == 0)
 1743+ return bq;
16731744
16741745 HashSet<String> preStopWords = StopWords.getPredefinedSet(builder.getFilters().getIndexId());
16751746 Query alttitleQuery = makeAlttitlePhrase(words,fields.alttitle(),10,1,preStopWords);
@@ -1701,7 +1772,11 @@
17021773 return coreQuery;
17031774
17041775 }
1705 -
 1776+
 1777+ public Query parseWithWildcards(String queryText, NamespacePolicy policy, Wildcards wildcards){
 1778+ return parseMultiPass(queryText,policy,false,false,wildcards);
 1779+ }
 1780+
17061781 /**
17071782 * Three parse pases: contents, title, redirect
17081783 *
@@ -1711,7 +1786,7 @@
17121787 * @throws ParseException
17131788 */
17141789 public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{
1715 - return parseMultiPass(queryText,policy,true,false);
 1790+ return parseMultiPass(queryText,policy,true,false,null);
17161791 }
17171792
17181793 /**
@@ -1723,11 +1798,11 @@
17241799 */
17251800 public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{
17261801 boolean makeKeywords = global.useKeywordScoring(dbname);
1727 - return parseMultiPass(queryText,policy,true,makeKeywords);
 1802+ return parseMultiPass(queryText,policy,true,makeKeywords,null);
17281803 }
17291804
17301805 public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{
1731 - return parseMultiPass(queryText,policy,true,makeKeywords);
 1806+ return parseMultiPass(queryText,policy,true,makeKeywords,null);
17321807 }
17331808
17341809 /**
@@ -1740,7 +1815,7 @@
17411816 * @throws ParseException
17421817 */
17431818 public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{
1744 - return parseMultiPass(queryText,policy,false,false);
 1819+ return parseMultiPass(queryText,policy,false,false,null);
17451820 }
17461821
17471822 public NamespacePolicy getNamespacePolicy() {
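
For reference, the new bufferIsWildCard() check classifies query tokens roughly as follows (examples made up):

    wiki*      accepted: trailing wildcard, expanded via reverse_title
    *tion      accepted: leading wildcard, expanded via title
    *a*        rejected: wildcards at both ends are not supported
    weather?   rejected: a trailing '?' is probably an ordinary question mark

Accepted patterns go to Wildcards.makeQuery(); if expansion fails or matches nothing, the parser falls back to a plain TermQuery.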
