Index: branches/lucene-search-2.1/lib/lucene-core-2.2.0.jar |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/StringUtils.java |
— | — | @@ -0,0 +1,12 @@ |
| 2 | +package org.wikimedia.lsearch.util; |
| 3 | + |
| 4 | +public class StringUtils { |
| 5 | + /** reverse a string */ |
| 6 | + public static String reverseString(String str){ |
| 7 | + int len = str.length(); |
| 8 | + char[] buf = new char[len]; |
| 9 | + for(int i=0;i<len;i++) |
| 10 | + buf[i] = str.charAt(len-i-1); |
| 11 | + return new String(buf,0,len); |
| 12 | + } |
| 13 | +} |
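The reason reverseString is factored out into a utility becomes clear in Wildcards.java below: a trailing-wildcard pattern can be answered by a cheap prefix seek in the term dictionary, and reversing both the indexed title and the pattern turns a leading-wildcard search into the same cheap seek. A minimal, self-contained sketch of that equivalence (plain Java, hypothetical values):

```java
public class ReverseWildcardDemo {
    // equivalent to StringUtils.reverseString above
    static String reverse(String s){
        return new StringBuilder(s).reverse().toString();
    }

    public static void main(String[] args){
        String title = "wikipedia";
        // the leading-wildcard pattern "*pedia" matches iff the reversed
        // title starts with the reversed pattern body "aidep"
        boolean matches = reverse(title).startsWith(reverse("pedia"));
        System.out.println(matches); // true
    }
}
```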
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/EnglishAnalyzer.java |
— | — | @@ -29,6 +29,7 @@ |
30 | 30 | import org.apache.log4j.Logger; |
31 | 31 | import org.apache.lucene.analysis.Analyzer; |
32 | 32 | import org.apache.lucene.analysis.TokenStream; |
| 33 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
33 | 34 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
34 | 35 | import org.wikimedia.lsearch.config.IndexId; |
35 | 36 | |
— | — | @@ -59,6 +60,6 @@ |
60 | 61 | if(streams.get(fieldName) != null) |
61 | 62 | return streams.get(fieldName); |
62 | 63 | |
63 | | - return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),false)); |
| 64 | + return new AliasPorterStemFilter(new WikiTokenizer(text,IndexId.get("enwiki"),new TokenizerOptions(false))); |
64 | 65 | } |
65 | 66 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -11,13 +11,14 @@ |
12 | 12 | import org.apache.lucene.analysis.Token; |
13 | 13 | import org.apache.lucene.analysis.TokenStream; |
14 | 14 | import org.wikimedia.lsearch.analyzers.FastWikiTokenizerEngine; |
| 15 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
15 | 16 | import org.wikimedia.lsearch.config.Configuration; |
16 | 17 | import org.wikimedia.lsearch.config.IndexId; |
17 | 18 | import org.wikimedia.lsearch.index.WikiIndexModifier; |
18 | 19 | |
19 | 20 | public class FastWikiTokenizerTest { |
20 | 21 | public static void displayTokensForParser(String text) { |
21 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
| 22 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false)); |
22 | 23 | Token[] tokens = parser.parse().toArray(new Token[] {}); |
23 | 24 | for (int i = 0; i < tokens.length; i++) { |
24 | 25 | Token token = tokens[i]; |
— | — | @@ -125,7 +126,7 @@ |
126 | 127 | for(int i=0;i<2000;i++){ |
127 | 128 | for(TestArticle article : articles){ |
128 | 129 | String text = article.content; |
129 | | - FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),false); |
| 130 | + FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(text,IndexId.get("enwiki"),new TokenizerOptions(false)); |
130 | 131 | parser.parse(); |
131 | 132 | } |
132 | 133 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -5,6 +5,7 @@ |
6 | 6 | import java.rmi.RemoteException; |
7 | 7 | import java.rmi.registry.LocateRegistry; |
8 | 8 | import java.rmi.registry.Registry; |
| 9 | +import java.util.ArrayList; |
9 | 10 | import java.util.Arrays; |
10 | 11 | import java.util.Collection; |
11 | 12 | import java.util.Hashtable; |
— | — | @@ -20,6 +21,7 @@ |
21 | 22 | import org.wikimedia.lsearch.index.IndexUpdateRecord; |
22 | 23 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
23 | 24 | import org.wikimedia.lsearch.search.SearcherCache; |
| 25 | +import org.wikimedia.lsearch.search.Wildcards; |
24 | 26 | |
25 | 27 | /** |
26 | 28 | * Invokes procedures on a remote RMIMessenger. |
— | — | @@ -217,4 +219,14 @@ |
218 | 220 | return -1; |
219 | 221 | } |
220 | 222 | } |
| 223 | + |
| 224 | + public ArrayList<String> getTerms(String host, String dbrole, String wildcard, boolean exactCase) throws RemoteException { |
| 225 | + try{ |
| 226 | + RMIMessenger r = messengerFromCache(host); |
| 227 | + return r.getTerms(dbrole,wildcard,exactCase); |
| 228 | + } catch(Exception e){ |
| 229 | + e.printStackTrace(); |
| 230 | + return new ArrayList<String>(); |
| 231 | + } |
| 232 | + } |
221 | 233 | } |
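A hedged sketch of how a caller might use the new remote term expansion; the host and dbrole below are made-up example values, and messengerFromCache() resolves the remote stub exactly as in the other methods of this class:

```java
import java.rmi.RemoteException;
import java.util.ArrayList;
import org.wikimedia.lsearch.interoperability.RMIMessengerClient;

public class GetTermsExample {
    public static void main(String[] args) throws RemoteException {
        RMIMessengerClient messenger = new RMIMessengerClient();
        // "search3" and "enwiki.nspart1" are hypothetical values
        ArrayList<String> terms = messenger.getTerms("search3", "enwiki.nspart1", "wiki*", false);
        System.out.println(terms.size() + " title terms match wiki*");
    }
}
```

Note that getTerms never propagates a failure to the caller: on any error it prints the trace and returns an empty list, so a search degrades to finding nothing for the wildcard rather than failing outright.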
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java |
— | — | @@ -2,6 +2,7 @@ |
3 | 3 | |
4 | 4 | import java.rmi.Remote; |
5 | 5 | import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
6 | 7 | |
7 | 8 | import org.apache.lucene.search.Query; |
8 | 9 | import org.wikimedia.lsearch.beans.IndexReportCard; |
— | — | @@ -95,4 +96,16 @@ |
96 | 97 | * @throws RemoteException |
97 | 98 | */ |
98 | 99 | public Boolean isSuccessfulFlush(String dbname) throws RemoteException; |
| 100 | + |
| 101 | + /** |
| 102 | + * Wildcard matcher. |
| 103 | + * Requests all terms from title and reverse_title that match the wildcard pattern. |
| 104 | + * |
| 105 | + * @param dbrole - part of the index, e.g. enwiki.nspart1 |
| 106 | + * @param wildcard - wildcard pattern with * and ? |
| 107 | + * @param exactCase - whether the pattern uses exact capitalization |
| 108 | + * @return list of matching terms |
| 109 | + * @throws RemoteException |
| 110 | + */ |
| 111 | + public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException; |
99 | 112 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java |
— | — | @@ -1,6 +1,8 @@ |
2 | 2 | package org.wikimedia.lsearch.interoperability; |
3 | 3 | |
| 4 | +import java.io.IOException; |
4 | 5 | import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
5 | 7 | import java.util.Arrays; |
6 | 8 | |
7 | 9 | import org.apache.log4j.Logger; |
— | — | @@ -17,6 +19,8 @@ |
18 | 20 | import org.wikimedia.lsearch.search.NamespaceFilterWrapper; |
19 | 21 | import org.wikimedia.lsearch.search.NetworkStatusThread; |
20 | 22 | import org.wikimedia.lsearch.search.SearchEngine; |
| 23 | +import org.wikimedia.lsearch.search.SearcherCache; |
| 24 | +import org.wikimedia.lsearch.search.Wildcards; |
21 | 25 | |
22 | 26 | /** Local implementation for {@link RMIMessenger} */ |
23 | 27 | public class RMIMessengerImpl implements RMIMessenger { |
— | — | @@ -86,6 +90,14 @@ |
87 | 91 | return new SearchEngine().searchPart(IndexId.get(dbrole),searchterm,query,filter,offset,limit,explain); |
88 | 92 | } |
89 | 93 | |
| 94 | + public ArrayList<String> getTerms(String dbrole, String wildcard, boolean exactCase) throws RemoteException { |
| 95 | + try{ |
| 96 | + return Wildcards.getLocalTerms(IndexId.get(dbrole),wildcard,exactCase); |
| 97 | + } catch(IOException e){ |
| 98 | + throw new RemoteException("IOException on "+dbrole,e); |
| 99 | + } |
| 100 | + } |
| 101 | + |
90 | 102 | // inherit javadoc |
91 | 103 | public int getIndexerQueueSize() throws RemoteException { |
92 | 104 | return IndexThread.getQueueSize(); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -156,13 +156,10 @@ |
157 | 157 | ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection()); |
158 | 158 | res.setNumHits(col.size()); |
159 | 159 | res.setSuccess(true); |
160 | | - // TODO: this is extremely slow |
161 | | - //Links links = Links.openForRead(lin,lin.getSearchPath()); |
162 | 160 | for(int i=offset;i<offset+limit && i<col.size();i++){ |
163 | 161 | RelatedTitle rt = col.get(i); |
164 | 162 | Title t = rt.getRelated(); |
165 | 163 | ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle()); |
166 | | - //rs.addContext(links.getContext(t.getKey(),key)); |
167 | 164 | res.addResult(rs); |
168 | 165 | } |
169 | 166 | } else{ |
— | — | @@ -305,7 +302,7 @@ |
306 | 303 | |
307 | 304 | WikiSearcher searcher = null; |
308 | 305 | try { |
309 | | - q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll); |
| 306 | + //q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll); |
310 | 307 | |
311 | 308 | TopDocs hits=null; |
312 | 309 | // see if we can search only part of the index |
— | — | @@ -331,7 +328,11 @@ |
332 | 329 | log.error("Error contacting searcher for "+piid); |
333 | 330 | return res; |
334 | 331 | } |
335 | | - RMIMessengerClient messenger = new RMIMessengerClient(); |
| 332 | + // query |
| 333 | + Wildcards wildcards = new Wildcards(piid,host,exactCase); |
| 334 | + q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards); |
| 335 | + |
| 336 | + RMIMessengerClient messenger = new RMIMessengerClient(); |
336 | 337 | res = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host); |
337 | 338 | if(sug != null){ |
338 | 339 | SuggestQuery sq = sug.suggest(searchterm,parser,res); |
— | — | @@ -360,6 +361,10 @@ |
361 | 362 | searcher = new WikiSearcher(iid); |
362 | 363 | // normal search |
363 | 364 | try{ |
| 365 | + // query |
| 366 | + Wildcards wildcards = new Wildcards(searcher.getAllHosts(),exactCase); |
| 367 | + q = parseQuery(searchterm,parser,iid,raw,nsfw,searchAll,wildcards); |
| 368 | + |
364 | 369 | hits = searcher.search(q,nsfw,offset+limit); |
365 | 370 | res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
366 | 371 | if(sug != null){ |
— | — | @@ -406,7 +411,7 @@ |
407 | 412 | } |
408 | 413 | } |
409 | 414 | |
410 | | - protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll) throws ParseException { |
| 415 | + protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, NamespaceFilterWrapper nsfw, boolean searchAll, Wildcards wildcards) throws ParseException { |
411 | 416 | Query q = null; |
412 | 417 | if(raw){ |
413 | 418 | // do minimal parsing, make a raw query |
— | — | @@ -414,11 +419,11 @@ |
415 | 420 | q = parser.parseRaw(searchterm); |
416 | 421 | } else if(nsfw == null){ |
417 | 422 | if(searchAll) |
418 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 423 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards); |
419 | 424 | else |
420 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,iid.getDBname()); |
| 425 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.REWRITE,wildcards); |
421 | 426 | } else{ |
422 | | - q = parser.parseFourPass(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,iid.getDBname()); |
| 427 | + q = parser.parseWithWildcards(searchterm,WikiQueryParser.NamespacePolicy.IGNORE,wildcards); |
423 | 428 | log.info("Using NamespaceFilterWrapper "+nsfw); |
424 | 429 | } |
425 | 430 | return q; |
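The reordering above is the point of the change: wildcard expansion has to know which host serves each index part, so the query can only be parsed after the searcher (or remote part) is resolved. A condensed restatement of the new distributed-search flow, not a verbatim excerpt; error handling is omitted and the variables are the parameters of SearchEngine.search():

```java
WikiSearcher searcher = new WikiSearcher(iid);
// 1. resolve which host serves each part of the index
Wildcards wildcards = new Wildcards(searcher.getAllHosts(), exactCase);
// 2. only now parse, so the parser can expand wildcards against those hosts
Query q = parseQuery(searchterm, parser, iid, raw, nsfw, searchAll, wildcards);
// 3. search as before
TopDocs hits = searcher.search(q, nsfw, offset + limit);
```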
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java |
— | — | @@ -42,10 +42,11 @@ |
43 | 43 | protected byte[] lengthNoStopWords = null; |
44 | 44 | protected float[] boost = null; |
45 | 45 | protected IndexReader reader = null; |
| 46 | + protected String field; |
46 | 47 | |
47 | 48 | protected AggregateMetaFieldSource(IndexReader reader, String fieldBase) throws IOException{ |
48 | 49 | this.reader = reader; |
49 | | - String field = fieldBase+"_meta"; |
| 50 | + field = fieldBase+"_meta"; |
50 | 51 | Collection fields = reader.getFieldNames(FieldOption.ALL); |
51 | 52 | if(!fields.contains(field)) |
52 | 53 | return; // index doesn't have ranking info |
— | — | @@ -118,10 +119,10 @@ |
119 | 120 | int end = (docid == index.length-1)? length.length : index[docid+1]; |
120 | 121 | if(position >= end-start) |
121 | 122 | try { |
122 | | - throw new ArrayIndexOutOfBoundsException("Requestion position "+position+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1)); |
| 123 | + throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" for "+docid+" ["+reader.document(docid).get("title")+"], but last valid index is "+(end-start-1)); |
123 | 124 | } catch (IOException e) { |
124 | 125 | e.printStackTrace(); |
125 | | - throw new ArrayIndexOutOfBoundsException("Requestion position "+position+" unavailable"); |
| 126 | + throw new ArrayIndexOutOfBoundsException("Requested position "+position+" on field "+field+" unavailable"); |
126 | 127 | } |
127 | 128 | return start+position; |
128 | 129 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java |
— | — | @@ -0,0 +1,161 @@ |
| 2 | +package org.wikimedia.lsearch.search; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.rmi.RemoteException; |
| 6 | +import java.util.ArrayList; |
| 7 | +import java.util.Collection; |
| 8 | +import java.util.HashMap; |
| 9 | +import java.util.HashSet; |
| 10 | +import java.util.Map.Entry; |
| 11 | + |
| 12 | +import org.apache.log4j.Logger; |
| 13 | +import org.apache.lucene.index.IndexReader; |
| 14 | +import org.apache.lucene.index.Term; |
| 15 | +import org.apache.lucene.search.DisjunctionMaxQuery; |
| 16 | +import org.apache.lucene.search.Query; |
| 17 | +import org.apache.lucene.search.TermQuery; |
| 18 | +import org.apache.lucene.search.WildcardTermEnum; |
| 19 | +import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
| 20 | +import org.wikimedia.lsearch.config.IndexId; |
| 21 | +import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
| 22 | +import org.wikimedia.lsearch.util.StringUtils; |
| 23 | + |
| 24 | +/** |
| 25 | + * Wildcard-search related functions |
| 26 | + * @author rainman |
| 27 | + * |
| 28 | + */ |
| 29 | +public class Wildcards { |
| 30 | + protected static Logger log = Logger.getLogger(Wildcards.class); |
| 31 | + public static final int MAX_TERMS = 1024; |
| 32 | + protected static SearcherCache searcherCache = null; |
| 33 | + protected enum WildcardType { PREFIX, SUFFIX, INVALID }; |
| 34 | + |
| 35 | + /** wildcard pattern -> terms */ |
| 36 | + protected HashMap<String,HashSet<String>> wildcardCache = new HashMap<String,HashSet<String>>(); |
| 37 | + /** iid -> host */ |
| 38 | + protected HashMap<String,String> hosts = new HashMap<String,String>(); |
| 39 | + |
| 40 | + protected RMIMessengerClient client = null; |
| 41 | + protected boolean exactCase; |
| 42 | + |
| 43 | + public Wildcards(IndexId iid, String host, boolean exactCase){ |
| 44 | + hosts.put(iid.toString(),host); |
| 45 | + this.exactCase = exactCase; |
| 46 | + } |
| 47 | + |
| 48 | + public Wildcards(HashMap<String,String> iidHostMapping, boolean exactCase){ |
| 49 | + hosts.putAll(iidHostMapping); |
| 50 | + this.exactCase = exactCase; |
| 51 | + } |
| 52 | + |
| 53 | + /** |
| 54 | + * Make a DisjunctionMaxQuery from the expanded wildcard terms |
| 55 | + * |
| 56 | + * @param wildcard |
| 57 | + * @param field |
| 58 | + * @return null if there is no match, or on error |
| 59 | + */ |
| 60 | + public Query makeQuery(String wildcard, String field){ |
| 61 | + if(client == null) |
| 62 | + client = new RMIMessengerClient(); |
| 63 | + |
| 64 | + HashSet<String> terms = wildcardCache.get(wildcard); |
| 65 | + if(terms == null){ |
| 66 | + terms = new HashSet<String>(); |
| 67 | + for(Entry<String,String> e : hosts.entrySet()){ |
| 68 | + try { |
| 69 | + terms.addAll(client.getTerms(e.getValue(),e.getKey(),wildcard,exactCase)); |
| 70 | + } catch (RemoteException e1) { |
| 71 | + e1.printStackTrace(); |
| 72 | + log.warn("Cannot get terms for "+wildcard+" on host "+e.getValue()+" for "+e.getKey()); |
| 73 | + } |
| 74 | + } |
| 75 | + wildcardCache.put(wildcard,terms); |
| 76 | + log.info("Using "+terms.size()+" terms for pattern="+wildcard); |
| 77 | + } |
| 78 | + |
| 79 | + if(terms.size() == 0) |
| 80 | + return null; // no match or error |
| 81 | + |
| 82 | + return makeQuery(terms,field); |
| 83 | + } |
| 84 | + |
| 85 | + /** Construct a DisjunctionMaxQuery from terms */ |
| 86 | + protected Query makeQuery(HashSet<String> terms, String field){ |
| 87 | + if(terms.size() > MAX_TERMS){ |
| 88 | + HashSet<String> temp = new HashSet<String>(); |
| 89 | + int count = 0; |
| 90 | + for(String t : terms){ |
| 91 | + if(count >= MAX_TERMS) |
| 92 | + break; |
| 93 | + temp.add(t); |
| 94 | + count++; |
| 95 | + } |
| 96 | + terms = temp; |
| 97 | + } |
| 98 | + DisjunctionMaxQuery q = new DisjunctionMaxQuery(0); |
| 99 | + for(String t : terms){ |
| 100 | + q.add(new TermQuery(new Term(field,t))); |
| 101 | + } |
| 102 | + return q; |
| 103 | + } |
| 104 | + |
| 105 | + protected static WildcardType getType(String wildcard){ |
| 106 | + if(wildcard == null || wildcard.equals("")) |
| 107 | + return WildcardType.INVALID; |
| 108 | + boolean pre = wildcard.startsWith("*") || wildcard.startsWith("?"); |
| 109 | + boolean suff = wildcard.endsWith("*") || wildcard.endsWith("?"); |
| 110 | + if(pre && !suff) |
| 111 | + return WildcardType.PREFIX; |
| 112 | + else if(suff && !pre) |
| 113 | + return WildcardType.SUFFIX; |
| 114 | + else |
| 115 | + return WildcardType.INVALID; |
| 116 | + } |
| 117 | + |
| 118 | + public static ArrayList<String> getLocalTerms(IndexId iid, String wildcard, boolean exactCase) throws IOException { |
| 119 | + if(searcherCache == null) |
| 120 | + searcherCache = SearcherCache.getInstance(); |
| 121 | + ArrayList<String> ret = new ArrayList<String>(); |
| 122 | + // check type of wildcard |
| 123 | + WildcardType type = getType(wildcard); |
| 124 | + if(type == WildcardType.INVALID) |
| 125 | + return ret; |
| 126 | + // check searcher |
| 127 | + IndexSearcherMul searcher = searcherCache.getLocalSearcher(iid); |
| 128 | + if(searcher == null) |
| 129 | + throw new IOException(iid+" not a local index, or index not available"); |
| 130 | + |
| 131 | + // get field |
| 132 | + IndexReader reader = searcher.getIndexReader(); |
| 133 | + String field = null; |
| 134 | + Term wildcardTerm = null; |
| 135 | + FieldNameFactory fields = new FieldNameFactory(exactCase); |
| 136 | + if(type == WildcardType.PREFIX){ |
| 137 | + field = fields.reverse_title(); |
| 138 | + wildcardTerm = new Term(field,StringUtils.reverseString(wildcard)); |
| 139 | + } else{ |
| 140 | + field = fields.title(); |
| 141 | + wildcardTerm = new Term(field,wildcard); |
| 142 | + } |
| 143 | + |
| 144 | + // get terms |
| 145 | + Term t; |
| 146 | + WildcardTermEnum te = new WildcardTermEnum(reader,wildcardTerm); |
| 147 | + while((t = te.term()) != null){ |
| 148 | + if(type == WildcardType.PREFIX) |
| 149 | + ret.add(StringUtils.reverseString(t.text())); |
| 150 | + else |
| 151 | + ret.add(t.text()); |
| 152 | + |
| 153 | + if(!te.next()) |
| 154 | + break; |
| 155 | + if(ret.size() >= MAX_TERMS) |
| 156 | + break; |
| 157 | + } |
| 158 | + |
| 159 | + return ret; |
| 160 | + } |
| 161 | + |
| 162 | +} |
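A short usage sketch for the class above; the index part and host are hypothetical, and the comments restate how getType() routes each pattern to a field:

```java
// in production the iid -> host mapping comes from WikiSearcher.getAllHosts()
Wildcards wildcards = new Wildcards(IndexId.get("enwiki"), "search3", false);
// "wiki*"  -> SUFFIX : seek prefix "wiki" in the title field
// "*pedia" -> PREFIX : seek prefix "aidep" in reverse_title, reverse matches back
// "*a*"    -> INVALID: no expansion, makeQuery() returns null
Query q = wildcards.makeQuery("wiki*", "title"); // field name as built by FieldNameFactory
if(q != null)
    System.out.println(q); // DisjunctionMaxQuery over at most MAX_TERMS term queries
```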
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/WikiSearcher.java |
— | — | @@ -4,7 +4,9 @@ |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Arrays; |
7 | 7 | import java.util.Collection; |
| 8 | +import java.util.HashMap; |
8 | 9 | import java.util.Hashtable; |
| 10 | +import java.util.Map.Entry; |
9 | 11 | |
10 | 12 | import org.apache.log4j.Logger; |
11 | 13 | import org.apache.lucene.document.Document; |
— | — | @@ -108,6 +110,15 @@ |
109 | 111 | else |
110 | 112 | return cache.getSearchableHost(s); |
111 | 113 | } |
| 114 | + |
| 115 | + /** Get map iid->host for all parts in this searcher */ |
| 116 | + public HashMap<String,String> getAllHosts(){ |
| 117 | + HashMap<String,String> ret = new HashMap<String,String>(); |
| 118 | + for(Entry<String,Searchable> e : searcherParts.entrySet()){ |
| 119 | + ret.put(e.getKey(),cache.getSearchableHost(e.getValue())); |
| 120 | + } |
| 121 | + return ret; |
| 122 | + } |
112 | 123 | |
113 | 124 | @Override |
114 | 125 | public void close() throws IOException { |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -40,6 +40,7 @@ |
41 | 41 | import org.wikimedia.lsearch.analyzers.LanguageAnalyzer; |
42 | 42 | import org.wikimedia.lsearch.analyzers.RelatedAnalyzer; |
43 | 43 | import org.wikimedia.lsearch.analyzers.StopWords; |
| 44 | +import org.wikimedia.lsearch.analyzers.TokenizerOptions; |
44 | 45 | import org.wikimedia.lsearch.analyzers.WikiTokenizer; |
45 | 46 | import org.wikimedia.lsearch.beans.Article; |
46 | 47 | import org.wikimedia.lsearch.beans.IndexReportCard; |
— | — | @@ -53,6 +54,7 @@ |
54 | 55 | import org.wikimedia.lsearch.spell.api.SpellCheckIndexer; |
55 | 56 | import org.wikimedia.lsearch.util.Localization; |
56 | 57 | import org.wikimedia.lsearch.util.MathFunc; |
| 58 | +import org.wikimedia.lsearch.util.StringUtils; |
57 | 59 | |
58 | 60 | /** |
59 | 61 | * IndexModifier for batch update of local lucene index. |
— | — | @@ -481,7 +483,7 @@ |
482 | 484 | FilterFactory filters = bs.getFilters(); |
483 | 485 | |
484 | 486 | // tokenize the article to fill in pre-analyzed fields |
485 | | - WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,bs.isExactCase()); |
| 487 | + WikiTokenizer tokenizer = new WikiTokenizer(article.getContents(),iid,new TokenizerOptions(bs.isExactCase())); |
486 | 488 | tokenizer.tokenize(); |
487 | 489 | |
488 | 490 | // title |
— | — | @@ -510,7 +512,7 @@ |
511 | 513 | } |
512 | 514 | |
513 | 515 | // reverse title for wildcard searches |
514 | | - Field rtitle = new Field(fields.reverse_title(), reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED); |
| 516 | + Field rtitle = new Field(fields.reverse_title(), StringUtils.reverseString(article.getTitle()), Field.Store.NO, Field.Index.TOKENIZED); |
515 | 517 | rtitle.setBoost(rankBoost); |
516 | 518 | doc.add(rtitle); |
517 | 519 | |
— | — | @@ -518,20 +520,11 @@ |
519 | 521 | return doc; |
520 | 522 | } |
521 | 523 | |
522 | | - /** reverse a string */ |
523 | | - public static String reverseString(String str){ |
524 | | - int len = str.length(); |
525 | | - char[] buf = new char[len]; |
526 | | - for(int i=0;i<len;i++) |
527 | | - buf[i] = str.charAt(len-i-1); |
528 | | - return new String(buf,0,len); |
529 | | - } |
530 | | - |
531 | 524 | /** add related aggregate field */ |
532 | 525 | protected static void makeRelated(Document doc, String prefix, Article article, IndexId iid, HashSet<String> stopWords){ |
533 | 526 | ArrayList<Aggregate> items = new ArrayList<Aggregate>(); |
534 | 527 | for(RelatedTitle rt : article.getRelated()){ |
535 | | - items.add(new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords)); |
| 528 | + addToItems(items,new Aggregate(rt.getRelated().getTitle(),transformRelated(rt.getScore()),iid,false,stopWords)); |
536 | 529 | } |
537 | 530 | makeAggregate(doc,prefix,items); |
538 | 531 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/Transaction.java |
— | — | @@ -10,6 +10,7 @@ |
11 | 11 | import org.wikimedia.lsearch.config.Configuration; |
12 | 12 | import org.wikimedia.lsearch.config.IndexId; |
13 | 13 | import org.wikimedia.lsearch.util.Command; |
| 14 | +import org.wikimedia.lsearch.util.FSUtils; |
14 | 15 | |
15 | 16 | /** |
16 | 17 | * Simple transaction support for indexing. Wrap index operations by |
— | — | @@ -58,20 +59,19 @@ |
59 | 60 | // start new transaction |
60 | 61 | backup.getParentFile().mkdirs(); |
61 | 62 | try{ |
62 | | - if( exec("/bin/cp -lr "+iid.getIndexPath()+" "+backup.getAbsolutePath()) == 0){ |
63 | | - Properties prop = new Properties(); |
64 | | - // write out the status file |
65 | | - prop.setProperty("status","started at "+System.currentTimeMillis()); |
66 | | - FileOutputStream fileos = new FileOutputStream(info,false); |
67 | | - prop.store(fileos,""); |
68 | | - fileos.close(); |
69 | | - // all is good, set transaction flag |
70 | | - inTransaction = true; |
71 | | - log.info("Transaction on index "+iid+" started"); |
72 | | - } else |
73 | | - log.warn("Making a transaction copy for "+iid+" failed."); |
| 63 | + // make a copy |
| 64 | + FSUtils.createHardLinkRecursive(iid.getIndexPath(),backup.getAbsolutePath()); |
| 65 | + Properties prop = new Properties(); |
| 66 | + // write out the status file |
| 67 | + prop.setProperty("status","started at "+System.currentTimeMillis()); |
| 68 | + FileOutputStream fileos = new FileOutputStream(info,false); |
| 69 | + prop.store(fileos,""); |
| 70 | + fileos.close(); |
| 71 | + // all is good, set transaction flag |
| 72 | + inTransaction = true; |
| 73 | + log.info("Transaction on index "+iid+" started"); |
74 | 74 | } catch(Exception e){ |
75 | | - log.warn("Error while intializing transaction: "+e.getMessage()); |
| 75 | + log.error("Error while initializing transaction: "+e.getMessage()); |
76 | 76 | } |
77 | 77 | } |
78 | 78 | |
— | — | @@ -82,11 +82,11 @@ |
83 | 83 | // cleanup before starting new transaction |
84 | 84 | try{ |
85 | 85 | if(trans.exists()) |
86 | | - exec("/bin/rm -rf "+trans.getAbsolutePath()); |
| 86 | + FSUtils.deleteRecursive(trans.getAbsoluteFile()); |
87 | 87 | if(info.exists()) |
88 | | - exec("/bin/rm -rf "+info.getAbsolutePath()); |
| 88 | + FSUtils.deleteRecursive(info.getAbsoluteFile()); |
89 | 89 | } catch(Exception e){ |
90 | | - log.warn("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage()); |
| 90 | + log.error("Error removing old transaction data from "+iid.getTransactionPath()+" : "+e.getMessage()); |
91 | 91 | } |
92 | 92 | |
93 | 93 | } |
— | — | @@ -122,15 +122,14 @@ |
123 | 123 | try{ |
124 | 124 | if(index.exists()) // clear locks before recovering |
125 | 125 | WikiIndexModifier.unlockIndex(iid.getIndexPath()); |
126 | | - if( exec("/bin/rm -rf "+iid.getIndexPath()) == 0 ){ |
127 | | - if( exec("/bin/mv "+backup.getAbsolutePath()+" "+iid.getIndexPath()) == 0 ){ |
128 | | - log.info("Successfully recovered index for "+iid); |
129 | | - } else |
130 | | - log.warn("Recovery of "+iid+" failed: cannot move "+backup.getAbsolutePath()); |
131 | | - } else |
132 | | - log.warn("Recovery of "+iid+" failed: cannot delete "+iid.getIndexPath()); |
| 126 | + |
| 127 | + // delete old indexpath |
| 128 | + FSUtils.deleteRecursive(new File(iid.getIndexPath())); |
| 129 | + |
| 130 | + FSUtils.createHardLinkRecursive(backup.getAbsolutePath(),iid.getIndexPath()); |
| 131 | + FSUtils.deleteRecursive(backup.getAbsoluteFile()); // cleanup |
133 | 132 | } catch(Exception e){ |
134 | | - log.warn("Recovery of index "+iid+" failed with error "+e.getMessage()); |
| 133 | + log.error("Recovery of index "+iid+" failed with error "+e.getMessage()); |
135 | 134 | } |
136 | 135 | } |
137 | 136 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ReusableLanguageAnalyzer.java |
— | — | @@ -6,18 +6,19 @@ |
7 | 7 | import org.apache.lucene.analysis.TokenStream; |
8 | 8 | |
9 | 9 | /** |
10 | | - * Reusable language analyzer. Can be used to tokenize arbitrary text. |
| 10 | + * Reusable language analyzer. Should be used to tokenize queries and |
| 11 | + * other non-wikitext content. Template relocation, etc. is turned off. |
11 | 12 | * |
12 | 13 | * @author rainman |
13 | 14 | * |
14 | 15 | */ |
15 | 16 | public class ReusableLanguageAnalyzer extends LanguageAnalyzer { |
16 | 17 | static org.apache.log4j.Logger log = Logger.getLogger(ReusableLanguageAnalyzer.class); |
17 | | - protected boolean exactCase; |
| 18 | + protected TokenizerOptions options; |
18 | 19 | |
19 | 20 | public ReusableLanguageAnalyzer(FilterFactory filters, boolean exactCase){ |
20 | 21 | super(filters,null); |
21 | | - this.exactCase = exactCase; |
| 22 | + this.options = new TokenizerOptions.NoRelocation(exactCase); |
22 | 23 | } |
23 | 24 | |
24 | 25 | /** |
— | — | @@ -25,7 +26,7 @@ |
26 | 27 | */ |
27 | 28 | @Override |
28 | 29 | public TokenStream tokenStream(String fieldName, String text) { |
29 | | - wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),exactCase); |
| 30 | + wikitokenizer = new WikiTokenizer(text,filters.getIndexId(),options); |
30 | 31 | return super.tokenStream(fieldName,(Reader)null); |
31 | 32 | } |
32 | 33 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/KeywordsAnalyzer.java |
— | — | @@ -64,9 +64,10 @@ |
65 | 65 | ArrayList<ArrayList<String>> keywordsBySize = new ArrayList<ArrayList<String>>(); |
66 | 66 | for(int i=0;i<KEYWORD_LEVELS;i++) |
67 | 67 | keywordsBySize.add(new ArrayList<String>()); |
| 68 | + TokenizerOptions options = new TokenizerOptions(exactCase); |
68 | 69 | // arange keywords into a list by token number |
69 | 70 | for(String k : keywords){ |
70 | | - ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,exactCase).parse(); |
| 71 | + ArrayList<Token> parsed = new FastWikiTokenizerEngine(k,iid,options).parse(); |
71 | 72 | if(parsed.size() == 0) |
72 | 73 | continue; |
73 | 74 | else if(parsed.size() < KEYWORD_LEVELS) |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +package org.wikimedia.lsearch.analyzers; |
| 3 | + |
| 4 | +/** |
| 5 | + * FastWikiTokenizerEngine options |
| 6 | + * |
| 7 | + * @author rainman |
| 8 | + * |
| 9 | + */ |
| 10 | +public class TokenizerOptions { |
| 11 | + /** whether capitalization should be preserved */ |
| 12 | + boolean exactCase = false; |
| 13 | + /** whether templates should be relocated, etc.; makes sense only if the whole |
| 14 | + * article is parsed (not a query or part of an article) */ |
| 15 | + boolean relocationParsing = true; |
| 16 | + |
| 17 | + public TokenizerOptions(boolean exactCase){ |
| 18 | + this.exactCase = exactCase; |
| 19 | + } |
| 20 | + |
| 21 | + public static class NoRelocation extends TokenizerOptions { |
| 22 | + public NoRelocation(boolean exactCase){ |
| 23 | + super(exactCase); |
| 24 | + this.relocationParsing = false; |
| 25 | + } |
| 26 | + } |
| 27 | +} |
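Both variants are exercised by this commit: plain TokenizerOptions at the article-indexing call sites, and NoRelocation where queries or isolated snippets are analyzed (ReusableLanguageAnalyzer, Aggregate). A minimal sketch mirroring those call sites:

```java
String text = "== Heading ==\nSome [[wiki]] text.<ref>a citation</ref>";
// article indexing: template/reference relocation stays enabled
WikiTokenizer forArticles = new WikiTokenizer(text, IndexId.get("enwiki"),
        new TokenizerOptions(false));
// query analysis: relocation off, since section gaps make no sense for a query string
WikiTokenizer forQueries = new WikiTokenizer(text, IndexId.get("enwiki"),
        new TokenizerOptions.NoRelocation(false));
```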
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/Aggregate.java |
— | — | @@ -20,7 +20,8 @@ |
21 | 21 | |
22 | 22 | /** Construct from arbitrary text that will be tokenized */ |
23 | 23 | public Aggregate(String text, float boost, IndexId iid, boolean exactCase, HashSet<String> stopWords){ |
24 | | - tokens = new FastWikiTokenizerEngine(text,iid,exactCase).parse(); |
| 24 | + TokenizerOptions options = new TokenizerOptions.NoRelocation(exactCase); |
| 25 | + tokens = new FastWikiTokenizerEngine(text,iid,options).parse(); |
25 | 26 | this.boost = boost; |
26 | 27 | noStopWordsLength = 0; |
27 | 28 | for(Token t : tokens){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/AggregateAnalyzer.java |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | if(item >= items.size()) |
26 | 26 | return null; |
27 | 27 | Aggregate ag = items.get(item); |
28 | | - if(token >= ag.length()){ |
| 28 | + if(token >= ag.length() || token >= TOKEN_GAP-1){ |
29 | 29 | gap = true; |
30 | 30 | do{ |
31 | 31 | // find next nonempty item |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiTokenizer.java |
— | — | @@ -38,8 +38,8 @@ |
39 | 39 | * @param str |
40 | 40 | */ |
41 | 41 | |
42 | | - public WikiTokenizer(String str, IndexId iid, boolean exactCase){ |
43 | | - parser = new FastWikiTokenizerEngine(str,iid,exactCase); |
| 42 | + public WikiTokenizer(String str, IndexId iid, TokenizerOptions options){ |
| 43 | + parser = new FastWikiTokenizerEngine(str,iid,options); |
44 | 44 | this.input = null; |
45 | 45 | } |
46 | 46 | |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -56,6 +56,8 @@ |
57 | 57 | private int templateLevel = 0; // level of nestedness of templates |
58 | 58 | private int gap = 1; |
59 | 59 | private ArrayList<Token> nonContentTokens; // tokens from the beginning of the article that are not content, but templates, images, etc.. |
| 60 | + private ArrayList<Token> references; // stuff between <ref></ref> tags should go to the end |
| 61 | + private boolean inRef = false, firstRef = false; // if we are within a ref tag |
60 | 62 | |
61 | 63 | private int prefixLen = 0; |
62 | 64 | private final char[] prefixBuf = new char[MAX_WORD_LEN]; |
— | — | @@ -77,6 +79,8 @@ |
78 | 80 | public static int BULLETIN_GAP = 10; |
79 | 81 | /** Gap between sentences */ |
80 | 82 | public static int SENTENCE_GAP = 2; |
| 83 | + /** Gap between references */ |
| 84 | + public static int REFERENCE_GAP = 20; |
81 | 85 | |
82 | 86 | /** language code */ |
83 | 87 | private String language; |
— | — | @@ -87,9 +91,8 @@ |
88 | 92 | private static Hashtable<String,HashSet<String>> categoryLocalized = new Hashtable<String,HashSet<String>>(); |
89 | 93 | private static HashSet<String> interwiki; |
90 | 94 | |
91 | | - /** if true, words won't be lowercased */ |
92 | | - private boolean exactCase = false; |
93 | | - private UnicodeDecomposer decomposer; |
| 95 | + private UnicodeDecomposer decomposer; |
| 96 | + private TokenizerOptions options; |
94 | 97 | |
95 | 98 | enum ParserState { WORD, LINK_BEGIN, LINK_WORDS, LINK_END, LINK_KEYWORD, |
96 | 99 | LINK_FETCH, IGNORE, EXTERNAL_URL, EXTERNAL_WORDS, |
— | — | @@ -108,6 +111,9 @@ |
109 | 112 | numberToken = false; |
110 | 113 | headingText = new ArrayList<String>(); |
111 | 114 | nonContentTokens = new ArrayList<Token>(); |
| 115 | + inRef = false; |
| 116 | + firstRef = false; |
| 117 | + references = new ArrayList<Token>(); |
112 | 118 | } |
113 | 119 | |
114 | 120 | /** Note: this will read only 1024 bytes of reader, it's |
— | — | @@ -127,12 +133,12 @@ |
128 | 134 | } |
129 | 135 | } |
130 | 136 | |
131 | | - public FastWikiTokenizerEngine(String text, IndexId iid, boolean exactCase){ |
| 137 | + public FastWikiTokenizerEngine(String text, IndexId iid, TokenizerOptions options){ |
132 | 138 | this.text = text.toCharArray(); |
133 | 139 | this.textString = text; |
134 | 140 | this.language = iid.getLangCode(); |
135 | 141 | this.iid = iid; |
136 | | - this.exactCase = exactCase; |
| 142 | + this.options = options; |
137 | 143 | textLength = text.length(); |
138 | 144 | init(); |
139 | 145 | } |
— | — | @@ -175,7 +181,7 @@ |
176 | 182 | boolean addDecomposed = false; |
177 | 183 | for(int i=0;i<length;i++){ |
178 | 184 | addToAlias = true; |
179 | | - if( ! exactCase ) |
| 185 | + if( ! options.exactCase ) |
180 | 186 | cl = Character.toLowerCase(buffer[i]); |
181 | 187 | else{ |
182 | 188 | cl = buffer[i]; |
— | — | @@ -303,7 +309,19 @@ |
304 | 310 | * @param t |
305 | 311 | */ |
306 | 312 | private final void addToTokens(Token t){ |
307 | | - if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){ |
| 313 | + if(!options.relocationParsing){ |
| 314 | + tokens.add(t); |
| 315 | + return; |
| 316 | + } |
| 317 | + // and now, relocation parsing: |
| 318 | + if(inRef){ |
| 319 | + if(firstRef){ // delimit whole references from each other |
| 320 | + firstRef = false; |
| 321 | + t.setPositionIncrement(REFERENCE_GAP); |
| 322 | + } |
| 323 | + references.add(t); |
| 324 | + return; |
| 325 | + } else if(templateLevel > 0 && keywordTokens < FIRST_SECTION_GAP){ |
308 | 326 | nonContentTokens.add(t); |
309 | 327 | return; |
310 | 328 | } else if(t.getPositionIncrement() == FIRST_SECTION_GAP){ |
— | — | @@ -478,11 +496,34 @@ |
479 | 497 | // check |
480 | 498 | if(start == end && start != 0 && start+end<endOfLine-cur && start>=2 && start<=4){ |
481 | 499 | headings++; |
482 | | - headingText.add(new String(text,cur+start,endOfLine-(cur+start+end))); |
| 500 | + headingText.add(deleteRefs(new String(text,cur+start,endOfLine-(cur+start+end)))); |
483 | 501 | } |
484 | 502 | } |
485 | 503 | } |
486 | 504 | |
| 505 | + /** Delete <ref></ref> text from a string */ |
| 506 | + protected String deleteRefs(String str){ |
| 507 | + int start; |
| 508 | + while((start = str.indexOf("<ref>")) != -1){ |
| 509 | + int end = str.indexOf("</ref>",start+1); |
| 510 | + if(end == -1) |
| 511 | + break; |
| 512 | + str = str.substring(0,start)+((end+6<str.length())? str.substring(end+6) : ""); |
| 513 | + } |
| 514 | + return str; |
| 515 | + } |
| 516 | + |
| 517 | + /** Check if, starting from the current position, the given string is matched (text is lowercased for the comparison) */ |
| 518 | + protected boolean matchesString(String target){ |
| 519 | + if(cur + target.length() > textLength) |
| 520 | + return false; |
| 521 | + for(lookup=cur,lc=0;lc<target.length();lookup++,lc++){ |
| 522 | + if(target.charAt(lc) != Character.toLowerCase(text[lookup])) |
| 523 | + return false; |
| 524 | + } |
| 525 | + return true; |
| 526 | + } |
| 527 | + |
487 | 528 | /** |
488 | 529 | * Parse Wiki text, and produce an arraylist of tokens. |
489 | 530 | * Also fills the lists categories and interwikis. |
— | — | @@ -517,14 +558,16 @@ |
518 | 559 | case '=': |
519 | 560 | addToken(); |
520 | 561 | checkHeadings(); |
521 | | - if(headings == 1) |
522 | | - gap = FIRST_SECTION_GAP; |
523 | | - else if(headings > 1) |
524 | | - gap = SECTION_GAP; |
| 562 | + if(options.relocationParsing){ |
| 563 | + if(headings == 1) |
| 564 | + gap = FIRST_SECTION_GAP; |
| 565 | + else if(headings > 1) |
| 566 | + gap = SECTION_GAP; |
| 567 | + } |
525 | 568 | continue; |
526 | 569 | case '\n': |
527 | 570 | addToken(); |
528 | | - if(cur + 1 < textLength){ |
| 571 | + if(options.relocationParsing && cur + 1 < textLength){ |
529 | 572 | switch(text[cur+1]){ |
530 | 573 | case '\n': gap = PARAGRAPH_GAP; break; |
531 | 574 | case '*': case ':': case '#': gap = BULLETIN_GAP; break; |
— | — | @@ -539,11 +582,19 @@ |
540 | 583 | case ':': |
541 | 584 | case ';': |
542 | 585 | addToken(); |
543 | | - if(gap == 1) |
| 586 | + if(options.relocationParsing && gap == 1) |
544 | 587 | gap = SENTENCE_GAP; |
545 | 588 | continue; |
546 | 589 | case '<': |
547 | 590 | addToken(); |
| 591 | + if(matchesString("<ref>")){ |
| 592 | + inRef = true; |
| 593 | + firstRef = true; |
| 594 | + } |
| 595 | + if(matchesString("</ref>")){ |
| 596 | + inRef = false; |
| 597 | + gap = 1; |
| 598 | + } |
548 | 599 | state = ParserState.IGNORE; |
549 | 600 | ignoreEnd = '>'; |
550 | 601 | continue; |
— | — | @@ -817,6 +868,12 @@ |
818 | 869 | } |
819 | 870 | nonContentTokens.clear(); |
820 | 871 | } |
| 872 | + // add references to end |
| 873 | + if(references.size() != 0){ |
| 874 | + for(Token tt : references){ |
| 875 | + tokens.add(tt); |
| 876 | + } |
| 877 | + } |
821 | 878 | return tokens; |
822 | 879 | } |
823 | 880 | |
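The intended effect of the new ref handling: tokens inside <ref>...</ref> are buffered in the references list and appended after the article body, with REFERENCE_GAP separating successive references, so phrase matches cannot cross a citation boundary and citations no longer pollute the opening section. A sketch of the expected ordering, derived from the code above:

```java
String text = "Paris is a city.<ref>Jones 2001</ref> It is in France.";
FastWikiTokenizerEngine parser = new FastWikiTokenizerEngine(
        text, IndexId.get("enwiki"), new TokenizerOptions(false));
// expected order: paris, is, a, city, it, is, in, france,
// then jones, 2001 after a REFERENCE_GAP position increment
for(Token t : parser.parse())
    System.out.println(t.termText() + " +" + t.getPositionIncrement());
```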
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java |
— | — | @@ -42,6 +42,7 @@ |
43 | 43 | import org.wikimedia.lsearch.search.NamespaceFilter; |
44 | 44 | import org.wikimedia.lsearch.search.RankField; |
45 | 45 | import org.wikimedia.lsearch.search.RankValue; |
| 46 | +import org.wikimedia.lsearch.search.Wildcards; |
46 | 47 | import org.wikimedia.lsearch.search.RankField.RankFieldSource; |
47 | 48 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
48 | 49 | |
— | — | @@ -79,7 +80,7 @@ |
80 | 81 | private float defaultBoost = 1; |
81 | 82 | private float defaultAliasBoost = ALIAS_BOOST; |
82 | 83 | protected enum TokenType {WORD, FIELD, AND, OR, EOF }; |
83 | | - |
| 84 | + |
84 | 85 | private TokenStream tokenStream; |
85 | 86 | private ArrayList<Token> tokens; // tokens from analysis |
86 | 87 | |
— | — | @@ -140,6 +141,7 @@ |
141 | 142 | protected FieldBuilder.BuilderSet builder; |
142 | 143 | protected FieldNameFactory fields; |
143 | 144 | protected HashSet<String> stopWords; |
| 145 | + protected Wildcards wildcards = null; |
144 | 146 | |
145 | 147 | /** default value for boolean queries */ |
146 | 148 | public BooleanClause.Occur boolDefault = BooleanClause.Occur.MUST; |
— | — | @@ -620,7 +622,7 @@ |
621 | 623 | continue; |
622 | 624 | |
623 | 625 | // terms, fields |
624 | | - if(Character.isLetterOrDigit(c) || c == '['){ |
| 626 | + if(Character.isLetterOrDigit(c) || c == '[' || c=='*' || c=='?'){ |
625 | 627 | // check for generic namespace prefixes, e.g. [0,1]: |
626 | 628 | if(c == '['){ |
627 | 629 | if(fetchGenericPrefix()) |
— | — | @@ -780,6 +782,33 @@ |
781 | 783 | return query; |
782 | 784 | } |
783 | 785 | |
| 786 | + /** Return true if the buffer holds a valid wildcard pattern */ |
| 787 | + private boolean bufferIsWildCard(){ |
| 788 | + if(length < 1) |
| 789 | + return false; |
| 790 | + boolean wild = false; |
| 791 | + int index = -1; |
| 792 | + for(int i=0;i<length;i++){ |
| 793 | + if(buffer[i] == '*' || buffer[i] == '?'){ |
| 794 | + wild = true; |
| 795 | + index = i; |
| 796 | + break; |
| 797 | + } |
| 798 | + } |
| 799 | + // check if it's a valid wildcard |
| 800 | + if(wild){ |
| 801 | + if((buffer[0] == '*' || buffer[0] == '?') && (buffer[length-1]=='*' || buffer[length-1]=='?')) |
| 802 | + return false; // don't support patterns like *a* |
| 803 | + if(index == length-1 && buffer[index]=='?') |
| 804 | + return false; // probably just an ordinary question mark |
| 805 | + for(int i=0;i<length;i++){ |
| 806 | + if(Character.isLetterOrDigit(buffer[i])) |
| 807 | + return true; // +card :P |
| 808 | + } |
| 809 | + } |
| 810 | + return false; |
| 811 | + } |
| 812 | + |
784 | 813 | /** |
785 | 814 | * Constructs either a termquery or a boolean query depending on |
786 | 815 | * analysis of the fetched token. A single "word" might be analyzed |
— | — | @@ -798,11 +827,16 @@ |
799 | 828 | |
800 | 829 | // check for wildcard seaches, they are also not analyzed/stemmed, only for titles |
801 | 830 | // wildcard signs are allowed only at the end of the word, minimum one letter word |
802 | | - if(length>1 && Character.isLetter(buffer[0]) && buffer[length-1]=='*' && |
803 | | - defaultField.equals(fields.title())){ |
804 | | - Query ret = new WildcardQuery(makeTerm()); |
805 | | - ret.setBoost(defaultBoost); |
806 | | - return ret; |
| 831 | + if(length>1 && wildcards != null && bufferIsWildCard()){ |
| 832 | + Term term = makeTerm(); |
| 833 | + Query ret = wildcards.makeQuery(term.text(),term.field()); |
| 834 | + if(ret != null){ |
| 835 | + ret.setBoost(defaultBoost); |
| 836 | + return ret; |
| 837 | + } else{ |
| 838 | + // something is wrong, try making normal query |
| 839 | + return new TermQuery(term); |
| 840 | + } |
807 | 841 | } |
808 | 842 | |
809 | 843 | if(toplevelOccur == BooleanClause.Occur.MUST_NOT) |
— | — | @@ -1432,15 +1466,31 @@ |
1433 | 1467 | /** Make the main phrase query, finds exact phrases, and sloppy phrases without stop words */ |
1434 | 1468 | public Query makeMainPhrase(ArrayList<String> words, String field, int slop, float boost, Query stemtitle, Query related, HashSet<String> preStopWords){ |
1435 | 1469 | RankValue val = new RankValue(); |
1436 | | - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related), |
1437 | | - new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords); |
| 1470 | + boolean allStopWords = true; |
1438 | 1471 | for(String w : words){ |
1439 | | - pq.add(new Term(field,w)); |
| 1472 | + if(!preStopWords.contains(w)){ |
| 1473 | + allStopWords = false; |
| 1474 | + break; |
| 1475 | + } |
1440 | 1476 | } |
1441 | | - pq.setSlop(slop); |
1442 | | - pq.setBoost(boost); |
1443 | | - return pq; |
1444 | | - |
| 1477 | + if(allStopWords){ |
| 1478 | + CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.ContentsExactOptions(val,stemtitle,related)); |
| 1479 | + for(String w : words){ |
| 1480 | + pq.add(new Term(field,w)); |
| 1481 | + } |
| 1482 | + pq.setSlop(slop); |
| 1483 | + pq.setBoost(boost); |
| 1484 | + return pq; |
| 1485 | + } else{ |
| 1486 | + CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.ContentsSloppyOptions(val,stemtitle,related), |
| 1487 | + new QueryOptions.ContentsExactOptions(val,stemtitle,related),preStopWords); |
| 1488 | + for(String w : words){ |
| 1489 | + pq.add(new Term(field,w)); |
| 1490 | + } |
| 1491 | + pq.setSlop(slop); |
| 1492 | + pq.setBoost(boost); |
| 1493 | + return pq; |
| 1494 | + } |
1445 | 1495 | } |
1446 | 1496 | |
1447 | 1497 | /** make single phrase for related field */ |
— | — | @@ -1616,14 +1666,31 @@ |
1617 | 1667 | /** Make the phrase that will match redirects, etc.. */ |
1618 | 1668 | public Query makeAlttitlePhrase(ArrayList<String> words, String field, int slop, float boost, HashSet<String> preStopWords){ |
1619 | 1669 | AggregatePhraseInfo ap = new AggregatePhraseInfo(); |
1620 | | - CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap), |
1621 | | - new QueryOptions.AlttitleExactOptions(ap),preStopWords); |
| 1670 | + boolean allStopWords = true; |
1622 | 1671 | for(String w : words){ |
1623 | | - pq.add(new Term(field,w)); |
| 1672 | + if(!preStopWords.contains(w)){ |
| 1673 | + allStopWords = false; |
| 1674 | + break; |
| 1675 | + } |
1624 | 1676 | } |
1625 | | - pq.setSlop(slop); |
1626 | | - pq.setBoost(boost); |
1627 | | - return pq; |
| 1677 | + if(allStopWords){ |
| 1678 | + CustomPhraseQuery pq = new CustomPhraseQuery(new QueryOptions.AlttitleExactOptions(ap)); |
| 1679 | + for(String w : words){ |
| 1680 | + pq.add(new Term(field,w)); |
| 1681 | + } |
| 1682 | + pq.setSlop(slop); |
| 1683 | + pq.setBoost(boost); |
| 1684 | + return pq; |
| 1685 | + } else{ |
| 1686 | + CombinedPhraseQuery pq = new CombinedPhraseQuery(new QueryOptions.AlttitleSloppyOptions(ap), |
| 1687 | + new QueryOptions.AlttitleExactOptions(ap),preStopWords); |
| 1688 | + for(String w : words){ |
| 1689 | + pq.add(new Term(field,w)); |
| 1690 | + } |
| 1691 | + pq.setSlop(slop); |
| 1692 | + pq.setBoost(boost); |
| 1693 | + return pq; |
| 1694 | + } |
1628 | 1695 | |
1629 | 1696 | } |
1630 | 1697 | |
— | — | @@ -1650,7 +1717,8 @@ |
1651 | 1718 | * @return |
1652 | 1719 | */ |
1653 | 1720 | @SuppressWarnings("unchecked") |
1654 | | - protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords){ |
| 1721 | + protected Query parseMultiPass(String queryText, NamespacePolicy policy, boolean makeRedirect, boolean makeKeywords, Wildcards wildcards){ |
| 1722 | + this.wildcards = wildcards; |
1655 | 1723 | queryText = quoteCJK(queryText); |
1656 | 1724 | if(policy != null) |
1657 | 1725 | this.namespacePolicy = policy; |
— | — | @@ -1669,6 +1737,9 @@ |
1670 | 1738 | BooleanQuery bq = new BooleanQuery(true); |
1671 | 1739 | bq.add(qc,BooleanClause.Occur.SHOULD); |
1672 | 1740 | bq.add(qt,BooleanClause.Occur.SHOULD); |
| 1741 | + |
| 1742 | + if(words.size() == 0) |
| 1743 | + return bq; |
1673 | 1744 | |
1674 | 1745 | HashSet<String> preStopWords = StopWords.getPredefinedSet(builder.getFilters().getIndexId()); |
1675 | 1746 | Query alttitleQuery = makeAlttitlePhrase(words,fields.alttitle(),10,1,preStopWords); |
— | — | @@ -1701,7 +1772,11 @@ |
1702 | 1773 | return coreQuery; |
1703 | 1774 | |
1704 | 1775 | } |
1705 | | - |
| 1776 | + |
| 1777 | + public Query parseWithWildcards(String queryText, NamespacePolicy policy, Wildcards wildcards){ |
| 1778 | + return parseMultiPass(queryText,policy,false,false,wildcards); |
| 1779 | + } |
| 1780 | + |
1706 | 1781 | /** |
1707 | 1782 | * Three parse pases: contents, title, redirect |
1708 | 1783 | * |
— | — | @@ -1711,7 +1786,7 @@ |
1712 | 1787 | * @throws ParseException |
1713 | 1788 | */ |
1714 | 1789 | public Query parseThreePass(String queryText, NamespacePolicy policy) throws ParseException{ |
1715 | | - return parseMultiPass(queryText,policy,true,false); |
| 1790 | + return parseMultiPass(queryText,policy,true,false,null); |
1716 | 1791 | } |
1717 | 1792 | |
1718 | 1793 | /** |
— | — | @@ -1723,11 +1798,11 @@ |
1724 | 1799 | */ |
1725 | 1800 | public Query parseFourPass(String queryText, NamespacePolicy policy, String dbname) throws ParseException{ |
1726 | 1801 | boolean makeKeywords = global.useKeywordScoring(dbname); |
1727 | | - return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1802 | + return parseMultiPass(queryText,policy,true,makeKeywords,null); |
1728 | 1803 | } |
1729 | 1804 | |
1730 | 1805 | public Query parseFourPass(String queryText, NamespacePolicy policy, boolean makeKeywords) throws ParseException{ |
1731 | | - return parseMultiPass(queryText,policy,true,makeKeywords); |
| 1806 | + return parseMultiPass(queryText,policy,true,makeKeywords,null); |
1732 | 1807 | } |
1733 | 1808 | |
1734 | 1809 | /** |
— | — | @@ -1740,7 +1815,7 @@ |
1741 | 1816 | * @throws ParseException |
1742 | 1817 | */ |
1743 | 1818 | public Query parseTwoPass(String queryText, NamespacePolicy policy) throws ParseException{ |
1744 | | - return parseMultiPass(queryText,policy,false,false); |
| 1819 | + return parseMultiPass(queryText,policy,false,false,null); |
1745 | 1820 | } |
1746 | 1821 | |
1747 | 1822 | public NamespacePolicy getNamespacePolicy() { |
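Taken together with the Wildcards plumbing, the parser changes mean a wildcard term is no longer handed to Lucene's WildcardQuery (and no longer restricted to the title field): it is expanded up front into a bounded disjunction of concrete terms, and only when bufferIsWildCard() accepts the pattern. A sketch of the call, with searcher and parser as in SearchEngine:

```java
Wildcards wildcards = new Wildcards(searcher.getAllHosts(), false);
Query q = parser.parseWithWildcards("wiki* guide",
        WikiQueryParser.NamespacePolicy.IGNORE, wildcards);
// accepted patterns: "wiki*", "w?ki*", "*pedia"
// rejected patterns: "*a*" (wildcard at both ends), "what?" (lone trailing '?')
```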