r32149 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r32148 | r32149 | r32150 >
Date:00:49, 19 March 2008
Author:rainman
Status:old
Tags:
Comment:
Query parser:
* wildcards/fuzzy can now be within phrases
Daemon:
* compatibility modes for previous versions (version param)
* warmup for new index types
* related search can now be distributed
Suggest:
* fixed split/joins to work with other namespaces
* phrase frequencies on other namespaces
Incremental updates:
* link information fetching delayed to index update
* page_id added into links so we can do deletions
* page_id keys for precursor indexes
* updates on various index types (untested)
Modified paths:
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/StopWords.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WordNet.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/SearchResults.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/StartupManager.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Fuzzy.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/HighlightTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java (modified) (history)
  • /branches/lucene-search-2.1/webinterface/lsweb.py (modified) (history)

Diff

Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalOptions.java
@@ -37,6 +37,8 @@
3838 protected float completeBoost = 1;
3939 /** use complete number of tokens (with completeBoost) only for scoring */
4040 protected boolean useCompleteOnly = false;
 41+ /** act exactly as a phrase query, without positional or similar optimizations */
 42+ protected boolean phraseQueryFallback = false;
4143
4244
4345 /** Options specific for phrases in contents */
@@ -148,6 +150,12 @@
149151 //wholeBoost = 8;
150152 }
151153 }
 154+ /** Fall back to phrase-query-type behaviour, no positional info */
 155+ public static class PhraseQueryFallback extends PositionalOptions {
 156+ public PhraseQueryFallback(){
 157+ phraseQueryFallback = true;
 158+ }
 159+ }
152160
153161 public abstract static class NamespaceBoost implements Serializable {
154162 public abstract float getBoost(int namespace);
@@ -162,6 +170,7 @@
163171 }
164172 }
165173 }
 174+
166175
167176
168177 @Override
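
The PhraseQueryFallback subclass above just flips the phraseQueryFallback flag, letting a caller opt out of positional scoring per query. A minimal usage sketch (the surrounding query-construction code is illustrative, not part of this revision):

    // Score this phrase like a plain Lucene PhraseQuery, skipping the
    // positional optimizations in PositionalScorer.
    PositionalOptions opts = new PositionalOptions.PhraseQueryFallback();
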
Index: branches/lucene-search-2.1/src/org/apache/lucene/search/PositionalScorer.java
@@ -195,6 +195,8 @@
196196 * @throws IOException
197197 */
198198 public float freqScore(int start, int distance) throws IOException{
 199+ if(options.phraseQueryFallback)
 200+ return getSimilarity().sloppyFreq(distance);
199201 //System.out.println("freqScore at start="+start+", dist="+distance);
200202 int offset = start + distance;
201203 float begin = 1;
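
With the fallback set, freqScore() defers to Lucene's Similarity.sloppyFreq() instead of its own positional logic. Assuming the stock DefaultSimilarity, that amounts to a harmonic decay in slop distance:

    // DefaultSimilarity.sloppyFreq(): an exact phrase (distance 0) scores 1.0,
    // looser matches decay as 1/(distance+1): distance 1 -> 0.5, distance 3 -> 0.25.
    float sloppyFreq(int distance) {
        return 1.0f / (distance + 1);
    }
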
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/Article.java
@@ -156,7 +156,7 @@
157157 *
158158 * @return Returns unique id.
159159 */
160 - public String getKey() {
 160+ public String getIndexKey() {
161161 return Long.toString(pageId);
162162 }
163163
@@ -270,9 +270,15 @@
271271
272272 public void setDate(Date date) {
273273 this.date = date;
 274+ }
 275+
 276+ public void setRedirectTo(String redirectTo) {
 277+ this.redirectTo = redirectTo;
274278 }
275279
276280
277281
278282
 283+
 284+
279285 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/beans/SearchResults.java
@@ -32,6 +32,8 @@
3333 protected boolean foundAllInTitle = false;
3434 /** threshold for filtering suggestions */
3535 protected int firstHitRank = 0;
 36+ /** Words found in titles */
 37+ protected HashSet<String> foundInTitles = new HashSet<String>();
3638
3739 public SearchResults(){
3840 success = false;
@@ -130,6 +132,12 @@
131133 public void addToFirstHitRank(int rank){
132134 firstHitRank += rank;
133135 }
 136+ public HashSet<String> getFoundInTitles() {
 137+ return foundInTitles;
 138+ }
 139+ public void setFoundInTitles(HashSet<String> foundInTitles) {
 140+ this.foundInTitles = foundInTitles;
 141+ }
134142
135143 @Override
136144 public String toString() {
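
The new foundInTitles set is threaded through the rest of this revision: highlighting fills it per result set, and SearchEngine forwards it to the suggester. A condensed sketch of that flow, pieced together from the SearchEngine and Highlight hunks below:

    // Highlighting merges title words into the search results ...
    res.getFoundInTitles().addAll(rs.foundInTitles);
    // ... and the suggester receives them packed in Suggest.ExtraInfo.
    Suggest.ExtraInfo info = new Suggest.ExtraInfo(res.getPhrases(),
        res.getFoundInContext(), res.getFoundInTitles(), res.getFirstHitRank());
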
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/prefix/PrefixIndexBuilder.java
@@ -52,11 +52,12 @@
5353 return new PrefixIndexBuilder(iid,Links.openStandalone(iid),null);
5454 }
5555 /** Builder for incremental updates to precursor index */
56 - static public PrefixIndexBuilder forPrecursorModification(IndexId iid, Links links) throws IOException{
 56+ static public PrefixIndexBuilder forPrecursorModification(IndexId iid) throws IOException{
 57+ iid = iid.getPrefix();
5758 IndexWriter writer = WikiIndexModifier.openForWrite(iid.getPrecursor().getIndexPath(),false,new PrefixAnalyzer());
5859 writer.setMergeFactor(20);
5960 writer.setMaxBufferedDocs(500);
60 - return new PrefixIndexBuilder(iid,links,writer);
 61+ return new PrefixIndexBuilder(iid,null,writer);
6162 }
6263
6364 private PrefixIndexBuilder(IndexId iid, Links links, IndexWriter writer) throws IOException {
@@ -177,15 +178,17 @@
178179 else return -1;
179180 }
180181 });
181 - HashSet<String> selectedRedirects = new HashSet<String>();
 182+ // hash set of selected articles and places they redirect to
 183+ HashSet<String> selectedWithRedirects = new HashSet<String>();
182184 ArrayList<String> selected = new ArrayList<String>();
183185 for(int i=0;i<perPrefix && i<sorted.size();i++){
184186 String key = sorted.get(i).getKey();
185187 String redirect = redirects.get(key);
186 - if(redirect == null || !selectedRedirects.contains(redirect)){
 188+ if((redirect == null || !selectedWithRedirects.contains(redirect))
 189+ && !selectedWithRedirects.contains(key)){
187190 selected.add(key);
188 - selectedRedirects.add(redirect);
189 - selectedRedirects.add(key);
 191+ selectedWithRedirects.add(key);
 192+ selectedWithRedirects.add(redirect);
190193 }
191194 }
192195 Document d = new Document();
@@ -213,7 +216,7 @@
214217 writer.optimize();
215218 writer.close();
216219
217 - IndexThread.makeIndexSnapshot(prefixIid,path);
 220+ IndexThread.makeIndexSnapshot(prefixIid,prefixIid.getImportPath());
218221 }
219222
220223 public static String strip(String s){
@@ -230,15 +233,23 @@
231234 private static double lengthCoeff(String key, String prefix) {
232235 return 1;
233236 }
234 - /** Modify a precursor index entry */
235 - protected void modifyPrecursor(String key) throws IOException{
236 - writer.deleteDocuments(new Term("key",key));
237 - addToPrecursor(key);
238 - }
 237+
 238+
239239 /** Add a new precursor index entry */
240240 protected void addToPrecursor(String key) throws IOException{
241241 int ref = links.getNumInLinks(key);
242242 String redirect = links.getRedirectTarget(key);
 243+ String pageid = links.getPageId(key);
 244+ addToPrecursor(key,ref,redirect,pageid);
 245+ }
 246+
 247+ /** Delete a precursor index entry */
 248+ public void deleteFromPrecursor(String pageId) throws IOException{
 249+ writer.deleteDocuments(new Term("pageid",pageId));
 250+ }
 251+
 252+ /** Add a new precursor index entry */
 253+ public void addToPrecursor(String key, int ref, String redirect, String pageId) throws IOException{
243254 String strippedKey = strip(key);
244255 String strippedTarget = redirect==null? null : strip(redirect);
245256 if(redirect == null);
@@ -248,6 +259,7 @@
249260 return; // ignore redirects like byzantine -> byzantine empire
250261 // add to index
251262 Document d = new Document();
 263+ d.add(new Field("pageid",pageId,Field.Store.NO,Field.Index.UN_TOKENIZED));
252264 d.add(new Field("key",key,Field.Store.YES,Field.Index.UN_TOKENIZED));
253265 ArrayList<Token> canonized = canonize(key,iid,filters);
254266 for(Token t : canonized){
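
With page_id now stored in the precursor index, an incremental update becomes a delete-then-add pair keyed by page_id. A minimal sketch of applying one page change (iid, pageId, title, inLinks and redirectTarget are illustrative placeholders):

    PrefixIndexBuilder builder = PrefixIndexBuilder.forPrecursorModification(iid);
    builder.deleteFromPrecursor(pageId);   // drop the stale entry by page_id
    builder.addToPrecursor(title, inLinks, redirectTarget, pageId); // re-add fresh data
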
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -62,7 +62,7 @@
6363 if(original != null)
6464 this.suffix = original.getTitlesSuffix();
6565 GlobalConfiguration global = GlobalConfiguration.getInstance();
66 - langCode = global.getLanguage(iid.getDBname());
 66+ langCode = iid.getLangCode();
6767 FieldBuilder.Case dCase = (global.exactCaseIndex(iid.getDBname()))? FieldBuilder.Case.EXACT_CASE : FieldBuilder.Case.IGNORE_CASE;
6868 builder = new FieldBuilder(iid,dCase);
6969 indexes = new HashMap<String,IndexWriter>();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/importer/Importer.java
@@ -143,7 +143,7 @@
144144 if(makeIndex){
145145 if(!useOldRelated){
146146 try {
147 - RelatedBuilder.rebuildFromLinksNew(iid);
 147+ RelatedBuilder.rebuildFromLinks(iid);
148148 } catch (IOException e) {
149149 log.fatal("Cannot make related mapping: "+e.getMessage());
150150 return;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/IndexId.java
@@ -731,6 +731,10 @@
732732 return GlobalConfiguration.getIndexId(dbname+".spell") != null;
733733 }
734734
 735+ public boolean hasPrefix(){
 736+ return GlobalConfiguration.getIndexId(dbname+".prefix") != null;
 737+ }
 738+
735739 /** Get the corresponding spell words iid */
736740 public IndexId getSpell() {
737741 return get(dbname+".spell");
@@ -787,7 +791,7 @@
788792 }
789793
790794 /** Get whether this index doesn't capitalize first letters of articles */
791 - public boolean getExactCase(){
 795+ public boolean isExactCase(){
792796 if(exactCase == null)
793797 exactCase = GlobalConfiguration.getInstance().exactCaseIndex(dbname);
794798 return exactCase;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -1158,10 +1158,6 @@
11591159 public boolean isMyHost(String host) {
11601160 return host.equalsIgnoreCase(hostAddr) || host.equalsIgnoreCase(hostName);
11611161 }
1162 -
1163 - public String getLanguage(IndexId iid){
1164 - return getLanguage(iid.getDBname());
1165 - }
11661162
11671163 /** Get language for a dbname */
11681164 public String getLanguage(String dbname) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/StartupManager.java
@@ -41,9 +41,9 @@
4242 // preload localizations
4343 HashSet<String> langCodes = new HashSet<String>();
4444 for(IndexId iid : global.getMyIndex())
45 - langCodes.add(global.getLanguage(iid.getDBname()));
 45+ langCodes.add(iid.getLangCode());
4646 for(IndexId iid : global.getMySearch())
47 - langCodes.add(global.getLanguage(iid.getDBname()));
 47+ langCodes.add(iid.getLangCode());
4848 Localization.readLocalizations(langCodes);
4949 Localization.loadInterwiki();
5050 // preload the unicode decomposer
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -250,7 +250,12 @@
251251 protected void updateCache(SearcherCache.SearcherPool pool, LocalIndex li){
252252 // do some typical queries to preload some lucene caches, pages into memory, etc..
253253 for(IndexSearcherMul is : pool.searchers){
254 - Warmup.warmupIndexSearcher(is,li.iid,true);
 254+ try{
 255+ Warmup.warmupIndexSearcher(is,li.iid,true);
 256+ } catch(IOException e){
 257+ e.printStackTrace();
 258+ log.warn("Error warming up "+li+" : "+e.getMessage());
 259+ }
255260 }
256261 // add to cache
257262 cache.invalidateLocalSearcher(li.iid,pool);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -3,9 +3,12 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.Collection;
 7+import java.util.HashSet;
78 import java.util.Hashtable;
89
910 import org.apache.log4j.Logger;
 11+import org.apache.lucene.analysis.SimpleAnalyzer;
 12+import org.apache.lucene.index.IndexReader;
1013 import org.apache.lucene.index.Term;
1114 import org.apache.lucene.search.Hits;
1215 import org.apache.lucene.search.Query;
@@ -13,6 +16,7 @@
1417 import org.wikimedia.lsearch.analyzers.Analyzers;
1518 import org.wikimedia.lsearch.analyzers.FieldBuilder;
1619 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
 20+import org.wikimedia.lsearch.analyzers.StopWords;
1721 import org.wikimedia.lsearch.analyzers.WikiQueryParser;
1822 import org.wikimedia.lsearch.benchmark.SampleTerms;
1923 import org.wikimedia.lsearch.benchmark.Terms;
@@ -20,6 +24,7 @@
2125 import org.wikimedia.lsearch.config.Configuration;
2226 import org.wikimedia.lsearch.config.GlobalConfiguration;
2327 import org.wikimedia.lsearch.config.IndexId;
 28+import org.wikimedia.lsearch.spell.Suggest;
2429
2530 /**
2631 * Methods to warm up index and preload caches.
@@ -33,41 +38,57 @@
3439 protected static Hashtable<String,Terms> langTerms = new Hashtable<String,Terms>();
3540
3641 /** Runs some typical queries on a local index searcher to preload caches, pages into memory, etc .. */
37 - public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay){
 42+ public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay) throws IOException {
3843 if(iid.isLinks() || iid.isPrecursor())
3944 return; // no warmup for these
4045 log.info("Warming up index "+iid+" ...");
4146 long start = System.currentTimeMillis();
 47+ IndexReader reader = is.getIndexReader();
4248
4349 if(global == null)
4450 global = GlobalConfiguration.getInstance();
4551
4652 Hashtable<String,String> warmup = global.getDBParams(iid.getDBname(),"warmup");
47 - if(iid.isSpell() || iid.isPrefix()); // no warmup for spell-chekers and prefixes (for now)
48 - else if(warmup == null){
49 - makeNamespaceFilters(is,iid);
50 - simpleWarmup(is,iid);
51 - log.info("Warmed up "+iid);
52 - }
53 - else{
54 - int count;
55 - try{
56 - count = Integer.parseInt(warmup.get("count"));
57 - } catch(Exception e){
58 - log.warn("Wrong parameters for warmup of database "+iid+" in global settings");
59 - simpleWarmup(is,iid);
60 - return;
 53+ int count = warmup!=null? Integer.parseInt(warmup.get("count")) : 0;
 54+ if(iid.isSpell() && count > 0){
 55+ Terms terms = getTermsForLang(iid.getLangCode());
 56+ Suggest sug = new Suggest(iid,is,false);
 57+ WikiQueryParser parser = new WikiQueryParser("contents",new SimpleAnalyzer(),new FieldBuilder(iid).getBuilder(),StopWords.getPredefinedSet(iid));
 58+ for(int i=0;i<count;i++){
 59+ String searchterm = terms.next();
 60+ sug.suggest(searchterm,parser.tokenizeBareText(searchterm),new Suggest.ExtraInfo(),new NamespaceFilter());
6161 }
62 - makeNamespaceFilters(is,iid);
63 - warmupSearchTerms(is,iid,count,useDelay);
64 - long delta = System.currentTimeMillis() - start;
65 - log.info("Warmed up "+iid+" in "+delta+" ms");
66 - }
 62+ } else if((iid.isPrefix() || iid.isHighlight() || iid.isRelated()) && count > 0 && !iid.isTitlesBySuffix()){
 63+ // NOTE: this might not warm up all caches, but should read stuff into memory buffers
 64+ for(int i=0;i<count;i++){
 65+ int docid = (int)(Math.random()*is.maxDoc());
 66+ reader.document(docid).get("key");
 67+ }
 68+ } else{
 69+ // normal indexes
 70+ if(count == 0){
 71+ makeNamespaceFilters(is,iid);
 72+ simpleWarmup(is,iid);
 73+ } else{
 74+ makeNamespaceFilters(is,iid);
 75+ warmupWithSearchTerms(is,iid,count,useDelay);
 76+ }
 77+ // wait for aggregate fields to be cached
 78+ while(AggregateMetaField.isBeingCached(reader)){
 79+ try {
 80+ Thread.sleep(100);
 81+ } catch (InterruptedException e) {
 82+ e.printStackTrace();
 83+ }
 84+ }
 85+ }
 86+ long delta = System.currentTimeMillis() - start;
 87+ log.info("Warmed up "+iid+" in "+delta+" ms");
6788 }
6889
6990 /** Warmup index using some number of simple searches */
70 - protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
71 - String lang = global.getLanguage(iid.getDBname());
 91+ protected static void warmupWithSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
 92+ String lang = iid.getLangCode();
7293 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
7394 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
7495 Terms terms = getTermsForLang(lang);
@@ -79,7 +100,7 @@
80101 for(int j =0; j<20 && j<hits.length(); j++)
81102 hits.doc(j); // retrieve some documents
82103 if(useDelay){
83 - if(i<1000)
 104+ if(i<1000)
84105 Thread.sleep(100);
85106 else
86107 Thread.sleep(50);
@@ -126,7 +147,6 @@
127148 /** Just run one complex query and rebuild the main namespace filter */
128149 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
129150 try{
130 - String lang = global.getLanguage(iid.getDBname());
131151 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
132152 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
133153 Query q = parser.parse("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Wildcards.java
@@ -73,6 +73,21 @@
7474 return makeQueryFromTerms(terms,field);
7575 }
7676
 77+ /** Make terms array for phrases */
 78+ public Term[] makeTerms(String wildcard, String field){
 79+ HashSet<String> terms = getCached(wildcard);
 80+ if(terms.size() == 0)
 81+ return null; // no match or error
 82+
 83+ trimTerms(terms);
 84+ Term[] ret = new Term[terms.size()];
 85+ int i = 0;
 86+ for(String w : terms)
 87+ ret[i++] = new Term(field,w);
 88+ return ret;
 89+
 90+ }
 91+
7792 protected HashSet<String> getCached(String wildcard){
7893 if(client == null)
7994 client = new RMIMessengerClient();
@@ -99,6 +114,16 @@
100115
101116 /** Construct DisjunctionMaxQuery from terms */
102117 protected Query makeQueryFromTerms(HashSet<String> terms, String field){
 118+ trimTerms(terms);
 119+
 120+ DisjunctionMaxQuery q = new DisjunctionMaxQuery(0);
 121+ for(String t : terms){
 122+ q.add(new TermQuery(new Term(field,t)));
 123+ }
 124+ return q;
 125+ }
 126+
 127+ private void trimTerms(HashSet<String> terms) {
103128 if(terms.size() > MAX_TERMS){
104129 HashSet<String> temp = new HashSet<String>();
105130 int count = 0;
@@ -110,13 +135,8 @@
111136 }
112137 terms = temp;
113138 }
114 - DisjunctionMaxQuery q = new DisjunctionMaxQuery(0);
115 - for(String t : terms){
116 - q.add(new TermQuery(new Term(field,t)));
117 - }
118 - return q;
119139 }
120 -
 140+
121141 public boolean hasWildcards(){
122142 return wildcardCache.size() > 0;
123143 }
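
makeTerms() is what lets wildcards appear inside phrases (per the commit comment): each wildcard position can be expanded to a term array and fed to a multi-position phrase query. A hedged sketch, assuming Lucene's MultiPhraseQuery is the consumer (the parser wiring is not shown in this hunk):

    Term[] expansions = wildcards.makeTerms("byz*", "contents");
    if (expansions != null) {
        MultiPhraseQuery phrase = new MultiPhraseQuery();
        phrase.add(expansions);                      // all matches for byz* at position 0
        phrase.add(new Term("contents", "empire"));  // literal token at position 1
    }
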
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Fuzzy.java
@@ -54,6 +54,14 @@
5555
5656 }
5757
 58+ public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, Term[] tt){
 59+ ArrayList<Float> boost = new ArrayList<Float>();
 60+ HashMap<String,Float> terms = getCached(word,nsf);
 61+ for(Term t : tt)
 62+ boost.add(terms.get(t.text()));
 63+ return boost;
 64+ }
 65+
5866 public ArrayList<Float> getBoosts(String word, NamespaceFilter nsf, ArrayList<String> words){
5967 ArrayList<Float> boost = new ArrayList<Float>();
6068 HashMap<String,Float> terms = getCached(word,nsf);
@@ -76,6 +84,20 @@
7785 // actually make query
7886 return makeQueryFromTerms(terms, field);
7987 }
 88+ /** Make a term array without boost */
 89+ public Term[] makeTerms(String word, String field, NamespaceFilter nsf){
 90+ if(client == null)
 91+ client = new RMIMessengerClient();
 92+ HashMap<String,Float> terms = getCached(word,nsf);
 93+ if(terms.size() == 0)
 94+ return null;
 95+
 96+ Term[] ret = new Term[terms.size()];
 97+ int i=0;
 98+ for(String w : terms.keySet())
 99+ ret[i++] = new Term(field,w);
 100+ return ret;
 101+ }
80102
81103 protected HashMap<String,Float> getCached(String word, NamespaceFilter nsf){
82104 String key = cacheKey(word,nsf);
@@ -99,7 +121,10 @@
100122 /** Calculate boost factor for suggest result - larger edit distance = smaller boost */
101123 protected float getBoost(SuggestResult r){
102124 int dist = r.getDist()+r.getDistMetaphone();
103 - return (float)(1.0/Math.pow(2,dist));
 125+ double d = r.getDist();
 126+ double l = r.getWord().length();
 127+ // 2^(-dist) * len_prop * 2^E(dist)
 128+ return (float)((1.0/Math.pow(2,dist))*((l-d)/l)*4);
104129 }
105130
106131 private Query makeQueryFromTerms(HashMap<String,Float> terms, String field) {
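
The reworked getBoost() now scales the old 2^(-dist) factor by the proportion of the word left intact and a constant 4 (the 2^E(dist) of the comment). A worked sketch of the arithmetic (parameter names are illustrative):

    // editDist=1, metaphoneDist=1 on an 8-letter word:
    // (1/2^2) * ((8-1)/8) * 4 = 0.875
    float boost(int editDist, int metaphoneDist, int wordLen) {
        int dist = editDist + metaphoneDist;
        double lenProp = (wordLen - editDist) / (double) wordLen;
        return (float)((1.0 / Math.pow(2, dist)) * lenProp * 4);
    }
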
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -41,10 +41,12 @@
4242 import org.wikimedia.lsearch.frontend.SearchServer;
4343 import org.wikimedia.lsearch.highlight.Highlight;
4444 import org.wikimedia.lsearch.highlight.HighlightResult;
 45+import org.wikimedia.lsearch.index.MessengerThread;
4546 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
4647 import org.wikimedia.lsearch.ranks.StringList;
4748 import org.wikimedia.lsearch.related.Related;
4849 import org.wikimedia.lsearch.related.RelatedTitle;
 50+import org.wikimedia.lsearch.spell.Suggest;
4951 import org.wikimedia.lsearch.spell.SuggestQuery;
5052 import org.wikimedia.lsearch.util.Localization;
5153
@@ -79,24 +81,28 @@
8082 }
8183
8284 /** Main search method, call this from the search frontend */
83 - public SearchResults search(IndexId iid, String what, String searchterm, HashMap query) {
 85+ public SearchResults search(IndexId iid, String what, String searchterm, HashMap query, double version) {
8486
8587 if (what.equals("search") || what.equals("explain")) {
8688 int offset = 0, limit = 100; boolean exactCase = false;
87 - int iwlimit = 10;
 89+ int iwlimit = 10; int iwoffset = 0;
8890 boolean searchOnly = false;
8991 if (query.containsKey("offset"))
9092 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
9193 if (query.containsKey("limit"))
9294 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
 95+ if (query.containsKey("iwoffset"))
 96+ iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0);
9397 if (query.containsKey("iwlimit"))
9498 iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES);
9599 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
96100 exactCase = true;
97101 if(query.containsKey("searchonly"))
98102 searchOnly = Boolean.parseBoolean((String)query.get("searchonly"));
 103+ if(version <= 2)
 104+ searchOnly = true;
99105 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
100 - SearchResults res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
 106+ SearchResults res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
101107 if(res!=null && res.isRetry()){
102108 int retries = 0;
103109 if(iid.isSplit() || iid.isNssplit()){
@@ -105,7 +111,7 @@
106112 retries = 1;
107113
108114 while(retries > 0 && res.isRetry()){
109 - res = search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
 115+ res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
110116 retries--;
111117 }
112118 if(res.isRetry())
@@ -114,24 +120,26 @@
115121 return res;
116122 } else if (what.equals("raw") || what.equals("rawexplain")) {
117123 int offset = 0, limit = 100; boolean exactCase = false;
118 - int iwlimit = 10;
 124+ int iwlimit = 10; int iwoffset = 0;
119125 if (query.containsKey("offset"))
120126 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
121127 if (query.containsKey("limit"))
122128 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
 129+ if (query.containsKey("iwoffset"))
 130+ iwoffset = Math.max(Integer.parseInt((String)query.get("iwoffset")), 0);
123131 if (query.containsKey("iwlimit"))
124132 iwlimit = Math.min(Integer.parseInt((String)query.get("iwlimit")), MAXLINES);
125133 if (query.containsKey("case") && global.exactCaseIndex(iid.getDBname()) && ((String)query.get("case")).equalsIgnoreCase("exact"))
126134 exactCase = true;
127135 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
128 - return search(iid, searchterm, offset, limit, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true);
 136+ return search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("rawexplain"), exactCase, true, true);
129137 } else if (what.equals("titlematch")) {
130138 // TODO: return searchTitles(searchterm);
131139 } else if (what.equals("prefix")){
132140 int limit = MAXPREFIX;
133141 if (query.containsKey("limit"))
134142 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXPREFIX);
135 - SearchResults res = prefixSearch(iid, searchterm, limit);
 143+ SearchResults res = searchPrefix(iid, searchterm, limit);
136144 if(query.containsKey("format")){
137145 String format = (String)query.get("format");
138146 if(format.equalsIgnoreCase("json"))
@@ -146,7 +154,7 @@
147155 offset = Math.max(Integer.parseInt((String)query.get("offset")), 0);
148156 if (query.containsKey("limit"))
149157 limit = Math.min(Integer.parseInt((String)query.get("limit")), MAXLINES);
150 - return relatedSearch(iid, searchterm, offset, limit);
 158+ return searchRelated(iid, searchterm, offset, limit);
151159 } else {
152160 SearchResults res = new SearchResults();
153161 res.setErrorMsg("Unrecognized search type. Try one of: " +
@@ -184,43 +192,47 @@
185193 return "";
186194 }
187195
188 - protected SearchResults relatedSearch(IndexId iid, String searchterm, int offset, int limit) {
 196+ protected SearchResults searchRelated(IndexId iid, String searchterm, int offset, int limit) {
 197+ RMIMessengerClient messenger = new RMIMessengerClient();
 198+ String host = cache.getRandomHost(iid.getRelated());
 199+ return messenger.searchRelated(host,iid.toString(),searchterm,offset,limit);
 200+
 201+ }
 202+
 203+ /** Search on a local related index (called via RMI) */
 204+ public SearchResults searchRelatedLocal(IndexId iid, String searchterm, int offset, int limit) throws IOException {
189205 readLocalization(iid);
190206 IndexId rel = iid.getRelated();
191207 SearcherCache cache = SearcherCache.getInstance();
192208 SearchResults res = new SearchResults();
193 - try {
194 - IndexSearcherMul searcher = cache.getLocalSearcher(rel);
195 - IndexReader reader = searcher.getIndexReader();
196 - String key = getKey(searchterm,iid);
197 - TermDocs td = reader.termDocs(new Term("key",key));
198 - if(td.next()){
199 - ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
200 - res.setNumHits(col.size());
201 - res.setSuccess(true);
202 - for(int i=offset;i<offset+limit && i<col.size();i++){
203 - RelatedTitle rt = col.get(i);
204 - Title t = rt.getRelated();
205 - ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
206 - res.addResult(rs);
207 - }
208 - // highlight stuff
209 - Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
210 - NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':')));
211 - FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder();
212 - HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
213 - WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords);
214 - Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true));
215 - highlight(iid,q,parser.getWordsClean(),searcher,res,parser.hasPhrases());
216 - } else{
217 - res.setSuccess(true);
218 - res.setNumHits(0);
 209+
 210+ IndexSearcherMul searcher = cache.getLocalSearcher(rel);
 211+ IndexReader reader = searcher.getIndexReader();
 212+ String key = getKey(searchterm,iid);
 213+ TermDocs td = reader.termDocs(new Term("key",key));
 214+ if(td.next()){
 215+ ArrayList<RelatedTitle> col = Related.convertToRelatedTitleList(new StringList(reader.document(td.doc()).get("related")).toCollection());
 216+ res.setNumHits(col.size());
 217+ res.setSuccess(true);
 218+ for(int i=offset;i<offset+limit && i<col.size();i++){
 219+ RelatedTitle rt = col.get(i);
 220+ Title t = rt.getRelated();
 221+ ResultSet rs = new ResultSet(rt.getScore(),t.getNamespaceAsString(),t.getTitle());
 222+ res.addResult(rs);
219223 }
220 - } catch (IOException e) {
221 - e.printStackTrace();
222 - log.error("I/O error in relatedSearch on "+rel+" : "+e.getMessage());
223 - res.setErrorMsg("I/O Error processing index for "+rel);
 224+ // highlight stuff
 225+ Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
 226+ NamespaceFilter nsDefault = new NamespaceFilter(key.substring(0,key.indexOf(':')));
 227+ FieldBuilder.BuilderSet bs = new FieldBuilder(iid).getBuilder();
 228+ HashSet<String> stopWords = StopWords.getPredefinedSet(iid);
 229+ WikiQueryParser parser = new WikiQueryParser(bs.getFields().contents(),nsDefault,analyzer,bs,NamespacePolicy.IGNORE,stopWords);
 230+ Query q = parser.parse(key.substring(key.indexOf(':')+1),new WikiQueryParser.ParsingOptions(true));
 231+ highlight(iid,q,parser.getWordsClean(),searcher,res,true,true);
 232+ } else{
 233+ res.setSuccess(true);
 234+ res.setNumHits(0);
224235 }
 236+
225237 return res;
226238 }
227239
@@ -236,7 +248,7 @@
237249 }
238250 }
239251
240 - protected SearchResults prefixSearch(IndexId iid, String searchterm, int limit) {
 252+ protected SearchResults searchPrefix(IndexId iid, String searchterm, int limit) {
241253 readLocalization(iid);
242254 IndexId pre = iid.getPrefix();
243255 SearcherCache cache = SearcherCache.getInstance();
@@ -313,7 +325,7 @@
314326 // search
315327 SearchResults res = makeTitlesSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
316328 // highlight
317 - highlightTitles(iid,q,words,searcher,res,sortByPhrases);
 329+ highlightTitles(iid,q,words,searcher,res,sortByPhrases,false);
318330 return res;
319331 } catch (IOException e) {
320332 e.printStackTrace();
@@ -362,7 +374,8 @@
363375 * Search on iid, with query searchterm. View results from offset to offset+limit, using
364376 * the default namespaces filter
365377 */
366 - public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwlimit, NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){
 378+ public SearchResults search(IndexId iid, String searchterm, int offset, int limit, int iwoffset, int iwlimit,
 379+ NamespaceFilter nsDefault, boolean explain, boolean exactCase, boolean raw, boolean searchOnly){
367380 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid,exactCase);
368381 if(nsDefault == null || nsDefault.cardinality() == 0)
369382 nsDefault = new NamespaceFilter("0"); // default to main namespace
@@ -429,8 +442,8 @@
430443 HighlightPack pack = messenger.searchPart(piid,searchterm,q,nsfw,offset,limit,explain,host);
431444 res = pack.res;
432445 if(!searchOnly){
433 - highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases());
434 - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain);
 446+ highlight(iid,q,parser.getWordsClean(),pack.terms,pack.dfs,pack.maxDoc,res,exactCase,null,parser.hasPhrases(),false);
 447+ fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain);
435448 suggest(iid,searchterm,parser,res,offset,nsfw);
436449 }
437450 return res;
@@ -459,8 +472,8 @@
460473 hits = searcher.search(q,nsfw,offset+limit);
461474 res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
462475 if(!searchOnly){
463 - highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases());
464 - fetchTitles(res,searchterm,nsfw,iid,parser,offset,0,iwlimit,explain);
 476+ highlight(iid,q,parser.getWordsClean(),searcher,parser.getHighlightTerms(),res,exactCase,parser.hasPhrases(),false);
 477+ fetchTitles(res,searchterm,nsfw,iid,parser,offset,iwoffset,iwlimit,explain);
465478 suggest(iid,searchterm,parser,res,offset,nsfw);
466479 }
467480 return res;
@@ -506,7 +519,8 @@
507520 RMIMessengerClient messenger = new RMIMessengerClient();
508521 // find host
509522 String host = cache.getRandomHost(iid.getSpell());
510 - SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,res.getPhrases(),res.getFoundInContext(),res.getFirstHitRank(),nsfw.getFilter());
 523+ Suggest.ExtraInfo info = new Suggest.ExtraInfo(res.getPhrases(),res.getFoundInContext(),res.getFoundInTitles(),res.getFirstHitRank());
 524+ SuggestQuery sq = messenger.suggest(host,iid.toString(),searchterm,tokens,info,nsfw.getFilter());
511525 res.setSuggest(sq);
512526 }
513527 }
@@ -595,7 +609,7 @@
596610
597611 TopDocs hits = searcher.search(q,wrap,iwoffset+iwlimit);
598612 SearchResults r = makeTitlesSearchResults(searcher,hits,iwoffset,iwlimit,main,searchterm,q,searchStart,explain);
599 - highlightTitles(main,q,words,searcher,r,parser.hasWildcards());
 613+ highlightTitles(main,q,words,searcher,r,parser.hasWildcards(),false);
600614
601615 if(r.isSuccess()){
602616 res.setTitles(r.getResults());
@@ -697,38 +711,38 @@
698712 }
699713
700714 /** Highlight search results, and set the property in ResultSet */
701 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases) throws IOException{
 715+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, Term[] terms, SearchResults res, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
702716 int[] df = searcher.docFreqs(terms);
703717 int maxDoc = searcher.maxDoc();
704 - highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases);
 718+ highlight(iid,q,words,terms,df,maxDoc,res,exactCase,null,sortByPhrases,alwaysIncludeFirst);
705719 }
706720
707721 /** Highlight search results, and set the property in ResultSet */
708 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 722+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
709723 Term[] terms = getTerms(q,"contents");
710724 int[] df = searcher.docFreqs(terms);
711725 int maxDoc = searcher.maxDoc();
712 - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases);
 726+ highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst);
713727 }
714728
715729 /** Highlight search results from titles index */
716 - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 730+ protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, IndexSearcherMul searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
717731 Term[] terms = getTerms(q,"alttitle");
718732 int[] df = searcher.docFreqs(terms);
719733 int maxDoc = searcher.maxDoc();
720 - highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases);
 734+ highlight(iid,q,words,terms,df,maxDoc,res,false,searcher.getIndexReader(),sortByPhrases,alwaysIncludeFirst);
721735 }
722736
723737 /** Highlight search results from titles index using a wikisearcher */
724 - protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases) throws IOException{
 738+ protected void highlightTitles(IndexId iid, Query q, ArrayList<String> words, WikiSearcher searcher, SearchResults res, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
725739 Term[] terms = getTerms(q,"alttitle");
726740 int[] df = searcher.docFreqs(terms);
727741 int maxDoc = searcher.maxDoc();
728 - highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases);
 742+ highlight(iid,q,words,terms,df,maxDoc,res,false,null,sortByPhrases,alwaysIncludeFirst);
729743 }
730744
731745 /** Highlight article (don't call directly, use one of the interfaces above instead) */
732 - protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{
 746+ protected void highlight(IndexId iid, Query q, ArrayList<String> words, Term[] terms, int[] df, int maxDoc, SearchResults res, boolean exactCase, IndexReader reader, boolean sortByPhrases, boolean alwaysIncludeFirst) throws IOException{
733747 // iid -> array of keys
734748 HashMap<IndexId,ArrayList<String>> map = new HashMap<IndexId,ArrayList<String>>();
735749 iid = iid.getHighlight();
@@ -755,17 +769,18 @@
756770 Highlight.ResultSet rs = null;
757771 if(reader != null){
758772 // we got a local reader, use it
759 - rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases);
 773+ rs = Highlight.highlight(e.getValue(),hiid,terms,df,maxDoc,words,stopWords,exactCase,reader,sortByPhrases,alwaysIncludeFirst);
760774 } else{
761775 // remote call
762776 String host = cache.getRandomHost(hiid);
763 - rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases);
 777+ rs = messenger.highlight(host,e.getValue(),hiid.toString(),terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst);
764778 }
765779 results.putAll(rs.highlighted);
766780 res.getPhrases().addAll(rs.phrases);
767781 res.getFoundInContext().addAll(rs.foundInContext);
768782 if(rs.foundAllInTitle && words.size()>1)
769 - res.setFoundAllInTitle(true);
 783+ res.setFoundAllInTitle(true);
 784+ res.getFoundInTitles().addAll(rs.foundInTitles);
770785 }
771786 }
772787 res.addToFirstHitRank(res.getNumHits());
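
The new version parameter is the daemon's compatibility switch from the commit comment: clients reporting version <= 2 get searchOnly results, with highlighting, interwiki titles and suggestions skipped. A hedged call sketch (engine and iid are assumed to exist; the raw HashMap usage mirrors the code above, values are illustrative):

    HashMap query = new HashMap();
    query.put("offset", "0");
    query.put("limit", "20");
    // version 2.0 forces searchOnly=true for pre-2.1 clients
    SearchResults res = engine.search(iid, "search", "byzantine empire", query, 2.0);
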
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/AggregateMetaField.java
@@ -3,6 +3,9 @@
44 import java.io.IOException;
55 import java.util.Collection;
66 import java.util.HashMap;
 7+import java.util.HashSet;
 8+import java.util.Hashtable;
 9+import java.util.Set;
710 import java.util.StringTokenizer;
811 import java.util.WeakHashMap;
912
@@ -21,10 +24,16 @@
2225 *
2326 */
2427 public class AggregateMetaField {
25 - static Logger log = Logger.getLogger(RankField.class);
 28+ static Logger log = Logger.getLogger(AggregateMetaField.class);
2629 protected static WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>> cache = new WeakHashMap<IndexReader,HashMap<String,AggregateMetaFieldSource>>();
2730 protected static Object lock = new Object();
 31+ protected static Hashtable<IndexReader,AggregateMetaFieldSource> cachingInProgress = new Hashtable<IndexReader,AggregateMetaFieldSource>();
2832
 33+ /** Check if background caching is currently in progress on a reader */
 34+ public static boolean isBeingCached(IndexReader reader){
 35+ return cachingInProgress.containsKey(reader);
 36+ }
 37+
2938 /** Get a cached field source
3039 * @throws IOException */
3140 public static AggregateMetaFieldSource getCachedSource(IndexReader reader, String field) throws IOException{
@@ -64,56 +73,63 @@
6574
6675 protected class CachingThread extends Thread {
6776 public void run(){
68 - log.info("Caching aggregate field "+field+" for "+reader.directory());
69 - int maxdoc = reader.maxDoc();
70 - index = new int[maxdoc];
71 - int count = 0;
72 - length = new byte[maxdoc]; // estimate maxdoc values
73 - lengthNoStopWords = new byte[maxdoc];
74 - lengthComplete = new byte[maxdoc];
75 - boost = new float[maxdoc];
76 - namespaces = new byte[maxdoc];
77 - for(int i=0;i<maxdoc;i++){
78 - byte[] stored = null;
79 - try{
80 - Document doc = reader.document(i);
81 - stored = doc.getBinaryValue(field);
82 - namespaces[i] = (byte)Integer.parseInt(doc.get("namespace"));
83 - index[i] = count;
84 - if(stored == null)
85 - continue;
86 - for(int j=0;j<stored.length/7;j++){
87 - if(count >= length.length){
88 - length = extendBytes(length);
89 - lengthNoStopWords = extendBytes(lengthNoStopWords);
90 - lengthComplete = extendBytes(lengthComplete);
91 - boost = extendFloats(boost);
92 - }
93 - length[count] = stored[j*7];
94 - if(length[count] == 0){
95 - log.debug("Broken length=0 for docid="+i+", at position "+j);
96 - }
97 - lengthNoStopWords[count] = stored[j*7+1];
98 - int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0));
99 - boost[count] = Float.intBitsToFloat(boostInt);
100 - lengthComplete[count] = stored[j*7+6];
101 -
102 - count++;
103 - }
104 - } catch(Exception e){
105 - log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage());
106 - e.printStackTrace();
 77+ cachingInProgress.put(reader,AggregateMetaFieldSource.this);
 78+ try{
 79+ log.info("Caching aggregate field "+field+" for "+reader.directory());
 80+ int maxdoc = reader.maxDoc();
 81+ index = new int[maxdoc];
 82+ int count = 0;
 83+ length = new byte[maxdoc]; // estimate maxdoc values
 84+ lengthNoStopWords = new byte[maxdoc];
 85+ lengthComplete = new byte[maxdoc];
 86+ boost = new float[maxdoc];
 87+ namespaces = new byte[maxdoc];
 88+ for(int i=0;i<maxdoc;i++){
 89+ byte[] stored = null;
 90+ try{
 91+ Document doc = reader.document(i);
 92+ stored = doc.getBinaryValue(field);
 93+ namespaces[i] = (byte)Integer.parseInt(doc.get("namespace"));
 94+ index[i] = count;
 95+ if(stored == null)
 96+ continue;
 97+ for(int j=0;j<stored.length/7;j++){
 98+ if(count >= length.length){
 99+ length = extendBytes(length);
 100+ lengthNoStopWords = extendBytes(lengthNoStopWords);
 101+ lengthComplete = extendBytes(lengthComplete);
 102+ boost = extendFloats(boost);
 103+ }
 104+ length[count] = stored[j*7];
 105+ if(length[count] == 0){
 106+ log.debug("Broken length=0 for docid="+i+", at position "+j);
 107+ }
 108+ lengthNoStopWords[count] = stored[j*7+1];
 109+ int boostInt = (((stored[j*7+2]&0xff) << 24) + ((stored[j*7+3]&0xff) << 16) + ((stored[j*7+4]&0xff) << 8) + ((stored[j*7+5]&0xff) << 0));
 110+ boost[count] = Float.intBitsToFloat(boostInt);
 111+ lengthComplete[count] = stored[j*7+6];
 112+
 113+ count++;
 114+ }
 115+ } catch(Exception e){
 116+ log.error("Exception during processing stored_field="+field+" on docid="+i+", with stored="+stored+" : "+e.getMessage());
 117+ e.printStackTrace();
 118+ }
107119 }
 120+ // compact arrays
 121+ if(count < length.length - 1){
 122+ length = resizeBytes(length,count);
 123+ lengthNoStopWords = resizeBytes(lengthNoStopWords,count);
 124+ boost = resizeFloats(boost,count);
 125+ lengthComplete = resizeBytes(lengthComplete,count);
 126+ }
 127+ log.info("Finished caching aggregate "+field+" for "+reader.directory());
 128+ cachingFinished = true;
 129+ } catch(Exception e){
 130+ e.printStackTrace();
 131+ log.error("Whole caching failed on field="+field+", reader="+reader);
108132 }
109 - // compact arrays
110 - if(count < length.length - 1){
111 - length = resizeBytes(length,count);
112 - lengthNoStopWords = resizeBytes(lengthNoStopWords,count);
113 - boost = resizeFloats(boost,count);
114 - lengthComplete = resizeBytes(lengthComplete,count);
115 - }
116 - log.info("Finished caching aggregate "+field+" for "+reader.directory());
117 - cachingFinished = true;
 133+ cachingInProgress.remove(reader);
118134 }
119135 protected byte[] extendBytes(byte[] array){
120136 return resizeBytes(array,array.length*2);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceFilter.java
@@ -39,22 +39,24 @@
4040 included = new BitSet(64);
4141 }
4242
 43+ /** "all" filter */
4344 public NamespaceFilter() {
4445 init();
4546 }
4647
 48+ /** filter namespaces */
4749 public NamespaceFilter(Collection<Integer> namespaces){
4850 init();
4951 for(Integer namespace : namespaces){
5052 included.set(namespace.intValue());
5153 }
5254 }
53 -
 55+ /** filter on one namespace */
5456 public NamespaceFilter(int namespace){
5557 init();
5658 included.set(namespace);
5759 }
58 -
 60+ /** filter a set of namespaces separated by commas, e.g. 0,2,10 */
5961 public NamespaceFilter(String namespaces) {
6062 init();
6163 if (namespaces != null && !namespaces.equals("")) {
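
The newly documented constructors in use (values illustrative):

    NamespaceFilter all   = new NamespaceFilter();          // match every namespace
    NamespaceFilter main  = new NamespaceFilter(0);         // single namespace
    NamespaceFilter multi = new NamespaceFilter("0,2,10");  // comma-separated list
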
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/highlight/Highlight.java
@@ -70,12 +70,15 @@
7171 public boolean foundAllInTitle;
7272 /** Rank of the first hit, used as title-suggestion threshold */
7373 public int firstHitRank = 0;
74 - public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank) {
 74+ /** Words found in titles */
 75+ public HashSet<String> foundInTitles;
 76+ public ResultSet(HashMap<String, HighlightResult> highlighted, HashSet<String> phrases, HashSet<String> foundInContext, boolean foundAllInTitle, int firstHitRank, HashSet<String> foundInTitles) {
7577 this.highlighted = highlighted;
7678 this.phrases = phrases;
7779 this.foundInContext = foundInContext;
7880 this.foundAllInTitle = foundAllInTitle;
7981 this.firstHitRank = firstHitRank;
 82+ this.foundInTitles = foundInTitles;
8083 }
8184 }
8285 /**
@@ -87,10 +90,12 @@
8891 * @param words - in order words (from main phrase)
8992 * @param exactCase - if these are results from exactCase search
9093 * @throws IOException
91 - * @returns map: key -> what to highlight
 94+ * @returns resultset
9295 */
9396 @SuppressWarnings("unchecked")
94 - public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc, ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader, boolean sortByPhrases) throws IOException{
 97+ public static ResultSet highlight(ArrayList<String> hits, IndexId iid, Term[] terms, int df[], int maxDoc,
 98+ ArrayList<String> words, HashSet<String> stopWords, boolean exactCase, IndexReader reader,
 99+ boolean sortByPhrases, boolean alwaysIncludeFirstLine) throws IOException{
95100 if(cache == null)
96101 cache = SearcherCache.getInstance();
97102
@@ -101,6 +106,7 @@
102107 HashSet<String> inContext = new HashSet<String>();
103108 boolean foundAllInTitle = false;
104109 int firstHitRank = 0;
 110+ HashSet<String> inTitle = new HashSet<String>();
105111
106112 // terms weighted with idf
107113 HashMap<String,Double> weightTerm = new HashMap<String,Double>();
@@ -140,8 +146,8 @@
141147 firstHitRank = alttitles.getTitle().getRank();
142148
143149 HashMap<String,Double> notInTitle = getTermsNotInTitle(weightTerm,alttitles,wordIndex);
144 - ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases );
145 - ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false);
 150+ ArrayList<RawSnippet> textSnippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 2, false, stopWords, true, phrases, inContext, sortByPhrases, alwaysIncludeFirstLine );
 151+ ArrayList<RawSnippet> titleSnippets = getBestTextSnippets(alttitles.getTitle().getTokens(),weightTerm,wordIndex,1,true,stopWords,false,phrases,inContext,false,false);
146152 int redirectAdditional = 0;
147153 if(titleSnippets.size()>0 &&
148154 ((titleSnippets.get(0).found.containsAll(words) && textTokenLength(titleSnippets.get(0).tokens) == words.size())
@@ -219,12 +225,14 @@
220226 hr.setTitle(titleSnippets.get(0).makeSnippet(256,true));
221227 if(titleSnippets.get(0).found.containsAll(words))
222228 foundAllInTitle = true;
 229+ inTitle.addAll(titleSnippets.get(0).found);
223230 }
224231
225232 if(redirectSnippets != null){
226233 hr.setRedirect(redirectSnippets.makeSnippet(MAX_CONTEXT,true));
227234 if(!foundAllInTitle && redirectSnippets.found.containsAll(words))
228235 foundAllInTitle = true;
 236+ inTitle.addAll(redirectSnippets.found);
229237 }
230238
231239 if(sectionSnippets != null){
@@ -240,7 +248,7 @@
241249 res.put(key,hr);
242250
243251 }
244 - return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank);
 252+ return new ResultSet(res,phrases,inContext,foundAllInTitle,firstHitRank,inTitle);
245253 }
246254
247255 /** Number of tokens excluding aliases and glue stuff */
@@ -423,7 +431,7 @@
424432 }
425433 }
426434 if((completeMatch && additional >= minAdditional) || additional > minAdditional || (additional != 0 && additional == notInTitle.size())){
427 - ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false);
 435+ ArrayList<RawSnippet> snippets = getBestTextSnippets(tokens, weightTerm, wordIndex, 1, false, stopWords, false, phrases, inContext, false, false);
428436 if(snippets.size() > 0){
429437 RawSnippet snippet = snippets.get(0);
430438 snippet.setAlttitle(ainf);
@@ -498,7 +506,7 @@
499507 /** Highlight text */
500508 protected static ArrayList<RawSnippet> getBestTextSnippets(ArrayList<ExtToken> tokens, HashMap<String, Double> weightTerms,
501509 HashMap<String,Integer> wordIndex, int maxSnippets, boolean ignoreBreaks, HashSet<String> stopWords, boolean showFirstIfNone,
502 - HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases) {
 510+ HashSet<String> phrases, HashSet<String> foundInContext, final boolean sortByPhrases, final boolean alwaysIncludeFirstLine) {
503511
504512 // pieces of text to be highlighted
505513 ArrayList<FragmentScore> fragments = new ArrayList<FragmentScore>();
@@ -680,6 +688,12 @@
681689 // find fragments with best score
682690 Collections.sort(fragments, new Comparator<FragmentScore>() {
683691 public int compare(FragmentScore o1, FragmentScore o2) {
 692+ if(alwaysIncludeFirstLine){
 693+ if(o1.isFirstSentence)
 694+ return -1;
 695+ if(o2.isFirstSentence)
 696+ return 1;
 697+ }
684698 // sort via longest phrase found
685699 int c = o2.bestCount - o1.bestCount;
686700 if(sortByPhrases && c != 0)
@@ -698,7 +712,7 @@
699713 HashSet<String> termsFound = new HashSet<String>();
700714 ArrayList<FragmentScore> resNoNew = new ArrayList<FragmentScore>();
701715 for(FragmentScore f : fragments){
702 - if(f.score == 0)
 716+ if(f.score == 0 && !(alwaysIncludeFirstLine && f.isFirstSentence))
703717 break;
704718 // check if the fragment has new terms
705719 boolean hasNew = false;
@@ -711,7 +725,7 @@
712726 }
713727 }
714728 }
715 - if(hasNew){
 729+ if(hasNew || (alwaysIncludeFirstLine && f.isFirstSentence)){
716730 if(f.found != null)
717731 termsFound.addAll(f.found);
718732 adjustBest(f,tokens,weightTerms,wordIndex,newTerms);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -437,11 +437,11 @@
438438 dbUpdates = new Hashtable<String,IndexUpdateRecord>();
439439 queuedUpdates.put(iid.toString(), dbUpdates);
440440 }
441 - IndexUpdateRecord oldr = dbUpdates.get(record.getKey());
 441+ IndexUpdateRecord oldr = dbUpdates.get(record.getIndexKey());
442442 // combine a previous delete with current add to form update
443443 if(oldr != null && oldr.doDelete() && record.doAdd())
444444 record.setAction(IndexUpdateRecord.Action.UPDATE);
445 - dbUpdates.put(record.getKey(),record);
 445+ dbUpdates.put(record.getIndexKey(),record);
446446 }
447447
448448 log.debug("Locally queued item: "+record);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexUpdateRecord.java
@@ -149,16 +149,16 @@
150150 }
151151
152152 /**
153 - * @return Returns the page key -- page_id (via article)
 153+ * @return page index key -- page_id (via article)
154154 */
155 - public String getKey(){
156 - return article.getKey();
 155+ public String getIndexKey(){
 156+ return article.getIndexKey();
157157 }
158158
159159 /**
160 - * @return Highlight key -- ns:title
 160+ * @return ns:title key, used in links, highlight, prefix, etc.. indexes
161161 */
162 - public String getHighlightKey(){
 162+ public String getNsTitleKey(){
163163 return article.getTitleObject().getKey();
164164 }
165165
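
The rename makes the two key flavours explicit; WikiIndexModifier below picks between them per index type. Illustrative values only:

    String docKey = rec.getIndexKey();   // page_id key, e.g. "12345"
    String hlKey  = rec.getNsTitleKey(); // ns:title key, e.g. "0:Byzantine Empire"
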
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -59,11 +59,16 @@
6060 import org.wikimedia.lsearch.config.GlobalConfiguration;
6161 import org.wikimedia.lsearch.config.IndexId;
6262 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
 63+import org.wikimedia.lsearch.prefix.PrefixIndexBuilder;
6364 import org.wikimedia.lsearch.ranks.Links;
6465 import org.wikimedia.lsearch.ranks.StringList;
 66+import org.wikimedia.lsearch.related.Related;
6567 import org.wikimedia.lsearch.related.RelatedTitle;
6668 import org.wikimedia.lsearch.search.NamespaceFilter;
 69+import org.wikimedia.lsearch.spell.CleanIndexImporter;
 70+import org.wikimedia.lsearch.spell.CleanIndexWriter;
6771 import org.wikimedia.lsearch.spell.api.SpellCheckIndexer;
 72+import org.wikimedia.lsearch.storage.RelatedStorage;
6873 import org.wikimedia.lsearch.util.Buffer;
6974 import org.wikimedia.lsearch.util.Localization;
7075 import org.wikimedia.lsearch.util.MathFunc;
@@ -155,9 +160,9 @@
156161 if(rec.doDelete()){
157162 int count = 0;
158163 if(iid.isHighlight())
159 - count = reader.deleteDocuments(new Term("key", rec.getHighlightKey()));
 164+ count = reader.deleteDocuments(new Term("key", rec.getNsTitleKey()));
160165 else // normal or titles index
161 - count = reader.deleteDocuments(new Term("key", rec.getKey()));
 166+ count = reader.deleteDocuments(new Term("key", rec.getIndexKey()));
162167 if(count == 0)
163168 nonDeleteDocuments.add(rec);
164169 IndexReportCard card = getReportCard(rec);
@@ -167,7 +172,7 @@
168173 else
169174 card.setSuccessfulDelete();
170175 }
171 - log.debug(iid+": Deleting document "+rec.getKey()+" "+rec.getArticle());
 176+ log.debug(iid+": Deleting document "+rec.getIndexKey()+" "+rec.getArticle());
172177 }
173178 }
174179 reader.close();
@@ -231,7 +236,7 @@
232237 writer.addDocument(doc,indexAnalyzer);
233238 }
234239
235 - log.debug(iid+": Adding document "+rec.getKey()+" "+rec.getArticle());
 240+ log.debug(iid+": Adding document "+rec.getIndexKey()+" "+rec.getArticle());
236241 if(card != null)
237242 card.setSuccessfulAdd();
238243 } catch (IOException e) {
@@ -241,7 +246,7 @@
242247 succ = false; // report unsucc, but still continue, to process all cards
243248 } catch(Exception e){
244249 e.printStackTrace();
245 - log.error("Error adding document "+rec.getKey()+" with message: "+e.getMessage());
 250+ log.error("Error adding document "+rec.getIndexKey()+" with message: "+e.getMessage());
246251 if(card != null)
247252 card.setFailedAdd();
248253 succ = false; // report unsucc, but still continue, to process all cards
@@ -410,16 +415,124 @@
411416 *
412417 * @param iid
413418 * @param updateRecords
 419+ * @return success
414420 */
415421 public boolean updateDocuments(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
416 - boolean index = updateDocumentsOn(iid,updateRecords,iid);
417 - boolean highlight = updateDocumentsOn(iid.getHighlight(),updateRecords,iid);
418 - boolean titles = true;
 422+ return updateLinks(iid,updateRecords)
 423+ && fetchLinksInfo(iid,updateRecords)
 424+ && updatePrefix(iid,updateRecords)
 425+ && updateSpell(iid,updateRecords)
 426+ && updateDocumentsOn(iid,updateRecords,iid)
 427+ && updateDocumentsOn(iid.getHighlight(),updateRecords,iid)
 428+ && updateTitles(iid,updateRecords);
 429+ }
 430+
 431+ public boolean updateTitles(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
419432 if(iid.hasTitlesIndex())
420 - titles = updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid);
421 - return index && highlight && titles;
 433+ return updateDocumentsOn(iid.getTitlesIndex(),updateRecords,iid);
 434+ return true;
422435 }
423436
 437+ /** Update articles with latest linking & related information */
 438+ public boolean fetchLinksInfo(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 439+ try{
 440+ Links links = Links.openForRead(iid,iid.getIndexPath());
 441+ RelatedStorage related = new RelatedStorage(iid);
 442+ for(IndexUpdateRecord rec : updateRecords){
 443+ if(rec.doAdd()){
 444+ String key = rec.getNsTitleKey();
 445+ Article article = rec.getArticle();
 446+ // references, redirect status
 447+ article.setReferences(links.getNumInLinks(key));
 448+ article.setRedirectTo(links.getRedirectTarget(key));
 449+ if(article.isRedirect())
 450+ article.setRedirectTargetNamespace(links.getRedirectTargetNamespace(key));
 451+ else
 452+ article.setRedirectTargetNamespace(-1);
 453+
 454+ // redirects
 455+ ArrayList<Redirect> redirects = new ArrayList<Redirect>();
 456+ for(String rk : links.getRedirectsTo(key)){
 457+ String[] parts = rk.toString().split(":",2);
 458+ int redirectRef = links.getNumInLinks(rk);
 459+ redirects.add(new Redirect(Integer.parseInt(parts[0]),parts[1],redirectRef));
 460+ }
 461+ article.setRedirects(redirects);
 462+ // related
 463+ if(related != null)
 464+ article.setRelated(related.getRelated(key));
 465+ }
 466+ }
 467+ return true;
 468+ } catch(IOException e){
 469+ e.printStackTrace();
 470+ log.error("Cannot fetch links info: "+e.getMessage());
 471+ return false;
 472+ }
 473+ }
 474+
 475+ public boolean updateLinks(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 476+ try{
 477+ Links links = Links.openForModification(iid);
 478+ for(IndexUpdateRecord rec : updateRecords){
 479+ // TODO: this might do some unnecessary additions/deletions on split index architecture
 480+ if(rec.doDelete()){
 481+ links.deleteArticleInfoByIndexKey(rec.getIndexKey());
 482+ } else if(rec.doAdd()){
 483+ Article a = rec.getArticle();
 484+ links.addArticleInfo(a.getContents(),a.getTitleObject(),iid.isExactCase(),a.getIndexKey());
 485+ }
 486+ }
 487+ links.close();
 488+ return true;
 489+ } catch(IOException e){
 490+ e.printStackTrace();
 491+ log.error("Cannot update links index: "+e.getMessage());
 492+ return false;
 493+ }
 494+ }
 495+
 496+ public boolean updatePrefix(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 497+ if(!iid.hasPrefix())
 498+ return true;
 499+ try{
 500+ PrefixIndexBuilder prefix = PrefixIndexBuilder.forPrecursorModification(iid);
 501+ for(IndexUpdateRecord rec : updateRecords){
 502+ if(rec.doDelete()){
 503+ prefix.deleteFromPrecursor(rec.getIndexKey());
 504+ } else if(rec.doAdd()){
 505+ Article a = rec.getArticle();
 506+ prefix.addToPrecursor(rec.getNsTitleKey(),a.getReferences(),a.getRedirectTarget(),rec.getIndexKey());
 507+ }
 508+ }
 509+ return true;
 510+ } catch(IOException e){
 511+ e.printStackTrace();
 512+ log.error("Cannot update prefix index: "+e.getMessage());
 513+ return false;
 514+ }
 515+ }
 516+
 517+ public boolean updateSpell(IndexId iid, Collection<IndexUpdateRecord> updateRecords){
 518+ if(!iid.hasSpell())
 519+ return true;
 520+ try{
 521+ CleanIndexWriter writer = CleanIndexWriter.newForModification(iid);
 522+ for(IndexUpdateRecord rec : updateRecords){
 523+ if(rec.doDelete()){
 524+ writer.deleteArticleInfo(rec.getIndexKey());
 525+ } else if(rec.doAdd()){
 526+ writer.addArticleInfo(rec.getArticle());
 527+ }
 528+ }
 529+ return true;
 530+ } catch(IOException e){
 531+ e.printStackTrace();
 532+ log.error("Cannot update spellcheck index: "+e.getMessage());
 533+ return false;
 534+ }
 535+ }
 536+
424537 /**
425538 * Update all documents in the collection. If needed the request
426539 * is forwarded to a remote object (i.e. if the part of the split
@@ -518,7 +631,7 @@
519632 transformArticleForIndexing(article);
520633
521634 // page_id from database, used to look up and replace entries on index updates
522 - doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
 635+ doc.add(new Field("key", article.getIndexKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
523636
524637 // namespace, returned with results
525638 doc.add(new Field("namespace", article.getNamespace(), Field.Store.YES, Field.Index.UN_TOKENIZED));
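Since the sub-updates in the new updateDocuments are chained with &&, Java's short-circuit evaluation stops the pipeline at the first stage that returns false, so downstream indexes are never built from data a failed stage did not produce. Restated as a sketch with the ordering rationale spelled out:

// Order matters: the link graph is written first because fetchLinksInfo
// reads it back to enrich articles, and the precursor indexes (prefix,
// spell) are fed before the main, highlight and titles indexes.
public boolean updateAll(IndexId iid, Collection<IndexUpdateRecord> recs) {
    return updateLinks(iid, recs)
        && fetchLinksInfo(iid, recs)   // reads what updateLinks just wrote
        && updatePrefix(iid, recs)
        && updateSpell(iid, recs)
        && updateDocumentsOn(iid, recs, iid)
        && updateDocumentsOn(iid.getHighlight(), recs, iid)
        && updateTitles(iid, recs);    // no-op unless iid has a titles index
}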
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WordNet.java
@@ -6,6 +6,7 @@
77 import java.util.ArrayList;
88 import java.util.Arrays;
99 import java.util.HashMap;
 10+import java.util.HashSet;
1011 import java.util.List;
1112 import java.util.zip.GZIPInputStream;
1213
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -564,7 +564,8 @@
565565 c = text[cur];
566566 else break;
567567 }
568 - cur--; // we moved to next legal char
 568+ if(!noTrailing)
 569+ cur--; // we moved to next legal char
569570 }
570571
571572 addToken(noTrailing);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/StopWords.java
@@ -88,11 +88,11 @@
8989 return ret;
9090 }
9191
92 	 - /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
93 	 - public static HashSet<String> getPredefinedSet(IndexId iid){
 	92+ /** Get a brand new hash set of predefined stop words (i.e. not those generated from lucene indexes) */
 	93+ public static HashSet<String> getPredefinedSet(String langCode){
9494 loadPredefined();
9595 HashSet<String> ret = new HashSet<String>();
96 - HashSet<String> cached = cachePredefined.get(iid.getLangCode());
 96+ HashSet<String> cached = cachePredefined.get(langCode);
9797 if(cached != null){
9898 synchronized(cached){
9999 ret.addAll(cached);
@@ -100,6 +100,9 @@
101101 }
102102 return ret;
103103 }
 104+ public static HashSet<String> getPredefinedSet(IndexId iid){
 105+ return getPredefinedSet(iid.getLangCode());
 106+ }
104107
105108 protected static void loadPredefined(){
106109 if(loadedPredefined)
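Both entry points above now share one cache keyed by language code. A hypothetical usage sketch:

// Same cached word list either way; the IndexId overload just resolves
// the language code first.
HashSet<String> direct = StopWords.getPredefinedSet("en");
HashSet<String> viaIid = StopWords.getPredefinedSet(IndexId.get("enwiki"));

// Each call returns a brand new HashSet, so callers can mutate their
// copy without corrupting the shared cache.
direct.add("customstop");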
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -20,6 +20,7 @@
2121 import org.apache.lucene.search.BooleanClause;
2222 import org.apache.lucene.search.BooleanQuery;
2323 import org.apache.lucene.search.LogTransformScore;
 24+import org.apache.lucene.search.MultiPhraseQuery;
2425 import org.apache.lucene.search.PhraseQuery;
2526 import org.apache.lucene.search.PositionalMultiQuery;
2627 import org.apache.lucene.search.PositionalOptions;
@@ -340,24 +341,10 @@
341342 c = text[cur];
342343 if(c == '"'){
343344 inPhrase = !inPhrase;
344 - if(inPhrase)
345 - length = 0;
346 - else{ // end of phrase
347 - int start = cur - length;
348 - analyzeBuffer();
349 - for(Token t : tokens){
350 - if(t.getPositionIncrement() > 0)
351 - ret.add(new Token(t.termText(),start+t.startOffset(),start+t.endOffset(),"phrase"));
352 - }
353 - }
354345 }
355346
356 - if(inPhrase){
357 - buffer[length++] = c;
358 - continue;
359 - }
360 -
361 - if(c == ')'){
 347+ if(inPhrase);
 348+ else if(c == ')'){
362349 level--;
363350 if(level < fieldLevel)
364351 fieldLevel = -1;
@@ -368,9 +355,9 @@
369356 } else if(fieldLevel != -1 && level>fieldLevel)
370357 continue;
371358
372 - if(Character.isLetterOrDigit(c) || c=='?' || c=='*' || c=='~'){
 359+ if(isTermChar(c)){
373360 int start = cur;
374 - tokenType = fetchToken();
 361+ tokenType = fetchToken(inPhrase);
375362 if(tokenType == TokenType.WORD && (start==0 || text[start-1]!='-')){
376363 String type = "word";
377364 if(bufferIsWildCard())
@@ -384,7 +371,7 @@
385372 }
386373 }
387374 }
388 - } else if(c == '['){
 375+ } else if(c == '[' && !inPhrase){
389376 fetchGenericPrefix();
390377 }
391378 }
@@ -420,12 +407,19 @@
421408 return defaultNamespaceFilter;
422409 }
423410
 411+ private final boolean isTermChar(char ch){
 412+ return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"';
 413+ }
 414+
424415 /**
425416 * Fetch token into <code>buffer</code> starting from current position (<code>cur</code>)
426417 *
427418 * @return type of the token in buffer
428419 */
429420 private TokenType fetchToken(){
 421+ return fetchToken(false);
 422+ }
 423+ private TokenType fetchToken(boolean termOnly){
430424 char ch;
431425 prev_cur = cur;
432426 for(length = 0; cur < queryLength; cur++){
@@ -434,7 +428,7 @@
435429 continue; // ignore whitespaces
436430
437431 			// pluses, minuses and underscores can be within words (to keep them from being misinterpreted); *,? are for wildcard queries
438 - if(!Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')' && ch !='[' && ch != ']' && ch != ',' && ch != ';' && ch != '"'){
 432+ if(isTermChar(ch)){
439433 if(length<buffer.length)
440434 buffer[length++] = ch;
441435 } else{
@@ -445,6 +439,9 @@
446440 if(length == 0)
447441 return TokenType.EOF;
448442
 443+ if(termOnly)
 444+ return TokenType.WORD;
 445+
449446 // check for keywords
450447 if(length == 3 && buffer[0]=='A' && buffer[1]=='N' && buffer[2]=='D')
451448 return TokenType.AND;
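Factoring the character test into isTermChar() lets fetchToken() and the new in-phrase scanning agree on term boundaries, and the termOnly flag disables keyword detection so that a literal "AND" inside quotes stays an ordinary word. A standalone restatement of the same character class (a sketch, not the parser itself):

import java.util.ArrayList;
import java.util.List;

class PhraseScanSketch {
    static boolean isTermChar(char ch) {
        return !Character.isWhitespace(ch) && ch != ':' && ch != '(' && ch != ')'
            && ch != '[' && ch != ']' && ch != ',' && ch != ';' && ch != '"';
    }

    // Split a quoted phrase body into raw terms; wildcard/fuzzy markers
    // like '*' and '~' survive as part of the term, which is what allows
    // their expansion inside phrases further down.
    static List<String> phraseTerms(String phrase) {
        List<String> terms = new ArrayList<String>();
        StringBuilder b = new StringBuilder();
        for (int i = 0; i < phrase.length(); i++) {
            char ch = phrase.charAt(i);
            if (isTermChar(ch)) b.append(ch);
            else if (b.length() > 0) { terms.add(b.toString()); b.setLength(0); }
        }
        if (b.length() > 0) terms.add(b.toString());
        return terms;
    }
}
// phraseTerms("douglas adams~ guide*") -> [douglas, adams~, guide*]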
@@ -553,34 +550,80 @@
554551 *
555552 * @return a query, or null if the query is empty
556553 */
557 - private PhraseQuery parsePhrase(){
558 - PhraseQuery query = null;
559 -
560 - length = 0;
 554+ private Query parsePhrase(){
 555+ // special case for incategory
 556+ if(currentField!=null && currentField.equals("incategory")){
 557+ for(; cur < queryLength ; cur++ ){
 558+ if(text[cur] == '"')
 559+ break;
 560+ else if(length < buffer.length)
 561+ buffer[length++] = text[cur];
 562+ }
 563+ if(length > 0){
 564+ // no tokenization, we want whole category name
 565+ return new TermQuery(makeTerm());
 566+ }
 567+ return null;
 568+ }
 569+ //PositionalMultiQuery query = new PositionalMultiQuery(new PositionalOptions.PhraseQueryFallback());
 570+ MultiPhraseQuery query = new MultiPhraseQuery();
561571 for(; cur < queryLength ; cur++ ){
 572+ length = 0;
 573+ // fetch next word
 574+ while(cur<queryLength && isTermChar(text[cur]) && length<buffer.length){
 575+ buffer[length++] = text[cur++];
 576+ }
 577+
 578+ // add to phrase
 579+ if(length > 0){
 580+ boolean added = false;
 581+ if(bufferIsWildCard()){
 582+ Term term = makeTerm();
 583+ Term[] terms = wildcards.makeTerms(term.text(),term.field());
 584+ if(terms != null){
 585+ query.add(terms);
 586+ ArrayList<String> words = wildcards.getWords(term.text());
 587+ expandedWordsFromParser.add(words);
 588+ expandedTypesFromParser.add(ExpandedType.WILDCARD);
 589+ ArrayList<Float> boosts = new ArrayList<Float>();
 590+ for(int i=0;i<words.size();i++) boosts.add(1f);
 591+ expandedBoostFromParser.add(boosts);
 592+ added = true;
 593+ }
 594+ }
 595+ if(bufferIsFuzzy()){
 596+ Term term = makeTerm();
 597+ NamespaceFilter nsf = getNamespaceFilter(currentField);
 598+ Term[] terms = fuzzy.makeTerms(term.text(),term.field(),nsf);
 599+ if(terms != null){
 600+ //query.add(terms,fuzzy.getBoosts(term.text(),nsf,terms));
 601+ query.add(terms);
 602+ ArrayList<String> words = fuzzy.getWords(term.text(),nsf);
 603+ expandedWordsFromParser.add(words);
 604+ expandedTypesFromParser.add(ExpandedType.FUZZY);
 605+ expandedBoostFromParser.add(fuzzy.getBoosts(term.text(),nsf,words));
 606+ added = true;
 607+ }
 608+ }
 609+ if(!added){
 610+ // fallback to ordinary words
 611+ analyzeBuffer();
 612+ for(Token token : tokens){
 613+ if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words
 614+ Term t = makeTerm(token);
 615+ addToWords(t.text(),1,ExpandedType.PHRASE);
 616+ query.add(t);
 617+ }
 618+ }
 619+ }
 620+ }
562621 // end of phrase query
563622 if(text[cur] == '"')
564623 break;
565 - else if(length < buffer.length)
566 - buffer[length++] = text[cur];
567624 }
568 - if(length != 0){
569 - query = new PhraseQuery();
570 - // if it's a category don't tokenize it, we want whole category name
571 - if(currentField!=null && currentField.equals("incategory"))
572 - query.add(makeTerm());
573 - else{
574 - analyzeBuffer();
575 - for(Token token : tokens){
576 - if(token.getPositionIncrement()>0){ // ignore aliases and stemmed words
577 - Term t = makeTerm(token);
578 - addToWords(t.text(),1,ExpandedType.PHRASE);
579 - query.add(t);
580 - }
581 - }
582 - query.setBoost(defaultBoost);
583 - }
584 - return query;
 625+ if(query.getPositions().length > 0){
 626+ query.setBoost(defaultBoost);
 627+ return query;
585628 } else
586629 return null;
587630 }
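This rewrite is what implements "wildcards/fuzzy can now be within phrases" from the commit summary: MultiPhraseQuery accepts several terms per position, so an expanded wildcard or fuzzy word occupies a single phrase slot. A minimal sketch against the stock Lucene API, with the expansions hardcoded instead of coming from the wildcards/fuzzy generators:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiPhraseQuery;

MultiPhraseQuery query = new MultiPhraseQuery();
// position 0: one ordinary term
query.add(new Term("contents", "douglas"));
// position 1: every expansion of "adam*" shares the slot
query.add(new Term[] {
    new Term("contents", "adams"),
    new Term("contents", "adamson"),
});
// matches "douglas adams ..." as well as "douglas adamson ..."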
@@ -1115,20 +1158,22 @@
11161159 full.add(additional,Occur.SHOULD);
11171160
11181161 // redirect match (when redirect is not contained in contents or title)
1119 - Query redirects = makeAlttitleForRedirects(words,20,1);
1120 - if(redirects != null)
1121 - full.add(redirects,Occur.SHOULD);
1122 - if(singularWords != null){
1123 - Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
1124 - if(redirectsSing != null)
1125 - full.add(redirectsSing,Occur.SHOULD);
1126 - }
11271162 if(hasWildcards() || hasFuzzy()){
11281163 Query redirectsMulti = makeAlttitleForRedirectsMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,20,1f);
11291164 if(redirectsMulti != null)
11301165 full.add(redirectsMulti,Occur.SHOULD);
 1166+ } else{
 1167+ Query redirects = makeAlttitleForRedirects(words,20,1);
 1168+ if(redirects != null)
 1169+ full.add(redirects,Occur.SHOULD);
 1170+ if(singularWords != null){
 1171+ Query redirectsSing = makeAlttitleForRedirects(singularWords,20,0.8f);
 1172+ if(redirectsSing != null)
 1173+ full.add(redirectsSing,Occur.SHOULD);
 1174+ }
11311175 }
11321176
 1177+
11331178 BooleanQuery wrap = new BooleanQuery(true);
11341179 wrap.add(full,Occur.SHOULD);
11351180 wrap.add(makeComplete(expandedWordsTitle),Occur.SHOULD);
@@ -1324,7 +1369,7 @@
13251370 Query main = null;
13261371
13271372 // all words as entered into the query
1328 - PositionalQuery exact = makePositional(words,fields.contents(),new PositionalOptions.Exact(),0,1);
 1373+ Query exact = makePositionalMulti(expandedWordsTitle,expandedBoostTitle,expandedTypes,fields.contents(),new PositionalOptions.Exact(),0,1);
13291374 // words + stemmed + singulars + transliterations + wildcards + fuzzy - with slop factor
13301375 Query sloppy = makePositionalMulti(expandedWordsContents,expandedBoostContents,expandedTypes,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1,false);
13311376
@@ -1335,7 +1380,7 @@
13361381 ArrayList<ArrayList<String>> wordnet = WordNet.replaceOne(words,iid.getLangCode());
13371382
13381383 BooleanQuery combined = new BooleanQuery(true);
1339 - if(exact!=null && exact.getTerms().length > 0)
 1384+ if(exact!=null)
13401385 combined.add(exact,Occur.SHOULD);
13411386 // combined various queries into mainphrase
13421387 if(sloppy != null){
@@ -1343,7 +1388,8 @@
13441389 // wordnet
13451390 if(wordnet != null){
13461391 for(ArrayList<String> wnwords : wordnet){
1347 - combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD);
 1392+ if(!allStopWords(wnwords))
 1393+ combined.add(makePositional(wnwords,fields.contents(),new PositionalOptions.Sloppy(),MAINPHRASE_SLOP,1),Occur.SHOULD);
13481394 }
13491395 }
13501396 }
@@ -1367,7 +1413,8 @@
13681414 ArrayList<Query> altAdd = new ArrayList<Query>();
13691415 if(wordnet!=null)
13701416 for(ArrayList<String> wnwords : wordnet)
1371 - altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST));
 1417+ if(!allStopWords(wnwords))
 1418+ altAdd.add(makeAlttitleRelevance(wnwords,RELEVANCE_ALTTITLE_BOOST));
13721419 alttitle = simplify(combine(alttitle,altAdd));
13731420
13741421 // relevance: related
@@ -1375,7 +1422,8 @@
13761423 ArrayList<Query> relAdd = new ArrayList<Query>();
13771424 if(wordnet!=null)
13781425 for(ArrayList<String> wnwords : wordnet)
1379 - relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST));
 1426+ if(!allStopWords(wnwords))
 1427+ relAdd.add(makeRelatedRelevance(wnwords,RELEVANCE_RELATED_BOOST));
13801428 related = simplify(combine(related,relAdd));
13811429
13821430 BooleanQuery relevances = new BooleanQuery(true);
@@ -1546,8 +1594,11 @@
15471595 // add the whole-only query
15481596 if(whole != null)
15491597 bq.add(makePositional(words,field,whole,slop,1),Occur.SHOULD);
1550 - if(wholeSloppy != null)
1551 - bq.add(makePositional(words,field,wholeSloppy,slop,1,false),Occur.SHOULD);
 1598+ if(wholeSloppy != null){
 1599+ Query ws = makePositional(words,field,wholeSloppy,slop,1,false);
 1600+ if(ws != null)
 1601+ bq.add(ws,Occur.SHOULD);
 1602+ }
15521603 bq.setBoost(boost);
15531604
15541605 return bq;
@@ -1600,8 +1651,11 @@
16011652 // add the whole-only query
16021653 if(whole != null)
16031654 bq.add(makePositionalMulti(words,boosts,types,field,whole,slop,1),Occur.SHOULD);
1604 - if(wholeSloppy != null)
1605 - bq.add(makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false),Occur.SHOULD);
 1655+ if(wholeSloppy != null){
 1656+ Query ws = makePositionalMulti(words,boosts,types,field,wholeSloppy,slop,0.5f,false);
 1657+ if(ws != null)
 1658+ bq.add(ws,Occur.SHOULD);
 1659+ }
16061660 bq.setBoost(boost);
16071661
16081662 return bq;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/TokenizerOptions.java
@@ -56,12 +56,11 @@
5757 }
5858 }
5959
60 - public static class HighlightOriginal extends TokenizerOptions {
 	60+	/** Used for titles, doesn't simplify glue and has no case detection */
 61+ public static class HighlightOriginal extends Highlight {
6162 public HighlightOriginal(){
62 - super(false);
63 - this.highlightParsing = true;
64 - this.relocationParsing = false;
6563 this.simplifyGlue = false;
 64+ this.noCaseDetection = true;
6665 }
6766 }
6867 /** Used to filter prefixes (up to FastWikiTokenizer.MAX_WORD_LEN chars) */
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -37,7 +37,7 @@
3838 /** Invoke ListRecords from a certain timestamp */
3939 public ArrayList<IndexUpdateRecord> getRecords(String from){
4040 try{
41 - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from));
 41+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&from="+from));
4242 return collector.getRecords();
4343 } catch(IOException e){
4444 log.warn("I/O exception listing records: "+e.getMessage());
@@ -58,7 +58,7 @@
5959 /** Invoke ListRecords using the last resumption token */
6060 public ArrayList<IndexUpdateRecord> getMoreRecords(){
6161 try{
62 - read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));
 62+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken="+resumptionToken));
6363 return collector.getRecords();
6464 } catch(IOException e){
6565 log.warn("I/O exception listing records: "+e.getMessage());
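With the prefix change above the harvester requests the standard mediawiki metadata format instead of the old lsearch one; the two request shapes it issues are otherwise unchanged (urlbase comes from configuration, the timestamp value here is illustrative):

// first page of a harvest, bounded by a from-timestamp
URL first = new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&from=2001-01-01");
// subsequent pages, continued via the server-issued resumption token
URL next = new URL(urlbase + "&verb=ListRecords&metadataPrefix=mediawiki&resumptionToken=" + resumptionToken);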
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IndexUpdatesCollector.java
@@ -35,7 +35,7 @@
3636
3737 public IndexUpdatesCollector(IndexId iid){
3838 this.iid = iid;
39 - this.langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 39+ this.langCode = iid.getLangCode();
4040 }
4141
4242 public void addRedirect(String redirectTitle, int references) {
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -93,7 +93,6 @@
9494 HashSet<String> excludeList = new HashSet<String>();
9595 HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
9696 String defaultTimestamp = "2001-01-01";
97 - boolean fetchReferences = true;
9897 // args
9998 for(int i=0; i<args.length; i++){
10099 if(args[i].equals("-d"))
@@ -110,8 +109,6 @@
111110 excludeList.add(args[++i]);
112111 else if(args[i].equals("-n"))
113112 notification = true;
114 - else if(args[i].equals("--no-ranks"))
115 - fetchReferences = false;
116113 else if(args[i].equals("--help"))
117114 break;
118115 else if(args[i].startsWith("-")){
@@ -145,7 +142,6 @@
146143 System.out.println(" -f - dblist file, one dbname per line");
147144 System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
148145 System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
149 - System.out.println(" --no-ranks - don't try to fetch any article rank data");
150146 return;
151147 }
152148 // config
@@ -190,22 +186,8 @@
191187 ArrayList<IndexUpdateRecord> records = harvester.getRecords(from);
192188 if(records.size() == 0)
193189 continue;
194 - LinkAnalysisStorage las = new LinkAnalysisStorage(iid);
195 - RelatedStorage related = new RelatedStorage(iid);
196190 boolean hasMore = false;
197191 do{
198 - if(fetchReferences){
199 - try{
200 - // fetch references for records
201 - fetchReferencesAndRelated(records,las,related);
202 - } catch(IOException e){
203 - // FIXME: quick hack, if the table cannot be found (e.g. for new wikis) don't abort
204 - if(e.getMessage().contains("Base table or view not found")){
205 - log.warn("Continuing, but could not fetch references for "+iid+": "+e.getMessage());
206 - } else
207 - throw e;
208 - }
209 - }
210192 for(IndexUpdateRecord rec : records){
211193 Article ar = rec.getArticle();
212194 log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
@@ -287,62 +269,5 @@
288270 }
289271 } while(daemon);
290272 }
291 -
292 - protected static void fetchReferencesAndRelated(ArrayList<IndexUpdateRecord> records, LinkAnalysisStorage las, RelatedStorage related) throws IOException {
293 - ArrayList<Title> titles = new ArrayList<Title>();
294 - for(IndexUpdateRecord rec : records){
295 - if(rec.isDelete())
296 - continue;
297 - Article ar = rec.getArticle();
298 - titles.add(ar.makeTitle());
299 - if(ar.getRedirects() != null){
300 - for(Redirect r : ar.getRedirects()){
301 - titles.add(r.makeTitle());
302 - }
303 - }
304 - }
305 - // fetch
306 - //OldLinks links = new OldLinks(store.getPageReferences(titles,dbname));
307 - //HashMap<Title,ArrayList<RelatedTitle>> rel = store.getRelatedPages(titles,dbname);
308 - // update
309 - // FIXME: wow, this is BCE ...
310 - for(IndexUpdateRecord rec : records){
311 - if(rec.isDelete())
312 - continue;
313 - Article ar = rec.getArticle();
314 - Title t = ar.makeTitle();
315 - ArticleAnalytics aa = las.getAnaliticsForArticle(t.getKey());
316 - ArrayList<String> anchors = new ArrayList<String>();
317 - anchors.addAll(aa.getAnchorText());
318 - // set references
319 - ar.setReferences(aa.getReferences());
320 - //ar.setRedirect(aa.isRedirect());
321 - if(aa.isRedirect())
322 - ar.setRedirectTargetNamespace(aa.getRedirectTargetNamespace());
323 - if(ar.getRedirects() != null){
324 - for(Redirect r : ar.getRedirects()){
325 - ArticleAnalytics raa = las.getAnaliticsForReferences(r.makeTitle().getKey());
326 - r.setReferences(raa.getReferences());
327 - anchors.addAll(raa.getAnchorText());
328 - }
329 - }
330 - // set anchors
331 - ar.setAnchorText(anchors);
332 - // set related
333 - if(related.canRead())
334 - ar.setRelated(related.getRelated(t.getKey()));
335 - /*ArrayList<RelatedTitle> rt = rel.get(t.getKey());
336 - if(rt != null){
337 - Collections.sort(rt,new Comparator<RelatedTitle>() {
338 - public int compare(RelatedTitle o1, RelatedTitle o2){
339 - double d = o2.getScore()-o1.getScore();
340 - if(d == 0) return 0;
341 - else if(d > 0) return 1;
342 - else return -1;
343 - }
344 - });
345 - ar.setRelated(rt);
346 - }*/
347 - }
348 - }
349 -}
 273+
 274+}
\ No newline at end of file
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/LinkReader.java
@@ -46,7 +46,7 @@
4747 langCode = "en";
4848 this.langCode = langCode;
4949 this.iid = iid;
50 - this.exactCase = iid.getExactCase();
 50+ this.exactCase = iid.isExactCase();
5151 interwiki = Localization.getInterwiki();
5252 }
5353 public void writeRevision(Revision revision) throws IOException {
@@ -58,7 +58,7 @@
5959 public void writeEndPage() throws IOException {
6060 Title t = new Title(page.Title.Namespace,page.Title.Text);
6161 try{
62 - links.addArticleInfo(revision.Text,t,exactCase);
 62+ links.addArticleInfo(revision.Text,t,exactCase,Integer.toString(page.Id));
6363 } catch(Exception e){
6464 log.error("Error adding article "+t+" : "+e.getMessage());
6565 e.printStackTrace();
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/ranks/Links.java
@@ -70,16 +70,16 @@
7171 protected Directory directory = null;
7272 protected NamespaceFilter nsf; // default search
7373 protected ObjectCache cache;
74 - //protected ObjectCache refCache;
7574 protected FieldSelector keyOnly,redirectOnly,contextOnly,linksOnly;
7675 protected boolean optimized = false;
 76+ protected boolean autoOptimize = false;
7777
78 - private Links(IndexId iid, String path, IndexWriter writer) throws CorruptIndexException, IOException{
 78+ private Links(IndexId iid, String path, IndexWriter writer, boolean autoOptimize) throws CorruptIndexException, IOException{
7979 this.writer = writer;
8080 this.path = path;
8181 this.iid = iid;
82 - GlobalConfiguration global = GlobalConfiguration.getInstance();
83 - this.langCode = global.getLanguage(iid);
 82+ this.autoOptimize = autoOptimize;
 83+ this.langCode = iid.getLangCode();
8484 String dbname = iid.getDBname();
8585 nsmap = Localization.getLocalizedNamespaces(langCode,dbname);
8686 interwiki = Localization.getInterwiki();
@@ -87,13 +87,7 @@
8888 imageLocalized = Localization.getLocalizedImage(langCode,dbname);
8989 state = State.FLUSHED;
9090 initWriter(writer);
91 - //reader = IndexReader.open(path);
92 - nsf = global.getDefaultNamespace(iid);
93 - cache = new ObjectCache(10000);
94 - // init cache manager
95 - /*CacheManager manager = CacheManager.create();
96 - cache = new Cache("links", 5000, false, false, 5, 2);
97 - manager.addCache(cache); */
 91+ nsf = iid.getDefaultNamespace();
9892 keyOnly = makeSelector("article_key");
9993 redirectOnly = makeSelector("redirect");
10094 contextOnly = makeSelector("context");
@@ -122,7 +116,7 @@
123117 String path = iid.getIndexPath();
124118 log.info("Using index at "+path);
125119 IndexWriter writer = WikiIndexModifier.openForWrite(path,false);
126 - return new Links(iid,path,writer);
 120+ return new Links(iid,path,writer,false);
127121 }
128122
129123 public static Links openStandalone(IndexId iid) throws IOException {
@@ -138,7 +132,7 @@
139133 public static Links openForRead(IndexId iid, String path) throws IOException {
140134 iid = iid.getLinks();
141135 log.info("Opening for read "+path);
142 - return new Links(iid,path,null);
 136+ return new Links(iid,path,null,true);
143137 }
144138
145139 /** Create new in the import path */
@@ -147,7 +141,7 @@
148142 String path = iid.getImportPath();
149143 log.info("Making index at "+path);
150144 IndexWriter writer = WikiIndexModifier.openForWrite(path,true);
151 - Links links = new Links(iid,path,writer);
 145+ Links links = new Links(iid,path,writer,true);
152146 return links;
153147 }
154148
@@ -156,7 +150,7 @@
157151 iid = iid.getLinks();
158152 log.info("Making index in memory");
159153 IndexWriter writer = new IndexWriter(new RAMDirectory(),new SimpleAnalyzer(),true);
160 - Links links = new Links(iid,null,writer);
 154+ Links links = new Links(iid,null,writer,true);
161155 return links;
162156 }
163157
@@ -172,7 +166,7 @@
173167 nsmap.put(namespace.toLowerCase(),index);
174168 }
175169
176 - /** Write all changes, optimize/close everything
 170+ /** Write all changes, optimize if in autoOptimize mode
177171 * @throws IOException */
178172 public void flush() throws IOException{
179173 // close & optimize
@@ -181,7 +175,8 @@
182176 if(reader != null)
183177 reader.close();
184178 if(writer != null){
185 - writer.optimize();
 179+ if(autoOptimize)
 180+ writer.optimize();
186181 writer.close();
187182 }
188183 state = State.FLUSHED;
@@ -193,15 +188,7 @@
194189 * @throws IOException
195190 */
196191 protected void flushForRead() throws IOException{
197 - // close & optimize
198 - if(searcher != null)
199 - searcher.close();
200 - if(reader != null)
201 - reader.close();
202 - if(writer != null){
203 - writer.optimize();
204 - writer.close();
205 - }
 192+ flush();
206193 log.debug("Opening index reader");
207194 // reopen
208195 reader = IndexReader.open(path);
@@ -238,27 +225,28 @@
239226 openForWrite();
240227 }
241228
242 - /** Modify existing article links info */
243 - public void modifyArticleInfo(String text, Title t, boolean exactCase) throws IOException{
 229+ /** Delete article info connected to title t */
 230+ public void deleteArticleInfo(Title t) throws IOException {
244231 ensureWrite();
245232 writer.deleteDocuments(new Term("article_key",t.getKey()));
246 - addArticleInfo(text,t,exactCase);
247233 }
 234+ /** Delete by page_id, not ns:title key */
 235+ public void deleteArticleInfoByIndexKey(String key) throws IOException {
 236+ ensureWrite();
 237+ writer.deleteDocuments(new Term("article_pageid",key));
 238+ }
248239
249240 /** Add links and other info from article
250241 * @throws IOException */
251 - public void addArticleInfo(String text, Title t, boolean exactCase) throws IOException{
 242+ public void addArticleInfo(String text, Title t, boolean exactCase, String pageId) throws IOException{
252243 ensureWrite();
253244 Pattern linkPat = Pattern.compile("\\[\\[(.*?)(\\|(.*?))?\\]\\]");
254245 int namespace = t.getNamespace();
255246 Matcher matcher = linkPat.matcher(text);
256247 int ns; String title;
257248 boolean escaped;
258 - //PrefixAnalyzer prefixAnalyzer = new PrefixAnalyzer();
259249
260250 ArrayList<String> pagelinks = new ArrayList<String>();
261 - // article link -> contexts
262 - //HashMap<String,ArrayList<String>> contextMap = new HashMap<String,ArrayList<String>>();
263251
264252 // use context only for namespace in default search
265253 boolean useContext = nsf.contains(t.getNamespace());
@@ -339,6 +327,7 @@
340328 StringList lk = new StringList(pagelinks);
341329 Analyzer an = new SplitAnalyzer(1,true);
342330 Document doc = new Document();
 331+ doc.add(new Field("article_pageid",pageId,Field.Store.YES,Field.Index.UN_TOKENIZED));
343332 // ns:title
344333 doc.add(new Field("article_key",t.getKey(),Field.Store.YES,Field.Index.UN_TOKENIZED));
345334 if(redirectsTo != null)
@@ -348,8 +337,6 @@
349338 // a list of all links
350339 doc.add(new Field("links",lk.toString(),Field.Store.NO,Field.Index.TOKENIZED));
351340 }
352 - // key split up into prefixes (for prefix index)
353 - // doc.add(new Field("prefix",prefixAnalyzer.tokenStream("prefix",t.getKey())));
354341
355342 writer.addDocument(doc,an);
356343 state = State.MODIFIED;
@@ -430,18 +417,17 @@
431418 }
432419 return false;
433420 }
434 -
435 - @Deprecated
436 - /** If article is redirect, get target, else null */
437 - public String getRedirectTargetOld(String key) throws IOException{
 421+
 422+ /** Get page_id for ns:title */
 423+ public String getPageId(String key) throws IOException {
438424 ensureRead();
439425 TermDocs td = reader.termDocs(new Term("article_key",key));
440426 if(td.next()){
441 - return reader.document(td.doc(),redirectOnly).get("redirect");
 427+ return reader.document(td.doc()).get("article_pageid");
442428 }
443429 return null;
444430 }
445 -
 431+
446432 /** If article is redirect, get target key, else null */
447433 public String getRedirectTarget(String key) throws IOException{
448434 ensureRead();
@@ -637,19 +623,16 @@
638624 writer.close();
639625 if(reader != null)
640626 reader.close();
641 - if(directory != null)
642 - directory.close();
 627+ //if(directory != null)
 628+ // directory.close();
643629 }
644630
645631 public ObjectCache getCache() {
646632 return cache;
647633 }
648634
649 - /*public ObjectCache getRefCache() {
650 - return refCache;
651 - } */
652 -
653 -
654 -
655 -
 635+ public boolean isAutoOptimize() {
 636+ return autoOptimize;
 637+ }
 638+
656639 }
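The new article_pageid field is what makes incremental links updates possible: deletions go by page_id, which survives renames, while each added document stores both keys. One update cycle, sketched using only methods visible in this diff (the page_id value is illustrative):

Links links = Links.openForModification(iid);
// drop stale link info for the edited or deleted page
links.deleteArticleInfoByIndexKey("12345");
// re-extract links from the new revision text
links.addArticleInfo(text, title, iid.isExactCase(), "12345");
links.close();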
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -72,8 +72,10 @@
7373 Configuration.open();
7474 String text = "bre! (ant) and some. it's stupid it's something and 5\"6' or more, links abacus";
7575 showTokens(text);
76 - text = "bre! u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]";
 76+ text = "This, is a '''list of [[African]] countries and dependencies by [[population]]'''.\n\n{| border=\"1\" cellpadding=\"2\" cellspacing=\"0\" style=\"border-collapse:collapse; text-align:right;\"\n|- style=\"text-align:center; background:#efefef\"\n!Pos !! Country !! Population\n|-\n| align=\"left\" |-\n| align=\"left\" |'''Africa''' || 934,283,426\n|-\n";
7777 showTokens(text);
 78+ text = "u.s. {{template|text}} {{template|text2|text3}} [http://ls2.wiki link]";
 79+ showTokens(text);
7880 text = "Good-Thomas C# C++ and so on.. ";
7981 showTokens(text);
8082 text = "[[Image:Argishti monument.JPG|thumb|King Argishti of Urartu riding a chariot with two horses in Yerevan, Armenia in front of the Erebuni Museum.]]'''Urartu''' (Assyrian ''Urarṭu'', [[Urartian language|Urartian]] ''Biainili'') was an ancient [[kingdom (politics)|kingdom]] of [[Armenia]]&lt;ref&gt;&quot;Urartu.&quot; Columbia Electronic Encyclopedia. Columbia University Press.&lt;/ref&gt; located in the mountainous plateau between [[Asia Minor]], [[Mesopotamia]], and [[Caucasus mountains]], later known as the [[Armenian Highland]], and it centered around [[Lake Van]] (present-day eastern [[Turkey]]). The kingdom existed from ca. [[860s BC|860 BC]], emerging from Late Bronze Age [[Nairi]] polities, until [[585 BC]]. The name corresponds to the [[Bible|Biblical]] '''[[Mount Ararat|Ararat]]'''.";
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SpellCheckTest.java
@@ -113,6 +113,7 @@
114114 {"ommmmmmmmiteed", "omitted"},
115115 {"ommmmmmmmitted", "omitted"},
116116 {"a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",""},
 117+ {"Douglas Adams's Guide to The Hitch-Hiker's Guide to the Galaxy",""},
117118
118119 };
119120
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/SuggestTest.java
@@ -48,7 +48,7 @@
4949 && res.get(1).getWord().equals(m[1]))
5050 good++;
5151 else if(r.getDist() > 1){
52 - SuggestResult split = sc.suggestSplit(m[0],0);
 52+ SuggestResult split = sc.suggestSplit(m[0],null);
5353 if(split!=null && m[1].equals(split.getWord()))
5454 good++;
5555 else{
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/AnalysisTest.java
@@ -104,10 +104,10 @@
105105 Analyzer analyzer = Analyzers.getSearcherAnalyzer(IndexId.get("enwiki"));
106106 //Analyzer analyzer = Analyzers.getHighlightAnalyzer(IndexId.get("enwiki"));
107107 Analyzer old = new EnglishAnalyzer();
108 - String text = "a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method.";
 108+ String text = "Pokémons a-b compatibly compatible Gödel; The who is a band. The who is Pascal's earliest work was in the natural and applied sciences where he made important contributions to the construction of mechanical calculators, the study of fluids, and clarified the concepts of pressure and vacuum by generalizing the work of Evangelista Torricelli. Pascal also wrote powerfully in defense of the scientific method.";
109109 displayTokens(analyzer,text);
110110 displayTokens(old,text);
111 - text = "links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)";
 111+ text = "Pokémons links abacus something aries douglas adams boxes bands working s and Frame semantics (linguistics)";
112112 displayTokens(analyzer,text);
113113 text = "Thomas c# c++ good-thomas Good-Thomas rats RATS Frame semantics (linguistics) 16th century sixteenth .fr web.fr other";
114114 displayTokens(analyzer,text);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -318,7 +318,7 @@
319319 assertTrue(ents1.isTitlesBySuffix());
320320 assertEquals("w",ents1.getInterwikiBySuffix("wiki"));
321321 assertEquals(ents1,IndexId.get("enwiki").getTitlesIndex());
322 - assertEquals("en",testgc.getLanguage(ents1));
 322+ assertEquals("en",ents1.getLangCode());
323323 assertEquals("{wiki=enwiki}",ents1.getSuffixToDbname().toString());
324324 IndexId ents2 = IndexId.get("en-titles.tspart2");
325325 assertEquals("{wikisource=enwikisource, wiktionary=enwiktionary, test=entest}",ents2.getSuffixToDbname().toString());
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/test/HighlightTest.java
@@ -49,7 +49,7 @@
5050 Term[] terms = termSet.toArray(new Term[] {});
5151 IndexSearcher searcher = SearcherCache.getInstance().getLocalSearcher(iid);
5252 int[] df = searcher.docFreqs(terms);
53 - Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false);
 53+ Highlight.highlight(hits,iid,terms,df,searcher.maxDoc(),parser.getWordsClean(),StopWords.getPredefinedSet(iid),false,null,false,false);
5454 }
5555
5656 public static void timeTest(String dbname, String dbnameSrc) throws Exception {
@@ -86,7 +86,7 @@
8787 Document doc = reader.document(docid);
8888 hits.add(doc.get("namespace")+":"+doc.get("title"));
8989 }
90 - Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false);
 90+ Highlight.ResultSet rs = Highlight.highlight(hits,iid,terms,df,maxDoc,words,stopWords,false,null,false,false);
9191 HashMap<String,HighlightResult> res = rs.highlighted;
9292 count += res.size();
9393 if(i!=0 && i % 200 == 0){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/HttpHandler.java
@@ -221,7 +221,7 @@
222222 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
223223 "<head>\n<title>Error: " + code + " " + message + "</title>\n"+
224224 "</head>\n<body>\n<h1>" + code + " " + message + "</h1>\n"+
225 - "<p>" + detail + "</p>\n<hr />\n<p><i>MWSearch on localhost" +
 225+ "<div>" + detail + "</div>\n<hr />\n<p><i>LSearch daemon on localhost" +
226226 "</i></p>\n</body>\n</html>");
227227 }
228228
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -36,6 +36,8 @@
3737 String what;
3838 /** Client-supplied database we should operate on */
3939 String dbname;
 40+
 41+ public static final double CURRENT_VERSION = 2.1;
4042
4143 public SearchDaemon(Socket sock) {
4244 super(sock);
@@ -68,7 +70,8 @@
6971 try{
7072 SearchEngine engine = new SearchEngine();
7173 HashMap query = new QueryStringMap(uri);
72 - SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query);
 74+ double version = getVersion(query);
 75+ SearchResults res = engine.search(IndexId.get(dbname),what,searchterm,query,version);
7376 contentType = "text/plain";
7477 // format:
7578 // <num of hits>
@@ -85,45 +88,49 @@
8689 }
8790 } else{
8891 sendOutputLine(Integer.toString(res.getNumHits()));
89 - SuggestQuery sq = res.getSuggest();
90 - if(sq != null && sq.hasSuggestion()){
91 - sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm()));
92 - } else
93 - sendOutputLine("#no suggestion");
94 - if(res.getTitles() != null){
95 - sendOutputLine("#interwiki "+res.getTitles().size());
96 - for(ResultSet rs : res.getTitles()){
97 - sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle()));
 92+ if(version>=2.1){
 93+ SuggestQuery sq = res.getSuggest();
 94+ if(sq != null && sq.hasSuggestion()){
 95+ sendOutputLine("#suggest ["+sq.getRangesSerialized()+"] "+encode(sq.getSearchterm()));
 96+ } else
 97+ sendOutputLine("#no suggestion");
 98+ if(res.getTitles() != null){
 99+ sendOutputLine("#interwiki "+res.getTitles().size());
 100+ for(ResultSet rs : res.getTitles()){
 101+ sendOutputLine(rs.getScore()+" "+encode(rs.getInterwiki())+" "+rs.getNamespace()+" "+encodeTitle(rs.getTitle()));
 102+ if(rs.getExplanation() != null)
 103+ sendOutputLine(rs.getExplanation().toString());
 104+ if(rs.getHighlight() != null){
 105+ HighlightResult hr = rs.getHighlight();
 106+ sendHighlight("title",hr.getTitle());
 107+ sendHighlightWithTitle("redirect",hr.getRedirect());
 108+ }
 109+ }
 110+ } else
 111+ sendOutputLine("#interwiki 0");
 112+ sendOutputLine("#results");
 113+ }
 114+ for(ResultSet rs : res.getResults()){
 115+ sendResultLine(rs.score, rs.namespace, rs.title);
 116+ if(version>=2.1){
 117+ if(rs.getContext() != null){
 118+ for(String c : rs.getContext())
 119+ sendOutputLine("#context "+c);
 120+ }
98121 if(rs.getExplanation() != null)
99122 sendOutputLine(rs.getExplanation().toString());
100123 if(rs.getHighlight() != null){
101124 HighlightResult hr = rs.getHighlight();
102 - sendHighlight("title",hr.getTitle());
 125+ sendHighlight("title",hr.getTitle());
 126+ for(Snippet sn : hr.getText())
 127+ sendHighlight("text",sn);
103128 sendHighlightWithTitle("redirect",hr.getRedirect());
 129+ sendHighlightWithFragment("section",hr.getSection());
 130+ if(hr.getDate() != null)
 131+ sendHighlight("date",hr.getDate());
 132+ sendHighlight("wordcount",Integer.toString(hr.getWordCount()));
104133 }
105134 }
106 - } else
107 - sendOutputLine("#interwiki 0");
108 - sendOutputLine("#results");
109 - for(ResultSet rs : res.getResults()){
110 - sendResultLine(rs.score, rs.namespace, rs.title);
111 - if(rs.getContext() != null){
112 - for(String c : rs.getContext())
113 - sendOutputLine("#context "+c);
114 - }
115 - if(rs.getExplanation() != null)
116 - sendOutputLine(rs.getExplanation().toString());
117 - if(rs.getHighlight() != null){
118 - HighlightResult hr = rs.getHighlight();
119 - sendHighlight("title",hr.getTitle());
120 - for(Snippet sn : hr.getText())
121 - sendHighlight("text",sn);
122 - sendHighlightWithTitle("redirect",hr.getRedirect());
123 - sendHighlightWithFragment("section",hr.getSection());
124 - if(hr.getDate() != null)
125 - sendHighlight("date",hr.getDate());
126 - sendHighlight("wordcount",Integer.toString(hr.getWordCount()));
127 - }
128135 }
129136 }
130137 } else if(res.getFormat() == Format.JSON){
@@ -170,7 +177,17 @@
171178 }
172179 }
173180
174 -
 181+
 182+ private double getVersion(HashMap query) {
 183+ String v = (String)query.get("version");
 184+ if(v == null)
 185+ v = (String)query.get("ver");
 186+ if(v != null)
 187+ return Double.parseDouble(v);
 188+ return CURRENT_VERSION;
 189+ }
 190+
 191+
175192 private String makeHighlight(String type, Snippet snippet){
176193 if(snippet == null)
177194 return null;
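getVersion() above is the hook for the "compatibility modes for previous versions" item in the commit summary: a missing parameter means a current (2.1) client, while older clients pass version (or ver) and get the flat legacy output. The gating pattern, sketched:

// version < 2.1: plain hit count plus bare result lines, as before;
// version >= 2.1: additionally #suggest/#interwiki/#results blocks,
// per-result context lines and highlight snippets.
double version = getVersion(query);
sendOutputLine(Integer.toString(res.getNumHits()));
if (version >= 2.1) {
    // ... suggestion, interwiki titles, "#results" marker ...
}
for (ResultSet rs : res.getResults()) {
    sendResultLine(rs.score, rs.namespace, rs.title);
    if (version >= 2.1) {
        // ... context, explanation, highlights ...
    }
}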
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -1,6 +1,7 @@
22 package org.wikimedia.lsearch.spell;
33
44 import java.io.IOException;
 5+import java.io.Serializable;
56 import java.util.ArrayList;
67 import java.util.Collection;
78 import java.util.Collections;
@@ -56,6 +57,7 @@
5758 protected NamespaceFilter defaultNs;
5859 protected HashMap<String,Boolean> wordExistCache = new HashMap<String,Boolean>();
5960 protected enum Filtering { STRONG, WEAK };
 61+ protected boolean useLogging = true;
6062
6264 	/** Distance and metaphone metrics */
6264 static public class Metric {
@@ -123,6 +125,8 @@
124126
125127 protected static class Namespaces {
126128 HashSet<Integer> namespaces = new HashSet<Integer>();
 	129+		/** If true, these namespaces are additional to the default namespaces;
 	130+		 * if false, they do not intersect the default namespaces */
127131 boolean additional = false;
128132 String prefix = "ns_";
129133 public Namespaces(HashSet<Integer> namespaces, boolean additional) {
@@ -155,19 +159,26 @@
156160 /** Number of results to fetch for titles */
157161 public static final int POOL_TITLE = 100;
158162 /** Number of results to fetch for fuzzy word matches */
159 - public static final int POOL_FUZZY = 1000;
 163+ public static final int POOL_FUZZY = 500;
160164 /** Number of words to return for fuzzy queries */
161165 public static final int MAX_FUZZY = 50;
162166
163167 /** Lower limit to hit rate for joining */
164168 public static final int JOIN_FREQ = 1;
165169
166 - public Suggest(IndexId iid) throws IOException{
 170+ public Suggest(IndexId iid) throws IOException {
 171+ this(iid,null,true);
 172+ }
 173+
 174+ public Suggest(IndexId iid, IndexSearcher searcher, boolean useLogging) throws IOException{
167175 SearcherCache cache = SearcherCache.getInstance();
168176 this.iid = iid;
169 - this.searcher = cache.getLocalSearcher(iid.getSpell());
 177+ if(searcher == null)
 178+ searcher = cache.getLocalSearcher(iid.getSpell());
 179+ this.searcher = searcher;
170180 this.reader = searcher.getIndexReader();
171181 this.defaultNs = iid.getDefaultNamespace();
 182+ this.useLogging = useLogging;
172183
173184 synchronized(stopWordsIndexes){
174185 if(!stopWordsIndexes.containsKey(searcher)){
@@ -209,18 +220,43 @@
210221 }
211222
212223 /**
 224+ * Extra information that will help disambiguate some suggest cases,
 225+ * e.g. words from titles found in search, phrases found in text, ...
 226+ * @author rainman
 227+ *
 228+ */
 229+ public static class ExtraInfo implements Serializable {
 230+ protected HashSet<String> phrases;
 231+ protected HashSet<String> foundInContext;
 232+ protected HashSet<String> foundInTitles;
 233+ protected int firstRank;
 234+
 235+ public ExtraInfo(HashSet<String> phrases, HashSet<String> foundInContext, HashSet<String> foundInTitles, int firstRank) {
 236+ this.phrases = phrases;
 237+ this.foundInContext = foundInContext;
 238+ this.foundInTitles = foundInTitles;
 239+ this.firstRank = firstRank;
 240+ }
 241+
 242+ public ExtraInfo(){
 243+ this(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0);
 244+ }
 245+
 246+
 247+ }
 248+
 249+ /**
213250 * Make a suggestion for a query
214251 *
215252 * @throws IOException
216253 */
217254 @SuppressWarnings("unchecked")
218 - public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext,
219 - int firstRank, NamespaceFilter nsf) throws IOException{
 255+ public SuggestQuery suggest(String searchterm, ArrayList<Token> tokens, ExtraInfo info, NamespaceFilter nsf) throws IOException{
220256 FilterFactory filters = new FilterFactory(iid);
221257 wordExistCache.clear();
222258 long start = System.currentTimeMillis();
223259
224 - System.out.println("tokens: "+tokens+" inContext:"+foundInContext+" phrases:"+phrases);
 260+ // System.out.println("tokens: "+tokens+" inContext:"+info.foundInContext+" phrases:"+info.phrases+", inTitles="+info.foundInTitles);
225261
226262 if(tokens.size() > 30){
227263 logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start);
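Callers of the reworked suggest() bundle the highlighter-derived hints into one serializable ExtraInfo object instead of four loose parameters. A hypothetical call site:

// phrases, foundInContext and foundInTitles come from highlighting,
// firstRank from the top search hit; together they veto "corrections"
// to words the wiki demonstrably contains.
Suggest.ExtraInfo info = new Suggest.ExtraInfo(phrases, foundInContext, foundInTitles, firstRank);
SuggestQuery sq = suggest.suggest(searchterm, tokens, info, nsf);

// the no-argument form is the conservative default: no hints, rank 0
SuggestQuery sq0 = suggest.suggest(searchterm, tokens, new Suggest.ExtraInfo(), nsf);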
@@ -262,7 +298,6 @@
263299 }
264300
265301 // init suggestions
266 - int minFreq = 0;
267302 ArrayList<Change> suggestions = new ArrayList<Change>();
268303 ArrayList<Change> suggestionsTitle = new ArrayList<Change>();
269304 HashMap<String,HashSet<String>> contextCache = new HashMap<String,HashSet<String>>();
@@ -272,7 +307,7 @@
273308 String redirectTarget = followRedirect(joinTokens,ns);
274309 if(redirectTarget != null){
275310 EditDistance ed = new EditDistance(joinTokens);
276 - if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),firstRank)){
 311+ if(ed.getDistance(redirectTarget) <= 2 && betterRank(titleRank(redirectTarget,ns),info.firstRank)){
277312 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,redirectTarget,tokens);
278313 if(changes != null){
279314 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns);
@@ -291,7 +326,7 @@
292327 logRequest(searchterm,"CORRECT (exact title match)",start);
293328 return new SuggestQuery(searchterm,new ArrayList<Integer>());
294329 }
295 - if(betterRank(r.frequency,firstRank)){
 330+ if(betterRank(r.frequency,info.firstRank)){
296331 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,r.word,tokens);
297332 if(changes != null){
298333 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns);
@@ -312,7 +347,7 @@
313348 if(r.isExactMatch()){
314349 logRequest(searchterm,"CORRECT (by single word index)",start);
315350 return new SuggestQuery(searchterm,new ArrayList<Integer>());
316 - } else if(r.dist == 1 && betterRank(r.frequency,firstRank)){
 351+ } else if(r.dist == 1 && betterRank(r.frequency,info.firstRank)){
317352 HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>();
318353 proposedChanges.put(0,r.word);
319354 SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,new HashSet<Integer>(),ns);
@@ -323,13 +358,13 @@
324359 }
325360
326361 // check if all words are found within phrases during highlighting
327 - if(tokens.size() > 1 && tokens.size() == phrases.size() + 1){
 362+ if(tokens.size() > 1 && tokens.size() == info.phrases.size() + 1){
328363 logRequest(searchterm,"CORRECT (by highlight phrases)",start);
329364 return new SuggestQuery(searchterm,new ArrayList<Integer>());
330365 }
331366
332367 // indexes of words in found during highlighting in phrases
333 - HashSet<Integer> inPhrases = new HashSet<Integer>();
 368+ //HashSet<Integer> inPhrases = new HashSet<Integer>();
334369 // words that might spellcheck to stop words
335370 ArrayList<SuggestResult> possibleStopWords = new ArrayList<SuggestResult>();
336371 // word suggestions
@@ -377,7 +412,7 @@
378413 possibleStopWords.add(null);
379414 }
380415 // suggest split
381 - SuggestResult split = suggestSplit(w,minFreq);
 416+ SuggestResult split = suggestSplit(w,ns);
382417 if(split != null){
383418 Change sc = new Change(split.dist,split.frequency,Change.Type.SPLIT);
384419 sc.substitutes.put(i,split.word.replace("_"," "));
@@ -388,7 +423,7 @@
389424 if(i-1 >= 0
390425 && (wordSug.get(i-1)==null || !wordSug.get(i-1).get(0).isExactMatch())
391426 && (wordSug.get(i)==null || !wordSug.get(i).get(0).isExactMatch())){
392 - SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,minFreq);
 427+ SuggestResult join = suggestJoin(tokens.get(i-1).termText(),w,ns);
393428 if(join != null){
394429 Change sc = new Change(join.dist,join.frequency,Change.Type.JOIN);
395430 sc.substitutes.put(i-1,"");
@@ -459,27 +494,29 @@
460495 int freq = (Integer)ret[0];
461496 boolean inTitle = (Boolean)ret[1];
462497
463 - // log.info("Checking "+phrase);
 498+ //log.debug("Checking "+phrase);
464499 boolean inContext = inContext(s1.word,s2.word,contextCache,allWords,ns) || inContext(s2.word,s1.word,contextCache,allWords,ns);
465500 if(freq > 0 || inContext){
466501 					// number of characters added/subtracted
467502 int diff1 = Math.abs(s1.word.length()-w1.length());
468503 int diff2 = Math.abs(s2.word.length()-w2.length());
469 - log.info("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
 504+ log.debug("Found "+phrase+" at dist="+(s1.dist+s2.dist)+", freq="+freq+" inTitle="+inTitle);
470505 int dist = s1.dist + s2.dist + distOffset;
471506 boolean accept = true;
472507 Change c = new Change(dist,freq,Change.Type.PHRASE);
473508 // register changes
474509 if(s1.word.equals(w1))
475510 c.preserves.put(i,w1);
476 - else if(!good1 || ((inTitle||inContext) && diff1 <=2 && !foundInContext.contains(w1)) )
 511+ else if((!good1 && !info.foundInTitles.contains(w1))
 512+ || ((inTitle||inContext) && diff1 <=2 && !info.foundInContext.contains(w1)) )
477513 c.substitutes.put(i,s1.word);
478514 else
479515 accept = false;
480516
481517 if(s2.word.equals(w2))
482518 c.preserves.put(i2,w2);
483 - else if(!good2 || ((inTitle||inContext) && diff2 <= 2 && !foundInContext.contains(w2)))
 519+ else if((!good2 && !info.foundInTitles.contains(w2))
 520+ || ((inTitle||inContext) && diff2 <= 2 && !info.foundInContext.contains(w2)))
484521 c.substitutes.put(i2,s2.word);
485522 else
486523 accept = false;
@@ -522,7 +559,7 @@
523560 return sq;
524561 }
525562 }
526 - log.info("Spell-checking based on phrases...");
 563+ log.debug("Spell-checking based on phrases...");
527564 // find best suggestion based on phrases
528565 HashMap<Integer,String> preserveTokens = new HashMap<Integer,String>();
529566 HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>();
@@ -544,11 +581,12 @@
545582 for(int i=0;i<tokens.size();i++){
546583 if(preserveTokens.containsKey(i) || proposedChanges.containsKey(i))
547584 continue;
 585+ String w = tokens.get(i).termText();
548586 ArrayList<SuggestResult> sug = wordSug.get(i);
549587 if(sug == null)
550588 continue;
551589 SuggestResult s = sug.get(0);
552 - if(!s.isExactMatch() && acceptWordChange(tokens.get(i).termText(),s)){
 590+ if(!s.isExactMatch() && !info.foundInTitles.contains(w) && acceptWordChange(w,s)){
553591 distance += s.dist;
554592 proposedChanges.put(i,s.word);
555593 if(using.equals("phrases"))
@@ -564,7 +602,7 @@
565603 SuggestResult tr = titleRes.get(0);
566604 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,tr.word,tokens);
567605 if(changes != null){
568 - if(tr.dist <= distance && (betterRank(tr.frequency,firstRank) || proposedChanges.equals(changes))){
 606+ if(tr.dist <= distance && (betterRank(tr.frequency,info.firstRank) || proposedChanges.equals(changes))){
569607 // we found a much better suggestion !
570608 proposedChanges = changes;
571609 alwaysReplace.addAll(proposedChanges.keySet());
@@ -690,6 +728,26 @@
691729 }
692730 return b;
693731 }
 732+ /** Get frequency of a word if it exists (0 if not) */
 733+ private int wordFrequency(String w, Namespaces ns) throws IOException {
 734+ if(ns == null){ // default
 735+ TermDocs td = reader.termDocs(new Term("word",w));
 736+ if(td.next())
 737+ return getFrequency(reader.document(td.doc()),null);
 738+ return 0;
 739+ } else{ // other
 740+ int freq = 0;
 741+ TermDocs td = reader.termDocs(new Term(ns.prefix+"word",w));
 742+ if(td.next())
 743+ freq = getFrequency(reader.document(td.doc()),ns);
 744+ if(ns.additional){ // also look in main
 745+ TermDocs td2 = reader.termDocs(new Term("word",w));
 746+ if(td2.next())
 747+ freq += getFrequency(reader.document(td2.doc()),null);
 748+ }
 749+ return freq;
 750+ }
 751+ }
694752
695753 /** Return true if (stripped) title exists in the index */
696754 private boolean titleExists(String w, Namespaces ns) throws IOException{
@@ -762,8 +820,9 @@
763821 if(w.equals(nt))
764822 continue; // trying to substitute same
765823 // incorrect words, or doesn't stem to same
766 - boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt));
767 - if(!sameStem || (sameStem && !wordExists(w,ns))){
 824+ boolean sameStem = (alwaysReplace.contains(e.getKey()))? false : filters.stemsToSame(FastWikiTokenizerEngine.decompose(w),FastWikiTokenizerEngine.decompose(nt)) || filters.stemsToSame(w,nt);
 825+ //if(!sameStem || (sameStem && !wordExists(w,ns))){
 826+ if(!sameStem){
768827 int so = t.startOffset();
769828 int eo = t.endOffset();
770829 if(so != start)
@@ -940,7 +999,7 @@
9411000 }
9421001 });
9431002
944 - log.info("Sorted changes: "+changes);
 1003+ log.debug("Sorted changes: "+changes);
9451004
9461005 HashMap<Integer,String> accept = new HashMap<Integer,String>();
9471006 HashMap<Integer,String> preserve = new HashMap<Integer,String>();
@@ -971,7 +1030,7 @@
9721031 break;
9731032 }
9741033 if(changesBadWord){
975 - log.info("Considering "+c);
 1034+ log.debug("Considering "+c);
9761035 boolean acceptChange = true;
9771036 for(Entry<Integer,String> e : c.substitutes.entrySet()){
9781037 String acceptedTerm = accept.get(e.getKey());
@@ -983,7 +1042,7 @@
9841043 }
9851044 }
9861045 if(acceptChange && (dist + c.dist < maxDist)){
987 - log.info("Applying "+c);
 1046+ log.debug("Applying "+c);
9881047 processedChange.add(i);
9891048 for(Entry<Integer,String> e : c.substitutes.entrySet()){
9901049 accept.put(e.getKey(),e.getValue());
@@ -1014,7 +1073,7 @@
10151074 }
10161075 }
10171076 if(acceptChange && (dist + c.dist < maxDist)){
1018 - log.info("Applying "+c);
 1077+ log.debug("Applying "+c);
10191078 processedChange.add(i);
10201079 for(Entry<Integer,String> e : c.substitutes.entrySet()){
10211080 accept.put(e.getKey(),e.getValue());
@@ -1058,7 +1117,7 @@
10591118 }
10601119
10611120 /** Merge two result sets */
1062 - public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num){
 1121+ public ArrayList<SuggestResult> mergeResults(ArrayList<SuggestResult> main, ArrayList<SuggestResult> add, int num, Filtering filter){
10631122 // merge
10641123 HashMap<String,SuggestResult> map = new HashMap<String,SuggestResult>();
10651124 ArrayList<SuggestResult> toAdd = new ArrayList<SuggestResult>();
@@ -1074,7 +1133,10 @@
10751134 }
10761135 main.addAll(toAdd);
10771136 // re-sort
1078 - Collections.sort(main,new SuggestResult.Comparator());
 1137+ if(filter == Filtering.WEAK)
 1138+ Collections.sort(main,new SuggestResult.ComparatorNoCommonMisspell());
 1139+ else
 1140+ Collections.sort(main,new SuggestResult.Comparator());
10791141 // trim
10801142 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
10811143 for(int i=0;i<num && i<main.size();i++)
@@ -1097,7 +1159,7 @@
10981160 ArrayList<SuggestResult> res = suggestWordsOnNamespaces(word,word,num,num,namespaces,filter);
10991161 if(namespaces.additional){
11001162 ArrayList<SuggestResult> def = suggestWordsOnNamespaces(word,word,num,num,null,filter); // add from default
1101 - return mergeResults(def,res,num);
 1163+ return mergeResults(def,res,num,filter);
11021164 }
11031165 return res;
11041166 }
@@ -1135,7 +1197,10 @@
11361198 res.add(r);
11371199 }
11381200 // sort
1139 - Collections.sort(res,new SuggestResult.Comparator());
 1201+ if(filter == Filtering.WEAK)
 1202+ Collections.sort(res,new SuggestResult.ComparatorNoCommonMisspell());
 1203+ else
 1204+ Collections.sort(res,new SuggestResult.Comparator());
11401205 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
11411206 for(int i=0;i<num && i<res.size();i++)
11421207 ret.add(res.get(i));
@@ -1148,9 +1213,7 @@
11491214 }
11501215
11511216 private int getFrequency(Document d, Namespaces namespaces) {
1152 - String prefix = "";
1153 - if(namespaces != null) // namespaces=null -> default namespace, empty -> all
1154 - prefix = namespaces.prefix;
 1217+ String prefix = getPrefix(namespaces);
11551218 int freq = 0;
11561219 if(namespaces == null)
11571220 freq = Integer.parseInt(d.get(prefix+"freq"));
@@ -1168,39 +1231,46 @@
11691232 return freq;
11701233 }
11711234
 1235+ /** @return {frequency (int), inTitle (boolean)} */
11721236 private Object[] getPhrase(String phrase, Namespaces namespaces) throws IOException {
1173 - String prefix = "";
1174 - if(namespaces != null) // namespaces=null -> default namespace, empty -> all
1175 - prefix = namespaces.prefix;
1176 -
 1237+ String prefix = getPrefix(namespaces);
11771238 int freq = 0;
1178 - boolean inTitle = false;
1179 - TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase));
1180 - if(td.next()){
1181 - Document d = reader.document(td.doc());
1182 - if(namespaces == null){
1183 - freq = Integer.parseInt(d.get(prefix+"freq"));
 1239+ boolean inTitle = false;
 1240+ // default namespaces
 1241+ if(namespaces == null || namespaces.additional){
 1242+ TermDocs td = reader.termDocs(new Term("phrase",phrase));
 1243+ if(td.next()){
 1244+ Document d = reader.document(td.doc());
 1245+ String f = d.get("freq");
 1246+ freq = Integer.parseInt(f);
11841247 String it = d.get("intitle");
11851248 if(it!=null && it.equals("1"))
11861249 inTitle = true;
1187 - } else{ // all namespaces
1188 - if(namespaces.namespaces.isEmpty()){
1189 - freq = Integer.parseInt(d.get(prefix+"freq"));
1190 - String it = d.get("intitle");
1191 - if(it!=null && it.equals("1"))
1192 - inTitle = true;
1193 -
1194 - } else{
 1250+ }
 1251+ }
 1252+ // other
 1253+ if(namespaces!=null){
 1254+ TermDocs td = reader.termDocs(new Term(prefix+"phrase",phrase));
 1255+ if(td.next()){
 1256+ Document d = reader.document(td.doc());
 1257+ String it = d.get(prefix+"intitle");
 1258+ if(it!=null && it.equals("1"))
 1259+ inTitle = true;
 1260+
 1261+ if(namespaces.namespaces.isEmpty()){ // all
 1262+ String f = d.get(prefix+"freq");
 1263+ if(f != null)
 1264+ freq += Integer.parseInt(f);
 1265+ } else{ // some subset
11951266 for(Integer i : namespaces.namespaces){
11961267 String f = d.get(prefix+"freq_"+i);
1197 - if(f != null){
 1268+ if(f != null)
11981269 freq += Integer.parseInt(f);
1199 - inTitle = true;
1200 - }
1201 - }
 1270+ }
12021271 }
12031272 }
12041273 }
 1274+
12051275 return new Object[] { freq, inTitle};
12061276 }
12071277
@@ -1211,7 +1281,7 @@
12121282 ArrayList<SuggestResult> res = suggestTitlesOnNamespaces(title,num,pool_size,distance,namespaces);
12131283 if(namespaces.additional){
12141284 ArrayList<SuggestResult> main = suggestTitlesOnNamespaces(title,num,pool_size,distance,null);
1215 - return mergeResults(main,res,num);
 1285+ return mergeResults(main,res,num,Filtering.STRONG);
12161286 }
12171287 return res;
12181288 }
@@ -1329,25 +1399,19 @@
13301400 }
13311401
13321402 /** Try to split a word into 2 words which make up a phrase */
1333 - public SuggestResult suggestSplit(String word, int minFreq){
1334 - int freq = 0;
1335 - Hits hits;
 1403+ public SuggestResult suggestSplit(String word, Namespaces ns){
13361404 ArrayList<SuggestResult> res = new ArrayList<SuggestResult>();
13371405 try {
13381406 // find frequency
1339 - hits = searcher.search(new TermQuery(new Term("word",word)));
1340 - if(hits.length() == 1)
1341 - freq = Integer.parseInt(hits.doc(0).get("freq"));
 1407+ int wordFreq = wordFrequency(word,ns);
13421408
13431409 // try different splits
13441410 for(int i=1;i<word.length()-1;i++){
13451411 String phrase = word.substring(0,i) + "_" + word.substring(i);
1346 - hits = searcher.search(new TermQuery(new Term("phrase",phrase)));
1347 - if(hits.length() > 0){
1348 - int pfreq = Integer.parseInt(hits.doc(0).get("freq"));
1349 - if(pfreq >= freq && pfreq > minFreq)
1350 - res.add(new SuggestResult(phrase,pfreq,2));
1351 - }
 1412+ Object[] ret = getPhrase(phrase,ns);
 1413+ int freq = (Integer)ret[0];
 1414+ if(freq > wordFreq)
 1415+ res.add(new SuggestResult(phrase,freq,2));
13521416 }
13531417 if(res.size() > 0){
13541418 Collections.sort(res,new SuggestResult.Comparator());
@@ -1361,14 +1425,13 @@
13621426 }
13631427
13641428 /** Returns suggestion if joining words makes sense */
1365 - public SuggestResult suggestJoin(String word1, String word2, int minFreq){
 1429+ public SuggestResult suggestJoin(String word1, String word2, Namespaces ns){
13661430 try {
1367 - Hits hits = searcher.search(new TermQuery(new Term("word",word1+word2)));
1368 - if(hits.length() > 0){
1369 - int freq = Integer.parseInt(hits.doc(0).get("freq"));
1370 - if(freq >= minFreq)
1371 - return new SuggestResult(word1+word2,freq,1);
1372 - }
 1431+ Object[] ret = getPhrase(word1+"_"+word2,ns);
 1432+ int freqPhrase = (Integer)ret[0];
 1433+ int freqJoin = wordFrequency(word1+word2,ns);
 1434+ if(freqJoin > 0 && freqJoin > freqPhrase)
 1435+ return new SuggestResult(word1+word2,freqJoin,1);
13731436 } catch (IOException e) {
13741437 log.warn("I/O error while suggesting join on "+iid+" : "+e.getMessage());
13751438 e.printStackTrace();
@@ -1379,7 +1442,10 @@
13801443 /** Fetch a set of strings for fuzzy queries */
13811444 public ArrayList<SuggestResult> getFuzzy(String word, NamespaceFilter nsf){
13821445 Namespaces ns = makeNamespaces(nsf);
1383 - ArrayList<SuggestResult> sug = suggestWords(word,POOL_FUZZY,ns,Filtering.WEAK);
 1446+ int pool = POOL_FUZZY;
 1447+ if(word.length() <= 4)
 1448+ pool *= 2;
 1449+ ArrayList<SuggestResult> sug = suggestWords(word,pool,ns,Filtering.WEAK);
13841450 ArrayList<SuggestResult> ret = new ArrayList<SuggestResult>();
13851451 for(int i=0;i<MAX_FUZZY && i<sug.size();i++){
13861452 ret.add(sug.get(i));
@@ -1388,7 +1454,8 @@
13891455 }
13901456
13911457 protected void logRequest(String searchterm, String using, long start){
1392 - log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
 1458+ if(useLogging)
 1459+ log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
13931460 }
13941461
13951462 }
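
The changes above replace the old global minFreq threshold with per-namespace frequency lookups: the new wordFrequency() and the reworked getPhrase() consult the ns_-prefixed fields (plus the default fields when ns.additional is set), and suggestSplit()/suggestJoin() now decide by comparing those frequencies directly. A minimal standalone sketch of the join heuristic, with a toy map standing in for the Lucene spell-check index (all names here are illustrative, not part of the patch):

    import java.util.Map;

    /** Toy model of the join heuristic in Suggest.suggestJoin(): propose joining
     *  two query words only when the concatenation occurs more often than the
     *  corresponding two-word phrase in the queried namespaces. */
    public class JoinHeuristicSketch {
        public static void main(String[] args) {
            // stand-in for the spell-check index: word/phrase -> frequency
            Map<String, Integer> freq = Map.of("database", 120, "data_base", 3);
            int freqJoin = freq.getOrDefault("database", 0);    // wordFrequency(w1+w2, ns)
            int freqPhrase = freq.getOrDefault("data_base", 0); // getPhrase(w1+"_"+w2, ns)
            if (freqJoin > 0 && freqJoin > freqPhrase)
                System.out.println("JOIN: database (freq=" + freqJoin + ", dist=1)");
            else
                System.out.println("keep the words separate");
        }
    }
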
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/SuggestTest.java
@@ -73,15 +73,15 @@
7474 System.out.println(r);
7575 }
7676
77 - System.out.println("SPLIT: "+sc.suggestSplit(text,0));
 77+ System.out.println("SPLIT: "+sc.suggestSplit(text,null));
7878 }
7979 if(last != null){
80 - System.out.println("JOIN: "+sc.suggestJoin(last,text,0));
 80+ System.out.println("JOIN: "+sc.suggestJoin(last,text,null));
8181 }
8282 last = text;
8383 }
8484 }
85 - System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new HashSet<String>(),new HashSet<String>(),0,new NamespaceFilter("0")));
 85+ System.out.println("#suggest: "+sc.suggest(inputtext,parser.tokenizeBareText(inputtext),new Suggest.ExtraInfo(new HashSet<String>(),new HashSet<String>(),new HashSet<String>(),0),new NamespaceFilter("0")));
8686 System.out.println("(finished in "+(System.currentTimeMillis()-start)+" ms)");
8787 }
8888
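
The test shows the migrated entry point: the formerly separate phrases/foundInContext/firstRank arguments are now bundled into a Suggest.ExtraInfo. Judging from the uses of info.foundInTitles, info.foundInContext and info.firstRank in Suggest.java above, the constructor order appears to be (phrases, foundInContext, foundInTitles, firstRank) — an assumption, since the class body is not part of this hunk. A hedged migration sketch for a caller (imports assume the usual lsearch package layout):

    import java.util.ArrayList;
    import java.util.HashSet;
    import org.apache.lucene.analysis.Token;
    import org.wikimedia.lsearch.search.NamespaceFilter;
    import org.wikimedia.lsearch.spell.Suggest;
    import org.wikimedia.lsearch.spell.SuggestQuery;

    class SuggestCallSketch {
        /** Old call: sc.suggest(text, tokens, phrases, foundInContext, firstRank, nsf) */
        SuggestQuery migrated(Suggest sc, String text, ArrayList<Token> tokens) throws Exception {
            Suggest.ExtraInfo info = new Suggest.ExtraInfo(
                    new HashSet<String>(),  // phrases from the query
                    new HashSet<String>(),  // words already found in context
                    new HashSet<String>(),  // words already found in titles (new in r32149)
                    0);                     // rank of the first search hit
            return sc.suggest(text, tokens, info, new NamespaceFilter("0"));
        }
    }
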
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexWriter.java
@@ -14,6 +14,7 @@
1515 import org.apache.lucene.document.Field.Store;
1616 import org.apache.lucene.index.CorruptIndexException;
1717 import org.apache.lucene.index.IndexWriter;
 18+import org.apache.lucene.index.Term;
1819 import org.wikimedia.lsearch.analyzers.Analyzers;
1920 import org.wikimedia.lsearch.analyzers.FieldBuilder;
2021 import org.wikimedia.lsearch.analyzers.FilterFactory;
@@ -44,6 +45,7 @@
4546 protected String langCode;
4647 protected Analyzer analyzer;
4748 protected HashSet<String> stopWords;
 49+ protected NamespaceFilter nsf;
4850
5052 /** Make a new index, and init writer on it (on importPath()) */
5052 public static CleanIndexWriter newForWrite(IndexId iid) throws IOException{
@@ -63,9 +65,10 @@
6466 GlobalConfiguration global = GlobalConfiguration.getInstance();
6567 this.iid = iid;
6668 this.builder = new FieldBuilder(iid,FieldBuilder.Case.IGNORE_CASE,FieldBuilder.Stemmer.NO_STEMMER,FieldBuilder.Options.SPELL_CHECK);
67 - this.langCode = global.getLanguage(iid.getDBname());
 69+ this.langCode = iid.getLangCode();
6870 analyzer = Analyzers.getIndexerAnalyzer(builder);
6971 this.stopWords = StopWords.getPredefinedSet(iid);
 72+ nsf = global.getDefaultNamespace(iid);
7073
7174 HashSet<String> stopWords = new HashSet<String>();
7275 for(String w : StopWords.getStopWords(iid))
@@ -83,6 +86,19 @@
8487 writer.setMaxFieldLength(WikiIndexModifier.MAX_FIELD_LENGTH);
8588 }
8689
 90+ public void deleteArticleInfo(String pageId) throws IOException {
 91+ writer.deleteDocuments(new Term("key",pageId));
 92+ }
 93+
 94+ /** Call this to add information about the article into the index */
 95+ public void addArticleInfo(Article a){
 96+ // only for articles in default namespace(s)
 97+ if(nsf.contains(Integer.parseInt(a.getNamespace())))
 98+ addArticle(a);
 99+ else
 100+ addTitleOnly(a);
 101+ }
 102+
87103 /** Add single article */
88104 protected void addArticle(Article a){
89105 //if(!WikiIndexModifier.checkAddPreconditions(a,langCode))
@@ -102,8 +118,9 @@
103119 }
104120
105121 /** Add title/redirect with ranks information only */
106 - public void addTitleOnly(Article article) {
 122+ protected void addTitleOnly(Article article) {
107123 Document doc = new Document();
 124+ doc.add(new Field("key",article.getIndexKey(),Store.NO,Index.UN_TOKENIZED));
108125 doc.add(new Field("ns_title",article.getTitle(),Store.YES,Index.TOKENIZED));
109126 doc.add(new Field("ns_namespace",article.getNamespace(),Store.YES,Index.UN_TOKENIZED));
110127 doc.add(new Field("ns_rank",Integer.toString(article.getReferences()),Store.YES,Index.NO));
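
With the stored key field and the new deleteArticleInfo()/addArticleInfo() pair, the spell-check precursor index supports the delete-then-readd cycle that incremental updates need, and namespace routing moves from the importer into the writer. A minimal sketch of that cycle, assuming Article.getIndexKey() yields the page id stored in key (consistent with "page_id keys for precursor indexes" in the commit message):

    import java.io.IOException;
    import org.wikimedia.lsearch.beans.Article;
    import org.wikimedia.lsearch.spell.CleanIndexWriter;

    class IncrementalSpellUpdateSketch {
        /** Replace a page's entry in the precursor index: delete by key, then re-add. */
        static void update(CleanIndexWriter writer, Article a) throws IOException {
            writer.deleteArticleInfo(a.getIndexKey()); // assumption: index key == page id
            writer.addArticleInfo(a); // full article in default namespaces, title-only otherwise
        }
    }
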
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/api/SpellCheckIndexer.java
@@ -83,7 +83,7 @@
8484 if(minPhraseFreq < 1)
8585 minPhraseFreq = 1;
8686 this.createNew = createNew;
87 - this.langCode=GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 87+ this.langCode=iid.getLangCode();
8888 this.ngramWriter = new NgramIndexer();
8989 this.registry = IndexRegistry.getInstance();
9090 }
@@ -222,7 +222,7 @@
223223 while((word = dict.next()) != null){
224224 String w = word.getWord();
225225 if(w.contains("_")){ // phrase
226 - addNsPhrase(w,ir);
 226+ addNsPhrase(w,ir,true);
227227 } else{ // word
228228 addNsWord(w,ir);
229229 }
@@ -329,7 +329,7 @@
330330 }
331331
332332 /** Add phrase in namespace other than default */
333 - public void addNsPhrase(String phrase, IndexReader ir) throws IOException {
 333+ public void addNsPhrase(String phrase, IndexReader ir, boolean inTitle) throws IOException {
334334 if(phrase.length() <= 2){
335335 log.warn("Invalid phrase: "+phrase);
336336 return;
@@ -342,6 +342,9 @@
343343 for(Entry<String,SimpleInt> e : freq.entrySet()){
344344 doc.add(new Field("ns_freq_"+e.getKey(), Integer.toString(e.getValue().count), Field.Store.YES, Field.Index.NO));
345345 }
 346+ if(inTitle){
 347+ doc.add(new Field("ns_intitle","1", Field.Store.YES, Field.Index.UN_TOKENIZED));
 348+ }
346349 ngramWriter.addDocument(doc);
347350 }
348351
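
Phrases in non-default namespaces now carry an ns_intitle marker, so the in-title bonus that getPhrase() applies works outside the main namespace as well. A sketch of the read side, mirroring getPhrase() under the assumption that addNsPhrase() indexes the phrase term under the ns_ prefix (the Lucene 2.x TermDocs API matches its usage elsewhere in this patch):

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermDocs;

    class NsPhraseReadSketch {
        /** True if the stored namespace phrase was flagged as occurring in a title. */
        static boolean phraseInTitle(IndexReader reader, String phrase) throws IOException {
            TermDocs td = reader.termDocs(new Term("ns_phrase", phrase)); // assumed field name
            if (td.next()) {
                Document d = reader.document(td.doc());
                return "1".equals(d.get("ns_intitle"));
            }
            return false;
        }
    }
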
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/CleanIndexImporter.java
@@ -47,15 +47,14 @@
4848 CleanIndexWriter writer;
4949 String langCode;
5050 Links links;
51 - NamespaceFilter nsf;
5251
5352 public CleanIndexImporter(IndexId iid, String langCode) throws IOException{
5453 Configuration.open(); // make sure configuration is loaded
5554 this.writer = CleanIndexWriter.newForWrite(iid);
5655 this.langCode = langCode;
57 - this.links = Links.openForRead(iid,iid.getLinks().getImportPath());
58 - nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
59 - log.info("Rebuilding for namespaces: "+nsf);
 56+ this.links = Links.openStandalone(iid);
 57+
 58+ //log.info("Rebuilding for namespaces: "+nsf);
6059 }
6160 public void writeRevision(Revision revision) throws IOException {
6261 this.revision = revision;
@@ -72,8 +71,8 @@
7372 ArrayList<String> redirectsHere = links.getRedirectsTo(key);
7473 references -= redirectsHere.size(); // we want raw rank, without redirects
7574
76 - if(redirectTargetNamespace<0 || !nsf.contains(redirectTargetNamespace))
77 - redirectTo = null; // redirect to other namespace
 75+ if(redirectTargetNamespace<0 || redirectTargetNamespace != page.Title.Namespace)
 76+ redirectTo = null; // redirect to different namespace
7877 }
7978 Date date = new Date(revision.Timestamp.getTimeInMillis());
8079
@@ -88,11 +87,7 @@
8988 Article article = new Article(page.Id,page.Title.Namespace,page.Title.Text,revision.Text,redirectTo,
9089 references,redirectTargetNamespace,redirects,new ArrayList<RelatedTitle>(),anchors,date);
9190
92 - // only for articles in default namespace(s)
93 - if(nsf.contains(page.Title.Namespace))
94 - writer.addArticle(article);
95 - else
96 - writer.addTitleOnly(article);
 91+ writer.addArticleInfo(article);
9792 }
9893
9994 public void close() throws IOException {
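
Two behavioral shifts in the importer: the links index is opened standalone rather than from the import path, and a redirect is now kept only when it targets the page's own namespace (previously: any namespace in the default search set). Isolated, the new filter amounts to this standalone sketch (names illustrative):

    class RedirectFilterSketch {
        /** Keep a redirect target only if it stays within the page's namespace. */
        static String filterRedirect(String redirectTo, int targetNs, int pageNs) {
            if (targetNs < 0 || targetNs != pageNs)
                return null; // redirect to a different namespace is dropped
            return redirectTo;
        }
    }
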
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/related/RelatedBuilder.java
@@ -60,7 +60,7 @@
6161 }
6262 long start = System.currentTimeMillis();
6363 try {
64 - rebuildFromLinksNew(iid);
 64+ rebuildFromLinks(iid);
6565 } catch (IOException e) {
6666 log.fatal("Rebuild I/O error: "+e.getMessage());
6767 e.printStackTrace();
@@ -71,84 +71,9 @@
7272
7373 System.out.println("Finished generating related in "+formatTime(end-start));
7474 }
75 -
76 - @Deprecated
77 - public static void rebuildFromDump(String inputfile, IndexId iid) throws IOException{
78 - GlobalConfiguration global = GlobalConfiguration.getInstance();
79 - String langCode = global.getLanguage(iid);
80 - log.info("First pass, getting a list of valid articles...");
81 - // first pass - titles
82 - InputStream input = null;
83 - input = Tools.openInputFile(inputfile);
84 - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
85 - TitleReader tr = new TitleReader(iid,langCode,nsf);
86 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(tr, 5000));
87 - reader.readDump();
88 - input.close();
89 - CompactLinks links = tr.getTitles();
90 - tr = null; // GC
91 -
92 - log.info("Second pass, geting in/out links...");
93 - // second pass - in/out links
94 - input = Tools.openInputFile(inputfile);
95 - LinkReader rr = new LinkReader(links,iid);
96 - reader = new XmlDumpReader(input,new ProgressFilter(rr, 5000));
97 - reader.readDump();
98 - links.compactAll();
99 - store(links,iid);
100 - }
10175
102 - /**
103 - * Rebuild related articles index for iid
104 - * @throws IOException
105 - */
106 - @Deprecated
107 - public static void rebuildFromLinks(IndexId iid) throws IOException {
108 - CompactLinks links = new CompactLinks();
109 - Links temp = Links.openForRead(iid,iid.getLinks().getImportPath());
110 -
111 - NamespaceFilter nsf = GlobalConfiguration.getInstance().getDefaultNamespace(iid);
112 - log.info("Reading titles in default search");
113 - Dictionary dict = temp.getKeys();
114 - Word w;
115 - HashMap<Integer,CompactArticleLinks> keyCache = new HashMap<Integer,CompactArticleLinks>();
116 - while((w = dict.next()) != null){
117 - String key = w.getWord();
118 - int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
119 - if(nsf.contains(ns)){
120 - links.add(key,temp.getNumInLinks(key));
121 - keyCache.put(temp.getDocId(key),links.get(key));
122 - }
123 - }
124 -
125 - log.info("Reading in/out links");
126 - dict = temp.getKeys();
127 - while((w = dict.next()) != null){
128 - String key = w.getWord();
129 - int ns = Integer.parseInt(key.substring(0,key.indexOf(':')));
130 - if(nsf.contains(ns)){
131 - CompactArticleLinks l = links.get(key);
132 - // inlinks
133 - l.setInLinks(temp.getInLinks(l,keyCache));
134 - // outlinks
135 - ArrayList<CompactArticleLinks> out = new ArrayList<CompactArticleLinks>();
136 - for(String k : temp.getOutLinks(key).toCollection()){
137 - CompactArticleLinks cs = links.get(k);
138 - if(cs != null)
139 - out.add(cs);
140 - }
141 - l.setOutLinks(out);
142 - }
143 - }
144 - temp.close();
145 - temp = null; // GC
146 - keyCache = null; // GC
147 -
148 - store(links,iid);
149 - }
150 -
15176 /** Calculate from links index */
152 - public static void rebuildFromLinksNew(IndexId iid) throws IOException {
 77+ public static void rebuildFromLinks(IndexId iid) throws IOException {
15378 Links links = Links.openForRead(iid,iid.getLinks().getImportPath());
15479 RelatedStorage store = new RelatedStorage(iid);
15580
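
With the dump-based rebuildFromDump() and the old CompactLinks-based variant removed, rebuildFromLinksNew() takes over the plain rebuildFromLinks() name, so related-article data is always computed from the links index. Invocation after this revision (a minimal sketch; "wikilucene" is the example dbrole used elsewhere in this patch, and global configuration is assumed to be loaded as in the other entry points):

    import java.io.IOException;
    import org.wikimedia.lsearch.config.IndexId;
    import org.wikimedia.lsearch.related.RelatedBuilder;

    class RebuildRelatedSketch {
        public static void main(String[] args) throws IOException {
            // assumes configuration has been loaded, as CleanIndexImporter does
            IndexId iid = IndexId.get("wikilucene");
            RelatedBuilder.rebuildFromLinks(iid); // was rebuildFromLinksNew() before r32149
        }
    }
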
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
@@ -132,10 +132,10 @@
133133 }
134134
135135 // inherit javadoc
136 - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException{
 136+ public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int[] df, int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException{
137137 IndexId iid = IndexId.get(dbrole);
138138 try{
139 - return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases);
 139+ return Highlight.highlight(hits,iid,terms,df,maxDoc,words,StopWords.getPredefinedSet(iid),exactCase,null,sortByPhrases,alwaysIncludeFirst);
140140 } catch(IOException e){
141141 throw new RemoteException("IOException on "+dbrole,e);
142142 }
@@ -151,10 +151,10 @@
152152 }
153153 }
154154
155 - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException {
 155+ public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException {
156156 IndexId iid = IndexId.get(dbrole);
157157 try{
158 - return new Suggest(iid).suggest(searchterm,tokens,phrases,foundInContext,firstRank,nsf);
 158+ return new Suggest(iid).suggest(searchterm,tokens,info,nsf);
159159 } catch(Exception e){
160160 e.printStackTrace();
161161 throw new RemoteException("Exception on "+dbrole,e);
@@ -171,6 +171,16 @@
172172 }
173173 }
174174
 175+ public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException {
 176+ IndexId iid = IndexId.get(dbrole);
 177+ try{
 178+ return new SearchEngine().searchRelatedLocal(iid,searchterm,offset,limit);
 179+ } catch(IOException e){
 180+ e.printStackTrace();
 181+ throw new RemoteException("Exception on "+dbrole,e);
 182+ }
 183+ }
 184+
175185 protected RMIMessengerImpl(){
176186 networkStatus = null;
177187 indexRegistry = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -32,6 +32,7 @@
3333 import org.wikimedia.lsearch.search.SuffixFilterWrapper;
3434 import org.wikimedia.lsearch.search.SuffixNamespaceWrapper;
3535 import org.wikimedia.lsearch.search.Wildcards;
 36+import org.wikimedia.lsearch.spell.Suggest;
3637 import org.wikimedia.lsearch.spell.SuggestQuery;
3738 import org.wikimedia.lsearch.spell.SuggestResult;
3839
@@ -247,13 +248,13 @@
248249 }
249250 }
250251
251 - public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases){
 252+ public Highlight.ResultSet highlight(String host, ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst){
252253 try{
253254 RMIMessenger r = messengerFromCache(host);
254 - return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases);
 255+ return r.highlight(hits,dbrole,terms,df,maxDoc,words,exactCase,sortByPhrases,alwaysIncludeFirst);
255256 } catch(Exception e){
256257 e.printStackTrace();
257 - return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0);
 258+ return new Highlight.ResultSet(new HashMap<String,HighlightResult>(),new HashSet<String>(),new HashSet<String>(),false,0,new HashSet<String>());
258259 }
259260 }
260261
@@ -279,10 +280,10 @@
280281 }
281282 }
282283
283 - public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf){
 284+ public SuggestQuery suggest(String host, String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf){
284285 try{
285286 RMIMessenger r = messengerFromCache(host);
286 - return r.suggest(dbrole,searchterm,tokens,phrases,foundInContext,firstRank,nsf);
 287+ return r.suggest(dbrole,searchterm,tokens,info,nsf);
287288 } catch(Exception e){
288289 if(host == null){
289290 log.warn("Cannot find spell-check host for "+dbrole);
@@ -304,9 +305,29 @@
305306 return r.getFuzzy(dbrole,word,nsf);
306307 } catch(Exception e){
307308 e.printStackTrace();
308 - log.warn("Error invoking getFuzzyt() on "+host+" : "+e.getMessage());
 309+ log.warn("Error invoking getFuzzy() on "+host+" : "+e.getMessage());
309310 return new ArrayList<SuggestResult>();
310311 }
311312 }
 313+
 314+ /** dbrole points to the original dbrole, not .related; e.g. wikilucene, not wikilucene.related */
 315+ public SearchResults searchRelated(String host, String dbrole, String searchterm, int offset, int limit){
 316+ try{
 317+ RMIMessenger r = messengerFromCache(host);
 318+ return r.searchRelated(dbrole,searchterm,offset,limit);
 319+ } catch(Exception e){
 320+ e.printStackTrace();
 321+ log.warn("Error invoking searchRelated() on "+host+" : "+e.getMessage());
 322+ if(host!=null && !isLocal(host)){
 323+ if(cache == null)
 324+ cache = SearcherCache.getInstance();
 325+ cache.invalidateSearchable(IndexId.get(dbrole),host);
 326+ }
 327+ SearchResults res = new SearchResults();
 328+ res.setErrorMsg("Error searching related index: "+e.getMessage());
 329+ return res;
 330+ }
 331+
 332+ }
312333
313334 }
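
This is the client half of distributed related search: on failure the cached remote searchable for the host is invalidated, so the load balancer stops routing there, and an error-bearing SearchResults comes back instead of an exception reaching the frontend. A hedged usage sketch (the no-argument constructor and host selection are assumptions; dbrole stays the base index name, per the javadoc above):

    import org.wikimedia.lsearch.beans.SearchResults;
    import org.wikimedia.lsearch.interoperability.RMIMessengerClient;

    class RelatedSearchCallSketch {
        static SearchResults firstPage(String host) {
            RMIMessengerClient messenger = new RMIMessengerClient(); // assumed default ctor
            // dbrole is "wikilucene", not "wikilucene.related" (see method comment)
            return messenger.searchRelated(host, "wikilucene", "apple pie", 0, 20);
        }
    }
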
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java
@@ -22,6 +22,7 @@
2323 import org.wikimedia.lsearch.search.NamespaceFilterWrapper;
2424 import org.wikimedia.lsearch.search.SuffixFilterWrapper;
2525 import org.wikimedia.lsearch.search.SuffixNamespaceWrapper;
 26+import org.wikimedia.lsearch.spell.Suggest;
2627 import org.wikimedia.lsearch.spell.SuggestQuery;
2728 import org.wikimedia.lsearch.spell.SuggestResult;
2829
@@ -133,9 +134,9 @@
134135 * @param maxDoc - max number of documents in the index (needed for idf calculation)
135136 * @param words - main phrase words, gives extra score
136137 * @param exactCase - if this is an exact case query
137 - * @return map: key -> highlighting result
 138+ * @return resultset
138139 */
139 - public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases) throws RemoteException;
 140+ public Highlight.ResultSet highlight(ArrayList<String> hits, String dbrole, Term[] terms, int df[], int maxDoc, ArrayList<String> words, boolean exactCase, boolean sortByPhrases, boolean alwaysIncludeFirst) throws RemoteException;
140141
141142 /**
142143 * Search grouped titles, similar logic to that of searchPart()
@@ -161,7 +162,7 @@
162163 * @return
163164 * @throws RemoteException
164165 */
165 - public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, HashSet<String> phrases, HashSet<String> foundInContext, int firstRank, NamespaceFilter nsf) throws RemoteException;
 166+ public SuggestQuery suggest(String dbrole, String searchterm, ArrayList<Token> tokens, Suggest.ExtraInfo info, NamespaceFilter nsf) throws RemoteException;
166167
167168 /**
168169 * Fetch words for fuzzy queries (e.g. query~)
@@ -172,5 +173,17 @@
173174 * @return
174175 * @throws RemoteException
175176 */
176 - public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException;
 177+ public ArrayList<SuggestResult> getFuzzy(String dbrole, String word, NamespaceFilter nsf) throws RemoteException;
 178+
 179+ /**
 180+ * Search a remote related index
 181+ *
 182+ * @param dbrole
 183+ * @param searchterm
 184+ * @param limit
 185+ * @param offset
 186+ * @return
 187+ * @throws RemoteException
 188+ */
 189+ public SearchResults searchRelated(String dbrole, String searchterm, int offset, int limit) throws RemoteException;
177190 }
Index: branches/lucene-search-2.1/webinterface/lsweb.py
@@ -401,10 +401,10 @@
402402 self.wfile.write('</body></html>')
403403 except HTTPError:
404404 self.send_error(400,'Bad request')
405 - self.wfile.write("Error in query")
 405+ self.wfile.write("<div>Error in query</div>")
406406 except URLError:
407407 self.send_error(500,'Internal Server Error')
408 - self.wfile.write("Cannot connect to lucene search 2 daemon")
 408+ self.wfile.write("<div>Cannot connect to lucene search 2 daemon</div>")
409409 delta_time = time.time() - start_time
410410 print '[%s] Processed query %s in %d ms' %(time.strftime("%Y-%m-%d %H:%M:%S"),self.path,int(delta_time*1000))
411411 elif s[2] == '/':
