r22014 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22013 | r22014 | r22015 >
Date:22:06, 8 May 2007
Author:rainman
Status:old
Tags:
Comment:
Backward compatibility: wildcard queries and custom namespace filters.
The latter should be avoided since they introduce considerable overhead,
slowing down search 5-10 times.

Maybe remove the checkboxes to pick namespace on search page, since
namespace-based search can be done with prefixes which are fast.

Some minor tweaks to previous commit, typos, etc.
Modified paths:
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Collector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceFilter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -94,11 +94,11 @@
9595 assertTrue(fields.contains("contents"));
9696
9797 // namespace policies
98 - parser = new WikiQueryParser("contents","main",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE);
 98+ parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE);
9999 q = parser.parseRaw("help:making breakfast category:food");
100100 assertEquals("+contents:making +contents:breakfast +category:food",q.toString());
101101
102 - parser = new WikiQueryParser("contents","main",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 102+ parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
103103 q = parser.parseRaw("help:making breakfast category:food");
104104 assertEquals("+namespace:12 +(+contents:making +contents:breakfast +category:food)",q.toString());
105105
@@ -120,7 +120,7 @@
121121
122122 // ====== English Analyzer ========
123123
124 - parser = new WikiQueryParser("contents","main",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 124+ parser = new WikiQueryParser("contents","0",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
125125 q = parser.parseRaw("main_talk:laziness");
126126 assertEquals("+namespace:1 +(contents:laziness contents:lazi^0.5)",q.toString());
127127
@@ -184,7 +184,7 @@
185185 // Tests with actual params :)
186186 // ==================================
187187 Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
188 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 188+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
189189 q = parser.parseTwoPass("beans everyone",null);
190190 assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString());
191191
@@ -233,9 +233,21 @@
234234 q = parser.parseTwoPass("main:1991 category:\"olympic cities\" -all:1990",NamespacePolicy.REWRITE);
235235 assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString());
236236
 237+ q = parser.parseTwoPass("main:ba*",NamespacePolicy.IGNORE);
 238+ assertEquals("contents:ba* title:ba*^2.0",q.toString());
 239+
 240+ q = parser.parseTwoPass("main:ba* all:lele",NamespacePolicy.REWRITE);
 241+ assertEquals("(+(+namespace:0 +contents:ba*) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
 242+
 243+ q = parser.parseTwoPass("main:ba*beans",NamespacePolicy.IGNORE);
 244+ assertEquals("(+contents:ba +(contents:beans contents:bean^0.5)) (+title:ba^2.0 +title:beans^2.0)",q.toString());
 245+
 246+ q = parser.parseTwoPass("*kuta",NamespacePolicy.IGNORE);
 247+ assertEquals("contents:kuta title:kuta^2.0",q.toString());
 248+
237249 // Localization tests
238250 analyzer = Analyzers.getSearcherAnalyzer("sr");
239 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 251+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
240252
241253 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
242254 assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli) +(title:на^2.0 title:na) +(title:википедију^2.0 title:vikipediju))",q.toString());
@@ -244,7 +256,7 @@
245257 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString());
246258
247259 analyzer = Analyzers.getSearcherAnalyzer("th");
248 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 260+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
249261
250262 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
251263 assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^2.0 +title:ไทย^2.0)",q.toString());
@@ -252,6 +264,19 @@
253265 q = parser.parseTwoPass("help:ภาษาไทย",NamespacePolicy.REWRITE);
254266 assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^2.0 +title:ไทย^2.0))",q.toString());
255267
 268+ // Backward compatibility for complex filters
 269+ analyzer = Analyzers.getSearcherAnalyzer("en");
 270+ parser = new WikiQueryParser("contents","0,1,4,12",analyzer,NamespacePolicy.IGNORE);
 271+
 272+ q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE);
 273+ assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString());
 274+
 275+ q = parser.parseTwoPass("beans main:everyone",NamespacePolicy.REWRITE);
 276+ assertEquals("((+(namespace:0 namespace:1 namespace:4 namespace:12) +(contents:beans contents:bean^0.5)) (+namespace:0 +(contents:everyone contents:everyon^0.5))) ((+(namespace:0 namespace:1 namespace:4 namespace:12) +title:beans^2.0) (+namespace:0 +title:everyone^2.0))",q.toString());
 277+
 278+ q = parser.parseTwoPass("beans everyone category:cheeses",NamespacePolicy.REWRITE);
 279+ assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:cheeses)) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0 +category:cheeses))",q.toString());
 280+
256281 } catch(Exception e){
257282 e.printStackTrace();
258283 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -9,6 +9,8 @@
1010 import org.mediawiki.importer.XmlDumpReader;
1111 import org.wikimedia.lsearch.config.Configuration;
1212 import org.wikimedia.lsearch.config.GlobalConfiguration;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.index.IndexThread;
1315 import org.wikimedia.lsearch.util.Localization;
1416 import org.wikimedia.lsearch.util.UnicodeDecomposer;
1517
@@ -30,7 +32,8 @@
3133 String dbname = null;
3234 Boolean optimize = null;
3335 Integer mergeFactor = null, maxBufDocs = null;
34 - boolean newIndex = false;
 36+ boolean newIndex = false, makeSnapshot = false;
 37+ boolean snapshotDb = false;
3538
3639 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
3740
@@ -38,13 +41,15 @@
3942 Logger log = Logger.getLogger(Importer.class);
4043
4144 if(args.length < 2){
42 - System.out.println("Syntax: java Importer [-n] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 45+ System.out.println("Syntax: java Importer [-n] [-s] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
4346 System.out.println("Options: ");
4447 System.out.println(" -n - create a new index (erase the old one if exists)");
 48+ System.out.println(" -s - make index snapshot when finished");
4549 System.out.println(" -l limit_num - add at most limit_num articles");
4650 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
4751 System.out.println(" -m mergeFactor - overrides param from global settings");
4852 System.out.println(" -b maxBufDocs - overrides param from global settings");
 53+ System.out.println(" --snapshot <db> - make snapshot only for dbname");
4954 return;
5055 }
5156 for(int i=0;i<args.length;i++){
@@ -58,52 +63,78 @@
5964 maxBufDocs = Integer.parseInt(args[++i]);
6065 else if(args[i].equals("-n"))
6166 newIndex = true;
62 - else if(inputfile == null)
 67+ else if(args[i].equals("-s"))
 68+ makeSnapshot = true;
 69+ else if(args[i].equals("--snapshot")){
 70+ dbname = args[++i];
 71+ snapshotDb = true;
 72+ break;
 73+ } else if(inputfile == null)
6374 inputfile = args[i];
6475 else if(dbname == null)
6576 dbname = args[i];
6677 else
6778 System.out.println("Unrecognized option: "+args[i]);
6879 }
69 -
70 - if(inputfile == null || dbname == null){
71 - System.out.println("Please specify both input xml file and database name");
72 - return;
73 - }
 80+ if(!snapshotDb){
 81+ if(inputfile == null || dbname == null){
 82+ System.out.println("Please specify both input xml file and database name");
 83+ return;
 84+ }
7485
75 - // preload
76 - UnicodeDecomposer.getInstance();
77 - Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
78 - Localization.loadInterwiki();
79 -
80 - long start = System.currentTimeMillis();
81 -
82 - // open
83 - InputStream input = null;
84 - try {
85 - input = Tools.openInputFile(inputfile);
86 - } catch (IOException e) {
87 - log.fatal("I/O error opening "+inputfile);
 86+ // preload
 87+ UnicodeDecomposer.getInstance();
 88+ Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
 89+ Localization.loadInterwiki();
 90+
 91+ long start = System.currentTimeMillis();
 92+
 93+ // open
 94+ InputStream input = null;
 95+ try {
 96+ input = Tools.openInputFile(inputfile);
 97+ } catch (IOException e) {
 98+ log.fatal("I/O error opening "+inputfile);
 99+ return;
 100+ }
 101+
 102+ // read
 103+ DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex);
 104+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
 105+ try {
 106+ reader.readDump();
 107+ } catch (IOException e) {
 108+ if(!e.getMessage().equals("stopped")){
 109+ log.fatal("I/O error reading dump for "+dbname+" from "+inputfile);
 110+ return;
 111+ }
 112+ }
 113+
 114+ long end = System.currentTimeMillis();
 115+
 116+ log.info("Closing/optimizing index...");
 117+ dp.closeIndex();
 118+
 119+ long finalEnd = System.currentTimeMillis();
 120+
 121+ System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
 122+ System.out.println("Total time: "+formatTime(finalEnd-start));
88123 }
89124
90 - // read
91 - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex);
92 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
93 - try {
94 - reader.readDump();
95 - } catch (IOException e) {
96 - log.warn("I/O error reading dump for "+dbname+" from "+inputfile);
97 - }
98 -
99 - long end = System.currentTimeMillis();
100 -
101 - log.info("Closing/optimizing index...");
102 - dp.closeIndex();
103 -
104 - long finalEnd = System.currentTimeMillis();
105 -
106 - System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
107 - System.out.println("Total time: "+formatTime(finalEnd-start));
 125+ // make snapshot if needed
 126+ if(makeSnapshot || snapshotDb){
 127+ IndexId iid = IndexId.get(dbname);
 128+ if(iid.isMainsplit()){
 129+ IndexThread.makeIndexSnapshot(iid.getMainPart(),iid.getMainPart().getImportPath());
 130+ IndexThread.makeIndexSnapshot(iid.getRestPart(),iid.getRestPart().getImportPath());
 131+ } else if(iid.isSplit()){
 132+ for(String part : iid.getSplitParts()){
 133+ IndexId iidp = IndexId.get(part);
 134+ IndexThread.makeIndexSnapshot(iidp,iidp.getImportPath());
 135+ }
 136+ } else
 137+ IndexThread.makeIndexSnapshot(iid,iid.getImportPath());
 138+ }
108139 }
109140
110141 private static String formatTime(long l) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -165,7 +165,7 @@
166166 for(IndexId iid : mys){
167167 try {
168168 IndexSearcherMul is = getLocalSearcher(iid);
169 - Warmup.warmupIndexSearcher(is,iid);
 169+ Warmup.warmupIndexSearcher(is,iid,false);
170170 } catch (IOException e) {
171171 log.warn("I/O error warming index for "+iid);
172172 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceFilter.java
@@ -76,16 +76,16 @@
7777 return included.get(namespace);
7878 }
7979
 80+ public BitSet getIncluded() {
 81+ return included;
 82+ }
 83+
8084 public int cardinality(){
8185 return included.cardinality();
8286 }
8387
8488 public int getNamespace(){
85 - for(int i=0;i<included.size();i++){
86 - if(included.get(i))
87 - return i;
88 - }
89 - return Integer.MIN_VALUE;
 89+ return included.nextSetBit(0);
9090 }
9191
9292 @Override
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -58,6 +58,7 @@
5959 for(LocalIndex li : forUpdate){
6060 log.debug("Syncing "+li.iid);
6161 rebuild(li); // rsync, update registry, cache
 62+ pending.remove(li.iid.toString());
6263 }
6364 }
6465 }
@@ -69,6 +70,8 @@
7071 protected long queryInterval;
7172 protected SearcherCache cache;
7273 protected long delayInterval;
 74+ /** Pending updates, dbrole -> timestamp */
 75+ protected Hashtable<String,Long> pending = new Hashtable<String,Long>();
7376
7477 protected static UpdateThread instance = null;
7578
@@ -115,6 +118,8 @@
116119
117120 for(int i = 0; i < hiids.size(); i++){
118121 IndexId iid = hiids.get(i);
 122+ if(pending.containsKey(iid.toString()))
 123+ continue; // pending update, ignore
119124 LocalIndex myli = registry.getCurrentSearch(iid);
120125 if(timestamps[i]!= 0 && (myli == null || myli.timestamp < timestamps[i])){
121126 LocalIndex li = new LocalIndex(
@@ -122,10 +127,12 @@
123128 iid.getUpdatePath(),
124129 timestamps[i]);
125130 forUpdate.add(li); // newer snapshot available
 131+ pending.put(iid.toString(),new Long(timestamps[i]));
126132 }
127133 }
128134 }
129 - new DeferredUpdate(forUpdate,delayInterval);
 135+ if(forUpdate.size()>0)
 136+ new DeferredUpdate(forUpdate,delayInterval).start();
130137 }
131138
132139 /** Rsync a remote snapshot to a local one, updates registry, cache */
@@ -165,19 +172,23 @@
166173 File ind = new File(iid.getCanonicalSearchPath());
167174
168175 if(ind.exists()){ // prepare a local hard-linked copy of index
169 - try {
170 - // cp -lr update/dbname/timestamp/* update/dbname/timestamp2/
171 - command = "/bin/cp -lr "+ind.getCanonicalPath()+sep+"*"+" "+updatepath+sep;
172 - log.debug("Running shell command: "+command);
173 - Runtime.getRuntime().exec(command).waitFor();
174 - } catch (Exception e) {
175 - log.error("Error making update hardlinked copy "+updatepath+": "+e.getMessage());
 176+ ind = ind.getCanonicalFile();
 177+ for(File f: ind.listFiles()){
 178+ // a cp -lr command for each file in the index
 179+ command = "/bin/cp -lr "+ind.getCanonicalPath()+sep+f.getName()+" "+updatepath+sep+f.getName();
 180+ try {
 181+ log.debug("Running shell command: "+command);
 182+ Runtime.getRuntime().exec(command).waitFor();
 183+ } catch (Exception e) {
 184+ log.error("Error making update hardlinked copy "+updatepath+": "+e.getMessage());
 185+ continue;
 186+ }
176187 }
177188 }
178189
179190 // rsync
180191 String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
181 - command = "/usr/bin/rsync --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
 192+ command = "/usr/bin/rsync -W --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
182193 log.debug("Running shell command: "+command);
183194 Runtime.getRuntime().exec(command).waitFor();
184195
@@ -218,7 +229,7 @@
219230 /** Update search cache after successful rsync of update version of index */
220231 protected void updateCache(IndexSearcherMul is, LocalIndex li){
221232 // do some typical queries to preload some lucene caches, pages into memory, etc..
222 - Warmup.warmupIndexSearcher(is,li.iid);
 233+ Warmup.warmupIndexSearcher(is,li.iid,true);
223234 // add to cache
224235 cache.invalidateLocalSearcher(li.iid,is);
225236 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -107,7 +107,9 @@
108108 */
109109 public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault){
110110 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
111 - WikiQueryParser parser = new WikiQueryParser("contents","main",analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
 111+ if(nsDefault == null || nsDefault.cardinality() == 0)
 112+ nsDefault = new NamespaceFilter("0"); // default to main namespace
 113+ WikiQueryParser parser = new WikiQueryParser("contents",nsDefault,analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
112114 HashSet<Integer> fields = parser.getFieldNamespaces(searchterm);
113115 NamespaceFilterWrapper nsfw = null;
114116 Query q = null;
@@ -122,8 +124,6 @@
123125 }
124126 else if(fields.size()==0 && nsDefault!=null && nsDefault.cardinality()==1)
125127 nsfw = new NamespaceFilterWrapper(nsDefault);
126 - else if(fields.size()==0) // default: search main namespace
127 - nsfw = new NamespaceFilterWrapper(new NamespaceFilter("0"));
128128
129129 try {
130130 if(nsfw == null){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -28,7 +28,7 @@
2929 protected static Hashtable<String,Terms> langTerms = new Hashtable<String,Terms>();
3030
3131 /** Runs some typical queries on a local index searcher to preload caches, pages into memory, etc .. */
32 - public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid){
 32+ public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay){
3333 log.info("Warming up index "+iid+" ...");
3434 long start = System.currentTimeMillis();
3535
@@ -50,15 +50,15 @@
5151 return;
5252 }
5353 makeNamespaceFilters(is,iid);
54 - warmupSearchTerms(is,iid,count);
 54+ warmupSearchTerms(is,iid,count,useDelay);
5555 long delta = System.currentTimeMillis() - start;
5656 log.info("Warmed up "+iid+" in "+delta+" ms");
5757 }
5858 }
5959
6060 /** Warmup index using some number of simple searches */
61 - protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count) {
62 - WikiQueryParser parser = new WikiQueryParser("contents","main",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 61+ protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
 62+ WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
6363 Terms terms = getTermsForLang(global.getLanguage(iid.getDBname()));
6464
6565 try{
@@ -67,11 +67,18 @@
6868 Hits hits = is.search(q);
6969 for(int j =0; j<20 && j<hits.length(); j++)
7070 hits.doc(j); // retrieve some documents
 71+ if(useDelay){
 72+ if(i<1000)
 73+ Thread.sleep(100);
 74+ else
 75+ Thread.sleep(50);
 76+ }
7177 }
7278 } catch (IOException e) {
7379 log.error("Error warming up local IndexSearcherMul for "+iid);
7480 } catch (ParseException e) {
7581 log.error("Error parsing query in warmup of IndexSearcherMul for "+iid);
 82+ } catch (InterruptedException e) {
7683 }
7784 }
7885
@@ -101,7 +108,7 @@
102109 /** Just run one complex query and rebuild the main namespace filter */
103110 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
104111 try{
105 - WikiQueryParser parser = new WikiQueryParser("contents","main",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 112+ WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
106113 Query q = parser.parseTwoPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE);
107114 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
108115 } catch (IOException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -222,7 +222,7 @@
223223 }
224224 }
225225
226 - protected void deleteDirRecursive(File file){
 226+ protected static void deleteDirRecursive(File file){
227227 if(!file.exists())
228228 return;
229229 else if(file.isDirectory()){
@@ -241,12 +241,9 @@
242242 *
243243 */
244244 protected void makeSnapshot() {
245 - final String sep = Configuration.PATH_SEP;
246245 HashSet<IndexId> indexes = WikiIndexModifier.closeAllModifiers();
247246 IndexRegistry registry = IndexRegistry.getInstance();
248247
249 - DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
250 - String timestamp = df.format(new Date(System.currentTimeMillis()));
251248 log.debug("Making snapshots...");
252249 // check filesystem timestamps (for those for which we are unsure if they are updated)
253250 for( IndexId iid : global.getMyIndex()){
@@ -264,37 +261,44 @@
265262 }
266263 }
267264 for( IndexId iid : indexes ){
268 - log.info("Making snapshot for "+iid);
269 - String index = iid.getIndexPath();
270 - String snapshotdir = iid.getSnapshotPath();
271 - String snapshot = snapshotdir+sep+timestamp;
272 - // cleanup the snapshot dir for this iid
273 - File spd = new File(snapshotdir);
274 - if(spd.exists() && spd.isDirectory()){
275 - File[] files = spd.listFiles();
276 - for(File f: files)
277 - deleteDirRecursive(f);
278 - }
279 - new File(snapshot).mkdirs();
280 - File ind =new File(index);
281 - for(File f: ind.listFiles()){
282 - // use a cp -lr command for each file in the index
283 - String command = "/bin/cp -lr "+index+sep+f.getName()+" "+snapshot+sep+f.getName();
284 - Process copy;
285 - try {
286 - log.debug("Running shell command: "+command);
287 - copy = Runtime.getRuntime().exec(command);
288 - copy.waitFor();
289 - } catch (Exception e) {
290 - log.error("Error making snapshot "+snapshot+": "+e.getMessage());
291 - continue;
292 - }
293 - }
 265+ makeIndexSnapshot(iid,iid.getIndexPath());
294266 registry.refreshSnapshots(iid);
295 - log.info("Made snapshot "+snapshot);
296267 }
297268 }
298269
 270+ public static void makeIndexSnapshot(IndexId iid, String indexPath){
 271+ final String sep = Configuration.PATH_SEP;
 272+ DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
 273+ String timestamp = df.format(new Date(System.currentTimeMillis()));
 274+
 275+ log.info("Making snapshot for "+iid);
 276+ String snapshotdir = iid.getSnapshotPath();
 277+ String snapshot = snapshotdir+sep+timestamp;
 278+ // cleanup the snapshot dir for this iid
 279+ File spd = new File(snapshotdir);
 280+ if(spd.exists() && spd.isDirectory()){
 281+ File[] files = spd.listFiles();
 282+ for(File f: files)
 283+ deleteDirRecursive(f);
 284+ }
 285+ new File(snapshot).mkdirs();
 286+ File ind =new File(indexPath);
 287+ for(File f: ind.listFiles()){
 288+ // use a cp -lr command for each file in the index
 289+ String command = "/bin/cp -lr "+indexPath+sep+f.getName()+" "+snapshot+sep+f.getName();
 290+ Process copy;
 291+ try {
 292+ log.debug("Running shell command: "+command);
 293+ copy = Runtime.getRuntime().exec(command);
 294+ copy.waitFor();
 295+ } catch (Exception e) {
 296+ log.error("Error making snapshot "+snapshot+": "+e.getMessage());
 297+ continue;
 298+ }
 299+ }
 300+ log.info("Made snapshot "+snapshot);
 301+ }
 302+
299303 /**
300304 * @return if there are queued updates
301305 */
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -2,8 +2,10 @@
33
44 import java.io.IOException;
55 import java.util.ArrayList;
 6+import java.util.BitSet;
67 import java.util.HashMap;
78 import java.util.HashSet;
 9+import java.util.Map.Entry;
810
911 import org.apache.lucene.analysis.Analyzer;
1012 import org.apache.lucene.analysis.Token;
@@ -15,6 +17,8 @@
1618 import org.apache.lucene.search.PhraseQuery;
1719 import org.apache.lucene.search.Query;
1820 import org.apache.lucene.search.TermQuery;
 21+import org.apache.lucene.search.WildcardQuery;
 22+import org.wikimedia.lsearch.search.NamespaceFilter;
1923 import org.wikimedia.lsearch.util.UnicodeDecomposer;
2024
2125 /**
@@ -74,7 +78,8 @@
7579 */
7680 public enum NamespacePolicy { LEAVE, IGNORE, REWRITE };
7781 protected HashMap<String,Integer> namespaceMapping;
78 - private String defaultNamespace;
 82+ private String defaultNamespaceName;
 83+ private Query namespaceRewriteQuery;
7984 private NamespacePolicy namespacePolicy;
8085
8186 /** default value for boolean queries */
@@ -110,7 +115,7 @@
111116 * @param analyzer
112117 */
113118 public WikiQueryParser(String field, Analyzer analyzer){
114 - this(field,null,analyzer,NamespacePolicy.LEAVE);
 119+ this(field,(NamespaceFilter)null,analyzer,NamespacePolicy.LEAVE);
115120 }
116121
117122 /**
@@ -122,16 +127,53 @@
123128 * @param nsPolicy
124129 */
125130 public WikiQueryParser(String field, String namespace, Analyzer analyzer, NamespacePolicy nsPolicy){
126 - defaultField = field;
127 - defaultNamespace = namespace;
 131+ this(field,new NamespaceFilter(namespace),analyzer,nsPolicy);
 132+ }
 133+
 134+ public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, NamespacePolicy nsPolicy){
 135+ defaultField = field;
128136 this.analyzer = analyzer;
129137 decomposer = UnicodeDecomposer.getInstance();
130138 tokens = new ArrayList<Token>();
131139 this.namespacePolicy = nsPolicy;
132140 disableTitleAliases = true;
133141 initNamespaces();
 142+ if(nsfilter != null){
 143+ namespaceRewriteQuery = generateRewrite(nsfilter);
 144+ defaultNamespaceName = null;
 145+ if(nsfilter.cardinality()==1){
 146+ Integer in = new Integer(nsfilter.getNamespace());
 147+ // if it has only one namespace, try to get the name of default namespace
 148+ for(Entry<String,Integer> e : namespaceMapping.entrySet()){
 149+ if(in.equals(e.getValue())){
 150+ defaultNamespaceName = e.getKey();
 151+ }
 152+ }
 153+ }
 154+ }
 155+ else{
 156+ namespaceRewriteQuery = null;
 157+ defaultNamespaceName = null;
 158+ }
134159 }
135160
 161+ /** Generate a rewrite query for a collection of namespaces */
 162+ protected Query generateRewrite(NamespaceFilter nsfilter){
 163+ if(nsfilter.cardinality() == 0)
 164+ return null;
 165+ else if(nsfilter.cardinality() == 1)
 166+ return new TermQuery(new Term("namespace",Integer.toString(nsfilter.getNamespace())));
 167+
 168+ BooleanQuery bq = new BooleanQuery();
 169+ BitSet bs = nsfilter.getIncluded();
 170+ // iterate over set bits
 171+ for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){
 172+ bq.add(new TermQuery(new Term("namespace",Integer.toString(i))),
 173+ BooleanClause.Occur.SHOULD);
 174+ }
 175+ return bq;
 176+ }
 177+
136178 /**
137179 * Get a hashset of namespace numbers for fields that are
138180 * valid namespace keys.
@@ -228,8 +270,8 @@
229271 if(length == 0 && ch == ' ')
230272 continue; // ignore whitespaces
231273
232 - // pluses and minuses, underscores can be within words
233 - if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_'){
 274+ // pluses and minuses, underscores can be within words, *,? are for wildcard queries
 275+ if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*' || ch=='?'){
234276 // unicode normalization -> delete accents
235277 decomp = decomposer.decompose(ch);
236278 if(decomp == null)
@@ -353,7 +395,7 @@
354396 }
355397
356398 private final boolean needsRewrite(){
357 - return defaultNamespace != null && namespacePolicy == NamespacePolicy.REWRITE;
 399+ return namespaceRewriteQuery != null && namespacePolicy == NamespacePolicy.REWRITE;
358400 }
359401
360402 /** Parses a clause: (in regexp notation)
@@ -382,7 +424,7 @@
383425
384426 // assume default namespace value on rewrite
385427 if(!returnOnFieldDef && field == null && needsRewrite()){
386 - fieldQuery = getNamespaceQuery(defaultNamespace);
 428+ fieldQuery = namespaceRewriteQuery;
387429 }
388430
389431 mainloop: for( ; cur < queryLength; cur++ ){
@@ -409,7 +451,7 @@
410452 if(field == null || definedExplicitField){
411453 // set field name
412454 field = new String(buffer,0,length);
413 - if((defaultNamespace!=null && field.equals(defaultNamespace)) || field.equals(defaultField)){
 455+ if((defaultNamespaceName!=null && field.equals(defaultNamespaceName)) || field.equals(defaultField)){
414456 field = null;
415457 break; // repeated definition of field, ignore
416458 }
@@ -433,7 +475,7 @@
434476 case WORD:
435477 if(fieldQuery != null){
436478 backToken();
437 - String myfield = (topFieldName != null)? topFieldName : (field !=null)? field : (defaultNamespace!=null)? defaultNamespace : defaultField;
 479+ String myfield = (topFieldName != null)? topFieldName : (field !=null)? field : (defaultNamespaceName!=null)? defaultNamespaceName : defaultField;
438480 fieldsubquery = parseClause(level+1,true,myfield);
439481 } else{
440482 analyzeBuffer();
@@ -561,6 +603,14 @@
562604 return new TermQuery(makeTerm());
563605 }
564606
 607+ // check for wildcard searches, they are also not analyzed/stemmed
 608+ // wildcard signs are allowed only at the end of the word, minimum one letter word
 609+ if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?')){
 610+ Query ret = new WildcardQuery(makeTerm());
 611+ ret.setBoost(defaultBoost);
 612+ return ret;
 613+ }
 614+
565615 if(toplevelOccur == BooleanClause.Occur.MUST_NOT)
566616 aliasOccur = null; // do not add aliases
567617 else
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -44,6 +44,7 @@
4545 log.error("OAI authentication error. Username/password pair not specified in configuration file.");
4646 return null;
4747 }
 48+ log.info("Authenticating ... ");
4849 return new PasswordAuthentication(username,password.toCharArray());
4950 }
5051 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -22,7 +22,6 @@
2323 public class OAIHarvester {
2424 static Logger log = Logger.getLogger(OAIHarvester.class);
2525 protected String urlbase;
26 - protected Authenticator auth;
2726 protected OAIParser parser;
2827 protected IndexUpdatesCollector collector;
2928 protected IndexId iid;
@@ -30,8 +29,8 @@
3130
3231 public OAIHarvester(IndexId iid, String url, Authenticator auth){
3332 this.urlbase = url;
34 - this.auth = auth;
3533 this.iid = iid;
 34+ Authenticator.setDefault(auth);
3635 }
3736
3837 /** Invoke ListRecords from a certain timestamp */
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Collector.java
@@ -16,16 +16,18 @@
1717 }
1818
1919 protected ArrayList<ReportSet> reports = new ArrayList<ReportSet>();
20 - protected long startTime;
 20+ protected long startTime, lastTime;
2121 protected int reportInc; // after how many reports to print out results
2222 protected int curInc; // current increment
2323 protected int total;
 24+ protected int threads;
2425
25 - Collector(int reportInc, int total){
26 - startTime = System.currentTimeMillis();
 26+ Collector(int reportInc, int total, int threads){
 27+ lastTime = startTime = System.currentTimeMillis();
2728 this.reportInc = reportInc;
2829 curInc = 0;
2930 this.total = total;
 31+ this.threads = threads;
3032 }
3133
3234 synchronized public void add(int results, long time){
@@ -44,6 +46,11 @@
4547 results += rs.results;
4648 time += rs.time;
4749 }
 50+ long time1k = 0;
 51+ if(reports.size()>=1000){
 52+ for(int i=reports.size()-1000;i<reports.size();i++)
 53+ time1k += reports.get(i).time;
 54+ }
4855 long now = System.currentTimeMillis();
4956 int sec = (int) ((now-startTime)/1000);
5057 int min = 0;
@@ -52,8 +59,9 @@
5360 sec = sec%60;
5461 }
5562 double pers = (double)(now-startTime)/reports.size();
56 - //double avgtime = (double)time/reports.size();
57 - System.out.format("[%d:%02d %d/%d] %2.1fms : %d results / search\n", min, sec, reports.size(), total, pers, results/reports.size());
 63+ double nowpers = (double)(now-lastTime)/reportInc;
 64+ lastTime = now;
 65+ System.out.format("[%d:%02d %d/%d] %2.1fms : %d results / search (now: %2.1fms, last 1k: %2.1fms)\n", min, sec, reports.size(), total, pers, results/reports.size(), nowpers, time1k/1000.0/threads);
5866 System.out.flush();
5967 }
6068 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -18,6 +18,7 @@
1919 protected Terms terms;
2020 protected int words;
2121 protected String namespace;
 22+ protected String namespaceFilter;
2223
2324 protected int thread; // current thread
2425
@@ -35,12 +36,12 @@
3637 protected static Object sharedLock = new Object();
3738
3839 /** Use this to construct the main thread */
39 - public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace) {
40 - this(host,port,database,verb,terms,words,namespace,0,0);
 40+ public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, String namespaceFilter) {
 41+ this(host,port,database,verb,terms,words,namespace,namespaceFilter,0,0);
4142 }
4243
4344 /** Use this to construct a benchmark thread */
44 - public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, int runs, int thread) {
 45+ public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, String namespaceFilter, int runs, int thread) {
4546 this.host = host;
4647 this.port = port;
4748 this.database = database;
@@ -50,6 +51,7 @@
5152 this.thread = thread;
5253 this.words = words;
5354 this.namespace = namespace;
 55+ this.namespaceFilter = namespaceFilter;
5456 }
5557
5658 /** Start benchmarking on main thread */
@@ -61,10 +63,10 @@
6264 activeThreads = threads;
6365 startTime = System.currentTimeMillis();
6466
65 - collector = new Collector(100,threads*runs);
 67+ collector = new Collector(100,threads*runs,threads);
6668
6769 for(int i=0;i<threads;i++)
68 - new Benchmark(host,port,database,verb,terms,words,namespace,runs,i).start();
 70+ new Benchmark(host,port,database,verb,terms,words,namespace,namespaceFilter,runs,i).start();
6971
7072 // wait until all thread finish
7173 while(activeThreads != 0){
@@ -106,11 +108,17 @@
107109 String query = "";
108110 for(int i=0;i<words;i++){
109111 if(!query.equals(""))
110 - query += " ";
 112+ query += " OR ";
111113 query += terms.next();
112114 }
113 - query = namespace+":"+URLEncoder.encode(query).replaceAll("\\+","%20");
114 - String urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20";
 115+ String urlString;
 116+ if(namespace.equals("")){
 117+ query = URLEncoder.encode(query).replaceAll("\\+","%20");
 118+ urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20&namespaces="+namespaceFilter;
 119+ } else{
 120+ query = namespace+":"+URLEncoder.encode(query).replaceAll("\\+","%20");
 121+ urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20";
 122+ }
115123 try {
116124 URL url;
117125 url = new URL(urlString);
@@ -160,7 +168,8 @@
161169 int port = 8123;
162170 String database = "wikilucene";
163171 String verb = "search";
164 - String namespace = "all";
 172+ String namespace = "main";
 173+ String namespaceFilter= "0";
165174 int runs = 5000;
166175 int threads = 10;
167176 int words = 2;
@@ -180,8 +189,11 @@
181190 runs = Integer.parseInt(args[++i]);
182191 } else if (args[i].equals("-v")) {
183192 database = args[++i];
184 - } else if (args[i].equals("-ns")) {
 193+ } else if (args[i].equals("-n") || args[i].equals("-ns")) {
185194 namespace = args[++i];
 195+ } else if (args[i].equals("-f") ) {
 196+ namespaceFilter = args[++i];
 197+ namespace ="";
186198 } else if (args[i].equals("-w")) {
187199 words = Integer.parseInt(args[++i]);
188200 } else if (args[i].equals("--help")) {
@@ -190,15 +202,19 @@
191203 " -p port (default: "+port+")\n"+
192204 " -d database (default: "+database+")\n"+
193205 " -t threads (default: "+threads+")\n"+
194 - " -n count (default: "+runs+")\n"+
 206+ " -c count (default: "+runs+")\n"+
195207 " -w number of words in query (default: "+words+")\n"+
196208 " -v verb (default: "+verb+")\n"+
197 - " -ns namespace (default: "+namespace+")\n");
 209+ " -n namespace (default: "+namespace+")\n"+
 210+ " -f namespace filter (default: "+namespaceFilter+")\n");
198211 return;
 212+ } else{
 213+ System.out.println("Unrecognized switch: "+args[i]);
 214+ return;
199215 }
200216 }
201217 System.out.println("Running benchmark on "+host+":"+port+" with "+threads+" threads each "+runs+" runs");
202 - Benchmark bench = new Benchmark(host, port, database, verb, terms, words, namespace);
 218+ Benchmark bench = new Benchmark(host, port, database, verb, terms, words, namespace, namespaceFilter);
203219 bench.startBenchmark(threads,runs);
204220 bench.printReport();
205221 }