r55318 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r55317 | r55318 | r55319 >
Date:	11:38, 19 August 2009
Author:	rainman
Status:	deferred
Tags:
Comment:	Assorted stuff: * bug 18767 - ./configure does not work properly, it produces garbage * add experimental support for incremental rsync update of index with daily segment merges, needs a bit more testing * update mwdumper
Modified paths:	/branches/lucene-search-2.1/lib/mwdumper.jar (modified) (history) /branches/lucene-search-2.1/src/org/apache/lucene/analysis/KStemData7.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/IndexDaemon.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history) /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Configure.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/src/org/apache/lucene/analysis/KStemData7.java
—	—	@@ -231,7 +231,7 @@
232	232	"shameless","shammy","shampoo","shamrock","shandy",
233	233	"shanghai","shank","shantung","shanty","shantytown",
234	234	"shape","shaped","shapely","shard","share",
235		~~-"sharecropper","shareholder","shares","shark","sharkskin",~~
	235	+"sharecropper","shareholder","shark","sharkskin",
236	236	"sharp","sharpen","sharpener","sharper","sharpshooter",
237	237	"shatter","shave","shaver","shaving","shawl",
238	238	"shay","she","sheaf","shear","shears",
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/IndexDaemon.java
—	—	@@ -62,7 +62,7 @@
63	63	snapshotPrecursors("","true");
64	64	}
65	65	public void snapshotPrecursors(String pattern){
66		~~- indexer.makeSnapshotsNow(false,pattern,true);~~
	66	+ indexer.makeSnapshotsNow(true,pattern,true);
67	67	}
68	68
69	69	public void snapshotPrecursors(String pattern, String optimize){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
—	—	@@ -61,11 +61,16 @@
62	62	long[] timestamps = new long[dbroles.length];
63	63	int i=0;
64	64	for(String dbrole : dbroles){
65		~~- LocalIndex li = indexRegistry.getLatestSnapshot(IndexId.get(dbrole));~~
66		~~- if(li != null)~~
67		~~- timestamps[i++] = li.timestamp;~~
68		~~- else~~
	65	+ try{
	66	+ LocalIndex li = indexRegistry.getLatestSnapshot(IndexId.get(dbrole));
	67	+ if(li != null)
	68	+ timestamps[i++] = li.timestamp;
	69	+ else
	70	+ timestamps[i++] = 0;
	71	+ } catch(RuntimeException e){
	72	+ log.warn("Error getting snapshot for index "+dbrole, e);
69	73	timestamps[i++] = 0;
	74	+ }
70	75	}
71	76	log.debug(" <-/ replying: "+Arrays.toString(timestamps));
72	77	return timestamps;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
—	—	@@ -271,7 +271,7 @@
272	272	// rsync
273	273	log.info("Starting rsync of "+iid);
274	274	String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
275		~~- Command.exec(rsyncPath+" "+rsyncParams+" -W --delete -r rsync://"+iid.getIndexHost()+snapshotpath+" "+iid.getUpdatePath());~~
	275	+ Command.exec(rsyncPath+" "+rsyncParams+" -W --delete -u -t -r rsync://"+iid.getIndexHost()+snapshotpath+" "+iid.getUpdatePath());
276	276	log.info("Finished rsync of "+iid+" in "+(System.currentTimeMillis()-startTime)+" ms");
277	277
278	278	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
—	—	@@ -121,7 +121,8 @@
122	122	}
123	123	for(int i=0;i<reader.maxDoc();i++){
124	124	for(CacheBuilder b : builders){
125		~~- b.cache(i,reader.document(i));~~
	125	+ if( !reader.isDeleted(i) )
	126	+ b.cache(i,reader.document(i));
126	127	}
127	128	}
128	129	for(CacheBuilder b : builders){
—	—	@@ -504,12 +505,20 @@
505	506	ArrayList<InitialDeployer> threads = new ArrayList<InitialDeployer>();
506	507
507	508	// divide mys list into chunks and assign them to different worker threads
508		~~- int inc = mys.size() / threadNum + 1;~~
509		~~- int start = 0;~~
	509	+ float inc = (float)mys.size() / threadNum;
	510	+ if( inc < 1 )
	511	+ inc = 1;
	512	+ float start = 0;
510	513	for(int i=0;i<threadNum;i++){
511		~~- threads.add(new InitialDeployer(~~
512		~~- mys.subList(start, Math.min(start+inc, mys.size()))));~~
	514	+ int end = Math.min((int)(start+inc), mys.size());
	515	+ if( i == threadNum-1 )
	516	+ end = mys.size(); // take rest of the list
	517	+
	518	+ threads.add(new InitialDeployer( mys.subList((int)(start), end) ));
513	519	start += inc;
	520	+ // config error, too many threads
	521	+ if( start >= mys.size())
	522	+ break;
514	523	}
515	524
516	525	// start all threads
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
—	—	@@ -106,13 +106,13 @@
107	107	boolean optimize = true;
108	108
109	109	public Pattern(boolean optimize, String pattern, boolean forPrecursors){
110		~~- this(pattern,forPrecursors,false);~~
111		~~- this.optimize = optimize;~~
	110	+ this(pattern,forPrecursors,false,optimize);
112	111	}
113		~~- public Pattern(String pattern, boolean forPrecursors, boolean not){~~
	112	+ public Pattern(String pattern, boolean forPrecursors, boolean not, boolean optimize){
114	113	this.pattern = pattern;
115	114	this.forPrecursors = forPrecursors;
116	115	this.not = not;
	116	+ this.optimize = optimize;
117	117	}
118	118	@Override
119	119	public int hashCode() {
—	—	@@ -218,7 +218,7 @@
219	219	ArrayList<Pattern> rawPatterns = new ArrayList<Pattern>();
220	220	synchronized (snapshotPatterns) {
221	221	for(Pattern p : snapshotPatterns){ // convert wildcards into regexp
222		~~- pat.add(new Pattern(StringUtils.wildcardToRegexp(p.pattern),p.forPrecursors,p.pattern.startsWith("^")));~~
	222	+ pat.add(new Pattern(StringUtils.wildcardToRegexp(p.pattern),p.forPrecursors,p.pattern.startsWith("^"),p.optimize));
223	223	rawPatterns.add(p);
224	224	}
225	225	snapshotPatterns.clear();
—	—	@@ -247,11 +247,13 @@
248	248	try{
249	249	if(iid.isLogical())
250	250	continue;
251		~~- if(matchesPattern(pat,iid)){~~
	251	+ Pattern p = matchesPattern(pat,iid);
	252	+ if( p != null){
252	253	// enforce outer transaction lock to connect optimization & snapshot
253	254	lock = iid.getTransactionLock(IndexId.Transaction.INDEX);
254	255	lock.lock();
255		~~- optimizeIndex(iid);~~
	256	+ if( p.optimize )
	257	+ optimizeIndex(iid);
256	258	makeIndexSnapshot(iid,iid.getIndexPath());
257	259	lock.unlock();
258	260	lock = null;
—	—	@@ -269,7 +271,7 @@
270	272	for( IndexId iid : indexes ){
271	273	if(iid.isLogical() || badOptimization.contains(iid))
272	274	continue;
273		~~- if(matchesPattern(pat,iid)){~~
	275	+ if(matchesPattern(pat,iid) != null){
274	276
275	277	registry.refreshSnapshots(iid);
276	278	}
—	—	@@ -281,16 +283,17 @@
282	284	}
283	285	}
284	286
285		~~- private boolean matchesPattern(ArrayList<Pattern> pat, IndexId iid) {~~
	287	+ /** Returns the matching pattern or null if none is matching */
	288	+ private Pattern matchesPattern(ArrayList<Pattern> pat, IndexId iid) {
286	289	String string = iid.toString();
287	290	for(Pattern p : pat){
288	291	if((iid.isPrecursor() && !p.forPrecursors) ||(!iid.isPrecursor() && p.forPrecursors))
289	292	continue;
290	293	boolean match = p.pattern.equals("")? true : string.matches(p.pattern);
291	294	if((match && !p.not) || (!match && p.not))
292		~~- return true;~~
	295	+ return p;
293	296	}
294		~~- return false;~~
	297	+ return null;
295	298	}
296	299
297	300	public static void makeIndexSnapshot(IndexId iid, String indexPath){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
—	—	@@ -644,7 +644,7 @@
645	645	// check if it's a valid field
646	646	String f = new String(buffer,0,length);
647	647
648		~~- List fieldOperators = getFieldOperators();~~
	648	+ List<String> fieldOperators = getFieldOperators();
649	649
650	650	if( f.equals(namespaceAllKeyword)
651	651	|| fieldOperators.contains(f)
—	—	@@ -661,8 +661,8 @@
662	662	return TokenType.WORD;
663	663	}
664	664
665		~~- private List getFieldOperators() {~~
666		~~- List fieldOperators = new ArrayList();~~
	665	+ private List<String> getFieldOperators() {
	666	+ List<String> fieldOperators = new ArrayList<String>();
667	667	fieldOperators.add("intitle");
668	668	fieldOperators.add("incategory");
669	669	fieldOperators.add("inthread");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
—	—	@@ -102,6 +102,9 @@
103	103	HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
104	104	String defaultTimestamp = "2001-01-01";
105	105	boolean errors = false;
	106	+ boolean requestSnapshot = false;
	107	+ String noOptimizationDBlistFile = null;
	108	+ HashSet<String> noOptimizationDBs = new HashSet<String>();
106	109
107	110	// args
108	111	for(int i=0; i<args.length; i++){
—	—	@@ -123,6 +126,10 @@
124	127	excludeFile = args[++i];
125	128	else if(args[i].equals("-n"))
126	129	notification = true;
	130	+ else if(args[i].equals("-sn"))
	131	+ requestSnapshot = true;
	132	+ else if(args[i].equals("-nof"))
	133	+ noOptimizationDBlistFile = args[++i];
127	134	else if(args[i].equals("--help"))
128	135	break;
129	136	else if(args[i].startsWith("-")){
—	—	@@ -135,6 +142,9 @@
136	143	dbnames.addAll(global.getMyIndexDBnames());
137	144	dbnames.addAll(readDBList(dblist));
138	145	excludeList.addAll(readDBList(excludeFile));
	146	+ if( noOptimizationDBlistFile != null)
	147	+ noOptimizationDBs.addAll(readDBList(noOptimizationDBlistFile));
	148	+
139	149	if(dbnames.size() == 0){
140	150	System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] [-n] [--no-ranks] dbname1 dbname2 ...");
141	151	System.out.println("Options:");
—	—	@@ -147,7 +157,8 @@
148	158	System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
149	159	System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
150	160	System.out.println(" -ef - exclude db names listed in dblist file");
151		-
	161	+ System.out.println(" -sn - immediately make unoptimized snapshot as updates finish ");
	162	+ System.out.println(" -nof - use with -sn to specify a file with databases not to be optimized");
152	163	return;
153	164	}
154	165	// preload
—	—	@@ -232,10 +243,10 @@
233	244	String host = iid.getIndexHost();
234	245	boolean req = messenger.requestFlushAndNotify(dbname,host);
235	246	if(req){
236		~~- log.info("Waiting for flush notification");~~
	247	+ log.info("Waiting for flush notification for "+dbname);
237	248	Boolean succ = null;
238	249	do{
239		~~- Thread.sleep(3000);~~
	250	+ Thread.sleep(1500);
240	251	succ = messenger.isSuccessfulFlush(dbname,host);
241	252	if(succ != null){
242	253	if(succ){
—	—	@@ -248,6 +259,21 @@
249	260	}
250	261	}
251	262	} while(succ == null);
	263	+ if(requestSnapshot){
	264	+ boolean optimize = !noOptimizationDBs.contains(dbname);
	265	+ // snapshot the content and highlight indexes without optimizing them
	266	+ String p = dbname+"|"+dbname+".pa|"+dbname+".ns|"+dbname+".h*";
	267	+ messenger.requestSnapshotAndNotify(host, optimize, p, false);
	268	+ log.info("Waiting for snapshot notification for "+dbname);
	269	+ while( !messenger.snapshotFinished(host,optimize,p,false) ){
	270	+ try {
	271	+ Thread.sleep(1500);
	272	+ } catch (InterruptedException e) {
	273	+ log.warn("Interrupted", e);
	274	+ }
	275	+ }
	276	+ log.info("Snapshot of "+dbname+" successful");
	277	+ }
252	278	} else
253	279	continue main_loop;
254	280	}
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java
—	—	@@ -58,6 +58,7 @@
59	59	}
60	60
61	61	protected void read(URL url) throws IOException {
	62	+ log.info("Reading records from "+url);
62	63	collector = new IndexUpdatesCollector(iid);
63	64	InputStream in = new BufferedInputStream(url.openStream());
64	65	parser = new OAIParser(in,collector);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Configure.java
—	—	@@ -84,7 +84,7 @@
85	85	return Command.exec(new String[] {
86	86	"/bin/bash",
87	87	"-c",
88		~~- "cd "+mediawiki+" && (echo \"return \\$"+var+"\" | php maintenance/eval.php)"}).trim();~~
	88	+ "cd "+mediawiki+" && (echo \"return \\$"+var+"\" | php maintenance/eval.php | sed -e 's/^> // ; /^$/d')"}).trim();
89	89	}
90	90
91	91	/** create config file from template, replacing variables
Index: branches/lucene-search-2.1/lib/mwdumper.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Status & tagging log

00:20, 14 September 2011 Meno25 (talk | contribs) changed the status of r55318 [removed: old added: deferred]
01:04, 14 November 2010 Reedy (talk | contribs) changed the status of r55318 [removed: new added: old]