r55318 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r55317 | r55318 | r55319 >
Date:11:38, 19 August 2009
Author:rainman
Status:deferred
Tags:
Comment:
Assorted stuff:
* bug 18767 - ./configure does not work properly, it produces garbage
* add experimental support for incremental rsync update of index with daily segment merges, needs a bit more testing
* update mwdumper
Modified paths:
  • /branches/lucene-search-2.1/lib/mwdumper.jar (modified) (history)
  • /branches/lucene-search-2.1/src/org/apache/lucene/analysis/KStemData7.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/IndexDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Configure.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/src/org/apache/lucene/analysis/KStemData7.java
@@ -231,7 +231,7 @@
232232 "shameless","shammy","shampoo","shamrock","shandy",
233233 "shanghai","shank","shantung","shanty","shantytown",
234234 "shape","shaped","shapely","shard","share",
235 -"sharecropper","shareholder","shares","shark","sharkskin",
 235+"sharecropper","shareholder","shark","sharkskin",
236236 "sharp","sharpen","sharpener","sharper","sharpshooter",
237237 "shatter","shave","shaver","shaving","shawl",
238238 "shay","she","sheaf","shear","shears",
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/IndexDaemon.java
@@ -62,7 +62,7 @@
6363 snapshotPrecursors("","true");
6464 }
6565 public void snapshotPrecursors(String pattern){
66 - indexer.makeSnapshotsNow(false,pattern,true);
 66+ indexer.makeSnapshotsNow(true,pattern,true);
6767 }
6868
6969 public void snapshotPrecursors(String pattern, String optimize){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
@@ -61,11 +61,16 @@
6262 long[] timestamps = new long[dbroles.length];
6363 int i=0;
6464 for(String dbrole : dbroles){
65 - LocalIndex li = indexRegistry.getLatestSnapshot(IndexId.get(dbrole));
66 - if(li != null)
67 - timestamps[i++] = li.timestamp;
68 - else
 65+ try{
 66+ LocalIndex li = indexRegistry.getLatestSnapshot(IndexId.get(dbrole));
 67+ if(li != null)
 68+ timestamps[i++] = li.timestamp;
 69+ else
 70+ timestamps[i++] = 0;
 71+ } catch(RuntimeException e){
 72+ log.warn("Error getting snapshot for index "+dbrole, e);
6973 timestamps[i++] = 0;
 74+ }
7075 }
7176 log.debug(" <-/ replying: "+Arrays.toString(timestamps));
7277 return timestamps;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -271,7 +271,7 @@
272272 // rsync
273273 log.info("Starting rsync of "+iid);
274274 String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
275 - Command.exec(rsyncPath+" "+rsyncParams+" -W --delete -r rsync://"+iid.getIndexHost()+snapshotpath+" "+iid.getUpdatePath());
 275+ Command.exec(rsyncPath+" "+rsyncParams+" -W --delete -u -t -r rsync://"+iid.getIndexHost()+snapshotpath+" "+iid.getUpdatePath());
276276 log.info("Finished rsync of "+iid+" in "+(System.currentTimeMillis()-startTime)+" ms");
277277
278278 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -121,7 +121,8 @@
122122 }
123123 for(int i=0;i<reader.maxDoc();i++){
124124 for(CacheBuilder b : builders){
125 - b.cache(i,reader.document(i));
 125+ if( !reader.isDeleted(i) )
 126+ b.cache(i,reader.document(i));
126127 }
127128 }
128129 for(CacheBuilder b : builders){
@@ -504,12 +505,20 @@
505506 ArrayList<InitialDeployer> threads = new ArrayList<InitialDeployer>();
506507
507508 // divide mys list into chunks and assign them to different worker threads
508 - int inc = mys.size() / threadNum + 1;
509 - int start = 0;
 509+ float inc = (float)mys.size() / threadNum;
 510+ if( inc < 1 )
 511+ inc = 1;
 512+ float start = 0;
510513 for(int i=0;i<threadNum;i++){
511 - threads.add(new InitialDeployer(
512 - mys.subList(start, Math.min(start+inc, mys.size()))));
 514+ int end = Math.min((int)(start+inc), mys.size());
 515+ if( i == threadNum-1 )
 516+ end = mys.size(); // take rest of the list
 517+
 518+ threads.add(new InitialDeployer( mys.subList((int)(start), end) ));
513519 start += inc;
 520+ // config error, too many threads
 521+ if( start >= mys.size())
 522+ break;
514523 }
515524
516525 // start all threads
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -106,13 +106,13 @@
107107 boolean optimize = true;
108108
109109 public Pattern(boolean optimize, String pattern, boolean forPrecursors){
110 - this(pattern,forPrecursors,false);
111 - this.optimize = optimize;
 110+ this(pattern,forPrecursors,false,optimize);
112111 }
113 - public Pattern(String pattern, boolean forPrecursors, boolean not){
 112+ public Pattern(String pattern, boolean forPrecursors, boolean not, boolean optimize){
114113 this.pattern = pattern;
115114 this.forPrecursors = forPrecursors;
116115 this.not = not;
 116+ this.optimize = optimize;
117117 }
118118 @Override
119119 public int hashCode() {
@@ -218,7 +218,7 @@
219219 ArrayList<Pattern> rawPatterns = new ArrayList<Pattern>();
220220 synchronized (snapshotPatterns) {
221221 for(Pattern p : snapshotPatterns){ // convert wildcards into regexp
222 - pat.add(new Pattern(StringUtils.wildcardToRegexp(p.pattern),p.forPrecursors,p.pattern.startsWith("^")));
 222+ pat.add(new Pattern(StringUtils.wildcardToRegexp(p.pattern),p.forPrecursors,p.pattern.startsWith("^"),p.optimize));
223223 rawPatterns.add(p);
224224 }
225225 snapshotPatterns.clear();
@@ -247,11 +247,13 @@
248248 try{
249249 if(iid.isLogical())
250250 continue;
251 - if(matchesPattern(pat,iid)){
 251+ Pattern p = matchesPattern(pat,iid);
 252+ if( p != null){
252253 // enforce outer transaction lock to connect optimization & snapshot
253254 lock = iid.getTransactionLock(IndexId.Transaction.INDEX);
254255 lock.lock();
255 - optimizeIndex(iid);
 256+ if( p.optimize )
 257+ optimizeIndex(iid);
256258 makeIndexSnapshot(iid,iid.getIndexPath());
257259 lock.unlock();
258260 lock = null;
@@ -269,7 +271,7 @@
270272 for( IndexId iid : indexes ){
271273 if(iid.isLogical() || badOptimization.contains(iid))
272274 continue;
273 - if(matchesPattern(pat,iid)){
 275+ if(matchesPattern(pat,iid) != null){
274276
275277 registry.refreshSnapshots(iid);
276278 }
@@ -281,16 +283,17 @@
282284 }
283285 }
284286
285 - private boolean matchesPattern(ArrayList<Pattern> pat, IndexId iid) {
 287+ /** Returns the matching pattern or null if none is matching */
 288+ private Pattern matchesPattern(ArrayList<Pattern> pat, IndexId iid) {
286289 String string = iid.toString();
287290 for(Pattern p : pat){
288291 if((iid.isPrecursor() && !p.forPrecursors) ||(!iid.isPrecursor() && p.forPrecursors))
289292 continue;
290293 boolean match = p.pattern.equals("")? true : string.matches(p.pattern);
291294 if((match && !p.not) || (!match && p.not))
292 - return true;
 295+ return p;
293296 }
294 - return false;
 297+ return null;
295298 }
296299
297300 public static void makeIndexSnapshot(IndexId iid, String indexPath){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -644,7 +644,7 @@
645645 // check if it's a valid field
646646 String f = new String(buffer,0,length);
647647
648 - List fieldOperators = getFieldOperators();
 648+ List<String> fieldOperators = getFieldOperators();
649649
650650 if( f.equals(namespaceAllKeyword)
651651 || fieldOperators.contains(f)
@@ -661,8 +661,8 @@
662662 return TokenType.WORD;
663663 }
664664
665 - private List getFieldOperators() {
666 - List fieldOperators = new ArrayList();
 665+ private List<String> getFieldOperators() {
 666+ List<String> fieldOperators = new ArrayList<String>();
667667 fieldOperators.add("intitle");
668668 fieldOperators.add("incategory");
669669 fieldOperators.add("inthread");
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -102,6 +102,9 @@
103103 HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
104104 String defaultTimestamp = "2001-01-01";
105105 boolean errors = false;
 106+ boolean requestSnapshot = false;
 107+ String noOptimizationDBlistFile = null;
 108+ HashSet<String> noOptimizationDBs = new HashSet<String>();
106109
107110 // args
108111 for(int i=0; i<args.length; i++){
@@ -123,6 +126,10 @@
124127 excludeFile = args[++i];
125128 else if(args[i].equals("-n"))
126129 notification = true;
 130+ else if(args[i].equals("-sn"))
 131+ requestSnapshot = true;
 132+ else if(args[i].equals("-nof"))
 133+ noOptimizationDBlistFile = args[++i];
127134 else if(args[i].equals("--help"))
128135 break;
129136 else if(args[i].startsWith("-")){
@@ -135,6 +142,9 @@
136143 dbnames.addAll(global.getMyIndexDBnames());
137144 dbnames.addAll(readDBList(dblist));
138145 excludeList.addAll(readDBList(excludeFile));
 146+ if( noOptimizationDBlistFile != null)
 147+ noOptimizationDBs.addAll(readDBList(noOptimizationDBlistFile));
 148+
139149 if(dbnames.size() == 0){
140150 System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] [-n] [--no-ranks] dbname1 dbname2 ...");
141151 System.out.println("Options:");
@@ -147,7 +157,8 @@
148158 System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
149159 System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
150160 System.out.println(" -ef - exclude db names listed in dblist file");
151 -
 161+ System.out.println(" -sn - immediately make unoptimized snapshot as updates finish ");
 162+ System.out.println(" -nof - use with -sn to specify a file with databases not to be optimized");
152163 return;
153164 }
154165 // preload
@@ -232,10 +243,10 @@
233244 String host = iid.getIndexHost();
234245 boolean req = messenger.requestFlushAndNotify(dbname,host);
235246 if(req){
236 - log.info("Waiting for flush notification");
 247+ log.info("Waiting for flush notification for "+dbname);
237248 Boolean succ = null;
238249 do{
239 - Thread.sleep(3000);
 250+ Thread.sleep(1500);
240251 succ = messenger.isSuccessfulFlush(dbname,host);
241252 if(succ != null){
242253 if(succ){
@@ -248,6 +259,21 @@
249260 }
250261 }
251262 } while(succ == null);
 263+ if(requestSnapshot){
 264+ boolean optimize = !noOptimizationDBs.contains(dbname);
 265+ // snapshot the content and highlight indexes without optimizing them
 266+ String p = dbname+"|"+dbname+".pa*|"+dbname+".ns*|"+dbname+".h*";
 267+ messenger.requestSnapshotAndNotify(host, optimize, p, false);
 268+ log.info("Waiting for snapshot notification for "+dbname);
 269+ while( !messenger.snapshotFinished(host,optimize,p,false) ){
 270+ try {
 271+ Thread.sleep(1500);
 272+ } catch (InterruptedException e) {
 273+ log.warn("Interrupted", e);
 274+ }
 275+ }
 276+ log.info("Snapshot of "+dbname+" successful");
 277+ }
252278 } else
253279 continue main_loop;
254280 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -58,6 +58,7 @@
5959 }
6060
6161 protected void read(URL url) throws IOException {
 62+ log.info("Reading records from "+url);
6263 collector = new IndexUpdatesCollector(iid);
6364 InputStream in = new BufferedInputStream(url.openStream());
6465 parser = new OAIParser(in,collector);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/util/Configure.java
@@ -84,7 +84,7 @@
8585 return Command.exec(new String[] {
8686 "/bin/bash",
8787 "-c",
88 - "cd "+mediawiki+" && (echo \"return \\$"+var+"\" | php maintenance/eval.php)"}).trim();
 88+ "cd "+mediawiki+" && (echo \"return \\$"+var+"\" | php maintenance/eval.php | sed -e 's/^> // ; /^$/d')"}).trim();
8989 }
9090
9191 /** create config file from template, replacing variables
Index: branches/lucene-search-2.1/lib/mwdumper.jar
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Status & tagging log