r44720 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r44719‎ | r44720 | r44721 >
Date:17:05, 17 December 2008
Author:rainman
Status:deferred
Tags:
Comment:
Minor running issues stuck in my local repo:
* prevent spacemap extraction from taking too much processor time; give up if it cannot be properly calculated
* Don't make serialization warning fatal for now
* add original query to suggest logging
* "deployment" flag when there is redundancy in cluster
* prevent searches before initial deployment has finished, since searching can sometimes slow the deployment down too much
* a few more test cases
Modified paths:
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/SampleTerms.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/StreamTerms.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Terms.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/WordTerms.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceCache.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java (modified) (history)
  • /branches/lucene-search-2.1/test/org/wikimedia/lsearch/spell/SuggestUnitTest.java (modified) (history)

Diff [purge]

Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/spell/SuggestUnitTest.java
@@ -1,13 +1,17 @@
22 package org.wikimedia.lsearch.spell;
33
44 import java.io.IOException;
 5+import java.util.HashMap;
 6+import java.util.Map;
 7+import java.util.TreeMap;
58
69 import org.wikimedia.lsearch.config.IndexId;
710 import org.wikimedia.lsearch.search.NamespaceFilter;
 11+import org.wikimedia.lsearch.search.SearcherCache;
 12+import org.wikimedia.lsearch.spell.dist.EditDistance;
813 import org.wikimedia.lsearch.test.WikiTestCase;
914
1015 public class SuggestUnitTest extends WikiTestCase {
11 -
1216 public void testMakeNamespaces() throws IOException {
1317 IndexId iid = IndexId.get("entest");
1418 Suggest sug = new Suggest(iid);
@@ -17,4 +21,19 @@
1822 assertEquals("[0, 100, 2, 4]",sug.makeNamespaces(new NamespaceFilter("0,2,4,100")).namespaces.toString());
1923 }
2024
 25+ public Map<Integer,Integer> getSpaceMap(String str1, String str2){
 26+ EditDistance ed = new EditDistance(str1);
 27+ int d[][] = ed.getMatrix(str2);
 28+ // map: space -> same space in edited string
 29+ TreeMap<Integer,Integer> spaceMap = new TreeMap<Integer,Integer>();
 30+ new Suggest().extractSpaceMap(d,str1.length(),str2.length(),spaceMap,str1,str2);
 31+ return spaceMap;
 32+ }
 33+
 34+ public void testExtractSpaceMap() throws IOException {
 35+ assertEquals("{}",getSpaceMap(".999","0 999").toString());
 36+ assertEquals("{4=3}",getSpaceMap("some string","som estring").toString());
 37+ assertEquals("",getSpaceMap(" a "," b ").toString());
 38+ }
 39+
2140 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java
@@ -321,6 +321,8 @@
322322 if(iid.toString().length()>maxlen)
323323 maxlen = iid.toString().length();
324324 }
 325+ if(cache.thisHostIsDeploying())
 326+ sendOutputLine("This host is being deployed");
325327 for(IndexId iid : mysearch){
326328 if(iid.isLogical())
327329 continue;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java
@@ -9,6 +9,7 @@
1010 import java.util.HashMap;
1111 import java.util.HashSet;
1212 import java.util.Iterator;
 13+import java.util.Map;
1314 import java.util.Set;
1415 import java.util.WeakHashMap;
1516 import java.util.Map.Entry;
@@ -178,6 +179,10 @@
179180 /** Lower limit to hit rate for joining */
180181 public static final int JOIN_FREQ = 1;
181182
 183+ /** use for testing only */
 184+ protected Suggest() {
 185+ }
 186+
182187 public Suggest(IndexId iid) throws IOException {
183188 this(iid,null,true);
184189 }
@@ -280,7 +285,7 @@
281286 log.debug("tokens: "+tokens+" inContext:"+info.foundInContext+" phrases:"+info.phrases+", inTitles="+info.foundInTitles);
282287
283288 if(tokens.size() > 15){
284 - logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start);
 289+ logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start,searchterm);
285290 return new SuggestQuery(searchterm,new ArrayList<Integer>());
286291 }
287292
@@ -310,10 +315,10 @@
311316 }
312317 if(changes.size() > 0){
313318 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns);
314 - logRequest(sq.getSearchterm(),"words only (wildcard or fuzzy query)",start);
 319+ logRequest(sq.getSearchterm(),"words only (wildcard or fuzzy query)",start,searchterm);
315320 return sq;
316321 } else{
317 - logRequest(searchterm,"CORRECT (by words, wildcard or fuzzy query)",start);
 322+ logRequest(searchterm,"CORRECT (by words, wildcard or fuzzy query)",start,searchterm);
318323 return new SuggestQuery(searchterm,new ArrayList<Integer>());
319324 }
320325 }
@@ -332,7 +337,7 @@
333338 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,redirectTarget,tokens);
334339 if(changes != null){
335340 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns);
336 - logRequest(sq.getSearchterm(),"titles (via redirect)",start);
 341+ logRequest(sq.getSearchterm(),"titles (via redirect)",start,searchterm);
337342 return sq;
338343 }
339344 }
@@ -360,14 +365,14 @@
361366 if(titleRes.size()>0 && (titleRes.get(0).dist<2 || (correctByPhrases && titleRes.get(0).dist<=2))){
362367 SuggestResult r = titleRes.get(0);
363368 if(r.isExactMatch()){
364 - logRequest(searchterm,"CORRECT (exact title match)",start);
 369+ logRequest(searchterm,"CORRECT (exact title match)",start,searchterm);
365370 return new SuggestQuery(searchterm,new ArrayList<Integer>());
366371 }
367372 if(betterRank(r.frequency,info.firstRank)){
368373 HashMap<Integer,String> changes = extractTitleChanges(joinTokens,r.word,tokens);
369374 if(changes != null){
370375 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns);
371 - logRequest(sq.getSearchterm(),"titles (misspell)",start);
 376+ logRequest(sq.getSearchterm(),"titles (misspell)",start,searchterm);
372377 return sq;
373378 }
374379 }
@@ -382,7 +387,7 @@
383388 if(singleWordSug.size() > 0){
384389 SuggestResult r = singleWordSug.get(0);
385390 if(r.isExactMatch()){
386 - logRequest(searchterm,"CORRECT (by single word index)",start);
 391+ logRequest(searchterm,"CORRECT (by single word index)",start,searchterm);
387392 return new SuggestQuery(searchterm,new ArrayList<Integer>());
388393 } else{ //if(r.dist <= 1 && betterRank(r.frequency,info.firstRank)){
389394 SuggestResult best = null;
@@ -405,7 +410,7 @@
406411 HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>();
407412 proposedChanges.put(0,best.word);
408413 SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,new HashSet<Integer>(),ns);
409 - logRequest(sq.getSearchterm(),"single word misspell",start);
 414+ logRequest(sq.getSearchterm(),"single word misspell",start,searchterm);
410415 return sq;
411416 }
412417 }
@@ -633,7 +638,7 @@
634639 // check
635640 if( titleExists(proposedTitle.toString(),ns) ){
636641 SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns);
637 - logRequest(sq.getSearchterm(),"phrases (title match)",start);
 642+ logRequest(sq.getSearchterm(),"phrases (title match)",start,searchterm);
638643 return sq;
639644 }
640645 }
@@ -719,13 +724,13 @@
720725 if(redirectTarget != null){
721726 String prop = followRedirect(joinTokens(" ",tokens,proposedChanges),ns);
722727 if(prop != null && prop.equals(redirectTarget)){
723 - logRequest(searchterm,"CORRECT (spellcheck to redirect to same article)",start);
 728+ logRequest(searchterm,"CORRECT (spellcheck to redirect to same article)",start,searchterm);
724729 return new SuggestQuery(searchterm,new ArrayList<Integer>());
725730 }
726731 }
727732
728733 SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,alwaysReplace,ns);
729 - logRequest(sq.getSearchterm(),using,start);
 734+ logRequest(sq.getSearchterm(),using,start,searchterm);
730735 return sq;
731736 }
732737
@@ -936,7 +941,8 @@
937942 EditDistance ed = new EditDistance(joined);
938943 int d[][] = ed.getMatrix(corrected);
939944 // map: space -> same space in edited string
940 - HashMap<Integer,Integer> spaceMap = new HashMap<Integer,Integer>();
 945+ HashMap<Integer,Integer> spaceMap = new HashMap<Integer,Integer>();
 946+ spaceMapCalls = 0;
941947 extractSpaceMap(d,joined.length(),corrected.length(),spaceMap,joined,corrected);
942948 // indexes where spaces are in the edited string
943949 ArrayList<Integer> spaces = new ArrayList<Integer>();
@@ -995,8 +1001,16 @@
9961002 return acceptWord(r,metric);
9971003 }
9981004
 1005+ protected int spaceMapCalls = 0;
 1006+
9991007 /** Transverse the cost matrix and extract mapping of old vs new spaces */
1000 - final protected void extractSpaceMap(int[][] d, int i, int j, HashMap<Integer,Integer> spaceMap, String str1, String str2) {
 1008+ final protected void extractSpaceMap(int[][] d, int i, int j, Map<Integer,Integer> spaceMap, String str1, String str2) {
 1009+ spaceMapCalls++;
 1010+ if(spaceMapCalls > 100000){
 1011+ log.warn("Long SpaceMap call: str1="+str1+", str2="+str2);
 1012+ // FIXME !!
 1013+ return;
 1014+ }
10011015 int cost = d[i][j];
10021016 if(i == 0 || j == 0)
10031017 return;
@@ -1551,9 +1565,9 @@
15521566 return ret;
15531567 }
15541568
1555 - protected void logRequest(String searchterm, String using, long start){
 1569+ protected void logRequest(String searchterm, String using, long start, String original){
15561570 if(useLogging)
1557 - log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
 1571+ log.info(iid+" for original=["+ original +"] suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms");
15581572 }
15591573
15601574 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java
@@ -136,7 +136,7 @@
137137 log.debug("Got new RMI messenger for host "+host);
138138 return r;
139139 } catch (RemoteException e) {
140 - log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage(),e);
 140+ log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage());
141141 throw e;
142142 } catch (NotBoundException e) {
143143 log.warn("No RMIMessenger instance at host "+host+" : "+e.getMessage(),e);
@@ -407,4 +407,27 @@
408408 log.error("Messenger not bound: "+e.getMessage(),e);
409409 }
410410 }
 411+
 412+ public void hostDeployed(String host, String myHost) throws RemoteException {
 413+ RMIMessenger r;
 414+ try {
 415+ r = messengerFromCache(host);
 416+ r.hostDeployed(myHost);
 417+ } catch(NotBoundException e){
 418+ e.printStackTrace();
 419+ log.error("Messenger not bound: "+e.getMessage(),e);
 420+ }
 421+ }
 422+
 423+ public void hostDeploying(String host, String myHost) throws RemoteException {
 424+ RMIMessenger r;
 425+ try {
 426+ r = messengerFromCache(host);
 427+ r.hostDeploying(myHost);
 428+ } catch(NotBoundException e){
 429+ e.printStackTrace();
 430+ log.error("Messenger not bound: "+e.getMessage(),e);
 431+ }
 432+ }
 433+
411434 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java
@@ -255,4 +255,15 @@
256256 * @throws RemoteException
257257 */
258258 public void addLocalizationCustomMapping(Map<Integer,String> namespaceIndexToName, String dbname) throws RemoteException;
 259+
 260+ /**
 261+ * Signalize that the host is deploying and that is shouldn't be bugged with searches
 262+ *
 263+ * @param host
 264+ * @throws RemoteException
 265+ */
 266+ public void hostDeploying(String host) throws RemoteException;
 267+
 268+ /** Remote host has been deployed */
 269+ public void hostDeployed(String host) throws RemoteException;
259270 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java
@@ -242,6 +242,19 @@
243243
244244 }
245245
 246+ public void hostDeployed(String host) throws RemoteException {
 247+ if(cache == null)
 248+ cache = SearcherCache.getInstance();
 249+ cache.hostDeployed(host);
 250+
 251+ }
 252+
 253+ public void hostDeploying(String host) throws RemoteException {
 254+ if(cache == null)
 255+ cache = SearcherCache.getInstance();
 256+ cache.hostDeploying(host);
 257+ }
 258+
246259 protected RMIMessengerImpl(){
247260 networkStatus = null;
248261 indexRegistry = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java
@@ -58,7 +58,7 @@
5959 }
6060 return true;
6161 } catch (RemoteException e) {
62 - e.printStackTrace();
 62+ log.warn("Remote error unbinding iid="+iid,e);
6363 } catch (NotBoundException e) {
6464 }
6565 return false;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/StreamTerms.java
@@ -48,5 +48,10 @@
4949 }
5050 }
5151 }
 52+
 53+ public int termCount() {
 54+ // TODO Auto-generated method stub
 55+ return 0;
 56+ }
5257
5358 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/WordTerms.java
@@ -68,4 +68,10 @@
6969 return words.get((int)(Math.random()*words.size()));
7070 }
7171
 72+ public int termCount() {
 73+ return words.size();
 74+ }
 75+
 76+
 77+
7278 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Terms.java
@@ -2,4 +2,6 @@
33
44 public interface Terms {
55 public String next();
 6+
 7+ public int termCount();
68 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/SampleTerms.java
@@ -965,4 +965,10 @@
966966 "volcom",
967967 "Charlotte ross"
968968 };
 969+
 970+ public int termCount() {
 971+ return terms.length;
 972+ }
 973+
 974+
969975 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -598,7 +598,7 @@
599599 }
600600 }
601601 }
602 - if(searchHosts.isEmpty()){
 602+ if(searchHosts.isEmpty() && !dbrole.endsWith(".links") && !dbrole.endsWith(".related")){
603603 // assign to search orphans host
604604 searchHosts.addAll(searchOrphans);
605605 }
@@ -1556,5 +1556,14 @@
15571557 return IndexId.get(commonsWiki);
15581558 }
15591559
 1560+ /** Get all searchers (NOTE: this is kindof slow...) */
 1561+ public HashSet<String> getAllSearchHosts(){
 1562+ HashSet<String> hosts = new HashSet<String>();
 1563+ for(IndexId iid : indexIdPool.values()){
 1564+ hosts.addAll(iid.getSearchHosts());
 1565+ }
 1566+ return hosts;
 1567+ }
 1568+
15601569
15611570 }
\ No newline at end of file
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java
@@ -15,7 +15,10 @@
1616 import org.apache.lucene.index.Term;
1717 import org.apache.lucene.search.Hits;
1818 import org.apache.lucene.search.Query;
 19+import org.apache.lucene.search.SearchableMul;
 20+import org.apache.lucene.search.Searcher;
1921 import org.apache.lucene.search.TermQuery;
 22+import org.apache.lucene.search.TopDocs;
2023 import org.wikimedia.lsearch.analyzers.Analyzers;
2124 import org.wikimedia.lsearch.analyzers.FieldBuilder;
2225 import org.wikimedia.lsearch.analyzers.FieldNameFactory;
@@ -179,13 +182,35 @@
180183 FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder();
181184 WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null);
182185
183 - try{
 186+ ArrayList<SearchableMul> searchers = new ArrayList<SearchableMul>();
 187+ SearcherCache cache = SearcherCache.getInstance();
 188+ for(IndexId piid : iid.getDB().getPhysicalIndexIds()){
 189+ if(piid == iid)
 190+ searchers.add(is);
 191+ else if(piid.isMySearch()){
 192+ try {
 193+ searchers.add(cache.getLocalSearcher(piid));
 194+ } catch (Exception e) {
 195+ log.warn("Error retrieving local searcher part "+piid+" for warmup", e);
 196+ }
 197+ }
 198+ }
 199+
 200+ try{
 201+ Searcher searcher = null;
 202+ if(searchers.size()<=1)
 203+ searcher = is;
 204+ else
 205+ searcher = new MultiSearcherMul(searchers.toArray(new SearchableMul[]{}));
 206+
184207 Terms terms = getTermsForLang(lang);
 208+ log.info("Warming up with "+terms.termCount()+" terms");
185209 for(int i=0; i < count ; i++){
 210+ String searchterm = terms.next();
 211+ long start = System.currentTimeMillis();
186212 Query q = parser.parse(terms.next());
187 - Hits hits = is.search(q);
188 - for(int j =0; j<20 && j<hits.length(); j++)
189 - hits.doc(j); // retrieve some documents
 213+ TopDocs hits = searcher.search(q,null,20);
 214+ new SearchEngine().makeSearchResults((SearchableMul)searcher,hits,0,20,iid,searchterm,q,start,false);
190215 if(useDelay){
191216 if(i<1000)
192217 Thread.sleep(100);
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -113,6 +113,11 @@
114114 searchOnly = true;
115115 NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces"));
116116 SearchResults res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly);
 117+ if(!res.isSuccess()){
 118+ // note failed search
 119+ if(SearchServer.stats != null)
 120+ SearchServer.stats.add(false, 0, SearchDaemon.getOpenCount());
 121+ }
117122 /*if(res!=null && res.isRetry()){
118123 int retries = 1;
119124
@@ -738,6 +743,10 @@
739744 return;
740745 if(!nsfw.hasNamespaceFilter())
741746 return; // query on many overlapping namespaces, won't try to spellcheck to not mess things up
 747+ if(isNumber(searchterm))
 748+ return; // don't suggest numbers...
 749+ // strip unnecessary spaces
 750+ searchterm = searchterm.replaceAll(" +"," ");
742751 // suggest !
743752 res.setSuggest(null);
744753 ArrayList<Token> tokens = parser.tokenizeForSpellCheck(parser.extractPrefixFilter(searchterm));
@@ -754,6 +763,14 @@
755764 res.addInfo("suggest",formatHost(host));
756765 }
757766 }
 767+
 768+ protected boolean isNumber(String s){
 769+ for(char c : s.toCharArray()){
 770+ if(!(Character.isDigit(c) || c=='.' || c==','))
 771+ return false;
 772+ }
 773+ return true;
 774+ }
758775
759776 protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, FilterWrapper nsfw, boolean searchAll, Wildcards wildcards) throws ParseException {
760777 Query q = null;
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -74,9 +74,11 @@
7575 // get the new snapshots via rsync, might be lengthy
7676 for(LocalIndex li : forUpdate){
7777 try{
78 - log.debug("Syncing "+li.iid);
79 - rebuild(li,type); // rsync, update registry, cache
80 - pending.remove(li.iid.toString());
 78+ synchronized (threadSerialization){
 79+ log.debug("Syncing "+li.iid);
 80+ rebuild(li,type); // rsync, update registry, cache
 81+ pending.remove(li.iid.toString());
 82+ }
8183 } catch(Exception e){
8284 e.printStackTrace();
8385 log.error("Error syncing "+li+" : "+e.getMessage(),e);
@@ -100,7 +102,13 @@
101103 protected String rsyncPath = null;
102104 protected String rsyncParams = null;
103105 protected long numChecks = 0;
 106+ /** If localhost should be *always* taken out of rotation */
 107+ protected boolean forceLocalDeployment = false;
 108+ /** If old update/ dirs should be deleted once the new index is deployed */
 109+ protected boolean deleteOldUpdates = false;
104110
 111+ protected static Object threadSerialization = new Object();
 112+
105113 @Override
106114 public void run() {
107115 long lastCheck, now;
@@ -217,6 +225,18 @@
218226 }
219227 new File(updatepath).mkdirs();
220228 try{
 229+ if(forceLocalDeployment){
 230+ cache.hostDeploying("localhost");
 231+ String myHost = global.getLocalhost();
 232+ for(String host : global.getAllSearchHosts()){
 233+ try{
 234+ if(!host.equals(myHost))
 235+ messenger.hostDeploying(host,myHost);
 236+ } catch(Exception e){
 237+ log.warn("Error notifying host "+host+" of index deployment: "+e.getMessage(),e);
 238+ }
 239+ }
 240+ }
221241 // if local, use cp -lr instead of rsync
222242 if(global.isLocalhost(iid.getIndexHost())){
223243 FSUtils.createHardLinkRecursive(
@@ -261,21 +281,39 @@
262282 // notify all remote searchers of change
263283 messenger.notifyIndexUpdated(iid,iid.getDBSearchHosts());
264284
 285+ // cleanup old index updates if neccessary
 286+ if(deleteOldUpdates && myli != null){
 287+ deleteDirRecursive(new File(iid.getUpdatePath()+Configuration.PATH_SEP+myli.timestamp));
 288+ }
 289+
265290 } catch(IOException ioe){
266291 ioe.printStackTrace();
267292 log.error("I/O error updating index "+iid+" at "+li.path+" : "+ioe.getMessage(),ioe);
268293 badIndexes.put(li.iid.toString(),li.timestamp);
 294+ } finally {
 295+ if(forceLocalDeployment){
 296+ cache.hostDeployed("localhost");
 297+ String myHost = global.getLocalhost();
 298+ for(String host : global.getAllSearchHosts()){
 299+ try{
 300+ if(!host.equals(myHost))
 301+ messenger.hostDeployed(host,myHost);
 302+ } catch(Exception e){
 303+ log.warn("Error notifying host "+host+" of end of deployment: "+e.getMessage(),e);
 304+ }
 305+ }
 306+ }
269307 }
270308 }
271309
272310 /** Update searcher cache after warming up searchers */
273311 protected void warmupAndDeploy(SearcherCache.SearcherPool pool, LocalIndex li, RebuildType type){
 312+ boolean reroute = false;
274313 try{
275314 // see if we can go ahead and deploy the searcher or should we wait
276315 IndexId iid = li.iid;
277316 HashSet<String> group = iid.getSearchHosts();
278 - int succ = 0, fail = 0;
279 - boolean reroute = false;
 317+ int succ = 0, fail = 0;
280318 long waitedSoFar = 0;
281319 if(type == RebuildType.FULL){
282320 // never deploy more than one searcher of iid in a search group
@@ -318,8 +356,16 @@
319357
320358 // reoute queries to other servers
321359 if( reroute ){
322 - log.info("Deploying "+iid);
323 - beingDeployed.add(iid.toString());
 360+ String myHost = global.getLocalhost();
 361+ log.info("Deploying "+iid+" on "+myHost);
 362+ beingDeployed.add(iid.toString());
 363+ /* for(String host : global.getAllSearchHosts()){
 364+ try{
 365+ messenger.hostDeploying(host,myHost);
 366+ } catch(Exception e){
 367+ log.warn("Error notifying host "+host+" of index deployment: "+e.getMessage(),e);
 368+ }
 369+ } */
324370 try{
325371 //RMIServer.unbind(iid,cache.getLocalSearcherPool(iid));
326372 } catch(Exception e) {
@@ -337,7 +383,7 @@
338384 //Warmup.warmupIndexSearcher(is,li.iid,true,1);
339385 //Warmup.waitForAggregate(pool.searchers);
340386 // do proper warmup
341 - Warmup.warmupIndexSearcher(is,li.iid,true,null);
 387+ Warmup.warmupIndexSearcher(is,li.iid,false,null);
342388 } catch(IOException e){
343389 e.printStackTrace();
344390 log.warn("Error warmup up "+li+" : "+e.getMessage(),e);
@@ -353,7 +399,7 @@
354400 }
355401 } finally{
356402 // be sure stuff is not stuck as being deployed
357 - beingDeployed.remove(li.iid.toString());
 403+ beingDeployed.remove(li.iid.toString());
358404 }
359405 }
360406
@@ -371,6 +417,8 @@
372418 cache = SearcherCache.getInstance();
373419 rsyncPath = config.getString("Rsync","path","/usr/bin/rsync");
374420 rsyncParams = config.getString("Rsync","params","");
 421+ forceLocalDeployment = config.getBoolean("Search","forceLocalDeployment");
 422+ deleteOldUpdates = config.getBoolean("Search","deleteOldUpdates");
375423 }
376424
377425 public static UpdateThread getStandalone(){
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceCache.java
@@ -101,16 +101,19 @@
102102 if(cache.containsKey(nsf))
103103 filters.add(cache.get(nsf));
104104 else{ // didn't find the apropriate filter, make it
105 - log.debug("Making filter for "+nsf);
 105+ log.info("Making filter for "+nsf);
106106 CachedFilter cwf = makeFilter(nsf);
107107 cache.put(nsf,cwf);
108108 filters.add(cwf);
109109 }
110110 redirects.add(getRedirectFilter(nsf));
111111 }
112 - log.debug("Made composite filter for "+key);
113 - // never cache composite filters
114 - return new NamespaceCompositeFilter(filters,redirects).bits(reader);
 112+ log.info("Made composite filter for "+key);
 113+ NamespaceCompositeFilter ncf = new NamespaceCompositeFilter(filters,redirects);
 114+ // cache if defined in global settings
 115+ if(GlobalConfiguration.getInstance().getNamespacePrefixes().values().contains(key))
 116+ cache.put(key,new CachedFilter(ncf));
 117+ return ncf.bits(reader);
115118 } else if(key.isAll()){
116119 CachedFilter cwf = new CachedFilter(new AllFilter());
117120 cache.put(key,cwf); // always cache
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -31,6 +31,7 @@
3232 import org.wikimedia.lsearch.interoperability.RMIMessengerClient;
3333 import org.wikimedia.lsearch.interoperability.RMIServer;
3434
 35+
3536 public class SearcherCache {
3637 protected static Logger log = Logger.getLogger(SearcherCache.class);
3738
@@ -165,6 +166,10 @@
166167 this.ok = ok;
167168 this.poolSize = poolSize;
168169 }
 170+
 171+ public String toString(){
 172+ return "ok="+ok+", poolSize="+poolSize;
 173+ }
169174 }
170175
171176 public static class RemoteSearcherPool {
@@ -204,9 +209,14 @@
205210 protected Set<SearchHost> deadPools = Collections.synchronizedSet(new HashSet<SearchHost>());
206211
207212 protected static SearcherCache instance = null;
 213+
 214+ /** Remote hosts being deployed, never use their searchers, unless necessary! (host->deployment level) */
 215+ protected Hashtable<String,Integer> hostsDeploying = new Hashtable<String,Integer>();
208216
209217 /** deployment has been tried at least once for these */
210 - protected static Set<String> initialWarmup = Collections.synchronizedSet(new HashSet<String>());
 218+ protected static Set<String> initialWarmup = Collections.synchronizedSet(new HashSet<String>());
 219+
 220+ protected boolean initialDeploymentRunning = false;
211221 /**
212222 * If there is a cached local searcher of iid
213223 *
@@ -217,6 +227,36 @@
218228 return localCache.containsKey(iid.toString());
219229 }
220230
 231+ /** Signalize that host is begining it's index update, and that we shouldn't touch it */
 232+ public void hostDeploying(String host){
 233+ synchronized(hostsDeploying){
 234+ Integer level = hostsDeploying.get(host);
 235+ if(level == null) // first level of deployment
 236+ hostsDeploying.put(host,1);
 237+ else // more concurrent threads doing deployment on remote host
 238+ hostsDeploying.put(host,level+1);
 239+ }
 240+ }
 241+
 242+ /** Remote host has been deployed */
 243+ public void hostDeployed(String host){
 244+ synchronized(hostsDeploying){
 245+ Integer level = hostsDeploying.get(host);
 246+ if(level == null){
 247+ log.warn("Cannot deploy host="+host+" since it hasn't been deploying");
 248+ return;
 249+ }
 250+ if(level == 1)
 251+ hostsDeploying.remove(host);
 252+ else
 253+ hostsDeploying.put(host,level-1);
 254+ }
 255+ }
 256+
 257+ public boolean thisHostIsDeploying(){
 258+ return hostsDeploying.containsKey("localhost");
 259+ }
 260+
221261 /**
222262 * Get a random host for iid, if local and being deployed
223263 * always return the localhost
@@ -225,16 +265,22 @@
226266 * @return
227267 */
228268 public String getRandomHost(IndexId iid){
229 - if(iid.isMySearch() && !UpdateThread.isBeingDeployed(iid) && hasLocalSearcher(iid))
 269+ if(iid.isMySearch() && hasLocalSearcher(iid) && !hostsDeploying.containsKey("localhost"))
230270 return "localhost";
231271 if(!initialized.contains(iid.toString()))
232272 initializeRemote(iid);
233273 synchronized(iid.getSearcherCacheLock()){
234274 Hashtable<String,RemoteSearcherPool> pools = remoteCache.get(iid.toString());
235275 if(pools == null)
 276+ return null;
 277+ // generate all suitable remote hosts
 278+ HashSet<String> hosts = new HashSet<String>();
 279+ hosts.addAll(pools.keySet());
 280+ hosts.removeAll(hostsDeploying.keySet());
 281+ if(hosts.size() == 0)
236282 return null;
237 - int num = (int)(Math.random()*pools.size());
238 - for(String host : pools.keySet()){
 283+ int num = (int)(Math.random()*hosts.size());
 284+ for(String host : hosts){
239285 if(--num < 0)
240286 return host;
241287 }
@@ -264,7 +310,7 @@
265311 if(iid == null)
266312 throw new RuntimeException("No such index");
267313 if(!initialWarmup.contains(iid.toString()))
268 - throw new RuntimeException(iid+" is being deployed");
 314+ throw new RuntimeException(iid+" is being deployed or is not searched by this host");
269315 return fromLocalCache(iid.toString());
270316 }
271317
@@ -327,8 +373,10 @@
328374 remoteCache.put(iid.toString(), hostpool = new Hashtable<String,RemoteSearcherPool>());
329375 hostpool.put(host,new RemoteSearcherPool(iid,host,status.poolSize));
330376 deadPools.remove(new SearchHost(iid,host)); // make sure not marked as dead
 377+ log.info("Reinitialized iid="+iid);
331378 return;
332379 }
 380+ log.warn("Cannot reinitialize iid="+iid+", remote pool status="+status);
333381 }
334382 } catch(RemoteException e){
335383 e.printStackTrace();
@@ -349,30 +397,35 @@
350398 */
351399 protected class InitialDeploymentThread extends Thread {
352400 public void run(){
353 - IndexRegistry registry = IndexRegistry.getInstance();
354 - // get local search indexes, deploy sorted by name
355 - ArrayList<IndexId> mys = new ArrayList<IndexId>();
356 - mys.addAll(GlobalConfiguration.getInstance().getMySearch());
357 - Collections.sort(mys,new Comparator<IndexId>(){
358 - public int compare(IndexId o1, IndexId o2) {
359 - return o1.toString().compareTo(o2.toString());
360 - }
361 - });
362 - for(IndexId iid : mys){
363 - try {
364 - // when searcher is linked into "search" path it's good, initialize it
365 - if(!iid.isLogical() && registry.getCurrentSearch(iid) != null){
366 - log.debug("Initializing local for "+iid);
367 - SearcherPool pool = initLocalPool(iid);
368 - //Warmup.warmupPool(pool.searchers,iid,false,1);
369 - //Warmup.waitForAggregate(pool.searchers);
370 - localCache.put(iid.toString(),pool);
371 -
372 - RMIServer.bind(iid,pool.searchers);
 401+ try{
 402+ initialDeploymentRunning = true;
 403+ IndexRegistry registry = IndexRegistry.getInstance();
 404+ // get local search indexes, deploy sorted by name
 405+ ArrayList<IndexId> mys = new ArrayList<IndexId>();
 406+ mys.addAll(GlobalConfiguration.getInstance().getMySearch());
 407+ Collections.sort(mys,new Comparator<IndexId>(){
 408+ public int compare(IndexId o1, IndexId o2) {
 409+ return o1.toString().compareTo(o2.toString());
373410 }
374 - } catch (IOException e) {
375 - log.warn("I/O error warming index for "+iid+" : "+e.getMessage(),e);
 411+ });
 412+ for(IndexId iid : mys){
 413+ try {
 414+ // when searcher is linked into "search" path it's good, initialize it
 415+ if(!iid.isLogical() && registry.getCurrentSearch(iid) != null){
 416+ log.debug("Initializing local for "+iid);
 417+ SearcherPool pool = initLocalPool(iid);
 418+ //Warmup.warmupPool(pool.searchers,iid,false,1);
 419+ //Warmup.waitForAggregate(pool.searchers);
 420+ localCache.put(iid.toString(),pool);
 421+
 422+ RMIServer.bind(iid,pool.searchers);
 423+ }
 424+ } catch (IOException e) {
 425+ log.warn("I/O error warming index for "+iid+" : "+e.getMessage(),e);
 426+ }
376427 }
 428+ } finally {
 429+ initialDeploymentRunning = false;
377430 }
378431 }
379432 }
@@ -452,8 +505,10 @@
453506
454507 protected SearcherCache(boolean initialize){
455508 searchPoolSize = Configuration.open().getInt("SearcherPool","size",1);
456 - if(initialize)
 509+ if(initialize){
 510+ initialDeploymentRunning = true;
457511 new InitialDeploymentThread().start();
 512+ }
458513 }
459514
460515 public int getSearchPoolSize() {
@@ -463,4 +518,16 @@
464519 public Set<SearchHost> getDeadPools() {
465520 return deadPools;
466521 }
 522+
 523+ /** Sleep until initial deployment is finished */
 524+ public void waitForInitialDeployment(){
 525+ while(initialDeploymentRunning){
 526+ try {
 527+ Thread.sleep(100);
 528+ } catch (InterruptedException e) {
 529+ // TODO Auto-generated catch block
 530+ e.printStackTrace();
 531+ }
 532+ }
 533+ }
467534 }
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java
@@ -416,8 +416,8 @@
417417 t.setOriginalEnd(cur+len);
418418 if(!t.isStub())
419419 t.unstubOriginal();
420 - if(t.type != Type.TEXT || t.getPositionIncrement()==0)
421 - raiseException(serialized,cur,t,"Bad serialized data: trying to assign original string to nontext token or alias");
 420+ //if(t.type != Type.TEXT || t.getPositionIncrement()==0)
 421+ // raiseException(serialized,cur,t,"Bad serialized data: trying to assign original string to nontext token or alias");
422422 cur += len;
423423 break; }
424424 case 2: // alias

Status & tagging log