Index: branches/lucene-search-2.1/test/org/wikimedia/lsearch/spell/SuggestUnitTest.java |
— | — | @@ -1,13 +1,17 @@ |
2 | 2 | package org.wikimedia.lsearch.spell; |
3 | 3 | |
4 | 4 | import java.io.IOException; |
| 5 | +import java.util.HashMap; |
| 6 | +import java.util.Map; |
| 7 | +import java.util.TreeMap; |
5 | 8 | |
6 | 9 | import org.wikimedia.lsearch.config.IndexId; |
7 | 10 | import org.wikimedia.lsearch.search.NamespaceFilter; |
| 11 | +import org.wikimedia.lsearch.search.SearcherCache; |
| 12 | +import org.wikimedia.lsearch.spell.dist.EditDistance; |
8 | 13 | import org.wikimedia.lsearch.test.WikiTestCase; |
9 | 14 | |
10 | 15 | public class SuggestUnitTest extends WikiTestCase { |
11 | | - |
12 | 16 | public void testMakeNamespaces() throws IOException { |
13 | 17 | IndexId iid = IndexId.get("entest"); |
14 | 18 | Suggest sug = new Suggest(iid); |
— | — | @@ -17,4 +21,19 @@ |
18 | 22 | assertEquals("[0, 100, 2, 4]",sug.makeNamespaces(new NamespaceFilter("0,2,4,100")).namespaces.toString()); |
19 | 23 | } |
20 | 24 | |
| 25 | + public Map<Integer,Integer> getSpaceMap(String str1, String str2){ /* test helper: runs Suggest.extractSpaceMap over the str1 -> str2 edit-distance matrix and returns the filled map */ |
| 26 | + EditDistance ed = new EditDistance(str1); |
| 27 | + int d[][] = ed.getMatrix(str2); |
| 28 | + // map: space in str1 -> same space in the edited string str2 (a TreeMap, so toString() lists entries in key order) |
| 29 | + TreeMap<Integer,Integer> spaceMap = new TreeMap<Integer,Integer>(); |
| 30 | + new Suggest().extractSpaceMap(d,str1.length(),str2.length(),spaceMap,str1,str2); |
| 31 | + return spaceMap; |
| 32 | + } |
| 33 | + |
| 34 | + public void testExtractSpaceMap() throws IOException { /* checks the printed space map for a few original/edited string pairs */ |
| 35 | + assertEquals("{}",getSpaceMap(".999","0 999").toString()); /* original string has no space; an empty map is expected */ |
| 36 | + assertEquals("{4=3}",getSpaceMap("some string","som estring").toString()); /* the space at index 4 is expected to map to index 3 */ |
| 37 | + assertEquals("",getSpaceMap(" a "," b ").toString()); /* NOTE(review): Map.toString() yields "{}" even for an empty map and never "", so this expectation cannot match as written -- confirm the intended value */ |
| 38 | + } |
| 39 | + |
21 | 40 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/frontend/SearchDaemon.java |
— | — | @@ -321,6 +321,8 @@ |
322 | 322 | if(iid.toString().length()>maxlen) |
323 | 323 | maxlen = iid.toString().length(); |
324 | 324 | } |
| 325 | + if(cache.thisHostIsDeploying()) |
| 326 | + sendOutputLine("This host is being deployed"); |
325 | 327 | for(IndexId iid : mysearch){ |
326 | 328 | if(iid.isLogical()) |
327 | 329 | continue; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/spell/Suggest.java |
— | — | @@ -9,6 +9,7 @@ |
10 | 10 | import java.util.HashMap; |
11 | 11 | import java.util.HashSet; |
12 | 12 | import java.util.Iterator; |
| 13 | +import java.util.Map; |
13 | 14 | import java.util.Set; |
14 | 15 | import java.util.WeakHashMap; |
15 | 16 | import java.util.Map.Entry; |
— | — | @@ -178,6 +179,10 @@ |
179 | 180 | /** Lower limit to hit rate for joining */ |
180 | 181 | public static final int JOIN_FREQ = 1; |
181 | 182 | |
| 183 | + /** For testing only: creates an instance without an index id; this constructor sets no fields itself */ |
| 184 | + protected Suggest() { |
| 185 | + } |
| 186 | + |
182 | 187 | public Suggest(IndexId iid) throws IOException { |
183 | 188 | this(iid,null,true); |
184 | 189 | } |
— | — | @@ -280,7 +285,7 @@ |
281 | 286 | log.debug("tokens: "+tokens+" inContext:"+info.foundInContext+" phrases:"+info.phrases+", inTitles="+info.foundInTitles); |
282 | 287 | |
283 | 288 | if(tokens.size() > 15){ |
284 | | - logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start); |
| 289 | + logRequest(searchterm,"too many words to spellcheck ("+tokens.size()+")",start,searchterm); |
285 | 290 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
286 | 291 | } |
287 | 292 | |
— | — | @@ -310,10 +315,10 @@ |
311 | 316 | } |
312 | 317 | if(changes.size() > 0){ |
313 | 318 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns); |
314 | | - logRequest(sq.getSearchterm(),"words only (wildcard or fuzzy query)",start); |
| 319 | + logRequest(sq.getSearchterm(),"words only (wildcard or fuzzy query)",start,searchterm); |
315 | 320 | return sq; |
316 | 321 | } else{ |
317 | | - logRequest(searchterm,"CORRECT (by words, wildcard or fuzzy query)",start); |
| 322 | + logRequest(searchterm,"CORRECT (by words, wildcard or fuzzy query)",start,searchterm); |
318 | 323 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
319 | 324 | } |
320 | 325 | } |
— | — | @@ -332,7 +337,7 @@ |
333 | 338 | HashMap<Integer,String> changes = extractTitleChanges(joinTokens,redirectTarget,tokens); |
334 | 339 | if(changes != null){ |
335 | 340 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,new HashSet<Integer>(),ns); |
336 | | - logRequest(sq.getSearchterm(),"titles (via redirect)",start); |
| 341 | + logRequest(sq.getSearchterm(),"titles (via redirect)",start,searchterm); |
337 | 342 | return sq; |
338 | 343 | } |
339 | 344 | } |
— | — | @@ -360,14 +365,14 @@ |
361 | 366 | if(titleRes.size()>0 && (titleRes.get(0).dist<2 || (correctByPhrases && titleRes.get(0).dist<=2))){ |
362 | 367 | SuggestResult r = titleRes.get(0); |
363 | 368 | if(r.isExactMatch()){ |
364 | | - logRequest(searchterm,"CORRECT (exact title match)",start); |
| 369 | + logRequest(searchterm,"CORRECT (exact title match)",start,searchterm); |
365 | 370 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
366 | 371 | } |
367 | 372 | if(betterRank(r.frequency,info.firstRank)){ |
368 | 373 | HashMap<Integer,String> changes = extractTitleChanges(joinTokens,r.word,tokens); |
369 | 374 | if(changes != null){ |
370 | 375 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns); |
371 | | - logRequest(sq.getSearchterm(),"titles (misspell)",start); |
| 376 | + logRequest(sq.getSearchterm(),"titles (misspell)",start,searchterm); |
372 | 377 | return sq; |
373 | 378 | } |
374 | 379 | } |
— | — | @@ -382,7 +387,7 @@ |
383 | 388 | if(singleWordSug.size() > 0){ |
384 | 389 | SuggestResult r = singleWordSug.get(0); |
385 | 390 | if(r.isExactMatch()){ |
386 | | - logRequest(searchterm,"CORRECT (by single word index)",start); |
| 391 | + logRequest(searchterm,"CORRECT (by single word index)",start,searchterm); |
387 | 392 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
388 | 393 | } else{ //if(r.dist <= 1 && betterRank(r.frequency,info.firstRank)){ |
389 | 394 | SuggestResult best = null; |
— | — | @@ -405,7 +410,7 @@ |
406 | 411 | HashMap<Integer,String> proposedChanges = new HashMap<Integer,String>(); |
407 | 412 | proposedChanges.put(0,best.word); |
408 | 413 | SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,new HashSet<Integer>(),ns); |
409 | | - logRequest(sq.getSearchterm(),"single word misspell",start); |
| 414 | + logRequest(sq.getSearchterm(),"single word misspell",start,searchterm); |
410 | 415 | return sq; |
411 | 416 | } |
412 | 417 | } |
— | — | @@ -633,7 +638,7 @@ |
634 | 639 | // check |
635 | 640 | if( titleExists(proposedTitle.toString(),ns) ){ |
636 | 641 | SuggestQuery sq = makeSuggestedQuery(tokens,changes,searchterm,filters,changes.keySet(),ns); |
637 | | - logRequest(sq.getSearchterm(),"phrases (title match)",start); |
| 642 | + logRequest(sq.getSearchterm(),"phrases (title match)",start,searchterm); |
638 | 643 | return sq; |
639 | 644 | } |
640 | 645 | } |
— | — | @@ -719,13 +724,13 @@ |
720 | 725 | if(redirectTarget != null){ |
721 | 726 | String prop = followRedirect(joinTokens(" ",tokens,proposedChanges),ns); |
722 | 727 | if(prop != null && prop.equals(redirectTarget)){ |
723 | | - logRequest(searchterm,"CORRECT (spellcheck to redirect to same article)",start); |
| 728 | + logRequest(searchterm,"CORRECT (spellcheck to redirect to same article)",start,searchterm); |
724 | 729 | return new SuggestQuery(searchterm,new ArrayList<Integer>()); |
725 | 730 | } |
726 | 731 | } |
727 | 732 | |
728 | 733 | SuggestQuery sq = makeSuggestedQuery(tokens,proposedChanges,searchterm,filters,alwaysReplace,ns); |
729 | | - logRequest(sq.getSearchterm(),using,start); |
| 734 | + logRequest(sq.getSearchterm(),using,start,searchterm); |
730 | 735 | return sq; |
731 | 736 | } |
732 | 737 | |
— | — | @@ -936,7 +941,8 @@ |
937 | 942 | EditDistance ed = new EditDistance(joined); |
938 | 943 | int d[][] = ed.getMatrix(corrected); |
939 | 944 | // map: space -> same space in edited string |
940 | | - HashMap<Integer,Integer> spaceMap = new HashMap<Integer,Integer>(); |
| 945 | + HashMap<Integer,Integer> spaceMap = new HashMap<Integer,Integer>(); |
| 946 | + spaceMapCalls = 0; |
941 | 947 | extractSpaceMap(d,joined.length(),corrected.length(),spaceMap,joined,corrected); |
942 | 948 | // indexes where spaces are in the edited string |
943 | 949 | ArrayList<Integer> spaces = new ArrayList<Integer>(); |
— | — | @@ -995,8 +1001,16 @@ |
996 | 1002 | return acceptWord(r,metric); |
997 | 1003 | } |
998 | 1004 | |
| 1005 | + protected int spaceMapCalls = 0; |
| 1006 | + |
999 | 1007 | /** Traverse the cost matrix and extract mapping of old vs new spaces */ |
1000 | | - final protected void extractSpaceMap(int[][] d, int i, int j, HashMap<Integer,Integer> spaceMap, String str1, String str2) { |
| 1008 | + final protected void extractSpaceMap(int[][] d, int i, int j, Map<Integer,Integer> spaceMap, String str1, String str2) { |
| 1009 | + spaceMapCalls++; |
| 1010 | + if(spaceMapCalls > 100000){ |
| 1011 | + log.warn("Long SpaceMap call: str1="+str1+", str2="+str2); |
| 1012 | + // FIXME !! |
| 1013 | + return; |
| 1014 | + } |
1001 | 1015 | int cost = d[i][j]; |
1002 | 1016 | if(i == 0 || j == 0) |
1003 | 1017 | return; |
— | — | @@ -1551,9 +1565,9 @@ |
1552 | 1566 | return ret; |
1553 | 1567 | } |
1554 | 1568 | |
1555 | | - protected void logRequest(String searchterm, String using, long start){ |
| 1569 | + protected void logRequest(String searchterm, String using, long start, String original){ |
1556 | 1570 | if(useLogging) |
1557 | | - log.info(iid+" suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms"); |
| 1571 | + log.info(iid+" for original=["+ original +"] suggest: ["+searchterm+"] using=["+using+"] in "+(System.currentTimeMillis()-start)+" ms"); |
1558 | 1572 | } |
1559 | 1573 | |
1560 | 1574 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerClient.java |
— | — | @@ -136,7 +136,7 @@ |
137 | 137 | log.debug("Got new RMI messenger for host "+host); |
138 | 138 | return r; |
139 | 139 | } catch (RemoteException e) { |
140 | | - log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage(),e); |
| 140 | + log.warn("Cannot contact RMI registry for host "+host+" : "+e.getMessage()); |
141 | 141 | throw e; |
142 | 142 | } catch (NotBoundException e) { |
143 | 143 | log.warn("No RMIMessenger instance at host "+host+" : "+e.getMessage(),e); |
— | — | @@ -407,4 +407,27 @@ |
408 | 408 | log.error("Messenger not bound: "+e.getMessage(),e); |
409 | 409 | } |
410 | 410 | } |
| 411 | + |
| 412 | + public void hostDeployed(String host, String myHost) throws RemoteException { /* calls hostDeployed(myHost) on the RMI messenger obtained for host */ |
| 413 | + RMIMessenger r; |
| 414 | + try { |
| 415 | + r = messengerFromCache(host); |
| 416 | + r.hostDeployed(myHost); |
| 417 | + } catch(NotBoundException e){ /* reported on stderr and in the log, but not rethrown to the caller */ |
| 418 | + e.printStackTrace(); |
| 419 | + log.error("Messenger not bound: "+e.getMessage(),e); |
| 420 | + } |
| 421 | + } |
| 422 | + |
| 423 | + public void hostDeploying(String host, String myHost) throws RemoteException { /* calls hostDeploying(myHost) on the RMI messenger obtained for host */ |
| 424 | + RMIMessenger r; |
| 425 | + try { |
| 426 | + r = messengerFromCache(host); |
| 427 | + r.hostDeploying(myHost); |
| 428 | + } catch(NotBoundException e){ /* reported on stderr and in the log, but not rethrown to the caller */ |
| 429 | + e.printStackTrace(); |
| 430 | + log.error("Messenger not bound: "+e.getMessage(),e); |
| 431 | + } |
| 432 | + } |
| 433 | + |
411 | 434 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessenger.java |
— | — | @@ -255,4 +255,15 @@ |
256 | 256 | * @throws RemoteException |
257 | 257 | */ |
258 | 258 | public void addLocalizationCustomMapping(Map<Integer,String> namespaceIndexToName, String dbname) throws RemoteException; |
| 259 | + |
| 260 | + /** |
| 261 | + * Signal that the host is deploying and that it shouldn't be bugged with searches |
| 262 | + * |
| 263 | + * @param host name of the host that is deploying |
| 264 | + * @throws RemoteException |
| 265 | + */ |
| 266 | + public void hostDeploying(String host) throws RemoteException; |
| 267 | + |
| 268 | + /** Signal that the remote host has been deployed (counterpart of hostDeploying) */ |
| 269 | + public void hostDeployed(String host) throws RemoteException; |
259 | 270 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIMessengerImpl.java |
— | — | @@ -242,6 +242,19 @@ |
243 | 243 | |
244 | 244 | } |
245 | 245 | |
| 246 | + public void hostDeployed(String host) throws RemoteException { /* forwards the notification to the SearcherCache singleton */ |
| 247 | + if(cache == null) /* cache reference is fetched lazily on first use */ |
| 248 | + cache = SearcherCache.getInstance(); |
| 249 | + cache.hostDeployed(host); |
| 250 | + |
| 251 | + } |
| 252 | + |
| 253 | + public void hostDeploying(String host) throws RemoteException { /* forwards the notification to the SearcherCache singleton */ |
| 254 | + if(cache == null) /* cache reference is fetched lazily on first use */ |
| 255 | + cache = SearcherCache.getInstance(); |
| 256 | + cache.hostDeploying(host); |
| 257 | + } |
| 258 | + |
246 | 259 | protected RMIMessengerImpl(){ |
247 | 260 | networkStatus = null; |
248 | 261 | indexRegistry = null; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/interoperability/RMIServer.java |
— | — | @@ -58,7 +58,7 @@ |
59 | 59 | } |
60 | 60 | return true; |
61 | 61 | } catch (RemoteException e) { |
62 | | - e.printStackTrace(); |
| 62 | + log.warn("Remote error unbinding iid="+iid,e); |
63 | 63 | } catch (NotBoundException e) { |
64 | 64 | } |
65 | 65 | return false; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/StreamTerms.java |
— | — | @@ -48,5 +48,10 @@ |
49 | 49 | } |
50 | 50 | } |
51 | 51 | } |
| 52 | + |
| 53 | + public int termCount() { |
| 54 | + // TODO stub: always reports 0 terms; implement a real count or document why none is available here |
| 55 | + return 0; |
| 56 | + } |
52 | 57 | |
53 | 58 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/WordTerms.java |
— | — | @@ -68,4 +68,10 @@ |
69 | 69 | return words.get((int)(Math.random()*words.size())); |
70 | 70 | } |
71 | 71 | |
| 72 | + public int termCount() { /* number of entries currently held in words */ |
| 73 | + return words.size(); |
| 74 | + } |
| 75 | + |
| 76 | + |
| 77 | + |
72 | 78 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/Terms.java |
— | — | @@ -2,4 +2,6 @@ |
3 | 3 | |
4 | 4 | public interface Terms { /* supplier of search terms */ |
5 | 5 | public String next(); /* returns the term to use next */ |
| 6 | + |
| 7 | + public int termCount(); /* number of terms the implementation reports having */ |
6 | 8 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/benchmark/SampleTerms.java |
— | — | @@ -965,4 +965,10 @@ |
966 | 966 | "volcom", |
967 | 967 | "Charlotte ross" |
968 | 968 | }; |
| 969 | + |
| 970 | + public int termCount() { /* length of the terms array */ |
| 971 | + return terms.length; |
| 972 | + } |
| 973 | + |
| 974 | + |
969 | 975 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -598,7 +598,7 @@ |
599 | 599 | } |
600 | 600 | } |
601 | 601 | } |
602 | | - if(searchHosts.isEmpty()){ |
| 602 | + if(searchHosts.isEmpty() && !dbrole.endsWith(".links") && !dbrole.endsWith(".related")){ |
603 | 603 | // assign to search orphans host |
604 | 604 | searchHosts.addAll(searchOrphans); |
605 | 605 | } |
— | — | @@ -1556,5 +1556,14 @@ |
1557 | 1557 | return IndexId.get(commonsWiki); |
1558 | 1558 | } |
1559 | 1559 | |
| 1560 | + /** Collect the search hosts of every index id in the pool into one set (NOTE: walks the whole pool on each call, so this is kind of slow...) */ |
| 1561 | + public HashSet<String> getAllSearchHosts(){ |
| 1562 | + HashSet<String> hosts = new HashSet<String>(); |
| 1563 | + for(IndexId iid : indexIdPool.values()){ |
| 1564 | + hosts.addAll(iid.getSearchHosts()); |
| 1565 | + } |
| 1566 | + return hosts; |
| 1567 | + } |
| 1568 | + |
1560 | 1569 | |
1561 | 1570 | } |
\ No newline at end of file |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/Warmup.java |
— | — | @@ -15,7 +15,10 @@ |
16 | 16 | import org.apache.lucene.index.Term; |
17 | 17 | import org.apache.lucene.search.Hits; |
18 | 18 | import org.apache.lucene.search.Query; |
| 19 | +import org.apache.lucene.search.SearchableMul; |
| 20 | +import org.apache.lucene.search.Searcher; |
19 | 21 | import org.apache.lucene.search.TermQuery; |
| 22 | +import org.apache.lucene.search.TopDocs; |
20 | 23 | import org.wikimedia.lsearch.analyzers.Analyzers; |
21 | 24 | import org.wikimedia.lsearch.analyzers.FieldBuilder; |
22 | 25 | import org.wikimedia.lsearch.analyzers.FieldNameFactory; |
— | — | @@ -179,13 +182,35 @@ |
180 | 183 | FieldBuilder.BuilderSet b = new FieldBuilder(iid).getBuilder(); |
181 | 184 | WikiQueryParser parser = new WikiQueryParser(b.getFields().contents(),"0",Analyzers.getSearcherAnalyzer(iid,false),b,WikiQueryParser.NamespacePolicy.IGNORE,null); |
182 | 185 | |
183 | | - try{ |
| 186 | + ArrayList<SearchableMul> searchers = new ArrayList<SearchableMul>(); |
| 187 | + SearcherCache cache = SearcherCache.getInstance(); |
| 188 | + for(IndexId piid : iid.getDB().getPhysicalIndexIds()){ |
| 189 | + if(piid == iid) |
| 190 | + searchers.add(is); |
| 191 | + else if(piid.isMySearch()){ |
| 192 | + try { |
| 193 | + searchers.add(cache.getLocalSearcher(piid)); |
| 194 | + } catch (Exception e) { |
| 195 | + log.warn("Error retrieving local searcher part "+piid+" for warmup", e); |
| 196 | + } |
| 197 | + } |
| 198 | + } |
| 199 | + |
| 200 | + try{ |
| 201 | + Searcher searcher = null; |
| 202 | + if(searchers.size()<=1) |
| 203 | + searcher = is; |
| 204 | + else |
| 205 | + searcher = new MultiSearcherMul(searchers.toArray(new SearchableMul[]{})); |
| 206 | + |
184 | 207 | Terms terms = getTermsForLang(lang); |
| 208 | + log.info("Warming up with "+terms.termCount()+" terms"); |
185 | 209 | for(int i=0; i < count ; i++){ |
| 210 | + String searchterm = terms.next(); |
| 211 | + long start = System.currentTimeMillis(); |
186 | 212 | Query q = parser.parse(terms.next()); |
187 | | - Hits hits = is.search(q); |
188 | | - for(int j =0; j<20 && j<hits.length(); j++) |
189 | | - hits.doc(j); // retrieve some documents |
| 213 | + TopDocs hits = searcher.search(q,null,20); |
| 214 | + new SearchEngine().makeSearchResults((SearchableMul)searcher,hits,0,20,iid,searchterm,q,start,false); |
190 | 215 | if(useDelay){ |
191 | 216 | if(i<1000) |
192 | 217 | Thread.sleep(100); |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -113,6 +113,11 @@ |
114 | 114 | searchOnly = true; |
115 | 115 | NamespaceFilter namespaces = new NamespaceFilter((String)query.get("namespaces")); |
116 | 116 | SearchResults res = search(iid, searchterm, offset, limit, iwoffset, iwlimit, namespaces, what.equals("explain"), exactCase, false, searchOnly); |
| 117 | + if(!res.isSuccess()){ |
| 118 | + // note failed search |
| 119 | + if(SearchServer.stats != null) |
| 120 | + SearchServer.stats.add(false, 0, SearchDaemon.getOpenCount()); |
| 121 | + } |
117 | 122 | /*if(res!=null && res.isRetry()){ |
118 | 123 | int retries = 1; |
119 | 124 | |
— | — | @@ -738,6 +743,10 @@ |
739 | 744 | return; |
740 | 745 | if(!nsfw.hasNamespaceFilter()) |
741 | 746 | return; // query on many overlapping namespaces, won't try to spellcheck to not mess things up |
| 747 | + if(isNumber(searchterm)) |
| 748 | + return; // don't suggest numbers... |
| 749 | + // strip unnecessary spaces |
| 750 | + searchterm = searchterm.replaceAll(" +"," "); |
742 | 751 | // suggest ! |
743 | 752 | res.setSuggest(null); |
744 | 753 | ArrayList<Token> tokens = parser.tokenizeForSpellCheck(parser.extractPrefixFilter(searchterm)); |
— | — | @@ -754,6 +763,14 @@ |
755 | 764 | res.addInfo("suggest",formatHost(host)); |
756 | 765 | } |
757 | 766 | } |
| 767 | + |
| 768 | + protected boolean isNumber(String s){ /* true iff every character of s is a digit, '.' or ','; NOTE: the empty string and punctuation-only strings such as "." therefore also return true */ |
| 769 | + for(char c : s.toCharArray()){ |
| 770 | + if(!(Character.isDigit(c) || c=='.' || c==',')) |
| 771 | + return false; |
| 772 | + } |
| 773 | + return true; |
| 774 | + } |
758 | 775 | |
759 | 776 | protected Query parseQuery(String searchterm, WikiQueryParser parser, IndexId iid, boolean raw, FilterWrapper nsfw, boolean searchAll, Wildcards wildcards) throws ParseException { |
760 | 777 | Query q = null; |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/UpdateThread.java |
— | — | @@ -74,9 +74,11 @@ |
75 | 75 | // get the new snapshots via rsync, might be lengthy |
76 | 76 | for(LocalIndex li : forUpdate){ |
77 | 77 | try{ |
78 | | - log.debug("Syncing "+li.iid); |
79 | | - rebuild(li,type); // rsync, update registry, cache |
80 | | - pending.remove(li.iid.toString()); |
| 78 | + synchronized (threadSerialization){ |
| 79 | + log.debug("Syncing "+li.iid); |
| 80 | + rebuild(li,type); // rsync, update registry, cache |
| 81 | + pending.remove(li.iid.toString()); |
| 82 | + } |
81 | 83 | } catch(Exception e){ |
82 | 84 | e.printStackTrace(); |
83 | 85 | log.error("Error syncing "+li+" : "+e.getMessage(),e); |
— | — | @@ -100,7 +102,13 @@ |
101 | 103 | protected String rsyncPath = null; |
102 | 104 | protected String rsyncParams = null; |
103 | 105 | protected long numChecks = 0; |
| 106 | + /** If localhost should be *always* taken out of rotation */ |
| 107 | + protected boolean forceLocalDeployment = false; |
| 108 | + /** If old update/ dirs should be deleted once the new index is deployed */ |
| 109 | + protected boolean deleteOldUpdates = false; |
104 | 110 | |
| 111 | + protected static Object threadSerialization = new Object(); |
| 112 | + |
105 | 113 | @Override |
106 | 114 | public void run() { |
107 | 115 | long lastCheck, now; |
— | — | @@ -217,6 +225,18 @@ |
218 | 226 | } |
219 | 227 | new File(updatepath).mkdirs(); |
220 | 228 | try{ |
| 229 | + if(forceLocalDeployment){ |
| 230 | + cache.hostDeploying("localhost"); |
| 231 | + String myHost = global.getLocalhost(); |
| 232 | + for(String host : global.getAllSearchHosts()){ |
| 233 | + try{ |
| 234 | + if(!host.equals(myHost)) |
| 235 | + messenger.hostDeploying(host,myHost); |
| 236 | + } catch(Exception e){ |
| 237 | + log.warn("Error notifying host "+host+" of index deployment: "+e.getMessage(),e); |
| 238 | + } |
| 239 | + } |
| 240 | + } |
221 | 241 | // if local, use cp -lr instead of rsync |
222 | 242 | if(global.isLocalhost(iid.getIndexHost())){ |
223 | 243 | FSUtils.createHardLinkRecursive( |
— | — | @@ -261,21 +281,39 @@ |
262 | 282 | // notify all remote searchers of change |
263 | 283 | messenger.notifyIndexUpdated(iid,iid.getDBSearchHosts()); |
264 | 284 | |
| 285 | + // cleanup old index updates if necessary |
| 286 | + if(deleteOldUpdates && myli != null){ |
| 287 | + deleteDirRecursive(new File(iid.getUpdatePath()+Configuration.PATH_SEP+myli.timestamp)); |
| 288 | + } |
| 289 | + |
265 | 290 | } catch(IOException ioe){ |
266 | 291 | ioe.printStackTrace(); |
267 | 292 | log.error("I/O error updating index "+iid+" at "+li.path+" : "+ioe.getMessage(),ioe); |
268 | 293 | badIndexes.put(li.iid.toString(),li.timestamp); |
| 294 | + } finally { |
| 295 | + if(forceLocalDeployment){ |
| 296 | + cache.hostDeployed("localhost"); |
| 297 | + String myHost = global.getLocalhost(); |
| 298 | + for(String host : global.getAllSearchHosts()){ |
| 299 | + try{ |
| 300 | + if(!host.equals(myHost)) |
| 301 | + messenger.hostDeployed(host,myHost); |
| 302 | + } catch(Exception e){ |
| 303 | + log.warn("Error notifying host "+host+" of end of deployment: "+e.getMessage(),e); |
| 304 | + } |
| 305 | + } |
| 306 | + } |
269 | 307 | } |
270 | 308 | } |
271 | 309 | |
272 | 310 | /** Update searcher cache after warming up searchers */ |
273 | 311 | protected void warmupAndDeploy(SearcherCache.SearcherPool pool, LocalIndex li, RebuildType type){ |
| 312 | + boolean reroute = false; |
274 | 313 | try{ |
275 | 314 | // see if we can go ahead and deploy the searcher or should we wait |
276 | 315 | IndexId iid = li.iid; |
277 | 316 | HashSet<String> group = iid.getSearchHosts(); |
278 | | - int succ = 0, fail = 0; |
279 | | - boolean reroute = false; |
| 317 | + int succ = 0, fail = 0; |
280 | 318 | long waitedSoFar = 0; |
281 | 319 | if(type == RebuildType.FULL){ |
282 | 320 | // never deploy more than one searcher of iid in a search group |
— | — | @@ -318,8 +356,16 @@ |
319 | 357 | |
320 | 358 | // reroute queries to other servers |
321 | 359 | if( reroute ){ |
322 | | - log.info("Deploying "+iid); |
323 | | - beingDeployed.add(iid.toString()); |
| 360 | + String myHost = global.getLocalhost(); |
| 361 | + log.info("Deploying "+iid+" on "+myHost); |
| 362 | + beingDeployed.add(iid.toString()); |
| 363 | + /* for(String host : global.getAllSearchHosts()){ |
| 364 | + try{ |
| 365 | + messenger.hostDeploying(host,myHost); |
| 366 | + } catch(Exception e){ |
| 367 | + log.warn("Error notifying host "+host+" of index deployment: "+e.getMessage(),e); |
| 368 | + } |
| 369 | + } */ |
324 | 370 | try{ |
325 | 371 | //RMIServer.unbind(iid,cache.getLocalSearcherPool(iid)); |
326 | 372 | } catch(Exception e) { |
— | — | @@ -337,7 +383,7 @@ |
338 | 384 | //Warmup.warmupIndexSearcher(is,li.iid,true,1); |
339 | 385 | //Warmup.waitForAggregate(pool.searchers); |
340 | 386 | // do proper warmup |
341 | | - Warmup.warmupIndexSearcher(is,li.iid,true,null); |
| 387 | + Warmup.warmupIndexSearcher(is,li.iid,false,null); |
342 | 388 | } catch(IOException e){ |
343 | 389 | e.printStackTrace(); |
344 | 390 | log.warn("Error warmup up "+li+" : "+e.getMessage(),e); |
— | — | @@ -353,7 +399,7 @@ |
354 | 400 | } |
355 | 401 | } finally{ |
356 | 402 | // be sure stuff is not stuck as being deployed |
357 | | - beingDeployed.remove(li.iid.toString()); |
| 403 | + beingDeployed.remove(li.iid.toString()); |
358 | 404 | } |
359 | 405 | } |
360 | 406 | |
— | — | @@ -371,6 +417,8 @@ |
372 | 418 | cache = SearcherCache.getInstance(); |
373 | 419 | rsyncPath = config.getString("Rsync","path","/usr/bin/rsync"); |
374 | 420 | rsyncParams = config.getString("Rsync","params",""); |
| 421 | + forceLocalDeployment = config.getBoolean("Search","forceLocalDeployment"); |
| 422 | + deleteOldUpdates = config.getBoolean("Search","deleteOldUpdates"); |
375 | 423 | } |
376 | 424 | |
377 | 425 | public static UpdateThread getStandalone(){ |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/NamespaceCache.java |
— | — | @@ -101,16 +101,19 @@ |
102 | 102 | if(cache.containsKey(nsf)) |
103 | 103 | filters.add(cache.get(nsf)); |
104 | 104 | else{ // didn't find the apropriate filter, make it |
105 | | - log.debug("Making filter for "+nsf); |
| 105 | + log.info("Making filter for "+nsf); |
106 | 106 | CachedFilter cwf = makeFilter(nsf); |
107 | 107 | cache.put(nsf,cwf); |
108 | 108 | filters.add(cwf); |
109 | 109 | } |
110 | 110 | redirects.add(getRedirectFilter(nsf)); |
111 | 111 | } |
112 | | - log.debug("Made composite filter for "+key); |
113 | | - // never cache composite filters |
114 | | - return new NamespaceCompositeFilter(filters,redirects).bits(reader); |
| 112 | + log.info("Made composite filter for "+key); |
| 113 | + NamespaceCompositeFilter ncf = new NamespaceCompositeFilter(filters,redirects); |
| 114 | + // cache if defined in global settings |
| 115 | + if(GlobalConfiguration.getInstance().getNamespacePrefixes().values().contains(key)) |
| 116 | + cache.put(key,new CachedFilter(ncf)); |
| 117 | + return ncf.bits(reader); |
115 | 118 | } else if(key.isAll()){ |
116 | 119 | CachedFilter cwf = new CachedFilter(new AllFilter()); |
117 | 120 | cache.put(key,cwf); // always cache |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/search/SearcherCache.java |
— | — | @@ -31,6 +31,7 @@ |
32 | 32 | import org.wikimedia.lsearch.interoperability.RMIMessengerClient; |
33 | 33 | import org.wikimedia.lsearch.interoperability.RMIServer; |
34 | 34 | |
| 35 | + |
35 | 36 | public class SearcherCache { |
36 | 37 | protected static Logger log = Logger.getLogger(SearcherCache.class); |
37 | 38 | |
— | — | @@ -165,6 +166,10 @@ |
166 | 167 | this.ok = ok; |
167 | 168 | this.poolSize = poolSize; |
168 | 169 | } |
| 170 | + |
| 171 | + public String toString(){ /* readable form of the ok flag and the pool size */ |
| 172 | + return "ok="+ok+", poolSize="+poolSize; |
| 173 | + } |
169 | 174 | } |
170 | 175 | |
171 | 176 | public static class RemoteSearcherPool { |
— | — | @@ -204,9 +209,14 @@ |
205 | 210 | protected Set<SearchHost> deadPools = Collections.synchronizedSet(new HashSet<SearchHost>()); |
206 | 211 | |
207 | 212 | protected static SearcherCache instance = null; |
| 213 | + |
| 214 | + /** Remote hosts being deployed, never use their searchers, unless necessary! (host->deployment level) */ |
| 215 | + protected Hashtable<String,Integer> hostsDeploying = new Hashtable<String,Integer>(); |
208 | 216 | |
209 | 217 | /** deployment has been tried at least once for these */ |
210 | | - protected static Set<String> initialWarmup = Collections.synchronizedSet(new HashSet<String>()); |
| 218 | + protected static Set<String> initialWarmup = Collections.synchronizedSet(new HashSet<String>()); |
| 219 | + |
| 220 | + protected boolean initialDeploymentRunning = false; |
211 | 221 | /** |
212 | 222 | * If there is a cached local searcher of iid |
213 | 223 | * |
— | — | @@ -217,6 +227,36 @@ |
218 | 228 | return localCache.containsKey(iid.toString()); |
219 | 229 | } |
220 | 230 | |
| 231 | + /** Signal that host is beginning its index update, and that we shouldn't touch it; calls are counted per host in hostsDeploying */ |
| 232 | + public void hostDeploying(String host){ |
| 233 | + synchronized(hostsDeploying){ |
| 234 | + Integer level = hostsDeploying.get(host); |
| 235 | + if(level == null) // first level of deployment |
| 236 | + hostsDeploying.put(host,1); |
| 237 | + else // more concurrent threads doing deployment on remote host |
| 238 | + hostsDeploying.put(host,level+1); |
| 239 | + } |
| 240 | + } |
| 241 | + |
| 242 | + /** Remote host has finished one deployment: decrements its count in hostsDeploying and removes the host once the last one is done */ |
| 243 | + public void hostDeployed(String host){ |
| 244 | + synchronized(hostsDeploying){ |
| 245 | + Integer level = hostsDeploying.get(host); |
| 246 | + if(level == null){ /* no matching hostDeploying call was recorded: warn and leave the table unchanged */ |
| 247 | + log.warn("Cannot deploy host="+host+" since it hasn't been deploying"); |
| 248 | + return; |
| 249 | + } |
| 250 | + if(level == 1) /* last outstanding deployment for this host */ |
| 251 | + hostsDeploying.remove(host); |
| 252 | + else |
| 253 | + hostsDeploying.put(host,level-1); |
| 254 | + } |
| 255 | + } |
| 256 | + |
| 257 | + public boolean thisHostIsDeploying(){ /* true while a deployment is registered under the "localhost" key */ |
| 258 | + return hostsDeploying.containsKey("localhost"); |
| 259 | + } |
| 260 | + |
221 | 261 | /** |
222 | 262 | * Get a random host for iid, if local and being deployed |
223 | 263 | * always return the localhost |
— | — | @@ -225,16 +265,22 @@ |
226 | 266 | * @return |
227 | 267 | */ |
228 | 268 | public String getRandomHost(IndexId iid){ |
229 | | - if(iid.isMySearch() && !UpdateThread.isBeingDeployed(iid) && hasLocalSearcher(iid)) |
| 269 | + if(iid.isMySearch() && hasLocalSearcher(iid) && !hostsDeploying.containsKey("localhost")) |
230 | 270 | return "localhost"; |
231 | 271 | if(!initialized.contains(iid.toString())) |
232 | 272 | initializeRemote(iid); |
233 | 273 | synchronized(iid.getSearcherCacheLock()){ |
234 | 274 | Hashtable<String,RemoteSearcherPool> pools = remoteCache.get(iid.toString()); |
235 | 275 | if(pools == null) |
| 276 | + return null; |
| 277 | + // generate all suitable remote hosts |
| 278 | + HashSet<String> hosts = new HashSet<String>(); |
| 279 | + hosts.addAll(pools.keySet()); |
| 280 | + hosts.removeAll(hostsDeploying.keySet()); |
| 281 | + if(hosts.size() == 0) |
236 | 282 | return null; |
237 | | - int num = (int)(Math.random()*pools.size()); |
238 | | - for(String host : pools.keySet()){ |
| 283 | + int num = (int)(Math.random()*hosts.size()); |
| 284 | + for(String host : hosts){ |
239 | 285 | if(--num < 0) |
240 | 286 | return host; |
241 | 287 | } |
— | — | @@ -264,7 +310,7 @@ |
265 | 311 | if(iid == null) |
266 | 312 | throw new RuntimeException("No such index"); |
267 | 313 | if(!initialWarmup.contains(iid.toString())) |
268 | | - throw new RuntimeException(iid+" is being deployed"); |
| 314 | + throw new RuntimeException(iid+" is being deployed or is not searched by this host"); |
269 | 315 | return fromLocalCache(iid.toString()); |
270 | 316 | } |
271 | 317 | |
— | — | @@ -327,8 +373,10 @@ |
328 | 374 | remoteCache.put(iid.toString(), hostpool = new Hashtable<String,RemoteSearcherPool>()); |
329 | 375 | hostpool.put(host,new RemoteSearcherPool(iid,host,status.poolSize)); |
330 | 376 | deadPools.remove(new SearchHost(iid,host)); // make sure not marked as dead |
| 377 | + log.info("Reinitialized iid="+iid); |
331 | 378 | return; |
332 | 379 | } |
| 380 | + log.warn("Cannot reinitialize iid="+iid+", remote pool status="+status); |
333 | 381 | } |
334 | 382 | } catch(RemoteException e){ |
335 | 383 | e.printStackTrace(); |
— | — | @@ -349,30 +397,35 @@ |
350 | 398 | */ |
351 | 399 | protected class InitialDeploymentThread extends Thread { |
352 | 400 | public void run(){ |
353 | | - IndexRegistry registry = IndexRegistry.getInstance(); |
354 | | - // get local search indexes, deploy sorted by name |
355 | | - ArrayList<IndexId> mys = new ArrayList<IndexId>(); |
356 | | - mys.addAll(GlobalConfiguration.getInstance().getMySearch()); |
357 | | - Collections.sort(mys,new Comparator<IndexId>(){ |
358 | | - public int compare(IndexId o1, IndexId o2) { |
359 | | - return o1.toString().compareTo(o2.toString()); |
360 | | - } |
361 | | - }); |
362 | | - for(IndexId iid : mys){ |
363 | | - try { |
364 | | - // when searcher is linked into "search" path it's good, initialize it |
365 | | - if(!iid.isLogical() && registry.getCurrentSearch(iid) != null){ |
366 | | - log.debug("Initializing local for "+iid); |
367 | | - SearcherPool pool = initLocalPool(iid); |
368 | | - //Warmup.warmupPool(pool.searchers,iid,false,1); |
369 | | - //Warmup.waitForAggregate(pool.searchers); |
370 | | - localCache.put(iid.toString(),pool); |
371 | | - |
372 | | - RMIServer.bind(iid,pool.searchers); |
| 401 | + try{ |
| 402 | + initialDeploymentRunning = true; // cleared again in the finally block below, however run() exits |
| 403 | + IndexRegistry registry = IndexRegistry.getInstance(); |
| 404 | + // get local search indexes, deploy sorted by name |
| 405 | + ArrayList<IndexId> mys = new ArrayList<IndexId>(); |
| 406 | + mys.addAll(GlobalConfiguration.getInstance().getMySearch()); |
| 407 | + Collections.sort(mys,new Comparator<IndexId>(){ |
| 408 | + public int compare(IndexId o1, IndexId o2) { |
| 409 | + return o1.toString().compareTo(o2.toString()); |
373 | 410 | } |
374 | | - } catch (IOException e) { |
375 | | - log.warn("I/O error warming index for "+iid+" : "+e.getMessage(),e); |
| 411 | + }); |
| 412 | + for(IndexId iid : mys){ |
| 413 | + try { |
| 414 | + // when searcher is linked into "search" path it's good, initialize it |
| 415 | + if(!iid.isLogical() && registry.getCurrentSearch(iid) != null){ |
| 416 | + log.debug("Initializing local for "+iid); |
| 417 | + SearcherPool pool = initLocalPool(iid); |
| 418 | + //Warmup.warmupPool(pool.searchers,iid,false,1); |
| 419 | + //Warmup.waitForAggregate(pool.searchers); |
| 420 | + localCache.put(iid.toString(),pool); |
| 421 | + |
| 422 | + RMIServer.bind(iid,pool.searchers); // bound only after the pool has been stored in localCache |
| 423 | + } |
| 424 | + } catch (IOException e) { |
| 425 | + log.warn("I/O error warming index for "+iid+" : "+e.getMessage(),e); // logged per index; the loop continues with the next iid |
| 426 | + } |
376 | 427 | } |
| 428 | + } finally { |
| 429 | + initialDeploymentRunning = false; // waitForInitialDeployment() polls this flag |
377 | 430 | } |
378 | 431 | } |
379 | 432 | } |
— | — | @@ -452,8 +505,10 @@ |
453 | 506 | |
454 | 507 | protected SearcherCache(boolean initialize){ // reads the pool size from configuration; when initialize is true, starts the initial deployment thread |
455 | 508 | searchPoolSize = Configuration.open().getInt("SearcherPool","size",1); // presumably section, key, default - confirm against Configuration.getInt |
456 | | - if(initialize) |
| 509 | + if(initialize){ |
| 510 | + initialDeploymentRunning = true; // already true when the constructor returns, so waitForInitialDeployment() keeps polling until the thread's finally block clears it |
457 | 511 | new InitialDeploymentThread().start(); |
| 512 | + } |
458 | 513 | } |
459 | 514 | |
460 | 515 | public int getSearchPoolSize() { |
— | — | @@ -463,4 +518,16 @@ |
464 | 519 | public Set<SearchHost> getDeadPools() { // hands back the deadPools field itself, not a copy |
465 | 520 | return deadPools; |
466 | 521 | } |
| 522 | + |
| 523 | + /** Sleep until initial deployment is finished, or until the waiting thread is interrupted */ |
| 524 | + public void waitForInitialDeployment(){ |
| 525 | + while(initialDeploymentRunning){ |
| 526 | + try { |
| 527 | + Thread.sleep(100); // poll the flag every 100 ms |
| 528 | + } catch (InterruptedException e) { |
| 529 | + Thread.currentThread().interrupt(); // restore the interrupt status for the caller |
| 530 | + return; // stop polling; with the flag restored, another sleep would throw immediately |
| 531 | + } |
| 532 | + } |
| 533 | + } |
467 | 534 | } |
Index: branches/lucene-search-2.1/src/org/wikimedia/lsearch/analyzers/ExtToken.java |
— | — | @@ -416,8 +416,8 @@ |
417 | 417 | t.setOriginalEnd(cur+len); |
418 | 418 | if(!t.isStub()) |
419 | 419 | t.unstubOriginal(); |
420 | | - if(t.type != Type.TEXT || t.getPositionIncrement()==0) |
421 | | - raiseException(serialized,cur,t,"Bad serialized data: trying to assign original string to nontext token or alias"); |
| 420 | + //if(t.type != Type.TEXT || t.getPositionIncrement()==0) |
| 421 | + // raiseException(serialized,cur,t,"Bad serialized data: trying to assign original string to nontext token or alias"); |
422 | 422 | cur += len; |
423 | 423 | break; } |
424 | 424 | case 2: // alias |