Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test |
— | — | @@ -13,6 +13,7 @@ |
14 | 14 | detest,rutest : (single,true,2,10) |
15 | 15 | frtest : (split,3) (part1) (part2) (part3) |
16 | 16 | srwiki : (single) |
| 17 | +njawiki : (nssplit,3) (nspart1,[0,1],false,5) (nspart2,[12,13,14,15]) (nspart3,[]) |
17 | 18 | |
18 | 19 | # Search nodes |
19 | 20 | # host : db1.role, db2.role |
— | — | @@ -35,7 +36,7 @@ |
36 | 37 | 192.168.0.5 : detest, rutest, frtest |
37 | 38 | 192.168.0.2 : entest.ngram |
38 | 39 | 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3 |
39 | | -192.168.0.10 : srwiki |
| 40 | +192.168.0.10 : srwiki njawiki |
40 | 41 | |
41 | 42 | # Path where indexes are on hosts, after default value put hosts where |
42 | 43 | # the location differs |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -106,6 +106,10 @@ |
107 | 107 | assertNotNull(splitroles.get("part2")); |
108 | 108 | assertNotNull(splitroles.get("part3")); |
109 | 109 | |
| 110 | + Hashtable nspart1 = (Hashtable) ((Hashtable) database.get("njawiki")).get("nspart1"); |
| 111 | + assertEquals("false",nspart1.get("optimize")); |
| 112 | + assertEquals("5",nspart1.get("mergeFactor")); |
| 113 | + |
110 | 114 | // search |
111 | 115 | Hashtable search = testgc.getSearch(); |
112 | 116 | ArrayList sr = (ArrayList) search.get("192.168.0.2"); |
— | — | @@ -170,6 +174,8 @@ |
171 | 175 | assertTrue(testgc.useKeywordScoring("rutest")); |
172 | 176 | |
173 | 177 | |
| 178 | + |
| 179 | + |
174 | 180 | } catch (MalformedURLException e) { |
175 | 181 | e.printStackTrace(); |
176 | 182 | } catch (IOException e) { |
— | — | @@ -223,6 +229,22 @@ |
224 | 230 | IndexId detest = IndexId.get("detest"); |
225 | 231 | assertFalse(detest.isLogical()); |
226 | 232 | |
| 233 | + // check nssplit |
| 234 | + IndexId njawiki = IndexId.get("njawiki"); |
| 235 | + assertTrue(njawiki.isLogical()); |
| 236 | + assertFalse(njawiki.isSplit()); |
| 237 | + assertTrue(njawiki.isNssplit()); |
| 238 | + assertEquals(3,njawiki.getSplitFactor()); |
| 239 | + assertEquals("njawiki.nspart3",njawiki.getPartByNamespace("4").toString()); |
| 240 | + assertEquals("njawiki.nspart1",njawiki.getPartByNamespace("0").toString()); |
| 241 | + assertEquals("njawiki.nspart2",njawiki.getPartByNamespace("12").toString()); |
227 | 242 | |
| 243 | + IndexId njawiki2 = IndexId.get("njawiki.nspart2"); |
| 244 | + assertFalse(njawiki2.isLogical()); |
| 245 | + assertFalse(njawiki2.isSplit()); |
| 246 | + assertTrue(njawiki2.isNssplit()); |
| 247 | + assertEquals(3,njawiki2.getSplitFactor()); |
| 248 | + assertEquals(2,njawiki2.getPartNum()); |
| 249 | + |
228 | 250 | } |
229 | 251 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java |
— | — | @@ -80,6 +80,8 @@ |
81 | 81 | showTokens(text); |
82 | 82 | text = "This is <!-- Unclosed"; |
83 | 83 | showTokens(text); |
| 84 | + text = "This are [[bean]]s and more [[bla]]njah also Großmann"; |
| 85 | + showTokens(text); |
84 | 86 | text = "[[Category:Blah Blah?!]], and [[:Category:Link to something]]"; |
85 | 87 | showTokens(text); |
86 | 88 | text = "[[sr:Glavna stranica]], and [[:Category:Link to category]]"; |
— | — | @@ -92,7 +94,7 @@ |
93 | 95 | showTokens(text); |
94 | 96 | text = "[[First]] second third fourth and so on goes the ... [[last link]]"; |
95 | 97 | showTokens(text); |
96 | | - text = "{{Something| param = {{another}}[[First]] } }} }} }} {{name| [[many]] many many tokens }} second third fourth and so on goes the ... [[good keyword]]"; |
| 98 | + text = "{{Something| param = {{another}}[[First]] } }} }} }} [[first good]]s {{name| [[many]] many many tokens }} second third fourth and so on goes the ... [[good keyword]]"; |
97 | 99 | showTokens(text); |
98 | 100 | |
99 | 101 | if(true) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -141,7 +141,7 @@ |
142 | 142 | if(iid.isMainsplit()){ |
143 | 143 | IndexThread.makeIndexSnapshot(iid.getMainPart(),iid.getMainPart().getImportPath()); |
144 | 144 | IndexThread.makeIndexSnapshot(iid.getRestPart(),iid.getRestPart().getImportPath()); |
145 | | - } else if(iid.isSplit()){ |
| 145 | + } else if(iid.isSplit() || iid.isNssplit()){ |
146 | 146 | for(String part : iid.getSplitParts()){ |
147 | 147 | IndexId iidp = IndexId.get(part); |
148 | 148 | IndexThread.makeIndexSnapshot(iidp,iidp.getImportPath()); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -48,7 +48,7 @@ |
49 | 49 | else if(iid.isMainsplit()){ |
50 | 50 | indexes.put(iid.getMainPart().toString(),openIndex(iid.getMainPart())); |
51 | 51 | indexes.put(iid.getRestPart().toString(),openIndex(iid.getRestPart())); |
52 | | - } else if(iid.isSplit()){ |
| 52 | + } else if(iid.isSplit() || iid.isNssplit()){ |
53 | 53 | for(String dbpart : iid.getSplitParts()){ |
54 | 54 | indexes.put(IndexId.get(dbpart).toString(),openIndex(IndexId.get(dbpart))); |
55 | 55 | } |
— | — | @@ -98,8 +98,8 @@ |
99 | 99 | IndexId target; |
100 | 100 | if(iid.isSingle()) |
101 | 101 | target = iid; |
102 | | - else if(iid.isMainsplit()) // assign according to namespace |
103 | | - target = (a.getNamespace().equals("0"))? iid.getMainPart() : iid.getRestPart(); |
| 102 | + else if(iid.isMainsplit() || iid.isNssplit()) // assign according to namespace |
| 103 | + target = iid.getPartByNamespace(a.getNamespace()); |
104 | 104 | else // split index, randomly assign to some index part |
105 | 105 | target = iid.getPart(1+(int)(Math.random()*iid.getSplitFactor())); |
106 | 106 | |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -58,13 +58,19 @@ |
59 | 59 | /** If true, this machine is an indexer for this index */ |
60 | 60 | protected boolean myIndex; |
61 | 61 | |
62 | | - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT }; |
| 62 | + protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT }; |
63 | 63 | |
64 | 64 | /** Type of index, enumeration */ |
65 | 65 | protected IndexType type; |
66 | 66 | /** Part number in split representation, e.g. 1..N */ |
67 | 67 | protected int partNum; |
68 | 68 | |
| 69 | + /** Namespace -> part (for nssplit indexes) */ |
| 70 | + protected Hashtable<String,String> nssplitMap; |
| 71 | + |
| 72 | + /** Set of namespaces for this nssplit part */ |
| 73 | + protected HashSet<String> namespaceSet; |
| 74 | + |
69 | 75 | /** All parameters as they appear in the global conf, e.g. merge factor, optimize, etc.. */ |
70 | 76 | protected Hashtable<String,String> params; |
71 | 77 | |
— | — | @@ -146,6 +152,8 @@ |
147 | 153 | this.type = IndexType.MAINSPLIT; |
148 | 154 | else if(type.equals("split")) |
149 | 155 | this.type = IndexType.SPLIT; |
| 156 | + else if(type.equals("nssplit")) |
| 157 | + this.type = IndexType.NSSPLIT; |
150 | 158 | |
151 | 159 | // parts |
152 | 160 | String[] parts = dbrole.split("\\."); |
— | — | @@ -177,9 +185,22 @@ |
178 | 186 | partNum = Integer.parseInt(part.substring(4)); |
179 | 187 | else |
180 | 188 | partNum = 0; |
| 189 | + } else if(this.type == IndexType.NSSPLIT){ |
| 190 | + splitFactor = Integer.parseInt(typeParams.get("number")); |
| 191 | + splitParts = new String[splitFactor]; |
| 192 | + for(int i=0;i<splitFactor;i++) |
| 193 | + splitParts[i] = dbname+".nspart"+(i+1); |
| 194 | + if(part!=null){ |
| 195 | + partNum = Integer.parseInt(part.substring(6)); |
| 196 | + namespaceSet = new HashSet<String>(); |
| 197 | + String[] nss = params.get("namespaces").split(","); |
| 198 | + for(String ns : nss) |
| 199 | + namespaceSet.add(ns.trim()); |
| 200 | + } else |
| 201 | + partNum = 0; |
181 | 202 | } |
182 | 203 | // for split/mainsplit/nssplit the main iid is logical, it doesn't have local path |
183 | | - if(myIndex && !(part == null && (this.type==IndexType.SPLIT || this.type==IndexType.MAINSPLIT))){ |
| 204 | + if(myIndex && !(part == null && (this.type==IndexType.SPLIT || this.type==IndexType.MAINSPLIT || this.type==IndexType.NSSPLIT))){ |
184 | 205 | indexPath = localIndexPath + "index" + sep + dbrole; |
185 | 206 | importPath = localIndexPath + "import" + sep + dbrole; |
186 | 207 | snapshotPath = localIndexPath + "snapshot" + sep + dbrole; |
— | — | @@ -219,9 +240,13 @@ |
220 | 241 | public boolean isSplit(){ |
221 | 242 | return type == IndexType.SPLIT; |
222 | 243 | } |
| 244 | + /** If type of this index is nssplit */ |
| 245 | + public boolean isNssplit(){ |
| 246 | + return type == IndexType.NSSPLIT; |
| 247 | + } |
223 | 248 | /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */ |
224 | 249 | public int getPartNum() { |
225 | | - if(type == IndexType.SPLIT) |
| 250 | + if(type == IndexType.SPLIT || type == IndexType.NSSPLIT || type == IndexType.MAINSPLIT) |
226 | 251 | return partNum; |
227 | 252 | else{ |
228 | 253 | log.error("Called getPartNum() on non-split object! Probably a bug in the code."); |
— | — | @@ -414,14 +439,50 @@ |
415 | 440 | HashSet<String> ret = new HashSet<String>(); |
416 | 441 | if(isSingle()) |
417 | 442 | ret.add(dbrole); |
418 | | - else if(isMainsplit() || isSplit()){ |
| 443 | + else if(isMainsplit() || isSplit() || isNssplit()){ |
419 | 444 | for(String p : splitParts) |
420 | 445 | ret.add(p); |
421 | 446 | } |
422 | 447 | |
423 | 448 | return ret; |
424 | 449 | } |
| 450 | + |
| 451 | + /** Rebuild namespace map from information, call only when sure that iid's for all parts are constructed. |
| 452 | + * Note: always call on main iid, not parts */ |
| 453 | + public void rebuildNsMap(Hashtable<String,IndexId> pool) { |
| 454 | + if(isNssplit() && part==null){ |
| 455 | + // rebuild |
| 456 | + nssplitMap = new Hashtable<String,String>(); |
| 457 | + for(String part : splitParts){ |
| 458 | + for(String ns : pool.get(part).namespaceSet){ |
| 459 | + nssplitMap.put(ns,part); |
| 460 | + } |
| 461 | + } |
| 462 | + // set on all parts as well |
| 463 | + for(String part : splitParts){ |
| 464 | + pool.get(part).nssplitMap = nssplitMap; |
| 465 | + } |
| 466 | + } |
| 467 | + } |
425 | 468 | |
| 469 | + public IndexId getPartByNamespace(int ns){ |
| 470 | + return getPartByNamespace(Integer.toString(ns)); |
| 471 | + } |
426 | 472 | |
| 473 | + /** If this is nssplit/mainsplit index, get part with certain namespace */ |
| 474 | + public IndexId getPartByNamespace(String ns){ |
| 475 | + if(isNssplit()){ |
| 476 | + String dbrole = nssplitMap.get(ns); |
| 477 | + if(dbrole == null) |
| 478 | + dbrole = nssplitMap.get("<default>"); |
| 479 | + return get(dbrole); |
| 480 | + } else if(isMainsplit()){ |
| 481 | + if(ns.equals("0")) |
| 482 | + return getMainPart(); |
| 483 | + else |
| 484 | + return getRestPart(); |
| 485 | + } else |
| 486 | + return null; |
| 487 | + } |
427 | 488 | |
428 | 489 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -160,7 +160,7 @@ |
161 | 161 | for(String typeid : database.get(dbname).keySet()){ |
162 | 162 | String type = ""; |
163 | 163 | String dbrole = ""; |
164 | | - if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split")){ |
| 164 | + if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit")){ |
165 | 165 | type = typeid; |
166 | 166 | dbrole = dbname; |
167 | 167 | } else if(typeid.equals("mainpart") || typeid.equals("restpart")){ |
— | — | @@ -169,6 +169,9 @@ |
170 | 170 | } else if(typeid.matches("part[1-9][0-9]*")){ |
171 | 171 | type = "split"; |
172 | 172 | dbrole = dbname + "." + typeid; |
| 173 | + } else if(typeid.matches("nspart[1-9][0-9]*")){ |
| 174 | + type = "nssplit"; |
| 175 | + dbrole = dbname + "." + typeid; |
173 | 176 | } else |
174 | 177 | continue; // unknown type, skip |
175 | 178 | |
— | — | @@ -404,7 +407,7 @@ |
405 | 408 | for(String typeid : database.get(dbname).keySet()){ |
406 | 409 | String type = ""; |
407 | 410 | String dbrole = ""; |
408 | | - if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split")){ |
| 411 | + if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit")){ |
409 | 412 | type = typeid; |
410 | 413 | dbrole = dbname; |
411 | 414 | } else if(typeid.equals("mainpart") || typeid.equals("restpart")){ |
— | — | @@ -413,6 +416,9 @@ |
414 | 417 | } else if(typeid.matches("part[1-9][0-9]*")){ |
415 | 418 | type = "split"; |
416 | 419 | dbrole = dbname + "." + typeid; |
| 420 | + } else if(typeid.matches("nspart[1-9][0-9]*")){ |
| 421 | + type = "nssplit"; |
| 422 | + dbrole = dbname + "." + typeid; |
417 | 423 | } else |
418 | 424 | continue; // unknown type, skip |
419 | 425 | |
— | — | @@ -452,6 +458,8 @@ |
453 | 459 | oairepo); |
454 | 460 | indexIdPool.put(dbrole,iid); |
455 | 461 | } |
| 462 | + if(indexIdPool.get(dbname).isNssplit()) |
| 463 | + indexIdPool.get(dbname).rebuildNsMap(indexIdPool); |
456 | 464 | } |
457 | 465 | |
458 | 466 | } |
— | — | @@ -628,7 +636,7 @@ |
629 | 637 | } else if(type.equals("mainsplit")){ |
630 | 638 | // currently no params |
631 | 639 | dbroles.put(type,params); |
632 | | - } else if(type.equals("split")){ |
| 640 | + } else if(type.equals("split") || type.equals("nssplit")){ |
633 | 641 | if(tokens.length>1) // number of segments |
634 | 642 | params.put("number",tokens[1]); |
635 | 643 | else{ |
— | — | @@ -656,6 +664,27 @@ |
657 | 665 | |
658 | 666 | dbroles.put(type,params); |
659 | 667 | |
| 668 | + } else if(type.matches("nspart[1-9][0-9]*")){ |
| 669 | + // [0,1,2] syntax gets split up in first split, retokenize |
| 670 | + String ns = role.substring(role.indexOf(",")+1,role.lastIndexOf("]")+1).trim(); |
| 671 | + tokens = role.substring(role.lastIndexOf("]")+1).split(","); |
| 672 | + // definition of namespaces, e.g. [0,1,2] |
| 673 | + if(ns.length() > 2 && ns.startsWith("[") && ns.endsWith("]")) |
| 674 | + ns = ns.substring(1,ns.length()-1); |
| 675 | + else |
| 676 | + ns = "<default>"; |
| 677 | + params.put("namespaces",ns); |
| 678 | + |
| 679 | + // all params are optional, if absent default will be used |
| 680 | + if(tokens.length>1) |
| 681 | + params.put("optimize",tokens[1].trim().toLowerCase()); |
| 682 | + if(tokens.length>2) |
| 683 | + params.put("mergeFactor",tokens[2]); |
| 684 | + if(tokens.length>3) |
| 685 | + params.put("maxBufDocs", tokens[3]); |
| 686 | + |
| 687 | + dbroles.put(type,params); |
| 688 | + |
660 | 689 | } else{ |
661 | 690 | System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring."); |
662 | 691 | } |
— | — | @@ -670,10 +699,10 @@ |
671 | 700 | dbr = new Hashtable<String, Hashtable<String, String>>(); |
672 | 701 | database.put(db,dbr); |
673 | 702 | } |
674 | | - if(type.equals("split") || type.equals("mainsplit") || type.equals("single")){ |
675 | | - if(dbr.get("split")!=null || dbr.get("mainsplit")!=null || dbr.get("single")!=null){ |
| 703 | + if(type.equals("split") || type.equals("mainsplit") || type.equals("single") || type.equals("nssplit")){ |
| 704 | + if(dbr.get("split")!=null || dbr.get("mainsplit")!=null || dbr.get("single")!=null || dbr.get("nssplit")!=null){ |
676 | 705 | System.out.println("WARNING: in Global Configuration: defined new architecture "+type+" for "+db); |
677 | | - dbr.remove("split"); dbr.remove("mainsplit"); dbr.remove("single"); |
| 706 | + dbr.remove("split"); dbr.remove("mainsplit"); dbr.remove("single"); dbr.remove("nssplit"); |
678 | 707 | } |
679 | 708 | } |
680 | 709 | if(dbr.get(type)!=null) |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/WikiSearcher.java |
— | — | @@ -3,6 +3,7 @@ |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Arrays; |
| 7 | +import java.util.Hashtable; |
7 | 8 | |
8 | 9 | import org.apache.log4j.Logger; |
9 | 10 | import org.apache.lucene.document.Document; |
— | — | @@ -40,7 +41,8 @@ |
41 | 42 | static org.apache.log4j.Logger log = Logger.getLogger(WikiSearcher.class); |
42 | 43 | protected SearchableMul searcher; |
43 | 44 | protected SearcherCache cache; |
44 | | - protected Searchable mainpart,restpart; |
| 45 | + /** parts of the multisearcher, dbrole -> searchable */ |
| 46 | + protected Hashtable<String,Searchable> searcherParts = new Hashtable<String,Searchable>(); |
45 | 47 | protected MultiSearcherMul ms = null; |
46 | 48 | |
47 | 49 | public static final boolean INVALIDATE_CACHE = true; |
— | — | @@ -62,12 +64,8 @@ |
63 | 65 | |
64 | 66 | if(s != null){ |
65 | 67 | ss.add(s); |
66 | | - if(iid.isMainPart()) |
67 | | - mainpart = s; |
68 | | - else if(iid.isRestPart()) |
69 | | - restpart = s; |
70 | | - } |
71 | | - else |
| 68 | + searcherParts.put(iid.toString(),s); |
| 69 | + } else |
72 | 70 | log.warn("Cannot get a search index (nor local or remote) for "+iid); |
73 | 71 | } |
74 | 72 | if(ss.size() == 0) |
— | — | @@ -79,7 +77,6 @@ |
80 | 78 | /** New object from cache */ |
81 | 79 | public WikiSearcher(IndexId iid) throws Exception { |
82 | 80 | cache = SearcherCache.getInstance(); |
83 | | - mainpart = null; restpart = null; |
84 | 81 | |
85 | 82 | if(iid.isSingle()){ // is always local |
86 | 83 | searcher = cache.getLocalSearcher(iid); |
— | — | @@ -91,7 +88,7 @@ |
92 | 89 | |
93 | 90 | ms = makeMultiSearcher(parts); |
94 | 91 | searcher = ms; |
95 | | - } else if(iid.isSplit()){ |
| 92 | + } else if(iid.isSplit() || iid.isNssplit()){ |
96 | 93 | ArrayList<IndexId> parts = new ArrayList<IndexId>(); |
97 | 94 | for(int i=1; i<=iid.getSplitFactor(); i++){ |
98 | 95 | parts.add(iid.getPart(i)); |
— | — | @@ -105,21 +102,16 @@ |
106 | 103 | |
107 | 104 | cache.checkout(searcher); |
108 | 105 | } |
109 | | - |
110 | | - public String getMainPartHost(){ |
111 | | - if(mainpart == null) |
| 106 | + |
| 107 | + /** Get host for the iid within this multi searcher, or null if no searchable is registered for it */ |
| 108 | + public String getHost(IndexId iid){ |
| 109 | + Searchable s = searcherParts.get(iid.toString()); |
| 110 | + if(s == null) |
112 | 111 | return null; |
113 | 112 | else |
114 | | - return cache.getSearchableHost(mainpart); |
| 113 | + return cache.getSearchableHost(s); |
115 | 114 | } |
116 | | - |
117 | | - public String getRestPartHost(){ |
118 | | - if(restpart == null) |
119 | | - return null; |
120 | | - else |
121 | | - return cache.getSearchableHost(restpart); |
122 | | - } |
123 | | - |
| 115 | + |
124 | 116 | @Override |
125 | 117 | public void close() throws IOException { |
126 | 118 | cache.release(searcher); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain")); |
59 | 59 | if(res!=null && res.isRetry()){ |
60 | 60 | int retries = 0; |
61 | | - if(iid.isSplit()){ |
| 61 | + if(iid.isSplit() || iid.isNssplit()){ |
62 | 62 | retries = iid.getSplitFactor()-2; |
63 | 63 | } else if(iid.isMainsplit()) |
64 | 64 | retries = 1; |
— | — | @@ -84,7 +84,7 @@ |
85 | 85 | |
86 | 87 | /** Search mainpart or restpart of a mainsplit index, or a single part of an nssplit index */ |
87 | 87 | public SearchResults searchPart(IndexId iid, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){ |
88 | | - if( ! iid.isMainsplit()) |
| 88 | + if( ! (iid.isMainsplit() || iid.isNssplit())) |
89 | 89 | return null; |
90 | 90 | try { |
91 | 91 | SearcherCache cache = SearcherCache.getInstance(); |
— | — | @@ -150,40 +150,43 @@ |
151 | 151 | |
152 | 152 | WikiSearcher searcher = new WikiSearcher(iid); |
153 | 153 | TopDocs hits=null; |
154 | | - // mainpart special case |
155 | | - if(nsfw!=null && iid.isMainsplit() && nsfw.getFilter().cardinality()==1 && nsfw.getFilter().contains(0)){ |
156 | | - String host = searcher.getMainPartHost(); |
157 | | - if(host == null){ |
158 | | - res = new SearchResults(); |
159 | | - res.setErrorMsg("Error contacting searcher for mainpart of the index."); |
160 | | - log.error("Error contacting searcher for mainpart of the index."); |
161 | | - return res; |
| 154 | + // see if we can search only part of the index |
| 155 | + if(nsfw!=null && (iid.isMainPart() || iid.isNssplit())){ |
| 156 | + String part = null; |
| 157 | + for(NamespaceFilter f : nsfw.getFilter().decompose()){ |
| 158 | + if(part == null) |
| 159 | + part = iid.getPartByNamespace(f.getNamespace()).toString(); |
| 160 | + else{ |
| 161 | + if(!part.equals(iid.getPartByNamespace(f.getNamespace()).toString())){ |
| 162 | + part = null; // namespace filter wants to search more than one index parts |
| 163 | + break; |
| 164 | + } |
| 165 | + } |
| 166 | + } |
| 167 | + if(part!=null){ |
| 168 | + IndexId piid = IndexId.get(part); |
| 169 | + String host = searcher.getHost(piid); |
| 170 | + if(host == null){ |
| 171 | + res = new SearchResults(); |
| 172 | + res.setErrorMsg("Error contacting searcher for "+part); |
| 173 | + log.error("Error contacting searcher for "+part); |
| 174 | + return res; |
| 175 | + } |
| 176 | + RMIMessengerClient messenger = new RMIMessengerClient(); |
| 177 | + return messenger.searchPart(piid,q,nsfw,offset,limit,explain,host); |
162 | 178 | } |
163 | | - RMIMessengerClient messenger = new RMIMessengerClient(); |
164 | | - return messenger.searchPart(iid.getMainPart(),q,null,offset,limit,explain,host); |
165 | | - // restpart special case |
166 | | - } else if(nsfw!=null && iid.isMainsplit() && !nsfw.getFilter().contains(0)){ |
167 | | - String host = searcher.getRestPartHost(); |
168 | | - if(host == null){ |
169 | | - res = new SearchResults(); |
170 | | - res.setErrorMsg("Error contacting searcher for restpart of the index."); |
171 | | - log.error("Error contacting searcher for restpart of the index."); |
172 | | - return res; |
173 | | - } |
174 | | - RMIMessengerClient messenger = new RMIMessengerClient(); |
175 | | - return messenger.searchPart(iid.getRestPart(),q,nsfw,offset,limit,explain,host); |
176 | | - } else{ // normal search |
177 | | - try{ |
178 | | - hits = searcher.search(q,nsfw,offset+limit); |
179 | | - res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
180 | | - return res; |
181 | | - } catch(Exception e){ |
182 | | - e.printStackTrace(); |
183 | | - res = new SearchResults(); |
184 | | - res.retry(); |
185 | | - log.warn("Retry, temportal error for query: ["+q+"] on "+iid); |
186 | | - return res; |
187 | | - } |
| 179 | + } |
| 180 | + // normal search |
| 181 | + try{ |
| 182 | + hits = searcher.search(q,nsfw,offset+limit); |
| 183 | + res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain); |
| 184 | + return res; |
| 185 | + } catch(Exception e){ |
| 186 | + e.printStackTrace(); |
| 187 | + res = new SearchResults(); |
| 188 | + res.retry(); |
| 189 | + log.warn("Retry, temportal error for query: ["+q+"] on "+iid); |
| 190 | + return res; |
188 | 191 | } |
189 | 192 | } catch(ParseException e){ |
190 | 193 | res = new SearchResults(); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java |
— | — | @@ -363,23 +363,19 @@ |
364 | 364 | |
365 | 365 | if( iid.isSingle() ){ |
366 | 366 | enqueueLocally(record); |
367 | | - } else if( iid.isMainsplit() ){ |
| 367 | + } else if( iid.isMainsplit() || iid.isNssplit()){ |
368 | 368 | IndexId piid; |
369 | 369 | Article ar = record.getArticle(); |
370 | | - // deletion when we have only page_id needs to be sent to both parts, |
| 370 | + // deletion when we have only page_id needs to be sent to all parts, |
371 | 371 | // because we don't have namespace info |
372 | 372 | if(record.isDelete() && ar.getTitle().equals("")){ |
373 | | - IndexUpdateRecord rec1 = (IndexUpdateRecord) record.clone(); |
374 | | - IndexUpdateRecord rec2 = (IndexUpdateRecord) record.clone(); |
375 | | - rec1.setIndexId(iid.getMainPart()); |
376 | | - rec2.setIndexId(iid.getRestPart()); |
377 | | - enqueueRemotely(rec1.getIndexId().getIndexHost(),rec1); |
378 | | - enqueueRemotely(rec2.getIndexId().getIndexHost(),rec2); |
| 373 | + for(String dbrole : iid.getSplitParts()){ |
| 374 | + IndexUpdateRecord recp = (IndexUpdateRecord) record.clone(); |
| 375 | + recp.setIndexId(IndexId.get(dbrole)); |
| 376 | + enqueueRemotely(recp.getIndexId().getIndexHost(),recp); |
| 377 | + } |
379 | 378 | } else{ |
380 | | - if( ar.getNamespace().equals("0") ) |
381 | | - piid = iid.getMainPart(); |
382 | | - else |
383 | | - piid = iid.getRestPart(); |
| 379 | + piid = iid.getPartByNamespace(ar.getNamespace()); |
384 | 380 | // set recipient to new host |
385 | 381 | record.setIndexId(piid); |
386 | 382 | enqueueRemotely(piid.getIndexHost(),record); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java |
— | — | @@ -36,7 +36,8 @@ |
37 | 37 | return f; |
38 | 38 | } |
39 | 39 | } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){ |
40 | | - float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
| 40 | + //float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens)); |
| 41 | + float f = (float) (1.0 / numTokens); |
41 | 42 | //log.debug("Length-norm: "+f+", numtokens: "+numTokens); |
42 | 43 | return f; |
43 | 44 | } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){ |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java |
— | — | @@ -333,8 +333,8 @@ |
334 | 334 | return tokens; // already parsed |
335 | 335 | |
336 | 336 | // before starting, make sure this is not a redirect |
337 | | - if(isRedirect()) |
338 | | - return tokens; |
| 337 | + //if(isRedirect()) |
| 338 | + // return tokens; |
339 | 339 | |
340 | 340 | for(cur = 0; cur < textLength; cur++ ){ |
341 | 341 | c = text[cur]; |
— | — | @@ -514,7 +514,7 @@ |
515 | 515 | |
516 | 516 | switch(fetch){ |
517 | 517 | case WORD: |
518 | | - addToken(); |
| 518 | + // don't add token to get syntax like [[bean]]s |
519 | 519 | continue; |
520 | 520 | case CATEGORY: |
521 | 521 | categories.add(new String(buffer,0,length)); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -203,10 +203,13 @@ |
204 | 204 | return null; |
205 | 205 | } |
206 | 206 | |
| 207 | + /** If text redirects to some page, get that page's title object */ |
207 | 208 | public static Title getRedirectTitle(String text, String lang){ |
208 | 209 | String full = getRedirectTarget(text,lang); |
209 | 210 | if(full == null) |
210 | 211 | return null; |
| 212 | + if(full.startsWith(":")) |
| 213 | + full = full.substring(1); |
211 | 214 | String[] parts = full.split(":",2); |
212 | 215 | if(parts.length == 2){ |
213 | 216 | String ns = parts[0].toLowerCase(); |
Index: trunk/lucene-search-2.0/sql/references_table.sql |
— | — | @@ -0,0 +1,15 @@ |
| 2 | +-- |
| 3 | +-- Table with cached information about references to a page |
| 4 | +-- |
| 5 | +CREATE TABLE /*DBprefix*/references ( |
| 6 | + -- key in form <ns>:<title> |
| 7 | + rf_key varchar(255) binary NOT NULL, |
| 8 | + |
| 9 | + -- number of page links to this page |
| 10 | + rf_references int(10) unsigned NOT NULL, |
| 11 | + |
| 12 | + -- |
| 13 | + PRIMARY KEY rf_key(rf_key) |
| 14 | + |
| 15 | +) TYPE=InnoDB; |
| 16 | + |
Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -9,15 +9,16 @@ |
10 | 10 | # warmup <numberOfQueries> |
11 | 11 | # databases can be written as {url}, where url contains list of dbs |
12 | 12 | [Database] |
13 | | -wikilucene : (single) (language,en) (warmup,0) |
| 13 | +#wikilucene : (single) (language,en) (warmup,0) |
14 | 14 | wikidev : (single) (language,sr) |
| 15 | +wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[]) |
15 | 16 | |
16 | 17 | # Search groups |
17 | 18 | # Index parts of a split index are always taken from the node's group |
18 | 19 | # host : db1.part db2.part |
19 | 20 | # Multiple hosts can search multiple dbs (N-N mapping) |
20 | 21 | [Search-Group] |
21 | | -oblak : wikilucene wikidev |
| 22 | +oblak : wikilucene wikidev wikilucene.nspart1 wikilucene.nspart2 wikilucene.nspart3 |
22 | 23 | |
23 | 24 | # Index nodes |
24 | 25 | # host: db1.part db2.part |