r22920 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r22919‎ | r22920 | r22921 >
Date:00:00, 12 June 2007
Author:rainman
Status:old
Tags:
Comment:
New: split index by any combination of namespaces.
Added sql that I forgot to commit last time.
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/sql (added) (history)
  • /trunk/lucene-search-2.0/sql/references_table.sql (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/WikiSearcher.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)
  • /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
@@ -13,6 +13,7 @@
1414 detest,rutest : (single,true,2,10)
1515 frtest : (split,3) (part1) (part2) (part3)
1616 srwiki : (single)
 17+njawiki : (nssplit,3) (nspart1,[0,1],false,5) (nspart2,[12,13,14,15]) (nspart3,[])
1718
1819 # Search nodes
1920 # host : db1.role, db2.role
@@ -35,7 +36,7 @@
3637 192.168.0.5 : detest, rutest, frtest
3738 192.168.0.2 : entest.ngram
3839 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3
39 -192.168.0.10 : srwiki
 40+192.168.0.10 : srwiki njawiki
4041
4142 # Path where indexes are on hosts, after default value put hosts where
4243 # the location differs
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -106,6 +106,10 @@
107107 assertNotNull(splitroles.get("part2"));
108108 assertNotNull(splitroles.get("part3"));
109109
 110+ Hashtable nspart1 = (Hashtable) ((Hashtable) database.get("njawiki")).get("nspart1");
 111+ assertEquals("false",nspart1.get("optimize"));
 112+ assertEquals("5",nspart1.get("mergeFactor"));
 113+
110114 // search
111115 Hashtable search = testgc.getSearch();
112116 ArrayList sr = (ArrayList) search.get("192.168.0.2");
@@ -170,6 +174,8 @@
171175 assertTrue(testgc.useKeywordScoring("rutest"));
172176
173177
 178+
 179+
174180 } catch (MalformedURLException e) {
175181 e.printStackTrace();
176182 } catch (IOException e) {
@@ -223,6 +229,22 @@
224230 IndexId detest = IndexId.get("detest");
225231 assertFalse(detest.isLogical());
226232
 233+ // check nssplit
 234+ IndexId njawiki = IndexId.get("njawiki");
 235+ assertTrue(njawiki.isLogical());
 236+ assertFalse(njawiki.isSplit());
 237+ assertTrue(njawiki.isNssplit());
 238+ assertEquals(3,njawiki.getSplitFactor());
 239+ assertEquals("njawiki.nspart3",njawiki.getPartByNamespace("4").toString());
 240+ assertEquals("njawiki.nspart1",njawiki.getPartByNamespace("0").toString());
 241+ assertEquals("njawiki.nspart2",njawiki.getPartByNamespace("12").toString());
227242
 243+ IndexId njawiki2 = IndexId.get("njawiki.nspart2");
 244+ assertFalse(njawiki2.isLogical());
 245+ assertFalse(njawiki2.isSplit());
 246+ assertTrue(njawiki2.isNssplit());
 247+ assertEquals(3,njawiki2.getSplitFactor());
 248+ assertEquals(2,njawiki2.getPartNum());
 249+
228250 }
229251 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/FastWikiTokenizerTest.java
@@ -80,6 +80,8 @@
8181 showTokens(text);
8282 text = "This is <!-- Unclosed";
8383 showTokens(text);
 84+ text = "This are [[bean]]s and more [[bla]]njah also Großmann";
 85+ showTokens(text);
8486 text = "[[Category:Blah Blah?!]], and [[:Category:Link to something]]";
8587 showTokens(text);
8688 text = "[[sr:Glavna stranica]], and [[:Category:Link to category]]";
@@ -92,7 +94,7 @@
9395 showTokens(text);
9496 text = "[[First]] second third fourth and so on goes the ... [[last link]]";
9597 showTokens(text);
96 - text = "{{Something| param = {{another}}[[First]] } }} }} }} {{name| [[many]] many many tokens }} second third fourth and so on goes the ... [[good keyword]]";
 98+ text = "{{Something| param = {{another}}[[First]] } }} }} }} [[first good]]s {{name| [[many]] many many tokens }} second third fourth and so on goes the ... [[good keyword]]";
9799 showTokens(text);
98100
99101 if(true)
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -141,7 +141,7 @@
142142 if(iid.isMainsplit()){
143143 IndexThread.makeIndexSnapshot(iid.getMainPart(),iid.getMainPart().getImportPath());
144144 IndexThread.makeIndexSnapshot(iid.getRestPart(),iid.getRestPart().getImportPath());
145 - } else if(iid.isSplit()){
 145+ } else if(iid.isSplit() || iid.isNssplit()){
146146 for(String part : iid.getSplitParts()){
147147 IndexId iidp = IndexId.get(part);
148148 IndexThread.makeIndexSnapshot(iidp,iidp.getImportPath());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -48,7 +48,7 @@
4949 else if(iid.isMainsplit()){
5050 indexes.put(iid.getMainPart().toString(),openIndex(iid.getMainPart()));
5151 indexes.put(iid.getRestPart().toString(),openIndex(iid.getRestPart()));
52 - } else if(iid.isSplit()){
 52+ } else if(iid.isSplit() || iid.isNssplit()){
5353 for(String dbpart : iid.getSplitParts()){
5454 indexes.put(IndexId.get(dbpart).toString(),openIndex(IndexId.get(dbpart)));
5555 }
@@ -98,8 +98,8 @@
9999 IndexId target;
100100 if(iid.isSingle())
101101 target = iid;
102 - else if(iid.isMainsplit()) // assign according to namespace
103 - target = (a.getNamespace().equals("0"))? iid.getMainPart() : iid.getRestPart();
 102+ else if(iid.isMainsplit() || iid.isNssplit()) // assign according to namespace
 103+ target = iid.getPartByNamespace(a.getNamespace());
104104 else // split index, randomly assign to some index part
105105 target = iid.getPart(1+(int)(Math.random()*iid.getSplitFactor()));
106106
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java
@@ -58,13 +58,19 @@
5959 /** If true, this machine is an indexer for this index */
6060 protected boolean myIndex;
6161
62 - protected enum IndexType { SINGLE, MAINSPLIT, SPLIT };
 62+ protected enum IndexType { SINGLE, MAINSPLIT, SPLIT, NSSPLIT };
6363
6464 /** Type of index, enumeration */
6565 protected IndexType type;
6666 /** Part number in split representation, e.g. 1..N */
6767 protected int partNum;
6868
 69+ /** Namespace -> part (for nssplit indexes) */
 70+ protected Hashtable<String,String> nssplitMap;
 71+
 72+ /** Set of namespaces for this nssplit part */
 73+ protected HashSet<String> namespaceSet;
 74+
6975 /** All parameters as they appear in the global conf, e.g. merge factor, optimize, etc.. */
7076 protected Hashtable<String,String> params;
7177
@@ -146,6 +152,8 @@
147153 this.type = IndexType.MAINSPLIT;
148154 else if(type.equals("split"))
149155 this.type = IndexType.SPLIT;
 156+ else if(type.equals("nssplit"))
 157+ this.type = IndexType.NSSPLIT;
150158
151159 // parts
152160 String[] parts = dbrole.split("\\.");
@@ -177,9 +185,22 @@
178186 partNum = Integer.parseInt(part.substring(4));
179187 else
180188 partNum = 0;
 189+ } else if(this.type == IndexType.NSSPLIT){
 190+ splitFactor = Integer.parseInt(typeParams.get("number"));
 191+ splitParts = new String[splitFactor];
 192+ for(int i=0;i<splitFactor;i++)
 193+ splitParts[i] = dbname+".nspart"+(i+1);
 194+ if(part!=null){
 195+ partNum = Integer.parseInt(part.substring(6));
 196+ namespaceSet = new HashSet<String>();
 197+ String[] nss = params.get("namespaces").split(",");
 198+ for(String ns : nss)
 199+ namespaceSet.add(ns.trim());
 200+ } else
 201+ partNum = 0;
181202 }
182203 // for split/mainsplit the main iid is logical, it doesn't have local path
183 - if(myIndex && !(part == null && (this.type==IndexType.SPLIT || this.type==IndexType.MAINSPLIT))){
 204+ if(myIndex && !(part == null && (this.type==IndexType.SPLIT || this.type==IndexType.MAINSPLIT || this.type==IndexType.NSSPLIT))){
184205 indexPath = localIndexPath + "index" + sep + dbrole;
185206 importPath = localIndexPath + "import" + sep + dbrole;
186207 snapshotPath = localIndexPath + "snapshot" + sep + dbrole;
@@ -219,9 +240,13 @@
220241 public boolean isSplit(){
221242 return type == IndexType.SPLIT;
222243 }
 244+ /** If type of this index is nssplit */
 245+ public boolean isNssplit(){
 246+ return type == IndexType.NSSPLIT;
 247+ }
223248 /** If this is a split index, returns the current part number, e.g. for entest.part4 will return 4 */
224249 public int getPartNum() {
225 - if(type == IndexType.SPLIT)
 250+ if(type == IndexType.SPLIT || type == IndexType.NSSPLIT || type == IndexType.MAINSPLIT)
226251 return partNum;
227252 else{
228253 log.error("Called getPartNum() on non-split object! Probably a bug in the code.");
@@ -414,14 +439,50 @@
415440 HashSet<String> ret = new HashSet<String>();
416441 if(isSingle())
417442 ret.add(dbrole);
418 - else if(isMainsplit() || isSplit()){
 443+ else if(isMainsplit() || isSplit() || isNssplit()){
419444 for(String p : splitParts)
420445 ret.add(p);
421446 }
422447
423448 return ret;
424449 }
 450+
 451+ /** Rebuild namespace map from information, call only when sure that iid's for all parts are constructed.
 452+ * Note: always call on main iid, not parts */
 453+ public void rebuildNsMap(Hashtable<String,IndexId> pool) {
 454+ if(isNssplit() && part==null){
 455+ // rebuild
 456+ nssplitMap = new Hashtable<String,String>();
 457+ for(String part : splitParts){
 458+ for(String ns : pool.get(part).namespaceSet){
 459+ nssplitMap.put(ns,part);
 460+ }
 461+ }
 462+ // set on all parts as well
 463+ for(String part : splitParts){
 464+ pool.get(part).nssplitMap = nssplitMap;
 465+ }
 466+ }
 467+ }
425468
 469+ public IndexId getPartByNamespace(int ns){
 470+ return getPartByNamespace(Integer.toString(ns));
 471+ }
426472
 473+ /** If this is nssplit/mainsplit index, get part with certain namespace */
 474+ public IndexId getPartByNamespace(String ns){
 475+ if(isNssplit()){
 476+ String dbrole = nssplitMap.get(ns);
 477+ if(dbrole == null)
 478+ dbrole = nssplitMap.get("<default>");
 479+ return get(dbrole);
 480+ } else if(isMainsplit()){
 481+ if(ns.equals("0"))
 482+ return getMainPart();
 483+ else
 484+ return getRestPart();
 485+ } else
 486+ return null;
 487+ }
427488
428489 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -160,7 +160,7 @@
161161 for(String typeid : database.get(dbname).keySet()){
162162 String type = "";
163163 String dbrole = "";
164 - if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split")){
 164+ if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit")){
165165 type = typeid;
166166 dbrole = dbname;
167167 } else if(typeid.equals("mainpart") || typeid.equals("restpart")){
@@ -169,6 +169,9 @@
170170 } else if(typeid.matches("part[1-9][0-9]*")){
171171 type = "split";
172172 dbrole = dbname + "." + typeid;
 173+ } else if(typeid.matches("nspart[1-9][0-9]*")){
 174+ type = "nssplit";
 175+ dbrole = dbname + "." + typeid;
173176 } else
174177 continue; // unknown type, skip
175178
@@ -404,7 +407,7 @@
405408 for(String typeid : database.get(dbname).keySet()){
406409 String type = "";
407410 String dbrole = "";
408 - if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split")){
 411+ if(typeid.equals("single") || typeid.equals("mainsplit") || typeid.equals("split") || typeid.equals("nssplit")){
409412 type = typeid;
410413 dbrole = dbname;
411414 } else if(typeid.equals("mainpart") || typeid.equals("restpart")){
@@ -413,6 +416,9 @@
414417 } else if(typeid.matches("part[1-9][0-9]*")){
415418 type = "split";
416419 dbrole = dbname + "." + typeid;
 420+ } else if(typeid.matches("nspart[1-9][0-9]*")){
 421+ type = "nssplit";
 422+ dbrole = dbname + "." + typeid;
417423 } else
418424 continue; // unknown type, skip
419425
@@ -452,6 +458,8 @@
453459 oairepo);
454460 indexIdPool.put(dbrole,iid);
455461 }
 462+ if(indexIdPool.get(dbname).isNssplit())
 463+ indexIdPool.get(dbname).rebuildNsMap(indexIdPool);
456464 }
457465
458466 }
@@ -628,7 +636,7 @@
629637 } else if(type.equals("mainsplit")){
630638 // currently no params
631639 dbroles.put(type,params);
632 - } else if(type.equals("split")){
 640+ } else if(type.equals("split") || type.equals("nssplit")){
633641 if(tokens.length>1) // number of segments
634642 params.put("number",tokens[1]);
635643 else{
@@ -656,6 +664,27 @@
657665
658666 dbroles.put(type,params);
659667
 668+ } else if(type.matches("nspart[1-9][0-9]*")){
 669+ // [0,1,2] syntax gets split up in first split, retokenize
 670+ String ns = role.substring(role.indexOf(",")+1,role.lastIndexOf("]")+1).trim();
 671+ tokens = role.substring(role.lastIndexOf("]")+1).split(",");
 672+ // definition of namespaces, e.g. [0,1,2]
 673+ if(ns.length() > 2 && ns.startsWith("[") && ns.endsWith("]"))
 674+ ns = ns.substring(1,ns.length()-1);
 675+ else
 676+ ns = "<default>";
 677+ params.put("namespaces",ns);
 678+
 679+ // all params are optional, if absent default will be used
 680+ if(tokens.length>1)
 681+ params.put("optimize",tokens[1].trim().toLowerCase());
 682+ if(tokens.length>2)
 683+ params.put("mergeFactor",tokens[2]);
 684+ if(tokens.length>3)
 685+ params.put("maxBufDocs", tokens[3]);
 686+
 687+ dbroles.put(type,params);
 688+
660689 } else{
661690 System.out.println("Warning: Unrecognized role \""+role+"\".Ignoring.");
662691 }
@@ -670,10 +699,10 @@
671700 dbr = new Hashtable<String, Hashtable<String, String>>();
672701 database.put(db,dbr);
673702 }
674 - if(type.equals("split") || type.equals("mainsplit") || type.equals("single")){
675 - if(dbr.get("split")!=null || dbr.get("mainsplit")!=null || dbr.get("single")!=null){
 703+ if(type.equals("split") || type.equals("mainsplit") || type.equals("single") || type.equals("nssplit")){
 704+ if(dbr.get("split")!=null || dbr.get("mainsplit")!=null || dbr.get("single")!=null || dbr.get("nssplit")!=null){
676705 System.out.println("WARNING: in Global Configuration: defined new architecture "+type+" for "+db);
677 - dbr.remove("split"); dbr.remove("mainsplit"); dbr.remove("single");
 706+ dbr.remove("split"); dbr.remove("mainsplit"); dbr.remove("single"); dbr.remove("nssplit");
678707 }
679708 }
680709 if(dbr.get(type)!=null)
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/WikiSearcher.java
@@ -3,6 +3,7 @@
44 import java.io.IOException;
55 import java.util.ArrayList;
66 import java.util.Arrays;
 7+import java.util.Hashtable;
78
89 import org.apache.log4j.Logger;
910 import org.apache.lucene.document.Document;
@@ -40,7 +41,8 @@
4142 static org.apache.log4j.Logger log = Logger.getLogger(WikiSearcher.class);
4243 protected SearchableMul searcher;
4344 protected SearcherCache cache;
44 - protected Searchable mainpart,restpart;
 45+ /** parts of the multisearcher, dbrole -> searchable */
 46+ protected Hashtable<String,Searchable> searcherParts = new Hashtable<String,Searchable>();
4547 protected MultiSearcherMul ms = null;
4648
4749 public static final boolean INVALIDATE_CACHE = true;
@@ -62,12 +64,8 @@
6365
6466 if(s != null){
6567 ss.add(s);
66 - if(iid.isMainPart())
67 - mainpart = s;
68 - else if(iid.isRestPart())
69 - restpart = s;
70 - }
71 - else
 68+ searcherParts.put(iid.toString(),s);
 69+ } else
7270 log.warn("Cannot get a search index (nor local or remote) for "+iid);
7371 }
7472 if(ss.size() == 0)
@@ -79,7 +77,6 @@
8078 /** New object from cache */
8179 public WikiSearcher(IndexId iid) throws Exception {
8280 cache = SearcherCache.getInstance();
83 - mainpart = null; restpart = null;
8481
8582 if(iid.isSingle()){ // is always local
8683 searcher = cache.getLocalSearcher(iid);
@@ -91,7 +88,7 @@
9289
9390 ms = makeMultiSearcher(parts);
9491 searcher = ms;
95 - } else if(iid.isSplit()){
 92+ } else if(iid.isSplit() || iid.isNssplit()){
9693 ArrayList<IndexId> parts = new ArrayList<IndexId>();
9794 for(int i=1; i<=iid.getSplitFactor(); i++){
9895 parts.add(iid.getPart(i));
@@ -105,21 +102,16 @@
106103
107104 cache.checkout(searcher);
108105 }
109 -
110 - public String getMainPartHost(){
111 - if(mainpart == null)
 106+
 107+ /** Get host for the iid within this multi searcher */
 108+ public String getHost(IndexId iid){
 109+ Searchable s = searcherParts.get(iid.toString());
 110+ if(s == null)
112111 return null;
113112 else
114 - return cache.getSearchableHost(mainpart);
 113+ return cache.getSearchableHost(s);
115114 }
116 -
117 - public String getRestPartHost(){
118 - if(restpart == null)
119 - return null;
120 - else
121 - return cache.getSearchableHost(restpart);
122 - }
123 -
 115+
124116 @Override
125117 public void close() throws IOException {
126118 cache.release(searcher);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -57,7 +57,7 @@
5858 SearchResults res = search(iid, searchterm, offset, limit, namespaces, what.equals("explain"));
5959 if(res!=null && res.isRetry()){
6060 int retries = 0;
61 - if(iid.isSplit()){
 61+ if(iid.isSplit() || iid.isNssplit()){
6262 retries = iid.getSplitFactor()-2;
6363 } else if(iid.isMainsplit())
6464 retries = 1;
@@ -84,7 +84,7 @@
8585
8686 /** Search mainpart or restpart of the split index */
8787 public SearchResults searchPart(IndexId iid, Query q, NamespaceFilterWrapper filter, int offset, int limit, boolean explain){
88 - if( ! iid.isMainsplit())
 88+ if( ! (iid.isMainsplit() || iid.isNssplit()))
8989 return null;
9090 try {
9191 SearcherCache cache = SearcherCache.getInstance();
@@ -150,40 +150,43 @@
151151
152152 WikiSearcher searcher = new WikiSearcher(iid);
153153 TopDocs hits=null;
154 - // mainpart special case
155 - if(nsfw!=null && iid.isMainsplit() && nsfw.getFilter().cardinality()==1 && nsfw.getFilter().contains(0)){
156 - String host = searcher.getMainPartHost();
157 - if(host == null){
158 - res = new SearchResults();
159 - res.setErrorMsg("Error contacting searcher for mainpart of the index.");
160 - log.error("Error contacting searcher for mainpart of the index.");
161 - return res;
 154+ // see if we can search only part of the index
 155+ if(nsfw!=null && (iid.isMainPart() || iid.isNssplit())){
 156+ String part = null;
 157+ for(NamespaceFilter f : nsfw.getFilter().decompose()){
 158+ if(part == null)
 159+ part = iid.getPartByNamespace(f.getNamespace()).toString();
 160+ else{
 161+ if(!part.equals(iid.getPartByNamespace(f.getNamespace()).toString())){
 162+ part = null; // namespace filter wants to search more than one index parts
 163+ break;
 164+ }
 165+ }
 166+ }
 167+ if(part!=null){
 168+ IndexId piid = IndexId.get(part);
 169+ String host = searcher.getHost(piid);
 170+ if(host == null){
 171+ res = new SearchResults();
 172+ res.setErrorMsg("Error contacting searcher for "+part);
 173+ log.error("Error contacting searcher for "+part);
 174+ return res;
 175+ }
 176+ RMIMessengerClient messenger = new RMIMessengerClient();
 177+ return messenger.searchPart(piid,q,nsfw,offset,limit,explain,host);
162178 }
163 - RMIMessengerClient messenger = new RMIMessengerClient();
164 - return messenger.searchPart(iid.getMainPart(),q,null,offset,limit,explain,host);
165 - // restpart special case
166 - } else if(nsfw!=null && iid.isMainsplit() && !nsfw.getFilter().contains(0)){
167 - String host = searcher.getRestPartHost();
168 - if(host == null){
169 - res = new SearchResults();
170 - res.setErrorMsg("Error contacting searcher for restpart of the index.");
171 - log.error("Error contacting searcher for restpart of the index.");
172 - return res;
173 - }
174 - RMIMessengerClient messenger = new RMIMessengerClient();
175 - return messenger.searchPart(iid.getRestPart(),q,nsfw,offset,limit,explain,host);
176 - } else{ // normal search
177 - try{
178 - hits = searcher.search(q,nsfw,offset+limit);
179 - res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
180 - return res;
181 - } catch(Exception e){
182 - e.printStackTrace();
183 - res = new SearchResults();
184 - res.retry();
185 - log.warn("Retry, temportal error for query: ["+q+"] on "+iid);
186 - return res;
187 - }
 179+ }
 180+ // normal search
 181+ try{
 182+ hits = searcher.search(q,nsfw,offset+limit);
 183+ res = makeSearchResults(searcher,hits,offset,limit,iid,searchterm,q,searchStart,explain);
 184+ return res;
 185+ } catch(Exception e){
 186+ e.printStackTrace();
 187+ res = new SearchResults();
 188+ res.retry();
 189+ log.warn("Retry, temportal error for query: ["+q+"] on "+iid);
 190+ return res;
188191 }
189192 } catch(ParseException e){
190193 res = new SearchResults();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -363,23 +363,19 @@
364364
365365 if( iid.isSingle() ){
366366 enqueueLocally(record);
367 - } else if( iid.isMainsplit() ){
 367+ } else if( iid.isMainsplit() || iid.isNssplit()){
368368 IndexId piid;
369369 Article ar = record.getArticle();
370 - // deletion when we have only page_id needs to be sent to both parts,
 370+ // deletion when we have only page_id needs to be sent to all parts,
371371 // because we don't have namespace info
372372 if(record.isDelete() && ar.getTitle().equals("")){
373 - IndexUpdateRecord rec1 = (IndexUpdateRecord) record.clone();
374 - IndexUpdateRecord rec2 = (IndexUpdateRecord) record.clone();
375 - rec1.setIndexId(iid.getMainPart());
376 - rec2.setIndexId(iid.getRestPart());
377 - enqueueRemotely(rec1.getIndexId().getIndexHost(),rec1);
378 - enqueueRemotely(rec2.getIndexId().getIndexHost(),rec2);
 373+ for(String dbrole : iid.getSplitParts()){
 374+ IndexUpdateRecord recp = (IndexUpdateRecord) record.clone();
 375+ recp.setIndexId(IndexId.get(dbrole));
 376+ enqueueRemotely(recp.getIndexId().getIndexHost(),recp);
 377+ }
379378 } else{
380 - if( ar.getNamespace().equals("0") )
381 - piid = iid.getMainPart();
382 - else
383 - piid = iid.getRestPart();
 379+ piid = iid.getPartByNamespace(ar.getNamespace());
384380 // set recipient to new host
385381 record.setIndexId(piid);
386382 enqueueRemotely(piid.getIndexHost(),record);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiSimilarity.java
@@ -36,7 +36,8 @@
3737 return f;
3838 }
3939 } else if(fieldName.equals("title") || fieldName.equals("stemtitle") || fieldName.startsWith("alttitle")){
40 - float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
 40+ //float f = (float) (1.0 / (Math.sqrt(numTokens) * numTokens));
 41+ float f = (float) (1.0 / numTokens);
4142 //log.debug("Length-norm: "+f+", numtokens: "+numTokens);
4243 return f;
4344 } else if(fieldName.startsWith("redirect") || fieldName.startsWith("keyword")){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/FastWikiTokenizerEngine.java
@@ -333,8 +333,8 @@
334334 return tokens; // already parsed
335335
336336 // before starting, make sure this is not a redirect
337 - if(isRedirect())
338 - return tokens;
 337+ //if(isRedirect())
 338+ // return tokens;
339339
340340 for(cur = 0; cur < textLength; cur++ ){
341341 c = text[cur];
@@ -514,7 +514,7 @@
515515
516516 switch(fetch){
517517 case WORD:
518 - addToken();
 518+ // don't add token to get syntax like [[bean]]s
519519 continue;
520520 case CATEGORY:
521521 categories.add(new String(buffer,0,length));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -203,10 +203,13 @@
204204 return null;
205205 }
206206
 207+ /** If text redirects to some page, get that page's title object */
207208 public static Title getRedirectTitle(String text, String lang){
208209 String full = getRedirectTarget(text,lang);
209210 if(full == null)
210211 return null;
 212+ if(full.startsWith(":"))
 213+ full = full.substring(1);
211214 String[] parts = full.split(":",2);
212215 if(parts.length == 2){
213216 String ns = parts[0].toLowerCase();
Index: trunk/lucene-search-2.0/sql/references_table.sql
@@ -0,0 +1,15 @@
 2+--
 3+-- Table with cached information about references to a page
 4+--
 5+CREATE TABLE /*DBprefix*/references (
 6+ -- key in form <ns>:<title>
 7+ rf_key varchar(255) binary NOT NULL,
 8+
 9+ -- number of page links to this page
 10+ rf_references int(10) unsigned NOT NULL,
 11+
 12+ --
 13+ PRIMARY KEY rf_key(rf_key)
 14+
 15+) TYPE=InnoDB;
 16+
Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -9,15 +9,16 @@
1010 # warmup <numberOfQueries>
1111 # databases can be written as {url}, where url contains list of dbs
1212 [Database]
13 -wikilucene : (single) (language,en) (warmup,0)
 13+#wikilucene : (single) (language,en) (warmup,0)
1414 wikidev : (single) (language,sr)
 15+wikilucene : (nssplit,3) (nspart1,[0]) (nspart2,[4,5,12,13]), (nspart3,[])
1516
1617 # Search groups
1718 # Index parts of a split index are always taken from the node's group
1819 # host : db1.part db2.part
2020 # Multiple hosts can search multiple dbs (N-N mapping)
2021 [Search-Group]
21 -oblak : wikilucene wikidev
 22+oblak : wikilucene wikidev wikilucene.nspart1 wikilucene.nspart2 wikilucene.nspart3
2223
2324 # Index nodes
2425 # host: db1.part db2.part

Status & tagging log