r22014 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r22013 | r22014 | r22015 >
Date:22:06, 8 May 2007
Author:rainman
Status:old
Tags:
Comment:
Backward compatibility: wildcard queries and custom namespace filters.
The latter should be avoided since they introduce considerable overhead,
slowing down search 5-10 times.

Maybe remove the checkboxes to pick namespace on search page, since
namespace-based search can be done with prefixes which are fast.

Some minor tweaks to previous commit, typos, etc.
Modified paths:
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Collector.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceFilter.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearcherCache.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/WikiQueryParserTest.java
@@ -94,11 +94,11 @@
9595 assertTrue(fields.contains("contents"));
9696
9797 // namespace policies
98 - parser = new WikiQueryParser("contents","main",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE);
 98+ parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.IGNORE);
9999 q = parser.parseRaw("help:making breakfast category:food");
100100 assertEquals("+contents:making +contents:breakfast +category:food",q.toString());
101101
102 - parser = new WikiQueryParser("contents","main",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 102+ parser = new WikiQueryParser("contents","0",new SimpleAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
103103 q = parser.parseRaw("help:making breakfast category:food");
104104 assertEquals("+namespace:12 +(+contents:making +contents:breakfast +category:food)",q.toString());
105105
@@ -120,7 +120,7 @@
121121
122122 // ====== English Analyzer ========
123123
124 - parser = new WikiQueryParser("contents","main",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
 124+ parser = new WikiQueryParser("contents","0",new EnglishAnalyzer(), WikiQueryParser.NamespacePolicy.REWRITE);
125125 q = parser.parseRaw("main_talk:laziness");
126126 assertEquals("+namespace:1 +(contents:laziness contents:lazi^0.5)",q.toString());
127127
@@ -184,7 +184,7 @@
185185 // Tests with actual params :)
186186 // ==================================
187187 Analyzer analyzer = Analyzers.getSearcherAnalyzer("en");
188 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 188+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
189189 q = parser.parseTwoPass("beans everyone",null);
190190 assertEquals("(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5)) (+title:beans^2.0 +title:everyone^2.0)",q.toString());
191191
@@ -233,9 +233,21 @@
234234 q = parser.parseTwoPass("main:1991 category:\"olympic cities\" -all:1990",NamespacePolicy.REWRITE);
235235 assertEquals("(+(+namespace:0 +(+contents:1991 +category:\"olympic cities\")) -contents:1990) (+(+namespace:0 +(+title:1991^2.0 +category:\"olympic cities\")) -title:1990^2.0)",q.toString());
236236
 237+ q = parser.parseTwoPass("main:ba*",NamespacePolicy.IGNORE);
 238+ assertEquals("contents:ba* title:ba*^2.0",q.toString());
 239+
 240+ q = parser.parseTwoPass("main:ba* all:lele",NamespacePolicy.REWRITE);
 241+ assertEquals("(+(+namespace:0 +contents:ba*) +contents:lele) (+(+namespace:0 +title:ba*^2.0) +title:lele^2.0)",q.toString());
 242+
 243+ q = parser.parseTwoPass("main:ba*beans",NamespacePolicy.IGNORE);
 244+ assertEquals("(+contents:ba +(contents:beans contents:bean^0.5)) (+title:ba^2.0 +title:beans^2.0)",q.toString());
 245+
 246+ q = parser.parseTwoPass("*kuta",NamespacePolicy.IGNORE);
 247+ assertEquals("contents:kuta title:kuta^2.0",q.toString());
 248+
237249 // Localization tests
238250 analyzer = Analyzers.getSearcherAnalyzer("sr");
239 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 251+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
240252
241253 q = parser.parseTwoPass("all:добродошли на википедију",NamespacePolicy.IGNORE);
242254 assertEquals("(+(contents:добродошли contents:dobrodosli^0.5) +(contents:на contents:na^0.5) +(contents:википедију contents:vikipediju^0.5)) (+(title:добродошли^2.0 title:dobrodosli) +(title:на^2.0 title:na) +(title:википедију^2.0 title:vikipediju))",q.toString());
@@ -244,7 +256,7 @@
245257 assertEquals("(+contents:dobrodosli +contents:na +contents:sdjccz) (+title:dobrodosli^2.0 +title:na^2.0 +title:sdjccz^2.0)",q.toString());
246258
247259 analyzer = Analyzers.getSearcherAnalyzer("th");
248 - parser = new WikiQueryParser("contents","main",analyzer,NamespacePolicy.LEAVE);
 260+ parser = new WikiQueryParser("contents","0",analyzer,NamespacePolicy.LEAVE);
249261
250262 q = parser.parseTwoPass("ภาษาไทย",NamespacePolicy.IGNORE);
251263 assertEquals("(+contents:ภาษา +contents:ไทย) (+title:ภาษา^2.0 +title:ไทย^2.0)",q.toString());
@@ -252,6 +264,19 @@
253265 q = parser.parseTwoPass("help:ภาษาไทย",NamespacePolicy.REWRITE);
254266 assertEquals("(+namespace:12 +(+contents:ภาษา +contents:ไทย)) (+namespace:12 +(+title:ภาษา^2.0 +title:ไทย^2.0))",q.toString());
255267
 268+ // Backward compatibility for complex filters
 269+ analyzer = Analyzers.getSearcherAnalyzer("en");
 270+ parser = new WikiQueryParser("contents","0,1,4,12",analyzer,NamespacePolicy.IGNORE);
 271+
 272+ q = parser.parseTwoPass("beans everyone",NamespacePolicy.REWRITE);
 273+ assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5))) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0))",q.toString());
 274+
 275+ q = parser.parseTwoPass("beans main:everyone",NamespacePolicy.REWRITE);
 276+ assertEquals("((+(namespace:0 namespace:1 namespace:4 namespace:12) +(contents:beans contents:bean^0.5)) (+namespace:0 +(contents:everyone contents:everyon^0.5))) ((+(namespace:0 namespace:1 namespace:4 namespace:12) +title:beans^2.0) (+namespace:0 +title:everyone^2.0))",q.toString());
 277+
 278+ q = parser.parseTwoPass("beans everyone category:cheeses",NamespacePolicy.REWRITE);
 279+ assertEquals("(+(namespace:0 namespace:1 namespace:4 namespace:12) +(+(contents:beans contents:bean^0.5) +(contents:everyone contents:everyon^0.5) +category:cheeses)) (+(namespace:0 namespace:1 namespace:4 namespace:12) +(+title:beans^2.0 +title:everyone^2.0 +category:cheeses))",q.toString());
 280+
256281 } catch(Exception e){
257282 e.printStackTrace();
258283 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -9,6 +9,8 @@
1010 import org.mediawiki.importer.XmlDumpReader;
1111 import org.wikimedia.lsearch.config.Configuration;
1212 import org.wikimedia.lsearch.config.GlobalConfiguration;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+import org.wikimedia.lsearch.index.IndexThread;
1315 import org.wikimedia.lsearch.util.Localization;
1416 import org.wikimedia.lsearch.util.UnicodeDecomposer;
1517
@@ -30,7 +32,8 @@
3133 String dbname = null;
3234 Boolean optimize = null;
3335 Integer mergeFactor = null, maxBufDocs = null;
34 - boolean newIndex = false;
 36+ boolean newIndex = false, makeSnapshot = false;
 37+ boolean snapshotDb = false;
3538
3639 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
3740
@@ -38,13 +41,15 @@
3942 Logger log = Logger.getLogger(Importer.class);
4043
4144 if(args.length < 2){
42 - System.out.println("Syntax: java Importer [-n] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 45+ System.out.println("Syntax: java Importer [-n] [-s] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
4346 System.out.println("Options: ");
4447 System.out.println(" -n - create a new index (erase the old one if exists)");
 48+ System.out.println(" -s - make index snapshot when finished");
4549 System.out.println(" -l limit_num - add at most limit_num articles");
4650 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
4751 System.out.println(" -m mergeFactor - overrides param from global settings");
4852 System.out.println(" -b maxBufDocs - overrides param from global settings");
 53+ System.out.println(" --snapshot <db> - make snapshot only for dbname");
4954 return;
5055 }
5156 for(int i=0;i<args.length;i++){
@@ -58,52 +63,78 @@
5964 maxBufDocs = Integer.parseInt(args[++i]);
6065 else if(args[i].equals("-n"))
6166 newIndex = true;
62 - else if(inputfile == null)
 67+ else if(args[i].equals("-s"))
 68+ makeSnapshot = true;
 69+ else if(args[i].equals("--snapshot")){
 70+ dbname = args[++i];
 71+ snapshotDb = true;
 72+ break;
 73+ } else if(inputfile == null)
6374 inputfile = args[i];
6475 else if(dbname == null)
6576 dbname = args[i];
6677 else
6778 System.out.println("Unrecognized option: "+args[i]);
6879 }
69 -
70 - if(inputfile == null || dbname == null){
71 - System.out.println("Please specify both input xml file and database name");
72 - return;
73 - }
 80+ if(!snapshotDb){
 81+ if(inputfile == null || dbname == null){
 82+ System.out.println("Please specify both input xml file and database name");
 83+ return;
 84+ }
7485
75 - // preload
76 - UnicodeDecomposer.getInstance();
77 - Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
78 - Localization.loadInterwiki();
79 -
80 - long start = System.currentTimeMillis();
81 -
82 - // open
83 - InputStream input = null;
84 - try {
85 - input = Tools.openInputFile(inputfile);
86 - } catch (IOException e) {
87 - log.fatal("I/O error opening "+inputfile);
 86+ // preload
 87+ UnicodeDecomposer.getInstance();
 88+ Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
 89+ Localization.loadInterwiki();
 90+
 91+ long start = System.currentTimeMillis();
 92+
 93+ // open
 94+ InputStream input = null;
 95+ try {
 96+ input = Tools.openInputFile(inputfile);
 97+ } catch (IOException e) {
 98+ log.fatal("I/O error opening "+inputfile);
 99+ return;
 100+ }
 101+
 102+ // read
 103+ DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex);
 104+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
 105+ try {
 106+ reader.readDump();
 107+ } catch (IOException e) {
 108+ if(!e.getMessage().equals("stopped")){
 109+ log.fatal("I/O error reading dump for "+dbname+" from "+inputfile);
 110+ return;
 111+ }
 112+ }
 113+
 114+ long end = System.currentTimeMillis();
 115+
 116+ log.info("Closing/optimizing index...");
 117+ dp.closeIndex();
 118+
 119+ long finalEnd = System.currentTimeMillis();
 120+
 121+ System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
 122+ System.out.println("Total time: "+formatTime(finalEnd-start));
88123 }
89124
90 - // read
91 - DumpImporter dp = new DumpImporter(dbname,limit,optimize,mergeFactor,maxBufDocs,newIndex);
92 - XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
93 - try {
94 - reader.readDump();
95 - } catch (IOException e) {
96 - log.warn("I/O error reading dump for "+dbname+" from "+inputfile);
97 - }
98 -
99 - long end = System.currentTimeMillis();
100 -
101 - log.info("Closing/optimizing index...");
102 - dp.closeIndex();
103 -
104 - long finalEnd = System.currentTimeMillis();
105 -
106 - System.out.println("Finished indexing in "+formatTime(end-start)+", with final index optimization in "+formatTime(finalEnd-end));
107 - System.out.println("Total time: "+formatTime(finalEnd-start));
 125+ // make snapshot if needed
 126+ if(makeSnapshot || snapshotDb){
 127+ IndexId iid = IndexId.get(dbname);
 128+ if(iid.isMainsplit()){
 129+ IndexThread.makeIndexSnapshot(iid.getMainPart(),iid.getMainPart().getImportPath());
 130+ IndexThread.makeIndexSnapshot(iid.getRestPart(),iid.getRestPart().getImportPath());
 131+ } else if(iid.isSplit()){
 132+ for(String part : iid.getSplitParts()){
 133+ IndexId iidp = IndexId.get(part);
 134+ IndexThread.makeIndexSnapshot(iidp,iidp.getImportPath());
 135+ }
 136+ } else
 137+ IndexThread.makeIndexSnapshot(iid,iid.getImportPath());
 138+ }
108139 }
109140
110141 private static String formatTime(long l) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearcherCache.java
@@ -165,7 +165,7 @@
166166 for(IndexId iid : mys){
167167 try {
168168 IndexSearcherMul is = getLocalSearcher(iid);
169 - Warmup.warmupIndexSearcher(is,iid);
 169+ Warmup.warmupIndexSearcher(is,iid,false);
170170 } catch (IOException e) {
171171 log.warn("I/O error warming index for "+iid);
172172 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/NamespaceFilter.java
@@ -76,16 +76,16 @@
7777 return included.get(namespace);
7878 }
7979
 80+ public BitSet getIncluded() {
 81+ return included;
 82+ }
 83+
8084 public int cardinality(){
8185 return included.cardinality();
8286 }
8387
8488 public int getNamespace(){
85 - for(int i=0;i<included.size();i++){
86 - if(included.get(i))
87 - return i;
88 - }
89 - return Integer.MIN_VALUE;
 89+ return included.nextSetBit(0);
9090 }
9191
9292 @Override
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/UpdateThread.java
@@ -58,6 +58,7 @@
5959 for(LocalIndex li : forUpdate){
6060 log.debug("Syncing "+li.iid);
6161 rebuild(li); // rsync, update registry, cache
 62+ pending.remove(li.iid.toString());
6263 }
6364 }
6465 }
@@ -69,6 +70,8 @@
7071 protected long queryInterval;
7172 protected SearcherCache cache;
7273 protected long delayInterval;
 74+ /** Pending updates, dbrole -> timestamp */
 75+ protected Hashtable<String,Long> pending = new Hashtable<String,Long>();
7376
7477 protected static UpdateThread instance = null;
7578
@@ -115,6 +118,8 @@
116119
117120 for(int i = 0; i < hiids.size(); i++){
118121 IndexId iid = hiids.get(i);
 122+ if(pending.containsKey(iid.toString()))
 123+ continue; // pending update, ignore
119124 LocalIndex myli = registry.getCurrentSearch(iid);
120125 if(timestamps[i]!= 0 && (myli == null || myli.timestamp < timestamps[i])){
121126 LocalIndex li = new LocalIndex(
@@ -122,10 +127,12 @@
123128 iid.getUpdatePath(),
124129 timestamps[i]);
125130 forUpdate.add(li); // newer snapshot available
 131+ pending.put(iid.toString(),new Long(timestamps[i]));
126132 }
127133 }
128134 }
129 - new DeferredUpdate(forUpdate,delayInterval);
 135+ if(forUpdate.size()>0)
 136+ new DeferredUpdate(forUpdate,delayInterval).start();
130137 }
131138
132139 /** Rsync a remote snapshot to a local one, updates registry, cache */
@@ -165,19 +172,23 @@
166173 File ind = new File(iid.getCanonicalSearchPath());
167174
168175 if(ind.exists()){ // prepare a local hard-linked copy of index
169 - try {
170 - // cp -lr update/dbname/timestamp/* update/dbname/timestamp2/
171 - command = "/bin/cp -lr "+ind.getCanonicalPath()+sep+"*"+" "+updatepath+sep;
172 - log.debug("Running shell command: "+command);
173 - Runtime.getRuntime().exec(command).waitFor();
174 - } catch (Exception e) {
175 - log.error("Error making update hardlinked copy "+updatepath+": "+e.getMessage());
 176+ ind = ind.getCanonicalFile();
 177+ for(File f: ind.listFiles()){
 178+ // a cp -lr command for each file in the index
 179+ command = "/bin/cp -lr "+ind.getCanonicalPath()+sep+f.getName()+" "+updatepath+sep+f.getName();
 180+ try {
 181+ log.debug("Running shell command: "+command);
 182+ Runtime.getRuntime().exec(command).waitFor();
 183+ } catch (Exception e) {
 184+ log.error("Error making update hardlinked copy "+updatepath+": "+e.getMessage());
 185+ continue;
 186+ }
176187 }
177188 }
178189
179190 // rsync
180191 String snapshotpath = iid.getRsyncSnapshotPath()+"/"+li.timestamp;
181 - command = "/usr/bin/rsync --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
 192+ command = "/usr/bin/rsync -W --delete -r rsync://"+iid.getIndexHost()+":"+snapshotpath+" "+iid.getUpdatePath();
182193 log.debug("Running shell command: "+command);
183194 Runtime.getRuntime().exec(command).waitFor();
184195
@@ -218,7 +229,7 @@
219230 /** Update search cache after successful rsync of update version of index */
220231 protected void updateCache(IndexSearcherMul is, LocalIndex li){
221232 // do some typical queries to preload some lucene caches, pages into memory, etc..
222 - Warmup.warmupIndexSearcher(is,li.iid);
 233+ Warmup.warmupIndexSearcher(is,li.iid,true);
223234 // add to cache
224235 cache.invalidateLocalSearcher(li.iid,is);
225236 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/SearchEngine.java
@@ -107,7 +107,9 @@
108108 */
109109 public SearchResults search(IndexId iid, String searchterm, int offset, int limit, NamespaceFilter nsDefault){
110110 Analyzer analyzer = Analyzers.getSearcherAnalyzer(iid);
111 - WikiQueryParser parser = new WikiQueryParser("contents","main",analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
 111+ if(nsDefault == null || nsDefault.cardinality() == 0)
 112+ nsDefault = new NamespaceFilter("0"); // default to main namespace
 113+ WikiQueryParser parser = new WikiQueryParser("contents",nsDefault,analyzer,WikiQueryParser.NamespacePolicy.IGNORE);
112114 HashSet<Integer> fields = parser.getFieldNamespaces(searchterm);
113115 NamespaceFilterWrapper nsfw = null;
114116 Query q = null;
@@ -122,8 +124,6 @@
123125 }
124126 else if(fields.size()==0 && nsDefault!=null && nsDefault.cardinality()==1)
125127 nsfw = new NamespaceFilterWrapper(nsDefault);
126 - else if(fields.size()==0) // default: search main namespace
127 - nsfw = new NamespaceFilterWrapper(new NamespaceFilter("0"));
128128
129129 try {
130130 if(nsfw == null){
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/search/Warmup.java
@@ -28,7 +28,7 @@
2929 protected static Hashtable<String,Terms> langTerms = new Hashtable<String,Terms>();
3030
3131 /** Runs some typical queries on a local index searcher to preload caches, pages into memory, etc .. */
32 - public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid){
 32+ public static void warmupIndexSearcher(IndexSearcherMul is, IndexId iid, boolean useDelay){
3333 log.info("Warming up index "+iid+" ...");
3434 long start = System.currentTimeMillis();
3535
@@ -50,15 +50,15 @@
5151 return;
5252 }
5353 makeNamespaceFilters(is,iid);
54 - warmupSearchTerms(is,iid,count);
 54+ warmupSearchTerms(is,iid,count,useDelay);
5555 long delta = System.currentTimeMillis() - start;
5656 log.info("Warmed up "+iid+" in "+delta+" ms");
5757 }
5858 }
5959
6060 /** Warmup index using some number of simple searches */
61 - protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count) {
62 - WikiQueryParser parser = new WikiQueryParser("contents","main",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 61+ protected static void warmupSearchTerms(IndexSearcherMul is, IndexId iid, int count, boolean useDelay) {
 62+ WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
6363 Terms terms = getTermsForLang(global.getLanguage(iid.getDBname()));
6464
6565 try{
@@ -67,11 +67,18 @@
6868 Hits hits = is.search(q);
6969 for(int j =0; j<20 && j<hits.length(); j++)
7070 hits.doc(j); // retrieve some documents
 71+ if(useDelay){
 72+ if(i<1000)
 73+ Thread.sleep(100);
 74+ else
 75+ Thread.sleep(50);
 76+ }
7177 }
7278 } catch (IOException e) {
7379 log.error("Error warming up local IndexSearcherMul for "+iid);
7480 } catch (ParseException e) {
7581 log.error("Error parsing query in warmup of IndexSearcherMul for "+iid);
 82+ } catch (InterruptedException e) {
7683 }
7784 }
7885
@@ -101,7 +108,7 @@
102109 /** Just run one complex query and rebuild the main namespace filter */
103110 public static void simpleWarmup(IndexSearcherMul is, IndexId iid){
104111 try{
105 - WikiQueryParser parser = new WikiQueryParser("contents","main",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
 112+ WikiQueryParser parser = new WikiQueryParser("contents","0",Analyzers.getSearcherAnalyzer(iid),WikiQueryParser.NamespacePolicy.IGNORE);
106113 Query q = parser.parseTwoPass("a OR very OR long OR title OR involving OR both OR wikipedia OR and OR pokemons",WikiQueryParser.NamespacePolicy.IGNORE);
107114 is.search(q,new NamespaceFilterWrapper(new NamespaceFilter("0")));
108115 } catch (IOException e) {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/IndexThread.java
@@ -222,7 +222,7 @@
223223 }
224224 }
225225
226 - protected void deleteDirRecursive(File file){
 226+ protected static void deleteDirRecursive(File file){
227227 if(!file.exists())
228228 return;
229229 else if(file.isDirectory()){
@@ -241,12 +241,9 @@
242242 *
243243 */
244244 protected void makeSnapshot() {
245 - final String sep = Configuration.PATH_SEP;
246245 HashSet<IndexId> indexes = WikiIndexModifier.closeAllModifiers();
247246 IndexRegistry registry = IndexRegistry.getInstance();
248247
249 - DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
250 - String timestamp = df.format(new Date(System.currentTimeMillis()));
251248 log.debug("Making snapshots...");
252249 // check filesystem timestamps (for those for which we are unsure if they are updated)
253250 for( IndexId iid : global.getMyIndex()){
@@ -264,37 +261,44 @@
265262 }
266263 }
267264 for( IndexId iid : indexes ){
268 - log.info("Making snapshot for "+iid);
269 - String index = iid.getIndexPath();
270 - String snapshotdir = iid.getSnapshotPath();
271 - String snapshot = snapshotdir+sep+timestamp;
272 - // cleanup the snapshot dir for this iid
273 - File spd = new File(snapshotdir);
274 - if(spd.exists() && spd.isDirectory()){
275 - File[] files = spd.listFiles();
276 - for(File f: files)
277 - deleteDirRecursive(f);
278 - }
279 - new File(snapshot).mkdirs();
280 - File ind =new File(index);
281 - for(File f: ind.listFiles()){
282 - // use a cp -lr command for each file in the index
283 - String command = "/bin/cp -lr "+index+sep+f.getName()+" "+snapshot+sep+f.getName();
284 - Process copy;
285 - try {
286 - log.debug("Running shell command: "+command);
287 - copy = Runtime.getRuntime().exec(command);
288 - copy.waitFor();
289 - } catch (Exception e) {
290 - log.error("Error making snapshot "+snapshot+": "+e.getMessage());
291 - continue;
292 - }
293 - }
 265+ makeIndexSnapshot(iid,iid.getIndexPath());
294266 registry.refreshSnapshots(iid);
295 - log.info("Made snapshot "+snapshot);
296267 }
297268 }
298269
 270+ public static void makeIndexSnapshot(IndexId iid, String indexPath){
 271+ final String sep = Configuration.PATH_SEP;
 272+ DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
 273+ String timestamp = df.format(new Date(System.currentTimeMillis()));
 274+
 275+ log.info("Making snapshot for "+iid);
 276+ String snapshotdir = iid.getSnapshotPath();
 277+ String snapshot = snapshotdir+sep+timestamp;
 278+ // cleanup the snapshot dir for this iid
 279+ File spd = new File(snapshotdir);
 280+ if(spd.exists() && spd.isDirectory()){
 281+ File[] files = spd.listFiles();
 282+ for(File f: files)
 283+ deleteDirRecursive(f);
 284+ }
 285+ new File(snapshot).mkdirs();
 286+ File ind =new File(indexPath);
 287+ for(File f: ind.listFiles()){
 288+ // use a cp -lr command for each file in the index
 289+ String command = "/bin/cp -lr "+indexPath+sep+f.getName()+" "+snapshot+sep+f.getName();
 290+ Process copy;
 291+ try {
 292+ log.debug("Running shell command: "+command);
 293+ copy = Runtime.getRuntime().exec(command);
 294+ copy.waitFor();
 295+ } catch (Exception e) {
 296+ log.error("Error making snapshot "+snapshot+": "+e.getMessage());
 297+ continue;
 298+ }
 299+ }
 300+ log.info("Made snapshot "+snapshot);
 301+ }
 302+
299303 /**
300304 * @return if there are queued updates
301305 */
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/analyzers/WikiQueryParser.java
@@ -2,8 +2,10 @@
33
44 import java.io.IOException;
55 import java.util.ArrayList;
 6+import java.util.BitSet;
67 import java.util.HashMap;
78 import java.util.HashSet;
 9+import java.util.Map.Entry;
810
911 import org.apache.lucene.analysis.Analyzer;
1012 import org.apache.lucene.analysis.Token;
@@ -15,6 +17,8 @@
1618 import org.apache.lucene.search.PhraseQuery;
1719 import org.apache.lucene.search.Query;
1820 import org.apache.lucene.search.TermQuery;
 21+import org.apache.lucene.search.WildcardQuery;
 22+import org.wikimedia.lsearch.search.NamespaceFilter;
1923 import org.wikimedia.lsearch.util.UnicodeDecomposer;
2024
2125 /**
@@ -74,7 +78,8 @@
7579 */
7680 public enum NamespacePolicy { LEAVE, IGNORE, REWRITE };
7781 protected HashMap<String,Integer> namespaceMapping;
78 - private String defaultNamespace;
 82+ private String defaultNamespaceName;
 83+ private Query namespaceRewriteQuery;
7984 private NamespacePolicy namespacePolicy;
8085
8186 /** default value for boolean queries */
@@ -110,7 +115,7 @@
111116 * @param analyzer
112117 */
113118 public WikiQueryParser(String field, Analyzer analyzer){
114 - this(field,null,analyzer,NamespacePolicy.LEAVE);
 119+ this(field,(NamespaceFilter)null,analyzer,NamespacePolicy.LEAVE);
115120 }
116121
117122 /**
@@ -122,16 +127,53 @@
123128 * @param nsPolicy
124129 */
125130 public WikiQueryParser(String field, String namespace, Analyzer analyzer, NamespacePolicy nsPolicy){
126 - defaultField = field;
127 - defaultNamespace = namespace;
 131+ this(field,new NamespaceFilter(namespace),analyzer,nsPolicy);
 132+ }
 133+
 134+ public WikiQueryParser(String field, NamespaceFilter nsfilter, Analyzer analyzer, NamespacePolicy nsPolicy){
 135+ defaultField = field;
128136 this.analyzer = analyzer;
129137 decomposer = UnicodeDecomposer.getInstance();
130138 tokens = new ArrayList<Token>();
131139 this.namespacePolicy = nsPolicy;
132140 disableTitleAliases = true;
133141 initNamespaces();
 142+ if(nsfilter != null){
 143+ namespaceRewriteQuery = generateRewrite(nsfilter);
 144+ defaultNamespaceName = null;
 145+ if(nsfilter.cardinality()==1){
 146+ Integer in = new Integer(nsfilter.getNamespace());
 147+ // if it has only one namespace, try to get the name of default namespace
 148+ for(Entry<String,Integer> e : namespaceMapping.entrySet()){
 149+ if(in.equals(e.getValue())){
 150+ defaultNamespaceName = e.getKey();
 151+ }
 152+ }
 153+ }
 154+ }
 155+ else{
 156+ namespaceRewriteQuery = null;
 157+ defaultNamespaceName = null;
 158+ }
134159 }
135160
 161+ /** Generate a rewrite query for a collection of namespaces */
 162+ protected Query generateRewrite(NamespaceFilter nsfilter){
 163+ if(nsfilter.cardinality() == 0)
 164+ return null;
 165+ else if(nsfilter.cardinality() == 1)
 166+ return new TermQuery(new Term("namespace",Integer.toString(nsfilter.getNamespace())));
 167+
 168+ BooleanQuery bq = new BooleanQuery();
 169+ BitSet bs = nsfilter.getIncluded();
 170+ // iterate over set bits
 171+ for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){
 172+ bq.add(new TermQuery(new Term("namespace",Integer.toString(i))),
 173+ BooleanClause.Occur.SHOULD);
 174+ }
 175+ return bq;
 176+ }
 177+
136178 /**
137179 * Get a hashset of namespace numbers for fields that are
138180 * valid namespace keys.
@@ -228,8 +270,8 @@
229271 if(length == 0 && ch == ' ')
230272 continue; // ignore whitespaces
231273
232 - // pluses and minuses, underscores can be within words
233 - if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_'){
 274+ // pluses and minuses, underscores can be within words, *,? are for wildcard queries
 275+ if(Character.isLetterOrDigit(ch) || ch=='-' || ch=='+' || ch=='_' || ch=='*' || ch=='?'){
234276 // unicode normalization -> delete accents
235277 decomp = decomposer.decompose(ch);
236278 if(decomp == null)
@@ -353,7 +395,7 @@
354396 }
355397
356398 private final boolean needsRewrite(){
357 - return defaultNamespace != null && namespacePolicy == NamespacePolicy.REWRITE;
 399+ return namespaceRewriteQuery != null && namespacePolicy == NamespacePolicy.REWRITE;
358400 }
359401
360402 /** Parses a clause: (in regexp notation)
@@ -382,7 +424,7 @@
383425
384426 // assume default namespace value on rewrite
385427 if(!returnOnFieldDef && field == null && needsRewrite()){
386 - fieldQuery = getNamespaceQuery(defaultNamespace);
 428+ fieldQuery = namespaceRewriteQuery;
387429 }
388430
389431 mainloop: for( ; cur < queryLength; cur++ ){
@@ -409,7 +451,7 @@
410452 if(field == null || definedExplicitField){
411453 // set field name
412454 field = new String(buffer,0,length);
413 - if((defaultNamespace!=null && field.equals(defaultNamespace)) || field.equals(defaultField)){
 455+ if((defaultNamespaceName!=null && field.equals(defaultNamespaceName)) || field.equals(defaultField)){
414456 field = null;
415457 break; // repeated definition of field, ignore
416458 }
@@ -433,7 +475,7 @@
434476 case WORD:
435477 if(fieldQuery != null){
436478 backToken();
437 - String myfield = (topFieldName != null)? topFieldName : (field !=null)? field : (defaultNamespace!=null)? defaultNamespace : defaultField;
 479+ String myfield = (topFieldName != null)? topFieldName : (field !=null)? field : (defaultNamespaceName!=null)? defaultNamespaceName : defaultField;
438480 fieldsubquery = parseClause(level+1,true,myfield);
439481 } else{
440482 analyzeBuffer();
@@ -561,6 +603,14 @@
562604 return new TermQuery(makeTerm());
563605 }
564606
 607+ // check for wildcard searches, they are also not analyzed/stemmed
 608+ // wildcard signs are allowed only at the end of the word, minimum one letter word
 609+ if(length>1 && Character.isLetter(buffer[0]) && (buffer[length-1]=='*' || buffer[length-1]=='?')){
 610+ Query ret = new WildcardQuery(makeTerm());
 611+ ret.setBoost(defaultBoost);
 612+ return ret;
 613+ }
 614+
565615 if(toplevelOccur == BooleanClause.Occur.MUST_NOT)
566616 aliasOccur = null; // do not add aliases
567617 else
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -44,6 +44,7 @@
4545 log.error("OAI authentication error. Username/password pair not specified in configuration file.");
4646 return null;
4747 }
 48+ log.info("Authenticating ... ");
4849 return new PasswordAuthentication(username,password.toCharArray());
4950 }
5051 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -22,7 +22,6 @@
2323 public class OAIHarvester {
2424 static Logger log = Logger.getLogger(OAIHarvester.class);
2525 protected String urlbase;
26 - protected Authenticator auth;
2726 protected OAIParser parser;
2827 protected IndexUpdatesCollector collector;
2928 protected IndexId iid;
@@ -30,8 +29,8 @@
3130
3231 public OAIHarvester(IndexId iid, String url, Authenticator auth){
3332 this.urlbase = url;
34 - this.auth = auth;
3533 this.iid = iid;
 34+ Authenticator.setDefault(auth);
3635 }
3736
3837 /** Invoke ListRecords from a certain timestamp */
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Collector.java
@@ -16,16 +16,18 @@
1717 }
1818
1919 protected ArrayList<ReportSet> reports = new ArrayList<ReportSet>();
20 - protected long startTime;
 20+ protected long startTime, lastTime;
2121 protected int reportInc; // after how many reports to print out results
2222 protected int curInc; // current increment
2323 protected int total;
 24+ protected int threads;
2425
25 - Collector(int reportInc, int total){
26 - startTime = System.currentTimeMillis();
 26+ Collector(int reportInc, int total, int threads){
 27+ lastTime = startTime = System.currentTimeMillis();
2728 this.reportInc = reportInc;
2829 curInc = 0;
2930 this.total = total;
 31+ this.threads = threads;
3032 }
3133
3234 synchronized public void add(int results, long time){
@@ -44,6 +46,11 @@
4547 results += rs.results;
4648 time += rs.time;
4749 }
 50+ long time1k = 0;
 51+ if(reports.size()>=1000){
 52+ for(int i=reports.size()-1000;i<reports.size();i++)
 53+ time1k += reports.get(i).time;
 54+ }
4855 long now = System.currentTimeMillis();
4956 int sec = (int) ((now-startTime)/1000);
5057 int min = 0;
@@ -52,8 +59,9 @@
5360 sec = sec%60;
5461 }
5562 double pers = (double)(now-startTime)/reports.size();
56 - //double avgtime = (double)time/reports.size();
57 - System.out.format("[%d:%02d %d/%d] %2.1fms : %d results / search\n", min, sec, reports.size(), total, pers, results/reports.size());
 63+ double nowpers = (double)(now-lastTime)/reportInc;
 64+ lastTime = now;
 65+ System.out.format("[%d:%02d %d/%d] %2.1fms : %d results / search (now: %2.1fms, last 1k: %2.1fms)\n", min, sec, reports.size(), total, pers, results/reports.size(), nowpers, time1k/1000.0/threads);
5866 System.out.flush();
5967 }
6068 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/benchmark/Benchmark.java
@@ -18,6 +18,7 @@
1919 protected Terms terms;
2020 protected int words;
2121 protected String namespace;
 22+ protected String namespaceFilter;
2223
2324 protected int thread; // current thread
2425
@@ -35,12 +36,12 @@
3637 protected static Object sharedLock = new Object();
3738
3839 /** Use this to construct the main thread */
39 - public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace) {
40 - this(host,port,database,verb,terms,words,namespace,0,0);
 40+ public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, String namespaceFilter) {
 41+ this(host,port,database,verb,terms,words,namespace,namespaceFilter,0,0);
4142 }
4243
4344 /** Use this to construct a benchmark thread */
44 - public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, int runs, int thread) {
 45+ public Benchmark(String host, int port, String database, String verb, Terms terms, int words, String namespace, String namespaceFilter, int runs, int thread) {
4546 this.host = host;
4647 this.port = port;
4748 this.database = database;
@@ -50,6 +51,7 @@
5152 this.thread = thread;
5253 this.words = words;
5354 this.namespace = namespace;
 55+ this.namespaceFilter = namespaceFilter;
5456 }
5557
5658 /** Start benchmarking on main thread */
@@ -61,10 +63,10 @@
6264 activeThreads = threads;
6365 startTime = System.currentTimeMillis();
6466
65 - collector = new Collector(100,threads*runs);
 67+ collector = new Collector(100,threads*runs,threads);
6668
6769 for(int i=0;i<threads;i++)
68 - new Benchmark(host,port,database,verb,terms,words,namespace,runs,i).start();
 70+ new Benchmark(host,port,database,verb,terms,words,namespace,namespaceFilter,runs,i).start();
6971
7072 // wait until all thread finish
7173 while(activeThreads != 0){
@@ -106,11 +108,17 @@
107109 String query = "";
108110 for(int i=0;i<words;i++){
109111 if(!query.equals(""))
110 - query += " ";
 112+ query += " OR ";
111113 query += terms.next();
112114 }
113 - query = namespace+":"+URLEncoder.encode(query).replaceAll("\\+","%20");
114 - String urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20";
 115+ String urlString;
 116+ if(namespace.equals("")){
 117+ query = URLEncoder.encode(query).replaceAll("\\+","%20");
 118+ urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20&namespaces="+namespaceFilter;
 119+ } else{
 120+ query = namespace+":"+URLEncoder.encode(query).replaceAll("\\+","%20");
 121+ urlString = "http://"+host+":"+port+"/"+verb+"/"+database+"/"+query+"?limit=20";
 122+ }
115123 try {
116124 URL url;
117125 url = new URL(urlString);
@@ -160,7 +168,8 @@
161169 int port = 8123;
162170 String database = "wikilucene";
163171 String verb = "search";
164 - String namespace = "all";
 172+ String namespace = "main";
 173+ String namespaceFilter= "0";
165174 int runs = 5000;
166175 int threads = 10;
167176 int words = 2;
@@ -180,8 +189,11 @@
181190 runs = Integer.parseInt(args[++i]);
182191 } else if (args[i].equals("-v")) {
183192 database = args[++i];
184 - } else if (args[i].equals("-ns")) {
 193+ } else if (args[i].equals("-n") || args[i].equals("-ns")) {
185194 namespace = args[++i];
 195+ } else if (args[i].equals("-f") ) {
 196+ namespaceFilter = args[++i];
 197+ namespace ="";
186198 } else if (args[i].equals("-w")) {
187199 words = Integer.parseInt(args[++i]);
188200 } else if (args[i].equals("--help")) {
@@ -190,15 +202,19 @@
191203 " -p port (default: "+port+")\n"+
192204 " -d database (default: "+database+")\n"+
193205 " -t threads (default: "+threads+")\n"+
194 - " -n count (default: "+runs+")\n"+
 206+ " -c count (default: "+runs+")\n"+
195207 " -w number of words in query (default: "+words+")\n"+
196208 " -v verb (default: "+verb+")\n"+
197 - " -ns namespace (default: "+namespace+")\n");
 209+ " -n namespace (default: "+namespace+")\n"+
 210+ " -f namespace filter (default: "+namespaceFilter+")\n");
198211 return;
 212+ } else{
 213+ System.out.println("Unrecognized switch: "+args[i]);
 214+ return;
199215 }
200216 }
201217 System.out.println("Running benchmark on "+host+":"+port+" with "+threads+" threads each "+runs+" runs");
202 - Benchmark bench = new Benchmark(host, port, database, verb, terms, words, namespace);
 218+ Benchmark bench = new Benchmark(host, port, database, verb, terms, words, namespace, namespaceFilter);
203219 bench.startBenchmark(threads,runs);
204220 bench.printReport();
205221 }