r23065 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r23064‎ | r23065 | r23066 >
Date:12:10, 18 June 2007
Author:rainman
Status:old
Tags:
Comment:
Minor bugfixes, more flexible configuration.
Modified paths:
  • /trunk/lucene-search-2.0/lsearch-global.conf (modified) (history)
  • /trunk/lucene-search-2.0/lsearch.conf (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/PHPParser.java (modified) (history)
  • /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
@@ -38,6 +38,14 @@
3939 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3
4040 192.168.0.10 : srwiki njawiki
4141
 42+# OAI repository info, for incremental updater
 43+# dbSuffix : base url (to index.php)
 44+# ?title=Special:OAIRepository is appended to url
 45+[OAI]
 46+wiktionary : http://$lang.wiktionary.org/w/index.php
 47+frtest : http://localhost/wiki-lucene/phase3/index.php
 48+<default> : http://$lang.wikipedia.org/w/index.php
 49+
4250 # Path where indexes are on hosts, after default value put hosts where
4351 # the location differs
4452 [Index-Path]
@@ -51,6 +59,10 @@
5260 # dbnames that end with the suffix will use additional keywords scores
5361 KeywordScoring.suffix=wiki rutest
5462
 63+# wmf-style init file, attempt to read OAI and lang info from it
 64+# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html
 65+WMF.InitialiseSettings=file:///home/rainman/Desktop/InitialiseSettings.php
 66+
5567 # databases can be written as {file}, where file contains list of dbs
5668
5769 # Put here your custom namespace prefixes
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
@@ -5,7 +5,6 @@
66 package org.wikimedia.lsearch.test;
77
88 import java.io.IOException;
9 -import java.net.Inet4Address;
109 import java.net.InetAddress;
1110 import java.net.MalformedURLException;
1211 import java.net.URL;
@@ -63,7 +62,11 @@
6463 return globalProperties;
6564 }
6665
 66+ public Hashtable<String,String> getOaiRepo(){
 67+ return oaiRepo;
 68+ }
6769
 70+
6871 }
6972
7073 public static GlobalConfigurationTest.TestGC testgc = null;
@@ -86,7 +89,7 @@
8790 String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test";
8891 try {
8992 URL url = new URL(testurl);
90 - testgc.readFromURL(url,"/usr/local/var/mwsearch","");
 93+ testgc.readFromURL(url,"/usr/local/var/mwsearch");
9194
9295 // database
9396 Hashtable database = testgc.getDatabase();
@@ -173,8 +176,18 @@
174177 assertTrue(testgc.useKeywordScoring("srwiki"));
175178 assertTrue(testgc.useKeywordScoring("rutest"));
176179
 180+ // test oai repo stuff
 181+ Hashtable<String,String> oairepo = testgc.getOaiRepo();
 182+ assertEquals("http://$lang.wiktionary.org/w/index.php",oairepo.get("wiktionary"));
 183+ assertEquals("http://localhost/wiki-lucene/phase3/index.php",oairepo.get("frtest"));
 184+ assertEquals("http://$lang.wikipedia.org/w/index.php",oairepo.get("<default>"));
177185
 186+ assertEquals("http://sr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("srwiki"));
 187+ assertEquals("http://fr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest"));
178188
 189+ // InitialiseSettings test
 190+ assertEquals("sr",testgc.getLanguage("rswikimedia"));
 191+ assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
179192
180193 } catch (MalformedURLException e) {
181194 e.printStackTrace();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -18,6 +18,7 @@
1919 import org.wikimedia.lsearch.index.IndexThread;
2020 import org.wikimedia.lsearch.ranks.Links;
2121 import org.wikimedia.lsearch.ranks.RankBuilder;
 22+import org.wikimedia.lsearch.storage.Storage;
2223 import org.wikimedia.lsearch.util.Localization;
2324 import org.wikimedia.lsearch.util.UnicodeDecomposer;
2425
@@ -39,8 +40,8 @@
4041 String dbname = null;
4142 Boolean optimize = null;
4243 Integer mergeFactor = null, maxBufDocs = null;
43 - boolean newIndex = false, makeSnapshot = false;
44 - boolean snapshotDb = false;
 44+ boolean newIndex = true, makeSnapshot = false;
 45+ boolean snapshotDb = false; boolean updateReferences=false;
4546
4647 System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
4748
@@ -48,10 +49,11 @@
4950 log = Logger.getLogger(Importer.class);
5051
5152 if(args.length < 2){
52 - System.out.println("Syntax: java Importer [-n] [-s] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
 53+ System.out.println("Syntax: java Importer [-n] [-s] [-r] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
5354 System.out.println("Options: ");
54 - System.out.println(" -n - create a new index (erase the old one if exists)");
 55+ System.out.println(" -a - don't create new index, append to old");
5556 System.out.println(" -s - make index snapshot when finished");
 57+ System.out.println(" -r - update references info on storage backend");
5658 System.out.println(" -l limit_num - add at most limit_num articles");
5759 System.out.println(" -o optimize - true/false overrides optimization param from global settings");
5860 System.out.println(" -m mergeFactor - overrides param from global settings");
@@ -68,8 +70,10 @@
6971 mergeFactor = Integer.parseInt(args[++i]);
7072 else if(args[i].equals("-b"))
7173 maxBufDocs = Integer.parseInt(args[++i]);
72 - else if(args[i].equals("-n"))
73 - newIndex = true;
 74+ else if(args[i].equals("-a"))
 75+ newIndex = false;
 76+ else if(args[i].equals("-r"))
 77+ updateReferences = true;
7478 else if(args[i].equals("-s"))
7579 makeSnapshot = true;
7680 else if(args[i].equals("--snapshot")){
@@ -99,6 +103,14 @@
100104
101105 // regenerate link and redirect information
102106 Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS);
 107+
 108+ if(updateReferences){
 109+ try {
 110+ Storage.getInstance().storePageReferences(links.getAll(),dbname);
 111+ } catch (IOException e) {
 112+ log.error("Failed to update references info: "+e.getMessage());
 113+ }
 114+ }
103115 links.generateRedirectLists();
104116
105117 log.info("Third pass, indexing articles...");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java
@@ -38,6 +38,10 @@
3939 protected String username;
4040 /** mysql password */
4141 protected String password;
 42+ /** administrator's username */
 43+ protected String adminUsername;
 44+ /** administrator's password */
 45+ protected String adminPassword;
4246 /** If we should separate data in many dbs */
4347 protected boolean separate;
4448 /** db where to put everything, if we are not using one db per dbname */
@@ -82,6 +86,9 @@
8387 username = config.getString("Storage","username","root");
8488 password = config.getString("Storage","password","");
8589
 90+ adminUsername = config.getString("Storage","adminuser",username);
 91+ adminPassword = config.getString("Storage","adminpass",password);
 92+
8693 // figure out db configuration
8794 separate = config.getBoolean("Storage","useSeparateDBs");
8895 if(!separate){
@@ -94,16 +101,21 @@
95102
96103 /** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */
97104 protected Connection getReadConnection(String dbname) throws IOException{
98 - return openConnection(dbname,false);
 105+ return openConnection(dbname,false,false);
99106 }
100107
101108 /** Get connection for writing stuff, i.e. on the master */
102109 protected Connection getWriteConnection(String dbname) throws IOException{
103 - return openConnection(dbname,true);
 110+ return openConnection(dbname,true,false);
104111 }
105112
 113+ /** Get administrators connection for creating tables/db, etc.. (on master) */
 114+ protected Connection getAdminConnection(String dbname) throws IOException {
 115+ return openConnection(dbname,true,true);
 116+ }
 117+
106118 /** Open connection on the master, or load-balanced on one of the slaves */
107 - protected Connection openConnection(String dbname, boolean onMaster) throws IOException {
 119+ protected Connection openConnection(String dbname, boolean onMaster, boolean admin) throws IOException {
108120 String host=null;
109121 if(onMaster || slaves == null)
110122 host = master;
@@ -121,8 +133,12 @@
122134 String dburl = "jdbc:mysql://"+host+":3306/";
123135 if(!separate && defaultDB!=null)
124136 dburl += defaultDB;
 137+ dburl += "?useUnicode=yes&characterEncoding=UTF-8";
125138 try {
126 - return DriverManager.getConnection(dburl, username, password);
 139+ if(admin)
 140+ return DriverManager.getConnection(dburl, adminUsername, adminPassword);
 141+ else
 142+ return DriverManager.getConnection(dburl, username, password);
127143 } catch (SQLException e) {
128144 log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage());
129145 throw new IOException("Cannot establish connection to mysql database.");
@@ -134,7 +150,7 @@
135151 }
136152
137153 public String escape(String str){
138 - return str.replace("'","\\'");
 154+ return str.replace("\\","\\\\").replace("'","\\'");
139155 }
140156
141157 public String getTableName(String name, String dbname){
@@ -147,7 +163,9 @@
148164 // inherit javadoc
149165 public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException {
150166 String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE ";
151 - if(titles.size()==1){
 167+ if(titles == null || titles.size()==0)
 168+ return new ArrayList<CompactArticleLinks>();
 169+ else if(titles.size()==1){
152170 sql += "rf_key="+quote(escape(titles.iterator().next().getKey()));
153171 } else{
154172 StringBuilder sb = new StringBuilder(sql);
@@ -155,7 +173,7 @@
156174 Iterator<Title> it = titles.iterator();
157175 while(it.hasNext()){
158176 sb.append('\'');
159 - sb.append(it.next().getKey());
 177+ sb.append(escape(it.next().getKey()));
160178 sb.append('\'');
161179 if(it.hasNext())
162180 sb.append(',');
@@ -256,9 +274,11 @@
257275 tdef = def;
258276 // create
259277 try {
 278+ Connection admin = getAdminConnection(dbname);
260279 log.info("Creating table "+name+" on "+dbname);
261 - Statement stmt = conn.createStatement();
262 - stmt.executeUpdate(tdef);
 280+ Statement stmt = admin.createStatement();
 281+ stmt.executeUpdate(tdef);
 282+ admin.close();
263283 } catch (SQLException e) {
264284 log.error("Cannot create table "+table+" : "+e.getMessage());
265285 throw new IOException(e.getMessage());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java
@@ -105,7 +105,6 @@
106106 GlobalConfiguration global = GlobalConfiguration.getInstance();
107107 String globalurl = getString("MWConfig","global");
108108 String indexpath = getString("Indexes","path");
109 - String oairepo = getString("OAI","repo");
110109 if(globalurl==null){
111110 System.out.println("FATAL: Need to define global configuration url in local config file.");
112111 System.exit(1);
@@ -114,7 +113,7 @@
115114 System.exit(1);
116115 }
117116 try {
118 - global.readFromURL(new URL(globalurl),indexpath,oairepo);
 117+ global.readFromURL(new URL(globalurl),indexpath);
119118 } catch (MalformedURLException e) {
120119 System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting...");
121120 System.exit(1);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
@@ -17,6 +17,7 @@
1818 import java.net.UnknownHostException;
1919 import java.text.MessageFormat;
2020 import java.util.ArrayList;
 21+import java.util.Collection;
2122 import java.util.Enumeration;
2223 import java.util.HashSet;
2324 import java.util.Hashtable;
@@ -27,6 +28,7 @@
2829 import java.util.regex.Pattern;
2930
3031 import org.wikimedia.lsearch.search.NamespaceFilter;
 32+import org.wikimedia.lsearch.util.PHPParser;
3133
3234 /**
3335 * Read and parse the global configuration file, is also used
@@ -58,13 +60,17 @@
5961 protected Hashtable<String,NamespaceFilter> namespacePrefix;
6062 /** keyword for all namespaces (i.e. no filtering) */
6163 protected String namespacePrefixAll;
 64+ /** suffix -> OAI Repo url pattern */
 65+ protected Hashtable<String,String> oaiRepo;
 66+ /** wgLanguageCode from InitialiseSettings, suffix -> lang code */
 67+ protected Hashtable<String,String> wgLanguageCode = null;
 68+ /** wgServer, suffix -> server (default server is "default")*/
 69+ protected Hashtable<String,String> wgServer = null;
6270
6371 /** info about this host */
6472 protected static InetAddress myHost;
6573 protected static String hostAddr, hostName;
6674
67 - /** OAI repo pattern from lsearch2.conf */
68 - protected String OAIRepoPattern;
6975 /** Database suffix of dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */
7076 protected String[] databaseSuffixes = null;
7177 /** Databases ending in suffix will use additional keyword scores */
@@ -204,13 +210,13 @@
205211 * @param url
206212 * @throws IOException
207213 */
208 - public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{
 214+ public void readFromURL(URL url, String indexpath) throws IOException{
209215 BufferedReader in;
210216 try {
211217 in = new BufferedReader(
212218 new InputStreamReader(
213219 url.openStream()));
214 - read(in,indexpath,oaiRepo);
 220+ read(in,indexpath);
215221 } catch (IOException e) {
216222 System.out.println("I/O Error in opening or reading global config at url "+url);
217223 throw e;
@@ -231,6 +237,7 @@
232238 indexRsyncPath = new Hashtable<String, String>();
233239 namespacePrefix = new Hashtable<String,NamespaceFilter>();
234240 namespacePrefixAll = "all"; // default
 241+ oaiRepo = new Hashtable<String,String>();
235242 }
236243
237244 protected String[] getArrayProperty(String name){
@@ -247,7 +254,7 @@
248255 * @param in opened reader
249256 * @throws IOException
250257 */
251 - protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{
 258+ protected void read(BufferedReader in, String indexpath) throws IOException{
252259 String line="";
253260 int section = -1;
254261 Pattern roleRegexp = Pattern.compile("\\((.*?)\\)");
@@ -258,12 +265,12 @@
259266 final int SEARCH = 2;
260267 final int INDEXPATH = 3;
261268 final int NAMESPACE_PREFIX = 4;
 269+ final int OAI = 5;
262270
263271 int searchGroupNum = -1;
264272
265273 init();
266274 this.indexPath = indexpath;
267 - this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo;
268275
269276 while((line = in.readLine()) != null){
270277 lineNum ++;
@@ -293,6 +300,10 @@
294301 this.databaseSuffixes = getArrayProperty("Database.suffix");
295302 this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
296303 this.exactCaseSuffix = getArrayProperty("ExactCase.suffix");
 304+ // try reading InitialiseSettings
 305+ String initset = globalProperties.getProperty("WMF.InitialiseSettings");
 306+ if(initset != null)
 307+ initializeWmfSettings(initset);
297308 if(line == null)
298309 break;
299310 // else: line points to beginning of next section
@@ -311,6 +322,8 @@
312323 section = INDEXPATH;
313324 else if(s.equalsIgnoreCase("namespace-prefix"))
314325 section = NAMESPACE_PREFIX;
 326+ else if(s.equalsIgnoreCase("oai"))
 327+ section = OAI;
315328 } else if(section==-1 && !line.trim().equals("")){
316329 System.out.println("Ignoring a line up to first section heading...");
317330 } else if(section == DATABASE){
@@ -355,17 +368,42 @@
356369 namespacePrefixAll = prefix;
357370 else
358371 namespacePrefix.put(prefix,new NamespaceFilter(filter));
 372+ } else if(section == OAI){
 373+ String[] parts = splitBySemicolon(line,lineNum);
 374+ if(parts == null) continue;
 375+ String suffix = parts[0].trim();
 376+ String url = parts[1].trim();
 377+
 378+ oaiRepo.put(suffix,url);
359379 }
360380 }
361381 if( !checkIntegrity() ){
362382 in.close();
363383 System.exit(1);
364384 }
 385+
365386 makeIndexIdPool();
366387 in.close();
367388 }
368389
369 -
 390+ /**
 391+ * A bit hackish: read InitialiseSettings which we know have a certain
 392+ * format to avoid maintaining two copies for config files (one in php
 393+ * other for lsearch in global conf)
 394+ *
 395+ * @param initset
 396+ */
 397+ protected void initializeWmfSettings(String initset) {
 398+ try {
 399+ PHPParser parser = new PHPParser();
 400+ String text = parser.readURL(new URL(initset));
 401+ wgLanguageCode = parser.getLanguages(text);
 402+ wgServer = parser.getServer(text);
 403+ } catch (IOException e) {
 404+ System.out.println("Error: Cannot read InitialiseSettings.php from url "+initset+" : "+e.getMessage());
 405+ }
 406+ }
 407+
370408 /** Get all hosts which search this indexId (dbrole) */
371409 protected HashSet<String> getSearchHosts(String dbrole){
372410 HashSet<String> searchHosts = new HashSet<String>();
@@ -445,7 +483,7 @@
446484 if(rsyncIndexPath == null)
447485 rsyncIndexPath = indexRsyncPath.get("<default>");
448486 }
449 - String oairepo = MessageFormat.format(OAIRepoPattern,new Object[] {dbname,getLanguage(dbname)});
 487+ String oairepo = getOAIRepo(dbname);
450488
451489 IndexId iid = new IndexId(dbrole,
452490 type,
@@ -523,7 +561,7 @@
524562 }
525563
526564 protected String[] splitBySemicolon(String line, int lineNum){
527 - String[] parts = line.split(":");
 565+ String[] parts = line.split(":",2);
528566 if(parts.length!=2){
529567 System.out.println("Error at line "+lineNum+": semicolon missing. Ignoring this line.");
530568 return null;
@@ -779,9 +817,18 @@
780818 /** Get language for a dbname */
781819 public String getLanguage(String dbname) {
782820 // first check explicit language parameter in global settings
783 - Hashtable<String,String> lang = database.get(dbname).get("language");
784 - if(lang!=null)
785 - return lang.get("code");
 821+ Hashtable<String,Hashtable<String,String>> dbparam = database.get(dbname);
 822+ if(dbparam !=null){
 823+ Hashtable<String,String> lang = dbparam.get("language");
 824+ if(lang!=null)
 825+ return lang.get("code");
 826+ }
 827+ // try to get from initialise settings
 828+ if(wgLanguageCode!=null){
 829+ String key = findSuffix(wgLanguageCode.keySet(),dbname);
 830+ if(key != null)
 831+ return wgLanguageCode.get(key);
 832+ }
786833 // try to get languages from suffixes
787834 if(databaseSuffixes != null){
788835 for (String suffix : databaseSuffixes) {
@@ -867,6 +914,51 @@
868915 public boolean exactCaseIndex(String dbname){
869916 return checkSuffix(exactCaseSuffix,dbname);
870917 }
 918+
 919+ /** Find suffix that matches dbname */
 920+ public String findSuffix(Collection<String> suffixes, String dbname){
 921+ for(String suffix : suffixes){
 922+ if(dbname.endsWith(suffix)){
 923+ return suffix;
 924+ }
 925+ }
 926+ return null;
 927+ }
871928
 929+ /** Get OAI-repo url for dbname */
 930+ public String getOAIRepo(String dbname){
 931+ String repo = null;
 932+ // try to get from initialise settings
 933+ if(wgServer != null){
 934+ String key = findSuffix(wgServer.keySet(),dbname);
 935+ if(key == null)
 936+ key = "default";
 937+ repo = wgServer.get(key);
 938+ if(repo != null){
 939+ if(!repo.endsWith("/"))
 940+ repo += "/";
 941+ repo += "w/index.php"; // FIXME: we take this as generic path to index.php
 942+ }
 943+
 944+ }
 945+ // get from global config
 946+ if(repo == null){
 947+ repo = findSuffix(oaiRepo.keySet(),dbname);
 948+ if(repo != null)
 949+ repo = oaiRepo.get(repo);
 950+ if(repo == null && oaiRepo.containsKey("<default>"))
 951+ repo = oaiRepo.get("<default>");
 952+ }
 953+ if(repo == null)
 954+ return ""; // failed, no url
 955+
 956+ // process $lang
 957+ String lang = getLanguage(dbname);
 958+ repo = repo.replace("$lang",lang);
 959+ repo = repo += "?title=Special:OAIRepository";
 960+
 961+ return repo;
 962+ }
 963+
872964
873965 }
\ No newline at end of file
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
@@ -60,24 +60,26 @@
6161
6262 /**
6363 * Syntax:
64 - * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-n] dbname1 dbname2 ...
 64+ * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-e dbname] [-n] dbname1 dbname2 ...
6565 * Options:
6666 * -d - daemonize, otherwise runs only one round of updates to dbs
67 - * -s - sleep time after one cycle (default: 3000ms)
 67+ * -s - sleep time after one cycle (default: 30000ms)
6868 * -t - default timestamp if status file is missing (default: 2001-01-01)
6969 * -f - file to read databases from
70 - * -n - wait for notification of flush after done updating one db (default: false)
 70+ * -n - wait for notification of flush after done updating one db (default: true)
 71+ * -e - exclude dbname from incremental updates (overrides -f)
7172 *
7273 * @param args
7374 */
7475 public static void main(String[] args){
7576 ArrayList<String> dbnames = new ArrayList<String>();
7677 boolean daemon = false;
77 - long sleepTime = 3000;
 78+ long sleepTime = 30000; // 30s
7879 String timestamp = null;
7980 int maxQueueSize = 500;
8081 String dblist = null;
81 - boolean notification = false;
 82+ boolean notification = true;
 83+ HashSet<String> excludeList = new HashSet<String>();
8284 HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
8385 // args
8486 for(int i=0; i<args.length; i++){
@@ -89,6 +91,8 @@
9092 timestamp = args[++i];
9193 else if(args[i].equals("-f"))
9294 dblist = args[++i];
 95+ else if(args[i].equals("-e"))
 96+ excludeList.add(args[++i]);
9397 else if(args[i].equals("-n"))
9498 notification = true;
9599 else if(args[i].equals("--help"))
@@ -115,13 +119,14 @@
116120 }
117121 }
118122 if(dbnames.size() == 0){
119 - System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-f dblist] dbname1 dbname2 ...");
 123+ System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] dbname1 dbname2 ...");
120124 System.out.println("Options:");
121125 System.out.println(" -d - daemonize, otherwise runs only one round of updates to dbs");
122126 System.out.println(" -s - sleep time after one cycle (default: "+sleepTime+"ms)");
123127 System.out.println(" -t - timestamp to start from (if status is missing default: "+timestamp+")");
124128 System.out.println(" -f - dblist file, one dbname per line");
125129 System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
 130+ System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
126131 return;
127132 }
128133 // config
@@ -141,6 +146,8 @@
142147 do{
143148 main_loop: for(String dbname : dbnames){
144149 try{
 150+ if(excludeList.contains(dbname))
 151+ continue;
145152 IndexId iid = IndexId.get(dbname);
146153 OAIHarvester harvester = new OAIHarvester(iid,iid.getOAIRepository(),auth);
147154 Properties status = new Properties();
@@ -170,7 +177,7 @@
171178 fetchReferences(records,dbname);
172179 for(IndexUpdateRecord rec : records){
173180 Article ar = rec.getArticle();
174 - log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
 181+ log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
175182 }
176183 // send to indexer
177184 RMIMessengerClient messenger = new RMIMessengerClient(true);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java
@@ -36,7 +36,7 @@
3737 /** Invoke ListRecords from a certain timestamp */
3838 public ArrayList<IndexUpdateRecord> getRecords(String from){
3939 try{
40 - read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&from="+from));
 40+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from));
4141 return collector.getRecords();
4242 } catch(IOException e){
4343 log.warn("I/O exception listing records: "+e.getMessage());
@@ -57,7 +57,7 @@
5858 /** Invoke ListRecords using the last resumption token */
5959 public ArrayList<IndexUpdateRecord> getMoreRecords(){
6060 try{
61 - read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));
 61+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));
6262 return collector.getRecords();
6363 } catch(IOException e){
6464 log.warn("I/O exception listing records: "+e.getMessage());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/PHPParser.java
@@ -4,6 +4,7 @@
55 import java.io.FileReader;
66 import java.io.IOException;
77 import java.io.InputStreamReader;
 8+import java.net.MalformedURLException;
89 import java.net.URL;
910 import java.util.HashMap;
1011 import java.util.HashSet;
@@ -123,6 +124,42 @@
124125 return ns;
125126 }
126127
 128+ /** Get wgLanguageCode entries (key -> language code) from InitialiseSettings */
 129+ public Hashtable<String,String> getLanguages(String text){
 130+ text = text.replaceAll("(#.*)",""); // strip comments
 131+ Hashtable<String,String> langs = new Hashtable<String,String>();
 132+
 133+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
 134+ Pattern wglang = Pattern.compile("[\"']wgLanguageCode[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
 135+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
 136+ Matcher matcher = wglang.matcher(text);
 137+ while(matcher.find()){
 138+ Matcher me = entry.matcher(matcher.group(1));
 139+ while(me.find()){
 140+ langs.put(me.group(1),me.group(2));
 141+ }
 142+ }
 143+ return langs;
 144+ }
 145+
 146+ /** Get wgServer from InitialiseSettings */
 147+ public Hashtable<String,String> getServer(String text){
 148+ text = text.replaceAll("(#.*)",""); // strip comments
 149+ Hashtable<String,String> servers = new Hashtable<String,String>();
 150+
 151+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
 152+ Pattern wgserv = Pattern.compile("[\"']wgServer[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
 153+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
 154+ Matcher matcher = wgserv.matcher(text);
 155+ while(matcher.find()){
 156+ Matcher me = entry.matcher(matcher.group(1));
 157+ while(me.find()){
 158+ servers.put(me.group(1),me.group(2));
 159+ }
 160+ }
 161+ return servers;
 162+ }
 163+
127164 public String readFile(String path){
128165 char buffer[] = new char[32768];
129166 String text = "";
@@ -141,7 +178,7 @@
142179 return text;
143180 }
144181
145 - public String readURL(URL url){
 182+ public String readURL(URL url) throws IOException{
146183 char buffer[] = new char[32768];
147184 String text = "";
148185 try {
@@ -155,13 +192,15 @@
156193 } while(len > 0);
157194 r.close();
158195 } catch (IOException e) {
159 - // silent
 196+ throw e;
160197 }
161198 return text;
162199 }
163200
164 - /** Test stuff */
165 - public static void main(String args[]){
 201+ /** Test stuff
 202+ * @throws IOException
 203+ * @throws MalformedURLException */
 204+ public static void main(String args[]) throws MalformedURLException, IOException{
166205 String text = "$namespaceNames = array(\n"+
167206 "NS_MEDIA => \"Medija\",\n"+
168207 "NS_SPECIAL => \"Posebno\",\n"+
@@ -178,5 +217,11 @@
179218 System.out.println(p.getFallBack(text2));
180219 System.out.println(p.getRedirectMagic(php));
181220
 221+ System.out.println(p.getLanguages("'wgLanguageCode' => array('default' => '$lang')"));
 222+ String initset = p.readURL(new URL("file:///home/rainman/Desktop/InitialiseSettings.php"));
 223+ System.out.println(p.getLanguages(initset));
 224+ System.out.println(p.getServer(initset));
 225+
 226+
182227 }
183228 }
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -111,8 +111,7 @@
112112 if(langCode == null || langCode.equals(""))
113113 return false;
114114 if(level == 5) // max 5 recursions in depth
115 - return false;
116 - log.info("Reading localization for "+langCode);
 115+ return false;
117116 // make title case
118117 langCode = langCode.substring(0,1).toUpperCase()+langCode.substring(1).toLowerCase();
119118 if(badLocalizations.contains(langCode.toLowerCase())){
@@ -124,6 +123,7 @@
125124 log.warn("Property Localization.url not set in config file. Localization disabled.");
126125 return false;
127126 }
 127+ log.info("Reading localization for "+langCode);
128128 URL url;
129129 try {
130130 url = new URL(MessageFormat.format(loc,langCode));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java
@@ -43,7 +43,7 @@
4444 log = Logger.getLogger(RankBuilder.class);
4545
4646 if(args.length < 2){
47 - System.out.println("Syntax: java Importer <inputfile> <dbname>");
 47+ System.out.println("Syntax: java RankBuilder <inputfile> <dbname>");
4848 return;
4949 }
5050 inputfile = args[0];
Index: trunk/lucene-search-2.0/lsearch-global.conf
@@ -36,6 +36,14 @@
3737 [Index-Path]
3838 <default> : /mwsearch
3939
 40+# OAI repository info, for incremental updater
 41+# dbSuffix : base url (to index.php)
 42+# ?title=Special:OAIRepository is appended to url
 43+[OAI]
 44+wiktionary : http://$lang.wiktionary.org/w/index.php
 45+wikilucene : http://localhost/wiki-lucene/phase3/index.php
 46+<default> : http://$lang.wikipedia.org/w/index.php
 47+
4048 # Global properties
4149 [Properties]
4250 # suffixes to database name, the rest is assumed to be language code
@@ -48,6 +56,10 @@
4957 # note: this will also turn off stemming!
5058 ExactCase.suffix=wiktionary wikilucene
5159
 60+# wmf-style init file, attempt to read wgserver (for oai) and lang info
 61+# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html
 62+# WMF.InitialiseSettings=http://noc.wikimedia.org/conf/InitialiseSettings.php.html
 63+
5264 # Put here your custom namespace prefixes
5365 # Syntax: <prefix_name> : <comma separated list of namespaces>
5466 # <all> is a special keyword meaning all namespaces
Index: trunk/lucene-search-2.0/lsearch.conf
@@ -63,6 +63,9 @@
6464 # Storage.username=root
6565 # Storage.password=
6666
 67+# Storage.adminuser=root
 68+# Storage.adminpass=
 69+
6770 # Values:
6871 # true - each dbname has a separate db of that name
6972 # false - each dbname is a prefix for tables in a default db (set default db below)

Status & tagging log