r23065 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r23064 | r23065 | r23066 >
Date:	12:10, 18 June 2007
Author:	rainman
Status:	old
Tags:
Comment:	Minor bugfixes, more flexible configuration.
Modified paths:	/trunk/lucene-search-2.0/lsearch-global.conf (modified) (history) /trunk/lucene-search-2.0/lsearch.conf (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/PHPParser.java (modified) (history) /trunk/lucene-search-2.0/test-data/mwsearch-global.test (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test
—	—	@@ -38,6 +38,14 @@
39	39	192.168.0.2 : frtest.part1, frtest.part2, frtest.part3
40	40	192.168.0.10 : srwiki njawiki
41	41
	42	+# OAI repository info, for incremental updater
	43	+# dbSuffix : base url (to index.php)
	44	+# ?title=Special:OAIRepository is appended to url
	45	+[OAI]
	46	+wiktionary : http://$lang.wiktionary.org/w/index.php
	47	+frtest : http://localhost/wiki-lucene/phase3/index.php
	48	+<default> : http://$lang.wikipedia.org/w/index.php
	49	+
42	50	# Path where indexes are on hosts, after default value put hosts where
43	51	# the location differs
44	52	[Index-Path]
—	—	@@ -51,6 +59,10 @@
52	60	# dbnames that end with the suffix will use additional keywords scores
53	61	KeywordScoring.suffix=wiki rutest
54	62
	63	+# wmf-style init file, attempt to read OAI and lang info from it
	64	+# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html
	65	+WMF.InitialiseSettings=file:///home/rainman/Desktop/InitialiseSettings.php
	66	+
55	67	# databases can be writen as {file}, where file contains list of dbs
56	68
57	69	# Put here you custom namespace prefixes
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java
—	—	@@ -5,7 +5,6 @@
6	6	package org.wikimedia.lsearch.test;
7	7
8	8	import java.io.IOException;
9		~~-import java.net.Inet4Address;~~
10	9	import java.net.InetAddress;
11	10	import java.net.MalformedURLException;
12	11	import java.net.URL;
—	—	@@ -63,7 +62,11 @@
64	63	return globalProperties;
65	64	}
66	65
	66	+ public Hashtable<String,String> getOaiRepo(){
	67	+ return oaiRepo;
	68	+ }
67	69
	70	+
68	71	}
69	72
70	73	public static GlobalConfigurationTest.TestGC testgc = null;
—	—	@@ -86,7 +89,7 @@
87	90	String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test";
88	91	try {
89	92	URL url = new URL(testurl);
90		~~- testgc.readFromURL(url,"/usr/local/var/mwsearch","");~~
	93	+ testgc.readFromURL(url,"/usr/local/var/mwsearch");
91	94
92	95	// database
93	96	Hashtable database = testgc.getDatabase();
—	—	@@ -173,8 +176,18 @@
174	177	assertTrue(testgc.useKeywordScoring("srwiki"));
175	178	assertTrue(testgc.useKeywordScoring("rutest"));
176	179
	180	+ // test oai repo stuff
	181	+ Hashtable<String,String> oairepo = testgc.getOaiRepo();
	182	+ assertEquals("http://$lang.wiktionary.org/w/index.php",oairepo.get("wiktionary"));
	183	+ assertEquals("http://localhost/wiki-lucene/phase3/index.php",oairepo.get("frtest"));
	184	+ assertEquals("http://$lang.wikipedia.org/w/index.php",oairepo.get("<default>"));
177	185
	186	+ assertEquals("http://sr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("srwiki"));
	187	+ assertEquals("http://fr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest"));
178	188
	189	+ // InitialiseSettings test
	190	+ assertEquals("sr",testgc.getLanguage("rswikimedia"));
	191	+ assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki"));
179	192
180	193	} catch (MalformedURLException e) {
181	194	e.printStackTrace();
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
—	—	@@ -18,6 +18,7 @@
19	19	import org.wikimedia.lsearch.index.IndexThread;
20	20	import org.wikimedia.lsearch.ranks.Links;
21	21	import org.wikimedia.lsearch.ranks.RankBuilder;
	22	+import org.wikimedia.lsearch.storage.Storage;
22	23	import org.wikimedia.lsearch.util.Localization;
23	24	import org.wikimedia.lsearch.util.UnicodeDecomposer;
24	25
—	—	@@ -39,8 +40,8 @@
40	41	String dbname = null;
41	42	Boolean optimize = null;
42	43	Integer mergeFactor = null, maxBufDocs = null;
43		~~- boolean newIndex = false, makeSnapshot = false;~~
44		~~- boolean snapshotDb = false;~~
	44	+ boolean newIndex = true, makeSnapshot = false;
	45	+ boolean snapshotDb = false; boolean updateReferences=false;
45	46
46	47	System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
47	48
—	—	@@ -48,10 +49,11 @@
49	50	log = Logger.getLogger(Importer.class);
50	51
51	52	if(args.length < 2){
52		~~- System.out.println("Syntax: java Importer [-n] [-s] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");~~
	53	+ System.out.println("Syntax: java Importer [-n] [-s] [-r] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>");
53	54	System.out.println("Options: ");
54		~~- System.out.println(" -n - create a new index (erase the old one if exists)");~~
	55	+ System.out.println(" -a - don't create new index, append to old");
55	56	System.out.println(" -s - make index snapshot when finished");
	57	+ System.out.println(" -r - update references info on storage backend");
56	58	System.out.println(" -l limit_num - add at most limit_num articles");
57	59	System.out.println(" -o optimize - true/false overrides optimization param from global settings");
58	60	System.out.println(" -m mergeFactor - overrides param from global settings");
—	—	@@ -68,8 +70,10 @@
69	71	mergeFactor = Integer.parseInt(args[++i]);
70	72	else if(args[i].equals("-b"))
71	73	maxBufDocs = Integer.parseInt(args[++i]);
72		~~- else if(args[i].equals("-n"))~~
73		~~- newIndex = true;~~
	74	+ else if(args[i].equals("-a"))
	75	+ newIndex = false;
	76	+ else if(args[i].equals("-r"))
	77	+ updateReferences = true;
74	78	else if(args[i].equals("-s"))
75	79	makeSnapshot = true;
76	80	else if(args[i].equals("--snapshot")){
—	—	@@ -99,6 +103,14 @@
100	104
101	105	// regenerate link and redirect information
102	106	Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS);
	107	+
	108	+ if(updateReferences){
	109	+ try {
	110	+ Storage.getInstance().storePageReferences(links.getAll(),dbname);
	111	+ } catch (IOException e) {
	112	+ log.error("Failed to update references info: "+e.getMessage());
	113	+ }
	114	+ }
103	115	links.generateRedirectLists();
104	116
105	117	log.info("Third pass, indexing articles...");
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java
—	—	@@ -38,6 +38,10 @@
39	39	protected String username;
40	40	/** mysql password */
41	41	protected String password;
	42	+ /** administrator's username */
	43	+ protected String adminUsername;
	44	+ /** administrator's password */
	45	+ protected String adminPassword;
42	46	/** If we should separate data in many dbs */
43	47	protected boolean separate;
44	48	/** db where to put everything, if we are not using one db per dbname */
—	—	@@ -82,6 +86,9 @@
83	87	username = config.getString("Storage","username","root");
84	88	password = config.getString("Storage","password","");
85	89
	90	+ adminUsername = config.getString("Storage","adminuser",username);
	91	+ adminPassword = config.getString("Storage","adminpass",password);
	92	+
86	93	// figure out db configuration
87	94	separate = config.getBoolean("Storage","useSeparateDBs");
88	95	if(!separate){
—	—	@@ -94,16 +101,21 @@
95	102
96	103	/** Get connection for writing stuff, i.e. on the master */
97	104	protected Connection getReadConnection(String dbname) throws IOException{
98		~~- return openConnection(dbname,false);~~
	105	+ return openConnection(dbname,false,false);
99	106	}
100	107
101	108	/** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */
102	109	protected Connection getWriteConnection(String dbname) throws IOException{
103		~~- return openConnection(dbname,true);~~
	110	+ return openConnection(dbname,true,false);
104	111	}
105	112
	113	+ /** Get administrators connection for creating tables/db, etc.. (on master) */
	114	+ protected Connection getAdminConnection(String dbname) throws IOException {
	115	+ return openConnection(dbname,true,true);
	116	+ }
	117	+
106	118	/** Open connection on the master, or load-balanced on one of the slaves */
107		~~- protected Connection openConnection(String dbname, boolean onMaster) throws IOException {~~
	119	+ protected Connection openConnection(String dbname, boolean onMaster, boolean admin) throws IOException {
108	120	String host=null;
109	121	if(onMaster || slaves == null)
110	122	host = master;
—	—	@@ -121,8 +133,12 @@
122	134	String dburl = "jdbc:mysql://"+host+":3306/";
123	135	if(!separate && defaultDB!=null)
124	136	dburl += defaultDB;
	137	+ dburl += "?useUnicode=yes&characterEncoding=UTF-8";
125	138	try {
126		~~- return DriverManager.getConnection(dburl, username, password);~~
	139	+ if(admin)
	140	+ return DriverManager.getConnection(dburl, adminUsername, adminPassword);
	141	+ else
	142	+ return DriverManager.getConnection(dburl, username, password);
127	143	} catch (SQLException e) {
128	144	log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage());
129	145	throw new IOException("Cannot establish connection to mysql database.");
—	—	@@ -134,7 +150,7 @@
135	151	}
136	152
137	153	public String escape(String str){
138		~~- return str.replace("'","\\'");~~
	154	+ return str.replace("\\","\\\\").replace("'","\\'");
139	155	}
140	156
141	157	public String getTableName(String name, String dbname){
—	—	@@ -147,7 +163,9 @@
148	164	// inherit javadoc
149	165	public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException {
150	166	String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE ";
151		~~- if(titles.size()==1){~~
	167	+ if(titles == null || titles.size()==0)
	168	+ return new ArrayList<CompactArticleLinks>();
	169	+ else if(titles.size()==1){
152	170	sql += "rf_key="+quote(escape(titles.iterator().next().getKey()));
153	171	} else{
154	172	StringBuilder sb = new StringBuilder(sql);
—	—	@@ -155,7 +173,7 @@
156	174	Iterator<Title> it = titles.iterator();
157	175	while(it.hasNext()){
158	176	sb.append('\'');
159		~~- sb.append(it.next().getKey());~~
	177	+ sb.append(escape(it.next().getKey()));
160	178	sb.append('\'');
161	179	if(it.hasNext())
162	180	sb.append(',');
—	—	@@ -256,9 +274,11 @@
257	275	tdef = def;
258	276	// create
259	277	try {
	278	+ Connection admin = getAdminConnection(dbname);
260	279	log.info("Creating table "+name+" on "+dbname);
261		~~- Statement stmt = conn.createStatement();~~
262		~~- stmt.executeUpdate(tdef);~~
	280	+ Statement stmt = admin.createStatement();
	281	+ stmt.executeUpdate(tdef);
	282	+ admin.close();
263	283	} catch (SQLException e) {
264	284	log.error("Cannot create table "+table+" : "+e.getMessage());
265	285	throw new IOException(e.getMessage());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java
—	—	@@ -105,7 +105,6 @@
106	106	GlobalConfiguration global = GlobalConfiguration.getInstance();
107	107	String globalurl = getString("MWConfig","global");
108	108	String indexpath = getString("Indexes","path");
109		~~- String oairepo = getString("OAI","repo");~~
110	109	if(globalurl==null){
111	110	System.out.println("FATAL: Need to define global configuration url in local config file.");
112	111	System.exit(1);
—	—	@@ -114,7 +113,7 @@
115	114	System.exit(1);
116	115	}
117	116	try {
118		~~- global.readFromURL(new URL(globalurl),indexpath,oairepo);~~
	117	+ global.readFromURL(new URL(globalurl),indexpath);
119	118	} catch (MalformedURLException e) {
120	119	System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting...");
121	120	System.exit(1);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java
—	—	@@ -17,6 +17,7 @@
18	18	import java.net.UnknownHostException;
19	19	import java.text.MessageFormat;
20	20	import java.util.ArrayList;
	21	+import java.util.Collection;
21	22	import java.util.Enumeration;
22	23	import java.util.HashSet;
23	24	import java.util.Hashtable;
—	—	@@ -27,6 +28,7 @@
28	29	import java.util.regex.Pattern;
29	30
30	31	import org.wikimedia.lsearch.search.NamespaceFilter;
	32	+import org.wikimedia.lsearch.util.PHPParser;
31	33
32	34	/**
33	35	* Read and parse the global configuration file, is also used
—	—	@@ -58,13 +60,17 @@
59	61	protected Hashtable<String,NamespaceFilter> namespacePrefix;
60	62	/** keyword for all namespaces (i.e. no filtering) */
61	63	protected String namespacePrefixAll;
	64	+ /** suffx -> OAI Repo url pattern */
	65	+ protected Hashtable<String,String> oaiRepo;
	66	+ /** wgLanguageCode from InitialiseSettings, suffix -> lang code */
	67	+ protected Hashtable<String,String> wgLanguageCode = null;
	68	+ /** wgServer, suffix -> server (default server is "default")*/
	69	+ protected Hashtable<String,String> wgServer = null;
62	70
63	71	/** info about this host */
64	72	protected static InetAddress myHost;
65	73	protected static String hostAddr, hostName;
66	74
67		~~- /** OAI repo pattern from lsearch2.conf */~~
68		~~- protected String OAIRepoPattern;~~
69	75	/** Database suffix if dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */
70	76	protected String[] databaseSuffixes = null;
71	77	/** Databases ending in suffix will use additional keyword scores */
—	—	@@ -204,13 +210,13 @@
205	211	* @param url
206	212	* @throws IOException
207	213	*/
208		~~- public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{~~
	214	+ public void readFromURL(URL url, String indexpath) throws IOException{
209	215	BufferedReader in;
210	216	try {
211	217	in = new BufferedReader(
212	218	new InputStreamReader(
213	219	url.openStream()));
214		~~- read(in,indexpath,oaiRepo);~~
	220	+ read(in,indexpath);
215	221	} catch (IOException e) {
216	222	System.out.println("I/O Error in opening or reading global config at url "+url);
217	223	throw e;
—	—	@@ -231,6 +237,7 @@
232	238	indexRsyncPath = new Hashtable<String, String>();
233	239	namespacePrefix = new Hashtable<String,NamespaceFilter>();
234	240	namespacePrefixAll = "all"; // default
	241	+ oaiRepo = new Hashtable<String,String>();
235	242	}
236	243
237	244	protected String[] getArrayProperty(String name){
—	—	@@ -247,7 +254,7 @@
248	255	* @param in opened reader
249	256	* @throws IOException
250	257	*/
251		~~- protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{~~
	258	+ protected void read(BufferedReader in, String indexpath) throws IOException{
252	259	String line="";
253	260	int section = -1;
254	261	Pattern roleRegexp = Pattern.compile("\\((.*?)\\)");
—	—	@@ -258,12 +265,12 @@
259	266	final int SEARCH = 2;
260	267	final int INDEXPATH = 3;
261	268	final int NAMESPACE_PREFIX = 4;
	269	+ final int OAI = 5;
262	270
263	271	int searchGroupNum = -1;
264	272
265	273	init();
266	274	this.indexPath = indexpath;
267		~~- this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo;~~
268	275
269	276	while((line = in.readLine()) != null){
270	277	lineNum ++;
—	—	@@ -293,6 +300,10 @@
294	301	this.databaseSuffixes = getArrayProperty("Database.suffix");
295	302	this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix");
296	303	this.exactCaseSuffix = getArrayProperty("ExactCase.suffix");
	304	+ // try reading intialisesettings
	305	+ String initset = globalProperties.getProperty("WMF.InitialiseSettings");
	306	+ if(initset != null)
	307	+ initializeWmfSettings(initset);
297	308	if(line == null)
298	309	break;
299	310	// else: line points to beginning of next section
—	—	@@ -311,6 +322,8 @@
312	323	section = INDEXPATH;
313	324	else if(s.equalsIgnoreCase("namespace-prefix"))
314	325	section = NAMESPACE_PREFIX;
	326	+ else if(s.equalsIgnoreCase("oai"))
	327	+ section = OAI;
315	328	} else if(section==-1 && !line.trim().equals("")){
316	329	System.out.println("Ignoring a line up to first section heading...");
317	330	} else if(section == DATABASE){
—	—	@@ -355,17 +368,42 @@
356	369	namespacePrefixAll = prefix;
357	370	else
358	371	namespacePrefix.put(prefix,new NamespaceFilter(filter));
	372	+ } else if(section == OAI){
	373	+ String[] parts = splitBySemicolon(line,lineNum);
	374	+ if(parts == null) continue;
	375	+ String suffix = parts[0].trim();
	376	+ String url = parts[1].trim();
	377	+
	378	+ oaiRepo.put(suffix,url);
359	379	}
360	380	}
361	381	if( !checkIntegrity() ){
362	382	in.close();
363	383	System.exit(1);
364	384	}
	385	+
365	386	makeIndexIdPool();
366	387	in.close();
367	388	}
368	389
369		-
	390	+ /**
	391	+ * A bit hackish: read InitialiseSettings which we know have a certain
	392	+ * format to avoid maintaining two copies for config files (one in php
	393	+ * other for lsearch in global conf)
	394	+ *
	395	+ * @param initset
	396	+ */
	397	+ protected void initializeWmfSettings(String initset) {
	398	+ try {
	399	+ PHPParser parser = new PHPParser();
	400	+ String text = parser.readURL(new URL(initset));
	401	+ wgLanguageCode = parser.getLanguages(text);
	402	+ wgServer = parser.getServer(text);
	403	+ } catch (IOException e) {
	404	+ System.out.println("Error: Cannot read InitialiseSettings.php from url "+initset+" : "+e.getMessage());
	405	+ }
	406	+ }
	407	+
370	408	/** Get all hosts which search this inxedId (dbrole) */
371	409	protected HashSet<String> getSearchHosts(String dbrole){
372	410	HashSet<String> searchHosts = new HashSet<String>();
—	—	@@ -445,7 +483,7 @@
446	484	if(rsyncIndexPath == null)
447	485	rsyncIndexPath = indexRsyncPath.get("<default>");
448	486	}
449		~~- String oairepo = MessageFormat.format(OAIRepoPattern,new Object[] {dbname,getLanguage(dbname)});~~
	487	+ String oairepo = getOAIRepo(dbname);
450	488
451	489	IndexId iid = new IndexId(dbrole,
452	490	type,
—	—	@@ -523,7 +561,7 @@
524	562	}
525	563
526	564	protected String[] splitBySemicolon(String line, int lineNum){
527		~~- String[] parts = line.split(":");~~
	565	+ String[] parts = line.split(":",2);
528	566	if(parts.length!=2){
529	567	System.out.println("Error at line "+lineNum+": semicolon missing. Ignoring this line.");
530	568	return null;
—	—	@@ -779,9 +817,18 @@
780	818	/** Get language for a dbname */
781	819	public String getLanguage(String dbname) {
782	820	// first check explicit language paramter in global settings
783		~~- Hashtable<String,String> lang = database.get(dbname).get("language");~~
784		~~- if(lang!=null)~~
785		~~- return lang.get("code");~~
	821	+ Hashtable<String,Hashtable<String,String>> dbparam = database.get(dbname);
	822	+ if(dbparam !=null){
	823	+ Hashtable<String,String> lang = dbparam.get("language");
	824	+ if(lang!=null)
	825	+ return lang.get("code");
	826	+ }
	827	+ // try to get from initialise settings
	828	+ if(wgLanguageCode!=null){
	829	+ String key = findSuffix(wgLanguageCode.keySet(),dbname);
	830	+ if(key != null)
	831	+ return wgLanguageCode.get(key);
	832	+ }
786	833	// try to get languages from suffixes
787	834	if(databaseSuffixes != null){
788	835	for (String suffix : databaseSuffixes) {
—	—	@@ -867,6 +914,51 @@
868	915	public boolean exactCaseIndex(String dbname){
869	916	return checkSuffix(exactCaseSuffix,dbname);
870	917	}
	918	+
	919	+ /** Find suffix that matches dbname */
	920	+ public String findSuffix(Collection<String> suffixes, String dbname){
	921	+ for(String suffix : suffixes){
	922	+ if(dbname.endsWith(suffix)){
	923	+ return suffix;
	924	+ }
	925	+ }
	926	+ return null;
	927	+ }
871	928
	929	+ /** Get OAI-repo url for dbname */
	930	+ public String getOAIRepo(String dbname){
	931	+ String repo = null;
	932	+ // try to get from initialise settings
	933	+ if(wgServer != null){
	934	+ String key = findSuffix(wgServer.keySet(),dbname);
	935	+ if(key == null)
	936	+ key = "default";
	937	+ repo = wgServer.get(key);
	938	+ if(repo != null){
	939	+ if(!repo.endsWith("/"))
	940	+ repo += "/";
	941	+ repo += "w/index.php"; // FIXME: we take this as generic path to index.php
	942	+ }
	943	+
	944	+ }
	945	+ // get from global config
	946	+ if(repo == null){
	947	+ repo = findSuffix(oaiRepo.keySet(),dbname);
	948	+ if(repo != null)
	949	+ repo = oaiRepo.get(repo);
	950	+ if(repo == null && oaiRepo.containsKey("<default>"))
	951	+ repo = oaiRepo.get("<default>");
	952	+ }
	953	+ if(repo == null)
	954	+ return ""; // failed, no url
	955	+
	956	+ // process $lang
	957	+ String lang = getLanguage(dbname);
	958	+ repo = repo.replace("$lang",lang);
	959	+ repo = repo += "?title=Special:OAIRepository";
	960	+
	961	+ return repo;
	962	+ }
	963	+
872	964
873	965	}
\ No newline at end of file
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java
—	—	@@ -60,24 +60,26 @@
61	61
62	62	/**
63	63	* Syntax:
64		~~- * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-n] dbname1 dbname2 ...~~
	64	+ * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-e dbname] [-n] dbname1 dbname2 ...
65	65	* Options:
66	66	* -d - daemonize, otherwise runs only one round of updates to dbs
67		~~- * -s - sleep time after one cycle (default: 3000ms)~~
	67	+ * -s - sleep time after one cycle (default: 30000ms)
68	68	* -t - default timestamp if status file is missing (default: 2001-01-01)
69	69	* -f - file to read databases from
70		~~- * -n - wait for notification of flush after done updating one db (default: false)~~
	70	+ * -n - wait for notification of flush after done updating one db (default: true)
	71	+ * -e - exclude dbname from incremental updates (overrides -f)
71	72	*
72	73	* @param args
73	74	*/
74	75	public static void main(String[] args){
75	76	ArrayList<String> dbnames = new ArrayList<String>();
76	77	boolean daemon = false;
77		~~- long sleepTime = 3000;~~
	78	+ long sleepTime = 30000; // 30s
78	79	String timestamp = null;
79	80	int maxQueueSize = 500;
80	81	String dblist = null;
81		~~- boolean notification = false;~~
	82	+ boolean notification = true;
	83	+ HashSet<String> excludeList = new HashSet<String>();
82	84	HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass
83	85	// args
84	86	for(int i=0; i<args.length; i++){
—	—	@@ -89,6 +91,8 @@
90	92	timestamp = args[++i];
91	93	else if(args[i].equals("-f"))
92	94	dblist = args[++i];
	95	+ else if(args[i].equals("-e"))
	96	+ excludeList.add(args[++i]);
93	97	else if(args[i].equals("-n"))
94	98	notification = true;
95	99	else if(args[i].equals("--help"))
—	—	@@ -115,13 +119,14 @@
116	120	}
117	121	}
118	122	if(dbnames.size() == 0){
119		~~- System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-f dblist] dbname1 dbname2 ...");~~
	123	+ System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] dbname1 dbname2 ...");
120	124	System.out.println("Options:");
121	125	System.out.println(" -d - daemonize, otherwise runs only one round of updates to dbs");
122	126	System.out.println(" -s - sleep time after one cycle (default: "+sleepTime+"ms)");
123	127	System.out.println(" -t - timestamp to start from (if status is missing default: "+timestamp+")");
124	128	System.out.println(" -f - dblist file, one dbname per line");
125	129	System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")");
	130	+ System.out.println(" -e - exclude dbname from incremental updates (overrides -f)");
126	131	return;
127	132	}
128	133	// config
—	—	@@ -141,6 +146,8 @@
142	147	do{
143	148	main_loop: for(String dbname : dbnames){
144	149	try{
	150	+ if(excludeList.contains(dbname))
	151	+ continue;
145	152	IndexId iid = IndexId.get(dbname);
146	153	OAIHarvester harvester = new OAIHarvester(iid,iid.getOAIRepository(),auth);
147	154	Properties status = new Properties();
—	—	@@ -170,7 +177,7 @@
171	178	fetchReferences(records,dbname);
172	179	for(IndexUpdateRecord rec : records){
173	180	Article ar = rec.getArticle();
174		~~- log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());~~
	181	+ log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects());
175	182	}
176	183	// send to indexer
177	184	RMIMessengerClient messenger = new RMIMessengerClient(true);
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java
—	—	@@ -36,7 +36,7 @@
37	37	/** Invoke ListRecords from a certain timestamp */
38	38	public ArrayList<IndexUpdateRecord> getRecords(String from){
39	39	try{
40		~~- read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&from="+from));~~
	40	+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from));
41	41	return collector.getRecords();
42	42	} catch(IOException e){
43	43	log.warn("I/O exception listing records: "+e.getMessage());
—	—	@@ -57,7 +57,7 @@
58	58	/** Invoke ListRecords using the last resumption token */
59	59	public ArrayList<IndexUpdateRecord> getMoreRecords(){
60	60	try{
61		~~- read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));~~
	61	+ read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken));
62	62	return collector.getRecords();
63	63	} catch(IOException e){
64	64	log.warn("I/O exception listing records: "+e.getMessage());
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/PHPParser.java
—	—	@@ -4,6 +4,7 @@
5	5	import java.io.FileReader;
6	6	import java.io.IOException;
7	7	import java.io.InputStreamReader;
	8	+import java.net.MalformedURLException;
8	9	import java.net.URL;
9	10	import java.util.HashMap;
10	11	import java.util.HashSet;
—	—	@@ -123,6 +124,42 @@
124	125	return ns;
125	126	}
126	127
	128	+ /** Get wgLanguages from InitialiseSettings */
	129	+ public Hashtable<String,String> getLanguages(String text){
	130	+ text = text.replaceAll("(#.*)",""); // strip comments
	131	+ Hashtable<String,String> langs = new Hashtable<String,String>();
	132	+
	133	+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
	134	+ Pattern wglang = Pattern.compile("[\"']wgLanguageCode[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
	135	+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
	136	+ Matcher matcher = wglang.matcher(text);
	137	+ while(matcher.find()){
	138	+ Matcher me = entry.matcher(matcher.group(1));
	139	+ while(me.find()){
	140	+ langs.put(me.group(1),me.group(2));
	141	+ }
	142	+ }
	143	+ return langs;
	144	+ }
	145	+
	146	+ /** Get wgServer from InitialiseSettings */
	147	+ public Hashtable<String,String> getServer(String text){
	148	+ text = text.replaceAll("(#.*)",""); // strip comments
	149	+ Hashtable<String,String> servers = new Hashtable<String,String>();
	150	+
	151	+ int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
	152	+ Pattern wgserv = Pattern.compile("[\"']wgServer[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags);
	153	+ Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags);
	154	+ Matcher matcher = wgserv.matcher(text);
	155	+ while(matcher.find()){
	156	+ Matcher me = entry.matcher(matcher.group(1));
	157	+ while(me.find()){
	158	+ servers.put(me.group(1),me.group(2));
	159	+ }
	160	+ }
	161	+ return servers;
	162	+ }
	163	+
127	164	public String readFile(String path){
128	165	char buffer[] = new char[32768];
129	166	String text = "";
—	—	@@ -141,7 +178,7 @@
142	179	return text;
143	180	}
144	181
145		~~- public String readURL(URL url){~~
	182	+ public String readURL(URL url) throws IOException{
146	183	char buffer[] = new char[32768];
147	184	String text = "";
148	185	try {
—	—	@@ -155,13 +192,15 @@
156	193	} while(len > 0);
157	194	r.close();
158	195	} catch (IOException e) {
159		~~- // silent~~
	196	+ throw e;
160	197	}
161	198	return text;
162	199	}
163	200
164		~~- /** Test stuff */~~
165		~~- public static void main(String args[]){~~
	201	+ /** Test stuff
	202	+ * @throws IOException
	203	+ * @throws MalformedURLException */
	204	+ public static void main(String args[]) throws MalformedURLException, IOException{
166	205	String text = "$namespaceNames = array(\n"+
167	206	"NS_MEDIA => \"Medija\",\n"+
168	207	"NS_SPECIAL => \"Posebno\",\n"+
—	—	@@ -178,5 +217,11 @@
179	218	System.out.println(p.getFallBack(text2));
180	219	System.out.println(p.getRedirectMagic(php));
181	220
	221	+ System.out.println(p.getLanguages("'wgLanguageCode' => array('default' => '$lang')"));
	222	+ String initset = p.readURL(new URL("file:///home/rainman/Desktop/InitialiseSettings.php"));
	223	+ System.out.println(p.getLanguages(initset));
	224	+ System.out.println(p.getServer(initset));
	225	+
	226	+
182	227	}
183	228	}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
—	—	@@ -111,8 +111,7 @@
112	112	if(langCode == null || langCode.equals(""))
113	113	return false;
114	114	if(level == 5) // max 5 recursions in depth
115		~~- return false;~~
116		~~- log.info("Reading localization for "+langCode);~~
	115	+ return false;
117	116	// make title case
118	117	langCode = langCode.substring(0,1).toUpperCase()+langCode.substring(1).toLowerCase();
119	118	if(badLocalizations.contains(langCode.toLowerCase())){
—	—	@@ -124,6 +123,7 @@
125	124	log.warn("Property Localization.url not set in config file. Localization disabled.");
126	125	return false;
127	126	}
	127	+ log.info("Reading localization for "+langCode);
128	128	URL url;
129	129	try {
130	130	url = new URL(MessageFormat.format(loc,langCode));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java
—	—	@@ -43,7 +43,7 @@
44	44	log = Logger.getLogger(RankBuilder.class);
45	45
46	46	if(args.length < 2){
47		~~- System.out.println("Syntax: java Importer <inputfile> <dbname>");~~
	47	+ System.out.println("Syntax: java RankBuilder <inputfile> <dbname>");
48	48	return;
49	49	}
50	50	inputfile = args[0];
Index: trunk/lucene-search-2.0/lsearch-global.conf
—	—	@@ -36,6 +36,14 @@
37	37	[Index-Path]
38	38	<default> : /mwsearch
39	39
	40	+# OAI repository info, for incremental updater
	41	+# dbSuffix : base url (to index.php)
	42	+# ?title=Special:OAIRepository is appended to url
	43	+[OAI]
	44	+wiktionary : http://$lang.wiktionary.org/w/index.php
	45	+wikilucene : http://localhost/wiki-lucene/phase3/index.php
	46	+<default> : http://$lang.wikipedia.org/w/index.php
	47	+
40	48	# Global properies
41	49	[Properties]
42	50	# suffixes to database name, the rest is assumed to be language code
—	—	@@ -48,6 +56,10 @@
49	57	# note: this will also turn off stemming!
50	58	ExactCase.suffix=wiktionary wikilucene
51	59
	60	+# wmf-style init file, attempt to read wgserver (for oai) and lang info
	61	+# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html
	62	+# WMF.InitialiseSettings=http://noc.wikimedia.org/conf/InitialiseSettings.php.html
	63	+
52	64	# Put here you custom namespace prefixes
53	65	# Syntax: <prefix_name> : <coma separated list of namespaces>
54	66	# <all> is a special keyword meaning all namespaces
Index: trunk/lucene-search-2.0/lsearch.conf
—	—	@@ -63,6 +63,9 @@
64	64	# Storage.username=root
65	65	# Storage.password=
66	66
	67	+# Storage.adminuser=root
	68	+# Storage.adminpass=
	69	+
67	70	# Values:
68	71	# true - each dbname has a separate db of that name
69	72	# false - each dbname is a prefix for tables in a default db (set default db below)

Status & tagging log

15:19, 12 September 2011 Meno25 (talk | contribs) changed the status of r23065 [removed: ok added: old]