Index: trunk/lucene-search-2.0/test-data/mwsearch-global.test |
— | — | @@ -38,6 +38,14 @@ |
39 | 39 | 192.168.0.2 : frtest.part1, frtest.part2, frtest.part3 |
40 | 40 | 192.168.0.10 : srwiki njawiki |
41 | 41 | |
| 42 | +# OAI repository info, for incremental updater |
| 43 | +# dbSuffix : base url (to index.php) |
| 44 | +# ?title=Special:OAIRepository is appended to url |
| 45 | +[OAI] |
| 46 | +wiktionary : http://$lang.wiktionary.org/w/index.php |
| 47 | +frtest : http://localhost/wiki-lucene/phase3/index.php |
| 48 | +<default> : http://$lang.wikipedia.org/w/index.php |
| 49 | + |
42 | 50 | # Path where indexes are on hosts, after default value put hosts where |
43 | 51 | # the location differs |
44 | 52 | [Index-Path] |
— | — | @@ -51,6 +59,10 @@ |
52 | 60 | # dbnames that end with the suffix will use additional keywords scores |
53 | 61 | KeywordScoring.suffix=wiki rutest |
54 | 62 | |
| 63 | +# wmf-style init file, attempt to read OAI and lang info from it |
| 64 | +# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html |
| 65 | +WMF.InitialiseSettings=file:///home/rainman/Desktop/InitialiseSettings.php |
| 66 | + |
55 | 67 | # databases can be written as {file}, where file contains a list of dbs |
56 | 68 | |
57 | 69 | # Put your custom namespace prefixes here |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/test/GlobalConfigurationTest.java |
— | — | @@ -5,7 +5,6 @@ |
6 | 6 | package org.wikimedia.lsearch.test; |
7 | 7 | |
8 | 8 | import java.io.IOException; |
9 | | -import java.net.Inet4Address; |
10 | 9 | import java.net.InetAddress; |
11 | 10 | import java.net.MalformedURLException; |
12 | 11 | import java.net.URL; |
— | — | @@ -63,7 +62,11 @@ |
64 | 63 | return globalProperties; |
65 | 64 | } |
66 | 65 | |
| 66 | + public Hashtable<String,String> getOaiRepo(){ |
| 67 | + return oaiRepo; |
| 68 | + } |
67 | 69 | |
| 70 | + |
68 | 71 | } |
69 | 72 | |
70 | 73 | public static GlobalConfigurationTest.TestGC testgc = null; |
— | — | @@ -86,7 +89,7 @@ |
87 | 90 | String testurl = "file://"+System.getProperty("user.dir")+"/test-data/mwsearch-global.test"; |
88 | 91 | try { |
89 | 92 | URL url = new URL(testurl); |
90 | | - testgc.readFromURL(url,"/usr/local/var/mwsearch",""); |
| 93 | + testgc.readFromURL(url,"/usr/local/var/mwsearch"); |
91 | 94 | |
92 | 95 | // database |
93 | 96 | Hashtable database = testgc.getDatabase(); |
— | — | @@ -173,8 +176,18 @@ |
174 | 177 | assertTrue(testgc.useKeywordScoring("srwiki")); |
175 | 178 | assertTrue(testgc.useKeywordScoring("rutest")); |
176 | 179 | |
| 180 | + // test oai repo stuff |
| 181 | + Hashtable<String,String> oairepo = testgc.getOaiRepo(); |
| 182 | + assertEquals("http://$lang.wiktionary.org/w/index.php",oairepo.get("wiktionary")); |
| 183 | + assertEquals("http://localhost/wiki-lucene/phase3/index.php",oairepo.get("frtest")); |
| 184 | + assertEquals("http://$lang.wikipedia.org/w/index.php",oairepo.get("<default>")); |
177 | 185 | |
| 186 | + assertEquals("http://sr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("srwiki")); |
| 187 | + assertEquals("http://fr.wikipedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("frtest")); |
178 | 188 | |
| 189 | + // InitialiseSettings test |
| 190 | + assertEquals("sr",testgc.getLanguage("rswikimedia")); |
| 191 | + assertEquals("http://commons.wikimedia.org/w/index.php?title=Special:OAIRepository",testgc.getOAIRepo("commonswiki")); |
179 | 192 | |
180 | 193 | } catch (MalformedURLException e) { |
181 | 194 | e.printStackTrace(); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -18,6 +18,7 @@ |
19 | 19 | import org.wikimedia.lsearch.index.IndexThread; |
20 | 20 | import org.wikimedia.lsearch.ranks.Links; |
21 | 21 | import org.wikimedia.lsearch.ranks.RankBuilder; |
| 22 | +import org.wikimedia.lsearch.storage.Storage; |
22 | 23 | import org.wikimedia.lsearch.util.Localization; |
23 | 24 | import org.wikimedia.lsearch.util.UnicodeDecomposer; |
24 | 25 | |
— | — | @@ -39,8 +40,8 @@ |
40 | 41 | String dbname = null; |
41 | 42 | Boolean optimize = null; |
42 | 43 | Integer mergeFactor = null, maxBufDocs = null; |
43 | | - boolean newIndex = false, makeSnapshot = false; |
44 | | - boolean snapshotDb = false; |
| 44 | + boolean newIndex = true, makeSnapshot = false; |
| 45 | + boolean snapshotDb = false; boolean updateReferences=false; |
45 | 46 | |
46 | 47 | System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n"); |
47 | 48 | |
— | — | @@ -48,10 +49,11 @@ |
49 | 50 | log = Logger.getLogger(Importer.class); |
50 | 51 | |
51 | 52 | if(args.length < 2){ |
52 | | - System.out.println("Syntax: java Importer [-n] [-s] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
| 53 | + System.out.println("Syntax: java Importer [-n] [-s] [-r] [-l limit] [-o optimize] [-m mergeFactor] [-b maxBufDocs] <inputfile> <dbname>"); |
53 | 54 | System.out.println("Options: "); |
54 | | - System.out.println(" -n - create a new index (erase the old one if exists)"); |
| 55 | + System.out.println(" -a - don't create new index, append to old"); |
55 | 56 | System.out.println(" -s - make index snapshot when finished"); |
| 57 | + System.out.println(" -r - update references info on storage backend"); |
56 | 58 | System.out.println(" -l limit_num - add at most limit_num articles"); |
57 | 59 | System.out.println(" -o optimize - true/false overrides optimization param from global settings"); |
58 | 60 | System.out.println(" -m mergeFactor - overrides param from global settings"); |
— | — | @@ -68,8 +70,10 @@ |
69 | 71 | mergeFactor = Integer.parseInt(args[++i]); |
70 | 72 | else if(args[i].equals("-b")) |
71 | 73 | maxBufDocs = Integer.parseInt(args[++i]); |
72 | | - else if(args[i].equals("-n")) |
73 | | - newIndex = true; |
| 74 | + else if(args[i].equals("-a")) |
| 75 | + newIndex = false; |
| 76 | + else if(args[i].equals("-r")) |
| 77 | + updateReferences = true; |
74 | 78 | else if(args[i].equals("-s")) |
75 | 79 | makeSnapshot = true; |
76 | 80 | else if(args[i].equals("--snapshot")){ |
— | — | @@ -99,6 +103,14 @@ |
100 | 104 | |
101 | 105 | // regenerate link and redirect information |
102 | 106 | Links links = RankBuilder.processLinks(inputfile,RankBuilder.getTitles(inputfile,langCode),langCode,org.wikimedia.lsearch.ranks.LinkReader.READ_REDIRECTS); |
| 107 | + |
| 108 | + if(updateReferences){ |
| 109 | + try { |
| 110 | + Storage.getInstance().storePageReferences(links.getAll(),dbname); |
| 111 | + } catch (IOException e) { |
| 112 | + log.error("Failed to update references info: "+e.getMessage()); |
| 113 | + } |
| 114 | + } |
103 | 115 | links.generateRedirectLists(); |
104 | 116 | |
105 | 117 | log.info("Third pass, indexing articles..."); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/storage/MySQLStorage.java |
— | — | @@ -38,6 +38,10 @@ |
39 | 39 | protected String username; |
40 | 40 | /** mysql password */ |
41 | 41 | protected String password; |
| 42 | + /** administrator's username */ |
| 43 | + protected String adminUsername; |
| 44 | + /** administrator's password */ |
| 45 | + protected String adminPassword; |
42 | 46 | /** If we should separate data in many dbs */ |
43 | 47 | protected boolean separate; |
44 | 48 | /** db where to put everything, if we are not using one db per dbname */ |
— | — | @@ -82,6 +86,9 @@ |
83 | 87 | username = config.getString("Storage","username","root"); |
84 | 88 | password = config.getString("Storage","password",""); |
85 | 89 | |
| 90 | + adminUsername = config.getString("Storage","adminuser",username); |
| 91 | + adminPassword = config.getString("Storage","adminpass",password); |
| 92 | + |
86 | 93 | // figure out db configuration |
87 | 94 | separate = config.getBoolean("Storage","useSeparateDBs"); |
88 | 95 | if(!separate){ |
— | — | @@ -94,16 +101,21 @@ |
95 | 102 | |
96 | 103 | /** Get connection for reading of (possibly lagged) stuff, i.e. on slaves (or master if there are no slaves) */ |
97 | 104 | protected Connection getReadConnection(String dbname) throws IOException{ |
98 | | - return openConnection(dbname,false); |
| 105 | + return openConnection(dbname,false,false); |
99 | 106 | } |
100 | 107 | |
101 | 108 | /** Get connection for writing stuff, i.e. on the master */ |
102 | 109 | protected Connection getWriteConnection(String dbname) throws IOException{ |
103 | | - return openConnection(dbname,true); |
| 110 | + return openConnection(dbname,true,false); |
104 | 111 | } |
105 | 112 | |
| 113 | + /** Get administrator's connection for creating tables/db, etc. (on master) */ |
| 114 | + protected Connection getAdminConnection(String dbname) throws IOException { |
| 115 | + return openConnection(dbname,true,true); |
| 116 | + } |
| 117 | + |
106 | 118 | /** Open connection on the master, or load-balanced on one of the slaves */ |
107 | | - protected Connection openConnection(String dbname, boolean onMaster) throws IOException { |
| 119 | + protected Connection openConnection(String dbname, boolean onMaster, boolean admin) throws IOException { |
108 | 120 | String host=null; |
109 | 121 | if(onMaster || slaves == null) |
110 | 122 | host = master; |
— | — | @@ -121,8 +133,12 @@ |
122 | 134 | String dburl = "jdbc:mysql://"+host+":3306/"; |
123 | 135 | if(!separate && defaultDB!=null) |
124 | 136 | dburl += defaultDB; |
| 137 | + dburl += "?useUnicode=yes&characterEncoding=UTF-8"; |
125 | 138 | try { |
126 | | - return DriverManager.getConnection(dburl, username, password); |
| 139 | + if(admin) |
| 140 | + return DriverManager.getConnection(dburl, adminUsername, adminPassword); |
| 141 | + else |
| 142 | + return DriverManager.getConnection(dburl, username, password); |
127 | 143 | } catch (SQLException e) { |
128 | 144 | log.error("Cannot establish connection to "+dburl+" - check host, db, username and password : "+e.getMessage()); |
129 | 145 | throw new IOException("Cannot establish connection to mysql database."); |
— | — | @@ -134,7 +150,7 @@ |
135 | 151 | } |
136 | 152 | |
137 | 153 | public String escape(String str){ |
138 | | - return str.replace("'","\\'"); |
| 154 | + return str.replace("\\","\\\\").replace("'","\\'"); |
139 | 155 | } |
140 | 156 | |
141 | 157 | public String getTableName(String name, String dbname){ |
— | — | @@ -147,7 +163,9 @@ |
148 | 164 | // inherit javadoc |
149 | 165 | public Collection<CompactArticleLinks> getPageReferences(Collection<Title> titles, String dbname) throws IOException { |
150 | 166 | String sql = "SELECT rf_key, rf_references from "+getTableName("references",dbname)+" WHERE "; |
151 | | - if(titles.size()==1){ |
| 167 | + if(titles == null || titles.size()==0) |
| 168 | + return new ArrayList<CompactArticleLinks>(); |
| 169 | + else if(titles.size()==1){ |
152 | 170 | sql += "rf_key="+quote(escape(titles.iterator().next().getKey())); |
153 | 171 | } else{ |
154 | 172 | StringBuilder sb = new StringBuilder(sql); |
— | — | @@ -155,7 +173,7 @@ |
156 | 174 | Iterator<Title> it = titles.iterator(); |
157 | 175 | while(it.hasNext()){ |
158 | 176 | sb.append('\''); |
159 | | - sb.append(it.next().getKey()); |
| 177 | + sb.append(escape(it.next().getKey())); |
160 | 178 | sb.append('\''); |
161 | 179 | if(it.hasNext()) |
162 | 180 | sb.append(','); |
— | — | @@ -256,9 +274,11 @@ |
257 | 275 | tdef = def; |
258 | 276 | // create |
259 | 277 | try { |
| 278 | + Connection admin = getAdminConnection(dbname); |
260 | 279 | log.info("Creating table "+name+" on "+dbname); |
261 | | - Statement stmt = conn.createStatement(); |
262 | | - stmt.executeUpdate(tdef); |
| 280 | + Statement stmt = admin.createStatement(); |
| 281 | + stmt.executeUpdate(tdef); |
| 282 | + admin.close(); |
263 | 283 | } catch (SQLException e) { |
264 | 284 | log.error("Cannot create table "+table+" : "+e.getMessage()); |
265 | 285 | throw new IOException(e.getMessage()); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/Configuration.java |
— | — | @@ -105,7 +105,6 @@ |
106 | 106 | GlobalConfiguration global = GlobalConfiguration.getInstance(); |
107 | 107 | String globalurl = getString("MWConfig","global"); |
108 | 108 | String indexpath = getString("Indexes","path"); |
109 | | - String oairepo = getString("OAI","repo"); |
110 | 109 | if(globalurl==null){ |
111 | 110 | System.out.println("FATAL: Need to define global configuration url in local config file."); |
112 | 111 | System.exit(1); |
— | — | @@ -114,7 +113,7 @@ |
115 | 114 | System.exit(1); |
116 | 115 | } |
117 | 116 | try { |
118 | | - global.readFromURL(new URL(globalurl),indexpath,oairepo); |
| 117 | + global.readFromURL(new URL(globalurl),indexpath); |
119 | 118 | } catch (MalformedURLException e) { |
120 | 119 | System.out.println("Malformed URL "+globalurl+" cannot read global configuration (check MWConfig.global in "+CONF_FILE_NAME+"), exiting..."); |
121 | 120 | System.exit(1); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/GlobalConfiguration.java |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | import java.net.UnknownHostException; |
19 | 19 | import java.text.MessageFormat; |
20 | 20 | import java.util.ArrayList; |
| 21 | +import java.util.Collection; |
21 | 22 | import java.util.Enumeration; |
22 | 23 | import java.util.HashSet; |
23 | 24 | import java.util.Hashtable; |
— | — | @@ -27,6 +28,7 @@ |
28 | 29 | import java.util.regex.Pattern; |
29 | 30 | |
30 | 31 | import org.wikimedia.lsearch.search.NamespaceFilter; |
| 32 | +import org.wikimedia.lsearch.util.PHPParser; |
31 | 33 | |
32 | 34 | /** |
33 | 35 | * Read and parse the global configuration file, is also used |
— | — | @@ -58,13 +60,17 @@ |
59 | 61 | protected Hashtable<String,NamespaceFilter> namespacePrefix; |
60 | 62 | /** keyword for all namespaces (i.e. no filtering) */ |
61 | 63 | protected String namespacePrefixAll; |
| 64 | + /** suffix -> OAI Repo url pattern */ |
| 65 | + protected Hashtable<String,String> oaiRepo; |
| 66 | + /** wgLanguageCode from InitialiseSettings, suffix -> lang code */ |
| 67 | + protected Hashtable<String,String> wgLanguageCode = null; |
| 68 | + /** wgServer, suffix -> server (default server is "default")*/ |
| 69 | + protected Hashtable<String,String> wgServer = null; |
62 | 70 | |
63 | 71 | /** info about this host */ |
64 | 72 | protected static InetAddress myHost; |
65 | 73 | protected static String hostAddr, hostName; |
66 | 74 | |
67 | | - /** OAI repo pattern from lsearch2.conf */ |
68 | | - protected String OAIRepoPattern; |
69 | 75 | /** Database suffix of dbname, the rest is supposed to be language, e.g srwiki => (suffix wiki) => sr */ |
70 | 76 | protected String[] databaseSuffixes = null; |
71 | 77 | /** Databases ending in suffix will use additional keyword scores */ |
— | — | @@ -204,13 +210,13 @@ |
205 | 211 | * @param url |
206 | 212 | * @throws IOException |
207 | 213 | */ |
208 | | - public void readFromURL(URL url, String indexpath, String oaiRepo) throws IOException{ |
| 214 | + public void readFromURL(URL url, String indexpath) throws IOException{ |
209 | 215 | BufferedReader in; |
210 | 216 | try { |
211 | 217 | in = new BufferedReader( |
212 | 218 | new InputStreamReader( |
213 | 219 | url.openStream())); |
214 | | - read(in,indexpath,oaiRepo); |
| 220 | + read(in,indexpath); |
215 | 221 | } catch (IOException e) { |
216 | 222 | System.out.println("I/O Error in opening or reading global config at url "+url); |
217 | 223 | throw e; |
— | — | @@ -231,6 +237,7 @@ |
232 | 238 | indexRsyncPath = new Hashtable<String, String>(); |
233 | 239 | namespacePrefix = new Hashtable<String,NamespaceFilter>(); |
234 | 240 | namespacePrefixAll = "all"; // default |
| 241 | + oaiRepo = new Hashtable<String,String>(); |
235 | 242 | } |
236 | 243 | |
237 | 244 | protected String[] getArrayProperty(String name){ |
— | — | @@ -247,7 +254,7 @@ |
248 | 255 | * @param in opened reader |
249 | 256 | * @throws IOException |
250 | 257 | */ |
251 | | - protected void read(BufferedReader in, String indexpath, String oaiRepo) throws IOException{ |
| 258 | + protected void read(BufferedReader in, String indexpath) throws IOException{ |
252 | 259 | String line=""; |
253 | 260 | int section = -1; |
254 | 261 | Pattern roleRegexp = Pattern.compile("\\((.*?)\\)"); |
— | — | @@ -258,12 +265,12 @@ |
259 | 266 | final int SEARCH = 2; |
260 | 267 | final int INDEXPATH = 3; |
261 | 268 | final int NAMESPACE_PREFIX = 4; |
| 269 | + final int OAI = 5; |
262 | 270 | |
263 | 271 | int searchGroupNum = -1; |
264 | 272 | |
265 | 273 | init(); |
266 | 274 | this.indexPath = indexpath; |
267 | | - this.OAIRepoPattern = oaiRepo == null? "" : oaiRepo; |
268 | 275 | |
269 | 276 | while((line = in.readLine()) != null){ |
270 | 277 | lineNum ++; |
— | — | @@ -293,6 +300,10 @@ |
294 | 301 | this.databaseSuffixes = getArrayProperty("Database.suffix"); |
295 | 302 | this.keywordScoringSuffixes = getArrayProperty("KeywordScoring.suffix"); |
296 | 303 | this.exactCaseSuffix = getArrayProperty("ExactCase.suffix"); |
| 304 | + // try reading InitialiseSettings |
| 305 | + String initset = globalProperties.getProperty("WMF.InitialiseSettings"); |
| 306 | + if(initset != null) |
| 307 | + initializeWmfSettings(initset); |
297 | 308 | if(line == null) |
298 | 309 | break; |
299 | 310 | // else: line points to beginning of next section |
— | — | @@ -311,6 +322,8 @@ |
312 | 323 | section = INDEXPATH; |
313 | 324 | else if(s.equalsIgnoreCase("namespace-prefix")) |
314 | 325 | section = NAMESPACE_PREFIX; |
| 326 | + else if(s.equalsIgnoreCase("oai")) |
| 327 | + section = OAI; |
315 | 328 | } else if(section==-1 && !line.trim().equals("")){ |
316 | 329 | System.out.println("Ignoring a line up to first section heading..."); |
317 | 330 | } else if(section == DATABASE){ |
— | — | @@ -355,17 +368,42 @@ |
356 | 369 | namespacePrefixAll = prefix; |
357 | 370 | else |
358 | 371 | namespacePrefix.put(prefix,new NamespaceFilter(filter)); |
| 372 | + } else if(section == OAI){ |
| 373 | + String[] parts = splitBySemicolon(line,lineNum); |
| 374 | + if(parts == null) continue; |
| 375 | + String suffix = parts[0].trim(); |
| 376 | + String url = parts[1].trim(); |
| 377 | + |
| 378 | + oaiRepo.put(suffix,url); |
359 | 379 | } |
360 | 380 | } |
361 | 381 | if( !checkIntegrity() ){ |
362 | 382 | in.close(); |
363 | 383 | System.exit(1); |
364 | 384 | } |
| 385 | + |
365 | 386 | makeIndexIdPool(); |
366 | 387 | in.close(); |
367 | 388 | } |
368 | 389 | |
369 | | - |
| 390 | + /** |
| 391 | + * A bit hackish: read InitialiseSettings, which we know has a certain |
| 392 | + * format, to avoid maintaining two copies of config files (one in php, |
| 393 | + * the other for lsearch in global conf) |
| 394 | + * |
| 395 | + * @param initset |
| 396 | + */ |
| 397 | + protected void initializeWmfSettings(String initset) { |
| 398 | + try { |
| 399 | + PHPParser parser = new PHPParser(); |
| 400 | + String text = parser.readURL(new URL(initset)); |
| 401 | + wgLanguageCode = parser.getLanguages(text); |
| 402 | + wgServer = parser.getServer(text); |
| 403 | + } catch (IOException e) { |
| 404 | + System.out.println("Error: Cannot read InitialiseSettings.php from url "+initset+" : "+e.getMessage()); |
| 405 | + } |
| 406 | + } |
| 407 | + |
370 | 408 | /** Get all hosts which search this indexId (dbrole) */ |
371 | 409 | protected HashSet<String> getSearchHosts(String dbrole){ |
372 | 410 | HashSet<String> searchHosts = new HashSet<String>(); |
— | — | @@ -445,7 +483,7 @@ |
446 | 484 | if(rsyncIndexPath == null) |
447 | 485 | rsyncIndexPath = indexRsyncPath.get("<default>"); |
448 | 486 | } |
449 | | - String oairepo = MessageFormat.format(OAIRepoPattern,new Object[] {dbname,getLanguage(dbname)}); |
| 487 | + String oairepo = getOAIRepo(dbname); |
450 | 488 | |
451 | 489 | IndexId iid = new IndexId(dbrole, |
452 | 490 | type, |
— | — | @@ -523,7 +561,7 @@ |
524 | 562 | } |
525 | 563 | |
526 | 564 | protected String[] splitBySemicolon(String line, int lineNum){ |
527 | | - String[] parts = line.split(":"); |
| 565 | + String[] parts = line.split(":",2); |
528 | 566 | if(parts.length!=2){ |
529 | 567 | System.out.println("Error at line "+lineNum+": semicolon missing. Ignoring this line."); |
530 | 568 | return null; |
— | — | @@ -779,9 +817,18 @@ |
780 | 818 | /** Get language for a dbname */ |
781 | 819 | public String getLanguage(String dbname) { |
782 | 820 | // first check explicit language parameter in global settings |
783 | | - Hashtable<String,String> lang = database.get(dbname).get("language"); |
784 | | - if(lang!=null) |
785 | | - return lang.get("code"); |
| 821 | + Hashtable<String,Hashtable<String,String>> dbparam = database.get(dbname); |
| 822 | + if(dbparam !=null){ |
| 823 | + Hashtable<String,String> lang = dbparam.get("language"); |
| 824 | + if(lang!=null) |
| 825 | + return lang.get("code"); |
| 826 | + } |
| 827 | + // try to get from initialise settings |
| 828 | + if(wgLanguageCode!=null){ |
| 829 | + String key = findSuffix(wgLanguageCode.keySet(),dbname); |
| 830 | + if(key != null) |
| 831 | + return wgLanguageCode.get(key); |
| 832 | + } |
786 | 833 | // try to get languages from suffixes |
787 | 834 | if(databaseSuffixes != null){ |
788 | 835 | for (String suffix : databaseSuffixes) { |
— | — | @@ -867,6 +914,51 @@ |
868 | 915 | public boolean exactCaseIndex(String dbname){ |
869 | 916 | return checkSuffix(exactCaseSuffix,dbname); |
870 | 917 | } |
| 918 | + |
| 919 | + /** Find suffix that matches dbname */ |
| 920 | + public String findSuffix(Collection<String> suffixes, String dbname){ |
| 921 | + for(String suffix : suffixes){ |
| 922 | + if(dbname.endsWith(suffix)){ |
| 923 | + return suffix; |
| 924 | + } |
| 925 | + } |
| 926 | + return null; |
| 927 | + } |
871 | 928 | |
| 929 | + /** Get OAI-repo url for dbname */ |
| 930 | + public String getOAIRepo(String dbname){ |
| 931 | + String repo = null; |
| 932 | + // try to get from initialise settings |
| 933 | + if(wgServer != null){ |
| 934 | + String key = findSuffix(wgServer.keySet(),dbname); |
| 935 | + if(key == null) |
| 936 | + key = "default"; |
| 937 | + repo = wgServer.get(key); |
| 938 | + if(repo != null){ |
| 939 | + if(!repo.endsWith("/")) |
| 940 | + repo += "/"; |
| 941 | + repo += "w/index.php"; // FIXME: we take this as generic path to index.php |
| 942 | + } |
| 943 | + |
| 944 | + } |
| 945 | + // get from global config |
| 946 | + if(repo == null){ |
| 947 | + repo = findSuffix(oaiRepo.keySet(),dbname); |
| 948 | + if(repo != null) |
| 949 | + repo = oaiRepo.get(repo); |
| 950 | + if(repo == null && oaiRepo.containsKey("<default>")) |
| 951 | + repo = oaiRepo.get("<default>"); |
| 952 | + } |
| 953 | + if(repo == null) |
| 954 | + return ""; // failed, no url |
| 955 | + |
| 956 | + // process $lang |
| 957 | + String lang = getLanguage(dbname); |
| 958 | + repo = repo.replace("$lang",lang); |
| 959 | + repo = repo += "?title=Special:OAIRepository"; |
| 960 | + |
| 961 | + return repo; |
| 962 | + } |
| 963 | + |
872 | 964 | |
873 | 965 | } |
\ No newline at end of file |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/IncrementalUpdater.java |
— | — | @@ -60,24 +60,26 @@ |
61 | 61 | |
62 | 62 | /** |
63 | 63 | * Syntax: |
64 | | - * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-n] dbname1 dbname2 ... |
| 64 | + * java IncrementalUpdater [-d] [-t timestamp] [-s sleep] [-f dblist] [-e dbname] [-n] dbname1 dbname2 ... |
65 | 65 | * Options: |
66 | 66 | * -d - daemonize, otherwise runs only one round of updates to dbs |
67 | | - * -s - sleep time after one cycle (default: 3000ms) |
| 67 | + * -s - sleep time after one cycle (default: 30000ms) |
68 | 68 | * -t - default timestamp if status file is missing (default: 2001-01-01) |
69 | 69 | * -f - file to read databases from |
70 | | - * -n - wait for notification of flush after done updating one db (default: false) |
| 70 | + * -n - wait for notification of flush after done updating one db (default: true) |
| 71 | + * -e - exclude dbname from incremental updates (overrides -f) |
71 | 72 | * |
72 | 73 | * @param args |
73 | 74 | */ |
74 | 75 | public static void main(String[] args){ |
75 | 76 | ArrayList<String> dbnames = new ArrayList<String>(); |
76 | 77 | boolean daemon = false; |
77 | | - long sleepTime = 3000; |
| 78 | + long sleepTime = 30000; // 30s |
78 | 79 | String timestamp = null; |
79 | 80 | int maxQueueSize = 500; |
80 | 81 | String dblist = null; |
81 | | - boolean notification = false; |
| 82 | + boolean notification = true; |
| 83 | + HashSet<String> excludeList = new HashSet<String>(); |
82 | 84 | HashSet<String> firstPass = new HashSet<String>(); // if dbname is here, then it's our update pass |
83 | 85 | // args |
84 | 86 | for(int i=0; i<args.length; i++){ |
— | — | @@ -89,6 +91,8 @@ |
90 | 92 | timestamp = args[++i]; |
91 | 93 | else if(args[i].equals("-f")) |
92 | 94 | dblist = args[++i]; |
| 95 | + else if(args[i].equals("-e")) |
| 96 | + excludeList.add(args[++i]); |
93 | 97 | else if(args[i].equals("-n")) |
94 | 98 | notification = true; |
95 | 99 | else if(args[i].equals("--help")) |
— | — | @@ -115,13 +119,14 @@ |
116 | 120 | } |
117 | 121 | } |
118 | 122 | if(dbnames.size() == 0){ |
119 | | - System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-f dblist] dbname1 dbname2 ..."); |
| 123 | + System.out.println("Syntax: java IncrementalUpdater [-d] [-s sleep] [-t timestamp] [-e dbname] [-f dblist] dbname1 dbname2 ..."); |
120 | 124 | System.out.println("Options:"); |
121 | 125 | System.out.println(" -d - daemonize, otherwise runs only one round of updates to dbs"); |
122 | 126 | System.out.println(" -s - sleep time after one cycle (default: "+sleepTime+"ms)"); |
123 | 127 | System.out.println(" -t - timestamp to start from (if status is missing default: "+timestamp+")"); |
124 | 128 | System.out.println(" -f - dblist file, one dbname per line"); |
125 | 129 | System.out.println(" -n - wait for notification of flush after done updating one db (default: "+notification+")"); |
| 130 | + System.out.println(" -e - exclude dbname from incremental updates (overrides -f)"); |
126 | 131 | return; |
127 | 132 | } |
128 | 133 | // config |
— | — | @@ -141,6 +146,8 @@ |
142 | 147 | do{ |
143 | 148 | main_loop: for(String dbname : dbnames){ |
144 | 149 | try{ |
| 150 | + if(excludeList.contains(dbname)) |
| 151 | + continue; |
145 | 152 | IndexId iid = IndexId.get(dbname); |
146 | 153 | OAIHarvester harvester = new OAIHarvester(iid,iid.getOAIRepository(),auth); |
147 | 154 | Properties status = new Properties(); |
— | — | @@ -170,7 +177,7 @@ |
171 | 178 | fetchReferences(records,dbname); |
172 | 179 | for(IndexUpdateRecord rec : records){ |
173 | 180 | Article ar = rec.getArticle(); |
174 | | - log.debug("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
| 181 | + log.info("Sending "+ar+" with rank "+ar.getReferences()+" and "+ar.getRedirects().size()+" redirects: "+ar.getRedirects()); |
175 | 182 | } |
176 | 183 | // send to indexer |
177 | 184 | RMIMessengerClient messenger = new RMIMessengerClient(true); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/oai/OAIHarvester.java |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | /** Invoke ListRecords from a certain timestamp */ |
38 | 38 | public ArrayList<IndexUpdateRecord> getRecords(String from){ |
39 | 39 | try{ |
40 | | - read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&from="+from)); |
| 40 | + read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&from="+from)); |
41 | 41 | return collector.getRecords(); |
42 | 42 | } catch(IOException e){ |
43 | 43 | log.warn("I/O exception listing records: "+e.getMessage()); |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | /** Invoke ListRecords using the last resumption token */ |
59 | 59 | public ArrayList<IndexUpdateRecord> getMoreRecords(){ |
60 | 60 | try{ |
61 | | - read(new URL(urlbase+"?verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken)); |
| 61 | + read(new URL(urlbase+"&verb=ListRecords&metadataPrefix=lsearch&resumptionToken="+resumptionToken)); |
62 | 62 | return collector.getRecords(); |
63 | 63 | } catch(IOException e){ |
64 | 64 | log.warn("I/O exception listing records: "+e.getMessage()); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/PHPParser.java |
— | — | @@ -4,6 +4,7 @@ |
5 | 5 | import java.io.FileReader; |
6 | 6 | import java.io.IOException; |
7 | 7 | import java.io.InputStreamReader; |
| 8 | +import java.net.MalformedURLException; |
8 | 9 | import java.net.URL; |
9 | 10 | import java.util.HashMap; |
10 | 11 | import java.util.HashSet; |
— | — | @@ -123,6 +124,42 @@ |
124 | 125 | return ns; |
125 | 126 | } |
126 | 127 | |
| 128 | + /** Get wgLanguageCode (suffix -> language code) from InitialiseSettings */ |
| 129 | + public Hashtable<String,String> getLanguages(String text){ |
| 130 | + text = text.replaceAll("(#.*)",""); // strip comments |
| 131 | + Hashtable<String,String> langs = new Hashtable<String,String>(); |
| 132 | + |
| 133 | + int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL; |
| 134 | + Pattern wglang = Pattern.compile("[\"']wgLanguageCode[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags); |
| 135 | + Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags); |
| 136 | + Matcher matcher = wglang.matcher(text); |
| 137 | + while(matcher.find()){ |
| 138 | + Matcher me = entry.matcher(matcher.group(1)); |
| 139 | + while(me.find()){ |
| 140 | + langs.put(me.group(1),me.group(2)); |
| 141 | + } |
| 142 | + } |
| 143 | + return langs; |
| 144 | + } |
| 145 | + |
| 146 | + /** Get wgServer from InitialiseSettings */ |
| 147 | + public Hashtable<String,String> getServer(String text){ |
| 148 | + text = text.replaceAll("(#.*)",""); // strip comments |
| 149 | + Hashtable<String,String> servers = new Hashtable<String,String>(); |
| 150 | + |
| 151 | + int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL; |
| 152 | + Pattern wgserv = Pattern.compile("[\"']wgServer[\"']\\s*=>\\s*array\\s*\\((.*?)\\)",flags); |
| 153 | + Pattern entry = Pattern.compile("[\"'](.*?)[\"']\\s*=>\\s*[\"'](.*?)[\"']",flags); |
| 154 | + Matcher matcher = wgserv.matcher(text); |
| 155 | + while(matcher.find()){ |
| 156 | + Matcher me = entry.matcher(matcher.group(1)); |
| 157 | + while(me.find()){ |
| 158 | + servers.put(me.group(1),me.group(2)); |
| 159 | + } |
| 160 | + } |
| 161 | + return servers; |
| 162 | + } |
| 163 | + |
127 | 164 | public String readFile(String path){ |
128 | 165 | char buffer[] = new char[32768]; |
129 | 166 | String text = ""; |
— | — | @@ -141,7 +178,7 @@ |
142 | 179 | return text; |
143 | 180 | } |
144 | 181 | |
145 | | - public String readURL(URL url){ |
| 182 | + public String readURL(URL url) throws IOException{ |
146 | 183 | char buffer[] = new char[32768]; |
147 | 184 | String text = ""; |
148 | 185 | try { |
— | — | @@ -155,13 +192,15 @@ |
156 | 193 | } while(len > 0); |
157 | 194 | r.close(); |
158 | 195 | } catch (IOException e) { |
159 | | - // silent |
| 196 | + throw e; |
160 | 197 | } |
161 | 198 | return text; |
162 | 199 | } |
163 | 200 | |
164 | | - /** Test stuff */ |
165 | | - public static void main(String args[]){ |
| 201 | + /** Test stuff |
| 202 | + * @throws IOException |
| 203 | + * @throws MalformedURLException */ |
| 204 | + public static void main(String args[]) throws MalformedURLException, IOException{ |
166 | 205 | String text = "$namespaceNames = array(\n"+ |
167 | 206 | "NS_MEDIA => \"Medija\",\n"+ |
168 | 207 | "NS_SPECIAL => \"Posebno\",\n"+ |
— | — | @@ -178,5 +217,11 @@ |
179 | 218 | System.out.println(p.getFallBack(text2)); |
180 | 219 | System.out.println(p.getRedirectMagic(php)); |
181 | 220 | |
| 221 | + System.out.println(p.getLanguages("'wgLanguageCode' => array('default' => '$lang')")); |
| 222 | + String initset = p.readURL(new URL("file:///home/rainman/Desktop/InitialiseSettings.php")); |
| 223 | + System.out.println(p.getLanguages(initset)); |
| 224 | + System.out.println(p.getServer(initset)); |
| 225 | + |
| 226 | + |
182 | 227 | } |
183 | 228 | } |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -111,8 +111,7 @@ |
112 | 112 | if(langCode == null || langCode.equals("")) |
113 | 113 | return false; |
114 | 114 | if(level == 5) // max 5 recursions in depth |
115 | | - return false; |
116 | | - log.info("Reading localization for "+langCode); |
| 115 | + return false; |
117 | 116 | // make title case |
118 | 117 | langCode = langCode.substring(0,1).toUpperCase()+langCode.substring(1).toLowerCase(); |
119 | 118 | if(badLocalizations.contains(langCode.toLowerCase())){ |
— | — | @@ -124,6 +123,7 @@ |
125 | 124 | log.warn("Property Localization.url not set in config file. Localization disabled."); |
126 | 125 | return false; |
127 | 126 | } |
| 127 | + log.info("Reading localization for "+langCode); |
128 | 128 | URL url; |
129 | 129 | try { |
130 | 130 | url = new URL(MessageFormat.format(loc,langCode)); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/ranks/RankBuilder.java |
— | — | @@ -43,7 +43,7 @@ |
44 | 44 | log = Logger.getLogger(RankBuilder.class); |
45 | 45 | |
46 | 46 | if(args.length < 2){ |
47 | | - System.out.println("Syntax: java Importer <inputfile> <dbname>"); |
| 47 | + System.out.println("Syntax: java RankBuilder <inputfile> <dbname>"); |
48 | 48 | return; |
49 | 49 | } |
50 | 50 | inputfile = args[0]; |
Index: trunk/lucene-search-2.0/lsearch-global.conf |
— | — | @@ -36,6 +36,14 @@ |
37 | 37 | [Index-Path] |
38 | 38 | <default> : /mwsearch |
39 | 39 | |
| 40 | +# OAI repository info, for incremental updater |
| 41 | +# dbSuffix : base url (to index.php) |
| 42 | +# ?title=Special:OAIRepository is appended to url |
| 43 | +[OAI] |
| 44 | +wiktionary : http://$lang.wiktionary.org/w/index.php |
| 45 | +wikilucene : http://localhost/wiki-lucene/phase3/index.php |
| 46 | +<default> : http://$lang.wikipedia.org/w/index.php |
| 47 | + |
40 | 48 | # Global properties |
41 | 49 | [Properties] |
42 | 50 | # suffixes to database name, the rest is assumed to be language code |
— | — | @@ -48,6 +56,10 @@ |
49 | 57 | # note: this will also turn off stemming! |
50 | 58 | ExactCase.suffix=wiktionary wikilucene |
51 | 59 | |
| 60 | +# wmf-style init file, attempt to read wgserver (for oai) and lang info |
| 61 | +# for sample see http://noc.wikimedia.org/conf/InitialiseSettings.php.html |
| 62 | +# WMF.InitialiseSettings=http://noc.wikimedia.org/conf/InitialiseSettings.php.html |
| 63 | + |
52 | 64 | # Put your custom namespace prefixes here |
53 | 65 | # Syntax: <prefix_name> : <comma-separated list of namespaces> |
54 | 66 | # <all> is a special keyword meaning all namespaces |
Index: trunk/lucene-search-2.0/lsearch.conf |
— | — | @@ -63,6 +63,9 @@ |
64 | 64 | # Storage.username=root |
65 | 65 | # Storage.password= |
66 | 66 | |
| 67 | +# Storage.adminuser=root |
| 68 | +# Storage.adminpass= |
| 69 | + |
67 | 70 | # Values: |
68 | 71 | # true - each dbname has a separate db of that name |
69 | 72 | # false - each dbname is a prefix for tables in a default db (set default db below) |