Index: trunk/lucene-search-2.0/.classpath |
— | — | @@ -12,5 +12,6 @@ |
13 | 13 | <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> |
14 | 14 | <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/> |
15 | 15 | <classpathentry kind="lib" path="lib/snowball.jar"/> |
| 16 | + <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/> |
16 | 17 | <classpathentry kind="output" path="bin"/> |
17 | 18 | </classpath> |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java |
— | — | @@ -0,0 +1,66 @@ |
| 2 | +package org.wikimedia.lsearch.importer; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | + |
| 7 | +import org.apache.log4j.Logger; |
| 8 | +import org.mediawiki.dumper.ProgressFilter; |
| 9 | +import org.mediawiki.dumper.Tools; |
| 10 | +import org.mediawiki.importer.XmlDumpReader; |
| 11 | +import org.wikimedia.lsearch.config.Configuration; |
| 12 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 13 | +import org.wikimedia.lsearch.util.Localization; |
| 14 | +import org.wikimedia.lsearch.util.UnicodeDecomposer; |
| 15 | + |
| 16 | +/** |
| 17 | + * Main class, builds index from a database dump. |
| 18 | + * Syntax: java Importer inputfile dbname |
| 19 | + * |
| 20 | + * @author rainman |
| 21 | + * |
| 22 | + */ |
| 23 | +public class Importer { |
| 24 | + |
| 25 | + /** |
| 26 | + * @param args |
| 27 | + */ |
| 28 | + public static void main(String[] args) { |
| 29 | + System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n"); |
| 30 | + |
| 31 | + Configuration.open(); |
| 32 | + Logger log = Logger.getLogger(Importer.class); |
| 33 | + |
| 34 | + if(args.length != 2){ |
| 35 | + System.out.println("Syntax: java Importer <inputfile> <dbname>"); |
| 36 | + return; |
| 37 | + } |
| 38 | + String inputfile = args[0]; |
| 39 | + String dbname = args[1]; |
| 40 | + |
| 41 | + // preload |
| 42 | + UnicodeDecomposer.getInstance(); |
| 43 | + Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname)); |
| 44 | + Localization.loadInterwiki(); |
| 45 | + |
| 46 | + // open |
| 47 | + InputStream input = null; |
| 48 | + try { |
| 49 | + input = Tools.openInputFile(inputfile); |
| 50 | + } catch (IOException e) { |
| 51 | + log.fatal("I/O error opening "+inputfile); |
| 52 | + } |
| 53 | + |
| 54 | + // read |
| 55 | + DumpImporter dp = new DumpImporter(dbname); |
| 56 | + XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100)); |
| 57 | + try { |
| 58 | + reader.readDump(); |
| 59 | + } catch (IOException e) { |
| 60 | + log.warn("I/O error reading dump for "+dbname+" from "+inputfile); |
| 61 | + } |
| 62 | + |
| 63 | + log.info("Closing/optimizing index..."); |
| 64 | + dp.closeIndex(); |
| 65 | + } |
| 66 | + |
| 67 | +} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java |
— | — | @@ -0,0 +1,53 @@ |
| 2 | +package org.wikimedia.lsearch.importer; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | + |
| 6 | +import org.apache.log4j.Logger; |
| 7 | +import org.mediawiki.importer.DumpWriter; |
| 8 | +import org.mediawiki.importer.Page; |
| 9 | +import org.mediawiki.importer.Revision; |
| 10 | +import org.mediawiki.importer.Siteinfo; |
| 11 | +import org.wikimedia.lsearch.beans.Article; |
| 12 | +import org.wikimedia.lsearch.config.Configuration; |
| 13 | +import org.wikimedia.lsearch.config.IndexId; |
| 14 | + |
| 15 | +public class DumpImporter implements DumpWriter { |
| 16 | + static Logger log = Logger.getLogger(DumpImporter.class); |
| 17 | + Page page; |
| 18 | + Revision revision; |
| 19 | + SimpleIndexWriter writer; |
| 20 | + |
| 21 | + public DumpImporter(String dbname){ |
| 22 | + Configuration.open(); // make sure configuration is loaded |
| 23 | + writer = new SimpleIndexWriter(IndexId.get(dbname)); |
| 24 | + } |
| 25 | + public void writeRevision(Revision revision) throws IOException { |
| 26 | + this.revision = revision; |
| 27 | + } |
| 28 | + public void writeStartPage(Page page) throws IOException { |
| 29 | + this.page = page; |
| 30 | + } |
| 31 | + public void writeEndPage() throws IOException { |
| 32 | + Article article = new Article(page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect()); |
| 33 | + writer.addArticle(article); |
| 34 | + } |
| 35 | + |
| 36 | + public void close() throws IOException { |
| 37 | + // nop |
| 38 | + } |
| 39 | + public void writeEndWiki() throws IOException { |
| 40 | + // nop |
| 41 | + } |
| 42 | + public void writeSiteinfo(Siteinfo info) throws IOException { |
| 43 | + // nop |
| 44 | + } |
| 45 | + public void writeStartWiki() throws IOException { |
| 46 | + // nop |
| 47 | + } |
| 48 | + |
| 49 | + public void closeIndex(){ |
| 50 | + writer.close(); |
| 51 | + } |
| 52 | + |
| 53 | + |
| 54 | +} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java |
— | — | @@ -0,0 +1,120 @@ |
| 2 | +package org.wikimedia.lsearch.importer; |
| 3 | + |
| 4 | +import java.io.IOException; |
| 5 | +import java.util.HashMap; |
| 6 | +import java.util.Map.Entry; |
| 7 | + |
| 8 | +import org.apache.log4j.Logger; |
| 9 | +import org.apache.lucene.analysis.Analyzer; |
| 10 | +import org.apache.lucene.document.Document; |
| 11 | +import org.apache.lucene.index.IndexWriter; |
| 12 | +import org.wikimedia.lsearch.analyzers.FilterFactory; |
| 13 | +import org.wikimedia.lsearch.beans.Article; |
| 14 | +import org.wikimedia.lsearch.beans.IndexReportCard; |
| 15 | +import org.wikimedia.lsearch.config.GlobalConfiguration; |
| 16 | +import org.wikimedia.lsearch.config.IndexId; |
| 17 | +import org.wikimedia.lsearch.index.IndexUpdateRecord; |
| 18 | +import org.wikimedia.lsearch.index.WikiIndexModifier; |
| 19 | +import org.wikimedia.lsearch.index.WikiSimilarity; |
| 20 | + |
| 21 | +/** |
| 22 | + * IndexWriter for building indexes from scratch. |
| 23 | + * |
| 24 | + * @author rainman |
| 25 | + * |
| 26 | + */ |
| 27 | +public class SimpleIndexWriter { |
| 28 | + static Logger log = Logger.getLogger(SimpleIndexWriter.class); |
| 29 | + IndexId iid; |
| 30 | + HashMap<String,IndexWriter> indexes; |
| 31 | + FilterFactory filters; |
| 32 | + |
| 33 | + public SimpleIndexWriter(IndexId iid){ |
| 34 | + this.iid = iid; |
| 35 | + String langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname()); |
| 36 | + filters = new FilterFactory(langCode); |
| 37 | + indexes = new HashMap<String,IndexWriter>(); |
| 38 | + // open all relevant indexes |
| 39 | + if(iid.isSingle()) |
| 40 | + indexes.put(iid.toString(),openIndex(iid)); |
| 41 | + else if(iid.isMainsplit()){ |
| 42 | + indexes.put(iid.getMainPart().toString(),openIndex(iid.getMainPart())); |
| 43 | + indexes.put(iid.getRestPart().toString(),openIndex(iid.getRestPart())); |
| 44 | + } else if(iid.isSplit()){ |
| 45 | + for(String dbpart : iid.getSplitParts()){ |
| 46 | + indexes.put(IndexId.get(dbpart).toString(),openIndex(IndexId.get(dbpart))); |
| 47 | + } |
| 48 | + } else |
| 49 | + log.fatal("Unrecognized index architecture for "+iid); |
| 50 | + |
| 51 | + } |
| 52 | + |
| 53 | + /** Open and initialize index denoted by iid */ |
| 54 | + protected IndexWriter openIndex(IndexId iid) { |
| 55 | + String path = iid.getImportPath(); |
| 56 | + IndexWriter writer; |
| 57 | + try { |
| 58 | + writer = new IndexWriter(path,null,false); // never rewrite index, so we can resume |
| 59 | + } catch (IOException e) { |
| 60 | + try { |
| 61 | + // try to make brand new index |
| 62 | +				WikiIndexModifier.makeDBPath(path); // ensure all directories are made |
| 63 | + log.info("Making new index at path "+path); |
| 64 | + writer = new IndexWriter(path,null,true); |
| 65 | + } catch (IOException e1) { |
| 66 | +				log.error("I/O error opening index for addition of documents at "+path+" : "+e.getMessage()); |
| 67 | + return null; |
| 68 | + } |
| 69 | + } |
| 70 | + writer.setSimilarity(new WikiSimilarity()); |
| 71 | + int mergeFactor = iid.getIntParam("mergeFactor",2); |
| 72 | + int maxBufDocs = iid.getIntParam("maxBufDocs",10); |
| 73 | + writer.setMergeFactor(mergeFactor); |
| 74 | + writer.setMaxBufferedDocs(maxBufDocs); |
| 75 | + writer.setUseCompoundFile(true); |
| 76 | + |
| 77 | + return writer; |
| 78 | + } |
| 79 | + |
| 80 | + /** Add single article to logical index. It will add the article to the right index part */ |
| 81 | + public void addArticle(Article a){ |
| 82 | + IndexId target; |
| 83 | + if(iid.isSingle()) |
| 84 | + target = iid; |
| 85 | + else if(iid.isMainsplit()) // assign according to namespace |
| 86 | + target = (a.getNamespace().equals("0"))? iid.getMainPart() : iid.getRestPart(); |
| 87 | + else // split index, randomly assign to some index part |
| 88 | + target = iid.getPart(1+(int)(Math.random()*iid.getSplitFactor())); |
| 89 | + |
| 90 | + IndexWriter writer = indexes.get(target.toString()); |
| 91 | + if(writer == null) |
| 92 | + return; |
| 93 | + Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters); |
| 94 | + Document doc = (Document) ret[0]; |
| 95 | + Analyzer analyzer = (Analyzer) ret[1]; |
| 96 | + try { |
| 97 | + writer.addDocument(doc,analyzer); |
| 98 | + log.debug(iid+": Adding document "+a); |
| 99 | + } catch (IOException e) { |
| 100 | +			log.error("I/O Error writing article "+a+" to index "+target.getImportPath()); |
| 101 | + } catch(Exception e){ |
| 102 | + e.printStackTrace(); |
| 103 | + log.error("Error adding document "+a+" with message: "+e.getMessage()); |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + /** Close and (if specified in global config) optimize indexes */ |
| 108 | + public void close(){ |
| 109 | + for(Entry<String,IndexWriter> en : indexes.entrySet()){ |
| 110 | + IndexId iid = IndexId.get(en.getKey()); |
| 111 | + IndexWriter writer = en.getValue(); |
| 112 | + try{ |
| 113 | + if(iid.getBooleanParam("optimize",true)) |
| 114 | + writer.optimize(); |
| 115 | + writer.close(); |
| 116 | + } catch(IOException e){ |
| 117 | + log.warn("I/O error optimizing/closing index at "+iid.getImportPath()); |
| 118 | + } |
| 119 | + } |
| 120 | + } |
| 121 | +} |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java |
— | — | @@ -75,6 +75,7 @@ |
76 | 76 | /** Where the indexer places the snapshots */ |
77 | 77 | protected String snapshotPath; |
78 | 78 | protected String updatePath = null; |
| 79 | + protected String importPath; |
79 | 80 | |
80 | 81 | protected String rsyncSnapshotPath = null; |
81 | 82 | |
— | — | @@ -167,6 +168,7 @@ |
168 | 169 | } |
169 | 170 | |
170 | 171 | indexPath = indexHostPath + "index" + sep + dbrole; |
| 172 | + importPath = indexHostPath + "fromXML" + sep + dbrole; |
171 | 173 | snapshotPath = indexHostPath + "snapshot" + sep + dbrole; |
172 | 174 | rsyncSnapshotPath = "/mwsearch/snapshot/" + dbrole; |
173 | 175 | |
— | — | @@ -289,7 +291,11 @@ |
290 | 292 | public String getUpdatePath() { |
291 | 293 | return updatePath; |
292 | 294 | } |
293 | | - |
| 295 | + /** Where indexes are made when built from XML importing */ |
| 296 | + public String getImportPath() { |
| 297 | + return importPath; |
| 298 | + } |
| 299 | + |
294 | 300 | /** Get search path with resolved symlinks */ |
295 | 301 | public String getCanonicalSearchPath(){ |
296 | 302 | try { |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java |
— | — | @@ -130,7 +130,7 @@ |
131 | 131 | } catch (IOException e) { |
132 | 132 | try { |
133 | 133 | // try to make brand new index |
134 | | - makeDBPath(iid); // ensure all directories are made |
| 134 | + makeDBPath(path); // ensure all directories are made |
135 | 135 | log.info("Making new index at path "+path); |
136 | 136 | writer = new IndexWriter(path,null,true); |
137 | 137 | } catch (IOException e1) { |
— | — | @@ -154,7 +154,7 @@ |
155 | 155 | if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec)) |
156 | 156 | continue; // don't add if delete/add are paired operations |
157 | 157 | IndexReportCard card = getReportCard(rec); |
158 | | - Object[] ret = makeDocumentAndAnalyzer(rec,filters); |
| 158 | + Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters); |
159 | 159 | Document doc = (Document) ret[0]; |
160 | 160 | Analyzer analyzer = (Analyzer) ret[1]; |
161 | 161 | try { |
— | — | @@ -181,25 +181,26 @@ |
182 | 182 | } |
183 | 183 | } |
184 | 184 | |
185 | | - /** |
186 | | - * Create necessary directories for index |
187 | | - * @param dbname |
188 | | - * @return relative path (to document root) of db within filesystem |
189 | | - */ |
190 | | - public String makeDBPath(IndexId iid){ |
191 | | - String path = iid.getIndexPath(); |
192 | | - File dir = new File(path); |
193 | | - if(!dir.exists()){ |
194 | | - boolean succ = dir.mkdirs(); |
195 | | - if(!succ){ |
196 | | - log.error("Could not create directory "+path+", do you have permissions to create it? Updates from database "+iid+" will not be written."); |
197 | | - return null; |
198 | | - } |
199 | | - } |
200 | | - return path; |
201 | | - } |
| 185 | + |
202 | 186 | |
203 | 187 | } |
| 188 | + /** |
| 189 | + * Create necessary directories for index |
| 190 | +	 * @param path |
| 191 | + * @return relative path (to document root) of db within filesystem |
| 192 | + */ |
| 193 | + public static String makeDBPath(String path){ |
| 194 | + File dir = new File(path); |
| 195 | + if(!dir.exists()){ |
| 196 | + boolean succ = dir.mkdirs(); |
| 197 | + if(!succ){ |
| 198 | + log.error("Could not create directory "+path+", do you have permissions to create it?"); |
| 199 | + return null; |
| 200 | + } |
| 201 | + } |
| 202 | + return path; |
| 203 | + } |
| 204 | + |
204 | 205 | // ============================================================================ |
205 | 206 | static org.apache.log4j.Logger log = Logger.getLogger(WikiIndexModifier.class); |
206 | 207 | protected static GlobalConfiguration global = null; |
— | — | @@ -264,10 +265,9 @@ |
265 | 266 | * @param languageAnalyzer |
266 | 267 | * @return array { document, analyzer } |
267 | 268 | */ |
268 | | - protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, FilterFactory filters){ |
| 269 | + public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){ |
269 | 270 | PerFieldAnalyzerWrapper perFieldAnalyzer = null; |
270 | 271 | Document doc = new Document(); |
271 | | - Article article = rec.getArticle(); |
272 | 272 | |
273 | 273 | // This will be used to look up and replace entries on index updates. |
274 | 274 | doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED)); |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java |
— | — | @@ -60,6 +60,14 @@ |
61 | 61 | this.redirect = redirect; |
62 | 62 | } |
63 | 63 | |
| 64 | + public Article(int namespace, String titleText, String text, boolean redirect) { |
| 65 | + this.namespace = Integer.toString(namespace); |
| 66 | + this.title = titleText; |
| 67 | + contents = text; |
| 68 | + timestamp = null; |
| 69 | + this.redirect = redirect; |
| 70 | + } |
| 71 | + |
64 | 72 | public Article(int namespace_, String title_) { |
65 | 73 | namespace = Integer.toString(namespace_); |
66 | 74 | title = title_; |
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java |
— | — | @@ -62,7 +62,7 @@ |
63 | 63 | } |
64 | 64 | |
65 | 65 | /** Reads localization for language, return true if success */ |
66 | | - protected static boolean readLocalization(String langCode){ |
| 66 | + public static boolean readLocalization(String langCode){ |
67 | 67 | return readLocalization(langCode,0); |
68 | 68 | } |
69 | 69 | |
Index: trunk/lucene-search-2.0/build.xml |
— | — | @@ -30,7 +30,7 @@ |
31 | 31 | <jar destfile="${basedir}/MWSearch.jar"> |
32 | 32 | <manifest> |
33 | 33 | <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/> |
34 | | - <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar"/> |
| 34 | + <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar"/> |
35 | 35 | </manifest> |
36 | 36 | <zipfileset dir="${bin}" prefix=""> |
37 | 37 | <include name="org/**"/> |