r21768 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r21767‎ | r21768 | r21769 >
Date:20:01, 1 May 2007
Author:rainman
Status:old
Tags:
Comment:
Add Importer class that builds an index from an XML database dump. (Needs mwdumper in classpath)
Modified paths:
  • /trunk/lucene-search-2.0/.classpath (modified) (history)
  • /trunk/lucene-search-2.0/build.xml (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (added) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history)
  • /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/.classpath
@@ -12,5 +12,6 @@
1313 <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
1414 <classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/>
1515 <classpathentry kind="lib" path="lib/snowball.jar"/>
 16+ <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/>
1617 <classpathentry kind="output" path="bin"/>
1718 </classpath>
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
@@ -0,0 +1,66 @@
 2+package org.wikimedia.lsearch.importer;
 3+
 4+import java.io.IOException;
 5+import java.io.InputStream;
 6+
 7+import org.apache.log4j.Logger;
 8+import org.mediawiki.dumper.ProgressFilter;
 9+import org.mediawiki.dumper.Tools;
 10+import org.mediawiki.importer.XmlDumpReader;
 11+import org.wikimedia.lsearch.config.Configuration;
 12+import org.wikimedia.lsearch.config.GlobalConfiguration;
 13+import org.wikimedia.lsearch.util.Localization;
 14+import org.wikimedia.lsearch.util.UnicodeDecomposer;
 15+
 16+/**
 17+ * Main class, builds index from a database dump.
 18+ * Syntax: java Importer inputfile dbname
 19+ *
 20+ * @author rainman
 21+ *
 22+ */
 23+public class Importer {
 24+
 25+ /**
 26+ * @param args
 27+ */
 28+ public static void main(String[] args) {
 29+ System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
 30+
 31+ Configuration.open();
 32+ Logger log = Logger.getLogger(Importer.class);
 33+
 34+ if(args.length != 2){
 35+ System.out.println("Syntax: java Importer <inputfile> <dbname>");
 36+ return;
 37+ }
 38+ String inputfile = args[0];
 39+ String dbname = args[1];
 40+
 41+ // preload
 42+ UnicodeDecomposer.getInstance();
 43+ Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
 44+ Localization.loadInterwiki();
 45+
 46+ // open
 47+ InputStream input = null;
 48+ try {
 49+ input = Tools.openInputFile(inputfile);
 50+ } catch (IOException e) {
 51+ log.fatal("I/O error opening "+inputfile);
 52+ }
 53+
 54+ // read
 55+ DumpImporter dp = new DumpImporter(dbname);
 56+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
 57+ try {
 58+ reader.readDump();
 59+ } catch (IOException e) {
 60+ log.warn("I/O error reading dump for "+dbname+" from "+inputfile);
 61+ }
 62+
 63+ log.info("Closing/optimizing index...");
 64+ dp.closeIndex();
 65+ }
 66+
 67+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
@@ -0,0 +1,53 @@
 2+package org.wikimedia.lsearch.importer;
 3+
 4+import java.io.IOException;
 5+
 6+import org.apache.log4j.Logger;
 7+import org.mediawiki.importer.DumpWriter;
 8+import org.mediawiki.importer.Page;
 9+import org.mediawiki.importer.Revision;
 10+import org.mediawiki.importer.Siteinfo;
 11+import org.wikimedia.lsearch.beans.Article;
 12+import org.wikimedia.lsearch.config.Configuration;
 13+import org.wikimedia.lsearch.config.IndexId;
 14+
 15+public class DumpImporter implements DumpWriter {
 16+ static Logger log = Logger.getLogger(DumpImporter.class);
 17+ Page page;
 18+ Revision revision;
 19+ SimpleIndexWriter writer;
 20+
 21+ public DumpImporter(String dbname){
 22+ Configuration.open(); // make sure configuration is loaded
 23+ writer = new SimpleIndexWriter(IndexId.get(dbname));
 24+ }
 25+ public void writeRevision(Revision revision) throws IOException {
 26+ this.revision = revision;
 27+ }
 28+ public void writeStartPage(Page page) throws IOException {
 29+ this.page = page;
 30+ }
 31+ public void writeEndPage() throws IOException {
 32+ Article article = new Article(page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect());
 33+ writer.addArticle(article);
 34+ }
 35+
 36+ public void close() throws IOException {
 37+ // nop
 38+ }
 39+ public void writeEndWiki() throws IOException {
 40+ // nop
 41+ }
 42+ public void writeSiteinfo(Siteinfo info) throws IOException {
 43+ // nop
 44+ }
 45+ public void writeStartWiki() throws IOException {
 46+ // nop
 47+ }
 48+
 49+ public void closeIndex(){
 50+ writer.close();
 51+ }
 52+
 53+
 54+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
@@ -0,0 +1,120 @@
 2+package org.wikimedia.lsearch.importer;
 3+
 4+import java.io.IOException;
 5+import java.util.HashMap;
 6+import java.util.Map.Entry;
 7+
 8+import org.apache.log4j.Logger;
 9+import org.apache.lucene.analysis.Analyzer;
 10+import org.apache.lucene.document.Document;
 11+import org.apache.lucene.index.IndexWriter;
 12+import org.wikimedia.lsearch.analyzers.FilterFactory;
 13+import org.wikimedia.lsearch.beans.Article;
 14+import org.wikimedia.lsearch.beans.IndexReportCard;
 15+import org.wikimedia.lsearch.config.GlobalConfiguration;
 16+import org.wikimedia.lsearch.config.IndexId;
 17+import org.wikimedia.lsearch.index.IndexUpdateRecord;
 18+import org.wikimedia.lsearch.index.WikiIndexModifier;
 19+import org.wikimedia.lsearch.index.WikiSimilarity;
 20+
 21+/**
 22+ * IndexWriter for building indexes from scratch.
 23+ *
 24+ * @author rainman
 25+ *
 26+ */
 27+public class SimpleIndexWriter {
 28+ static Logger log = Logger.getLogger(SimpleIndexWriter.class);
 29+ IndexId iid;
 30+ HashMap<String,IndexWriter> indexes;
 31+ FilterFactory filters;
 32+
 33+ public SimpleIndexWriter(IndexId iid){
 34+ this.iid = iid;
 35+ String langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
 36+ filters = new FilterFactory(langCode);
 37+ indexes = new HashMap<String,IndexWriter>();
 38+ // open all relevant indexes
 39+ if(iid.isSingle())
 40+ indexes.put(iid.toString(),openIndex(iid));
 41+ else if(iid.isMainsplit()){
 42+ indexes.put(iid.getMainPart().toString(),openIndex(iid.getMainPart()));
 43+ indexes.put(iid.getRestPart().toString(),openIndex(iid.getRestPart()));
 44+ } else if(iid.isSplit()){
 45+ for(String dbpart : iid.getSplitParts()){
 46+ indexes.put(IndexId.get(dbpart).toString(),openIndex(IndexId.get(dbpart)));
 47+ }
 48+ } else
 49+ log.fatal("Unrecognized index architecture for "+iid);
 50+
 51+ }
 52+
 53+ /** Open and initialize index denoted by iid */
 54+ protected IndexWriter openIndex(IndexId iid) {
 55+ String path = iid.getImportPath();
 56+ IndexWriter writer;
 57+ try {
 58+ writer = new IndexWriter(path,null,false); // never rewrite index, so we can resume
 59+ } catch (IOException e) {
 60+ try {
 61+ // try to make brand new index
 62+ WikiIndexModifier.makeDBPath(iid.getIndexPath()); // ensure all directories are made
 63+ log.info("Making new index at path "+path);
 64+ writer = new IndexWriter(path,null,true);
 65+ } catch (IOException e1) {
 66+ log.error("I/O error openning index for addition of documents at "+path+" : "+e.getMessage());
 67+ return null;
 68+ }
 69+ }
 70+ writer.setSimilarity(new WikiSimilarity());
 71+ int mergeFactor = iid.getIntParam("mergeFactor",2);
 72+ int maxBufDocs = iid.getIntParam("maxBufDocs",10);
 73+ writer.setMergeFactor(mergeFactor);
 74+ writer.setMaxBufferedDocs(maxBufDocs);
 75+ writer.setUseCompoundFile(true);
 76+
 77+ return writer;
 78+ }
 79+
 80+ /** Add single article to logical index. It will add the article to the right index part */
 81+ public void addArticle(Article a){
 82+ IndexId target;
 83+ if(iid.isSingle())
 84+ target = iid;
 85+ else if(iid.isMainsplit()) // assign according to namespace
 86+ target = (a.getNamespace().equals("0"))? iid.getMainPart() : iid.getRestPart();
 87+ else // split index, randomly assign to some index part
 88+ target = iid.getPart(1+(int)(Math.random()*iid.getSplitFactor()));
 89+
 90+ IndexWriter writer = indexes.get(target.toString());
 91+ if(writer == null)
 92+ return;
 93+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters);
 94+ Document doc = (Document) ret[0];
 95+ Analyzer analyzer = (Analyzer) ret[1];
 96+ try {
 97+ writer.addDocument(doc,analyzer);
 98+ log.debug(iid+": Adding document "+a);
 99+ } catch (IOException e) {
 100+ log.error("I/O Error writing articlet "+a+" to index "+target.getImportPath());
 101+ } catch(Exception e){
 102+ e.printStackTrace();
 103+ log.error("Error adding document "+a+" with message: "+e.getMessage());
 104+ }
 105+ }
 106+
 107+ /** Close and (if specified in global config) optimize indexes */
 108+ public void close(){
 109+ for(Entry<String,IndexWriter> en : indexes.entrySet()){
 110+ IndexId iid = IndexId.get(en.getKey());
 111+ IndexWriter writer = en.getValue();
 112+ try{
 113+ if(iid.getBooleanParam("optimize",true))
 114+ writer.optimize();
 115+ writer.close();
 116+ } catch(IOException e){
 117+ log.warn("I/O error optimizing/closing index at "+iid.getImportPath());
 118+ }
 119+ }
 120+ }
 121+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java
@@ -75,6 +75,7 @@
7676 /** Where the indexer places the snapshots */
7777 protected String snapshotPath;
7878 protected String updatePath = null;
 79+ protected String importPath;
7980
8081 protected String rsyncSnapshotPath = null;
8182
@@ -167,6 +168,7 @@
168169 }
169170
170171 indexPath = indexHostPath + "index" + sep + dbrole;
 172+ importPath = indexHostPath + "fromXML" + sep + dbrole;
171173 snapshotPath = indexHostPath + "snapshot" + sep + dbrole;
172174 rsyncSnapshotPath = "/mwsearch/snapshot/" + dbrole;
173175
@@ -289,7 +291,11 @@
290292 public String getUpdatePath() {
291293 return updatePath;
292294 }
293 -
 295+ /** Where indexes are made when built from XML importing */
 296+ public String getImportPath() {
 297+ return importPath;
 298+ }
 299+
294300 /** Get search path with resolved symlinks */
295301 public String getCanonicalSearchPath(){
296302 try {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
@@ -130,7 +130,7 @@
131131 } catch (IOException e) {
132132 try {
133133 // try to make brand new index
134 - makeDBPath(iid); // ensure all directories are made
 134+ makeDBPath(path); // ensure all directories are made
135135 log.info("Making new index at path "+path);
136136 writer = new IndexWriter(path,null,true);
137137 } catch (IOException e1) {
@@ -154,7 +154,7 @@
155155 if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec))
156156 continue; // don't add if delete/add are paired operations
157157 IndexReportCard card = getReportCard(rec);
158 - Object[] ret = makeDocumentAndAnalyzer(rec,filters);
 158+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
159159 Document doc = (Document) ret[0];
160160 Analyzer analyzer = (Analyzer) ret[1];
161161 try {
@@ -181,25 +181,26 @@
182182 }
183183 }
184184
185 - /**
186 - * Create necessary directories for index
187 - * @param dbname
188 - * @return relative path (to document root) of db within filesystem
189 - */
190 - public String makeDBPath(IndexId iid){
191 - String path = iid.getIndexPath();
192 - File dir = new File(path);
193 - if(!dir.exists()){
194 - boolean succ = dir.mkdirs();
195 - if(!succ){
196 - log.error("Could not create directory "+path+", do you have permissions to create it? Updates from database "+iid+" will not be written.");
197 - return null;
198 - }
199 - }
200 - return path;
201 - }
 185+
202186
203187 }
 188+ /**
 189+ * Create necessary directories for index
 190+ * @param dbname
 191+ * @return relative path (to document root) of db within filesystem
 192+ */
 193+ public static String makeDBPath(String path){
 194+ File dir = new File(path);
 195+ if(!dir.exists()){
 196+ boolean succ = dir.mkdirs();
 197+ if(!succ){
 198+ log.error("Could not create directory "+path+", do you have permissions to create it?");
 199+ return null;
 200+ }
 201+ }
 202+ return path;
 203+ }
 204+
204205 // ============================================================================
205206 static org.apache.log4j.Logger log = Logger.getLogger(WikiIndexModifier.class);
206207 protected static GlobalConfiguration global = null;
@@ -264,10 +265,9 @@
265266 * @param languageAnalyzer
266267 * @return array { document, analyzer }
267268 */
268 - protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, FilterFactory filters){
 269+ public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
269270 PerFieldAnalyzerWrapper perFieldAnalyzer = null;
270271 Document doc = new Document();
271 - Article article = rec.getArticle();
272272
273273 // This will be used to look up and replace entries on index updates.
274274 doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
@@ -60,6 +60,14 @@
6161 this.redirect = redirect;
6262 }
6363
 64+ public Article(int namespace, String titleText, String text, boolean redirect) {
 65+ this.namespace = Integer.toString(namespace);
 66+ this.title = titleText;
 67+ contents = text;
 68+ timestamp = null;
 69+ this.redirect = redirect;
 70+ }
 71+
6472 public Article(int namespace_, String title_) {
6573 namespace = Integer.toString(namespace_);
6674 title = title_;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
@@ -62,7 +62,7 @@
6363 }
6464
6565 /** Reads localization for language, return true if success */
66 - protected static boolean readLocalization(String langCode){
 66+ public static boolean readLocalization(String langCode){
6767 return readLocalization(langCode,0);
6868 }
6969
Index: trunk/lucene-search-2.0/build.xml
@@ -30,7 +30,7 @@
3131 <jar destfile="${basedir}/MWSearch.jar">
3232 <manifest>
3333 <attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/>
34 - <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar"/>
 34+ <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar"/>
3535 </manifest>
3636 <zipfileset dir="${bin}" prefix="">
3737 <include name="org/**"/>