r21768 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r21767‎ \| r21768 \| r21769 >
Date:	20:01, 1 May 2007
Author:	rainman
Status:	old
Tags:
Comment:	Add Importer class that builds index from a xml database dump. (Needs mwdumper in classpath)
Modified paths:	/trunk/lucene-search-2.0/.classpath (modified) (history) /trunk/lucene-search-2.0/build.xml (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer (added) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java (added) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java (added) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java (added) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java (modified) (history) /trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java (modified) (history)

Diff [purge]

Index: trunk/lucene-search-2.0/.classpath
—	—	@@ -12,5 +12,6 @@
13	13	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
14	14	<classpathentry kind="lib" path="lib/lucene-core-2.0.1-dev.jar" sourcepath="/lucene-2.0"/>
15	15	<classpathentry kind="lib" path="lib/snowball.jar"/>
	16	+ <classpathentry kind="lib" path="lib/mwdumper.jar" sourcepath="/mwdumper"/>
16	17	<classpathentry kind="output" path="bin"/>
17	18	</classpath>
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/Importer.java
—	—	@@ -0,0 +1,66 @@
	2	+package org.wikimedia.lsearch.importer;
	3	+
	4	+import java.io.IOException;
	5	+import java.io.InputStream;
	6	+
	7	+import org.apache.log4j.Logger;
	8	+import org.mediawiki.dumper.ProgressFilter;
	9	+import org.mediawiki.dumper.Tools;
	10	+import org.mediawiki.importer.XmlDumpReader;
	11	+import org.wikimedia.lsearch.config.Configuration;
	12	+import org.wikimedia.lsearch.config.GlobalConfiguration;
	13	+import org.wikimedia.lsearch.util.Localization;
	14	+import org.wikimedia.lsearch.util.UnicodeDecomposer;
	15	+
	16	+/**
	17	+ * Main class, builds index from a database dump.
	18	+ * Syntax: java Importer inputfile dbname
	19	+ *
	20	+ * @author rainman
	21	+ *
	22	+ */
	23	+public class Importer {
	24	+
	25	+ /**
	26	+ * @param args
	27	+ */
	28	+ public static void main(String[] args) {
	29	+ System.out.println("MediaWiki Lucene search indexer - index builder from xml database dumps.\n");
	30	+
	31	+ Configuration.open();
	32	+ Logger log = Logger.getLogger(Importer.class);
	33	+
	34	+ if(args.length != 2){
	35	+ System.out.println("Syntax: java Importer <inputfile> <dbname>");
	36	+ return;
	37	+ }
	38	+ String inputfile = args[0];
	39	+ String dbname = args[1];
	40	+
	41	+ // preload
	42	+ UnicodeDecomposer.getInstance();
	43	+ Localization.readLocalization(GlobalConfiguration.getInstance().getLanguage(dbname));
	44	+ Localization.loadInterwiki();
	45	+
	46	+ // open
	47	+ InputStream input = null;
	48	+ try {
	49	+ input = Tools.openInputFile(inputfile);
	50	+ } catch (IOException e) {
	51	+ log.fatal("I/O error opening "+inputfile);
	52	+ }
	53	+
	54	+ // read
	55	+ DumpImporter dp = new DumpImporter(dbname);
	56	+ XmlDumpReader reader = new XmlDumpReader(input,new ProgressFilter(dp, 100));
	57	+ try {
	58	+ reader.readDump();
	59	+ } catch (IOException e) {
	60	+ log.warn("I/O error reading dump for "+dbname+" from "+inputfile);
	61	+ }
	62	+
	63	+ log.info("Closing/optimizing index...");
	64	+ dp.closeIndex();
	65	+ }
	66	+
	67	+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/DumpImporter.java
—	—	@@ -0,0 +1,53 @@
	2	+package org.wikimedia.lsearch.importer;
	3	+
	4	+import java.io.IOException;
	5	+
	6	+import org.apache.log4j.Logger;
	7	+import org.mediawiki.importer.DumpWriter;
	8	+import org.mediawiki.importer.Page;
	9	+import org.mediawiki.importer.Revision;
	10	+import org.mediawiki.importer.Siteinfo;
	11	+import org.wikimedia.lsearch.beans.Article;
	12	+import org.wikimedia.lsearch.config.Configuration;
	13	+import org.wikimedia.lsearch.config.IndexId;
	14	+
	15	+/**
	16	+ * DumpWriter that turns each page of a dump into an Article and adds it
	17	+ * to the index through a SimpleIndexWriter.
	18	+ */
	19	+public class DumpImporter implements DumpWriter {
	20	+ static Logger log = Logger.getLogger(DumpImporter.class);
	21	+ Page page;
	22	+ Revision revision;
	23	+ SimpleIndexWriter writer;
	24	+
	25	+ /** Open the index writer for the given database */
	26	+ public DumpImporter(String dbname){
	27	+ Configuration.open(); // make sure configuration is loaded
	28	+ writer = new SimpleIndexWriter(IndexId.get(dbname));
	29	+ }
	30	+ /** Remember the revision; the one held when the page ends is the one indexed */
	31	+ public void writeRevision(Revision revision) throws IOException {
	32	+ this.revision = revision;
	33	+ }
	34	+ /** Remember the page and drop the revision left over from the previous page */
	35	+ public void writeStartPage(Page page) throws IOException {
	36	+ this.page = page;
	37	+ this.revision = null;
	38	+ }
	39	+ /** Add the current page to the index; a page for which no revision was seen is skipped */
	40	+ public void writeEndPage() throws IOException {
	41	+ if(page == null || revision == null){
	42	+ log.warn("Skipping page without revision");
	43	+ return;
	44	+ }
	45	+ Article article = new Article(page.Title.Namespace,page.Title.Text,revision.Text,revision.isRedirect());
	46	+ writer.addArticle(article);
	47	+ }
	48	+
	49	+ public void close() throws IOException {
	50	+ // nop
	51	+ }
	52	+ public void writeEndWiki() throws IOException {
	53	+ // nop
	54	+ }
	55	+ public void writeSiteinfo(Siteinfo info) throws IOException {
	56	+ // nop
	57	+ }
	58	+ public void writeStartWiki() throws IOException {
	59	+ // nop
	60	+ }
	61	+
	62	+ /** Close the underlying index writer; kept separate from close(), which is a nop */
	63	+ public void closeIndex(){
	64	+ writer.close();
	65	+ }
	66	+
	67	+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/importer/SimpleIndexWriter.java
—	—	@@ -0,0 +1,120 @@
	2	+package org.wikimedia.lsearch.importer;
	3	+
	4	+import java.io.IOException;
	5	+import java.util.HashMap;
	6	+import java.util.Map.Entry;
	7	+
	8	+import org.apache.log4j.Logger;
	9	+import org.apache.lucene.analysis.Analyzer;
	10	+import org.apache.lucene.document.Document;
	11	+import org.apache.lucene.index.IndexWriter;
	12	+import org.wikimedia.lsearch.analyzers.FilterFactory;
	13	+import org.wikimedia.lsearch.beans.Article;
	14	+import org.wikimedia.lsearch.beans.IndexReportCard;
	15	+import org.wikimedia.lsearch.config.GlobalConfiguration;
	16	+import org.wikimedia.lsearch.config.IndexId;
	17	+import org.wikimedia.lsearch.index.IndexUpdateRecord;
	18	+import org.wikimedia.lsearch.index.WikiIndexModifier;
	19	+import org.wikimedia.lsearch.index.WikiSimilarity;
	20	+
	21	+/**
	22	+ * IndexWriter for building indexes from scratch.
	23	+ *
	24	+ * @author rainman
	25	+ *
	26	+ */
	27	+public class SimpleIndexWriter {
	28	+ static Logger log = Logger.getLogger(SimpleIndexWriter.class);
	29	+ IndexId iid;
	30	+ HashMap<String,IndexWriter> indexes;
	31	+ FilterFactory filters;
	32	+
	33	+ public SimpleIndexWriter(IndexId iid){
	34	+ this.iid = iid;
	35	+ String langCode = GlobalConfiguration.getInstance().getLanguage(iid.getDBname());
	36	+ filters = new FilterFactory(langCode);
	37	+ indexes = new HashMap<String,IndexWriter>();
	38	+ // open all relevant indexes
	39	+ if(iid.isSingle())
	40	+ indexes.put(iid.toString(),openIndex(iid));
	41	+ else if(iid.isMainsplit()){
	42	+ indexes.put(iid.getMainPart().toString(),openIndex(iid.getMainPart()));
	43	+ indexes.put(iid.getRestPart().toString(),openIndex(iid.getRestPart()));
	44	+ } else if(iid.isSplit()){
	45	+ for(String dbpart : iid.getSplitParts()){
	46	+ indexes.put(IndexId.get(dbpart).toString(),openIndex(IndexId.get(dbpart)));
	47	+ }
	48	+ } else
	49	+ log.fatal("Unrecognized index architecture for "+iid);
	50	+
	51	+ }
	52	+
	53	+ /** Open and initialize index denoted by iid */
	54	+ protected IndexWriter openIndex(IndexId iid) {
	55	+ String path = iid.getImportPath();
	56	+ IndexWriter writer;
	57	+ try {
	58	+ writer = new IndexWriter(path,null,false); // never rewrite index, so we can resume
	59	+ } catch (IOException e) {
	60	+ try {
	61	+ // try to make brand new index
	62	+ WikiIndexModifier.makeDBPath(iid.getIndexPath()); // ensure all directories are made
	63	+ log.info("Making new index at path "+path);
	64	+ writer = new IndexWriter(path,null,true);
	65	+ } catch (IOException e1) {
	66	+ log.error("I/O error openning index for addition of documents at "+path+" : "+e.getMessage());
	67	+ return null;
	68	+ }
	69	+ }
	70	+ writer.setSimilarity(new WikiSimilarity());
	71	+ int mergeFactor = iid.getIntParam("mergeFactor",2);
	72	+ int maxBufDocs = iid.getIntParam("maxBufDocs",10);
	73	+ writer.setMergeFactor(mergeFactor);
	74	+ writer.setMaxBufferedDocs(maxBufDocs);
	75	+ writer.setUseCompoundFile(true);
	76	+
	77	+ return writer;
	78	+ }
	79	+
	80	+ /** Add single article to logical index. It will add the article to the right index part */
	81	+ public void addArticle(Article a){
	82	+ IndexId target;
	83	+ if(iid.isSingle())
	84	+ target = iid;
	85	+ else if(iid.isMainsplit()) // assign according to namespace
	86	+ target = (a.getNamespace().equals("0"))? iid.getMainPart() : iid.getRestPart();
	87	+ else // split index, randomly assign to some index part
	88	+ target = iid.getPart(1+(int)(Math.random()*iid.getSplitFactor()));
	89	+
	90	+ IndexWriter writer = indexes.get(target.toString());
	91	+ if(writer == null)
	92	+ return;
	93	+ Object[] ret = WikiIndexModifier.makeDocumentAndAnalyzer(a,filters);
	94	+ Document doc = (Document) ret[0];
	95	+ Analyzer analyzer = (Analyzer) ret[1];
	96	+ try {
	97	+ writer.addDocument(doc,analyzer);
	98	+ log.debug(iid+": Adding document "+a);
	99	+ } catch (IOException e) {
	100	+ log.error("I/O Error writing articlet "+a+" to index "+target.getImportPath());
	101	+ } catch(Exception e){
	102	+ e.printStackTrace();
	103	+ log.error("Error adding document "+a+" with message: "+e.getMessage());
	104	+ }
	105	+ }
	106	+
	107	+ /** Close and (if specified in global config) optimize indexes */
	108	+ public void close(){
	109	+ for(Entry<String,IndexWriter> en : indexes.entrySet()){
	110	+ IndexId iid = IndexId.get(en.getKey());
	111	+ IndexWriter writer = en.getValue();
	112	+ try{
	113	+ if(iid.getBooleanParam("optimize",true))
	114	+ writer.optimize();
	115	+ writer.close();
	116	+ } catch(IOException e){
	117	+ log.warn("I/O error optimizing/closing index at "+iid.getImportPath());
	118	+ }
	119	+ }
	120	+ }
	121	+}
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/config/IndexId.java
—	—	@@ -75,6 +75,7 @@
76	76	/** Where the indexer places the snapshots */
77	77	protected String snapshotPath;
78	78	protected String updatePath = null;
	79	+ protected String importPath;
79	80
80	81	protected String rsyncSnapshotPath = null;
81	82
—	—	@@ -167,6 +168,7 @@
168	169	}
169	170
170	171	indexPath = indexHostPath + "index" + sep + dbrole;
	172	+ importPath = indexHostPath + "fromXML" + sep + dbrole;
171	173	snapshotPath = indexHostPath + "snapshot" + sep + dbrole;
172	174	rsyncSnapshotPath = "/mwsearch/snapshot/" + dbrole;
173	175
—	—	@@ -289,7 +291,11 @@
290	292	public String getUpdatePath() {
291	293	return updatePath;
292	294	}
293		-
	295	+ /** Where indexes are made when built from XML importing: "fromXML" plus the db role, under the index host path */
	296	+ public String getImportPath() {
	297	+ return importPath;
	298	+ }
	299	+
294	300	/** Get search path with resolved symlinks */
295	301	public String getCanonicalSearchPath(){
296	302	try {
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/index/WikiIndexModifier.java
—	—	@@ -130,7 +130,7 @@
131	131	} catch (IOException e) {
132	132	try {
133	133	// try to make brand new index
134		~~- makeDBPath(iid); // ensure all directories are made~~
	134	+ makeDBPath(path); // ensure all directories are made
135	135	log.info("Making new index at path "+path);
136	136	writer = new IndexWriter(path,null,true);
137	137	} catch (IOException e1) {
—	—	@@ -154,7 +154,7 @@
155	155	if(!rec.isAlwaysAdd() && nonDeleteDocuments.contains(rec))
156	156	continue; // don't add if delete/add are paired operations
157	157	IndexReportCard card = getReportCard(rec);
158		~~- Object[] ret = makeDocumentAndAnalyzer(rec,filters);~~
	158	+ Object[] ret = makeDocumentAndAnalyzer(rec.getArticle(),filters);
159	159	Document doc = (Document) ret[0];
160	160	Analyzer analyzer = (Analyzer) ret[1];
161	161	try {
—	—	@@ -181,25 +181,26 @@
182	182	}
183	183	}
184	184
185		- /**
186		~~- * Create necessary directories for index~~
187		~~- * @param dbname~~
188		~~- * @return relative path (to document root) of db within filesystem~~
189		~~- */~~
190		~~- public String makeDBPath(IndexId iid){~~
191		~~- String path = iid.getIndexPath();~~
192		~~- File dir = new File(path);~~
193		~~- if(!dir.exists()){~~
194		~~- boolean succ = dir.mkdirs();~~
195		~~- if(!succ){~~
196		~~- log.error("Could not create directory "+path+", do you have permissions to create it? Updates from database "+iid+" will not be written.");~~
197		~~- return null;~~
198		~~- }~~
199		~~- }~~
200		~~- return path;~~
201		~~- }~~
	185	+
202	186
203	187	}
	188	+ /**
	189	+ * Create necessary directories for index
	190	+ * @param path directory to create, together with any missing parent directories
	191	+ * @return the given path, or null if the directory did not exist and could not be created
	192	+ */
	193	+ public static String makeDBPath(String path){
	194	+ File dir = new File(path);
	195	+ if(!dir.exists()){
	196	+ boolean succ = dir.mkdirs();
	197	+ if(!succ){
	198	+ log.error("Could not create directory "+path+", do you have permissions to create it?");
	199	+ return null;
	200	+ }
	201	+ }
	202	+ return path;
	203	+ }
	204	+
204	205	// ============================================================================
205	206	static org.apache.log4j.Logger log = Logger.getLogger(WikiIndexModifier.class);
206	207	protected static GlobalConfiguration global = null;
—	—	@@ -264,10 +265,9 @@
265	266	* @param languageAnalyzer
266	267	* @return array { document, analyzer }
267	268	*/
268		~~- protected Object[] makeDocumentAndAnalyzer(IndexUpdateRecord rec, FilterFactory filters){~~
	269	+ public static Object[] makeDocumentAndAnalyzer(Article article, FilterFactory filters){
269	270	PerFieldAnalyzerWrapper perFieldAnalyzer = null;
270	271	Document doc = new Document();
271		~~- Article article = rec.getArticle();~~
272	272
273	273	// This will be used to look up and replace entries on index updates.
274	274	doc.add(new Field("key", article.getKey(), Field.Store.YES, Field.Index.UN_TOKENIZED));
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/beans/Article.java
—	—	@@ -60,6 +60,14 @@
61	61	this.redirect = redirect;
62	62	}
63	63
	64	+ public Article(int namespace, String titleText, String text, boolean redirect) { // variant without a timestamp argument
	65	+ this.namespace = Integer.toString(namespace); // namespace number is kept in its string form
	66	+ this.title = titleText;
	67	+ contents = text;
	68	+ timestamp = null; // no timestamp is supplied to this constructor
	69	+ this.redirect = redirect;
	70	+ }
	71	+
64	72	public Article(int namespace_, String title_) {
65	73	namespace = Integer.toString(namespace_);
66	74	title = title_;
Index: trunk/lucene-search-2.0/src/org/wikimedia/lsearch/util/Localization.java
—	—	@@ -62,7 +62,7 @@
63	63	}
64	64
65	65	/** Reads localization for language, return true if success */
66		~~- protected static boolean readLocalization(String langCode){~~
	66	+ public static boolean readLocalization(String langCode){
67	67	return readLocalization(langCode,0);
68	68	}
69	69
Index: trunk/lucene-search-2.0/build.xml
—	—	@@ -30,7 +30,7 @@
31	31	<jar destfile="${basedir}/MWSearch.jar">
32	32	<manifest>
33	33	<attribute name="Main-Class" value="org.wikimedia.lsearch.config.StartupManager"/>
34		- <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar"/>
	34	+ <attribute name="Class-Path" value="MWSearch.jar lib/xmlrpc-common-3.0.jar lib/xmlrpc-client-3.0.jar lib/xmlrpc-server-3.0.jar lib/commons-logging-1.1.jar lib/ws-commons-util-1.0.1.jar lib/log4j-1.2.14.jar lib/lucene-core-2.0.1-dev.jar lib/lucene-analyzers.jar lib/snowball.jar lib/mwdumper.jar"/>
35	35	</manifest>
36	36	<zipfileset dir="${bin}" prefix="">
37	37	<include name="org/**"/>