r13926 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r13925‎ | r13926 | r13927 >
Date:17:56, 29 April 2006
Author:brion
Status:old
Tags:
Comment:
Add an output format that writes the Sphinx search engine's xmlpipe XML for indexing.
Untested.
Modified paths:
  • /trunk/mwdumper/Makefile (modified) (history)
  • /trunk/mwdumper/src/org/mediawiki/dumper/Dumper.java (modified) (history)
  • /trunk/mwdumper/src/org/mediawiki/importer/SphinxWriter.java (added) (history)

Diff [purge]

Index: trunk/mwdumper/src/org/mediawiki/importer/SphinxWriter.java
@@ -0,0 +1,99 @@
 2+/*
 3+ * MediaWiki import/export processing tools
 4+ * Copyright 2006 by Brion Vibber
 5+ *
 6+ * Permission is hereby granted, free of charge, to any person obtaining a copy
 7+ * of this software and associated documentation files (the "Software"), to deal
 8+ * in the Software without restriction, including without limitation the rights
 9+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10+ * copies of the Software, and to permit persons to whom the Software is
 11+ * furnished to do so, subject to the following conditions:
 12+ *
 13+ * The above copyright notice and this permission notice shall be included in
 14+ * all copies or substantial portions of the Software.
 15+ *
 16+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22+ * SOFTWARE.
 23+ *
 24+ * $Id$
 25+ */
 26+
 27+package org.mediawiki.importer;
 28+
 29+import java.io.IOException;
 30+import java.io.OutputStream;
 31+import java.text.DateFormat;
 32+import java.text.SimpleDateFormat;
 33+import java.util.Calendar;
 34+import java.util.Iterator;
 35+import java.util.Map;
 36+import java.util.TimeZone;
 37+
 38+/**
 39+ * Generates XML stream suitable for the Sphinx search engine's xmlpipe input.
 40+ */
 41+public class SphinxWriter implements DumpWriter {
 42+ protected OutputStream stream;
 43+ protected XmlWriter writer;
 44+ protected Page _page;
 45+ protected Revision _rev;
 46+
 47+ public SphinxWriter(OutputStream output) throws IOException {
 48+ stream = output;
 49+ writer = new XmlWriter(stream);
 50+ }
 51+
 52+ public void close() throws IOException {
 53+ writer.close();
 54+ }
 55+
 56+ public void writeStartWiki() throws IOException {
 57+ writer.openXml();
 58+ // No containing element to open
 59+ }
 60+
 61+ public void writeEndWiki() throws IOException {
 62+ // No containing element to close
 63+ writer.closeXml();
 64+ }
 65+
 66+ public void writeSiteinfo(Siteinfo info) throws IOException {
 67+ // Nothing!
 68+ }
 69+
 70+ public void writeStartPage(Page page) throws IOException {
 71+ _page = page;
 72+ }
 73+
 74+ /**
 75+ * @fixme What's the "group" number here do?
 76+ * @fixme preprocess the text to strip some formatting?
 77+ */
 78+ public void writeEndPage() throws IOException {
 79+ writer.openElement("document");
 80+ writer.textElement("id", Integer.toString(_page.Id));
 81+ writer.textElement("group", "0");
 82+ writer.textElement("timestamp", formatTimestamp(_rev.Timestamp));
 83+ writer.textElement("title", _page.Title.toString());
 84+ writer.textElement("body", _rev.Text);
 85+ writer.closeElement();
 86+ _rev = null;
 87+ _page = null;
 88+ }
 89+
 90+ public void writeRevision(Revision rev) throws IOException {
 91+ _rev = rev;
 92+ }
 93+
 94+ /**
 95+ * @fixme double-check that it wants Unix timestamp
 96+ */
 97+ static String formatTimestamp(Calendar ts) {
 98+ return Long.toString(ts.getTimeInMillis() / 1000L);
 99+ }
 100+}
Index: trunk/mwdumper/src/org/mediawiki/dumper/Dumper.java
@@ -226,6 +226,8 @@
227227 static DumpWriter openOutputSink(OutputWrapper output, String format, String param) throws IOException {
228228 if (format.equals("xml"))
229229 return new XmlDumpWriter(output.getFileStream());
 230+ else if (format.equals("sphinx"))
 231+ return new SphinxWriter(output.getFileStream());
230232 else if (format.equals("sql")) {
231233 SqlStream sqlStream = output.getSqlStream();
232234 if (param.equals("1.4"))
Index: trunk/mwdumper/Makefile
@@ -16,6 +16,7 @@
1717 SOURCES_DUMPER=\
1818 src/org/mediawiki/dumper/Dumper.java \
1919 src/org/mediawiki/dumper/ProgressFilter.java \
 20+ src/org/mediawiki/dumper/Tools.java \
2021 src/org/apache/commons/compress/bzip2/BZip2Constants.java \
2122 src/org/apache/commons/compress/bzip2/CBZip2InputStream.java \
2223 src/org/apache/commons/compress/bzip2/CBZip2OutputStream.java \
@@ -37,6 +38,7 @@
3839 src/org/mediawiki/importer/PageFilter.java \
3940 src/org/mediawiki/importer/Revision.java \
4041 src/org/mediawiki/importer/Siteinfo.java \
 42+ src/org/mediawiki/importer/SphinxWriter.java \
4143 src/org/mediawiki/importer/SqlFileStream.java \
4244 src/org/mediawiki/importer/SqlLiteral.java \
4345 src/org/mediawiki/importer/SqlStream.java \

Status & tagging log