r11173 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r11172‎ | r11173 | r11174 >
Date:04:05, 2 October 2005
Author:vibber
Status:old
Tags:
Comment:
* Added filter options, compression piping, and multiple output streams for
dumpBackup.php
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/Export.php (added) (history)
  • /trunk/phase3/includes/SpecialExport.php (modified) (history)
  • /trunk/phase3/maintenance/dumpBackup.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/dumpBackup.php
@@ -40,11 +40,78 @@
4141 var $skipFooter = false; // don't output </mediawiki>
4242 var $startId = 0;
4343 var $endId = 0;
 44+ var $sink = null; // Output filters
4445
45 - function BackupDumper() {
 46+ function BackupDumper( $args ) {
4647 $this->stderr = fopen( "php://stderr", "wt" );
 48+ $this->sink = $this->processArgs( $args );
4749 }
4850
 51+ /**
 52+ * @param array $args
 53+ * @return array
 54+ * @static
 55+ */
 56+ function processArgs( $args ) {
 57+ $outputTypes = array(
 58+ 'file' => 'DumpFileOutput',
 59+ 'gzip' => 'DumpGZipOutput',
 60+ 'bzip2' => 'DumpBZip2Output',
 61+ '7zip' => 'Dump7ZipOutput' );
 62+ $filterTypes = array(
 63+ 'latest' => 'DumpLatestFilter',
 64+ 'notalk' => 'DumpNotalkFilter',
 65+ 'namespace' => 'DumpNamespaceFilter' );
 66+ $sink = null;
 67+ $sinks = array();
 68+ foreach( $args as $arg ) {
 69+ if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) {
 70+ @list( $full, $opt, $val, $param ) = $matches;
 71+ switch( $opt ) {
 72+ case "output":
 73+ if( !is_null( $sink ) ) {
 74+ $sinks[] = $sink;
 75+ }
 76+ if( !isset( $outputTypes[$val] ) ) {
 77+ die( "Unrecognized output sink type '$val'\n" );
 78+ }
 79+ $type = $outputTypes[$val];
 80+ $sink = new $type( $param );
 81+ break;
 82+ case "filter":
 83+ if( is_null( $sink ) ) {
 84+ $this->progress( "Warning: assuming stdout for filter output\n" );
 85+ $sink = new DumpOutput();
 86+ }
 87+ if( !isset( $filterTypes[$val] ) ) {
 88+ die( "Unrecognized filter type '$val'\n" );
 89+ }
 90+ $type = $filterTypes[$val];
 91+ $filter = new $type( $sink, $param );
 92+
 93+ // references are lame in php...
 94+ unset( $sink );
 95+ $sink = $filter;
 96+
 97+ break;
 98+ default:
 99+ //die( "Unrecognized dump option'$opt'\n" );
 100+ }
 101+ }
 102+ }
 103+
 104+ if( is_null( $sink ) ) {
 105+ $sink = new DumpOutput();
 106+ }
 107+ $sinks[] = $sink;
 108+
 109+ if( count( $sinks ) > 1 ) {
 110+ return new DumpMultiWriter( $sinks );
 111+ } else {
 112+ return $sink;
 113+ }
 114+ }
 115+
49116 function dump( $history ) {
50117 # This shouldn't happen if on console... ;)
51118 header( 'Content-type: text/html; charset=UTF-8' );
@@ -61,9 +128,10 @@
62129
63130 $db =& $this->backupDb();
64131 $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM );
65 - $exporter->setPageCallback( array( &$this, 'reportPage' ) );
66 - $exporter->setRevisionCallback( array( &$this, 'revCount' ) );
67132
 133+ $wrapper = new ExportProgressFilter( $this->sink, $this );
 134+ $exporter->setOutputSink( $wrapper );
 135+
68136 if( !$this->skipHeader )
69137 $exporter->openStream();
70138
@@ -100,12 +168,12 @@
101169 : $wgDBserver;
102170 }
103171
104 - function reportPage( $page ) {
 172+ function reportPage() {
105173 $this->pageCount++;
106174 $this->report();
107175 }
108176
109 - function revCount( $rev ) {
 177+ function revCount() {
110178 $this->revCount++;
111179 }
112180
@@ -140,7 +208,25 @@
141209 }
142210 }
143211
144 -$dumper = new BackupDumper();
/**
 * Pass-through filter that notifies a progress tracker about the pages and
 * revisions travelling down the sink chain.
 *
 * The tracker must provide reportPage() and revCount().
 */
class ExportProgressFilter extends DumpFilter {
	var $progress;

	/**
	 * @param object $sink     next sink in the chain
	 * @param object $progress tracker to notify
	 */
	function ExportProgressFilter( &$sink, &$progress ) {
		parent::DumpFilter( $sink );
		// Bind by reference, as DumpFilter does for its sink: a plain
		// assignment copies the object under PHP 4 and the counters would
		// then be updated on a detached copy of the tracker.
		$this->progress =& $progress;
	}

	/**
	 * Forward the page close, then report one page to the tracker.
	 */
	function writeClosePage( $string ) {
		parent::writeClosePage( $string );
		$this->progress->reportPage();
	}

	/**
	 * Forward the revision, then count it on the tracker.
	 */
	function writeRevision( $rev, $string ) {
		parent::writeRevision( $rev, $string );
		$this->progress->revCount();
	}
}
 228+
 229+$dumper = new BackupDumper( $argv );
 230+
145231 if( isset( $options['quiet'] ) ) {
146232 $dumper->reporting = false;
147233 }
Index: trunk/phase3/includes/Export.php
@@ -0,0 +1,675 @@
 2+<?php
 3+# Copyright (C) 2003, 2005 Brion Vibber <brion@pobox.com>
 4+# http://www.mediawiki.org/
 5+#
 6+# This program is free software; you can redistribute it and/or modify
 7+# it under the terms of the GNU General Public License as published by
 8+# the Free Software Foundation; either version 2 of the License, or
 9+# (at your option) any later version.
 10+#
 11+# This program is distributed in the hope that it will be useful,
 12+# but WITHOUT ANY WARRANTY; without even the implied warranty of
 13+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 14+# GNU General Public License for more details.
 15+#
 16+# You should have received a copy of the GNU General Public License along
 17+# with this program; if not, write to the Free Software Foundation, Inc.,
 18+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 19+# http://www.gnu.org/copyleft/gpl.html
 20+/**
 21+ *
 22+ * @package MediaWiki
 23+ * @subpackage SpecialPage
 24+ */
 25+
 26+/** */
 27+require_once( 'Revision.php' );
 28+
 29+define( 'MW_EXPORT_FULL', 0 );
 30+define( 'MW_EXPORT_CURRENT', 1 );
 31+
 32+define( 'MW_EXPORT_BUFFER', 0 );
 33+define( 'MW_EXPORT_STREAM', 1 );
 34+
 35+
/**
 * Reads page and revision rows from the database, formats them with an
 * XmlDumpWriter and hands each chunk to an output sink.
 *
 * @package MediaWiki
 * @subpackage SpecialPage
 */
class WikiExporter {
	var $db;
	var $history;
	var $buffer;
	var $writer;
	var $sink;

	/**
	 * If using MW_EXPORT_STREAM to stream a large amount of data,
	 * provide a database connection which is not managed by
	 * LoadBalancer to read from: some history blob types will
	 * make additional queries to pull source data while the
	 * main query is still running.
	 *
	 * Output goes to a plain DumpOutput until setOutputSink() is called.
	 *
	 * @param Database $db
	 * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT
	 * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM
	 */
	function WikiExporter( &$db, $history = MW_EXPORT_CURRENT,
			$buffer = MW_EXPORT_BUFFER ) {
		$this->db =& $db;
		$this->history = $history;
		$this->buffer = $buffer;
		$this->writer = new XmlDumpWriter();
		$this->sink = new DumpOutput();
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param object $sink held by reference
	 */
	function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Send the opening part of the XML stream to the sink.
	 */
	function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Send the closing part of the XML stream to the sink.
	 */
	function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	function allPages() {
		return $this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end   Exclusive upper limit (this id is not included)
	 *                   If 0, no upper limit.
	 */
	function pagesByRange( $start, $end ) {
		$condition = 'page_id >= ' . intval( $start );
		if( $end ) {
			$condition .= ' AND page_id < ' . intval( $end );
		}
		return $this->dumpFrom( $condition );
	}

	/**
	 * Dumps the page identified by the given title.
	 * @param Title $title
	 */
	function pageByTitle( $title ) {
		// Force the namespace to an integer before it goes into the SQL,
		// the same way pagesByRange() treats its numeric inputs.
		return $this->dumpFrom(
			'page_namespace=' . intval( $title->getNamespace() ) .
			' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) );
	}

	/**
	 * Dumps the page with the given name.
	 * @param string $name
	 * @return mixed a WikiError when the name does not yield a title,
	 *               otherwise whatever pageByTitle() returns
	 */
	function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if( is_null( $title ) ) {
			return new WikiError( "Can't export invalid title" );
		} else {
			return $this->pageByTitle( $title );
		}
	}

	/**
	 * Dumps each named page in turn. The per-page results, including
	 * any WikiError, are not returned to the caller.
	 * @param array $names
	 */
	function pagesByName( $names ) {
		foreach( $names as $name ) {
			$this->pageByName( $name );
		}
	}


	// -------------------- private implementation below --------------------

	/**
	 * Run the page/revision/text query restricted by $cond and stream the
	 * rows to the sink.
	 *
	 * @param string $cond SQL condition on the page table, or '' for all pages
	 * @return mixed a WikiError when the history mode is invalid, otherwise nothing
	 */
	function dumpFrom( $cond = '' ) {
		$fname = 'WikiExporter::dumpFrom';
		wfProfileIn( $fname );

		$page     = $this->db->tableName( 'page' );
		$revision = $this->db->tableName( 'revision' );
		$text     = $this->db->tableName( 'text' );

		if( $this->history == MW_EXPORT_FULL ) {
			$join = 'page_id=rev_page';
		} elseif( $this->history == MW_EXPORT_CURRENT ) {
			$join = 'page_id=rev_page AND page_latest=rev_id';
		} else {
			wfProfileOut( $fname );
			return new WikiError( "$fname given invalid history dump type." );
		}
		$where = ( $cond == '' ) ? '' : "$cond AND";

		if( $this->buffer == MW_EXPORT_STREAM ) {
			// Switch to unbuffered results, remembering the old setting.
			$prev = $this->db->bufferResults( false );
		}
		if( $cond == '' ) {
			// Optimization hack for full-database dump
			$pageindex = 'FORCE INDEX (PRIMARY)';
			$revindex = 'FORCE INDEX(page_timestamp)';
		} else {
			$pageindex = '';
			$revindex = '';
		}
		$result = $this->db->query(
			"SELECT * FROM
				$page $pageindex,
				$revision $revindex,
				$text
				WHERE $where $join AND rev_text_id=old_id
				ORDER BY page_id", $fname );
		$wrapper = $this->db->resultObject( $result );
		$this->outputStream( $wrapper );

		if( $this->buffer == MW_EXPORT_STREAM ) {
			$this->db->bufferResults( $prev );
		}

		wfProfileOut( $fname );
	}

	/**
	 * Runs through a query result set dumping page and revision records.
	 * The result set should be sorted/grouped by page to avoid duplicate
	 * page records in the output.
	 *
	 * The result set will be freed once complete. Should be safe for
	 * streaming (non-buffered) queries, as long as it was made on a
	 * separate database connection not managed by LoadBalancer; some
	 * blob storage types will make queries to pull source data.
	 *
	 * @param ResultWrapper $resultset
	 * @access private
	 */
	function outputStream( $resultset ) {
		$last = null;
		while( $row = $resultset->fetchObject() ) {
			// A change of namespace or title marks the start of a new page.
			if( is_null( $last ) ||
				$last->page_namespace != $row->page_namespace ||
				$last->page_title != $row->page_title ) {
				if( isset( $last ) ) {
					$output = $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $row );
				$this->sink->writeOpenPage( $row, $output );
				$last = $row;
			}
			$output = $this->writer->writeRevision( $row );
			$this->sink->writeRevision( $row, $output );
		}
		// Close the final page, if any rows were seen at all.
		if( isset( $last ) ) {
			$output = $this->writer->closePage();
			$this->sink->writeClosePage( $output );
		}
		$resultset->free();
	}
}
 216+
/**
 * Formats the pieces of an export stream as XML text. Every method
 * returns a string; nothing is printed here.
 */
class XmlDumpWriter {

	/**
	 * Returns the export schema version.
	 * @return string
	 */
	function schemaVersion() {
		return "0.3";
	}

	/**
	 * Opens the XML output stream's root <mediawiki> element.
	 * This does not include an xml directive, so is safe to include
	 * as a subelement in a larger XML stream. Namespace and XML Schema
	 * references are included, followed by the <siteinfo> section.
	 *
	 * Output will be encoded in UTF-8.
	 *
	 * @return string
	 */
	function openStream() {
		global $wgContLanguageCode;
		$ver = $this->schemaVersion();
		return wfElement( 'mediawiki', array(
			'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
			'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
			'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
			                        "http://www.mediawiki.org/xml/export-$ver.xsd",
			'version'            => $ver,
			'xml:lang'           => $wgContLanguageCode ),
			null ) .
			"\n" .
			$this->siteInfo();
	}

	/**
	 * Builds the <siteinfo> section from the individual site elements.
	 * @return string
	 */
	function siteInfo() {
		$info = array(
			$this->sitename(),
			$this->homelink(),
			$this->generator(),
			$this->caseSetting(),
			$this->namespaces() );
		return " <siteinfo>\n " .
			implode( "\n ", $info ) .
			"\n </siteinfo>\n";
	}

	/**
	 * @return string <sitename> element holding $wgSitename
	 */
	function sitename() {
		global $wgSitename;
		return wfElement( 'sitename', array(), $wgSitename );
	}

	/**
	 * @return string <generator> element holding the MediaWiki version
	 */
	function generator() {
		global $wgVersion;
		return wfElement( 'generator', array(), "MediaWiki $wgVersion" );
	}

	/**
	 * @return string <base> element holding the full URL of the main page
	 */
	function homelink() {
		// NOTE(review): the Title is used unchecked; confirm newFromText()
		// cannot return null for the 'mainpage' message.
		$page = Title::newFromText( wfMsgForContent( 'mainpage' ) );
		return wfElement( 'base', array(), $page->getFullUrl() );
	}

	/**
	 * @return string <case> element derived from $wgCapitalLinks
	 */
	function caseSetting() {
		global $wgCapitalLinks;
		// "case-insensitive" option is reserved for future
		$sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
		return wfElement( 'case', array(), $sensitivity );
	}

	/**
	 * @return string <namespaces> element with one <namespace> child per
	 *                entry of the content language's formatted namespaces
	 */
	function namespaces() {
		global $wgContLang;
		$spaces = " <namespaces>\n";
		foreach( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
			$spaces .= ' ' . wfElement( 'namespace', array( 'key' => $ns ), $title ) . "\n";
		}
		$spaces .= " </namespaces>";
		return $spaces;
	}

	/**
	 * Closes the output stream with the closing root element.
	 * Call when finished dumping things.
	 * @return string
	 */
	function closeStream() {
		return "</mediawiki>\n";
	}


	/**
	 * Opens a <page> section on the output stream, with data
	 * from the given database row.
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function openPage( $row ) {
		$out = " <page>\n";
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		$out .= ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
		$out .= ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
		if( '' != $row->page_restrictions ) {
			$out .= ' ' . wfElement( 'restrictions', array(),
				strval( $row->page_restrictions ) ) . "\n";
		}
		return $out;
	}

	/**
	 * Closes a <page> section on the output stream.
	 *
	 * @return string
	 * @access private
	 */
	function closePage() {
		return " </page>\n";
	}

	/**
	 * Dumps a <revision> section on the output stream, with
	 * data filled in from the given database row.
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function writeRevision( $row ) {
		// Profile under this method's own name rather than a stale label.
		$fname = 'XmlDumpWriter::writeRevision';
		wfProfileIn( $fname );

		$out = " <revision>\n";
		$out .= " " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";

		$ts = wfTimestamp2ISO8601( strval( $row->rev_timestamp ) );
		$out .= " " . wfElement( 'timestamp', null, $ts ) . "\n";

		$out .= " <contributor>\n";
		if( $row->rev_user ) {
			// Non-zero user id: emit name and id.
			$out .= " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
			$out .= " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
		} else {
			// No user id: the user text is emitted as <ip>.
			$out .= " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
		}
		$out .= " </contributor>\n";

		if( $row->rev_minor_edit ) {
			$out .= " <minor/>\n";
		}
		if( $row->rev_comment != '' ) {
			$out .= " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
		}

		$text = strval( Revision::getRevisionText( $row ) );
		$out .= " " . wfElementClean( 'text',
			array( 'xml:space' => 'preserve' ),
			$text ) . "\n";

		$out .= " </revision>\n";

		wfProfileOut( $fname );
		return $out;
	}

}
 380+
 381+
/**
 * Default dump sink. Every stream, page and revision event arrives with a
 * ready-made chunk of text, which is passed unchanged to write(); the
 * write() defined here sends it to PHP's standard output.
 */
class DumpOutput {
	/**
	 * Emit one chunk of text. Subclasses override this to target a
	 * different kind of stream.
	 */
	function write( $string ) {
		echo $string;
	}

	/** Emit the text that opens the stream. */
	function writeOpenStream( $string ) {
		$this->write( $string );
	}

	/** Emit the text that opens a page; the page row itself is not used. */
	function writeOpenPage( $page, $string ) {
		$this->write( $string );
	}

	/** Emit the text of one revision; the revision row itself is not used. */
	function writeRevision( $rev, $string ) {
		$this->write( $string );
	}

	/** Emit the text that closes a page. */
	function writeClosePage( $string ) {
		$this->write( $string );
	}

	/** Emit the text that closes the stream. */
	function writeCloseStream( $string ) {
		$this->write( $string );
	}
}
 414+
/**
 * Stream outputter to send data to a file.
 */
class DumpFileOutput extends DumpOutput {
	var $handle;

	/**
	 * Open $file for writing in text mode.
	 * Aborts the script when the file cannot be opened: carrying on would
	 * make every later write fail and the dump would be lost.
	 *
	 * @param string $file path of the output file
	 */
	function DumpFileOutput( $file ) {
		$this->handle = fopen( $file, "wt" );
		if( !$this->handle ) {
			die( "Unable to open output file '$file' for writing\n" );
		}
	}

	/**
	 * Write one chunk of text to the open handle.
	 */
	function write( $string ) {
		fputs( $this->handle, $string );
	}
}
 429+
/**
 * Outputter that feeds the dump into an external program through a pipe,
 * optionally redirecting that program's output into a file.
 * Running the filter as a separate program, even where a library could do
 * the same job, lets it make use of a multi-processor system.
 */
class DumpPipeOutput extends DumpFileOutput {
	/**
	 * @param string $command shell command to pipe the dump into
	 * @param string $file    if not null, the command's output is redirected
	 *                        to this (shell-escaped) path
	 */
	function DumpPipeOutput( $command, $file = null ) {
		$pipeline = $command;
		if( $file !== null ) {
			$pipeline = $command . " > " . wfEscapeShellArg( $file );
		}
		$this->handle = popen( $pipeline, "w" );
	}
}
 443+
/**
 * Pipe outputter that runs the dump through the "gzip" command.
 */
class DumpGZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file destination handed on to DumpPipeOutput
	 */
	function DumpGZipOutput( $file ) {
		$compressor = "gzip";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
 452+
/**
 * Pipe outputter that runs the dump through the "bzip2" command.
 */
class DumpBZip2Output extends DumpPipeOutput {
	/**
	 * @param string $file destination handed on to DumpPipeOutput
	 */
	function DumpBZip2Output( $file ) {
		$compressor = "bzip2";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
 461+
/**
 * Pipe outputter that runs the dump through the "7za" command.
 * The archive name is part of the command line itself, so no output
 * redirection is requested from DumpPipeOutput.
 */
class Dump7ZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file archive path, shell-escaped before use
	 */
	function Dump7ZipOutput( $file ) {
		$archive = wfEscapeShellArg( $file );
		parent::DumpPipeOutput( "7za a -si " . $archive );
	}
}
 471+
 472+
 473+
/**
 * Dump output filter class.
 * This just does output filtering and streaming; XML formatting is done
 * higher up, so be careful in what you do.
 *
 * Stream events always pass through. A page, together with its revisions
 * and its closing text, passes through only when pass() accepts it.
 */
class DumpFilter {
	var $sink;
	// True while the currently open page is being forwarded. Starts out
	// false so that a revision or page close arriving before any page was
	// opened is dropped without reading an unset property.
	var $sendingThisPage = false;

	/**
	 * @param object $sink next sink in the chain, held by reference
	 */
	function DumpFilter( &$sink ) {
		$this->sink =& $sink;
	}

	function writeOpenStream( $string ) {
		$this->sink->writeOpenStream( $string );
	}

	function writeCloseStream( $string ) {
		$this->sink->writeCloseStream( $string );
	}

	/**
	 * Ask pass() whether this page is wanted and forward it if so.
	 */
	function writeOpenPage( $page, $string ) {
		$this->sendingThisPage = $this->pass( $page, $string );
		if( $this->sendingThisPage ) {
			$this->sink->writeOpenPage( $page, $string );
		}
	}

	/**
	 * Forward the page close only for a page that was forwarded.
	 */
	function writeClosePage( $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeClosePage( $string );
			$this->sendingThisPage = false;
		}
	}

	/**
	 * Forward the revision only while a forwarded page is open.
	 */
	function writeRevision( $rev, $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeRevision( $rev, $string );
		}
	}

	/**
	 * Override for page-based filter types.
	 * @return bool true to let the page through; this base version
	 *              accepts every page
	 */
	function pass( $page, $string ) {
		return true;
	}
}
 520+
/**
 * Simple dump output filter to exclude all talk pages.
 */
class DumpNotalkFilter extends DumpFilter {
	/**
	 * Accept a page only when its namespace is not a talk namespace.
	 *
	 * @param object $page   page row; page_namespace is inspected
	 * @param string $string unused; accepted so the signature matches
	 *                       DumpFilter::pass()
	 * @return bool
	 */
	function pass( $page, $string = null ) {
		// Talk pages are the ones to drop, so the test must be negated.
		return !Namespace::isTalk( $page->page_namespace );
	}
}
 529+
/**
 * Dump output filter to include or exclude pages in a given set of namespaces.
 */
class DumpNamespaceFilter extends DumpFilter {
	var $invert = false;
	// Not used by this class; kept so existing property access keeps working.
	var $match = array();
	// Selected namespace ids, stored as array keys.
	var $namespaces = array();

	/**
	 * @param object $sink  next sink in the chain
	 * @param string $param comma-separated list of namespaces, each either
	 *                      an NS_* constant name or a number; a leading "!"
	 *                      turns the list into an exclusion list. Entries
	 *                      that are neither are skipped.
	 */
	function DumpNamespaceFilter( &$sink, $param ) {
		parent::DumpFilter( $sink );

		$constants = array(
			"NS_MAIN"           => NS_MAIN,
			"NS_TALK"           => NS_TALK,
			"NS_USER"           => NS_USER,
			"NS_USER_TALK"      => NS_USER_TALK,
			"NS_PROJECT"        => NS_PROJECT,
			"NS_PROJECT_TALK"   => NS_PROJECT_TALK,
			"NS_IMAGE"          => NS_IMAGE,
			"NS_IMAGE_TALK"     => NS_IMAGE_TALK,
			"NS_MEDIAWIKI"      => NS_MEDIAWIKI,
			"NS_MEDIAWIKI_TALK" => NS_MEDIAWIKI_TALK,
			"NS_TEMPLATE"       => NS_TEMPLATE,
			"NS_TEMPLATE_TALK"  => NS_TEMPLATE_TALK,
			"NS_HELP"           => NS_HELP,
			"NS_HELP_TALK"      => NS_HELP_TALK,
			"NS_CATEGORY"       => NS_CATEGORY,
			"NS_CATEGORY_TALK"  => NS_CATEGORY_TALK );

		// substr() instead of a character offset: the parameter may be
		// missing or empty when the option was given without a value.
		$param = strval( $param );
		if( substr( $param, 0, 1 ) == '!' ) {
			$this->invert = true;
			$param = substr( $param, 1 );
		}

		foreach( explode( ',', $param ) as $key ) {
			$key = trim( $key );
			if( isset( $constants[$key] ) ) {
				$ns = $constants[$key];
				$this->namespaces[$ns] = true;
			} elseif( is_numeric( $key ) ) {
				$ns = intval( $key );
				$this->namespaces[$ns] = true;
			}
		}
	}

	/**
	 * Accept a page when its namespace is in the list, or when it is not
	 * in the list and the list is inverted.
	 *
	 * @param object $page   page row; page_namespace is inspected
	 * @param string $string unused; accepted so the signature matches
	 *                       DumpFilter::pass()
	 * @return bool
	 */
	function pass( $page, $string = null ) {
		$match = isset( $this->namespaces[$page->page_namespace] );
		return ( $this->invert xor $match );
	}
}
 580+
 581+
/**
 * Dump output filter that forwards, for each page, only the revision whose
 * id equals the page's page_latest. A page is held back until it closes
 * and is dropped altogether when no such revision was seen.
 */
class DumpLatestFilter extends DumpFilter {
	var $page, $pageString, $rev, $revString;

	/**
	 * Remember the page row and its opening text; nothing is sent yet.
	 */
	function writeOpenPage( $page, $string ) {
		$this->pageString = $string;
		$this->page = $page;
	}

	/**
	 * Remember the revision if it is the page's latest; ignore it otherwise.
	 */
	function writeRevision( $rev, $string ) {
		if( $rev->rev_id != $this->page->page_latest ) {
			return;
		}
		$this->revString = $string;
		$this->rev = $rev;
	}

	/**
	 * Send the remembered page and revision, if any, followed by the
	 * closing text, then clear the remembered state for the next page.
	 */
	function writeClosePage( $string ) {
		if( $this->rev ) {
			$this->sink->writeOpenPage( $this->page, $this->pageString );
			$this->sink->writeRevision( $this->rev, $this->revString );
			$this->sink->writeClosePage( $string );
		}
		$this->page = $this->pageString = $this->rev = $this->revString = null;
	}
}
 612+
/**
 * Fan-out sink: repeats every event on each of a list of sinks, in list
 * order. Sinks are always addressed by numeric index, from 0 up to the
 * count taken at construction time.
 */
class DumpMultiWriter {
	/**
	 * @param array $sinks list of sink objects
	 */
	function DumpMultiWriter( $sinks ) {
		$this->count = count( $sinks );
		$this->sinks = $sinks;
	}

	function writeOpenStream( $string ) {
		$idx = 0;
		while( $idx < $this->count ) {
			$this->sinks[$idx]->writeOpenStream( $string );
			$idx++;
		}
	}

	function writeCloseStream( $string ) {
		$idx = 0;
		while( $idx < $this->count ) {
			$this->sinks[$idx]->writeCloseStream( $string );
			$idx++;
		}
	}

	function writeOpenPage( $page, $string ) {
		$idx = 0;
		while( $idx < $this->count ) {
			$this->sinks[$idx]->writeOpenPage( $page, $string );
			$idx++;
		}
	}

	function writeClosePage( $string ) {
		$idx = 0;
		while( $idx < $this->count ) {
			$this->sinks[$idx]->writeClosePage( $string );
			$idx++;
		}
	}

	function writeRevision( $rev, $string ) {
		$idx = 0;
		while( $idx < $this->count ) {
			$this->sinks[$idx]->writeRevision( $rev, $string );
			$idx++;
		}
	}
}
 652+
 653+
 654+
/**
 * Convert a timestamp to the form 2003-08-05T18:30:02Z.
 *
 * The input is first run through wfTimestamp( TS_MW, ... ); the result is
 * then split into groups of 4, 2, 2, 2, 2 and 2 characters and rejoined
 * with the date and time separators.
 *
 * @param mixed $ts timestamp in any form wfTimestamp() accepts
 * @return string
 */
function wfTimestamp2ISO8601( $ts ) {
	$compact = wfTimestamp( TS_MW, $ts );
	$pattern = '/^(....)(..)(..)(..)(..)(..)$/';
	return preg_replace( $pattern, '$1-$2-$3T$4:$5:$6Z', $compact );
}
 659+
/**
 * Prepare a string for inclusion in XML output.
 *
 * The text is first passed through UtfNormal::cleanUp(), because a page
 * may contain old data which has not been properly normalized, and
 * invalid UTF-8 sequences or forbidden control characters would make the
 * XML output invalid. The cleaned text is then escaped with
 * htmlspecialchars().
 *
 * @param string $string
 * @return string
 */
function xmlsafe( $string ) {
	$fname = 'xmlsafe';
	wfProfileIn( $fname );

	$escaped = htmlspecialchars( UtfNormal::cleanUp( $string ) );

	wfProfileOut( $fname );
	return $escaped;
}
 675+
 676+?>
Property changes on: trunk/phase3/includes/Export.php
___________________________________________________________________
Added: svn:eol-style
1677 + native
Added: svn:keywords
2678 + Author Date Id Revision
Index: trunk/phase3/includes/SpecialExport.php
@@ -24,6 +24,7 @@
2525
2626 /** */
2727 require_once( 'Revision.php' );
 28+require_once( 'Export.php' );
2829
2930 /**
3031 *
@@ -67,378 +68,4 @@
6869 " );
6970 }
7071
71 -define( 'MW_EXPORT_FULL', 0 );
72 -define( 'MW_EXPORT_CURRENT', 1 );
73 -
74 -define( 'MW_EXPORT_BUFFER', 0 );
75 -define( 'MW_EXPORT_STREAM', 1 );
76 -
77 -/**
78 - * @package MediaWiki
79 - * @subpackage SpecialPage
80 - */
81 -class WikiExporter {
82 - var $pageCallback = null;
83 - var $revCallback = null;
84 -
85 - /**
86 - * If using MW_EXPORT_STREAM to stream a large amount of data,
87 - * provide a database connection which is not managed by
88 - * LoadBalancer to read from: some history blob types will
89 - * make additional queries to pull source data while the
90 - * main query is still running.
91 - *
92 - * @param Database $db
93 - * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT
94 - * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM
95 - */
96 - function WikiExporter( &$db, $history = MW_EXPORT_CURRENT,
97 - $buffer = MW_EXPORT_BUFFER ) {
98 - $this->db =& $db;
99 - $this->history = $history;
100 - $this->buffer = $buffer;
101 - }
102 -
103 - /**
104 - * Set a callback to be called after each page in the output
105 - * stream is closed. The callback will be passed a database row
106 - * object with the last revision output.
107 - *
108 - * A set callback can be removed by passing null here.
109 - *
110 - * @param mixed $callback
111 - */
112 - function setPageCallback( $callback ) {
113 - $this->pageCallback = $callback;
114 - }
115 -
116 - /**
117 - * Set a callback to be called after each revision in the output
118 - * stream is closed. The callback will be passed a database row
119 - * object with the revision data.
120 - *
121 - * A set callback can be removed by passing null here.
122 - *
123 - * @param mixed $callback
124 - */
125 - function setRevisionCallback( $callback ) {
126 - $this->revCallback = $callback;
127 - }
128 -
129 - /**
130 - * Returns the export schema version.
131 - * @return string
132 - */
133 - function schemaVersion() {
134 - return "0.3";
135 - }
136 -
137 - /**
138 - * Opens the XML output stream's root <mediawiki> element.
139 - * This does not include an xml directive, so is safe to include
140 - * as a subelement in a larger XML stream. Namespace and XML Schema
141 - * references are included.
142 - *
143 - * To capture the stream to a string, use PHP's output buffering
144 - * functions. Output will be encoded in UTF-8.
145 - */
146 - function openStream() {
147 - global $wgContLanguageCode;
148 - $ver = $this->schemaVersion();
149 - print wfElement( 'mediawiki', array(
150 - 'xmlns' => "http://www.mediawiki.org/xml/export-$ver/",
151 - 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
152 - 'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
153 - "http://www.mediawiki.org/xml/export-$ver.xsd",
154 - 'version' => $ver,
155 - 'xml:lang' => $wgContLanguageCode ),
156 - null ) . "\n";
157 - $this->siteInfo();
158 - }
159 -
160 - function siteInfo() {
161 - $info = array(
162 - $this->sitename(),
163 - $this->homelink(),
164 - $this->generator(),
165 - $this->caseSetting(),
166 - $this->namespaces() );
167 - print "<siteinfo>\n";
168 - foreach( $info as $item ) {
169 - print " $item\n";
170 - }
171 - print "</siteinfo>\n";
172 - }
173 -
174 - function sitename() {
175 - global $wgSitename;
176 - return wfElement( 'sitename', array(), $wgSitename );
177 - }
178 -
179 - function generator() {
180 - global $wgVersion;
181 - return wfElement( 'generator', array(), "MediaWiki $wgVersion" );
182 - }
183 -
184 - function homelink() {
185 - $page = Title::newFromText( wfMsgForContent( 'mainpage' ) );
186 - return wfElement( 'base', array(), $page->getFullUrl() );
187 - }
188 -
189 - function caseSetting() {
190 - global $wgCapitalLinks;
191 - // "case-insensitive" option is reserved for future
192 - $sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
193 - return wfElement( 'case', array(), $sensitivity );
194 - }
195 -
196 - function namespaces() {
197 - global $wgContLang;
198 - $spaces = "<namespaces>\n";
199 - foreach( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
200 - $spaces .= ' ' . wfElement( 'namespace', array( 'key' => $ns ), $title ) . "\n";
201 - }
202 - $spaces .= " </namespaces>";
203 - return $spaces;
204 - }
205 -
206 - /**
207 - * Closes the output stream with the closing root element.
208 - * Call when finished dumping things.
209 - */
210 - function closeStream() {
211 - print "</mediawiki>\n";
212 - }
213 -
214 - /**
215 - * Dumps a series of page and revision records for all pages
216 - * in the database, either including complete history or only
217 - * the most recent version.
218 - */
219 - function allPages() {
220 - return $this->dumpFrom( '' );
221 - }
222 -
223 - /**
224 - * Dumps a series of page and revision records for those pages
225 - * in the database falling within the page_id range given.
226 - * @param int $start Inclusive lower limit (this id is included)
227 - * @param int $end Exclusive upper limit (this id is not included)
228 - * If 0, no upper limit.
229 - */
230 - function pagesByRange( $start, $end ) {
231 - $condition = 'page_id >= ' . intval( $start );
232 - if( $end ) {
233 - $condition .= ' AND page_id < ' . intval( $end );
234 - }
235 - return $this->dumpFrom( $condition );
236 - }
237 -
238 - /**
239 - * @param Title $title
240 - */
241 - function pageByTitle( $title ) {
242 - return $this->dumpFrom(
243 - 'page_namespace=' . $title->getNamespace() .
244 - ' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) );
245 - }
246 -
247 - function pageByName( $name ) {
248 - $title = Title::newFromText( $name );
249 - if( is_null( $title ) ) {
250 - return new WikiError( "Can't export invalid title" );
251 - } else {
252 - return $this->pageByTitle( $title );
253 - }
254 - }
255 -
256 - function pagesByName( $names ) {
257 - foreach( $names as $name ) {
258 - $this->pageByName( $name );
259 - }
260 - }
261 -
262 -
263 - // -------------------- private implementation below --------------------
264 -
265 - function dumpFrom( $cond = '' ) {
266 - $fname = 'WikiExporter::dumpFrom';
267 - wfProfileIn( $fname );
268 -
269 - $page = $this->db->tableName( 'page' );
270 - $revision = $this->db->tableName( 'revision' );
271 - $text = $this->db->tableName( 'text' );
272 -
273 - if( $this->history == MW_EXPORT_FULL ) {
274 - $join = 'page_id=rev_page';
275 - } elseif( $this->history == MW_EXPORT_CURRENT ) {
276 - $join = 'page_id=rev_page AND page_latest=rev_id';
277 - } else {
278 - wfProfileOut( $fname );
279 - return new WikiError( "$fname given invalid history dump type." );
280 - }
281 - $where = ( $cond == '' ) ? '' : "$cond AND";
282 -
283 - if( $this->buffer == MW_EXPORT_STREAM ) {
284 - $prev = $this->db->bufferResults( false );
285 - }
286 - if( $cond == '' ) {
287 - // Optimization hack for full-database dump
288 - $pageindex = 'FORCE INDEX (PRIMARY)';
289 - $revindex = 'FORCE INDEX(page_timestamp)';
290 - } else {
291 - $pageindex = '';
292 - $revindex = '';
293 - }
294 - $result = $this->db->query(
295 - "SELECT * FROM
296 - $page $pageindex,
297 - $revision $revindex,
298 - $text
299 - WHERE $where $join AND rev_text_id=old_id
300 - ORDER BY page_id", $fname );
301 - $wrapper = $this->db->resultObject( $result );
302 - $this->outputStream( $wrapper );
303 -
304 - if( $this->buffer == MW_EXPORT_STREAM ) {
305 - $this->db->bufferResults( $prev );
306 - }
307 -
308 - wfProfileOut( $fname );
309 - }
310 -
311 - /**
312 - * Runs through a query result set dumping page and revision records.
313 - * The result set should be sorted/grouped by page to avoid duplicate
314 - * page records in the output.
315 - *
316 - * The result set will be freed once complete. Should be safe for
317 - * streaming (non-buffered) queries, as long as it was made on a
318 - * separate database connection not managed by LoadBalancer; some
319 - * blob storage types will make queries to pull source data.
320 - *
321 - * @param ResultWrapper $resultset
322 - * @access private
323 - */
324 - function outputStream( $resultset ) {
325 - $last = null;
326 - while( $row = $resultset->fetchObject() ) {
327 - if( is_null( $last ) ||
328 - $last->page_namespace != $row->page_namespace ||
329 - $last->page_title != $row->page_title ) {
330 - if( isset( $last ) ) {
331 - $this->closePage( $last );
332 - }
333 - $this->openPage( $row );
334 - $last = $row;
335 - }
336 - $this->dumpRev( $row );
337 - }
338 - if( isset( $last ) ) {
339 - $this->closePage( $last );
340 - }
341 - $resultset->free();
342 - }
343 -
344 - /**
345 - * Opens a <page> section on the output stream, with data
346 - * from the given database row.
347 - *
348 - * @param object $row
349 - * @access private
350 - */
351 - function openPage( $row ) {
352 - print "<page>\n";
353 - $title = Title::makeTitle( $row->page_namespace, $row->page_title );
354 - print ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
355 - print ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
356 - if( '' != $row->page_restrictions ) {
357 - print ' ' . wfElement( 'restrictions', array(),
358 - strval( $row->page_restrictions ) ) . "\n";
359 - }
360 - }
361 -
362 - /**
363 - * Closes a <page> section on the output stream.
364 - * If a per-page callback has been set, it will be called
365 - * and passed the last database row used for this page.
366 - *
367 - * @param object $row
368 - * @access private
369 - */
370 - function closePage( $row ) {
371 - print "</page>\n";
372 - if( isset( $this->pageCallback ) ) {
373 - call_user_func( $this->pageCallback, $row );
374 - }
375 - }
376 -
377 - /**
378 - * Dumps a <revision> section on the output stream, with
379 - * data filled in from the given database row.
380 - *
381 - * @param object $row
382 - * @access private
383 - */
384 - function dumpRev( $row ) {
385 - $fname = 'WikiExporter::dumpRev';
386 - wfProfileIn( $fname );
387 -
388 - print " <revision>\n";
389 - print " " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";
390 -
391 - $ts = wfTimestamp2ISO8601( strval( $row->rev_timestamp ) );
392 - print " " . wfElement( 'timestamp', null, $ts ) . "\n";
393 -
394 - print " <contributor>\n";
395 - if( $row->rev_user ) {
396 - print " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
397 - print " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
398 - } else {
399 - print " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
400 - }
401 - print " </contributor>\n";
402 -
403 - if( $row->rev_minor_edit ) {
404 - print " <minor/>\n";
405 - }
406 - if( $row->rev_comment != '' ) {
407 - print " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
408 - }
409 -
410 - $text = strval( Revision::getRevisionText( $row ) );
411 - print " " . wfElementClean( 'text', array( 'xml:space' => 'preserve' ), $text ) . "\n";
412 -
413 - print " </revision>\n";
414 -
415 - wfProfileOut( $fname );
416 -
417 - if( isset( $this->revCallback ) ) {
418 - call_user_func( $this->revCallback, $row );
419 - }
420 - }
421 -
422 -}
423 -
424 -function wfTimestamp2ISO8601( $ts ) {
425 - #2003-08-05T18:30:02Z
426 - return preg_replace( '/^(....)(..)(..)(..)(..)(..)$/', '$1-$2-$3T$4:$5:$6Z', wfTimestamp( TS_MW, $ts ) );
427 -}
428 -
429 -function xmlsafe( $string ) {
430 - $fname = 'xmlsafe';
431 - wfProfileIn( $fname );
432 -
433 - /**
434 - * The page may contain old data which has not been properly normalized.
435 - * Invalid UTF-8 sequences or forbidden control characters will make our
436 - * XML output invalid, so be sure to strip them out.
437 - */
438 - $string = UtfNormal::cleanUp( $string );
439 -
440 - $string = htmlspecialchars( $string );
441 - wfProfileOut( $fname );
442 - return $string;
443 -}
444 -
44572 ?>
Index: trunk/phase3/RELEASE-NOTES
@@ -124,6 +124,8 @@
125125 * (bug 3503) Update LanguageSq.php from sq.wikipedia.org messages
126126 * Added EditFilter hook, and output callback on EditPage::showEditForm()
127127 for a place to add in captcha-type extensions in the edit flow
 128+* Added filter options, compression piping, and multiple output streams for
 129+ dumpBackup.php
128130
129131
130132 === Caveats ===

Status & tagging log