r92707 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92706 | r92707 | r92708 >
Date:23:06, 20 July 2011
Author:demon
Status:ok
Tags:
Comment:
Revert r85034, r81186, r77638: per CR on r77638: XMLReader and XMLWriter are memory-hungry beasts and this script OOMs constantly.

This was already reverted in 1.17wmf1 (r82930) by Ariel. Doing the same in trunk now. I'm not opposed to using the new pretty XML* classes, but somebody needs to debug why they eat memory for breakfast, lunch and dinner.
Modified paths:
  • /trunk/phase3/maintenance/dumpTextPass.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/dumpTextPass.php
@@ -2,7 +2,7 @@
33 /**
44 * Script that postprocesses XML dumps from dumpBackup.php to add page text
55 *
6 - * Copyright © 2005 Brion Vibber <brion@pobox.com>, 2010 Alexandre Emsenhuber
 6+ * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
77 * http://www.mediawiki.org/
88 *
99 * This program is free software; you can redistribute it and/or modify
@@ -35,6 +35,7 @@
3636 class TextPassDumper extends BackupDumper {
3737 var $prefetch = null;
3838 var $input = "php://stdin";
 39+ var $history = WikiExporter::FULL;
3940 var $fetchCount = 0;
4041 var $prefetchCount = 0;
4142 var $lastTime = 0;
@@ -73,12 +74,19 @@
7475 if ( ini_get( 'display_errors' ) )
7576 ini_set( 'display_errors', 'stderr' );
7677
77 - $this->initProgress( $history );
 78+ $this->initProgress( $this->history );
7879
7980 $this->db = $this->backupDb();
8081
81 - $this->readDump();
 82+ $this->egress = new ExportProgressFilter( $this->sink, $this );
8283
 84+ $input = fopen( $this->input, "rt" );
 85+ $result = $this->readDump( $input );
 86+
 87+ if ( WikiError::isError( $result ) ) {
 88+ wfDie( $result->getMessage() );
 89+ }
 90+
8391 if ( $this->spawnProc ) {
8492 $this->closeSpawn();
8593 }
@@ -98,6 +106,12 @@
99107 case 'stub':
100108 $this->input = $url;
101109 break;
 110+ case 'current':
 111+ $this->history = WikiExporter::CURRENT;
 112+ break;
 113+ case 'full':
 114+ $this->history = WikiExporter::FULL;
 115+ break;
102116 case 'spawn':
103117 $this->spawn = true;
104118 if ( $val ) {
@@ -190,76 +204,34 @@
191205 }
192206 }
193207
194 - function readDump() {
195 - $state = '';
196 - $lastName = '';
 208+ function readDump( $input ) {
 209+ $this->buffer = "";
 210+ $this->openElement = false;
 211+ $this->atStart = true;
 212+ $this->state = "";
 213+ $this->lastName = "";
197214 $this->thisPage = 0;
198215 $this->thisRev = 0;
199216
200 - $reader = new XMLReader();
201 - $reader->open( $this->input );
202 - $writer = new XMLWriter();
203 - $writer->openMemory();
 217+ $parser = xml_parser_create( "UTF-8" );
 218+ xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
204219
 220+ xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
 221+ xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
205222
206 - while ( $reader->read() ) {
207 - $tag = $reader->name;
208 - $type = $reader->nodeType;
 223+ $offset = 0; // for context extraction on error reporting
 224+ $bufferSize = 512 * 1024;
 225+ do {
 226+ $chunk = fread( $input, $bufferSize );
 227+ if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
 228+ wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
 229+ return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
 230+ }
 231+ $offset += strlen( $chunk );
 232+ } while ( $chunk !== false && !feof( $input ) );
 233+ xml_parser_free( $parser );
209234
210 - if ( $type == XmlReader::END_ELEMENT ) {
211 - $writer->endElement();
212 -
213 - if ( $tag == 'revision' ) {
214 - $this->revCount();
215 - $this->thisRev = '';
216 - } elseif ( $tag == 'page' ) {
217 - $this->reportPage();
218 - $this->thisPage = '';
219 - }
220 - } elseif ( $type == XmlReader::ELEMENT ) {
221 - $attribs = array();
222 - if ( $reader->hasAttributes ) {
223 - for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) {
224 - $attribs[$reader->name] = $reader->value;
225 - }
226 - }
227 -
228 - if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) {
229 - $writer->startElement( 'text' );
230 - $writer->writeAttribute( 'xml:space', 'preserve' );
231 - $text = $this->getText( $attribs['id'] );
232 - if ( strlen( $text ) ) {
233 - $writer->text( $text );
234 - }
235 - $writer->endElement();
236 - } else {
237 - $writer->startElement( $tag );
238 - foreach( $attribs as $name => $val ) {
239 - $writer->writeAttribute( $name, $val );
240 - }
241 - if ( $reader->isEmptyElement ) {
242 - $writer->endElement();
243 - }
244 - }
245 -
246 - $lastName = $tag;
247 - if ( $tag == 'revision' ) {
248 - $state = 'revision';
249 - } elseif ( $tag == 'page' ) {
250 - $state = 'page';
251 - }
252 - } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type == XMLReader::TEXT ) {
253 - if ( $lastName == 'id' ) {
254 - if ( $state == 'revision' ) {
255 - $this->thisRev .= $reader->value;
256 - } elseif ( $state == 'page' ) {
257 - $this->thisPage .= $reader->value;
258 - }
259 - }
260 - $writer->text( $reader->value );
261 - }
262 - $this->sink->write( $writer->outputMemory() );
263 - }
 235+ return true;
264236 }
265237
266238 function getText( $id ) {
@@ -282,6 +254,7 @@
283255 }
284256
285257 private function doGetText( $id ) {
 258+
286259 $id = intval( $id );
287260 $this->failures = 0;
288261 $ex = new MWException( "Graceful storage failure" );
@@ -469,13 +442,81 @@
470443 $normalized = $wgContLang->normalize( $stripped );
471444 return $normalized;
472445 }
 446+
 447+ function startElement( $parser, $name, $attribs ) {
 448+ $this->clearOpenElement( null );
 449+ $this->lastName = $name;
 450+
 451+ if ( $name == 'revision' ) {
 452+ $this->state = $name;
 453+ $this->egress->writeOpenPage( null, $this->buffer );
 454+ $this->buffer = "";
 455+ } elseif ( $name == 'page' ) {
 456+ $this->state = $name;
 457+ if ( $this->atStart ) {
 458+ $this->egress->writeOpenStream( $this->buffer );
 459+ $this->buffer = "";
 460+ $this->atStart = false;
 461+ }
 462+ }
 463+
 464+ if ( $name == "text" && isset( $attribs['id'] ) ) {
 465+ $text = $this->getText( $attribs['id'] );
 466+ $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
 467+ if ( strlen( $text ) > 0 ) {
 468+ $this->characterData( $parser, $text );
 469+ }
 470+ } else {
 471+ $this->openElement = array( $name, $attribs );
 472+ }
 473+ }
 474+
 475+ function endElement( $parser, $name ) {
 476+ if ( $this->openElement ) {
 477+ $this->clearOpenElement( "" );
 478+ } else {
 479+ $this->buffer .= "</$name>";
 480+ }
 481+
 482+ if ( $name == 'revision' ) {
 483+ $this->egress->writeRevision( null, $this->buffer );
 484+ $this->buffer = "";
 485+ $this->thisRev = "";
 486+ } elseif ( $name == 'page' ) {
 487+ $this->egress->writeClosePage( $this->buffer );
 488+ $this->buffer = "";
 489+ $this->thisPage = "";
 490+ } elseif ( $name == 'mediawiki' ) {
 491+ $this->egress->writeCloseStream( $this->buffer );
 492+ $this->buffer = "";
 493+ }
 494+ }
 495+
 496+ function characterData( $parser, $data ) {
 497+ $this->clearOpenElement( null );
 498+ if ( $this->lastName == "id" ) {
 499+ if ( $this->state == "revision" ) {
 500+ $this->thisRev .= $data;
 501+ } elseif ( $this->state == "page" ) {
 502+ $this->thisPage .= $data;
 503+ }
 504+ }
 505+ $this->buffer .= htmlspecialchars( $data );
 506+ }
 507+
 508+ function clearOpenElement( $style ) {
 509+ if ( $this->openElement ) {
 510+ $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style );
 511+ $this->openElement = false;
 512+ }
 513+ }
473514 }
474515
475516
476517 $dumper = new TextPassDumper( $argv );
477518
478519 if ( !isset( $options['help'] ) ) {
479 - $dumper->dump( WikiExporter::FULL );
 520+ $dumper->dump( true );
480521 } else {
481522 $dumper->progress( <<<ENDS
482523 This script postprocesses XML dumps from dumpBackup.php to add
@@ -489,12 +530,11 @@
490531 --stub=<type>:<file> To load a compressed stub dump instead of stdin
491532 --prefetch=<type>:<file> Use a prior dump file as a text source, to save
492533 pressure on the database.
 534+ (Requires the XMLReader extension)
493535 --quiet Don't dump status reports to stderr.
494536 --report=n Report position and speed after every n pages processed.
495537 (Default: 100)
496538 --server=h Force reading from MySQL server h
497 - --output=<type>:<file> Write to a file instead of stdout
498 - <type>s: file, gzip, bzip2, 7zip
499539 --current Base ETA on number of pages in database instead of all revisions
500540 --spawn Spawn a subprocess for loading text records
501541 --help Display this help message

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r77638 | Modifier dumpTextPass.php to use XMLReader and XMLWriter rather than xml_* fu... | ialex | 10:05, 3 December 2010
r81186 | * (bug 27016) Fix for r77638: dumpTextPass.php now consider the "output" para... | ialex | 09:00, 29 January 2011
r82930 | revert most of 77638, use of XMLReader() was causing a 3 to 4-fold slowdown o... | ariel | 11:55, 28 February 2011
r85034 | Swap = to == in conditional | reedy | 18:56, 30 March 2011

Status & tagging log