r82930 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r82929 | r82930 | r82931 >
Date:11:55, 28 February 2011
Author:ariel
Status:ok
Tags:
Comment:
Revert most of r77638: use of XMLReader() was causing a 3- to 4-fold slowdown on history dumps.
Modified paths:
  • /branches/wmf/1.17wmf1/maintenance/dumpTextPass.php (modified) (history)

Diff [purge]

Index: branches/wmf/1.17wmf1/maintenance/dumpTextPass.php
@@ -35,6 +35,7 @@
3636 class TextPassDumper extends BackupDumper {
3737 var $prefetch = null;
3838 var $input = "php://stdin";
 39+ var $history = WikiExporter::FULL;
3940 var $fetchCount = 0;
4041 var $prefetchCount = 0;
4142
@@ -60,12 +61,19 @@
6162 if ( ini_get( 'display_errors' ) )
6263 ini_set( 'display_errors', 'stderr' );
6364
64 - $this->initProgress( $history );
 65+ $this->initProgress( $this->history );
6566
6667 $this->db = $this->backupDb();
6768
68 - $this->readDump();
 69+ $this->egress = new ExportProgressFilter( $this->sink, $this );
6970
 71+ $input = fopen( $this->input, "rt" );
 72+ $result = $this->readDump( $input );
 73+
 74+ if ( WikiError::isError( $result ) ) {
 75+ wfDie( $result->getMessage() );
 76+ }
 77+
7078 if ( $this->spawnProc ) {
7179 $this->closeSpawn();
7280 }
@@ -85,6 +93,12 @@
8694 case 'stub':
8795 $this->input = $url;
8896 break;
 97+ case 'current':
 98+ $this->history = WikiExporter::CURRENT;
 99+ break;
 100+ case 'full':
 101+ $this->history = WikiExporter::FULL;
 102+ break;
89103 case 'spawn':
90104 $this->spawn = true;
91105 if ( $val ) {
@@ -148,76 +162,34 @@
149163 }
150164 }
151165
152 - function readDump() {
153 - $state = '';
154 - $lastName = '';
 166+ function readDump( $input ) {
 167+ $this->buffer = "";
 168+ $this->openElement = false;
 169+ $this->atStart = true;
 170+ $this->state = "";
 171+ $this->lastName = "";
155172 $this->thisPage = 0;
156173 $this->thisRev = 0;
157174
158 - $reader = new XMLReader();
159 - $reader->open( $this->input );
160 - $writer = new XMLWriter();
161 - $writer->openMemory();
 175+ $parser = xml_parser_create( "UTF-8" );
 176+ xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
162177
 178+ xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
 179+ xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
163180
164 - while ( $reader->read() ) {
165 - $tag = $reader->name;
166 - $type = $reader->nodeType;
 181+ $offset = 0; // for context extraction on error reporting
 182+ $bufferSize = 512 * 1024;
 183+ do {
 184+ $chunk = fread( $input, $bufferSize );
 185+ if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
 186+ wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
 187+ return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
 188+ }
 189+ $offset += strlen( $chunk );
 190+ } while ( $chunk !== false && !feof( $input ) );
 191+ xml_parser_free( $parser );
167192
168 - if ( $type == XmlReader::END_ELEMENT ) {
169 - $writer->endElement();
170 -
171 - if ( $tag == 'revision' ) {
172 - $this->revCount();
173 - $this->thisRev = '';
174 - } elseif ( $tag == 'page' ) {
175 - $this->reportPage();
176 - $this->thisPage = '';
177 - }
178 - } elseif ( $type == XmlReader::ELEMENT ) {
179 - $attribs = array();
180 - if ( $reader->hasAttributes ) {
181 - for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) {
182 - $attribs[$reader->name] = $reader->value;
183 - }
184 - }
185 -
186 - if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) {
187 - $writer->startElement( 'text' );
188 - $writer->writeAttribute( 'xml:space', 'preserve' );
189 - $text = $this->getText( $attribs['id'] );
190 - if ( strlen( $text ) ) {
191 - $writer->text( $text );
192 - }
193 - $writer->endElement();
194 - } else {
195 - $writer->startElement( $tag );
196 - foreach( $attribs as $name => $val ) {
197 - $writer->writeAttribute( $name, $val );
198 - }
199 - if ( $reader->isEmptyElement ) {
200 - $writer->endElement();
201 - }
202 - }
203 -
204 - $lastName = $tag;
205 - if ( $tag == 'revision' ) {
206 - $state = 'revision';
207 - } elseif ( $tag == 'page' ) {
208 - $state = 'page';
209 - }
210 - } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type = XMLReader::TEXT ) {
211 - if ( $lastName == 'id' ) {
212 - if ( $state == 'revision' ) {
213 - $this->thisRev .= $reader->value;
214 - } elseif ( $state == 'page' ) {
215 - $this->thisPage .= $reader->value;
216 - }
217 - }
218 - $writer->text( $reader->value );
219 - }
220 - $this->sink->write( $writer->outputMemory() );
221 - }
 193+ return true;
222194 }
223195
224196 function getText( $id ) {
@@ -240,6 +212,7 @@
241213 }
242214
243215 private function doGetText( $id ) {
 216+
244217 $id = intval( $id );
245218 $this->failures = 0;
246219 $ex = new MWException( "Graceful storage failure" );
@@ -427,13 +400,81 @@
428401 $normalized = $wgContLang->normalize( $stripped );
429402 return $normalized;
430403 }
 404+
 405+ function startElement( $parser, $name, $attribs ) {
 406+ $this->clearOpenElement( null );
 407+ $this->lastName = $name;
 408+
 409+ if ( $name == 'revision' ) {
 410+ $this->state = $name;
 411+ $this->egress->writeOpenPage( null, $this->buffer );
 412+ $this->buffer = "";
 413+ } elseif ( $name == 'page' ) {
 414+ $this->state = $name;
 415+ if ( $this->atStart ) {
 416+ $this->egress->writeOpenStream( $this->buffer );
 417+ $this->buffer = "";
 418+ $this->atStart = false;
 419+ }
 420+ }
 421+
 422+ if ( $name == "text" && isset( $attribs['id'] ) ) {
 423+ $text = $this->getText( $attribs['id'] );
 424+ $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
 425+ if ( strlen( $text ) > 0 ) {
 426+ $this->characterData( $parser, $text );
 427+ }
 428+ } else {
 429+ $this->openElement = array( $name, $attribs );
 430+ }
 431+ }
 432+
 433+ function endElement( $parser, $name ) {
 434+ if ( $this->openElement ) {
 435+ $this->clearOpenElement( "" );
 436+ } else {
 437+ $this->buffer .= "</$name>";
 438+ }
 439+
 440+ if ( $name == 'revision' ) {
 441+ $this->egress->writeRevision( null, $this->buffer );
 442+ $this->buffer = "";
 443+ $this->thisRev = "";
 444+ } elseif ( $name == 'page' ) {
 445+ $this->egress->writeClosePage( $this->buffer );
 446+ $this->buffer = "";
 447+ $this->thisPage = "";
 448+ } elseif ( $name == 'mediawiki' ) {
 449+ $this->egress->writeCloseStream( $this->buffer );
 450+ $this->buffer = "";
 451+ }
 452+ }
 453+
 454+ function characterData( $parser, $data ) {
 455+ $this->clearOpenElement( null );
 456+ if ( $this->lastName == "id" ) {
 457+ if ( $this->state == "revision" ) {
 458+ $this->thisRev .= $data;
 459+ } elseif ( $this->state == "page" ) {
 460+ $this->thisPage .= $data;
 461+ }
 462+ }
 463+ $this->buffer .= htmlspecialchars( $data );
 464+ }
 465+
 466+ function clearOpenElement( $style ) {
 467+ if ( $this->openElement ) {
 468+ $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style );
 469+ $this->openElement = false;
 470+ }
 471+ }
431472 }
432473
433474
434475 $dumper = new TextPassDumper( $argv );
435476
436477 if ( !isset( $options['help'] ) ) {
437 - $dumper->dump( WikiExporter::FULL );
 478+ $dumper->dump( true );
438479 } else {
439480 $dumper->progress( <<<ENDS
440481 This script postprocesses XML dumps from dumpBackup.php to add
@@ -447,6 +488,7 @@
448489 --stub=<type>:<file> To load a compressed stub dump instead of stdin
449490 --prefetch=<type>:<file> Use a prior dump file as a text source, to save
450491 pressure on the database.
 492+ (Requires the XMLReader extension)
451493 --quiet Don't dump status reports to stderr.
452494 --report=n Report position and speed after every n pages processed.
453495 (Default: 100)
@@ -459,5 +501,3 @@
460502 ENDS
461503 );
462504 }
463 -
464 -

Follow-up revisions

Revision | Commit summary | Author | Date
r92707 | Revert r85034, r81186, r77638: per CR on r77638: XMLReader and XMLWriter are ... | demon | 23:06, 20 July 2011

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r77638 | Modified dumpTextPass.php to use XMLReader and XMLWriter rather than xml_* fu... | ialex | 10:05, 3 December 2010

Status & tagging log