r77638 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r77637 | r77638 | r77639 >
Date: 10:05, 3 December 2010
Author: ialex
Status: reverted (Comments)
Tags:
Comment:
Modified dumpTextPass.php to use XMLReader and XMLWriter rather than xml_* functions
Modified paths:
  • /trunk/phase3/maintenance/dumpTextPass.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/dumpTextPass.php
@@ -2,7 +2,7 @@
33 /**
44 * Script that postprocesses XML dumps from dumpBackup.php to add page text
55 *
6 - * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
 6+ * Copyright © 2005 Brion Vibber <brion@pobox.com>, 2010 Alexandre Emsenhuber
77 * http://www.mediawiki.org/
88 *
99 * This program is free software; you can redistribute it and/or modify
@@ -35,7 +35,6 @@
3636 class TextPassDumper extends BackupDumper {
3737 var $prefetch = null;
3838 var $input = "php://stdin";
39 - var $history = WikiExporter::FULL;
4039 var $fetchCount = 0;
4140 var $prefetchCount = 0;
4241
@@ -61,19 +60,12 @@
6261 if ( ini_get( 'display_errors' ) )
6362 ini_set( 'display_errors', 'stderr' );
6463
65 - $this->initProgress( $this->history );
 64+ $this->initProgress( $history );
6665
6766 $this->db = $this->backupDb();
6867
69 - $this->egress = new ExportProgressFilter( $this->sink, $this );
 68+ $this->readDump();
7069
71 - $input = fopen( $this->input, "rt" );
72 - $result = $this->readDump( $input );
73 -
74 - if ( WikiError::isError( $result ) ) {
75 - wfDie( $result->getMessage() );
76 - }
77 -
7870 if ( $this->spawnProc ) {
7971 $this->closeSpawn();
8072 }
@@ -93,12 +85,6 @@
9486 case 'stub':
9587 $this->input = $url;
9688 break;
97 - case 'current':
98 - $this->history = WikiExporter::CURRENT;
99 - break;
100 - case 'full':
101 - $this->history = WikiExporter::FULL;
102 - break;
10389 case 'spawn':
10490 $this->spawn = true;
10591 if ( $val ) {
@@ -152,34 +138,76 @@
153139 }
154140 }
155141
156 - function readDump( $input ) {
157 - $this->buffer = "";
158 - $this->openElement = false;
159 - $this->atStart = true;
160 - $this->state = "";
161 - $this->lastName = "";
 142+ function readDump() {
 143+ $state = '';
 144+ $lastName = '';
162145 $this->thisPage = 0;
163146 $this->thisRev = 0;
164147
165 - $parser = xml_parser_create( "UTF-8" );
166 - xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
 148+ $reader = new XMLReader();
 149+ $reader->open( $this->input );
 150+ $writer = new XMLWriter();
 151+ $writer->openURI( 'php://stdout' );
167152
168 - xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
169 - xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
170153
171 - $offset = 0; // for context extraction on error reporting
172 - $bufferSize = 512 * 1024;
173 - do {
174 - $chunk = fread( $input, $bufferSize );
175 - if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
176 - wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
177 - return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
178 - }
179 - $offset += strlen( $chunk );
180 - } while ( $chunk !== false && !feof( $input ) );
181 - xml_parser_free( $parser );
 154+ while ( $reader->read() ) {
 155+ $tag = $reader->name;
 156+ $type = $reader->nodeType;
182157
183 - return true;
 158+ if ( $type == XmlReader::END_ELEMENT ) {
 159+ $writer->endElement();
 160+
 161+ if ( $tag == 'revision' ) {
 162+ $this->revCount();
 163+ $this->thisRev = '';
 164+ } elseif ( $tag == 'page' ) {
 165+ $this->reportPage();
 166+ $this->thisPage = '';
 167+ }
 168+ } elseif ( $type == XmlReader::ELEMENT ) {
 169+ $attribs = array();
 170+ if ( $reader->hasAttributes ) {
 171+ for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) {
 172+ $attribs[$reader->name] = $reader->value;
 173+ }
 174+ }
 175+
 176+ if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) {
 177+ $writer->startElement( 'text' );
 178+ $writer->writeAttribute( 'xml:space', 'preserve' );
 179+ $text = $this->getText( $attribs['id'] );
 180+ if ( strlen( $text ) ) {
 181+ $writer->text( $text );
 182+ }
 183+ $writer->endElement();
 184+ } else {
 185+ $writer->startElement( $tag );
 186+ foreach( $attribs as $name => $val ) {
 187+ $writer->writeAttribute( $name, $val );
 188+ }
 189+ if ( $reader->isEmptyElement ) {
 190+ $writer->endElement();
 191+ }
 192+ }
 193+
 194+ $lastName = $tag;
 195+ if ( $tag == 'revision' ) {
 196+ $state = 'revision';
 197+ } elseif ( $tag == 'page' ) {
 198+ $state = 'page';
 199+ }
 200+ } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type = XMLReader::TEXT ) {
 201+ if ( $lastName == 'id' ) {
 202+ if ( $state == 'revision' ) {
 203+ $this->thisRev .= $reader->value;
 204+ } elseif ( $state == 'page' ) {
 205+ $this->thisPage .= $reader->value;
 206+ }
 207+ }
 208+ $writer->text( $reader->value );
 209+ }
 210+ }
 211+ $writer->flush();
184212 }
185213
186214 function getText( $id ) {
@@ -207,7 +235,6 @@
208236 }
209237
210238 private function doGetText( $id ) {
211 -
212239 $id = intval( $id );
213240 $this->failures = 0;
214241 $ex = new MWException( "Graceful storage failure" );
@@ -395,81 +422,13 @@
396423 $normalized = $wgContLang->normalize( $stripped );
397424 return $normalized;
398425 }
399 -
400 - function startElement( $parser, $name, $attribs ) {
401 - $this->clearOpenElement( null );
402 - $this->lastName = $name;
403 -
404 - if ( $name == 'revision' ) {
405 - $this->state = $name;
406 - $this->egress->writeOpenPage( null, $this->buffer );
407 - $this->buffer = "";
408 - } elseif ( $name == 'page' ) {
409 - $this->state = $name;
410 - if ( $this->atStart ) {
411 - $this->egress->writeOpenStream( $this->buffer );
412 - $this->buffer = "";
413 - $this->atStart = false;
414 - }
415 - }
416 -
417 - if ( $name == "text" && isset( $attribs['id'] ) ) {
418 - $text = $this->getText( $attribs['id'] );
419 - $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
420 - if ( strlen( $text ) > 0 ) {
421 - $this->characterData( $parser, $text );
422 - }
423 - } else {
424 - $this->openElement = array( $name, $attribs );
425 - }
426 - }
427 -
428 - function endElement( $parser, $name ) {
429 - if ( $this->openElement ) {
430 - $this->clearOpenElement( "" );
431 - } else {
432 - $this->buffer .= "</$name>";
433 - }
434 -
435 - if ( $name == 'revision' ) {
436 - $this->egress->writeRevision( null, $this->buffer );
437 - $this->buffer = "";
438 - $this->thisRev = "";
439 - } elseif ( $name == 'page' ) {
440 - $this->egress->writeClosePage( $this->buffer );
441 - $this->buffer = "";
442 - $this->thisPage = "";
443 - } elseif ( $name == 'mediawiki' ) {
444 - $this->egress->writeCloseStream( $this->buffer );
445 - $this->buffer = "";
446 - }
447 - }
448 -
449 - function characterData( $parser, $data ) {
450 - $this->clearOpenElement( null );
451 - if ( $this->lastName == "id" ) {
452 - if ( $this->state == "revision" ) {
453 - $this->thisRev .= $data;
454 - } elseif ( $this->state == "page" ) {
455 - $this->thisPage .= $data;
456 - }
457 - }
458 - $this->buffer .= htmlspecialchars( $data );
459 - }
460 -
461 - function clearOpenElement( $style ) {
462 - if ( $this->openElement ) {
463 - $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style );
464 - $this->openElement = false;
465 - }
466 - }
467426 }
468427
469428
470429 $dumper = new TextPassDumper( $argv );
471430
472431 if ( !isset( $options['help'] ) ) {
473 - $dumper->dump( true );
 432+ $dumper->dump( WikiExporter::FULL );
474433 } else {
475434 $dumper->progress( <<<ENDS
476435 This script postprocesses XML dumps from dumpBackup.php to add
@@ -483,7 +442,6 @@
484443 --stub=<type>:<file> To load a compressed stub dump instead of stdin
485444 --prefetch=<type>:<file> Use a prior dump file as a text source, to save
486445 pressure on the database.
487 - (Requires the XMLReader extension)
488446 --quiet Don't dump status reports to stderr.
489447 --report=n Report position and speed after every n pages processed.
490448 (Default: 100)

Follow-up revisions

Revision | Commit summary | Author | Date
r81186 | * (bug 27016) Fix for r77638: dumpTextPass.php now consider the "output" para... | ialex | 09:00, 29 January 2011
r82930 | revert most of 77638, use of XMLReader() was causing a 3 to 4-fold slowdown o... | ariel | 11:55, 28 February 2011
r92707 | Revert r85034, r81186, r77638: per CR on r77638: XMLReader and XMLWriter are ... | demon | 23:06, 20 July 2011

Comments

#Comment by 😂 (talk | contribs)   13:09, 3 December 2010

Thank you!

#Comment by Reedy (talk | contribs)   07:35, 29 January 2011

Ping bug 27016

#Comment by IAlex (talk | contribs)   14:50, 29 January 2011

Fixed in r81186.

#Comment by ArielGlenn (talk | contribs)   12:22, 28 February 2011

Is there something that can be done to resolve the speed issues?

#Comment by Catrope (talk | contribs)   22:54, 20 July 2011

Chad is reverting this, might be a bit tricky due to followups.

Status & tagging log