Index: trunk/phase3/maintenance/dumpTextPass.php |
— | — | @@ -2,7 +2,7 @@ |
3 | 3 | /** |
4 | 4 | * Script that postprocesses XML dumps from dumpBackup.php to add page text |
5 | 5 | * |
6 | | - * Copyright © 2005 Brion Vibber <brion@pobox.com>, 2010 Alexandre Emsenhuber |
| 6 | + * Copyright (C) 2005 Brion Vibber <brion@pobox.com> |
7 | 7 | * http://www.mediawiki.org/ |
8 | 8 | * |
9 | 9 | * This program is free software; you can redistribute it and/or modify |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | class TextPassDumper extends BackupDumper { |
37 | 37 | var $prefetch = null; |
38 | 38 | var $input = "php://stdin"; |
| 39 | + var $history = WikiExporter::FULL; |
39 | 40 | var $fetchCount = 0; |
40 | 41 | var $prefetchCount = 0; |
41 | 42 | var $lastTime = 0; |
— | — | @@ -73,12 +74,19 @@ |
74 | 75 | if ( ini_get( 'display_errors' ) ) |
75 | 76 | ini_set( 'display_errors', 'stderr' ); |
76 | 77 | |
77 | | - $this->initProgress( $history ); |
| 78 | + $this->initProgress( $this->history ); |
78 | 79 | |
79 | 80 | $this->db = $this->backupDb(); |
80 | 81 | |
81 | | - $this->readDump(); |
| 82 | + $this->egress = new ExportProgressFilter( $this->sink, $this ); |
82 | 83 | |
| 84 | + $input = fopen( $this->input, "rt" ); |
| 85 | + $result = $this->readDump( $input ); |
| 86 | + |
| 87 | + if ( WikiError::isError( $result ) ) { |
| 88 | + wfDie( $result->getMessage() ); |
| 89 | + } |
| 90 | + |
83 | 91 | if ( $this->spawnProc ) { |
84 | 92 | $this->closeSpawn(); |
85 | 93 | } |
— | — | @@ -98,6 +106,12 @@ |
99 | 107 | case 'stub': |
100 | 108 | $this->input = $url; |
101 | 109 | break; |
| 110 | + case 'current': |
| 111 | + $this->history = WikiExporter::CURRENT; |
| 112 | + break; |
| 113 | + case 'full': |
| 114 | + $this->history = WikiExporter::FULL; |
| 115 | + break; |
102 | 116 | case 'spawn': |
103 | 117 | $this->spawn = true; |
104 | 118 | if ( $val ) { |
— | — | @@ -190,76 +204,34 @@ |
191 | 205 | } |
192 | 206 | } |
193 | 207 | |
194 | | - function readDump() { |
195 | | - $state = ''; |
196 | | - $lastName = ''; |
| 208 | + function readDump( $input ) { |
| 209 | + $this->buffer = ""; |
| 210 | + $this->openElement = false; |
| 211 | + $this->atStart = true; |
| 212 | + $this->state = ""; |
| 213 | + $this->lastName = ""; |
197 | 214 | $this->thisPage = 0; |
198 | 215 | $this->thisRev = 0; |
199 | 216 | |
200 | | - $reader = new XMLReader(); |
201 | | - $reader->open( $this->input ); |
202 | | - $writer = new XMLWriter(); |
203 | | - $writer->openMemory(); |
| 217 | + $parser = xml_parser_create( "UTF-8" ); |
| 218 | + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); |
204 | 219 | |
| 220 | + xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); |
| 221 | + xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); |
205 | 222 | |
206 | | - while ( $reader->read() ) { |
207 | | - $tag = $reader->name; |
208 | | - $type = $reader->nodeType; |
| 223 | + $offset = 0; // for context extraction on error reporting |
| 224 | + $bufferSize = 512 * 1024; |
| 225 | + do { |
| 226 | + $chunk = fread( $input, $bufferSize ); |
| 227 | + if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { |
| 228 | + wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); |
| 229 | + return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); |
| 230 | + } |
| 231 | + $offset += strlen( $chunk ); |
| 232 | + } while ( $chunk !== false && !feof( $input ) ); |
| 233 | + xml_parser_free( $parser ); |
209 | 234 | |
210 | | - if ( $type == XmlReader::END_ELEMENT ) { |
211 | | - $writer->endElement(); |
212 | | - |
213 | | - if ( $tag == 'revision' ) { |
214 | | - $this->revCount(); |
215 | | - $this->thisRev = ''; |
216 | | - } elseif ( $tag == 'page' ) { |
217 | | - $this->reportPage(); |
218 | | - $this->thisPage = ''; |
219 | | - } |
220 | | - } elseif ( $type == XmlReader::ELEMENT ) { |
221 | | - $attribs = array(); |
222 | | - if ( $reader->hasAttributes ) { |
223 | | - for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) { |
224 | | - $attribs[$reader->name] = $reader->value; |
225 | | - } |
226 | | - } |
227 | | - |
228 | | - if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) { |
229 | | - $writer->startElement( 'text' ); |
230 | | - $writer->writeAttribute( 'xml:space', 'preserve' ); |
231 | | - $text = $this->getText( $attribs['id'] ); |
232 | | - if ( strlen( $text ) ) { |
233 | | - $writer->text( $text ); |
234 | | - } |
235 | | - $writer->endElement(); |
236 | | - } else { |
237 | | - $writer->startElement( $tag ); |
238 | | - foreach( $attribs as $name => $val ) { |
239 | | - $writer->writeAttribute( $name, $val ); |
240 | | - } |
241 | | - if ( $reader->isEmptyElement ) { |
242 | | - $writer->endElement(); |
243 | | - } |
244 | | - } |
245 | | - |
246 | | - $lastName = $tag; |
247 | | - if ( $tag == 'revision' ) { |
248 | | - $state = 'revision'; |
249 | | - } elseif ( $tag == 'page' ) { |
250 | | - $state = 'page'; |
251 | | - } |
252 | | - } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type == XMLReader::TEXT ) { |
253 | | - if ( $lastName == 'id' ) { |
254 | | - if ( $state == 'revision' ) { |
255 | | - $this->thisRev .= $reader->value; |
256 | | - } elseif ( $state == 'page' ) { |
257 | | - $this->thisPage .= $reader->value; |
258 | | - } |
259 | | - } |
260 | | - $writer->text( $reader->value ); |
261 | | - } |
262 | | - $this->sink->write( $writer->outputMemory() ); |
263 | | - } |
| 235 | + return true; |
264 | 236 | } |
265 | 237 | |
266 | 238 | function getText( $id ) { |
— | — | @@ -282,6 +254,7 @@ |
283 | 255 | } |
284 | 256 | |
285 | 257 | private function doGetText( $id ) { |
| 258 | + |
286 | 259 | $id = intval( $id ); |
287 | 260 | $this->failures = 0; |
288 | 261 | $ex = new MWException( "Graceful storage failure" ); |
— | — | @@ -469,13 +442,81 @@ |
470 | 443 | $normalized = $wgContLang->normalize( $stripped ); |
471 | 444 | return $normalized; |
472 | 445 | } |
| 446 | + |
| 447 | + function startElement( $parser, $name, $attribs ) { |
| 448 | + $this->clearOpenElement( null ); |
| 449 | + $this->lastName = $name; |
| 450 | + |
| 451 | + if ( $name == 'revision' ) { |
| 452 | + $this->state = $name; |
| 453 | + $this->egress->writeOpenPage( null, $this->buffer ); |
| 454 | + $this->buffer = ""; |
| 455 | + } elseif ( $name == 'page' ) { |
| 456 | + $this->state = $name; |
| 457 | + if ( $this->atStart ) { |
| 458 | + $this->egress->writeOpenStream( $this->buffer ); |
| 459 | + $this->buffer = ""; |
| 460 | + $this->atStart = false; |
| 461 | + } |
| 462 | + } |
| 463 | + |
| 464 | + if ( $name == "text" && isset( $attribs['id'] ) ) { |
| 465 | + $text = $this->getText( $attribs['id'] ); |
| 466 | + $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); |
| 467 | + if ( strlen( $text ) > 0 ) { |
| 468 | + $this->characterData( $parser, $text ); |
| 469 | + } |
| 470 | + } else { |
| 471 | + $this->openElement = array( $name, $attribs ); |
| 472 | + } |
| 473 | + } |
| 474 | + |
| 475 | + function endElement( $parser, $name ) { |
| 476 | + if ( $this->openElement ) { |
| 477 | + $this->clearOpenElement( "" ); |
| 478 | + } else { |
| 479 | + $this->buffer .= "</$name>"; |
| 480 | + } |
| 481 | + |
| 482 | + if ( $name == 'revision' ) { |
| 483 | + $this->egress->writeRevision( null, $this->buffer ); |
| 484 | + $this->buffer = ""; |
| 485 | + $this->thisRev = ""; |
| 486 | + } elseif ( $name == 'page' ) { |
| 487 | + $this->egress->writeClosePage( $this->buffer ); |
| 488 | + $this->buffer = ""; |
| 489 | + $this->thisPage = ""; |
| 490 | + } elseif ( $name == 'mediawiki' ) { |
| 491 | + $this->egress->writeCloseStream( $this->buffer ); |
| 492 | + $this->buffer = ""; |
| 493 | + } |
| 494 | + } |
| 495 | + |
| 496 | + function characterData( $parser, $data ) { |
| 497 | + $this->clearOpenElement( null ); |
| 498 | + if ( $this->lastName == "id" ) { |
| 499 | + if ( $this->state == "revision" ) { |
| 500 | + $this->thisRev .= $data; |
| 501 | + } elseif ( $this->state == "page" ) { |
| 502 | + $this->thisPage .= $data; |
| 503 | + } |
| 504 | + } |
| 505 | + $this->buffer .= htmlspecialchars( $data ); |
| 506 | + } |
| 507 | + |
| 508 | + function clearOpenElement( $style ) { |
| 509 | + if ( $this->openElement ) { |
| 510 | + $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); |
| 511 | + $this->openElement = false; |
| 512 | + } |
| 513 | + } |
473 | 514 | } |
474 | 515 | |
475 | 516 | |
476 | 517 | $dumper = new TextPassDumper( $argv ); |
477 | 518 | |
478 | 519 | if ( !isset( $options['help'] ) ) { |
479 | | - $dumper->dump( WikiExporter::FULL ); |
| 520 | + $dumper->dump( true ); |
480 | 521 | } else { |
481 | 522 | $dumper->progress( <<<ENDS |
482 | 523 | This script postprocesses XML dumps from dumpBackup.php to add |
— | — | @@ -489,12 +530,11 @@ |
490 | 531 | --stub=<type>:<file> To load a compressed stub dump instead of stdin |
491 | 532 | --prefetch=<type>:<file> Use a prior dump file as a text source, to save |
492 | 533 | pressure on the database. |
| 534 | + (Requires the XMLReader extension) |
493 | 535 | --quiet Don't dump status reports to stderr. |
494 | 536 | --report=n Report position and speed after every n pages processed. |
495 | 537 | (Default: 100) |
496 | 538 | --server=h Force reading from MySQL server h |
497 | | - --output=<type>:<file> Write to a file instead of stdout |
498 | | - <type>s: file, gzip, bzip2, 7zip |
499 | 539 | --current Base ETA on number of pages in database instead of all revisions |
500 | 540 | --spawn Spawn a subprocess for loading text records |
501 | 541 | --help Display this help message |