Index: trunk/phase3/maintenance/dumpTextPass.php |
— | — | @@ -2,7 +2,7 @@ |
3 | 3 | /** |
4 | 4 | * Script that postprocesses XML dumps from dumpBackup.php to add page text |
5 | 5 | * |
6 | | - * Copyright (C) 2005 Brion Vibber <brion@pobox.com> |
| 6 | + * Copyright © 2005 Brion Vibber <brion@pobox.com>, 2010 Alexandre Emsenhuber |
7 | 7 | * http://www.mediawiki.org/ |
8 | 8 | * |
9 | 9 | * This program is free software; you can redistribute it and/or modify |
— | — | @@ -35,7 +35,6 @@ |
36 | 36 | class TextPassDumper extends BackupDumper { |
37 | 37 | var $prefetch = null; |
38 | 38 | var $input = "php://stdin"; |
39 | | - var $history = WikiExporter::FULL; |
40 | 39 | var $fetchCount = 0; |
41 | 40 | var $prefetchCount = 0; |
42 | 41 | |
— | — | @@ -61,19 +60,12 @@ |
62 | 61 | if ( ini_get( 'display_errors' ) ) |
63 | 62 | ini_set( 'display_errors', 'stderr' ); |
64 | 63 | |
65 | | - $this->initProgress( $this->history ); |
| 64 | + $this->initProgress( $history ); |
66 | 65 | |
67 | 66 | $this->db = $this->backupDb(); |
68 | 67 | |
69 | | - $this->egress = new ExportProgressFilter( $this->sink, $this ); |
| 68 | + $this->readDump(); |
70 | 69 | |
71 | | - $input = fopen( $this->input, "rt" ); |
72 | | - $result = $this->readDump( $input ); |
73 | | - |
74 | | - if ( WikiError::isError( $result ) ) { |
75 | | - wfDie( $result->getMessage() ); |
76 | | - } |
77 | | - |
78 | 70 | if ( $this->spawnProc ) { |
79 | 71 | $this->closeSpawn(); |
80 | 72 | } |
— | — | @@ -93,12 +85,6 @@ |
94 | 86 | case 'stub': |
95 | 87 | $this->input = $url; |
96 | 88 | break; |
97 | | - case 'current': |
98 | | - $this->history = WikiExporter::CURRENT; |
99 | | - break; |
100 | | - case 'full': |
101 | | - $this->history = WikiExporter::FULL; |
102 | | - break; |
103 | 89 | case 'spawn': |
104 | 90 | $this->spawn = true; |
105 | 91 | if ( $val ) { |
— | — | @@ -152,34 +138,76 @@ |
153 | 139 | } |
154 | 140 | } |
155 | 141 | |
156 | | - function readDump( $input ) { |
157 | | - $this->buffer = ""; |
158 | | - $this->openElement = false; |
159 | | - $this->atStart = true; |
160 | | - $this->state = ""; |
161 | | - $this->lastName = ""; |
| 142 | + function readDump() { |
| 143 | + $state = ''; |
| 144 | + $lastName = ''; |
162 | 145 | $this->thisPage = 0; |
163 | 146 | $this->thisRev = 0; |
164 | 147 | |
165 | | - $parser = xml_parser_create( "UTF-8" ); |
166 | | - xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); |
| 148 | + $reader = new XMLReader(); |
| 149 | + $reader->open( $this->input ); |
| 150 | + $writer = new XMLWriter(); |
| 151 | + $writer->openURI( 'php://stdout' ); |
167 | 152 | |
168 | | - xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); |
169 | | - xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); |
170 | 153 | |
171 | | - $offset = 0; // for context extraction on error reporting |
172 | | - $bufferSize = 512 * 1024; |
173 | | - do { |
174 | | - $chunk = fread( $input, $bufferSize ); |
175 | | - if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { |
176 | | - wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); |
177 | | - return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); |
178 | | - } |
179 | | - $offset += strlen( $chunk ); |
180 | | - } while ( $chunk !== false && !feof( $input ) ); |
181 | | - xml_parser_free( $parser ); |
| 154 | + while ( $reader->read() ) { |
| 155 | + $tag = $reader->name; |
| 156 | + $type = $reader->nodeType; |
182 | 157 | |
183 | | - return true; |
| 158 | + if ( $type == XmlReader::END_ELEMENT ) { |
| 159 | + $writer->endElement(); |
| 160 | + |
| 161 | + if ( $tag == 'revision' ) { |
| 162 | + $this->revCount(); |
| 163 | + $this->thisRev = ''; |
| 164 | + } elseif ( $tag == 'page' ) { |
| 165 | + $this->reportPage(); |
| 166 | + $this->thisPage = ''; |
| 167 | + } |
| 168 | + } elseif ( $type == XmlReader::ELEMENT ) { |
| 169 | + $attribs = array(); |
| 170 | + // Reading attributes moves the cursor onto them; go back to the element afterwards so isEmptyElement stays accurate |
| 171 | + while ( $reader->moveToNextAttribute() ) { |
| 172 | + $attribs[$reader->name] = $reader->value; |
| 173 | + } |
| 174 | + $reader->moveToElement(); |
| 175 | + |
| 176 | + if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) { |
| 177 | + $writer->startElement( 'text' ); |
| 178 | + $writer->writeAttribute( 'xml:space', 'preserve' ); |
| 179 | + $text = $this->getText( $attribs['id'] ); |
| 180 | + if ( strlen( $text ) ) { |
| 181 | + $writer->text( $text ); |
| 182 | + } |
| 183 | + $writer->endElement(); |
| 184 | + } else { |
| 185 | + $writer->startElement( $tag ); |
| 186 | + foreach( $attribs as $name => $val ) { |
| 187 | + $writer->writeAttribute( $name, $val ); |
| 188 | + } |
| 189 | + if ( $reader->isEmptyElement ) { |
| 190 | + $writer->endElement(); |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + $lastName = $tag; |
| 195 | + if ( $tag == 'revision' ) { |
| 196 | + $state = 'revision'; |
| 197 | + } elseif ( $tag == 'page' ) { |
| 198 | + $state = 'page'; |
| 199 | + } |
| 200 | + } elseif ( $type == XMLReader::TEXT || $type == XMLReader::WHITESPACE || $type == XMLReader::SIGNIFICANT_WHITESPACE ) { |
| 201 | + if ( $lastName == 'id' ) { |
| 202 | + if ( $state == 'revision' ) { |
| 203 | + $this->thisRev .= $reader->value; |
| 204 | + } elseif ( $state == 'page' ) { |
| 205 | + $this->thisPage .= $reader->value; |
| 206 | + } |
| 207 | + } |
| 208 | + $writer->text( $reader->value ); |
| 209 | + } |
| 210 | + } |
| 211 | + $writer->flush(); |
184 | 212 | } |
185 | 213 | |
186 | 214 | function getText( $id ) { |
— | — | @@ -207,7 +235,6 @@ |
208 | 236 | } |
209 | 237 | |
210 | 238 | private function doGetText( $id ) { |
211 | | - |
212 | 239 | $id = intval( $id ); |
213 | 240 | $this->failures = 0; |
214 | 241 | $ex = new MWException( "Graceful storage failure" ); |
— | — | @@ -395,81 +422,13 @@ |
396 | 423 | $normalized = $wgContLang->normalize( $stripped ); |
397 | 424 | return $normalized; |
398 | 425 | } |
399 | | - |
400 | | - function startElement( $parser, $name, $attribs ) { |
401 | | - $this->clearOpenElement( null ); |
402 | | - $this->lastName = $name; |
403 | | - |
404 | | - if ( $name == 'revision' ) { |
405 | | - $this->state = $name; |
406 | | - $this->egress->writeOpenPage( null, $this->buffer ); |
407 | | - $this->buffer = ""; |
408 | | - } elseif ( $name == 'page' ) { |
409 | | - $this->state = $name; |
410 | | - if ( $this->atStart ) { |
411 | | - $this->egress->writeOpenStream( $this->buffer ); |
412 | | - $this->buffer = ""; |
413 | | - $this->atStart = false; |
414 | | - } |
415 | | - } |
416 | | - |
417 | | - if ( $name == "text" && isset( $attribs['id'] ) ) { |
418 | | - $text = $this->getText( $attribs['id'] ); |
419 | | - $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); |
420 | | - if ( strlen( $text ) > 0 ) { |
421 | | - $this->characterData( $parser, $text ); |
422 | | - } |
423 | | - } else { |
424 | | - $this->openElement = array( $name, $attribs ); |
425 | | - } |
426 | | - } |
427 | | - |
428 | | - function endElement( $parser, $name ) { |
429 | | - if ( $this->openElement ) { |
430 | | - $this->clearOpenElement( "" ); |
431 | | - } else { |
432 | | - $this->buffer .= "</$name>"; |
433 | | - } |
434 | | - |
435 | | - if ( $name == 'revision' ) { |
436 | | - $this->egress->writeRevision( null, $this->buffer ); |
437 | | - $this->buffer = ""; |
438 | | - $this->thisRev = ""; |
439 | | - } elseif ( $name == 'page' ) { |
440 | | - $this->egress->writeClosePage( $this->buffer ); |
441 | | - $this->buffer = ""; |
442 | | - $this->thisPage = ""; |
443 | | - } elseif ( $name == 'mediawiki' ) { |
444 | | - $this->egress->writeCloseStream( $this->buffer ); |
445 | | - $this->buffer = ""; |
446 | | - } |
447 | | - } |
448 | | - |
449 | | - function characterData( $parser, $data ) { |
450 | | - $this->clearOpenElement( null ); |
451 | | - if ( $this->lastName == "id" ) { |
452 | | - if ( $this->state == "revision" ) { |
453 | | - $this->thisRev .= $data; |
454 | | - } elseif ( $this->state == "page" ) { |
455 | | - $this->thisPage .= $data; |
456 | | - } |
457 | | - } |
458 | | - $this->buffer .= htmlspecialchars( $data ); |
459 | | - } |
460 | | - |
461 | | - function clearOpenElement( $style ) { |
462 | | - if ( $this->openElement ) { |
463 | | - $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); |
464 | | - $this->openElement = false; |
465 | | - } |
466 | | - } |
467 | 426 | } |
468 | 427 | |
469 | 428 | |
470 | 429 | $dumper = new TextPassDumper( $argv ); |
471 | 430 | |
472 | 431 | if ( !isset( $options['help'] ) ) { |
473 | | - $dumper->dump( true ); |
| 432 | + $dumper->dump( WikiExporter::FULL ); |
474 | 433 | } else { |
475 | 434 | $dumper->progress( <<<ENDS |
476 | 435 | This script postprocesses XML dumps from dumpBackup.php to add |
— | — | @@ -483,7 +442,6 @@ |
484 | 443 | --stub=<type>:<file> To load a compressed stub dump instead of stdin |
485 | 444 | --prefetch=<type>:<file> Use a prior dump file as a text source, to save |
486 | 445 | pressure on the database. |
487 | | - (Requires the XMLReader extension) |
488 | 446 | --quiet Don't dump status reports to stderr. |
489 | 447 | --report=n Report position and speed after every n pages processed. |
490 | 448 | (Default: 100) |