Index: branches/wmf/1.17wmf1/maintenance/dumpTextPass.php |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | class TextPassDumper extends BackupDumper { |
37 | 37 | var $prefetch = null; |
38 | 38 | var $input = "php://stdin"; |
| 39 | + var $history = WikiExporter::FULL; |
39 | 40 | var $fetchCount = 0; |
40 | 41 | var $prefetchCount = 0; |
41 | 42 | |
— | — | @@ -60,12 +61,19 @@ |
61 | 62 | if ( ini_get( 'display_errors' ) ) |
62 | 63 | ini_set( 'display_errors', 'stderr' ); |
63 | 64 | |
64 | | - $this->initProgress( $history ); |
| 65 | + $this->initProgress( $this->history ); |
65 | 66 | |
66 | 67 | $this->db = $this->backupDb(); |
67 | 68 | |
68 | | - $this->readDump(); |
| 69 | + $this->egress = new ExportProgressFilter( $this->sink, $this ); |
69 | 70 | |
| 71 | + $input = fopen( $this->input, "rt" ); |
| 72 | + $result = $this->readDump( $input ); |
| 73 | + |
| 74 | + if ( WikiError::isError( $result ) ) { |
| 75 | + wfDie( $result->getMessage() ); |
| 76 | + } |
| 77 | + |
70 | 78 | if ( $this->spawnProc ) { |
71 | 79 | $this->closeSpawn(); |
72 | 80 | } |
— | — | @@ -85,6 +93,12 @@ |
86 | 94 | case 'stub': |
87 | 95 | $this->input = $url; |
88 | 96 | break; |
| 97 | + case 'current': |
| 98 | + $this->history = WikiExporter::CURRENT; |
| 99 | + break; |
| 100 | + case 'full': |
| 101 | + $this->history = WikiExporter::FULL; |
| 102 | + break; |
89 | 103 | case 'spawn': |
90 | 104 | $this->spawn = true; |
91 | 105 | if ( $val ) { |
— | — | @@ -148,76 +162,34 @@ |
149 | 163 | } |
150 | 164 | } |
151 | 165 | |
152 | | - function readDump() { |
153 | | - $state = ''; |
154 | | - $lastName = ''; |
| 166 | + function readDump( $input ) { |
| 167 | + $this->buffer = ""; |
| 168 | + $this->openElement = false; |
| 169 | + $this->atStart = true; |
| 170 | + $this->state = ""; |
| 171 | + $this->lastName = ""; |
155 | 172 | $this->thisPage = 0; |
156 | 173 | $this->thisRev = 0; |
157 | 174 | |
158 | | - $reader = new XMLReader(); |
159 | | - $reader->open( $this->input ); |
160 | | - $writer = new XMLWriter(); |
161 | | - $writer->openMemory(); |
| 175 | + $parser = xml_parser_create( "UTF-8" ); |
| 176 | + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); |
162 | 177 | |
| 178 | + xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); |
| 179 | + xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); |
163 | 180 | |
164 | | - while ( $reader->read() ) { |
165 | | - $tag = $reader->name; |
166 | | - $type = $reader->nodeType; |
| 181 | + $offset = 0; // for context extraction on error reporting |
| 182 | + $bufferSize = 512 * 1024; |
| 183 | + do { |
| 184 | + $chunk = fread( $input, $bufferSize ); |
| 185 | + if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { |
| 186 | + wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); |
| 187 | + return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); |
| 188 | + } |
| 189 | + $offset += strlen( $chunk ); |
| 190 | + } while ( $chunk !== false && !feof( $input ) ); |
| 191 | + xml_parser_free( $parser ); |
167 | 192 | |
168 | | - if ( $type == XmlReader::END_ELEMENT ) { |
169 | | - $writer->endElement(); |
170 | | - |
171 | | - if ( $tag == 'revision' ) { |
172 | | - $this->revCount(); |
173 | | - $this->thisRev = ''; |
174 | | - } elseif ( $tag == 'page' ) { |
175 | | - $this->reportPage(); |
176 | | - $this->thisPage = ''; |
177 | | - } |
178 | | - } elseif ( $type == XmlReader::ELEMENT ) { |
179 | | - $attribs = array(); |
180 | | - if ( $reader->hasAttributes ) { |
181 | | - for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) { |
182 | | - $attribs[$reader->name] = $reader->value; |
183 | | - } |
184 | | - } |
185 | | - |
186 | | - if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) { |
187 | | - $writer->startElement( 'text' ); |
188 | | - $writer->writeAttribute( 'xml:space', 'preserve' ); |
189 | | - $text = $this->getText( $attribs['id'] ); |
190 | | - if ( strlen( $text ) ) { |
191 | | - $writer->text( $text ); |
192 | | - } |
193 | | - $writer->endElement(); |
194 | | - } else { |
195 | | - $writer->startElement( $tag ); |
196 | | - foreach( $attribs as $name => $val ) { |
197 | | - $writer->writeAttribute( $name, $val ); |
198 | | - } |
199 | | - if ( $reader->isEmptyElement ) { |
200 | | - $writer->endElement(); |
201 | | - } |
202 | | - } |
203 | | - |
204 | | - $lastName = $tag; |
205 | | - if ( $tag == 'revision' ) { |
206 | | - $state = 'revision'; |
207 | | - } elseif ( $tag == 'page' ) { |
208 | | - $state = 'page'; |
209 | | - } |
210 | | - } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type = XMLReader::TEXT ) { |
211 | | - if ( $lastName == 'id' ) { |
212 | | - if ( $state == 'revision' ) { |
213 | | - $this->thisRev .= $reader->value; |
214 | | - } elseif ( $state == 'page' ) { |
215 | | - $this->thisPage .= $reader->value; |
216 | | - } |
217 | | - } |
218 | | - $writer->text( $reader->value ); |
219 | | - } |
220 | | - $this->sink->write( $writer->outputMemory() ); |
221 | | - } |
| 193 | + return true; |
222 | 194 | } |
223 | 195 | |
224 | 196 | function getText( $id ) { |
— | — | @@ -240,6 +212,7 @@ |
241 | 213 | } |
242 | 214 | |
243 | 215 | private function doGetText( $id ) { |
| 216 | + |
244 | 217 | $id = intval( $id ); |
245 | 218 | $this->failures = 0; |
246 | 219 | $ex = new MWException( "Graceful storage failure" ); |
— | — | @@ -427,13 +400,81 @@ |
428 | 401 | $normalized = $wgContLang->normalize( $stripped ); |
429 | 402 | return $normalized; |
430 | 403 | } |
| 404 | + |
| 405 | + function startElement( $parser, $name, $attribs ) { |
| 406 | + $this->clearOpenElement( null ); |
| 407 | + $this->lastName = $name; |
| 408 | + |
| 409 | + if ( $name == 'revision' ) { |
| 410 | + $this->state = $name; |
| 411 | + $this->egress->writeOpenPage( null, $this->buffer ); |
| 412 | + $this->buffer = ""; |
| 413 | + } elseif ( $name == 'page' ) { |
| 414 | + $this->state = $name; |
| 415 | + if ( $this->atStart ) { |
| 416 | + $this->egress->writeOpenStream( $this->buffer ); |
| 417 | + $this->buffer = ""; |
| 418 | + $this->atStart = false; |
| 419 | + } |
| 420 | + } |
| 421 | + |
| 422 | + if ( $name == "text" && isset( $attribs['id'] ) ) { |
| 423 | + $text = $this->getText( $attribs['id'] ); |
| 424 | + $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); |
| 425 | + if ( strlen( $text ) > 0 ) { |
| 426 | + $this->characterData( $parser, $text ); |
| 427 | + } |
| 428 | + } else { |
| 429 | + $this->openElement = array( $name, $attribs ); |
| 430 | + } |
| 431 | + } |
| 432 | + |
| 433 | + function endElement( $parser, $name ) { |
| 434 | + if ( $this->openElement ) { |
| 435 | + $this->clearOpenElement( "" ); |
| 436 | + } else { |
| 437 | + $this->buffer .= "</$name>"; |
| 438 | + } |
| 439 | + |
| 440 | + if ( $name == 'revision' ) { |
| 441 | + $this->egress->writeRevision( null, $this->buffer ); |
| 442 | + $this->buffer = ""; |
| 443 | + $this->thisRev = ""; |
| 444 | + } elseif ( $name == 'page' ) { |
| 445 | + $this->egress->writeClosePage( $this->buffer ); |
| 446 | + $this->buffer = ""; |
| 447 | + $this->thisPage = ""; |
| 448 | + } elseif ( $name == 'mediawiki' ) { |
| 449 | + $this->egress->writeCloseStream( $this->buffer ); |
| 450 | + $this->buffer = ""; |
| 451 | + } |
| 452 | + } |
| 453 | + |
| 454 | + function characterData( $parser, $data ) { |
| 455 | + $this->clearOpenElement( null ); |
| 456 | + if ( $this->lastName == "id" ) { |
| 457 | + if ( $this->state == "revision" ) { |
| 458 | + $this->thisRev .= $data; |
| 459 | + } elseif ( $this->state == "page" ) { |
| 460 | + $this->thisPage .= $data; |
| 461 | + } |
| 462 | + } |
| 463 | + $this->buffer .= htmlspecialchars( $data ); |
| 464 | + } |
| 465 | + |
| 466 | + function clearOpenElement( $style ) { |
| 467 | + if ( $this->openElement ) { |
| 468 | + $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); |
| 469 | + $this->openElement = false; |
| 470 | + } |
| 471 | + } |
431 | 472 | } |
432 | 473 | |
433 | 474 | |
434 | 475 | $dumper = new TextPassDumper( $argv ); |
435 | 476 | |
436 | 477 | if ( !isset( $options['help'] ) ) { |
437 | | - $dumper->dump( WikiExporter::FULL ); |
| 478 | + $dumper->dump( true ); |
438 | 479 | } else { |
439 | 480 | $dumper->progress( <<<ENDS |
440 | 481 | This script postprocesses XML dumps from dumpBackup.php to add |
— | — | @@ -447,6 +488,7 @@ |
448 | 489 | --stub=<type>:<file> To load a compressed stub dump instead of stdin |
449 | 490 | --prefetch=<type>:<file> Use a prior dump file as a text source, to save |
450 | 491 | pressure on the database. |
| 492 | + (Requires the XMLReader extension) |
451 | 493 | --quiet Don't dump status reports to stderr. |
452 | 494 | --report=n Report position and speed after every n pages processed. |
453 | 495 | (Default: 100) |
— | — | @@ -459,5 +501,3 @@ |
460 | 502 | ENDS |
461 | 503 | ); |
462 | 504 | } |
463 | | - |
464 | | - |