Index: trunk/phase3/maintenance/dumpTextPass.php |
— | — | @@ -59,10 +59,23 @@ |
60 | 60 | |
61 | 61 | var $ID = 0; |
62 | 62 | |
// XmlDumpWriter instance created in dump(); used by endElement() to
// produce the stream close/open markup around a checkpoint.
var $xmlwriterobj = false;

// Checkpointing: once more than $maxTimeAllowed seconds have passed since
// the last checkpoint, processing continues until the current page has been
// written out completely; the output file(s) are then closed, renamed, and
// fresh one(s) opened.
// NOTE(review): $timeOfCheckpoint is assigned in initProgress() but no
// declaration for it is visible in this section - confirm it is declared.
var $maxTimeAllowed = 0; // seconds; 0 = no limit
var $timeExceeded = false; // set by setTimeExceeded(), cleared after a checkpoint
var $firstPageWritten = false; // trimmed page text of the first page closed since the last checkpoint
var $lastPageWritten = false; // trimmed page text of the most recently closed page
var $checkpointJustWritten = false; // true until the next start tag, end tag or character data is handled
var $checkpointFiles = array(); // one filename pattern per 'checkpointfile' option
| 74 | + |
/**
 * Initialise progress tracking for this run.
 *
 * Records the current process id and seeds both the last-report time and
 * the time of the last checkpoint from the start time.
 *
 * @param $history Not used in this method; it is not forwarded to the parent.
 */
function initProgress( $history ) {
	parent::initProgress();

	$this->ID = getmypid();

	// Both timers start from the moment the run started.
	$runStart = $this->startTime;
	$this->lastTime = $runStart;
	$this->timeOfCheckpoint = $runStart;
}
68 | 81 | |
69 | 82 | function dump( $history, $text = WikiExporter::TEXT ) { |
— | — | @@ -80,6 +93,12 @@ |
81 | 94 | |
82 | 95 | $this->egress = new ExportProgressFilter( $this->sink, $this ); |
83 | 96 | |
| 97 | + # it would be nice to do it in the constructor, oh well. need egress set |
| 98 | + $this->finalOptionCheck(); |
| 99 | + |
| 100 | + # we only want this so we know how to close a stream :-P |
| 101 | + $this->xmlwriterobj = new XmlDumpWriter(); |
| 102 | + |
84 | 103 | $input = fopen( $this->input, "rt" ); |
85 | 104 | $result = $this->readDump( $input ); |
86 | 105 | |
— | — | @@ -106,6 +125,12 @@ |
107 | 126 | case 'stub': |
108 | 127 | $this->input = $url; |
109 | 128 | break; |
| 129 | + case 'maxtime': |
| 130 | + $this->maxTimeAllowed = intval($val)*60; |
| 131 | + break; |
| 132 | + case 'checkpointfile': |
| 133 | + $this->checkpointFiles[] = $val; |
| 134 | + break; |
110 | 135 | case 'current': |
111 | 136 | $this->history = WikiExporter::CURRENT; |
112 | 137 | break; |
— | — | @@ -204,6 +229,39 @@ |
205 | 230 | } |
206 | 231 | } |
207 | 232 | |
/**
 * Flag that the time limit has been passed, so that endElement() writes a
 * checkpoint when the current page is closed.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
| 236 | + |
/**
 * Tell whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded progress time.
 *
 * @return bool Always false when no limit is configured (maxTimeAllowed is 0).
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;

	return $sinceCheckpoint > $this->maxTimeAllowed;
}
| 243 | + |
/**
 * Validate the checkpoint-related options once the output sink exists.
 *
 * Calls wfDie() with an explanatory message when:
 *  - only one of maxtime / checkpointfile was given,
 *  - a checkpoint filename pattern does not contain exactly two '%s'
 *    placeholders (first and last page id), or
 *  - the number of checkpoint patterns differs from the number of
 *    output files reported by the egress filter.
 *
 * The output-file count is only compared when checkpoint files were
 * actually requested; a run that uses neither option must not be rejected
 * for having "too few" checkpoint files.
 *
 * NOTE(review): the usage text describes the second '%s' as optional, but
 * this check requires exactly two - confirm which is intended.
 */
function finalOptionCheck() {
	if ( ( $this->checkpointFiles && !$this->maxTimeAllowed ) ||
		( $this->maxTimeAllowed && !$this->checkpointFiles ) ) {
		wfDie( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count != 2 ) {
			wfDie( "Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, file is $checkpointFile.\n" );
		}
	}

	if ( $this->checkpointFiles ) {
		// getFilename() may hand back a single name or a list of names;
		// normalise to a list before counting.
		$filenameList = $this->egress->getFilename();
		if ( !is_array( $filenameList ) ) {
			$filenameList = array( $filenameList );
		}
		if ( count( $filenameList ) != count( $this->checkpointFiles ) ) {
			wfDie( "One checkpointfile must be specified for each output option, if maxtime is used.\n" );
		}
	}
}
| 265 | + |
208 | 266 | function readDump( $input ) { |
209 | 267 | $this->buffer = ""; |
210 | 268 | $this->openElement = false; |
— | — | @@ -222,6 +280,9 @@ |
223 | 281 | $offset = 0; // for context extraction on error reporting |
224 | 282 | $bufferSize = 512 * 1024; |
225 | 283 | do { |
| 284 | + if ($this->checkIfTimeExceeded()) { |
| 285 | + $this->setTimeExceeded(); |
| 286 | + } |
226 | 287 | $chunk = fread( $input, $bufferSize ); |
227 | 288 | if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { |
228 | 289 | wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); |
— | — | @@ -229,6 +290,24 @@ |
230 | 291 | } |
231 | 292 | $offset += strlen( $chunk ); |
232 | 293 | } while ( $chunk !== false && !feof( $input ) ); |
| 294 | + if ($this->maxTimeAllowed) { |
| 295 | + $filenameList = $this->egress->getFilename(); |
| 296 | + # we wrote some stuff after last checkpoint that needs renamed */ |
| 297 | + if (! is_array($filenameList)) { |
| 298 | + $filenameList = array( $filenameList ); |
| 299 | + } |
| 300 | + if (file_exists($filenameList[0])) { |
| 301 | + $newFilenames = array(); |
| 302 | + $firstPageID = str_pad($this->firstPageWritten,9,"0",STR_PAD_LEFT); |
| 303 | + $lastPageID = str_pad($this->lastPageWritten,9,"0",STR_PAD_LEFT); |
| 304 | + for ($i =0; $i < count($filenameList); $i++) { |
| 305 | + $checkpointNameFilledIn = sprintf($this->checkpointFiles[$i], $firstPageID, $lastPageID); |
| 306 | + $fileinfo = pathinfo($filenameList[$i]); |
| 307 | + $newFilenames[] = $fileinfo{'dirname'} . '/' . $checkpointNameFilledIn; |
| 308 | + } |
| 309 | + $this->egress->rename( $newFilenames ); |
| 310 | + } |
| 311 | + } |
233 | 312 | xml_parser_free( $parser ); |
234 | 313 | |
235 | 314 | return true; |
— | — | @@ -444,6 +523,8 @@ |
445 | 524 | } |
446 | 525 | |
447 | 526 | function startElement( $parser, $name, $attribs ) { |
| 527 | + $this->checkpointJustWritten = false; |
| 528 | + |
448 | 529 | $this->clearOpenElement( null ); |
449 | 530 | $this->lastName = $name; |
450 | 531 | |
— | — | @@ -472,6 +553,8 @@ |
473 | 554 | } |
474 | 555 | |
475 | 556 | function endElement( $parser, $name ) { |
| 557 | + $this->checkpointJustWritten = false; |
| 558 | + |
476 | 559 | if ( $this->openElement ) { |
477 | 560 | $this->clearOpenElement( "" ); |
478 | 561 | } else { |
— | — | @@ -483,9 +566,49 @@ |
484 | 567 | $this->buffer = ""; |
485 | 568 | $this->thisRev = ""; |
486 | 569 | } elseif ( $name == 'page' ) { |
487 | | - $this->egress->writeClosePage( $this->buffer ); |
488 | | - $this->buffer = ""; |
489 | | - $this->thisPage = ""; |
| 570 | + if (! $this->firstPageWritten) { |
| 571 | + $this->firstPageWritten = trim($this->thisPage); |
| 572 | + } |
| 573 | + $this->lastPageWritten = trim($this->thisPage); |
| 574 | + if ($this->timeExceeded) { |
| 575 | + $this->egress->writeClosePage( $this->buffer ); |
| 576 | + # nasty hack, we can't just write the chardata after the |
| 577 | + # page tag, it will include leading blanks from the next line |
| 578 | + $this->egress->sink->write("\n"); |
| 579 | + |
| 580 | + $this->buffer = $this->xmlwriterobj->closeStream(); |
| 581 | + $this->egress->writeCloseStream( $this->buffer ); |
| 582 | + |
| 583 | + $this->buffer = ""; |
| 584 | + $this->thisPage = ""; |
| 585 | + /* this could be more than one file if we had more than one output arg */ |
| 586 | + $checkpointFilenames = array(); |
| 587 | + $filenameList = $this->egress->getFilename(); |
| 588 | + |
| 589 | + if (! is_array($filenameList)) { |
| 590 | + $filenameList = array( $filenameList ); |
| 591 | + } |
| 592 | + $newFilenames = array(); |
| 593 | + $firstPageID = str_pad($this->firstPageWritten,9,"0",STR_PAD_LEFT); |
| 594 | + $lastPageID = str_pad($this->lastPageWritten,9,"0",STR_PAD_LEFT); |
| 595 | + for ($i =0; $i < count($filenameList); $i++) { |
| 596 | + $checkpointNameFilledIn = sprintf($this->checkpointFiles[$i], $firstPageID, $lastPageID); |
| 597 | + $fileinfo = pathinfo($filenameList[$i]); |
| 598 | + $newFilenames[] = $fileinfo{'dirname'} . '/' . $checkpointNameFilledIn; |
| 599 | + } |
| 600 | + $this->egress->closeRenameAndReopen( $newFilenames ); |
| 601 | + $this->buffer = $this->xmlwriterobj->openStream(); |
| 602 | + $this->timeExceeded = false; |
| 603 | + $this->timeOfCheckpoint = $this->lastTime; |
| 604 | + $this->firstPageWritten = false; |
| 605 | + $this->checkpointJustWritten = true; |
| 606 | + } |
| 607 | + else { |
| 608 | + $this->egress->writeClosePage( $this->buffer ); |
| 609 | + $this->buffer = ""; |
| 610 | + $this->thisPage = ""; |
| 611 | + } |
| 612 | + |
490 | 613 | } elseif ( $name == 'mediawiki' ) { |
491 | 614 | $this->egress->writeCloseStream( $this->buffer ); |
492 | 615 | $this->buffer = ""; |
— | — | @@ -501,6 +624,14 @@ |
502 | 625 | $this->thisPage .= $data; |
503 | 626 | } |
504 | 627 | } |
| 628 | + # have to skip the newline left over from closepagetag line of |
| 629 | + # end of checkpoint files. nasty hack!! |
| 630 | + if ($this->checkpointJustWritten) { |
| 631 | + if ($data[0] == "\n") { |
| 632 | + $data = substr($data,1); |
| 633 | + } |
| 634 | + $this->checkpointJustWritten = false; |
| 635 | + } |
505 | 636 | $this->buffer .= htmlspecialchars( $data ); |
506 | 637 | } |
507 | 638 | |
— | — | @@ -531,6 +662,12 @@ |
532 | 663 | --prefetch=<type>:<file> Use a prior dump file as a text source, to save |
533 | 664 | pressure on the database. |
534 | 665 | (Requires the XMLReader extension) |
| 666 | + --maxtime=<minutes> Write out checkpoint file after this many minutes (writing |
| 667 | + out complete page, closing xml file properly, and opening new one |
| 668 | + with header). This option requires the checkpointfile option. |
| 669 | + --checkpointfile=<filenamepattern> Use this string for checkpoint filenames, |
| 670 | + substituting first pageid written for the first %s (required) and the |
| 671 | + last pageid written for the second %s if it exists. |
535 | 672 | --quiet Don't dump status reports to stderr. |
536 | 673 | --report=n Report position and speed after every n pages processed. |
537 | 674 | (Default: 100) |