r95272 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r95271 | r95272 | r95273 >
Date: 22:45, 22 August 2011
Author: ariel
Status: ok (Comments)
Tags:
Comment:
add support for writing out checkpoint files of xml dump at regular intervals (close and rename file based on filename pattern which includes first and last page id written)
Modified paths:
  • /trunk/phase3/maintenance/dumpTextPass.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/dumpTextPass.php
@@ -59,10 +59,23 @@
6060
6161 var $ID = 0;
6262
 63+ var $xmlwriterobj = false;
 64+
 65+ # when we spend more than maxTimeAllowed seconds on this run, we continue
 66+ # processing until we write out the next complete page, then save output file(s),
 67+ # rename it/them and open new one(s)
 68+ var $maxTimeAllowed = 0; // 0 = no limit
 69+ var $timeExceeded = false;
 70+ var $firstPageWritten = false;
 71+ var $lastPageWritten = false;
 72+ var $checkpointJustWritten = false;
 73+ var $checkpointFiles = array();
 74+
6375 function initProgress( $history ) {
6476 parent::initProgress();
6577 $this->ID = getmypid();
6678 $this->lastTime = $this->startTime;
 79+ $this->timeOfCheckpoint = $this->startTime;
6780 }
6881
6982 function dump( $history, $text = WikiExporter::TEXT ) {
@@ -80,6 +93,12 @@
8194
8295 $this->egress = new ExportProgressFilter( $this->sink, $this );
8396
 97+ # it would be nice to do it in the constructor, oh well. need egress set
 98+ $this->finalOptionCheck();
 99+
 100+ # we only want this so we know how to close a stream :-P
 101+ $this->xmlwriterobj = new XmlDumpWriter();
 102+
84103 $input = fopen( $this->input, "rt" );
85104 $result = $this->readDump( $input );
86105
@@ -106,6 +125,12 @@
107126 case 'stub':
108127 $this->input = $url;
109128 break;
 129+ case 'maxtime':
 130+ $this->maxTimeAllowed = intval($val)*60;
 131+ break;
 132+ case 'checkpointfile':
 133+ $this->checkpointFiles[] = $val;
 134+ break;
110135 case 'current':
111136 $this->history = WikiExporter::CURRENT;
112137 break;
@@ -204,6 +229,39 @@
205230 }
206231 }
207232
 233+ function setTimeExceeded() {
 234+ $this->timeExceeded = True;
 235+ }
 236+
 237+ function checkIfTimeExceeded() {
 238+ if ( $this->maxTimeAllowed && ( $this->lastTime - $this->timeOfCheckpoint > $this->maxTimeAllowed ) ) {
 239+ return True;
 240+ }
 241+ return False;
 242+ }
 243+
 244+ function finalOptionCheck() {
 245+ if (($this->checkpointFiles && ! $this->maxTimeAllowed) ||
 246+ ($this->maxTimeAllowed && !$this->checkpointFiles)) {
 247+ wfDie("Options checkpointfile and maxtime must be specified together.\n");
 248+ }
 249+ foreach ($this->checkpointFiles as $checkpointFile) {
 250+ $count = substr_count ($checkpointFile,"%s");
 251+ if (substr_count ($checkpointFile,"%s") != 2) {
 252+ wfDie("Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, fil
 253+e is $checkpointFile.\n");
 254+ }
 255+ }
 256+
 257+ $filenameList = $this->egress->getFilename();
 258+ if (! is_array($filenameList)) {
 259+ $filenameList = array( $filenameList );
 260+ }
 261+ if (count($filenameList) != count($this->checkpointFiles)) {
 262+ wfDie("One checkpointfile must be specified for each output option, if maxtime is used.\n");
 263+ }
 264+ }
 265+
208266 function readDump( $input ) {
209267 $this->buffer = "";
210268 $this->openElement = false;
@@ -222,6 +280,9 @@
223281 $offset = 0; // for context extraction on error reporting
224282 $bufferSize = 512 * 1024;
225283 do {
 284+ if ($this->checkIfTimeExceeded()) {
 285+ $this->setTimeExceeded();
 286+ }
226287 $chunk = fread( $input, $bufferSize );
227288 if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
228289 wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
@@ -229,6 +290,24 @@
230291 }
231292 $offset += strlen( $chunk );
232293 } while ( $chunk !== false && !feof( $input ) );
 294+ if ($this->maxTimeAllowed) {
 295+ $filenameList = $this->egress->getFilename();
 296+ # we wrote some stuff after last checkpoint that needs renamed */
 297+ if (! is_array($filenameList)) {
 298+ $filenameList = array( $filenameList );
 299+ }
 300+ if (file_exists($filenameList[0])) {
 301+ $newFilenames = array();
 302+ $firstPageID = str_pad($this->firstPageWritten,9,"0",STR_PAD_LEFT);
 303+ $lastPageID = str_pad($this->lastPageWritten,9,"0",STR_PAD_LEFT);
 304+ for ($i =0; $i < count($filenameList); $i++) {
 305+ $checkpointNameFilledIn = sprintf($this->checkpointFiles[$i], $firstPageID, $lastPageID);
 306+ $fileinfo = pathinfo($filenameList[$i]);
 307+ $newFilenames[] = $fileinfo{'dirname'} . '/' . $checkpointNameFilledIn;
 308+ }
 309+ $this->egress->rename( $newFilenames );
 310+ }
 311+ }
233312 xml_parser_free( $parser );
234313
235314 return true;
@@ -444,6 +523,8 @@
445524 }
446525
447526 function startElement( $parser, $name, $attribs ) {
 527+ $this->checkpointJustWritten = false;
 528+
448529 $this->clearOpenElement( null );
449530 $this->lastName = $name;
450531
@@ -472,6 +553,8 @@
473554 }
474555
475556 function endElement( $parser, $name ) {
 557+ $this->checkpointJustWritten = false;
 558+
476559 if ( $this->openElement ) {
477560 $this->clearOpenElement( "" );
478561 } else {
@@ -483,9 +566,49 @@
484567 $this->buffer = "";
485568 $this->thisRev = "";
486569 } elseif ( $name == 'page' ) {
487 - $this->egress->writeClosePage( $this->buffer );
488 - $this->buffer = "";
489 - $this->thisPage = "";
 570+ if (! $this->firstPageWritten) {
 571+ $this->firstPageWritten = trim($this->thisPage);
 572+ }
 573+ $this->lastPageWritten = trim($this->thisPage);
 574+ if ($this->timeExceeded) {
 575+ $this->egress->writeClosePage( $this->buffer );
 576+ # nasty hack, we can't just write the chardata after the
 577+ # page tag, it will include leading blanks from the next line
 578+ $this->egress->sink->write("\n");
 579+
 580+ $this->buffer = $this->xmlwriterobj->closeStream();
 581+ $this->egress->writeCloseStream( $this->buffer );
 582+
 583+ $this->buffer = "";
 584+ $this->thisPage = "";
 585+ /* this could be more than one file if we had more than one output arg */
 586+ $checkpointFilenames = array();
 587+ $filenameList = $this->egress->getFilename();
 588+
 589+ if (! is_array($filenameList)) {
 590+ $filenameList = array( $filenameList );
 591+ }
 592+ $newFilenames = array();
 593+ $firstPageID = str_pad($this->firstPageWritten,9,"0",STR_PAD_LEFT);
 594+ $lastPageID = str_pad($this->lastPageWritten,9,"0",STR_PAD_LEFT);
 595+ for ($i =0; $i < count($filenameList); $i++) {
 596+ $checkpointNameFilledIn = sprintf($this->checkpointFiles[$i], $firstPageID, $lastPageID);
 597+ $fileinfo = pathinfo($filenameList[$i]);
 598+ $newFilenames[] = $fileinfo{'dirname'} . '/' . $checkpointNameFilledIn;
 599+ }
 600+ $this->egress->closeRenameAndReopen( $newFilenames );
 601+ $this->buffer = $this->xmlwriterobj->openStream();
 602+ $this->timeExceeded = false;
 603+ $this->timeOfCheckpoint = $this->lastTime;
 604+ $this->firstPageWritten = false;
 605+ $this->checkpointJustWritten = true;
 606+ }
 607+ else {
 608+ $this->egress->writeClosePage( $this->buffer );
 609+ $this->buffer = "";
 610+ $this->thisPage = "";
 611+ }
 612+
490613 } elseif ( $name == 'mediawiki' ) {
491614 $this->egress->writeCloseStream( $this->buffer );
492615 $this->buffer = "";
@@ -501,6 +624,14 @@
502625 $this->thisPage .= $data;
503626 }
504627 }
 628+ # have to skip the newline left over from closepagetag line of
 629+ # end of checkpoint files. nasty hack!!
 630+ if ($this->checkpointJustWritten) {
 631+ if ($data[0] == "\n") {
 632+ $data = substr($data,1);
 633+ }
 634+ $this->checkpointJustWritten = false;
 635+ }
505636 $this->buffer .= htmlspecialchars( $data );
506637 }
507638
@@ -531,6 +662,12 @@
532663 --prefetch=<type>:<file> Use a prior dump file as a text source, to save
533664 pressure on the database.
534665 (Requires the XMLReader extension)
 666+ --maxtime=<minutes> Write out checkpoint file after this many minutes (writing
 667+ out complete page, closing xml file properly, and opening new one
 668+ with header). This option requires the checkpointfile option.
 669+ --checkpointfile=<filenamepattern> Use this string for checkpoint filenames,
 670+ substituting first pageid written for the first %s (required) and the
 671+ last pageid written for the second %s if it exists.
535672 --quiet Don't dump status reports to stderr.
536673 --report=n Report position and speed after every n pages processed.
537674 (Default: 100)

Follow-up revisions

Revision | Commit summary | Author | Date
r95810 | Remove wfDie() that Ariel keeps trying to resurrect :) | demon | 20:45, 30 August 2011
r96276 | Whitespace fixes for r95272, r95604 | catrope | 11:22, 5 September 2011
r96556 | MFT r95260, r95272, r95288, r95290, r95443, r95601, r95604, r95634, r95720, r... | reedy | 12:28, 8 September 2011
r96616 | uniform comment style, fix a few space issues, address couple issues from com... | ariel | 21:06, 8 September 2011

Comments

#Comment by Catrope (talk | contribs)   11:36, 5 September 2011
+			$count = substr_count ($checkpointFile,"%s");
+			if (substr_count ($checkpointFile,"%s") != 2) {

Huh? :P You should use $count in the if here.

+		$filenameList = $this->egress->getFilename();
+		if (! is_array($filenameList)) {
+			$filenameList = array( $filenameList );
+		}

You can also do this with $filenameList = (array)$this->egress->getFilename(); provided that the return value of that function isn't an object. Besides, it seems you don't even need it here: count( 'a string' ) === 1.

+			# we wrote some stuff after last checkpoint that needs renamed */

Interesting comment style ;)

+				$firstPageID = str_pad($this->firstPageWritten,9,"0",STR_PAD_LEFT);
+				$lastPageID = str_pad($this->lastPageWritten,9,"0",STR_PAD_LEFT);

Wouldn't you be able to do this inside the printf format string using %09d ?

+					$newFilenames[] = $fileinfo{'dirname'} . '/' . $checkpointNameFilledIn;

I'm pretty sure the {'dirname'} syntax is deprecated in favor of ['dirname'] and I'm surprised it even worked: AFAIK the only use of braces that way was to get the n-th character of a string with $str{2} .

OK otherwise, marking as such.

Status & tagging log