r88914 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r88913 | r88914 | r88915 >
Date: 18:49, 26 May 2011
Author: platonides
Status: resolved (Comments)
Tags:
Comment:
Refactor the common code of compareParsers.php and preprocessDump.php into a dumpIterator.php script.
Implement a simple 'search into this dump'
Modified paths:
  • /trunk/phase3/maintenance/compareParsers.php (modified) (history)
  • /trunk/phase3/maintenance/dumpIterator.php (added) (history)
  • /trunk/phase3/maintenance/preprocessDump.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/dumpIterator.php
@@ -0,0 +1,149 @@
 2+<?php
 3+/**
 4+ * Take page text out of an XML dump file and perform some operation on it.
 5+ * Used as a base class for CompareParsers and PreprocessDump.
 6+ * We implement below the simple task of searching inside a dump.
 7+ *
 8+ * Copyright (C) 2011 Platonides - http://www.mediawiki.org/
 9+ *
 10+ * This program is free software; you can redistribute it and/or modify
 11+ * it under the terms of the GNU General Public License as published by
 12+ * the Free Software Foundation; either version 2 of the License, or
 13+ * (at your option) any later version.
 14+ *
 15+ * This program is distributed in the hope that it will be useful,
 16+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
 17+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 18+ * GNU General Public License for more details.
 19+ *
 20+ * You should have received a copy of the GNU General Public License along
 21+ * with this program; if not, write to the Free Software Foundation, Inc.,
 22+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 23+ * http://www.gnu.org/copyleft/gpl.html
 24+ *
 25+ * @file
 26+ * @ingroup Maintenance
 27+ */
 28+
 29+require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 30+
 31+abstract class DumpIterator extends Maintenance {
 32+
 33+ private $count = 0;
 34+ private $startTime;
 35+
 36+ public function __construct() {
 37+ parent::__construct();
 38+ $this->mDescription = "Does something with a dump";
 39+ $this->addOption( 'file', 'File with text to run.', false, true );
 40+ $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
 41+ $this->addOption( 'from', 'Article from XML dump to start from.', false, true );
 42+ }
 43+
 44+ public function execute() {
 45+ if (! ( $this->hasOption('file') ^ $this->hasOption('dump') ) ) {
 46+ $this->error("You must provide a file or dump", true);
 47+ }
 48+
 49+ $this->checkOptions();
 50+
 51+ if ( $this->hasOption('file') ) {
 52+ $revision = new WikiRevision;
 53+
 54+ $revision->setText( file_get_contents( $this->getOption( 'file' ) ) );
 55+ $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption( 'file' ), '.txt' ) ) ) );
 56+ $this->handleRevision( $revision );
 57+ return;
 58+ }
 59+
 60+ $this->startTime = wfTime();
 61+
 62+ if ( $this->getOption('dump') == '-' ) {
 63+ $source = new ImportStreamSource( $this->getStdin() );
 64+ } else {
 65+ $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true);
 66+ }
 67+ $importer = new WikiImporter( $source );
 68+
 69+ $importer->setRevisionCallback(
 70+ array( &$this, 'handleRevision' ) );
 71+
 72+ $this->from = $this->getOption( 'from', null );
 73+ $this->count = 0;
 74+ $importer->doImport();
 75+
 76+ $this->conclusions();
 77+
 78+ $delta = wfTime() - $this->startTime;
 79+ $this->error( "Done {$this->count} revisions in " . round($delta, 2) . " seconds " );
 80+ if ($delta > 0)
 81+ $this->error( round($this->count / $delta, 2) . " pages/sec" );
 82+
 83+ # Perform the memory_get_peak_usage() when all the other data has been output so there's no damage if it dies.
 84+ # It is only available since 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit)
 85+ $this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" );
 86+ }
 87+
 88+ function stripParameters( $text ) {
 89+ if ( !$this->stripParametersEnabled ) {
 90+ return $text;
 91+ }
 92+ return preg_replace( '/(<a) [^>]+>/', '$1>', $text );
 93+ }
 94+
 95+ /**
 96+ * Callback function for each revision, child classes should override
 97+ * processRevision instead.
 98+ * @param $rev Revision
 99+ */
 100+ public function handleRevision( $rev ) {
 101+ $title = $rev->getTitle();
 102+ if ( !$title ) {
 103+ $this->error( "Got bogus revision with null title!" );
 104+ return;
 105+ }
 106+
 107+ $this->count++;
 108+ if ( isset( $this->from ) ) {
 109+ if ( $this->from != $title )
 110+ return;
 111+ $this->output( "Skipped " . ($this->count - 1) . " pages\n" );
 112+
 113+ $this->count = 1;
 114+ $this->from = null;
 115+ }
 116+
 117+ $this->processRevision( $rev );
 118+ }
 119+
 120+ /* Stub function for processing additional options */
 121+ public function checkOptions() {
 122+ return;
 123+ }
 124+
 125+ /* Stub function for giving data about what was computed */
 126+ public function conclusions() {
 127+ return;
 128+ }
 129+
 130+ /* Core function which does whatever the maintenance script is designed to do */
 131+ abstract public function processRevision( $rev );
 132+}
 133+
 134+class SearchDump extends DumpIterator {
 135+
 136+ public function __construct() {
 137+ parent::__construct();
 138+ $this->mDescription = "Runs a regex in the revisions from a dump";
 139+ $this->addOption( 'regex', 'Searching regex', true, true );
 140+ }
 141+
 142+ public function processRevision( $rev ) {
 143+ if ( preg_match( $this->getOption( 'regex' ), $rev->getText() ) ) {
 144+ $this->output( $rev->getTitle() . " matches at edit from " . $rev->getTimestamp() . "\n" );
 145+ }
 146+ }
 147+}
 148+
 149+$maintClass = "SearchDump";
 150+require_once( RUN_MAINTENANCE_IF_MAIN );
Property changes on: trunk/phase3/maintenance/dumpIterator.php
___________________________________________________________________
Added: svn:eol-style
1151 + native
Index: trunk/phase3/maintenance/preprocessDump.php
@@ -25,13 +25,10 @@
2626 * @ingroup Maintenance
2727 */
2828
29 -require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 29+require_once( dirname( __FILE__ ) . '/dumpIterator.php' );
3030
31 -class PreprocessDump extends Maintenance {
 31+class PreprocessDump extends DumpIterator {
3232
33 - private $count = 0;
34 - private $startTime;
35 -
3633 /* Variables for dressing up as a parser */
3734 public $mTitle = 'PreprocessDump';
3835 public $mPPNodeCount = 0;
@@ -43,11 +40,6 @@
4441
4542 public function __construct() {
4643 parent::__construct();
47 - $this->saveFailed = false;
48 - $this->mDescription = "Run a file or dump with a preprocessor";
49 - $this->addOption( 'file', 'File with text to run.', false, true );
50 - $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
51 - $this->addOption( 'from', 'Article from XML dump to start from.', false, true );
5244 $this->addOption( 'cache', 'Use and populate the preprocessor cache.', false, false );
5345 $this->addOption( 'preprocessor', 'Preprocessor to use.', false, false );
5446 }
@@ -72,16 +64,11 @@
7365 return false;
7466 }
7567
76 - public function execute() {
 68+ public function checkOptions() {
7769 global $wgParser, $wgParserConf, $wgPreprocessorCacheThreshold;
78 -
79 - if (! ( $this->hasOption( 'file' ) ^ $this->hasOption( 'dump' ) ) ) {
80 - $this->error("You must provide a file or dump", true);
81 - }
8270
8371 if ( !$this->hasOption( 'cache' ) ) {
8472 $wgPreprocessorCacheThreshold = false;
85 - $this->saveFailed = $this->getOption('save-failed');
8673 }
8774
8875 if ( $this->hasOption( 'preprocessor' ) ) {
@@ -94,71 +81,22 @@
9582
9683 $wgParser->firstCallInit();
9784 $this->mPreprocessor = new $name( $this );
98 -
99 - if ( $this->hasOption( 'file' ) ) {
100 - $revision = new WikiRevision;
101 -
102 - $revision->setText( file_get_contents( $this->getOption( 'file' ) ) );
103 - $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption('file'), '.txt' ) ) ) );
104 - $this->handleRevision( $revision );
105 - return;
106 - }
107 -
108 - $this->startTime = wfTime();
109 -
110 - if ( $this->getOption('dump') == '-' ) {
111 - $source = new ImportStreamSource( $this->getStdin() );
112 - } else {
113 - $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true);
114 - }
115 - $importer = new WikiImporter( $source );
116 -
117 - $importer->setRevisionCallback(
118 - array( &$this, 'handleRevision' ) );
119 -
120 - $this->from = $this->getOption( 'from', null );
121 - $this->count = 0;
122 - $importer->doImport();
123 -
124 - $delta = wfTime() - $this->startTime;
125 - $this->error( "{$this->count} revisions preprocessed in " . round($delta, 2) . " seconds " );
126 - if ($delta > 0)
127 - $this->error( round($this->count / $delta, 2) . " pages/sec" );
128 -
129 - # Perform the memory_get_peak_usage() when all the other data has been output so there's no damage if it dies.
130 - # It is only available since 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit)
131 - $this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" );
13285 }
13386
13487 /**
13588 * Callback function for each revision, preprocessToObj()
13689 * @param $rev Revision
13790 */
138 - public function handleRevision( $rev ) {
139 - $title = $rev->getTitle();
140 - if ( !$title ) {
141 - $this->error( "Got bogus revision with null title!" );
142 - return;
143 - }
144 -
145 - $this->count++;
146 - if ( isset( $this->from ) ) {
147 - if ( $this->from != $title )
148 - return;
149 - $this->output( "Skipped " . ($this->count - 1) . " pages\n" );
150 -
151 - $this->count = 1;
152 - $this->from = null;
153 - }
 91+ public function processRevision( $rev ) {
15492 try {
15593 $this->mPreprocessor->preprocessToObj( $rev->getText(), 0 );
15694 }
15795 catch(Exception $e) {
158 - $this->error("Caught exception " . $e->getMessage() . " in " . $title-> getPrefixedText() );
 96+ $this->error("Caught exception " . $e->getMessage() . " in " . $rev->getTitle()->getPrefixedText() );
15997 }
16098 }
16199 }
162100
163101 $maintClass = "PreprocessDump";
164 -require_once( RUN_MAINTENANCE_IF_MAIN );
 102+require( RUN_MAINTENANCE_IF_MAIN );
165103
Index: trunk/phase3/maintenance/compareParsers.php
@@ -27,9 +27,9 @@
2828 * @ingroup Maintenance
2929 */
3030
31 -require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 31+require_once( dirname( __FILE__ ) . '/dumpIterator.php' );
3232
33 -class CompareParsers extends Maintenance {
 33+class CompareParsers extends DumpIterator {
3434
3535 private $count = 0;
3636 private $startTime;
@@ -40,9 +40,6 @@
4141 $this->mDescription = "Run a file or dump with several parsers";
4242 $this->addOption( 'parser1', 'The first parser to compare.', true, true );
4343 $this->addOption( 'parser2', 'The second parser to compare.', true, true );
44 - $this->addOption( 'file', 'File with text to run.', false, true );
45 - $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
46 - $this->addOption( 'from', 'Article from XML dump to start from.', false, true );
4744 $this->addOption( 'tidy', 'Run tidy on the articles.', false, false );
4845 $this->addOption( 'save-failed', 'Folder in which articles which differ will be stored.', false, true );
4946 $this->addOption( 'show-diff', 'Show a diff of the two renderings.', false, false );
@@ -51,11 +48,7 @@
5249 $this->addOption( 'show-parsed-output', 'Show the parsed html if both Parsers give the same output.', false, false );
5350 }
5451
55 - public function execute() {
56 - if (! ( $this->hasOption('file') ^ $this->hasOption('dump') ) ) {
57 - $this->error("You must provide file or dump", true);
58 - }
59 -
 52+ public function checkOptions() {
6053 if ( $this->hasOption('save-failed') ) {
6154 $this->saveFailed = $this->getOption('save-failed');
6255 }
@@ -83,41 +76,13 @@
8477 $this->options->setTidy( true );
8578 }
8679
87 - if ( $this->hasOption('file') ) {
88 - $revision = new WikiRevision;
89 -
90 - $revision->setText( file_get_contents( $this->getOption('file') ) );
91 - $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption('file'), '.txt' ) ) ) );
92 - $this->handleRevision( $revision );
93 - return;
94 - }
95 -
96 - $this->startTime = wfTime();
97 -
98 - if ( $this->getOption('dump') == '-' ) {
99 - $source = new ImportStreamSource( $this->getStdin() );
100 - } else {
101 - $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true);
102 - }
103 - $importer = new WikiImporter( $source );
104 -
105 - $importer->setRevisionCallback(
106 - array( &$this, 'handleRevision' ) );
107 -
108 - $this->from = $this->getOption( 'from', null );
109 - $this->count = 0;
11080 $this->failed = 0;
111 - $importer->doImport();
112 -
 81+ }
 82+
 83+ public function conclusions() {
11384 $this->error( "{$this->failed} failed revisions out of {$this->count}" );
11485 if ($this->count > 0)
11586 $this->output( " (" . ( $this->failed / $this->count ) . "%)\n" );
116 -
117 - $delta = wfTime() - $this->startTime;
118 - $this->error( "Compared {$this->count} pages in " . round($delta, 2) . " seconds " );
119 - if ($delta > 0)
120 - $this->error( round($this->count / $delta, 2) . " pages/sec" );
121 - $this->error( "\n" );
12287 }
12388
12489 function stripParameters( $text ) {
@@ -131,25 +96,9 @@
13297 * Callback function for each revision, parse with both parsers and compare
13398 * @param $rev Revision
13499 */
135 - public function handleRevision( $rev ) {
 100+ public function processRevision( $rev ) {
136101 $title = $rev->getTitle();
137 - if ( !$title ) {
138 - $this->error( "Got bogus revision with null title!" );
139 - return;
140 - }
141 -
142 - $this->count++;
143 - if ( isset( $this->from ) ) {
144 - if ( $this->from != $title )
145 - return;
146 - $this->output( "Skipped " . ($this->count - 1) . " pages\n" );
147 -
148 - $this->count = 1;
149 - $this->from = null;
150 - }
151 -
152 -
153 -
 102+
154103 $parser1Name = $this->getOption( 'parser1' );
155104 $parser2Name = $this->getOption( 'parser2' );
156105
@@ -191,4 +140,4 @@
192141 }
193142
194143 $maintClass = "CompareParsers";
195 -require_once( RUN_MAINTENANCE_IF_MAIN );
 144+require( RUN_MAINTENANCE_IF_MAIN );

Follow-up revisions

Revision | Commit summary | Author | Date
r88942 | Move down interwiki disabling to dumpIterator and make SearchDump work withou... | platonides | 22:30, 26 May 2011

Comments

#Comment by Aaron Schulz (talk | contribs)   03:41, 7 July 2011

$this->hasOption('file') ^ $this->hasOption('dump')

Should that be 'xor' (logical xor) instead of '^' (bitwise xor)? I guess the latter still works due to casting.

Status & tagging log