r80179 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80178‎ | r80179 | r80180 >
Date:17:32, 13 January 2011
Author:platonides
Status:ok
Tags:
Comment:
I had this parser comparer in my working copy since the summer.
Modified paths:
  • /trunk/phase3/maintenance/compareParsers.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/compareParsers.php
@@ -0,0 +1,194 @@
 2+<?php
 3+/**
 4+ * Parse page text from a file or an XML dump with two parsers and report the pages whose renderings differ.
 5+ * This is *NOT* suitable for publishing or offline use; it's intended for
 6+ * running comparative tests of parsing behavior using real-world data.
 7+ *
 8+ * Templates etc are pulled from the local wiki database, not from the dump.
 9+ *
 10+ * Copyright (C) 2011 Platonides - http://www.mediawiki.org/
 11+ *
 12+ * This program is free software; you can redistribute it and/or modify
 13+ * it under the terms of the GNU General Public License as published by
 14+ * the Free Software Foundation; either version 2 of the License, or
 15+ * (at your option) any later version.
 16+ *
 17+ * This program is distributed in the hope that it will be useful,
 18+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
 19+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 20+ * GNU General Public License for more details.
 21+ *
 22+ * You should have received a copy of the GNU General Public License along
 23+ * with this program; if not, write to the Free Software Foundation, Inc.,
 24+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 25+ * http://www.gnu.org/copyleft/gpl.html
 26+ *
 27+ * @file
 28+ * @ingroup Maintenance
 29+ */
 30+
 31+require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 32+
 33+class CompareParsers extends Maintenance {
 34+
 35+ private $count = 0;
 36+ private $outputDirectory, $startTime;
 37+
 38+ public function __construct() {
 39+ parent::__construct();
 40+ $this->saveFailed = false;
 41+ $this->mDescription = "Run a file or dump with several parsers";
 42+ $this->addOption( 'parser1', 'The first parser to compare.', true, true );
 43+ $this->addOption( 'parser2', 'The second parser to compare.', true, true );
 44+ $this->addOption( 'file', 'File with text to run.', false, true );
 45+ $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
 46+ $this->addOption( 'from', 'Article from XML dump to start from.', false, true );
 47+ $this->addOption( 'tidy', 'Run tidy on the articles.', false, false );
 48+ $this->addOption( 'save-failed', 'Folder in which articles which differ will be stored.', false, true );
 49+ $this->addOption( 'show-diff', 'Show a diff of the two renderings.', false, false );
 50+ $this->addOption( 'diff-bin', 'Binary to use for diffing (can also be provided by DIFF env var).', false, false );
 51+ $this->addOption( 'strip-parameters', 'Remove parameters of html tags to increase readability.', false, false );
 52+ $this->addOption( 'show-parsed-output', 'Show the parsed html if both Parsers give the same output.', false, false );
 53+ }
 54+
 55+ public function execute() {
 56+ if (! ( $this->hasOption('file') ^ $this->hasOption('dump') ) ) {
 57+ $this->error("You must provide file or dump", true);
 58+ }
 59+
 60+ if ( $this->hasOption('save-failed') ) {
 61+ $this->saveFailed = $this->getOption('save-failed');
 62+ }
 63+
 64+ $this->stripParametersEnabled = $this->hasOption( 'strip-parameters' );
 65+ $this->showParsedOutput = $this->hasOption( 'show-parsed-output' );
 66+
 67+ $this->showDiff = $this->hasOption( 'show-diff' );
 68+ if ( $this->showDiff ) {
 69+ $bin = $this->getOption( 'diff-bin', getenv( 'DIFF' ) );
 70+ if ( $bin != '' ) {
 71+ global $wgDiff;
 72+ $wgDiff = $bin;
 73+ }
 74+ }
 75+
 76+ $user = new User();
 77+ $this->options = ParserOptions::newFromUser( $user );
 78+
 79+ if ( $this->hasOption( 'tidy' ) ) {
 80+ global $wgUseTidy;
 81+ if ( !$wgUseTidy ) {
 82+ $this->error( 'Tidy was requested but $wgUseTidy is not set in LocalSettings.php', true );
 83+ }
 84+ $this->options->setTidy( true );
 85+ }
 86+
 87+ if ( $this->hasOption('file') ) {
 88+ $revision = new WikiRevision;
 89+
 90+ $revision->setText( file_get_contents( $this->getOption('file') ) );
 91+ $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption('file'), '.txt' ) ) ) );
 92+ $this->handleRevision( $revision );
 93+ return;
 94+ }
 95+
 96+ $this->startTime = wfTime();
 97+
 98+ if ( $this->getOption('dump') == '-' ) {
 99+ $source = new ImportStreamSource( $this->getStdin() );
 100+ } else {
 101+ $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true);
 102+ }
 103+ $importer = new WikiImporter( $source );
 104+
 105+ $importer->setRevisionCallback(
 106+ array( &$this, 'handleRevision' ) );
 107+
 108+ $this->from = $this->getOption( 'from', null );
 109+ $this->count = 0;
 110+ $this->failed = 0;
 111+ $importer->doImport();
 112+
 113+ $this->error( "{$this->failed} failed revisions out of {$this->count}" );
 114+ if ($this->count > 0)
 115+ $this->output( " (" . ( $this->failed / $this->count ) . "%)\n" );
 116+
 117+ $delta = wfTime() - $this->startTime;
 118+ $this->error( "Compared {$this->count} pages in " . round($delta, 2) . " seconds " );
 119+ if ($delta > 0)
 120+ $this->error( round($this->count / $delta, 2) . " pages/sec" );
 121+ $this->error( "\n" );
 122+ }
 123+
 124+ function stripParameters( $text ) {
 125+ if ( !$this->stripParametersEnabled ) {
 126+ return $text;
 127+ }
 128+ return preg_replace( '/(<a) [^>]+>/', '$1>', $text );
 129+ }
 130+
 131+ /**
 132+ * Callback function for each revision, parse with both parsers and compare
 133+ * @param $rev Revision
 134+ */
 135+ public function handleRevision( $rev ) {
 136+ $title = $rev->getTitle();
 137+ if ( !$title ) {
 138+ $this->error( "Got bogus revision with null title!" );
 139+ return;
 140+ }
 141+
 142+ $this->count++;
 143+ if ( isset( $this->from ) ) {
 144+ if ( $this->from != $title )
 145+ return;
 146+ $this->output( "Skipped " . ($this->count - 1) . " pages\n" );
 147+
 148+ $this->count = 1;
 149+ $this->from = null;
 150+ }
 151+
 152+
 153+
 154+ $parser1Name = $this->getOption( 'parser1' );
 155+ $parser2Name = $this->getOption( 'parser2' );
 156+
 157+ self::checkParserLocally( $parser1Name );
 158+ self::checkParserLocally( $parser2Name );
 159+
 160+ $parser1 = new $parser1Name();
 161+ $parser2 = new $parser2Name();
 162+
 163+ $output1 = $parser1->parse( $rev->getText(), $title, $this->options );
 164+ $output2 = $parser2->parse( $rev->getText(), $title, $this->options );
 165+
 166+ if ( $output1->getText() != $output2->getText() ) {
 167+ $this->failed++;
 168+ $this->error( "Parsing for {$title->getPrefixedText()} differs\n" );
 169+
 170+ if ($this->saveFailed) {
 171+ file_put_contents( $this->saveFailed . '/' . rawurlencode( $title->getPrefixedText() ) . ".txt", $rev->getText());
 172+ }
 173+ if ( $this->showDiff ) {
 174+ $this->output( wfDiff( $this->stripParameters( $output1->getText() ), $this->stripParameters( $output2->getText() ), '' ) );
 175+ }
 176+ } else {
 177+ $this->output( $title->getPrefixedText() . "\tOK\n" );
 178+ if ( $this->showParsedOutput ) {
 179+ $this->output( $this->stripParameters( $output1->getText() ) );
 180+ }
 181+ }
 182+ }
 183+
 184+ private static function checkParserLocally( $parserName ) {
 185+ /* Look for the parser in a file appropiately named in the current folder */
 186+ if ( !class_exists( $parserName ) && file_exists( "$parserName.php" ) ) {
 187+ global $wgAutoloadClasses;
 188+ $wgAutoloadClasses[ $parserName ] = realpath( '.' ) . "/$parserName.php";
 189+ }
 190+ }
 191+
 192+}
 193+
 194+$maintClass = "CompareParsers";
 195+require_once( DO_MAINTENANCE );
Property changes on: trunk/phase3/maintenance/compareParsers.php
___________________________________________________________________
Added: svn:eol-style
1196 + native

Status & tagging log