r78674 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r78673‎ | r78674 | r78675 >
Date:13:59, 21 December 2010
Author:tstarling
Status:deferred (Comments)
Tags:
Comment:
Long and ugly script to convert the recently-discovered August 2001 backup of Wikipedia to a MediaWiki XML file.
Modified paths:
  • /trunk/phase3/maintenance/importUseModWikipedia.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/importUseModWikipedia.php
@@ -0,0 +1,1112 @@
 2+<?php
 3+
 4+/**
 5+ * A script to read a dump of the English Wikipedia from the UseModWiki period, and to
 6+ * generate an XML dump in MediaWiki format.
 7+ *
 8+ * Some relevant code was ported from UseModWiki 0.92.
 9+ *
 10+ */
 11+
 12+require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 13+require_once( dirname( __FILE__ ) .'/../includes/normal/UtfNormalUtil.php' );
 14+
 15+
 16+class ImportUseModWikipedia extends Maintenance {
 17+ var $encodeMap, $decodeMap;
 18+
 19+ var $deepRenames = array(
 20+ 'JimboWales' => 983862286,
 21+ 'TexaS' => 983918410,
 22+ 'HistoryOfUnitedStatesTalk' => 984795423,
 23+ 'MetallicA' => 985128533,
 24+ 'PythagoreanTheorem' => 985225545,
 25+ 'TheCanonofScripture' => 985368223,
 26+ 'TaoTehChing' => 985368222,
 27+ //'TheMostRemarkableFormulaInTheWorld' => 985368221,
 28+ 'TheRecorder' => 985368220,
 29+ 'GladstoneOregon' => 985368219,
 30+ #'UnitedStatesConstitution/AmendmentTwo' =>
 31+ );
 32+
 33+ var $replacements = array();
 34+
 35+ var $renameTextLinksOps = array(
 36+ 983846265 => array(
 37+ 'TestIgnore' => 'IgnoreTest',
 38+ ),
 39+ 983848080 => array(
 40+ 'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works'
 41+ ),
 42+ 983856376 => array(
 43+ 'WikiPedia' => 'Wikipedia',
 44+ ),
 45+ 983896152 => array(
 46+ 'John_F_Kennedy' => 'John_F._Kennedy',
 47+ ),
 48+ 983905871 => array(
 49+ 'LarrySanger' => 'Larry_Sanger'
 50+ ),
 51+ 984697068 => array(
 52+ 'UnitedStates' => 'United States',
 53+ ),
 54+ 984792748 => array(
 55+ 'LibertarianisM' => 'Libertarianism'
 56+ ),
 57+ 985327832 => array(
 58+ 'AnarchisM' => 'Anarchism',
 59+ ),
 60+ 985290063 => array(
 61+ 'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion'
 62+ ),
 63+ 985290091 => array(
 64+ 'BritishEmpire' => 'British Empire'
 65+ ),
 66+ /*
 67+ 985468958 => array(
 68+ 'ScienceFiction' => 'Science fiction',
 69+ ),*/
 70+ );
 71+
 72+ /**
 73+ * Hack for observed substitution issues
 74+ */
 75+ var $skipSelfSubstitution = array(
 76+ 'Pythagorean_Theorem',
 77+ 'The_Most_Remarkable_Formula_In_The_World',
 78+ 'Wine',
 79+ );
 80+
 81+ var $unixLineEndingsOps = array(
 82+ 987743732 => 'Wikipedia_FAQ'
 83+ );
 84+
 85+ var $replacementsDone = array();
 86+
 87+ var $moveLog = array();
 88+ var $moveDests = array();
 89+ var $revId;
 90+
 91+ var $rc = array();
 92+ var $textCache = array();
 93+ var $blacklist = array();
 94+
 95+ var $FS, $FS1, $FS2, $FS3;
 96+ var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern;
 97+
 98+ var $cp1252Table = <<<EOT
 99+0x00 0x0000
 100+0x01 0x0001
 101+0x02 0x0002
 102+0x03 0x0003
 103+0x04 0x0004
 104+0x05 0x0005
 105+0x06 0x0006
 106+0x07 0x0007
 107+0x08 0x0008
 108+0x09 0x0009
 109+0x0a 0x000a
 110+0x0b 0x000b
 111+0x0c 0x000c
 112+0x0d 0x000d
 113+0x0e 0x000e
 114+0x0f 0x000f
 115+0x10 0x0010
 116+0x11 0x0011
 117+0x12 0x0012
 118+0x13 0x0013
 119+0x14 0x0014
 120+0x15 0x0015
 121+0x16 0x0016
 122+0x17 0x0017
 123+0x18 0x0018
 124+0x19 0x0019
 125+0x1a 0x001a
 126+0x1b 0x001b
 127+0x1c 0x001c
 128+0x1d 0x001d
 129+0x1e 0x001e
 130+0x1f 0x001f
 131+0x20 0x0020
 132+0x21 0x0021
 133+0x22 0x0022
 134+0x23 0x0023
 135+0x24 0x0024
 136+0x25 0x0025
 137+0x26 0x0026
 138+0x27 0x0027
 139+0x28 0x0028
 140+0x29 0x0029
 141+0x2a 0x002a
 142+0x2b 0x002b
 143+0x2c 0x002c
 144+0x2d 0x002d
 145+0x2e 0x002e
 146+0x2f 0x002f
 147+0x30 0x0030
 148+0x31 0x0031
 149+0x32 0x0032
 150+0x33 0x0033
 151+0x34 0x0034
 152+0x35 0x0035
 153+0x36 0x0036
 154+0x37 0x0037
 155+0x38 0x0038
 156+0x39 0x0039
 157+0x3a 0x003a
 158+0x3b 0x003b
 159+0x3c 0x003c
 160+0x3d 0x003d
 161+0x3e 0x003e
 162+0x3f 0x003f
 163+0x40 0x0040
 164+0x41 0x0041
 165+0x42 0x0042
 166+0x43 0x0043
 167+0x44 0x0044
 168+0x45 0x0045
 169+0x46 0x0046
 170+0x47 0x0047
 171+0x48 0x0048
 172+0x49 0x0049
 173+0x4a 0x004a
 174+0x4b 0x004b
 175+0x4c 0x004c
 176+0x4d 0x004d
 177+0x4e 0x004e
 178+0x4f 0x004f
 179+0x50 0x0050
 180+0x51 0x0051
 181+0x52 0x0052
 182+0x53 0x0053
 183+0x54 0x0054
 184+0x55 0x0055
 185+0x56 0x0056
 186+0x57 0x0057
 187+0x58 0x0058
 188+0x59 0x0059
 189+0x5a 0x005a
 190+0x5b 0x005b
 191+0x5c 0x005c
 192+0x5d 0x005d
 193+0x5e 0x005e
 194+0x5f 0x005f
 195+0x60 0x0060
 196+0x61 0x0061
 197+0x62 0x0062
 198+0x63 0x0063
 199+0x64 0x0064
 200+0x65 0x0065
 201+0x66 0x0066
 202+0x67 0x0067
 203+0x68 0x0068
 204+0x69 0x0069
 205+0x6a 0x006a
 206+0x6b 0x006b
 207+0x6c 0x006c
 208+0x6d 0x006d
 209+0x6e 0x006e
 210+0x6f 0x006f
 211+0x70 0x0070
 212+0x71 0x0071
 213+0x72 0x0072
 214+0x73 0x0073
 215+0x74 0x0074
 216+0x75 0x0075
 217+0x76 0x0076
 218+0x77 0x0077
 219+0x78 0x0078
 220+0x79 0x0079
 221+0x7a 0x007a
 222+0x7b 0x007b
 223+0x7c 0x007c
 224+0x7d 0x007d
 225+0x7e 0x007e
 226+0x7f 0x007f
 227+0x80 0x20ac
 228+0x81 0x0081
 229+0x82 0x201a
 230+0x83 0x0192
 231+0x84 0x201e
 232+0x85 0x2026
 233+0x86 0x2020
 234+0x87 0x2021
 235+0x88 0x02c6
 236+0x89 0x2030
 237+0x8a 0x0160
 238+0x8b 0x2039
 239+0x8c 0x0152
 240+0x8d 0x008d
 241+0x8e 0x017d
 242+0x8f 0x008f
 243+0x90 0x0090
 244+0x91 0x2018
 245+0x92 0x2019
 246+0x93 0x201c
 247+0x94 0x201d
 248+0x95 0x2022
 249+0x96 0x2013
 250+0x97 0x2014
 251+0x98 0x02dc
 252+0x99 0x2122
 253+0x9a 0x0161
 254+0x9b 0x203a
 255+0x9c 0x0153
 256+0x9d 0x009d
 257+0x9e 0x017e
 258+0x9f 0x0178
 259+0xa0 0x00a0
 260+0xa1 0x00a1
 261+0xa2 0x00a2
 262+0xa3 0x00a3
 263+0xa4 0x00a4
 264+0xa5 0x00a5
 265+0xa6 0x00a6
 266+0xa7 0x00a7
 267+0xa8 0x00a8
 268+0xa9 0x00a9
 269+0xaa 0x00aa
 270+0xab 0x00ab
 271+0xac 0x00ac
 272+0xad 0x00ad
 273+0xae 0x00ae
 274+0xaf 0x00af
 275+0xb0 0x00b0
 276+0xb1 0x00b1
 277+0xb2 0x00b2
 278+0xb3 0x00b3
 279+0xb4 0x00b4
 280+0xb5 0x00b5
 281+0xb6 0x00b6
 282+0xb7 0x00b7
 283+0xb8 0x00b8
 284+0xb9 0x00b9
 285+0xba 0x00ba
 286+0xbb 0x00bb
 287+0xbc 0x00bc
 288+0xbd 0x00bd
 289+0xbe 0x00be
 290+0xbf 0x00bf
 291+0xc0 0x00c0
 292+0xc1 0x00c1
 293+0xc2 0x00c2
 294+0xc3 0x00c3
 295+0xc4 0x00c4
 296+0xc5 0x00c5
 297+0xc6 0x00c6
 298+0xc7 0x00c7
 299+0xc8 0x00c8
 300+0xc9 0x00c9
 301+0xca 0x00ca
 302+0xcb 0x00cb
 303+0xcc 0x00cc
 304+0xcd 0x00cd
 305+0xce 0x00ce
 306+0xcf 0x00cf
 307+0xd0 0x00d0
 308+0xd1 0x00d1
 309+0xd2 0x00d2
 310+0xd3 0x00d3
 311+0xd4 0x00d4
 312+0xd5 0x00d5
 313+0xd6 0x00d6
 314+0xd7 0x00d7
 315+0xd8 0x00d8
 316+0xd9 0x00d9
 317+0xda 0x00da
 318+0xdb 0x00db
 319+0xdc 0x00dc
 320+0xdd 0x00dd
 321+0xde 0x00de
 322+0xdf 0x00df
 323+0xe0 0x00e0
 324+0xe1 0x00e1
 325+0xe2 0x00e2
 326+0xe3 0x00e3
 327+0xe4 0x00e4
 328+0xe5 0x00e5
 329+0xe6 0x00e6
 330+0xe7 0x00e7
 331+0xe8 0x00e8
 332+0xe9 0x00e9
 333+0xea 0x00ea
 334+0xeb 0x00eb
 335+0xec 0x00ec
 336+0xed 0x00ed
 337+0xee 0x00ee
 338+0xef 0x00ef
 339+0xf0 0x00f0
 340+0xf1 0x00f1
 341+0xf2 0x00f2
 342+0xf3 0x00f3
 343+0xf4 0x00f4
 344+0xf5 0x00f5
 345+0xf6 0x00f6
 346+0xf7 0x00f7
 347+0xf8 0x00f8
 348+0xf9 0x00f9
 349+0xfa 0x00fa
 350+0xfb 0x00fb
 351+0xfc 0x00fc
 352+0xfd 0x00fd
 353+0xfe 0x00fe
 354+0xff 0x00ff
 355+EOT;
 356+ public function __construct() {
 357+ parent::__construct();
 358+ $this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true );
 359+ $this->addOption( 'outfile', 'the name of the output XML file', true, true );
 360+ $this->initLinkPatterns();
 361+
 362+ $this->encodeMap = $this->decodeMap = array();
 363+ foreach ( explode( "\n", $this->cp1252Table ) as $line ) {
 364+ list( $source, $dest ) = explode( "\t", $line );
 365+ $sourceChar = chr( base_convert( substr( $source, 2 ), 16, 10 ) );
 366+ $destChar = codepointToUtf8( base_convert( substr( $dest, 2 ), 16, 10 ) );
 367+ $this->encodeMap[$sourceChar] = $destChar;
 368+ $this->decodeMap[$destChar] = $sourceChar;
 369+ }
 370+ }
 371+
 372+ function initLinkPatterns() {
 373+ # Field separators are used in the URL-style patterns below.
 374+ $this->FS = "\xb3"; # The FS character is a superscript "3"
 375+ $this->FS1 = $this->FS . "1"; # The FS values are used to separate fields
 376+ $this->FS2 = $this->FS . "2"; # in stored hashtables and other data structures.
 377+ $this->FS3 = $this->FS . "3"; # The FS character is not allowed in user data.
 378+
 379+ $UpperLetter = "[A-Z";
 380+ $LowerLetter = "[a-z";
 381+ $AnyLetter = "[A-Za-z";
 382+ $AnyLetter .= "_0-9";
 383+ $UpperLetter .= "]"; $LowerLetter .= "]"; $AnyLetter .= "]";
 384+
 385+ # Main link pattern: lowercase between uppercase, then anything
 386+ $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter
 387+ . $AnyLetter . "*";
 388+ # Optional subpage link pattern: uppercase, lowercase, then anything
 389+ $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*";
 390+
 391+ # Loose pattern: If subpage is used, subpage may be simple name
 392+ $this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)";
 393+ $QDelim = '(?:"")?'; # Optional quote delimiter (not in output)
 394+ $this->LinkPattern .= $QDelim;
 395+
 396+ # Inter-site convention: sites must start with uppercase letter
 397+ # (Uppercase letter avoids confusion with URLs)
 398+ $InterSitePattern = $UpperLetter . $AnyLetter . "+";
 399+ $this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
 400+
 401+ $AnyLetter = "[-,. _0-9A-Za-z]";
 402+ $this->FreeLinkPattern = "($AnyLetter+)";
 403+ $this->FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)";
 404+ $this->FreeLinkPattern .= $QDelim;
 405+
 406+ # Url-style links are delimited by one of:
 407+ # 1. Whitespace (kept in output)
 408+ # 2. Left or right angle-bracket (< or >) (kept in output)
 409+ # 3. Right square-bracket (]) (kept in output)
 410+ # 4. A single double-quote (") (kept in output)
 411+ # 5. A $FS (field separator) character (kept in output)
 412+ # 6. A double double-quote ("") (removed from output)
 413+
 414+ $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|"
 415+ . "prospero|telnet|gopher";
 416+ $UrlProtocols .= '|file';
 417+ $this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
 418+ $ImageExtensions = "(gif|jpg|png|bmp|jpeg)";
 419+ $RFCPattern = "RFC\\s?(\\d+)";
 420+ $ISBNPattern = "ISBN:?([0-9- xX]{10,})";
 421+ }
 422+
 423+ function execute() {
 424+ $this->articleFileName = '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp';
 425+ $this->patchFileName = '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp';
 426+ $this->dataDir = $this->getOption( 'datadir' );
 427+ $this->outFile = fopen( $this->getOption( 'outfile' ), 'w' );
 428+ if ( !$this->outFile ) {
 429+ echo "Unable to open output file\n";
 430+ return 1;
 431+ }
 432+ $this->writeXmlHeader();
 433+ $this->readRclog();
 434+ $this->writeMoveLog();
 435+ $this->writeRevisions();
 436+ $this->reconcileCurrentRevs();
 437+ $this->writeXmlFooter();
 438+ unlink( $this->articleFileName );
 439+ unlink( $this->patchFileName );
 440+ return 0;
 441+ }
 442+
 443+ function writeXmlHeader() {
 444+ fwrite( $this->outFile, <<<EOT
 445+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
 446+ <siteinfo>
 447+ <sitename>Wikipedia</sitename>
 448+ <base>http://www.wikipedia.com/</base>
 449+ <generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator>
 450+ <case>case-sensitive</case>
 451+ <namespaces>
 452+ <namespace key="0" />
 453+ </namespaces>
 454+ </siteinfo>
 455+
 456+EOT
 457+ );
 458+ }
 459+
 460+ function writeXmlFooter() {
 461+ fwrite( $this->outFile, "</mediawiki>\n" );
 462+ }
 463+
 464+ function readRclog() {
 465+ $rcFile = fopen( "{$this->dataDir}/rclog", 'r' );
 466+ while ( $line = fgets( $rcFile ) ) {
 467+ $bits = explode( $this->FS3, $line );
 468+ if ( count( $bits ) !== 7 ) {
 469+ echo "Error reading rclog\n";
 470+ return;
 471+ }
 472+ $params = array(
 473+ 'timestamp' => $bits[0],
 474+ 'rctitle' => $bits[1],
 475+ 'summary' => $bits[2],
 476+ 'minor' => $bits[3],
 477+ 'host' => $bits[4],
 478+ 'kind' => $bits[5],
 479+ 'extra' => array()
 480+ );
 481+ $extraList = explode( $this->FS2, $bits[6] );
 482+
 483+ for ( $i = 0; $i < count( $extraList ); $i += 2 ) {
 484+ $params['extra'][$extraList[$i]] = $extraList[$i + 1];
 485+ }
 486+ $this->rc[$params['timestamp']][] = $params;
 487+ }
 488+ }
 489+
 490+ function writeMoveLog() {
 491+ $this->moveLog = array();
 492+ $deepRenames = $this->deepRenames;
 493+ echo "Calculating move log...\n";
 494+ $this->processDiffFile( array( $this, 'moveLogCallback' ) );
 495+
 496+ // We have the timestamp intervals, now make a guess at the actual timestamp
 497+ foreach ( $this->moveLog as $newTitle => $params ) {
 498+ // Is there a time specified?
 499+ $drTime = false;
 500+ if ( isset( $deepRenames[$params['old']] ) ) {
 501+ $drTime = $deepRenames[$params['old']];
 502+ if ( $drTime !== '?' ) {
 503+ if ( ( !isset( $params['endTime'] ) || $drTime < $params['endTime'] )
 504+ && $drTime > $params['startTime'] )
 505+ {
 506+ $this->moveLog[$newTitle]['timestamp'] = $drTime;
 507+ $this->moveLog[$newTitle]['deep'] = true;
 508+
 509+ echo "{$params['old']} -> $newTitle at $drTime\n";
 510+ unset( $deepRenames[$params['old']] );
 511+ continue;
 512+ } else {
 513+ echo "WARNING: deep rename time invalid: {$params['old']}\n";
 514+ unset( $deepRenames[$params['old']] );
 515+ }
 516+ }
 517+ }
 518+
 519+ // Guess that it is one second after the last edit to the page before it was moved
 520+ $this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1;
 521+ if ( $drTime === '?' ) {
 522+ $this->moveLog[$newTitle]['deep'] = true;
 523+ unset( $deepRenames[$params['old']] );
 524+ }
 525+ if ( isset( $params['endTime'] ) ) {
 526+ $this->printLatin1( "{$params['old']} -> $newTitle between " .
 527+ "{$params['startTime']} and {$params['endTime']}\n" );
 528+ } else {
 529+ $this->printLatin1( "{$params['old']} -> $newTitle after " .
 530+ "{$params['startTime']}\n" );
 531+ }
 532+ }
 533+
 534+ // Write the move log to the XML file
 535+ $id = 1;
 536+ foreach ( $this->moveLog as $newTitle => $params ) {
 537+ $out = "<logitem>\n" .
 538+ $this->element( 'id', $id++ ) .
 539+ $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) .
 540+ "<contributor>\n" .
 541+ $this->element( 'username', 'UseModWiki admin' ) .
 542+ "</contributor>" .
 543+ $this->element( 'type', 'move' ) .
 544+ $this->element( 'action', 'move' ) .
 545+ $this->element( 'logtitle', $params['old'] ) .
 546+ "<params xml:space=\"preserve\">" .
 547+ htmlspecialchars( $this->encode( "{$newTitle}\n1" ) ) .
 548+ "</params>\n" .
 549+ "</logitem>\n";
 550+ fwrite( $this->outFile, $out );
 551+ }
 552+
 553+ // Check for remaining deep rename entries
 554+ if ( $deepRenames ) {
 555+ echo "WARNING: the following entries in \$this->deepRenames are " .
 556+ "invalid, since no such move exists:\n" .
 557+ implode( "\n", array_keys( $deepRenames ) ) .
 558+ "\n\n";
 559+ }
 560+
 561+ }
 562+
 563+ function element( $name, $value ) {
 564+ return "<$name>" . htmlspecialchars( $this->encode( $value ) ) . "</$name>\n";
 565+ }
 566+
 567+ function moveLogCallback( $entry ) {
 568+ $rctitle = $entry['rctitle'];
 569+ $title = $entry['title'];
 570+ $this->moveDests[$rctitle] = $title;
 571+
 572+ if ( $rctitle === $title ) {
 573+ if ( isset( $this->moveLog[$rctitle] )
 574+ && !isset( $this->moveLog[$rctitle]['endTime'] ) )
 575+ {
 576+ // This is the latest time that the page could have been moved
 577+ $this->moveLog[$rctitle]['endTime'] = $entry['timestamp'];
 578+ }
 579+ } else {
 580+ if ( !isset( $this->moveLog[$rctitle] ) ) {
 581+ // Initialise the move log entry
 582+ $this->moveLog[$rctitle] = array(
 583+ 'old' => $title
 584+ );
 585+ }
 586+ // Update the earliest time the page could have been moved
 587+ $this->moveLog[$rctitle]['startTime'] = $entry['timestamp'];
 588+ }
 589+ }
 590+
 591+ function writeRevisions() {
 592+ $this->numGoodRevs = 0;
 593+ $this->revId = 1;
 594+ $this->processDiffFile( array( $this, 'revisionCallback' ) );
 595+ echo "\n\nImported {$this->numGoodRevs} out of {$this->numRevs}\n";
 596+ }
 597+
 598+ function revisionCallback( $params ) {
 599+ $origTitle = $params['title'];
 600+ $title = $params['rctitle'];
 601+ $editTime = $params['timestamp'];
 602+
 603+ if ( isset( $this->blacklist[$title] ) ) {
 604+ return;
 605+ }
 606+ $this->doPendingOps( $editTime );
 607+
 608+ $origText = $this->getText( $title );
 609+ $text = $this->patch( $origText, $params['diff'] );
 610+ if ( $text === false ) {
 611+ echo "$editTime $title attempting resolution...\n";
 612+ $linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] );
 613+ if ( !$linkSubstitutes ) {
 614+ $this->printLatin1( "$editTime $title DIFF FAILED\n" );
 615+ $this->blacklist[$title] = true;
 616+ return;
 617+ }
 618+ $this->printLatin1( "$editTime $title requires substitutions:\n" );
 619+ $time = $editTime - 1;
 620+ foreach ( $linkSubstitutes as $old => $new ) {
 621+ $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
 622+ $this->renameTextLinks( $old, $new, $time-- );
 623+ }
 624+ $origText = $this->getText( $title );
 625+ $text = $this->patch( $origText, $params['diff'] );
 626+ if ( $text === false ) {
 627+ $this->printLatin1( "$editTime $title STILL FAILS!\n" );
 628+ $this->blacklist[$title] = true;
 629+ return;
 630+ }
 631+
 632+ echo "\n";
 633+ }
 634+
 635+ $params['text'] = $text;
 636+ $this->saveRevision( $params );
 637+ $this->numGoodRevs++;
 638+ #$this->printLatin1( "$editTime $title\n" );
 639+ }
 640+
 641+ function doPendingOps( $editTime ) {
 642+ foreach ( $this->moveLog as $newTitle => $entry ) {
 643+ if ( $entry['timestamp'] <= $editTime ) {
 644+ unset( $this->moveLog[$newTitle] );
 645+ if ( isset( $entry['deep'] ) ) {
 646+ $this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] );
 647+ }
 648+ }
 649+ }
 650+
 651+ foreach ( $this->renameTextLinksOps as $renameTime => $replacements ) {
 652+ if ( $editTime >= $renameTime ) {
 653+ foreach ( $replacements as $old => $new ) {
 654+ $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
 655+ $this->renameTextLinks( $old, $new, $renameTime );
 656+ }
 657+ unset( $this->renameTextLinksOps[$renameTime] );
 658+ }
 659+ }
 660+
 661+ foreach ( $this->unixLineEndingsOps as $fixTime => $title ) {
 662+ if ( $editTime >= $fixTime ) {
 663+ $this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" );
 664+ $text = $this->getText( $title );
 665+ $text = str_replace( "\r", '', $text );
 666+ $this->saveRevision( array(
 667+ 'rctitle' => $title,
 668+ 'timestamp' => $fixTime,
 669+ 'extra' => array( 'name' => 'UseModWiki admin' ),
 670+ 'text' => $text,
 671+ 'summary' => 'Fixing line endings',
 672+ ) );
 673+ unset( $this->unixLineEndingsOps[$fixTime] );
 674+ }
 675+ }
 676+ }
 677+
 678+ function patch( $source, $diff ) {
 679+ file_put_contents( $this->articleFileName, $source );
 680+ file_put_contents( $this->patchFileName, $diff );
 681+ $error = wfShellExec(
 682+ wfEscapeShellArg(
 683+ 'patch',
 684+ '-n',
 685+ '-r', '-',
 686+ '--no-backup-if-mismatch',
 687+ '--binary',
 688+ $this->articleFileName,
 689+ $this->patchFileName
 690+ ) . ' 2>&1',
 691+ $status
 692+ );
 693+ $text = file_get_contents( $this->articleFileName );
 694+ if ( $status || $text === false ) {
 695+ return false;
 696+ } else {
 697+ return $text;
 698+ }
 699+ }
 700+
 701+ function resolveFailedDiff( $origText, $diff ) {
 702+ $context = array();
 703+ $rxRange = '\d+(?:,(\d+))?';
 704+ $diffLines = explode( "\n", $diff );
 705+ for ( $i = 0; $i < count( $diffLines ); $i++ ) {
 706+ $diffLine = $diffLines[$i];
 707+ if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLine, $m ) ) {
 708+ continue;
 709+ }
 710+
 711+ $sourceIndex = intval( $m[1] );
 712+ $i++;
 713+ while ( $i < count( $diffLines ) && substr( $diffLines[$i], 0, 1 ) === '<' ) {
 714+ $context[$sourceIndex - 1] = substr( $diffLines[$i], 2 );
 715+ $sourceIndex++;
 716+ $i++;
 717+ }
 718+ $i--;
 719+ }
 720+
 721+ $changedLinks = array();
 722+ $origLines = explode( "\n", $origText );
 723+ foreach ( $context as $i => $contextLine ) {
 724+ $origLine = isset( $origLines[$i] ) ? $origLines[$i] : '';
 725+ if ( $contextLine === $origLine ) {
 726+ continue;
 727+ }
 728+ $newChanges = $this->resolveTextChange( $origLine, $contextLine );
 729+ if ( is_array( $newChanges ) ) {
 730+ $changedLinks += $newChanges;
 731+ } else {
 732+ echo "Resolution failure on line " . ( $i + 1 ) . "\n";
 733+ $this->printLatin1( $newChanges );
 734+ }
 735+ }
 736+
 737+ return $changedLinks;
 738+ }
 739+
 740+ function resolveTextChange( $source, $dest ) {
 741+ $changedLinks = array();
 742+ $sourceLinks = $this->getLinkList( $source );
 743+ $destLinks = $this->getLinkList( $dest );
 744+ $newLinks = array_diff( $destLinks, $sourceLinks );
 745+ $removedLinks = array_diff( $sourceLinks, $destLinks );
 746+
 747+ // Match up the removed links with the new links
 748+ foreach ( $newLinks as $j => $newLink ) {
 749+ $minDistance = 100000000;
 750+ $bestRemovedLink = false;
 751+ foreach ( $removedLinks as $k => $removedLink ) {
 752+ $editDistance = levenshtein( $newLink, $removedLink );
 753+ if ( $editDistance < $minDistance ) {
 754+ $minDistance = $editDistance;
 755+ $bestRemovedLink = $removedLink;
 756+ }
 757+ }
 758+ if ( $bestRemovedLink !== false ) {
 759+ $changedLinks[$bestRemovedLink] = $newLink;
 760+ $newLinks = array_diff( $newLinks, array( $newLink ) );
 761+ $removedLinks = array_diff( $removedLinks, array( $bestRemovedLink ) );
 762+ }
 763+ }
 764+
 765+ $proposal = $source;
 766+ foreach ( $changedLinks as $removedLink => $newLink ) {
 767+ $proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal );
 768+ }
 769+ if ( $proposal !== $dest ) {
 770+ // Resolution failed
 771+ $msg = "Source line: $source\n" .
 772+ "Source links: " . implode( ', ', $sourceLinks ) . "\n" .
 773+ "Context line: $dest\n" .
 774+ "Context links: " . implode( ', ', $destLinks ) . "\n" .
 775+ "Proposal: $proposal\n";
 776+ return $msg;
 777+ }
 778+ return $changedLinks;
 779+ }
 780+
 781+ function processDiffFile( $callback ) {
 782+ $diffFile = fopen( "{$this->dataDir}/diff_log", 'r' );
 783+
 784+ $delimiter = "------\n";
 785+ file_put_contents( $this->articleFileName, "Describe the new page here.\n" );
 786+
 787+ $line = fgets( $diffFile );
 788+ $lineNum = 1;
 789+ if ( $line !== $delimiter ) {
 790+ echo "Invalid diff file\n";
 791+ return false;
 792+ }
 793+ $lastReportLine = 0;
 794+ $this->numRevs = 0;
 795+
 796+ while ( true ) {
 797+ $line = fgets( $diffFile );
 798+ $lineNum++;
 799+ if ( $line === false ) {
 800+ break;
 801+ }
 802+ if ( $lineNum > $lastReportLine + 1000 ) {
 803+ $lastReportLine = $lineNum;
 804+ fwrite( STDERR, "$lineNum \r" );
 805+ fflush( STDERR );
 806+ }
 807+ $line = trim( $line );
 808+ if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
 809+ echo "Invalid header on line $lineNum\n";
 810+ return true;
 811+ }
 812+ list( , $title, $editTime ) = $matches;
 813+
 814+ $diff = '';
 815+ $diffStartLine = $lineNum;
 816+ while ( true ) {
 817+ $line = fgets( $diffFile );
 818+ $lineNum++;
 819+ if ( $line === $delimiter ) {
 820+ break;
 821+ }
 822+ if ( $line === false ) {
 823+ break 2;
 824+ }
 825+ $diff .= $line;
 826+ }
 827+
 828+ $this->numRevs++;
 829+
 830+ if ( !isset( $this->rc[$editTime] ) ) {
 831+ $this->printLatin1( "$editTime $title DELETED, skipping\n" );
 832+ continue;
 833+ }
 834+
 835+ if ( count( $this->rc[$editTime] ) == 1 ) {
 836+ $params = $this->rc[$editTime][0];
 837+ } else {
 838+ $params = false;
 839+ $candidates = '';
 840+ foreach ( $this->rc[$editTime] as $rc ) {
 841+ if ( $rc['rctitle'] === $title ) {
 842+ $params = $rc;
 843+ break;
 844+ }
 845+ if ( $candidates === '' ) {
 846+ $candidates = $rc['rctitle'];
 847+ } else {
 848+ $candidates .= ', ' . $rc['rctitle'];
 849+ }
 850+ }
 851+ if ( !$params ) {
 852+ $this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
 853+ $this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" );
 854+ continue;
 855+ }
 856+ }
 857+ $params['diff'] = $diff;
 858+ $params['title'] = $title;
 859+ $params['diffStartLine'] = $diffStartLine;
 860+ call_user_func( $callback, $params );
 861+ }
 862+ echo "\n";
 863+
 864+ if ( !feof( $diffFile ) ) {
 865+ echo "Stopped at line $lineNum\n";
 866+ }
 867+ return true;
 868+ }
 869+
 870+ function reconcileCurrentRevs() {
 871+ foreach ( $this->textCache as $title => $text ) {
 872+ $fileName = "{$this->dataDir}/page/";
 873+ if ( preg_match( '/^[A-Z]/', $title, $m ) ) {
 874+ $fileName .= $m[0];
 875+ } else {
 876+ $fileName .= 'other';
 877+ }
 878+ $fileName .= "/$title.db";
 879+
 880+ if ( !file_exists( $fileName ) ) {
 881+ $this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
 882+ continue;
 883+ }
 884+
 885+ $fileContents = file_get_contents( $fileName );
 886+ $page = $this->unserializeUseMod( $fileContents, $this->FS1 );
 887+ $section = $this->unserializeUseMod( $page['text_default'], $this->FS2 );
 888+ $data = $this->unserializeUseMod( $section['data'], $this->FS3 );
 889+ $pageText = $data['text'];
 890+ if ( $text !== $pageText ) {
 891+ $substs = $this->resolveTextChange( $text, $pageText );
 892+ if ( is_array( $substs ) ) {
 893+ foreach ( $substs as $source => $dest ) {
 894+ if ( isset( $this->moveLog[$dest] ) ) {
 895+ $this->printLatin1( "ERROR: need deep rename: $source\n" );
 896+ } else {
 897+ $this->printLatin1( "ERROR: need substitute: $source -> $dest\n" );
 898+ }
 899+ }
 900+ } else {
 901+ $this->printLatin1( "ERROR: unresolved diff in $title:\n" );
 902+ wfSuppressWarnings();
 903+ $diff = xdiff_string_diff( $text, $pageText ) . '';
 904+ wfRestoreWarnings();
 905+ $this->printLatin1( "$diff\n" );
 906+ }
 907+ }
 908+ }
 909+ }
 910+
 911+ function makeTitle( $titleText ) {
 912+ return Title::newFromText( $this->encode( $titleText ) );
 913+ }
 914+
 915+ function getText( $titleText ) {
 916+ if ( !isset( $this->textCache[$titleText] ) ) {
 917+ return "Describe the new page here.\n";
 918+ } else {
 919+ return $this->textCache[$titleText];
 920+ }
 921+ }
 922+
 923+ function saveRevision( $params ) {
 924+ $this->textCache[$params['rctitle']] = $params['text'];
 925+
 926+ $out = "<page>\n" .
 927+ $this->element( 'title', $params['rctitle'] ) .
 928+ "<revision>\n" .
 929+ $this->element( 'id', $this->revId ++ ) .
 930+ $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) .
 931+ "<contributor>\n";
 932+ if ( isset( $params['extra']['name'] ) ) {
 933+ $out .= $this->element( 'username', $params['extra']['name'] );
 934+ }
 935+ if ( isset( $params['extra']['id'] ) ) {
 936+ $out .= $this->element( 'id', $params['extra']['id'] );
 937+ }
 938+ if ( isset( $params['host'] ) ) {
 939+ $out .= $this->element( 'ip', $params['host'] );
 940+ }
 941+ $out .=
 942+ "</contributor>\n" .
 943+ $this->element( 'comment', $params['summary'] ) .
 944+ "<text xml:space=\"preserve\">" .
 945+ htmlspecialchars( $this->encode( $params['text'] ) ) .
 946+ "</text>\n" .
 947+ "</revision>\n" .
 948+ "</page>\n";
 949+ fwrite( $this->outFile, $out );
 950+ }
 951+
 952+ function renameTextLinks( $old, $new, $timestamp ) {
 953+ $newWithUnderscores = $new;
 954+ $old = str_replace( '_', ' ', $old );
 955+ $new = str_replace( '_', ' ', $new );
 956+
 957+ foreach ( $this->textCache as $title => $oldText ) {
 958+ if ( $newWithUnderscores === $title
 959+ && in_array( $title, $this->skipSelfSubstitution ) )
 960+ {
 961+ // Hack to make Pythagorean_Theorem etc. work
 962+ continue;
 963+ }
 964+
 965+ $newText = $this->substituteTextLinks( $old, $new, $oldText );
 966+ if ( $oldText !== $newText ) {
 967+ $this->saveRevision( array(
 968+ 'rctitle' => $title,
 969+ 'timestamp' => $timestamp,
 970+ 'text' => $newText,
 971+ 'extra' => array( 'name' => 'Page move link fixup script' ),
 972+ 'summary' => '',
 973+ 'minor' => true
 974+ ) );
 975+ }
 976+ }
 977+ }
 978+
 979+ function substituteTextLinks( $old, $new, $text ) {
 980+ $this->saveUrl = array();
 981+ $this->old = $old;
 982+ $this->new = $new;
 983+
 984+ $text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
 985+ $text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
 986+ array( $this, 'storeRaw' ), $text );
 987+ $text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
 988+ array( $this, 'storeRaw' ), $text );
 989+ $text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
 990+ array( $this, 'storeRaw' ), $text );
 991+
 992+ $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
 993+ array( $this, 'subFreeLink' ), $text );
 994+ $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
 995+ array( $this, 'subFreeLink' ), $text );
 996+ $text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
 997+ array( $this, 'storeRaw' ), $text );
 998+ $text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
 999+ array( $this, 'storeRaw' ), $text );
 1000+ $text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
 1001+ array( $this, 'storeRaw' ), $text );
 1002+ $text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
 1003+ array( $this, 'storeRaw' ), $text );
 1004+ $text = preg_replace_callback( "/{$this->LinkPattern}/",
 1005+ array( $this, 'subWikiLink' ), $text );
 1006+
 1007+ $text = preg_replace_callback( "/{$this->FS}(\d+){$this->FS}/",
 1008+ array( $this, 'restoreRaw' ), $text ); # Restore saved text
 1009+ return $text;
 1010+ }
 1011+
 1012+ function getLinkList( $text ) {
 1013+ $this->saveUrl = array();
 1014+ $this->linkList = array();
 1015+
 1016+ $text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
 1017+ $text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
 1018+ array( $this, 'storeRaw' ), $text );
 1019+ $text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
 1020+ array( $this, 'storeRaw' ), $text );
 1021+ $text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
 1022+ array( $this, 'storeRaw' ), $text );
 1023+
 1024+ $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
 1025+ array( $this, 'storeLink' ), $text );
 1026+ $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
 1027+ array( $this, 'storeLink' ), $text );
 1028+ $text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
 1029+ array( $this, 'storeRaw' ), $text );
 1030+ $text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
 1031+ array( $this, 'storeRaw' ), $text );
 1032+ $text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
 1033+ array( $this, 'storeRaw' ), $text );
 1034+ $text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
 1035+ array( $this, 'storeRaw' ), $text );
 1036+ $text = preg_replace_callback( "/{$this->LinkPattern}/",
 1037+ array( $this, 'storeLink' ), $text );
 1038+
 1039+ return $this->linkList;
 1040+ }
 1041+
 1042+ function storeRaw( $m ) {
 1043+ $this->saveUrl[] = $m[1];
 1044+ return $this->FS . (count( $this->saveUrl ) - 1) . $this->FS;
 1045+ }
 1046+
 1047+ function subFreeLink( $m ) {
 1048+ $link = $m[1];
 1049+ if ( isset( $m[2] ) ) {
 1050+ $name = $m[2];
 1051+ } else {
 1052+ $name = '';
 1053+ }
 1054+ $oldlink = $link;
 1055+ $link = preg_replace( '/^\s+/', '', $link );
 1056+ $link = preg_replace( '/\s+$/', '', $link );
 1057+ if ( $link == $this->old ) {
 1058+ $link = $this->new;
 1059+ } else {
 1060+ $link = $oldlink; # Preserve spaces if no match
 1061+ }
 1062+ $link = "[[$link";
 1063+ if ( $name !== "" ) {
 1064+ $link .= "|$name";
 1065+ }
 1066+ $link .= "]]";
 1067+ return $this->storeRaw( array( 1 => $link ) );
 1068+ }
 1069+
 1070+ function subWikiLink( $m ) {
 1071+ $link = $m[1];
 1072+ if ( $link == $this->old ) {
 1073+ $link = $this->new;
 1074+ if ( !preg_match( "/^{$this->LinkPattern}$/", $this->new ) ) {
 1075+ $link = "[[$link]]";
 1076+ }
 1077+ }
 1078+ return $this->storeRaw( array( 1 => $link ) );
 1079+ }
 1080+
 1081+ function restoreRaw( $m ) {
 1082+ return $this->saveUrl[$m[1]];
 1083+ }
 1084+
 1085+ function storeLink( $m ) {
 1086+ $this->linkList[] = $m[1];
 1087+ return $this->storeRaw( $m );
 1088+ }
 1089+
 1090+ function encode( $s ) {
 1091+ return strtr( $s, $this->encodeMap );
 1092+ }
 1093+
 1094+ function decode( $s ) {
 1095+ return strtr( $s, $this->decodeMap );
 1096+ }
 1097+
 1098+ function printLatin1( $s ) {
 1099+ echo $this->encode( $s );
 1100+ }
 1101+
 1102+ function unserializeUseMod( $s, $sep ) {
 1103+ $parts = explode( $sep, $s );
 1104+ $result = array();
 1105+ for ( $i = 0; $i < count( $parts ); $i += 2 ) {
 1106+ $result[$parts[$i]] = $parts[$i+1];
 1107+ }
 1108+ return $result;
 1109+ }
 1110+}
 1111+
 1112+$maintClass = 'ImportUseModWikipedia';
 1113+require_once( DO_MAINTENANCE );
Property changes on: trunk/phase3/maintenance/importUseModWikipedia.php
___________________________________________________________________
Added: svn:eol-style
   + native

Comments

#Comment by Platonides (talk | contribs)   22:20, 21 December 2010

PHP 5.2 Parse error: syntax error, unexpected T_START_HEREDOC in ./maintenance/importUseModWikipedia.php on line 98

#Comment by Tim Starling (talk | contribs)   00:14, 28 January 2011

Was fixed in r78842, so setting back to new.

#Comment by Bryan (talk | contribs)   12:53, 10 March 2011

Marking deferred.

Status & tagging log