Index: trunk/phase3/maintenance/importUseModWikipedia.php |
— | — | @@ -0,0 +1,1112 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +/** |
| 5 | + * A script to read a dump of the English Wikipedia from the UseModWiki period, and to |
| 6 | + * generate an XML dump in MediaWiki format. |
| 7 | + * |
| 8 | + * Some relevant code was ported from UseModWiki 0.92. |
| 9 | + * |
| 10 | + */ |
| 11 | + |
| 12 | +require_once( dirname( __FILE__ ) . '/Maintenance.php' ); |
| 13 | +require_once( dirname( __FILE__ ) .'/../includes/normal/UtfNormalUtil.php' ); |
| 14 | + |
| 15 | + |
| 16 | +class ImportUseModWikipedia extends Maintenance { |
| 17 | + var $encodeMap, $decodeMap; |
| 18 | + |
| 19 | + var $deepRenames = array( |
| 20 | + 'JimboWales' => 983862286, |
| 21 | + 'TexaS' => 983918410, |
| 22 | + 'HistoryOfUnitedStatesTalk' => 984795423, |
| 23 | + 'MetallicA' => 985128533, |
| 24 | + 'PythagoreanTheorem' => 985225545, |
| 25 | + 'TheCanonofScripture' => 985368223, |
| 26 | + 'TaoTehChing' => 985368222, |
| 27 | + //'TheMostRemarkableFormulaInTheWorld' => 985368221, |
| 28 | + 'TheRecorder' => 985368220, |
| 29 | + 'GladstoneOregon' => 985368219, |
| 30 | + #'UnitedStatesConstitution/AmendmentTwo' => |
| 31 | + ); |
| 32 | + |
| 33 | + var $replacements = array(); |
| 34 | + |
| 35 | + var $renameTextLinksOps = array( |
| 36 | + 983846265 => array( |
| 37 | + 'TestIgnore' => 'IgnoreTest', |
| 38 | + ), |
| 39 | + 983848080 => array( |
| 40 | + 'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works' |
| 41 | + ), |
| 42 | + 983856376 => array( |
| 43 | + 'WikiPedia' => 'Wikipedia', |
| 44 | + ), |
| 45 | + 983896152 => array( |
| 46 | + 'John_F_Kennedy' => 'John_F._Kennedy', |
| 47 | + ), |
| 48 | + 983905871 => array( |
| 49 | + 'LarrySanger' => 'Larry_Sanger' |
| 50 | + ), |
| 51 | + 984697068 => array( |
| 52 | + 'UnitedStates' => 'United States', |
| 53 | + ), |
| 54 | + 984792748 => array( |
| 55 | + 'LibertarianisM' => 'Libertarianism' |
| 56 | + ), |
| 57 | + 985327832 => array( |
| 58 | + 'AnarchisM' => 'Anarchism', |
| 59 | + ), |
| 60 | + 985290063 => array( |
| 61 | + 'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion' |
| 62 | + ), |
| 63 | + 985290091 => array( |
| 64 | + 'BritishEmpire' => 'British Empire' |
| 65 | + ), |
| 66 | + /* |
| 67 | + 985468958 => array( |
| 68 | + 'ScienceFiction' => 'Science fiction', |
| 69 | + ),*/ |
| 70 | + ); |
| 71 | + |
| 72 | + /** |
| 73 | + * Hack for observed substitution issues |
| 74 | + */ |
| 75 | + var $skipSelfSubstitution = array( |
| 76 | + 'Pythagorean_Theorem', |
| 77 | + 'The_Most_Remarkable_Formula_In_The_World', |
| 78 | + 'Wine', |
| 79 | + ); |
| 80 | + |
| 81 | + var $unixLineEndingsOps = array( |
| 82 | + 987743732 => 'Wikipedia_FAQ' |
| 83 | + ); |
| 84 | + |
| 85 | + var $replacementsDone = array(); |
| 86 | + |
| 87 | + var $moveLog = array(); |
| 88 | + var $moveDests = array(); |
| 89 | + var $revId; |
| 90 | + |
| 91 | + var $rc = array(); |
| 92 | + var $textCache = array(); |
| 93 | + var $blacklist = array(); |
| 94 | + |
| 95 | + var $FS, $FS1, $FS2, $FS3; |
| 96 | + var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern; |
| 97 | + |
| 98 | + var $cp1252Table = <<<EOT |
| 99 | +0x00 0x0000 |
| 100 | +0x01 0x0001 |
| 101 | +0x02 0x0002 |
| 102 | +0x03 0x0003 |
| 103 | +0x04 0x0004 |
| 104 | +0x05 0x0005 |
| 105 | +0x06 0x0006 |
| 106 | +0x07 0x0007 |
| 107 | +0x08 0x0008 |
| 108 | +0x09 0x0009 |
| 109 | +0x0a 0x000a |
| 110 | +0x0b 0x000b |
| 111 | +0x0c 0x000c |
| 112 | +0x0d 0x000d |
| 113 | +0x0e 0x000e |
| 114 | +0x0f 0x000f |
| 115 | +0x10 0x0010 |
| 116 | +0x11 0x0011 |
| 117 | +0x12 0x0012 |
| 118 | +0x13 0x0013 |
| 119 | +0x14 0x0014 |
| 120 | +0x15 0x0015 |
| 121 | +0x16 0x0016 |
| 122 | +0x17 0x0017 |
| 123 | +0x18 0x0018 |
| 124 | +0x19 0x0019 |
| 125 | +0x1a 0x001a |
| 126 | +0x1b 0x001b |
| 127 | +0x1c 0x001c |
| 128 | +0x1d 0x001d |
| 129 | +0x1e 0x001e |
| 130 | +0x1f 0x001f |
| 131 | +0x20 0x0020 |
| 132 | +0x21 0x0021 |
| 133 | +0x22 0x0022 |
| 134 | +0x23 0x0023 |
| 135 | +0x24 0x0024 |
| 136 | +0x25 0x0025 |
| 137 | +0x26 0x0026 |
| 138 | +0x27 0x0027 |
| 139 | +0x28 0x0028 |
| 140 | +0x29 0x0029 |
| 141 | +0x2a 0x002a |
| 142 | +0x2b 0x002b |
| 143 | +0x2c 0x002c |
| 144 | +0x2d 0x002d |
| 145 | +0x2e 0x002e |
| 146 | +0x2f 0x002f |
| 147 | +0x30 0x0030 |
| 148 | +0x31 0x0031 |
| 149 | +0x32 0x0032 |
| 150 | +0x33 0x0033 |
| 151 | +0x34 0x0034 |
| 152 | +0x35 0x0035 |
| 153 | +0x36 0x0036 |
| 154 | +0x37 0x0037 |
| 155 | +0x38 0x0038 |
| 156 | +0x39 0x0039 |
| 157 | +0x3a 0x003a |
| 158 | +0x3b 0x003b |
| 159 | +0x3c 0x003c |
| 160 | +0x3d 0x003d |
| 161 | +0x3e 0x003e |
| 162 | +0x3f 0x003f |
| 163 | +0x40 0x0040 |
| 164 | +0x41 0x0041 |
| 165 | +0x42 0x0042 |
| 166 | +0x43 0x0043 |
| 167 | +0x44 0x0044 |
| 168 | +0x45 0x0045 |
| 169 | +0x46 0x0046 |
| 170 | +0x47 0x0047 |
| 171 | +0x48 0x0048 |
| 172 | +0x49 0x0049 |
| 173 | +0x4a 0x004a |
| 174 | +0x4b 0x004b |
| 175 | +0x4c 0x004c |
| 176 | +0x4d 0x004d |
| 177 | +0x4e 0x004e |
| 178 | +0x4f 0x004f |
| 179 | +0x50 0x0050 |
| 180 | +0x51 0x0051 |
| 181 | +0x52 0x0052 |
| 182 | +0x53 0x0053 |
| 183 | +0x54 0x0054 |
| 184 | +0x55 0x0055 |
| 185 | +0x56 0x0056 |
| 186 | +0x57 0x0057 |
| 187 | +0x58 0x0058 |
| 188 | +0x59 0x0059 |
| 189 | +0x5a 0x005a |
| 190 | +0x5b 0x005b |
| 191 | +0x5c 0x005c |
| 192 | +0x5d 0x005d |
| 193 | +0x5e 0x005e |
| 194 | +0x5f 0x005f |
| 195 | +0x60 0x0060 |
| 196 | +0x61 0x0061 |
| 197 | +0x62 0x0062 |
| 198 | +0x63 0x0063 |
| 199 | +0x64 0x0064 |
| 200 | +0x65 0x0065 |
| 201 | +0x66 0x0066 |
| 202 | +0x67 0x0067 |
| 203 | +0x68 0x0068 |
| 204 | +0x69 0x0069 |
| 205 | +0x6a 0x006a |
| 206 | +0x6b 0x006b |
| 207 | +0x6c 0x006c |
| 208 | +0x6d 0x006d |
| 209 | +0x6e 0x006e |
| 210 | +0x6f 0x006f |
| 211 | +0x70 0x0070 |
| 212 | +0x71 0x0071 |
| 213 | +0x72 0x0072 |
| 214 | +0x73 0x0073 |
| 215 | +0x74 0x0074 |
| 216 | +0x75 0x0075 |
| 217 | +0x76 0x0076 |
| 218 | +0x77 0x0077 |
| 219 | +0x78 0x0078 |
| 220 | +0x79 0x0079 |
| 221 | +0x7a 0x007a |
| 222 | +0x7b 0x007b |
| 223 | +0x7c 0x007c |
| 224 | +0x7d 0x007d |
| 225 | +0x7e 0x007e |
| 226 | +0x7f 0x007f |
| 227 | +0x80 0x20ac |
| 228 | +0x81 0x0081 |
| 229 | +0x82 0x201a |
| 230 | +0x83 0x0192 |
| 231 | +0x84 0x201e |
| 232 | +0x85 0x2026 |
| 233 | +0x86 0x2020 |
| 234 | +0x87 0x2021 |
| 235 | +0x88 0x02c6 |
| 236 | +0x89 0x2030 |
| 237 | +0x8a 0x0160 |
| 238 | +0x8b 0x2039 |
| 239 | +0x8c 0x0152 |
| 240 | +0x8d 0x008d |
| 241 | +0x8e 0x017d |
| 242 | +0x8f 0x008f |
| 243 | +0x90 0x0090 |
| 244 | +0x91 0x2018 |
| 245 | +0x92 0x2019 |
| 246 | +0x93 0x201c |
| 247 | +0x94 0x201d |
| 248 | +0x95 0x2022 |
| 249 | +0x96 0x2013 |
| 250 | +0x97 0x2014 |
| 251 | +0x98 0x02dc |
| 252 | +0x99 0x2122 |
| 253 | +0x9a 0x0161 |
| 254 | +0x9b 0x203a |
| 255 | +0x9c 0x0153 |
| 256 | +0x9d 0x009d |
| 257 | +0x9e 0x017e |
| 258 | +0x9f 0x0178 |
| 259 | +0xa0 0x00a0 |
| 260 | +0xa1 0x00a1 |
| 261 | +0xa2 0x00a2 |
| 262 | +0xa3 0x00a3 |
| 263 | +0xa4 0x00a4 |
| 264 | +0xa5 0x00a5 |
| 265 | +0xa6 0x00a6 |
| 266 | +0xa7 0x00a7 |
| 267 | +0xa8 0x00a8 |
| 268 | +0xa9 0x00a9 |
| 269 | +0xaa 0x00aa |
| 270 | +0xab 0x00ab |
| 271 | +0xac 0x00ac |
| 272 | +0xad 0x00ad |
| 273 | +0xae 0x00ae |
| 274 | +0xaf 0x00af |
| 275 | +0xb0 0x00b0 |
| 276 | +0xb1 0x00b1 |
| 277 | +0xb2 0x00b2 |
| 278 | +0xb3 0x00b3 |
| 279 | +0xb4 0x00b4 |
| 280 | +0xb5 0x00b5 |
| 281 | +0xb6 0x00b6 |
| 282 | +0xb7 0x00b7 |
| 283 | +0xb8 0x00b8 |
| 284 | +0xb9 0x00b9 |
| 285 | +0xba 0x00ba |
| 286 | +0xbb 0x00bb |
| 287 | +0xbc 0x00bc |
| 288 | +0xbd 0x00bd |
| 289 | +0xbe 0x00be |
| 290 | +0xbf 0x00bf |
| 291 | +0xc0 0x00c0 |
| 292 | +0xc1 0x00c1 |
| 293 | +0xc2 0x00c2 |
| 294 | +0xc3 0x00c3 |
| 295 | +0xc4 0x00c4 |
| 296 | +0xc5 0x00c5 |
| 297 | +0xc6 0x00c6 |
| 298 | +0xc7 0x00c7 |
| 299 | +0xc8 0x00c8 |
| 300 | +0xc9 0x00c9 |
| 301 | +0xca 0x00ca |
| 302 | +0xcb 0x00cb |
| 303 | +0xcc 0x00cc |
| 304 | +0xcd 0x00cd |
| 305 | +0xce 0x00ce |
| 306 | +0xcf 0x00cf |
| 307 | +0xd0 0x00d0 |
| 308 | +0xd1 0x00d1 |
| 309 | +0xd2 0x00d2 |
| 310 | +0xd3 0x00d3 |
| 311 | +0xd4 0x00d4 |
| 312 | +0xd5 0x00d5 |
| 313 | +0xd6 0x00d6 |
| 314 | +0xd7 0x00d7 |
| 315 | +0xd8 0x00d8 |
| 316 | +0xd9 0x00d9 |
| 317 | +0xda 0x00da |
| 318 | +0xdb 0x00db |
| 319 | +0xdc 0x00dc |
| 320 | +0xdd 0x00dd |
| 321 | +0xde 0x00de |
| 322 | +0xdf 0x00df |
| 323 | +0xe0 0x00e0 |
| 324 | +0xe1 0x00e1 |
| 325 | +0xe2 0x00e2 |
| 326 | +0xe3 0x00e3 |
| 327 | +0xe4 0x00e4 |
| 328 | +0xe5 0x00e5 |
| 329 | +0xe6 0x00e6 |
| 330 | +0xe7 0x00e7 |
| 331 | +0xe8 0x00e8 |
| 332 | +0xe9 0x00e9 |
| 333 | +0xea 0x00ea |
| 334 | +0xeb 0x00eb |
| 335 | +0xec 0x00ec |
| 336 | +0xed 0x00ed |
| 337 | +0xee 0x00ee |
| 338 | +0xef 0x00ef |
| 339 | +0xf0 0x00f0 |
| 340 | +0xf1 0x00f1 |
| 341 | +0xf2 0x00f2 |
| 342 | +0xf3 0x00f3 |
| 343 | +0xf4 0x00f4 |
| 344 | +0xf5 0x00f5 |
| 345 | +0xf6 0x00f6 |
| 346 | +0xf7 0x00f7 |
| 347 | +0xf8 0x00f8 |
| 348 | +0xf9 0x00f9 |
| 349 | +0xfa 0x00fa |
| 350 | +0xfb 0x00fb |
| 351 | +0xfc 0x00fc |
| 352 | +0xfd 0x00fd |
| 353 | +0xfe 0x00fe |
| 354 | +0xff 0x00ff |
| 355 | +EOT; |
/**
 * Register the command-line options, initialise the link patterns and
 * build the two character maps from $this->cp1252Table.
 *
 * Every usable table row holds two hexadecimal numbers: a single-byte
 * source code and the code point it corresponds to. $this->encodeMap maps
 * the source byte to the UTF-8 form produced by codepointToUtf8(), and
 * $this->decodeMap is the reverse mapping.
 */
public function __construct() {
	parent::__construct();
	$this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true );
	$this->addOption( 'outfile', 'the name of the output XML file', true, true );
	$this->initLinkPatterns();

	$this->encodeMap = array();
	$this->decodeMap = array();
	foreach ( explode( "\n", $this->cp1252Table ) as $line ) {
		// Accept any run of whitespace between the two columns, and skip
		// blank or malformed rows rather than raising undefined-offset notices.
		$fields = preg_split( '/\s+/', trim( $line ) );
		if ( count( $fields ) !== 2 ) {
			continue;
		}
		list( $source, $dest ) = $fields;

		// substr() drops the "0x" prefix; hexdec() yields an integer, which
		// is the argument type chr() is documented to take.
		$sourceChar = chr( hexdec( substr( $source, 2 ) ) );
		$destChar = codepointToUtf8( hexdec( substr( $dest, 2 ) ) );
		$this->encodeMap[$sourceChar] = $destChar;
		$this->decodeMap[$destChar] = $sourceChar;
	}
}
| 371 | + |
/**
 * Set up the field separator strings and the link-matching regex fragments.
 *
 * Assigns $this->FS, $this->FS1, $this->FS2, $this->FS3 and the four
 * pattern properties LinkPattern, InterLinkPattern, FreeLinkPattern and
 * UrlPattern. The patterns are stored without regex delimiters.
 */
function initLinkPatterns() {
	# Field separators are used in the URL-style patterns below.
	$this->FS = "\xb3"; # The FS character is a superscript "3"
	$this->FS1 = $this->FS . "1"; # The FS values are used to separate fields
	$this->FS2 = $this->FS . "2"; # in stored hashtables and other data structures.
	$this->FS3 = $this->FS . "3"; # The FS character is not allowed in user data.

	$UpperLetter = "[A-Z]";
	$LowerLetter = "[a-z]";
	$AnyLetter = "[A-Za-z_0-9]";

	# Main link pattern: lowercase between uppercase, then anything
	$LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter
		. $AnyLetter . "*";
	# Optional subpage link pattern: uppercase, lowercase, then anything
	$LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*";

	# Loose pattern: If subpage is used, subpage may be simple name
	$QDelim = '(?:"")?'; # Optional quote delimiter (not in output)
	$this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)" . $QDelim;

	# Inter-site convention: sites must start with uppercase letter
	# (Uppercase letter avoids confusion with URLs)
	$InterSitePattern = $UpperLetter . $AnyLetter . "+";
	$this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)";

	# Free links use a wider character class and may carry an optional
	# "parent/" prefix before the page name.
	$AnyLetter = "[-,. _0-9A-Za-z]";
	$this->FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)" . $QDelim;

	# Url-style links are delimited by one of:
	# 1. Whitespace (kept in output)
	# 2. Left or right angle-bracket (< or >) (kept in output)
	# 3. Right square-bracket (]) (kept in output)
	# 4. A single double-quote (") (kept in output)
	# 5. A $FS (field separator) character (kept in output)
	# 6. A double double-quote ("") (removed from output)

	$UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|"
		. "prospero|telnet|gopher"
		. '|file';
	$this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
}
| 422 | + |
/**
 * Entry point: read the UseModWiki data directory given by --datadir and
 * write the XML dump to the file given by --outfile.
 *
 * Sets $this->dataDir, $this->outFile, $this->articleFileName and
 * $this->patchFileName for use by the other methods. The two scratch files
 * are removed again once the run completes.
 *
 * @return int 0 on success, 1 if the output file or the scratch files
 *   could not be created
 */
function execute() {
	$this->dataDir = $this->getOption( 'datadir' );
	$this->outFile = fopen( $this->getOption( 'outfile' ), 'w' );
	if ( !$this->outFile ) {
		echo "Unable to open output file\n";
		return 1;
	}

	// tempnam() creates each file under a unique name, so the two scratch
	// files can never collide with each other or with an existing file,
	// and the system temporary directory is used instead of a fixed /tmp.
	$tmpDir = sys_get_temp_dir();
	$this->articleFileName = tempnam( $tmpDir, 'importUseMod.' );
	$this->patchFileName = tempnam( $tmpDir, 'importUseMod.' );
	if ( $this->articleFileName === false || $this->patchFileName === false ) {
		echo "Unable to create temporary files\n";
		if ( $this->articleFileName !== false ) {
			unlink( $this->articleFileName );
		}
		if ( $this->patchFileName !== false ) {
			unlink( $this->patchFileName );
		}
		fclose( $this->outFile );
		return 1;
	}

	$this->writeXmlHeader();
	$this->readRclog();
	$this->writeMoveLog();
	$this->writeRevisions();
	$this->reconcileCurrentRevs();
	$this->writeXmlFooter();

	// Flush and release the dump before reporting success
	fclose( $this->outFile );
	unlink( $this->articleFileName );
	unlink( $this->patchFileName );
	return 0;
}
| 442 | + |
/**
 * Write the opening <mediawiki> tag and the <siteinfo> block to the
 * output file.
 */
function writeXmlHeader() {
	$header = <<<EOT
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
 <siteinfo>
 <sitename>Wikipedia</sitename>
 <base>http://www.wikipedia.com/</base>
 <generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator>
 <case>case-sensitive</case>
 <namespaces>
 <namespace key="0" />
 </namespaces>
 </siteinfo>

EOT;
	fwrite( $this->outFile, $header );
}
| 459 | + |
/**
 * Write the closing </mediawiki> tag to the output file.
 */
function writeXmlFooter() {
	$footer = "</mediawiki>\n";
	fwrite( $this->outFile, $footer );
}
| 463 | + |
/**
 * Load {$this->dataDir}/rclog into $this->rc, indexed by timestamp.
 *
 * Each line is split on $this->FS3 into exactly seven fields: timestamp,
 * title, summary, minor flag, host, kind, and a list of key/value pairs
 * that is itself split on $this->FS2. Several entries may share one
 * timestamp, so $this->rc[$timestamp] is a list of parameter arrays.
 *
 * Reading stops at the first line that does not have seven fields;
 * entries read before that point are kept.
 */
function readRclog() {
	$rcFile = fopen( "{$this->dataDir}/rclog", 'r' );
	if ( !$rcFile ) {
		echo "Error reading rclog\n";
		return;
	}

	// Compare against false explicitly so that a final line consisting of
	// "0" is not mistaken for end of file.
	while ( ( $line = fgets( $rcFile ) ) !== false ) {
		// Drop the line terminator so that it does not end up inside the
		// last key/value pair of the extras field.
		$line = rtrim( $line, "\n" );
		$bits = explode( $this->FS3, $line );
		if ( count( $bits ) !== 7 ) {
			echo "Error reading rclog\n";
			break;
		}
		$params = array(
			'timestamp' => $bits[0],
			'rctitle' => $bits[1],
			'summary' => $bits[2],
			'minor' => $bits[3],
			'host' => $bits[4],
			'kind' => $bits[5],
			'extra' => array()
		);

		// The extras are stored as key, value, key, value, ...; a dangling
		// key without a value (or an empty field) is ignored.
		$extraList = explode( $this->FS2, $bits[6] );
		$numExtra = count( $extraList );
		for ( $i = 0; $i + 1 < $numExtra; $i += 2 ) {
			$params['extra'][$extraList[$i]] = $extraList[$i + 1];
		}
		$this->rc[$params['timestamp']][] = $params;
	}

	fclose( $rcFile );
}
| 489 | + |
/**
 * Work out which pages were moved, pick a timestamp for each move, and
 * write one <logitem> per move to the output file.
 *
 * The diff log is scanned with moveLogCallback(), which records for each
 * move the interval in which it can have happened. A usable entry in
 * $this->deepRenames overrides the guessed time and marks the move as
 * "deep"; entries that match no detected move are reported at the end.
 */
function writeMoveLog() {
	$this->moveLog = array();
	// Working copy: entries are removed as they are matched to a move
	$pendingDeep = $this->deepRenames;
	echo "Calculating move log...\n";
	$this->processDiffFile( array( $this, 'moveLogCallback' ) );

	// We have the timestamp intervals, now make a guess at the actual timestamp
	foreach ( $this->moveLog as $newTitle => $params ) {
		$oldTitle = $params['old'];
		$hasEnd = isset( $params['endTime'] );

		// Is there a time specified?
		$drTime = false;
		if ( isset( $pendingDeep[$oldTitle] ) ) {
			$drTime = $pendingDeep[$oldTitle];
			if ( $drTime !== '?' ) {
				// The configured time must fall strictly inside the interval
				$inWindow = $drTime > $params['startTime']
					&& ( !$hasEnd || $drTime < $params['endTime'] );
				unset( $pendingDeep[$oldTitle] );
				if ( $inWindow ) {
					$this->moveLog[$newTitle]['timestamp'] = $drTime;
					$this->moveLog[$newTitle]['deep'] = true;

					echo "{$oldTitle} -> $newTitle at $drTime\n";
					continue;
				}
				echo "WARNING: deep rename time invalid: {$oldTitle}\n";
			}
		}

		// Guess that it is one second after the last edit to the page before it was moved
		$this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1;
		if ( $drTime === '?' ) {
			// Deep rename requested, but with no known time
			$this->moveLog[$newTitle]['deep'] = true;
			unset( $pendingDeep[$oldTitle] );
		}

		if ( $hasEnd ) {
			$when = "between {$params['startTime']} and {$params['endTime']}";
		} else {
			$when = "after {$params['startTime']}";
		}
		$this->printLatin1( "{$oldTitle} -> $newTitle {$when}\n" );
	}

	// Write the move log to the XML file
	$logId = 1;
	foreach ( $this->moveLog as $newTitle => $params ) {
		$item = "<logitem>\n";
		$item .= $this->element( 'id', $logId++ );
		$item .= $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );
		$item .= "<contributor>\n";
		$item .= $this->element( 'username', 'UseModWiki admin' );
		$item .= "</contributor>";
		$item .= $this->element( 'type', 'move' );
		$item .= $this->element( 'action', 'move' );
		$item .= $this->element( 'logtitle', $params['old'] );
		$item .= "<params xml:space=\"preserve\">";
		$item .= htmlspecialchars( $this->encode( "{$newTitle}\n1" ) );
		$item .= "</params>\n";
		$item .= "</logitem>\n";
		fwrite( $this->outFile, $item );
	}

	// Check for remaining deep rename entries
	if ( $pendingDeep ) {
		$leftover = implode( "\n", array_keys( $pendingDeep ) );
		echo "WARNING: the following entries in \$this->deepRenames are " .
			"invalid, since no such move exists:\n" .
			$leftover .
			"\n\n";
	}
}
| 562 | + |
/**
 * Build a single XML element followed by a newline.
 *
 * The value is passed through $this->encode() and then HTML-escaped; the
 * element name is inserted as given.
 *
 * @param $name string Element name
 * @param $value mixed Element content
 * @return string
 */
function element( $name, $value ) {
	$escaped = htmlspecialchars( $this->encode( $value ) );
	return "<{$name}>{$escaped}</{$name}>\n";
}
| 566 | + |
/**
 * processDiffFile() callback used while building the move log.
 *
 * A diff entry whose diff-log title differs from its rclog title is taken
 * as evidence of a move: the rclog title is the destination and the
 * diff-log title is the old name. The timestamp of the most recent such
 * entry becomes 'startTime'; the first later entry logged under the
 * destination title itself becomes 'endTime'.
 *
 * @param $entry array Entry parameters; 'rctitle', 'title' and 'timestamp'
 *   are read
 */
function moveLogCallback( $entry ) {
	$destTitle = $entry['rctitle'];
	$diffTitle = $entry['title'];
	$this->moveDests[$destTitle] = $diffTitle;

	if ( $destTitle !== $diffTitle ) {
		if ( !isset( $this->moveLog[$destTitle] ) ) {
			// Initialise the move log entry
			$this->moveLog[$destTitle] = array( 'old' => $diffTitle );
		}
		// Update the earliest time the page could have been moved
		$this->moveLog[$destTitle]['startTime'] = $entry['timestamp'];
		return;
	}

	$needsEndTime = isset( $this->moveLog[$destTitle] )
		&& !isset( $this->moveLog[$destTitle]['endTime'] );
	if ( $needsEndTime ) {
		// This is the latest time that the page could have been moved
		$this->moveLog[$destTitle]['endTime'] = $entry['timestamp'];
	}
}
| 590 | + |
/**
 * Replay the diff log through revisionCallback() to write all revisions,
 * then print how many of the logged revisions were imported.
 */
function writeRevisions() {
	$this->revId = 1;
	$this->numGoodRevs = 0;
	$this->processDiffFile( array( $this, 'revisionCallback' ) );
	echo "\n\nImported " . $this->numGoodRevs . " out of " . $this->numRevs . "\n";
}
| 597 | + |
/**
 * processDiffFile() callback: apply one logged diff to the cached text of
 * its page and write the result as a revision.
 *
 * If the diff does not apply, resolveFailedDiff() is asked for link
 * renames that would make the cached text match the diff's context. Those
 * renames are applied with timestamps just before this edit and the patch
 * is retried once. A page whose diff still cannot be applied is added to
 * $this->blacklist and all of its later revisions are skipped.
 *
 * @param $params array Entry parameters; 'rctitle', 'timestamp' and 'diff'
 *   are read here and the whole array is handed on to saveRevision()
 */
function revisionCallback( $params ) {
	$title = $params['rctitle'];
	$editTime = $params['timestamp'];

	if ( isset( $this->blacklist[$title] ) ) {
		return;
	}
	$this->doPendingOps( $editTime );

	$origText = $this->getText( $title );
	$text = $this->patch( $origText, $params['diff'] );
	if ( $text === false ) {
		// Titles are not plain ASCII in general, so print this message the
		// same way as every other message that includes a title.
		$this->printLatin1( "$editTime $title attempting resolution...\n" );
		$linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] );
		if ( !$linkSubstitutes ) {
			$this->printLatin1( "$editTime $title DIFF FAILED\n" );
			$this->blacklist[$title] = true;
			return;
		}
		$this->printLatin1( "$editTime $title requires substitutions:\n" );
		// Each substitution gets its own timestamp, counting backwards
		// from one second before this edit.
		$time = $editTime - 1;
		foreach ( $linkSubstitutes as $old => $new ) {
			$this->printLatin1( "SUBSTITUTE $old -> $new\n" );
			$this->renameTextLinks( $old, $new, $time-- );
		}
		// The substitutions may have rewritten this page; patch the new text
		$origText = $this->getText( $title );
		$text = $this->patch( $origText, $params['diff'] );
		if ( $text === false ) {
			$this->printLatin1( "$editTime $title STILL FAILS!\n" );
			$this->blacklist[$title] = true;
			return;
		}

		echo "\n";
	}

	$params['text'] = $text;
	$this->saveRevision( $params );
	$this->numGoodRevs++;
}
| 640 | + |
/**
 * Carry out every scheduled operation that is due at or before the given
 * edit time, removing each one from its queue once done.
 *
 * Three queues are checked in turn: detected page moves ($this->moveLog,
 * where only "deep" moves trigger a link rename), the link renames in
 * $this->renameTextLinksOps, and the line-ending fixes in
 * $this->unixLineEndingsOps.
 *
 * @param $editTime int Timestamp of the edit about to be processed
 */
function doPendingOps( $editTime ) {
	foreach ( $this->moveLog as $newTitle => $entry ) {
		if ( $entry['timestamp'] > $editTime ) {
			// Not due yet
			continue;
		}
		unset( $this->moveLog[$newTitle] );
		if ( isset( $entry['deep'] ) ) {
			$this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] );
		}
	}

	foreach ( $this->renameTextLinksOps as $renameTime => $replacements ) {
		if ( $editTime < $renameTime ) {
			continue;
		}
		foreach ( $replacements as $old => $new ) {
			$this->printLatin1( "SUBSTITUTE $old -> $new\n" );
			$this->renameTextLinks( $old, $new, $renameTime );
		}
		unset( $this->renameTextLinksOps[$renameTime] );
	}

	foreach ( $this->unixLineEndingsOps as $fixTime => $title ) {
		if ( $editTime < $fixTime ) {
			continue;
		}
		$this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" );
		// Remove every carriage return from the cached text
		$fixedText = str_replace( "\r", '', $this->getText( $title ) );
		$revision = array(
			'rctitle' => $title,
			'timestamp' => $fixTime,
			'extra' => array( 'name' => 'UseModWiki admin' ),
			'text' => $fixedText,
			'summary' => 'Fixing line endings',
		);
		$this->saveRevision( $revision );
		unset( $this->unixLineEndingsOps[$fixTime] );
	}
}
| 677 | + |
/**
 * Apply a diff to a text by running the external "patch" program on the
 * two scratch files.
 *
 * @param $source string Text to be patched
 * @param $diff string Diff to apply
 * @return string|false The patched text, or false if a scratch file could
 *   not be written, patch exited with a non-zero status, or the result
 *   could not be read back
 */
function patch( $source, $diff ) {
	// If either write fails, patch would run against stale contents left
	// over from an earlier call, so give up instead.
	if ( file_put_contents( $this->articleFileName, $source ) === false
		|| file_put_contents( $this->patchFileName, $diff ) === false )
	{
		return false;
	}

	$status = 0;
	wfShellExec(
		wfEscapeShellArg(
			'patch',
			'-n',
			'-r', '-',
			'--no-backup-if-mismatch',
			'--binary',
			$this->articleFileName,
			$this->patchFileName
		) . ' 2>&1',
		$status
	);
	if ( $status ) {
		return false;
	}

	// file_get_contents() itself returns false on a read failure
	return file_get_contents( $this->articleFileName );
}
| 700 | + |
/**
 * Try to explain a diff that does not apply by finding link renames that
 * turn the current text into the text the diff expects.
 *
 * The "<" lines that follow each change header give the expected content
 * of the source lines. Every expected line that differs from the actual
 * line at the same position is passed to resolveTextChange(); the renames
 * it finds are merged, with the first mapping for a given old link winning.
 * Lines that cannot be explained are reported and otherwise ignored.
 *
 * @param $origText string Current text of the page
 * @param $diff string Diff that failed to apply
 * @return array Map of old link => new link; empty if nothing was found
 */
function resolveFailedDiff( $origText, $diff ) {
	$context = array();
	$diffLines = explode( "\n", $diff );
	$numLines = count( $diffLines );
	for ( $i = 0; $i < $numLines; $i++ ) {
		if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLines[$i], $m ) ) {
			continue;
		}

		// Header line numbers are 1-based, $context is indexed from 0
		$sourceIndex = intval( $m[1] );
		while ( $i + 1 < $numLines && substr( $diffLines[$i + 1], 0, 1 ) === '<' ) {
			$i++;
			// Skip the "< " marker. The cast matters for a removed blank
			// line: substr() returns false rather than '' there on older
			// PHP versions, which would never strictly equal the original.
			$context[$sourceIndex - 1] = (string)substr( $diffLines[$i], 2 );
			$sourceIndex++;
		}
	}

	$changedLinks = array();
	$origLines = explode( "\n", $origText );
	foreach ( $context as $i => $contextLine ) {
		$origLine = isset( $origLines[$i] ) ? $origLines[$i] : '';
		if ( $contextLine === $origLine ) {
			continue;
		}
		$newChanges = $this->resolveTextChange( $origLine, $contextLine );
		if ( is_array( $newChanges ) ) {
			$changedLinks += $newChanges;
		} else {
			// On failure resolveTextChange() returns a diagnostic string
			echo "Resolution failure on line " . ( $i + 1 ) . "\n";
			$this->printLatin1( $newChanges );
		}
	}

	return $changedLinks;
}
| 739 | + |
/**
 * Try to explain the difference between two texts purely as link renames.
 *
 * The links found only in $dest are paired with the links found only in
 * $source by smallest edit distance. The pairing is accepted only if
 * applying it to $source with substituteTextLinks() reproduces $dest
 * exactly.
 *
 * @param $source string Text as currently held
 * @param $dest string Text it should become
 * @return array|string Map of old link => new link on success, or a
 *   multi-line diagnostic message if the texts cannot be reconciled
 */
function resolveTextChange( $source, $dest ) {
	$changedLinks = array();
	$sourceLinks = $this->getLinkList( $source );
	$destLinks = $this->getLinkList( $dest );
	$newLinks = array_diff( $destLinks, $sourceLinks );
	$removedLinks = array_diff( $sourceLinks, $destLinks );

	// Match up the removed links with the new links
	foreach ( $newLinks as $newLink ) {
		$minDistance = 100000000;
		$bestRemovedLink = false;
		foreach ( $removedLinks as $removedLink ) {
			$editDistance = levenshtein( $newLink, $removedLink );
			if ( $editDistance < 0 ) {
				// Before PHP 8, levenshtein() returns -1 when an argument is
				// longer than 255 characters. That is an error marker, not
				// a distance, and must not win as the closest match.
				continue;
			}
			if ( $editDistance < $minDistance ) {
				$minDistance = $editDistance;
				$bestRemovedLink = $removedLink;
			}
		}
		if ( $bestRemovedLink !== false ) {
			$changedLinks[$bestRemovedLink] = $newLink;
			// Both links are now accounted for
			$newLinks = array_diff( $newLinks, array( $newLink ) );
			$removedLinks = array_diff( $removedLinks, array( $bestRemovedLink ) );
		}
	}

	// Verify the pairing by replaying it on the source text
	$proposal = $source;
	foreach ( $changedLinks as $removedLink => $newLink ) {
		$proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal );
	}
	if ( $proposal !== $dest ) {
		// Resolution failed
		$msg = "Source line: $source\n" .
			"Source links: " . implode( ', ', $sourceLinks ) . "\n" .
			"Context line: $dest\n" .
			"Context links: " . implode( ', ', $destLinks ) . "\n" .
			"Proposal: $proposal\n";
		return $msg;
	}
	return $changedLinks;
}
| 780 | + |
/**
 * Read {$this->dataDir}/diff_log and call $callback once for every entry
 * that can be matched to an rclog record.
 *
 * The file must start with a "------" line, which also separates the
 * entries. Each entry consists of a "title|timestamp" header line followed
 * by the diff body. An entry is matched to $this->rc by timestamp; if
 * several rclog records share the timestamp, the one with the same title is
 * used. Entries with no rclog record, or with no unambiguous one, are
 * reported and skipped. $this->numRevs counts all complete entries read.
 *
 * @param $callback callback Invoked with the matched rclog parameters plus
 *   'diff', 'title' (title from the diff log) and 'diffStartLine'
 * @return bool False if the file cannot be opened or does not start with
 *   the delimiter, true otherwise (including when reading stops early at
 *   a malformed header)
 */
function processDiffFile( $callback ) {
	$diffFile = fopen( "{$this->dataDir}/diff_log", 'r' );
	if ( !$diffFile ) {
		echo "Unable to open diff file\n";
		return false;
	}

	$delimiter = "------\n";
	file_put_contents( $this->articleFileName, "Describe the new page here.\n" );

	$line = fgets( $diffFile );
	$lineNum = 1;
	if ( $line !== $delimiter ) {
		echo "Invalid diff file\n";
		fclose( $diffFile );
		return false;
	}
	$lastReportLine = 0;
	$this->numRevs = 0;

	while ( true ) {
		$line = fgets( $diffFile );
		$lineNum++;
		if ( $line === false ) {
			break;
		}
		// Progress indicator on stderr, roughly every 1000 lines
		if ( $lineNum > $lastReportLine + 1000 ) {
			$lastReportLine = $lineNum;
			fwrite( STDERR, "$lineNum \r" );
			fflush( STDERR );
		}
		$line = trim( $line );
		if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
			echo "Invalid header on line $lineNum\n";
			fclose( $diffFile );
			return true;
		}
		list( , $title, $editTime ) = $matches;

		// Collect the diff body up to the next delimiter
		$diff = '';
		$diffStartLine = $lineNum;
		while ( true ) {
			$line = fgets( $diffFile );
			$lineNum++;
			if ( $line === $delimiter ) {
				break;
			}
			if ( $line === false ) {
				// End of file inside an entry: the partial entry is dropped
				break 2;
			}
			$diff .= $line;
		}

		$this->numRevs++;

		if ( !isset( $this->rc[$editTime] ) ) {
			$this->printLatin1( "$editTime $title DELETED, skipping\n" );
			continue;
		}

		if ( count( $this->rc[$editTime] ) == 1 ) {
			$params = $this->rc[$editTime][0];
		} else {
			// Several rclog records share this timestamp: pick by title
			$params = false;
			$candidates = '';
			foreach ( $this->rc[$editTime] as $rc ) {
				if ( $rc['rctitle'] === $title ) {
					$params = $rc;
					break;
				}
				if ( $candidates === '' ) {
					$candidates = $rc['rctitle'];
				} else {
					$candidates .= ', ' . $rc['rctitle'];
				}
			}
			if ( !$params ) {
				$this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
				$this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" );
				continue;
			}
		}
		$params['diff'] = $diff;
		$params['title'] = $title;
		$params['diffStartLine'] = $diffStartLine;
		call_user_func( $callback, $params );
	}
	echo "\n";

	if ( !feof( $diffFile ) ) {
		echo "Stopped at line $lineNum\n";
	}
	// This method runs more than once per import; do not leak the handle
	fclose( $diffFile );
	return true;
}
| 869 | + |
/**
 * Compare the final reconstructed text of every page with the text stored
 * in the page database under {$this->dataDir}/page, and report any
 * differences.
 *
 * A difference that resolveTextChange() can explain as link renames is
 * reported as a missing deep rename or substitution. Any other difference
 * is printed as a diff, provided the xdiff extension is loaded.
 */
function reconcileCurrentRevs() {
	foreach ( $this->textCache as $title => $text ) {
		// Page files are grouped by initial capital letter, with
		// everything else under "other"
		$fileName = "{$this->dataDir}/page/";
		if ( preg_match( '/^[A-Z]/', $title, $m ) ) {
			$fileName .= $m[0];
		} else {
			$fileName .= 'other';
		}
		$fileName .= "/$title.db";

		if ( !file_exists( $fileName ) ) {
			$this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
			continue;
		}

		$fileContents = file_get_contents( $fileName );
		if ( $fileContents === false ) {
			$this->printLatin1( "ERROR: Cannot read page file for {$title}\n" );
			continue;
		}

		// The stored text is nested three levels deep, each level using
		// its own field separator
		$page = $this->unserializeUseMod( $fileContents, $this->FS1 );
		$section = $this->unserializeUseMod( $page['text_default'], $this->FS2 );
		$data = $this->unserializeUseMod( $section['data'], $this->FS3 );
		$pageText = $data['text'];
		if ( $text === $pageText ) {
			continue;
		}

		$substs = $this->resolveTextChange( $text, $pageText );
		if ( is_array( $substs ) ) {
			foreach ( $substs as $source => $dest ) {
				if ( isset( $this->moveLog[$dest] ) ) {
					$this->printLatin1( "ERROR: need deep rename: $source\n" );
				} else {
					$this->printLatin1( "ERROR: need substitute: $source -> $dest\n" );
				}
			}
		} else {
			$this->printLatin1( "ERROR: unresolved diff in $title:\n" );
			if ( function_exists( 'xdiff_string_diff' ) ) {
				wfSuppressWarnings();
				$diff = xdiff_string_diff( $text, $pageText ) . '';
				wfRestoreWarnings();
			} else {
				// xdiff is an optional extension; calling it when absent
				// would abort the whole run with a fatal error
				$diff = '(xdiff extension not available, cannot show diff)';
			}
			$this->printLatin1( "$diff\n" );
		}
	}
}
| 910 | + |
/**
 * Create a Title object from a title string, passing the string through
 * $this->encode() first.
 *
 * @param $titleText string
 * @return mixed Whatever Title::newFromText() returns for the encoded text
 */
function makeTitle( $titleText ) {
	$encoded = $this->encode( $titleText );
	return Title::newFromText( $encoded );
}
| 914 | + |
/**
 * Get the current cached text of a page.
 *
 * @param $titleText string Page title, as used for keys of $this->textCache
 * @return string The cached text, or the placeholder
 *   "Describe the new page here.\n" if the page has no cached text yet
 */
function getText( $titleText ) {
	if ( isset( $this->textCache[$titleText] ) ) {
		return $this->textCache[$titleText];
	}
	return "Describe the new page here.\n";
}
| 922 | + |
/**
 * Record a revision: remember its text as the current text of the page and
 * append a <page> element holding that single <revision> to the output file.
 *
 * Revision IDs are taken from $this->revId, which is incremented on every
 * call.
 *
 * NOTE(review): callers supply a 'minor' key, but nothing here reads it,
 * so no minor-edit marker is written.
 *
 * @param $params array Revision data. 'rctitle', 'text', 'timestamp' and
 *   'summary' are read unconditionally; 'extra'['name'], 'extra'['id'] and
 *   'host' are written into <contributor> when set
 */
function saveRevision( $params ) {
	$pageTitle = $params['rctitle'];
	$this->textCache[$pageTitle] = $params['text'];

	$pieces = array();
	$pieces[] = "<page>\n";
	$pieces[] = $this->element( 'title', $pageTitle );
	$pieces[] = "<revision>\n";
	$pieces[] = $this->element( 'id', $this->revId++ );
	$pieces[] = $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );

	$pieces[] = "<contributor>\n";
	if ( isset( $params['extra']['name'] ) ) {
		$pieces[] = $this->element( 'username', $params['extra']['name'] );
	}
	if ( isset( $params['extra']['id'] ) ) {
		$pieces[] = $this->element( 'id', $params['extra']['id'] );
	}
	if ( isset( $params['host'] ) ) {
		$pieces[] = $this->element( 'ip', $params['host'] );
	}
	$pieces[] = "</contributor>\n";

	$pieces[] = $this->element( 'comment', $params['summary'] );
	$pieces[] = "<text xml:space=\"preserve\">";
	$pieces[] = htmlspecialchars( $this->encode( $params['text'] ) );
	$pieces[] = "</text>\n";
	$pieces[] = "</revision>\n";
	$pieces[] = "</page>\n";

	fwrite( $this->outFile, implode( '', $pieces ) );
}
| 951 | + |
/**
 * Rewrite links to a renamed page in every cached page text, saving a new
 * revision for each page whose text actually changes.
 *
 * @param $old String: previous page name (underscores are treated as spaces)
 * @param $new String: new page name (underscores are treated as spaces)
 * @param $timestamp Mixed: timestamp given to the generated fixup revisions
 */
function renameTextLinks( $old, $new, $timestamp ) {
	// The cache is keyed by the name as given, so keep that form for the self check
	$newAsGiven = $new;
	$oldSpaced = str_replace( '_', ' ', $old );
	$newSpaced = str_replace( '_', ' ', $new );

	foreach ( $this->textCache as $title => $currentText ) {
		// Hack to make Pythagorean_Theorem etc. work: selected pages are
		// not rewritten when they are themselves the rename target
		$isRenameTarget = ( $newAsGiven === $title );
		if ( $isRenameTarget && in_array( $title, $this->skipSelfSubstitution ) ) {
			continue;
		}

		$updatedText = $this->substituteTextLinks( $oldSpaced, $newSpaced, $currentText );
		if ( $currentText === $updatedText ) {
			// Nothing on this page linked to the old name
			continue;
		}

		$fixup = array(
			'rctitle' => $title,
			'timestamp' => $timestamp,
			'text' => $updatedText,
			'extra' => array( 'name' => 'Page move link fixup script' ),
			'summary' => '',
			'minor' => true
		);
		$this->saveRevision( $fixup );
	}
}
| 978 | + |
/**
 * Replace links to $old with links to $new inside a page text.
 *
 * The text is processed in a fixed sequence of regex passes. Each pass
 * either stashes the match verbatim (storeRaw) or rewrites a link and
 * stashes the result (subFreeLink / subWikiLink); stashed fragments are
 * replaced by numbered placeholders delimited by $this->FS so that later
 * passes cannot touch them. A final pass puts the stashed fragments back.
 *
 * Side effects: resets $this->saveUrl and sets $this->old / $this->new.
 *
 * @param $old String: link target to look for
 * @param $new String: link target to substitute
 * @param $text String: page text
 * @return String: page text with matching links rewritten
 */
function substituteTextLinks( $old, $new, $text ) {
	$this->saveUrl = array();
	$this->old = $old;
	$this->new = $new;

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Ordered list of ( pattern, callback method ); the order is significant
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'subFreeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'subFreeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'subWikiLink' ),
	);
	foreach ( $passes as $pass ) {
		$text = preg_replace_callback( $pass[0], array( $this, $pass[1] ), $text );
	}

	# Restore saved text
	$text = preg_replace_callback( "/{$this->FS}(\d+){$this->FS}/",
		array( $this, 'restoreRaw' ), $text );
	return $text;
}
| 1011 | + |
/**
 * Collect the targets of the links found in a page text.
 *
 * Runs the same sequence of regex passes as substituteTextLinks(), but
 * link matches go to storeLink(), which appends the first captured group
 * to $this->linkList. The transformed text itself is discarded.
 *
 * Side effects: resets $this->saveUrl and $this->linkList.
 *
 * @param $text String: page text
 * @return Array: link targets in the order the passes encountered them
 */
function getLinkList( $text ) {
	$this->saveUrl = array();
	$this->linkList = array();

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Ordered list of ( pattern, callback method ); the order is significant
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'storeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'storeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'storeLink' ),
	);
	foreach ( $passes as $pass ) {
		$text = preg_replace_callback( $pass[0], array( $this, $pass[1] ), $text );
	}

	return $this->linkList;
}
| 1041 | + |
/**
 * Stash a text fragment and return a placeholder for it.
 *
 * @param $m Array: regex match (or look-alike); element 1 is the fragment
 * @return String: the fragment's index in $this->saveUrl, wrapped in
 *   $this->FS on both sides
 */
function storeRaw( $m ) {
	// saveUrl is only ever appended to, so the next index equals its current size
	$slot = count( $this->saveUrl );
	$this->saveUrl[] = $m[1];
	return $this->FS . $slot . $this->FS;
}
| 1046 | + |
/**
 * preg_replace_callback() handler for [[free links]], with or without a
 * "|label" part.
 *
 * If the link target, ignoring leading and trailing whitespace, is exactly
 * $this->old, it is replaced by $this->new; otherwise the target is kept
 * as written, whitespace included. The rebuilt link is stashed through
 * storeRaw() so that later passes leave it alone.
 *
 * The comparison is a strict string comparison. A loose == would compare
 * numeric-looking titles numerically ("007" == "7", "1e3" == "1000") and
 * rewrite links that merely look similar to the old title.
 *
 * @param $m Array: regex match; $m[1] is the link target, $m[2] the
 *   optional label
 * @return String: placeholder returned by storeRaw()
 */
function subFreeLink( $m ) {
	$originalTarget = $m[1];
	$name = isset( $m[2] ) ? $m[2] : '';

	$trimmedTarget = preg_replace( '/^\s+/', '', $originalTarget );
	$trimmedTarget = preg_replace( '/\s+$/', '', $trimmedTarget );

	if ( $trimmedTarget === (string)$this->old ) {
		$link = $this->new;
	} else {
		$link = $originalTarget; # Preserve spaces if no match
	}

	$link = "[[$link";
	if ( $name !== "" ) {
		$link .= "|$name";
	}
	$link .= "]]";
	return $this->storeRaw( array( 1 => $link ) );
}
| 1069 | + |
/**
 * preg_replace_callback() handler for bare links matched by
 * $this->LinkPattern.
 *
 * If the matched text is exactly $this->old, it is replaced by $this->new.
 * When the new name would not itself match $this->LinkPattern, it is
 * wrapped in [[...]] so that it still forms a link. The result is stashed
 * through storeRaw().
 *
 * The comparison is a strict string comparison. A loose == would compare
 * numeric-looking titles numerically ("007" == "7") and rewrite links that
 * are not actually the old title.
 *
 * @param $m Array: regex match; $m[1] is the link text
 * @return String: placeholder returned by storeRaw()
 */
function subWikiLink( $m ) {
	$link = $m[1];
	if ( $link === (string)$this->old ) {
		$link = $this->new;
		if ( !preg_match( "/^{$this->LinkPattern}$/", $this->new ) ) {
			// The new name is not a valid bare link; use the bracketed form
			$link = "[[$link]]";
		}
	}
	return $this->storeRaw( array( 1 => $link ) );
}
| 1080 | + |
/**
 * preg_replace_callback() handler that swaps a placeholder back for the
 * fragment stashed by storeRaw().
 *
 * @param $m Array: regex match; $m[1] is the index into $this->saveUrl
 * @return String: the stashed fragment
 */
function restoreRaw( $m ) {
	$slot = $m[1];
	return $this->saveUrl[$slot];
}
| 1084 | + |
/**
 * preg_replace_callback() handler that records a link target in
 * $this->linkList and then stashes it like storeRaw().
 *
 * @param $m Array: regex match; $m[1] is the link target
 * @return String: placeholder returned by storeRaw()
 */
function storeLink( $m ) {
	$target = $m[1];
	$this->linkList[] = $target;

	$placeholder = $this->storeRaw( $m );
	return $placeholder;
}
| 1089 | + |
/**
 * Translate a string using the $this->encodeMap substitution table.
 *
 * @param $s String: input
 * @return String: input with every key of encodeMap replaced by its value
 */
function encode( $s ) {
	$map = $this->encodeMap;
	return strtr( $s, $map );
}
| 1093 | + |
/**
 * Translate a string using the $this->decodeMap substitution table.
 *
 * @param $s String: input
 * @return String: input with every key of decodeMap replaced by its value
 */
function decode( $s ) {
	$map = $this->decodeMap;
	return strtr( $s, $map );
}
| 1097 | + |
/**
 * Write a string to standard output after passing it through encode().
 *
 * @param $s String: text to print
 */
function printLatin1( $s ) {
	$encoded = $this->encode( $s );
	echo $encoded;
}
| 1101 | + |
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Fields are taken pairwise: even-numbered fields are keys, the field after
 * each is its value. If the number of fields is odd (which includes an
 * empty input string), the last key has no value and is mapped to null
 * rather than reading past the end of the field list.
 *
 * @param $s String: serialized data
 * @param $sep String: field separator; must not be empty
 * @return Array: map of key => value
 */
function unserializeUseMod( $s, $sep ) {
	$parts = explode( $sep, $s );
	$numParts = count( $parts );
	$result = array();
	for ( $i = 0; $i < $numParts; $i += 2 ) {
		// A trailing key without a value gets null
		$result[$parts[$i]] = isset( $parts[$i + 1] ) ? $parts[$i + 1] : null;
	}
	return $result;
}
| 1110 | +} |
| 1111 | + |
// Name the class to run, then include the file named by the DO_MAINTENANCE
// constant (presumably defined by Maintenance.php, required at the top of
// this file — confirm). $maintClass must be assigned before the include.
$maintClass = 'ImportUseModWikipedia';
require_once DO_MAINTENANCE;
Property changes on: trunk/phase3/maintenance/importUseModWikipedia.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 1114 | + native |