Index: trunk/phase3/maintenance/importUseModWiki.php |
— | — | @@ -1,375 +0,0 @@ |
2 | | -<?php |
3 | | -/** |
4 | | - * Import data from a UseModWiki into a MediaWiki wiki |
5 | | - * 2003-02-09 Brion VIBBER <brion@pobox.com> |
6 | | - * Based loosely on Magnus's code from 2001-2002 |
7 | | - * |
8 | | - * Updated limited version to get something working temporarily |
9 | | - * 2003-10-09 |
10 | | - * Be sure to run the link & index rebuilding scripts! |
11 | | - * |
12 | | - * Some more munging for charsets etc |
13 | | - * 2003-11-28 |
14 | | - * |
15 | | - * Partial fix for pages starting with lowercase letters (??) |
16 | | - * and CamelCase and /Subpage link conversion |
17 | | - * 2004-11-17 |
18 | | - * |
19 | | - * Rewrite output to create Special:Export format for import |
20 | | - * instead of raw SQL. Should be 'future-proof' against future |
21 | | - * schema changes. |
22 | | - * 2005-03-14 |
23 | | - * |
24 | | - * This program is free software; you can redistribute it and/or modify |
25 | | - * it under the terms of the GNU General Public License as published by |
26 | | - * the Free Software Foundation; either version 2 of the License, or |
27 | | - * (at your option) any later version. |
28 | | - * |
29 | | - * This program is distributed in the hope that it will be useful, |
30 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
31 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
32 | | - * GNU General Public License for more details. |
33 | | - * |
34 | | - * You should have received a copy of the GNU General Public License along |
35 | | - * with this program; if not, write to the Free Software Foundation, Inc., |
36 | | - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
37 | | - * http://www.gnu.org/copyleft/gpl.html |
38 | | - * |
39 | | - * @todo document |
40 | | - * @file |
41 | | - * @ingroup Maintenance |
42 | | - */ |
43 | | - |
44 | | -require_once( "Maintenance.php" ); |
45 | | - |
/**
 * Maintenance script that reads the page database of a UseModWiki
 * installation and prints it to standard output as a MediaWiki
 * Special:Export style XML document.
 */
class ImportUseModWiki extends Maintenance {

	/**
	 * Source encoding of the imported text, as passed to iconv()
	 * @var String
	 */
	private $encoding = 'CP1252';

	/**
	 * Directory given as the 'path' argument; page/ and keep/ are read below it
	 * @var String
	 */
	private $rootDirectory = '';

	/**
	 * Field separators
	 * @var String
	 */
	private $FS1 = '', $FS2 = '', $FS3 = '';

	/**
	 * User name => user id map consulted by checkUserCache().
	 * Nothing in this class fills it, so every lookup currently yields id 0.
	 * @var Array
	 */
	private $usercache = array();

	/**
	 * Fragments cut out of the text by mungeFormat(), in order of appearance
	 * @var Array
	 */
	private $nowiki = array();

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Import pages from UseMod wikis";
		$this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
		/**
		 * If UseModWiki's New File System is used:
		 * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS
		 * Use "\xb3"; for the Old File System
		 * Changed with UTF-8 UseModWiki
		 * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
		 * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
		 * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
		 */
		$this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
		$this->addArg( 'path', 'Path to your UseMod wiki' );
	}

	/**
	 * Entry point: print the XML header, walk page/A .. page/Z and
	 * page/other below the root directory, then print the XML footer.
	 */
	public function execute() {
		$this->rootDirectory = $this->getArg();
		$this->encoding = $this->getOption( 'encoding', 'CP1252' );
		$sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
		$this->FS1 = "{$sep}1";
		$this->FS2 = "{$sep}2";
		$this->FS3 = "{$sep}3";

		echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
 http://www.mediawiki.org/xml/export-0.1.xsd"
 version="0.1"
 xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
		$letters = array(
			'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
			'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
			'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
		foreach ( $letters as $letter ) {
			$dir = "{$this->rootDirectory}/page/$letter";
			if ( is_dir( $dir ) ) {
				$this->importPageDirectory( $dir );
			}
		}
		echo <<<XML
</mediawiki>

XML;
	}

	/**
	 * Print the XML for every *.db file in a directory, recursing into
	 * sub-directories.
	 *
	 * @param String $dir Directory to scan
	 * @param String $prefix Title prefix prepended to the names found in $dir
	 */
	private function importPageDirectory( $dir, $prefix = "" ) {
		echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n";
		$mydir = opendir( $dir );
		if ( $mydir === false ) {
			echo "<!-- Unable to open the directory. Skipping. -->\n";
			return;
		}
		// Compare against false explicitly: an entry literally named "0"
		// is falsy and would otherwise end the scan early.
		while ( ( $entry = readdir( $mydir ) ) !== false ) {
			$m = array();
			if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
				echo $this->importPage( $prefix . $m[1] );
			} elseif ( is_dir( "$dir/$entry" ) ) {
				if ( $entry != '.' && $entry != '..' ) {
					$this->importPageDirectory( "$dir/$entry", "$entry/" );
				}
			} else {
				echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
			}
		}
		closedir( $mydir );
	}

	/**
	 * Map a page title to its path relative to the page/ or keep/
	 * directory, without file extension.
	 *
	 * @param String $title
	 * @return String "<UPPERCASE FIRST LETTER>/<title>" or "other/<title>"
	 */
	private function useModFilename( $title ) {
		$c = substr( $title, 0, 1 );
		if ( preg_match( '/[A-Z]/i', $c ) ) {
			return strtoupper( $c ) . "/$title";
		}
		return "other/$title";
	}

	/**
	 * Read a value from a hash produced by splitHash().
	 *
	 * @param Array $hash
	 * @param String $key
	 * @return String The stored value, or '' when the key is absent
	 */
	private function field( $hash, $key ) {
		return isset( $hash[$key] ) ? $hash[$key] : '';
	}

	/**
	 * Build a revision object from a decoded section and its decoded data.
	 *
	 * @param Array $section Hash holding 'ts', 'username' and 'host'
	 * @param Array $text Hash holding 'text', 'summary' and 'minor'
	 * @return Object with text, summary, minor, ts, username and host
	 */
	private function makeRevision( $section, $text ) {
		return $this->array2object( array(
			"text" => $this->field( $text, "text" ),
			"summary" => $this->field( $text, "summary" ),
			"minor" => $this->field( $text, "minor" ),
			"ts" => $this->field( $section, "ts" ),
			"username" => $this->field( $section, "username" ),
			"host" => $this->field( $section, "host" ),
		) );
	}

	/**
	 * Load the current revision of a page from its .db file.
	 * Terminates the script when the file does not exist.
	 *
	 * @param String $title
	 * @return Object Revision object, see makeRevision()
	 */
	private function fetchPage( $title ) {
		$fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
		if ( !file_exists( $fname ) ) {
			echo "Couldn't open file '$fname' for page '$title'.\n";
			die( -1 );
		}

		// Three nested hashes: page -> default text section -> revision data
		$page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
		$section = $this->splitHash( $this->FS2, $this->field( $page, "text_default" ) );
		$text = $this->splitHash( $this->FS3, $this->field( $section, "data" ) );

		return $this->makeRevision( $section, $text );
	}

	/**
	 * Load the old revisions of a page from its .kp file, if there is one.
	 *
	 * @param String $title
	 * @return Array List of revision objects; empty when no .kp file exists
	 */
	private function fetchKeptPages( $title ) {
		$fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
		if ( !file_exists( $fname ) ) {
			return array();
		}

		$keptlist = explode( $this->FS1, file_get_contents( $fname ) );
		array_shift( $keptlist ); # Drop the junk at beginning of file

		$revisions = array();
		foreach ( $keptlist as $rev ) {
			$section = $this->splitHash( $this->FS2, $rev );
			$text = $this->splitHash( $this->FS3, $this->field( $section, "data" ) );
			$revision = $this->makeRevision( $section, $text );
			// Only keep entries with text, a minor flag and a positive timestamp
			if ( $revision->text && $revision->minor !== '' && (float)$revision->ts > 0 ) {
				$revisions[] = $revision;
			} else {
				echo "<!-- skipped a bad old revision -->\n";
			}
		}
		return $revisions;
	}

	/**
	 * Decode a "key SEP value SEP key SEP value ..." string.
	 * A trailing key without a value is dropped.
	 *
	 * @param String $sep Separator
	 * @param String $str Encoded hash
	 * @return Array key => value
	 */
	private function splitHash( $sep, $str ) {
		$temp = explode( $sep, $str );
		$ret = array();
		$count = count( $temp );
		for ( $i = 0; $i + 1 < $count; $i += 2 ) {
			$ret[$temp[$i]] = $temp[$i + 1];
		}
		return $ret;
	}

	/**
	 * Work out the contributor of a revision.
	 *
	 * @param String $name User name; may be empty for anonymous edits
	 * @param String $host Host recorded for the edit, used when $name is empty
	 * @return Array ( user id, display name )
	 */
	private function checkUserCache( $name, $host ) {
		if ( $name ) {
			// The cache is keyed by name, so test the key rather than the values;
			# 0 if we haven't imported user accounts
			$userid = isset( $this->usercache[$name] ) ? $this->usercache[$name] : 0;
			$username = str_replace( '_', ' ', $name );
		} else {
			$userid = 0;
			$username = $host;
		}
		return array( $userid, $username );
	}

	/**
	 * Build the <page> element for one title, including its history.
	 *
	 * @param String $title Title as found on disk (still in source encoding)
	 * @return String|null XML fragment
	 */
	private function importPage( $title ) {
		echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n";
		$page = $this->fetchPage( $title );

		$newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );

		$munged = $this->mungeFormat( $page->text );
		if ( $munged != $page->text ) {
			/**
			 * Save a *new* revision with the conversion, and put the
			 * previous last version into the history.
			 */
			$next = $this->array2object( array(
				'text' => $munged,
				'minor' => 1,
				'username' => 'Conversion script',
				'host' => '127.0.0.1',
				'ts' => time(),
				'summary' => 'link fix',
			) );
			$revisions = array( $page, $next );
		} else {
			/**
			 * Current revision:
			 */
			$revisions = array( $page );
		}
		$xml = <<<XML
 <page>
 <title>$newtitle</title>

XML;

		# History
		$revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );
		if ( count( $revisions ) == 0 ) {
			return null; // Was "$sql", which does not appear to be defined.
		}

		foreach ( $revisions as $rev ) {
			$text = $this->xmlsafe( $this->recodeText( $rev->text ) );
			$minor = ( $rev->minor ? '<minor/>' : '' );
			list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
			$username = $this->xmlsafe( $this->recodeText( $username ) );
			$timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
			$comment = $this->xmlsafe( $this->recodeText( $rev->summary ) );

			$xml .= <<<XML
 <revision>
 <timestamp>$timestamp</timestamp>
 <contributor><username>$username</username></contributor>
 $minor
 <comment>$comment</comment>
 <text>$text</text>
 </revision>

XML;
		}
		$xml .= "</page>\n\n";
		return $xml;
	}

	/**
	 * Normalise line endings, convert from the source encoding to UTF-8
	 * and expand numeric character references.
	 *
	 * @param String $string
	 * @return String
	 */
	private function recodeText( $string ) {
		# For currently latin-1 wikis
		$string = str_replace( "\r\n", "\n", $string );
		$converted = @iconv( $this->encoding, "UTF-8", $string );
		if ( $converted !== false ) {
			$string = $converted;
		}
		// When iconv() fails the unconverted text is kept instead of being
		// replaced by an empty string; xmlsafe() cleans up invalid sequences.
		return $this->mungeToUtf8( $string ); # Any old &#1234; stuff
	}

	/**
	 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
	 * references with the UTF-8 sequence of the code point.
	 *
	 * @param String $string
	 * @return String
	 */
	private function mungeToUtf8( $string ) {
		$string = preg_replace_callback( '/&#([0-9]+);/',
			array( $this, 'decimalEntityCallback' ), $string );
		$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
			array( $this, 'hexEntityCallback' ), $string );
		# Should also do named entities here
		return $string;
	}

	/**
	 * preg_replace_callback() handler for decimal character references.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function decimalEntityCallback( $matches ) {
		// Explicit base 10 so that leading zeros are not taken as octal
		$codepoint = intval( $matches[1], 10 );
		if ( $codepoint > 0x10FFFF ) {
			return $matches[0]; // Not a Unicode code point, leave untouched
		}
		return wfUtf8Sequence( $codepoint );
	}

	/**
	 * preg_replace_callback() handler for hexadecimal character references.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function hexEntityCallback( $matches ) {
		$codepoint = hexdec( $matches[1] );
		if ( $codepoint > 0x10FFFF ) {
			return $matches[0]; // Not a Unicode code point, leave untouched
		}
		return wfUtf8Sequence( (int)$codepoint );
	}

	/**
	 * Format a Unix timestamp like 2003-08-05T18:30:02Z.
	 *
	 * @param Integer|String $ts Unix timestamp
	 * @return String
	 */
	private function timestamp2ISO8601( $ts ) {
		return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
	}

	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 * @param String $string Text to clean up
	 * @return String
	 */
	private function xmlsafe( $string ) {
		$string = UtfNormal::cleanUp( $string );
		$string = htmlspecialchars( $string );
		return $string;
	}

	/**
	 * Like xmlsafe(), for text placed inside an XML comment: the text is
	 * recoded first and "--" is rewritten so it cannot end the comment.
	 *
	 * @param String $text
	 * @return String
	 */
	private function xmlCommentSafe( $text ) {
		return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) );
	}

	/**
	 * Copy the entries of an array onto a fresh object as properties.
	 *
	 * @param Array $arr
	 * @return Object
	 */
	private function array2object( $arr ) {
		$o = new stdClass;
		foreach ( $arr as $x => $y ) {
			$o->$x = $y;
		}
		return $o;
	}

	/**
	 * Make CamelCase and /Talk links work
	 *
	 * <nowiki> sections, URLs and existing [[links]] are swapped for a
	 * placeholder first so that the link regex does not touch them, and
	 * are put back in their original order afterwards.
	 *
	 * @param String $text
	 * @return String
	 */
	private function mungeFormat( $text ) {
		$this->nowiki = array();
		$staged = preg_replace_callback(
			'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
			array( $this, 'nowikiPlaceholder' ), $text );

		# This is probably not 100% correct, I'm just
		# glancing at the UseModWiki code.
		$upper = "[A-Z]";
		$lower = "[a-z_0-9]";
		$any = "[A-Za-z_0-9]";
		$camel = "(?:$upper+$lower+$upper+$any*)";
		$subpage = "(?:\\/$any+)";
		$substart = "(?:\\/$upper$any*)";

		$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
			'[[$1]]', $staged );

		// The replacement is a callback, so preg_replace_callback() is required
		$final = preg_replace_callback( '/' . preg_quote( $this->placeholder(), '/' ) . '/s',
			array( $this, 'nowikiShift' ), $munged );
		return $final;
	}

	/**
	 * Marker text standing in for a fragment removed by mungeFormat().
	 * The string is single-quoted, so it is the literal characters
	 * backslash-x-f-f and not the byte 0xFF.
	 *
	 * @param mixed $x Unused
	 * @return String
	 */
	private function placeholder( $x = null ) {
		return '\xffplaceholder\xff';
	}

	/**
	 * preg_replace_callback() handler: remember the matched fragment and
	 * hand back the placeholder.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function nowikiPlaceholder( $matches ) {
		$this->nowiki[] = $matches[1];
		return $this->placeholder();
	}

	/**
	 * preg_replace_callback() handler: hand back the oldest remembered fragment.
	 *
	 * @return String|null
	 */
	public function nowikiShift() {
		return array_shift( $this->nowiki );
	}
}
351 | | - |
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param Integer $codepoint
 * @return String The one to four byte UTF-8 sequence, or the numeric
 *  character reference "&#N;" when the value lies above U+10FFFF
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) .
			chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	// The last code point is U+10FFFF, so the four-byte form covers
	// everything below 0x110000.
	if ( $codepoint < 0x110000 ) {
		return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 ) .
			chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 ) .
			chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	# Not a valid code point; hand it back as a character reference
	return "&#$codepoint;";
}
374 | | - |
// Name of the Maintenance subclass declared in this file.
$maintClass = 'ImportUseModWiki';
// RUN_MAINTENANCE_IF_MAIN is not defined in this file — presumably it comes
// from Maintenance.php and names the script that runs $maintClass (confirm).
require_once( RUN_MAINTENANCE_IF_MAIN );
Index: trunk/phase3/maintenance/importUseModWikipedia.php |
— | — | @@ -1,892 +0,0 @@ |
2 | | -<?php |
3 | | - |
4 | | -/** |
5 | | - * A script to read a dump of the English Wikipedia from the UseModWiki period, and to |
6 | | - * generate an XML dump in MediaWiki format. |
7 | | - * |
8 | | - * Some relevant code was ported from UseModWiki 0.92. |
9 | | - * |
10 | | - */ |
11 | | - |
12 | | -require_once( dirname( __FILE__ ) . '/Maintenance.php' ); |
13 | | -require_once( dirname( __FILE__ ) .'/../includes/normal/UtfNormalUtil.php' ); |
14 | | - |
15 | | - |
16 | | -class ImportUseModWikipedia extends Maintenance { |
17 | | - var $encodeMap, $decodeMap; |
18 | | - |
19 | | - var $deepRenames = array( |
20 | | - 'JimboWales' => 983862286, |
21 | | - 'TexaS' => 983918410, |
22 | | - 'HistoryOfUnitedStatesTalk' => 984795423, |
23 | | - 'MetallicA' => 985128533, |
24 | | - 'PythagoreanTheorem' => 985225545, |
25 | | - 'TheCanonofScripture' => 985368223, |
26 | | - 'TaoTehChing' => 985368222, |
27 | | - //'TheMostRemarkableFormulaInTheWorld' => 985368221, |
28 | | - 'TheRecorder' => 985368220, |
29 | | - 'GladstoneOregon' => 985368219, |
30 | | - 'PacificBeach' => '?', |
31 | | - 'AaRiver' => '?', |
32 | | - ); |
33 | | - |
34 | | - var $replacements = array(); |
35 | | - |
36 | | - var $renameTextLinksOps = array( |
37 | | - 983846265 => array( |
38 | | - 'TestIgnore' => 'IgnoreTest', |
39 | | - ), |
40 | | - 983848080 => array( |
41 | | - 'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works' |
42 | | - ), |
43 | | - 983856376 => array( |
44 | | - 'WikiPedia' => 'Wikipedia', |
45 | | - ), |
46 | | - 983896152 => array( |
47 | | - 'John_F_Kennedy' => 'John_F._Kennedy', |
48 | | - ), |
49 | | - 983905871 => array( |
50 | | - 'LarrySanger' => 'Larry_Sanger' |
51 | | - ), |
52 | | - 984697068 => array( |
53 | | - 'UnitedStates' => 'United States', |
54 | | - ), |
55 | | - 984792748 => array( |
56 | | - 'LibertarianisM' => 'Libertarianism' |
57 | | - ), |
58 | | - 985327832 => array( |
59 | | - 'AnarchisM' => 'Anarchism', |
60 | | - ), |
61 | | - 985290063 => array( |
62 | | - 'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion' |
63 | | - ), |
64 | | - 985290091 => array( |
65 | | - 'BritishEmpire' => 'British Empire' |
66 | | - ), |
67 | | - /* |
68 | | - 985468958 => array( |
69 | | - 'ScienceFiction' => 'Science fiction', |
70 | | - ),*/ |
71 | | - ); |
72 | | - |
73 | | - /** |
74 | | - * Hack for observed substitution issues |
75 | | - */ |
76 | | - var $skipSelfSubstitution = array( |
77 | | - 'Pythagorean_Theorem', |
78 | | - 'The_Most_Remarkable_Formula_In_The_World', |
79 | | - 'Wine', |
80 | | - ); |
81 | | - |
82 | | - var $unixLineEndingsOps = array( |
83 | | - 987743732 => 'Wikipedia_FAQ' |
84 | | - ); |
85 | | - |
86 | | - var $replacementsDone = array(); |
87 | | - |
88 | | - var $moveLog = array(); |
89 | | - var $moveDests = array(); |
90 | | - var $revId; |
91 | | - |
92 | | - var $rc = array(); |
93 | | - var $textCache = array(); |
94 | | - var $blacklist = array(); |
95 | | - |
96 | | - var $FS, $FS1, $FS2, $FS3; |
97 | | - var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern; |
98 | | - |
99 | | - var $cp1252Table = array( |
100 | | -0x80 => 0x20ac, |
101 | | -0x81 => 0x0081, |
102 | | -0x82 => 0x201a, |
103 | | -0x83 => 0x0192, |
104 | | -0x84 => 0x201e, |
105 | | -0x85 => 0x2026, |
106 | | -0x86 => 0x2020, |
107 | | -0x87 => 0x2021, |
108 | | -0x88 => 0x02c6, |
109 | | -0x89 => 0x2030, |
110 | | -0x8a => 0x0160, |
111 | | -0x8b => 0x2039, |
112 | | -0x8c => 0x0152, |
113 | | -0x8d => 0x008d, |
114 | | -0x8e => 0x017d, |
115 | | -0x8f => 0x008f, |
116 | | -0x90 => 0x0090, |
117 | | -0x91 => 0x2018, |
118 | | -0x92 => 0x2019, |
119 | | -0x93 => 0x201c, |
120 | | -0x94 => 0x201d, |
121 | | -0x95 => 0x2022, |
122 | | -0x96 => 0x2013, |
123 | | -0x97 => 0x2014, |
124 | | -0x98 => 0x02dc, |
125 | | -0x99 => 0x2122, |
126 | | -0x9a => 0x0161, |
127 | | -0x9b => 0x203a, |
128 | | -0x9c => 0x0153, |
129 | | -0x9d => 0x009d, |
130 | | -0x9e => 0x017e, |
131 | | -0x9f => 0x0178); |
132 | | - |
133 | | - public function __construct() { |
134 | | - parent::__construct(); |
135 | | - $this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true ); |
136 | | - $this->addOption( 'outfile', 'the name of the output XML file', true, true ); |
137 | | - $this->initLinkPatterns(); |
138 | | - |
139 | | - $this->encodeMap = $this->decodeMap = array(); |
140 | | - |
141 | | - for ($source = 0; $source <= 0xff; $source++) { |
142 | | - if ( isset( $this->cp1252Table[$source] ) ) { |
143 | | - $dest = $this->cp1252Table[$source]; |
144 | | - } else { |
145 | | - $dest = $source; |
146 | | - } |
147 | | - $sourceChar = chr( $source ); |
148 | | - $destChar = codepointToUtf8( $dest ); |
149 | | - $this->encodeMap[$sourceChar] = $destChar; |
150 | | - $this->decodeMap[$destChar] = $sourceChar; |
151 | | - } |
152 | | - } |
153 | | - |
154 | | - function initLinkPatterns() { |
155 | | - # Field separators are used in the URL-style patterns below. |
156 | | - $this->FS = "\xb3"; # The FS character is a superscript "3" |
157 | | - $this->FS1 = $this->FS . "1"; # The FS values are used to separate fields |
158 | | - $this->FS2 = $this->FS . "2"; # in stored hashtables and other data structures. |
159 | | - $this->FS3 = $this->FS . "3"; # The FS character is not allowed in user data. |
160 | | - |
161 | | - $UpperLetter = "[A-Z"; |
162 | | - $LowerLetter = "[a-z"; |
163 | | - $AnyLetter = "[A-Za-z"; |
164 | | - $AnyLetter .= "_0-9"; |
165 | | - $UpperLetter .= "]"; $LowerLetter .= "]"; $AnyLetter .= "]"; |
166 | | - |
167 | | - # Main link pattern: lowercase between uppercase, then anything |
168 | | - $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter |
169 | | - . $AnyLetter . "*"; |
170 | | - # Optional subpage link pattern: uppercase, lowercase, then anything |
171 | | - $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*"; |
172 | | - |
173 | | - # Loose pattern: If subpage is used, subpage may be simple name |
174 | | - $this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)"; |
175 | | - $QDelim = '(?:"")?'; # Optional quote delimiter (not in output) |
176 | | - $this->LinkPattern .= $QDelim; |
177 | | - |
178 | | - # Inter-site convention: sites must start with uppercase letter |
179 | | - # (Uppercase letter avoids confusion with URLs) |
180 | | - $InterSitePattern = $UpperLetter . $AnyLetter . "+"; |
181 | | - $this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)"; |
182 | | - |
183 | | - $AnyLetter = "[-,. _0-9A-Za-z]"; |
184 | | - $this->FreeLinkPattern = "($AnyLetter+)"; |
185 | | - $this->FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)"; |
186 | | - $this->FreeLinkPattern .= $QDelim; |
187 | | - |
188 | | - # Url-style links are delimited by one of: |
189 | | - # 1. Whitespace (kept in output) |
190 | | - # 2. Left or right angle-bracket (< or >) (kept in output) |
191 | | - # 3. Right square-bracket (]) (kept in output) |
192 | | - # 4. A single double-quote (") (kept in output) |
193 | | - # 5. A $FS (field separator) character (kept in output) |
194 | | - # 6. A double double-quote ("") (removed from output) |
195 | | - |
196 | | - $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|" |
197 | | - . "prospero|telnet|gopher"; |
198 | | - $UrlProtocols .= '|file'; |
199 | | - $this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)"; |
200 | | - $ImageExtensions = "(gif|jpg|png|bmp|jpeg)"; |
201 | | - $RFCPattern = "RFC\\s?(\\d+)"; |
202 | | - $ISBNPattern = "ISBN:?([0-9- xX]{10,})"; |
203 | | - } |
204 | | - |
205 | | - function execute() { |
206 | | - $this->articleFileName = '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp'; |
207 | | - $this->patchFileName = '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp'; |
208 | | - $this->dataDir = $this->getOption( 'datadir' ); |
209 | | - $this->outFile = fopen( $this->getOption( 'outfile' ), 'w' ); |
210 | | - if ( !$this->outFile ) { |
211 | | - echo "Unable to open output file\n"; |
212 | | - return 1; |
213 | | - } |
214 | | - $this->writeXmlHeader(); |
215 | | - $this->readRclog(); |
216 | | - $this->writeMoveLog(); |
217 | | - $this->writeRevisions(); |
218 | | - $this->reconcileCurrentRevs(); |
219 | | - $this->writeXmlFooter(); |
220 | | - unlink( $this->articleFileName ); |
221 | | - unlink( $this->patchFileName ); |
222 | | - return 0; |
223 | | - } |
224 | | - |
225 | | - function writeXmlHeader() { |
226 | | - fwrite( $this->outFile, <<<EOT |
227 | | -<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en"> |
228 | | - <siteinfo> |
229 | | - <sitename>Wikipedia</sitename> |
230 | | - <base>http://www.wikipedia.com/</base> |
231 | | - <generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator> |
232 | | - <case>case-sensitive</case> |
233 | | - <namespaces> |
234 | | - <namespace key="0" /> |
235 | | - </namespaces> |
236 | | - </siteinfo> |
237 | | - |
238 | | -EOT |
239 | | - ); |
240 | | - } |
241 | | - |
242 | | - function writeXmlFooter() { |
243 | | - fwrite( $this->outFile, "</mediawiki>\n" ); |
244 | | - } |
245 | | - |
246 | | - function readRclog() { |
247 | | - $rcFile = fopen( "{$this->dataDir}/rclog", 'r' ); |
248 | | - while ( $line = fgets( $rcFile ) ) { |
249 | | - $bits = explode( $this->FS3, $line ); |
250 | | - if ( count( $bits ) !== 7 ) { |
251 | | - echo "Error reading rclog\n"; |
252 | | - return; |
253 | | - } |
254 | | - $params = array( |
255 | | - 'timestamp' => $bits[0], |
256 | | - 'rctitle' => $bits[1], |
257 | | - 'summary' => $bits[2], |
258 | | - 'minor' => $bits[3], |
259 | | - 'host' => $bits[4], |
260 | | - 'kind' => $bits[5], |
261 | | - 'extra' => array() |
262 | | - ); |
263 | | - $extraList = explode( $this->FS2, $bits[6] ); |
264 | | - |
265 | | - for ( $i = 0; $i < count( $extraList ); $i += 2 ) { |
266 | | - $params['extra'][$extraList[$i]] = $extraList[$i + 1]; |
267 | | - } |
268 | | - $this->rc[$params['timestamp']][] = $params; |
269 | | - } |
270 | | - } |
271 | | - |
272 | | - function writeMoveLog() { |
273 | | - $this->moveLog = array(); |
274 | | - $deepRenames = $this->deepRenames; |
275 | | - echo "Calculating move log...\n"; |
276 | | - $this->processDiffFile( array( $this, 'moveLogCallback' ) ); |
277 | | - |
278 | | - // We have the timestamp intervals, now make a guess at the actual timestamp |
279 | | - foreach ( $this->moveLog as $newTitle => $params ) { |
280 | | - // Is there a time specified? |
281 | | - $drTime = false; |
282 | | - if ( isset( $deepRenames[$params['old']] ) ) { |
283 | | - $drTime = $deepRenames[$params['old']]; |
284 | | - if ( $drTime !== '?' ) { |
285 | | - if ( ( !isset( $params['endTime'] ) || $drTime < $params['endTime'] ) |
286 | | - && $drTime > $params['startTime'] ) |
287 | | - { |
288 | | - $this->moveLog[$newTitle]['timestamp'] = $drTime; |
289 | | - $this->moveLog[$newTitle]['deep'] = true; |
290 | | - |
291 | | - echo "{$params['old']} -> $newTitle at $drTime\n"; |
292 | | - unset( $deepRenames[$params['old']] ); |
293 | | - continue; |
294 | | - } else { |
295 | | - echo "WARNING: deep rename time invalid: {$params['old']}\n"; |
296 | | - unset( $deepRenames[$params['old']] ); |
297 | | - } |
298 | | - } |
299 | | - } |
300 | | - |
301 | | - // Guess that it is one second after the last edit to the page before it was moved |
302 | | - $this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1; |
303 | | - if ( $drTime === '?' ) { |
304 | | - $this->moveLog[$newTitle]['deep'] = true; |
305 | | - unset( $deepRenames[$params['old']] ); |
306 | | - } |
307 | | - if ( isset( $params['endTime'] ) ) { |
308 | | - $this->printLatin1( "{$params['old']} -> $newTitle between " . |
309 | | - "{$params['startTime']} and {$params['endTime']}\n" ); |
310 | | - } else { |
311 | | - $this->printLatin1( "{$params['old']} -> $newTitle after " . |
312 | | - "{$params['startTime']}\n" ); |
313 | | - } |
314 | | - } |
315 | | - |
316 | | - // Write the move log to the XML file |
317 | | - $id = 1; |
318 | | - foreach ( $this->moveLog as $newTitle => $params ) { |
319 | | - $out = "<logitem>\n" . |
320 | | - $this->element( 'id', $id++ ) . |
321 | | - $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) . |
322 | | - "<contributor>\n" . |
323 | | - $this->element( 'username', 'UseModWiki admin' ) . |
324 | | - "</contributor>" . |
325 | | - $this->element( 'type', 'move' ) . |
326 | | - $this->element( 'action', 'move' ) . |
327 | | - $this->element( 'logtitle', $params['old'] ) . |
328 | | - "<params xml:space=\"preserve\">" . |
329 | | - htmlspecialchars( $this->encode( "{$newTitle}\n1" ) ) . |
330 | | - "</params>\n" . |
331 | | - "</logitem>\n"; |
332 | | - fwrite( $this->outFile, $out ); |
333 | | - } |
334 | | - |
335 | | - // Check for remaining deep rename entries |
336 | | - if ( $deepRenames ) { |
337 | | - echo "WARNING: the following entries in \$this->deepRenames are " . |
338 | | - "invalid, since no such move exists:\n" . |
339 | | - implode( "\n", array_keys( $deepRenames ) ) . |
340 | | - "\n\n"; |
341 | | - } |
342 | | - |
343 | | - } |
344 | | - |
345 | | - function element( $name, $value ) { |
346 | | - return "<$name>" . htmlspecialchars( $this->encode( $value ) ) . "</$name>\n"; |
347 | | - } |
348 | | - |
349 | | - function moveLogCallback( $entry ) { |
350 | | - $rctitle = $entry['rctitle']; |
351 | | - $title = $entry['title']; |
352 | | - $this->moveDests[$rctitle] = $title; |
353 | | - |
354 | | - if ( $rctitle === $title ) { |
355 | | - if ( isset( $this->moveLog[$rctitle] ) |
356 | | - && !isset( $this->moveLog[$rctitle]['endTime'] ) ) |
357 | | - { |
358 | | - // This is the latest time that the page could have been moved |
359 | | - $this->moveLog[$rctitle]['endTime'] = $entry['timestamp']; |
360 | | - } |
361 | | - } else { |
362 | | - if ( !isset( $this->moveLog[$rctitle] ) ) { |
363 | | - // Initialise the move log entry |
364 | | - $this->moveLog[$rctitle] = array( |
365 | | - 'old' => $title |
366 | | - ); |
367 | | - } |
368 | | - // Update the earliest time the page could have been moved |
369 | | - $this->moveLog[$rctitle]['startTime'] = $entry['timestamp']; |
370 | | - } |
371 | | - } |
372 | | - |
373 | | - function writeRevisions() { |
374 | | - $this->numGoodRevs = 0; |
375 | | - $this->revId = 1; |
376 | | - $this->processDiffFile( array( $this, 'revisionCallback' ) ); |
377 | | - echo "\n\nImported {$this->numGoodRevs} out of {$this->numRevs}\n"; |
378 | | - } |
379 | | - |
380 | | - function revisionCallback( $params ) { |
381 | | - $title = $params['rctitle']; |
382 | | - $editTime = $params['timestamp']; |
383 | | - |
384 | | - if ( isset( $this->blacklist[$title] ) ) { |
385 | | - return; |
386 | | - } |
387 | | - $this->doPendingOps( $editTime ); |
388 | | - |
389 | | - $origText = $this->getText( $title ); |
390 | | - $text = $this->patch( $origText, $params['diff'] ); |
391 | | - if ( $text === false ) { |
392 | | - echo "$editTime $title attempting resolution...\n"; |
393 | | - $linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] ); |
394 | | - if ( !$linkSubstitutes ) { |
395 | | - $this->printLatin1( "$editTime $title DIFF FAILED\n" ); |
396 | | - $this->blacklist[$title] = true; |
397 | | - return; |
398 | | - } |
399 | | - $this->printLatin1( "$editTime $title requires substitutions:\n" ); |
400 | | - $time = $editTime - 1; |
401 | | - foreach ( $linkSubstitutes as $old => $new ) { |
402 | | - $this->printLatin1( "SUBSTITUTE $old -> $new\n" ); |
403 | | - $this->renameTextLinks( $old, $new, $time-- ); |
404 | | - } |
405 | | - $origText = $this->getText( $title ); |
406 | | - $text = $this->patch( $origText, $params['diff'] ); |
407 | | - if ( $text === false ) { |
408 | | - $this->printLatin1( "$editTime $title STILL FAILS!\n" ); |
409 | | - $this->blacklist[$title] = true; |
410 | | - return; |
411 | | - } |
412 | | - |
413 | | - echo "\n"; |
414 | | - } |
415 | | - |
416 | | - $params['text'] = $text; |
417 | | - $this->saveRevision( $params ); |
418 | | - $this->numGoodRevs++; |
419 | | - #$this->printLatin1( "$editTime $title\n" ); |
420 | | - } |
421 | | - |
/**
 * Run every queued operation whose time is not later than $editTime.
 *
 * Three queues are drained in this order: $this->moveLog (entries carrying
 * a 'deep' key trigger renameTextLinks()), $this->renameTextLinksOps
 * (explicit link substitutions keyed by time) and
 * $this->unixLineEndingsOps (pages whose "\r" characters are stripped and
 * saved as a new revision). Processed entries are removed from their queue.
 *
 * @param $editTime Timestamp of the revision about to be processed
 */
function doPendingOps( $editTime ) {
	foreach ( $this->moveLog as $target => $move ) {
		if ( $move['timestamp'] > $editTime ) {
			continue;
		}
		unset( $this->moveLog[$target] );
		if ( isset( $move['deep'] ) ) {
			$this->renameTextLinks( $move['old'], $target, $move['timestamp'] );
		}
	}

	foreach ( $this->renameTextLinksOps as $opTime => $pairs ) {
		if ( $editTime < $opTime ) {
			continue;
		}
		foreach ( $pairs as $from => $to ) {
			$this->printLatin1( "SUBSTITUTE $from -> $to\n" );
			$this->renameTextLinks( $from, $to, $opTime );
		}
		unset( $this->renameTextLinksOps[$opTime] );
	}

	foreach ( $this->unixLineEndingsOps as $opTime => $pageName ) {
		if ( $editTime < $opTime ) {
			continue;
		}
		$this->printLatin1( "$opTime $pageName FIXING LINE ENDINGS\n" );
		$cleaned = str_replace( "\r", '', $this->getText( $pageName ) );
		$revision = array(
			'rctitle' => $pageName,
			'timestamp' => $opTime,
			'extra' => array( 'name' => 'UseModWiki admin' ),
			'text' => $cleaned,
			'summary' => 'Fixing line endings',
		);
		$this->saveRevision( $revision );
		unset( $this->unixLineEndingsOps[$opTime] );
	}
}
458 | | - |
/**
 * Apply a diff to a text by running the external `patch` program.
 *
 * The text and the diff are written to the scratch files named by
 * $this->articleFileName and $this->patchFileName. `patch` is run with
 * -n (normal-format diff), "-r -" (no reject file),
 * --no-backup-if-mismatch and --binary, and the patched article file is
 * read back.
 *
 * @param $source String: text to patch
 * @param $diff String: diff to apply to $source
 * @return String|false The patched text, or false if a scratch file could
 *   not be written, `patch` exited with a non-zero status, or the result
 *   could not be read back
 */
function patch( $source, $diff ) {
	// A failed write would otherwise let `patch` run against whatever stale
	// content is left in the scratch files and yield a bogus "success".
	if ( file_put_contents( $this->articleFileName, $source ) === false
		|| file_put_contents( $this->patchFileName, $diff ) === false )
	{
		fwrite( STDERR, "Unable to write temporary files for patch\n" );
		return false;
	}

	$status = 0;
	// The command's output is not needed; only the exit status is checked.
	wfShellExec(
		wfEscapeShellArg(
			'patch',
			'-n',
			'-r', '-',
			'--no-backup-if-mismatch',
			'--binary',
			$this->articleFileName,
			$this->patchFileName
		) . ' 2>&1',
		$status
	);
	if ( $status ) {
		return false;
	}

	// file_get_contents() already returns false on failure.
	return file_get_contents( $this->articleFileName );
}
481 | | - |
/**
 * Work out which link substitutions would make a non-applying diff apply.
 *
 * The "<" lines of each hunk in the diff are collected as the text the
 * diff expects to find, indexed by zero-based source line. Each expected
 * line that differs from the corresponding line of $origText is passed to
 * resolveTextChange(); the substitutions it returns are merged (first
 * mapping for a given key wins). Lines it cannot resolve are reported on
 * standard output.
 *
 * @param $origText String: the text the diff failed to apply to
 * @param $diff String: the diff
 * @return Array Map of old link => new link; empty if nothing was found
 */
function resolveFailedDiff( $origText, $diff ) {
	// Pass 1: gather the expected source lines from the hunks.
	$expected = array();
	$lines = explode( "\n", $diff );
	$total = count( $lines );
	$pos = 0;
	while ( $pos < $total ) {
		$header = $lines[$pos];
		$pos++;
		if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $header, $m ) ) {
			continue;
		}

		$srcLine = intval( $m[1] );
		while ( $pos < $total && substr( $lines[$pos], 0, 1 ) === '<' ) {
			// Drop the two-character "< " marker.
			$expected[$srcLine - 1] = substr( $lines[$pos], 2 );
			$srcLine++;
			$pos++;
		}
	}

	// Pass 2: compare expectations with the actual text.
	$substitutions = array();
	$actualLines = explode( "\n", $origText );
	foreach ( $expected as $index => $wanted ) {
		$actual = isset( $actualLines[$index] ) ? $actualLines[$index] : '';
		if ( $wanted === $actual ) {
			continue;
		}
		$outcome = $this->resolveTextChange( $actual, $wanted );
		if ( !is_array( $outcome ) ) {
			echo "Resolution failure on line " . ( $index + 1 ) . "\n";
			$this->printLatin1( $outcome );
			continue;
		}
		$substitutions += $outcome;
	}

	return $substitutions;
}
519 | | - |
/**
 * Try to explain the difference between two texts purely as link renames.
 *
 * Links found only in $dest are paired with the closest (by Levenshtein
 * distance) link found only in $source. The pairing is accepted only if
 * applying it to $source through substituteTextLinks() reproduces $dest
 * exactly.
 *
 * @param $source String: text as currently known
 * @param $dest String: text that is wanted
 * @return Array|String Map of removed link => new link on success;
 *   otherwise a multi-line diagnostic message
 */
function resolveTextChange( $source, $dest ) {
	$linksBefore = $this->getLinkList( $source );
	$linksAfter = $this->getLinkList( $dest );
	$added = array_diff( $linksAfter, $linksBefore );
	$dropped = array_diff( $linksBefore, $linksAfter );

	// Pair every added link with the nearest still-unclaimed dropped link.
	$pairs = array();
	foreach ( $added as $addedLink ) {
		$closest = false;
		$closestDistance = 100000000;
		foreach ( $dropped as $candidate ) {
			$distance = levenshtein( $addedLink, $candidate );
			if ( $distance < $closestDistance ) {
				$closestDistance = $distance;
				$closest = $candidate;
			}
		}
		if ( $closest === false ) {
			continue;
		}
		$pairs[$closest] = $addedLink;
		$added = array_diff( $added, array( $addedLink ) );
		$dropped = array_diff( $dropped, array( $closest ) );
	}

	// Verify the pairing by replaying it on the source text.
	$candidateText = $source;
	foreach ( $pairs as $from => $to ) {
		$candidateText = $this->substituteTextLinks( $from, $to, $candidateText );
	}
	if ( $candidateText === $dest ) {
		return $pairs;
	}

	// Resolution failed: hand back a report instead of a mapping.
	$report = array(
		"Source line: $source",
		'Source links: ' . implode( ', ', $linksBefore ),
		"Context line: $dest",
		'Context links: ' . implode( ', ', $linksAfter ),
		"Proposal: $candidateText",
	);
	return implode( "\n", $report ) . "\n";
}
560 | | - |
/**
 * Read "{$this->dataDir}/diff_log" and invoke $callback once per entry
 * that can be matched to an entry in $this->rc.
 *
 * The file handle is now validated after opening and closed on every
 * return path.
 *
 * @param $callback Callable: invoked with the matched entry's parameter
 *   array, extended with 'diff', 'title' and 'diffStartLine'
 * @return Boolean false if the file cannot be opened or does not start
 *   with the delimiter line; true otherwise
 */
function processDiffFile( $callback ) {
	$path = "{$this->dataDir}/diff_log";
	$diffFile = fopen( $path, 'r' );
	if ( $diffFile === false ) {
		echo "Cannot open diff file $path\n";
		return false;
	}

	$result = $this->processDiffStream( $diffFile, $callback );
	fclose( $diffFile );
	return $result;
}

/**
 * Parse an already opened diff log; helper for processDiffFile().
 *
 * The log consists of entries separated by a "------" line. Each entry
 * starts with a "title|timestamp" header followed by the diff text.
 * $this->numRevs counts every entry read, matched or not.
 *
 * @param $diffFile Resource: readable stream positioned at the start
 * @param $callback Callable: see processDiffFile()
 * @return Boolean false if the first line is not the delimiter; true
 *   otherwise
 */
function processDiffStream( $diffFile, $callback ) {
	$delimiter = "------\n";
	file_put_contents( $this->articleFileName, "Describe the new page here.\n" );

	$line = fgets( $diffFile );
	$lineNum = 1;
	if ( $line !== $delimiter ) {
		echo "Invalid diff file\n";
		return false;
	}
	$lastReportLine = 0;
	$this->numRevs = 0;

	while ( true ) {
		$line = fgets( $diffFile );
		$lineNum++;
		if ( $line === false ) {
			break;
		}
		// Progress indicator on stderr, roughly every 1000 lines.
		if ( $lineNum > $lastReportLine + 1000 ) {
			$lastReportLine = $lineNum;
			fwrite( STDERR, "$lineNum \r" );
			fflush( STDERR );
		}
		$line = trim( $line );
		if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
			echo "Invalid header on line $lineNum\n";
			// NOTE(review): a bad header stops processing but still reports
			// success, as before -- confirm this is intended.
			return true;
		}
		list( , $title, $editTime ) = $matches;

		// Accumulate the diff body up to the next delimiter.
		$diff = '';
		$diffStartLine = $lineNum;
		while ( true ) {
			$line = fgets( $diffFile );
			$lineNum++;
			if ( $line === $delimiter ) {
				break;
			}
			if ( $line === false ) {
				// End of file (or read error) inside an entry: the
				// unterminated entry is dropped.
				break 2;
			}
			$diff .= $line;
		}

		$this->numRevs++;

		if ( !isset( $this->rc[$editTime] ) ) {
			$this->printLatin1( "$editTime $title DELETED, skipping\n" );
			continue;
		}

		if ( count( $this->rc[$editTime] ) == 1 ) {
			$params = $this->rc[$editTime][0];
		} else {
			// Several entries share this timestamp; pick the one whose
			// title matches the diff header.
			$params = false;
			$candidates = '';
			foreach ( $this->rc[$editTime] as $rc ) {
				if ( $rc['rctitle'] === $title ) {
					$params = $rc;
					break;
				}
				if ( $candidates === '' ) {
					$candidates = $rc['rctitle'];
				} else {
					$candidates .= ', ' . $rc['rctitle'];
				}
			}
			if ( !$params ) {
				$this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
				$this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" );
				continue;
			}
		}
		$params['diff'] = $diff;
		$params['title'] = $title;
		$params['diffStartLine'] = $diffStartLine;
		call_user_func( $callback, $params );
	}
	echo "\n";

	if ( !feof( $diffFile ) ) {
		echo "Stopped at line $lineNum\n";
	}
	return true;
}
649 | | - |
/**
 * Compare every text in $this->textCache with the text stored in the
 * matching page file under "{$this->dataDir}/page/" and report
 * differences.
 *
 * Page files live in a subdirectory named after the title's first letter
 * when that is A-Z, and in "other" otherwise. Differences that
 * resolveTextChange() can explain as link renames are reported as needed
 * substitutions (or as deep renames when the new name is a key of
 * $this->moveLog); everything else is printed as a diff.
 */
function reconcileCurrentRevs() {
	foreach ( $this->textCache as $title => $text ) {
		$bucket = preg_match( '/^[A-Z]/', $title, $m ) ? $m[0] : 'other';
		$fileName = "{$this->dataDir}/page/" . $bucket . "/$title.db";

		if ( !file_exists( $fileName ) ) {
			$this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
			continue;
		}

		// The page file is three nested key/value records, each with its
		// own separator.
		$raw = file_get_contents( $fileName );
		$page = $this->unserializeUseMod( $raw, $this->FS1 );
		$section = $this->unserializeUseMod( $page['text_default'], $this->FS2 );
		$data = $this->unserializeUseMod( $section['data'], $this->FS3 );
		$pageText = $data['text'];

		if ( $text === $pageText ) {
			continue;
		}

		$substs = $this->resolveTextChange( $text, $pageText );
		if ( !is_array( $substs ) ) {
			$this->printLatin1( "ERROR: unresolved diff in $title:\n" );
			wfSuppressWarnings();
			$diff = xdiff_string_diff( $text, $pageText ) . '';
			wfRestoreWarnings();
			$this->printLatin1( "$diff\n" );
			continue;
		}

		foreach ( $substs as $source => $dest ) {
			$message = isset( $this->moveLog[$dest] )
				? "ERROR: need deep rename: $source\n"
				: "ERROR: need substitute: $source -> $dest\n";
			$this->printLatin1( $message );
		}
	}
}
690 | | - |
/**
 * Build a Title object from a title string after passing it through
 * encode().
 *
 * @param $titleText String
 * @return mixed Whatever Title::newFromText() returns for the encoded text
 */
function makeTitle( $titleText ) {
	$encoded = $this->encode( $titleText );
	return Title::newFromText( $encoded );
}
694 | | - |
/**
 * Fetch the cached text of a page.
 *
 * @param $titleText String: key into $this->textCache
 * @return String The cached text, or the placeholder
 *   "Describe the new page here.\n" when nothing is cached for the title
 */
function getText( $titleText ) {
	if ( isset( $this->textCache[$titleText] ) ) {
		return $this->textCache[$titleText];
	}
	return "Describe the new page here.\n";
}
702 | | - |
/**
 * Record a revision: remember its text in $this->textCache and write a
 * <page> element containing one <revision> to $this->outFile.
 *
 * Keys read from $params:
 *  - 'rctitle': page title; also the text cache key
 *  - 'timestamp': passed to wfTimestamp( TS_ISO_8601, ... )
 *  - 'text': revision text; passed through encode() and htmlspecialchars()
 *  - 'summary': edit comment
 *  - 'extra' (optional): array that may carry 'name' and 'id' for the
 *    contributor
 *  - 'host' (optional): written as the contributor's <ip>
 *  - 'minor' (optional): when non-empty a <minor/> element is written;
 *    previously this flag was accepted but silently dropped
 *
 * Each call consumes one value of $this->revId.
 *
 * @param $params Array: see above
 */
function saveRevision( $params ) {
	$this->textCache[$params['rctitle']] = $params['text'];

	$out = "<page>\n" .
		$this->element( 'title', $params['rctitle'] ) .
		"<revision>\n" .
		$this->element( 'id', $this->revId ++ ) .
		$this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) .
		"<contributor>\n";
	if ( isset( $params['extra']['name'] ) ) {
		$out .= $this->element( 'username', $params['extra']['name'] );
	}
	if ( isset( $params['extra']['id'] ) ) {
		$out .= $this->element( 'id', $params['extra']['id'] );
	}
	if ( isset( $params['host'] ) ) {
		$out .= $this->element( 'ip', $params['host'] );
	}
	$out .= "</contributor>\n";
	// The export schema places <minor/> between the contributor and the
	// comment. empty() also treats the string "0" as "not minor".
	if ( !empty( $params['minor'] ) ) {
		$out .= "<minor/>\n";
	}
	$out .=
		$this->element( 'comment', $params['summary'] ) .
		"<text xml:space=\"preserve\">" .
		htmlspecialchars( $this->encode( $params['text'] ) ) .
		"</text>\n" .
		"</revision>\n" .
		"</page>\n";
	fwrite( $this->outFile, $out );
}
731 | | - |
/**
 * Rewrite links to $old so they point to $new in every cached page, saving
 * a new revision for each page whose text changes.
 *
 * Underscores in both names are replaced by spaces before matching. The
 * page whose title equals the (underscored) new name is left alone when
 * that title is listed in $this->skipSelfSubstitution.
 *
 * @param $old String: link target to replace
 * @param $new String: replacement link target
 * @param $timestamp Timestamp given to the generated revisions
 */
function renameTextLinks( $old, $new, $timestamp ) {
	$targetTitle = $new;
	$old = str_replace( '_', ' ', $old );
	$new = str_replace( '_', ' ', $new );

	foreach ( $this->textCache as $title => $before ) {
		// Hack to make Pythagorean_Theorem etc. work
		$isExemptTarget = $targetTitle === $title
			&& in_array( $title, $this->skipSelfSubstitution );
		if ( $isExemptTarget ) {
			continue;
		}

		$after = $this->substituteTextLinks( $old, $new, $before );
		if ( $before === $after ) {
			continue;
		}

		$revision = array(
			'rctitle' => $title,
			'timestamp' => $timestamp,
			'text' => $after,
			'extra' => array( 'name' => 'Page move link fixup script' ),
			'summary' => '',
			'minor' => true
		);
		$this->saveRevision( $revision );
	}
}
758 | | - |
/**
 * Replace links to $old with links to $new inside a piece of wiki text.
 *
 * The text goes through an ordered series of regex passes. Spans that must
 * not be touched (<pre>, <code>, <nowiki>, URLs, interwiki links) are
 * swapped for numbered placeholders by storeRaw(); free links and plain
 * wiki links are rewritten by subFreeLink() / subWikiLink() and then also
 * stored. The final pass puts every stored span back via restoreRaw().
 * The order of the passes is significant.
 *
 * Side effects: resets $this->saveUrl and sets $this->old / $this->new.
 *
 * @param $old String: link target to replace
 * @param $new String: replacement link target
 * @param $text String: wiki text
 * @return String The rewritten text
 */
function substituteTextLinks( $old, $new, $text ) {
	$this->saveUrl = array();
	$this->old = $old;
	$this->new = $new;

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Pattern / callback-method pairs, applied strictly in this order.
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'subFreeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'subFreeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'subWikiLink' ),
		# Restore saved text
		array( "/{$this->FS}(\d+){$this->FS}/", 'restoreRaw' ),
	);
	foreach ( $passes as $pass ) {
		list( $pattern, $method ) = $pass;
		$text = preg_replace_callback( $pattern, array( $this, $method ), $text );
	}
	return $text;
}
791 | | - |
/**
 * Collect the link targets that occur in a piece of wiki text.
 *
 * Uses the same ordered regex passes as substituteTextLinks(), except that
 * free links and plain wiki links are recorded through storeLink() rather
 * than rewritten, and nothing is restored at the end.
 *
 * Side effects: resets $this->saveUrl and $this->linkList.
 *
 * @param $text String: wiki text
 * @return Array List of link targets in the order storeLink() saw them
 */
function getLinkList( $text ) {
	$this->saveUrl = array();
	$this->linkList = array();

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Pattern / callback-method pairs, applied strictly in this order.
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'storeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'storeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'storeLink' ),
	);
	foreach ( $passes as $pass ) {
		list( $pattern, $method ) = $pass;
		$text = preg_replace_callback( $pattern, array( $this, $method ), $text );
	}

	return $this->linkList;
}
821 | | - |
/**
 * Regex callback: stash the first capture group in $this->saveUrl and
 * return a placeholder made of its index wrapped in $this->FS on both
 * sides.
 *
 * @param $m Array: match array; only $m[1] is used
 * @return String The placeholder
 */
function storeRaw( $m ) {
	$this->saveUrl[] = $m[1];
	$slot = count( $this->saveUrl ) - 1;
	return "{$this->FS}{$slot}{$this->FS}";
}
826 | | - |
/**
 * Regex callback for free links ([[target]] or [[target|name]]): swap the
 * target for $this->new when, after trimming surrounding whitespace, it is
 * exactly $this->old. The rebuilt link is stored through storeRaw().
 *
 * The comparison is a strict string comparison. A loose == would compare
 * numeric-looking names numerically (e.g. "1e3" == "1000") and behaves
 * unexpectedly when $this->old arrived as an integer array key.
 *
 * @param $m Array: match array; $m[1] is the target, $m[2] the optional
 *   display name
 * @return String Placeholder returned by storeRaw()
 */
function subFreeLink( $m ) {
	$originalTarget = $m[1];
	$name = isset( $m[2] ) ? $m[2] : '';

	$trimmed = preg_replace( '/^\s+/', '', $originalTarget );
	$trimmed = preg_replace( '/\s+$/', '', $trimmed );

	if ( $trimmed === (string)$this->old ) {
		$target = $this->new;
	} else {
		# Preserve spaces if no match
		$target = $originalTarget;
	}

	$link = "[[$target";
	if ( $name !== "" ) {
		$link .= "|$name";
	}
	$link .= "]]";
	return $this->storeRaw( array( 1 => $link ) );
}
849 | | - |
/**
 * Regex callback for plain wiki links: replace the link with $this->new
 * when it is exactly $this->old. If the new name does not itself match
 * $this->LinkPattern it is wrapped in [[...]]. The result is stored
 * through storeRaw().
 *
 * The comparison is a strict string comparison. A loose == would compare
 * numeric-looking names numerically and behaves unexpectedly when
 * $this->old arrived as an integer array key.
 *
 * @param $m Array: match array; only $m[1] is used
 * @return String Placeholder returned by storeRaw()
 */
function subWikiLink( $m ) {
	$link = $m[1];
	if ( $link === (string)$this->old ) {
		$link = $this->new;
		if ( !preg_match( "/^{$this->LinkPattern}$/", $this->new ) ) {
			$link = "[[$link]]";
		}
	}
	return $this->storeRaw( array( 1 => $link ) );
}
860 | | - |
/**
 * Regex callback: look up a stored span by the index captured from a
 * placeholder.
 *
 * @param $m Array: match array; $m[1] is the index into $this->saveUrl
 * @return String The stored text
 */
function restoreRaw( $m ) {
	$slot = $m[1];
	return $this->saveUrl[$slot];
}
864 | | - |
/**
 * Regex callback: append the first capture group to $this->linkList, then
 * store the match through storeRaw().
 *
 * @param $m Array: match array; $m[1] is the link target
 * @return String Placeholder returned by storeRaw()
 */
function storeLink( $m ) {
	$target = $m[1];
	$this->linkList[] = $target;
	return $this->storeRaw( $m );
}
869 | | - |
/**
 * Translate a string with strtr() using $this->encodeMap.
 *
 * @param $s String
 * @return String The translated string
 */
function encode( $s ) {
	$map = $this->encodeMap;
	return strtr( $s, $map );
}
873 | | - |
/**
 * Translate a string with strtr() using $this->decodeMap.
 *
 * @param $s String
 * @return String The translated string
 */
function decode( $s ) {
	$map = $this->decodeMap;
	return strtr( $s, $map );
}
877 | | - |
/**
 * Echo a string after passing it through encode().
 *
 * @param $s String: text to print (presumably Latin-1, going by the
 *   method name -- not verifiable from this method alone)
 */
function printLatin1( $s ) {
	$converted = $this->encode( $s );
	echo $converted;
}
881 | | - |
/**
 * Split a separator-delimited record of alternating keys and values into
 * an associative array.
 *
 * A trailing key without a value (odd number of fields, which includes the
 * empty input string) maps to null. This is the value the old code
 * produced too, but without reading an undefined offset.
 *
 * @param $s String: serialized record
 * @param $sep String: field separator
 * @return Array Map of key => value
 */
function unserializeUseMod( $s, $sep ) {
	$parts = explode( $sep, $s );
	$total = count( $parts );
	$result = array();
	for ( $i = 0; $i < $total; $i += 2 ) {
		$result[$parts[$i]] = isset( $parts[$i + 1] ) ? $parts[$i + 1] : null;
	}
	return $result;
}
890 | | -} |
891 | | - |
// Name the maintenance class for the runner, then include the runner file
// identified by RUN_MAINTENANCE_IF_MAIN.
$maintClass = "ImportUseModWikipedia";
require_once RUN_MAINTENANCE_IF_MAIN;