r2097 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r2096 | r2097 | r2098 >
Date:23:40, 28 November 2003
Author:vibber
Status:old
Tags:
Comment:
Updates
Modified paths:
  • /branches/stable/phase3/maintenance/importUseModWiki.php (modified) (history)

Diff [purge]

Index: branches/stable/phase3/maintenance/importUseModWiki.php
@@ -8,12 +8,17 @@
99 Updated limited version to get something working temporarily
1010 2003-10-09
1111 Be sure to run the link & index rebuilding scripts!
 12+
 13+ Some more munging for charsets etc
 14+ 2003-11-28
1215
1316 */
1417
 18+/* Set these correctly! */
 19+$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
 20+$wgRootDirectory = "/home/usemod/wiki-fi/lib-http/db/wiki";
 21+
1522 /* globals */
16 -$wgRootDirectory = "/Users/brion/src/wiki/convert/wiki-fy/lib-http/db/wiki";
17 -$wgRootDirectory = "/home/usemod/wiki-fy/lib-http/db/wiki";
1823 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
1924 $FS = $wgFieldSeparator ;
2025 $FS1 = $FS."1" ;
@@ -252,13 +257,36 @@
253258
254259 # Whee!
# Convert one chunk of imported page text to UTF-8.
#
# Steps, in order:
#   1. normalise CRLF line endings to LF;
#   2. transcode from $wgImportEncoding to UTF-8 with iconv();
#   3. expand numeric character references (&#1234; / &#x4d2;) through
#      wfMungeToUtf8().
#
# $string: raw text in the charset named by the global $wgImportEncoding.
# Returns the text as UTF-8. If iconv() cannot convert the input, a warning
# is raised and the text is passed on unconverted instead of being dropped.
function recodeText( $string ) {
	global $wgImportEncoding;

	$string = str_replace( "\r\n", "\n", $string );

	$converted = iconv( $wgImportEncoding, "UTF-8", $string );
	if ( $converted === false ) {
		# iconv() returns false when the input contains a byte sequence that
		# is illegal in the source charset. Handing that false on would turn
		# the whole page into an empty string without any hint of the loss.
		trigger_error(
			"recodeText: iconv() could not convert text from " .
			"$wgImportEncoding to UTF-8; leaving it unconverted",
			E_USER_WARNING
		);
		$converted = $string;
	}

	$string = wfMungeToUtf8( $converted ); # Any old &#1234; stuff
	return $string;
}
261268
# Encode a single Unicode code point as a UTF-8 byte sequence.
#
# $codepoint: integer code point.
# Returns the 1-4 byte UTF-8 encoding for any Unicode scalar value
# (U+0000..U+10FFFF, excluding the surrogate range U+D800..U+DFFF).
# Anything else (negative numbers, surrogates, values above U+10FFFF) has no
# valid UTF-8 form and is returned as the decimal character reference
# "&#N;" so the caller's text is not corrupted.
function wfUtf8Sequence($codepoint) {
	# Values with no valid UTF-8 encoding fall back to the entity form.
	if($codepoint < 0 || $codepoint > 0x10ffff ||
		($codepoint >= 0xd800 && $codepoint <= 0xdfff)) {
		return "&#$codepoint;";
	}

	# 1 byte: 0xxxxxxx
	if($codepoint < 0x80) return chr($codepoint);

	# 2 bytes: 110xxxxx 10xxxxxx
	if($codepoint < 0x800) return chr(($codepoint >> 6) | 0xc0) .
		chr(($codepoint & 0x3f) | 0x80);

	# 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if($codepoint < 0x10000) return chr(($codepoint >> 12) | 0xe0) .
		chr((($codepoint >> 6) & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);

	# 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	# Covers every supplementary-plane code point up to U+10FFFF.
	return chr(($codepoint >> 18) | 0xf0) .
		chr((($codepoint >> 12) & 0x3f) | 0x80) .
		chr((($codepoint >> 6) & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);
}
262283
# Callback for wfMungeToUtf8(): turn one matched numeric character reference
# into its UTF-8 bytes.
#
# $matches[0] is the whole reference, $matches[1] the decimal digits (empty
# string for a hex reference) and $matches[2] the hex digits (hex case only).
# References above U+10FFFF are returned untouched.
function wfMungeNumericEntity($matches) {
	if($matches[1] !== '') {
		# Explicit base 10: leading zeros must not be read as octal.
		$codepoint = intval($matches[1], 10);
	} else {
		# hexdec() yields a float for oversized input; the range check
		# below still rejects it.
		$codepoint = hexdec($matches[2]);
	}
	if($codepoint > 0x10ffff) return $matches[0];
	return wfUtf8Sequence((int)$codepoint);
}

# Replace numeric character references in $string with UTF-8 sequences.
#
# Handles decimal (&#1234;) and hexadecimal (&#x4d2; / &#X4D2;) references.
# Both forms are decoded in a single pass, so text produced by one
# replacement is never decoded a second time.
#
# $string: UTF-8 text possibly containing numeric character references.
# Returns the text with those references expanded; on a PCRE failure the
# input is returned unchanged.
function wfMungeToUtf8($string) {
	# preg_replace_callback() replaces the old /e (eval) modifier, which no
	# longer exists in PHP 7+ and evaluated "&#0123;" as an octal literal.
	$result = preg_replace_callback(
		'/&#(?:([0-9]+)|x([0-9a-f]+));/i',
		'wfMungeNumericEntity',
		$string
	);
	# Should also do named entities here
	return $result === null ? $string : $result;
}
 290+
# Escape a string for use inside a quoted MySQL string literal.
#
# $string: raw text.
# Returns the text with NUL, newline, carriage return, backslash, single
# quote, double quote and Ctrl-Z backslash-escaped.
#
# mysql_escape_string() is used when the legacy mysql extension provides it.
# That function was removed in PHP 7.0, so otherwise the same escaping rules
# are applied here. Like mysql_escape_string(), this ignores the connection
# character set.
# NOTE(review): that is safe for the UTF-8 text this script produces, but not
# for multibyte charsets such as GBK or SJIS.
function wfStrencode( $string ) {
	if ( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	# strtr() with an array works in a single pass, so the backslashes it
	# inserts are never escaped again.
	return strtr( $string, array(
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"\\"   => "\\\\",
		"'"    => "\\'",
		"\""   => "\\\"",
		"\x1a" => "\\Z"
	) );
}

Status & tagging log