Index: branches/stable/phase3/maintenance/importUseModWiki.php |
— | — | @@ -8,12 +8,17 @@ |
9 | 9 | Updated limited version to get something working temporarily |
10 | 10 | 2003-10-09 |
11 | 11 | Be sure to run the link & index rebuilding scripts! |
| 12 | + |
| 13 | + Some more munging for charsets etc |
| 14 | + 2003-11-28 |
12 | 15 | |
13 | 16 | */ |
14 | 17 | |
| 18 | +/* Set these correctly! */ |
| 19 | +$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */ |
| 20 | +$wgRootDirectory = "/home/usemod/wiki-fi/lib-http/db/wiki"; |
| 21 | + |
15 | 22 | /* globals */ |
16 | | -$wgRootDirectory = "/Users/brion/src/wiki/convert/wiki-fy/lib-http/db/wiki"; |
17 | | -$wgRootDirectory = "/home/usemod/wiki-fy/lib-http/db/wiki"; |
18 | 23 | $wgFieldSeparator = "\xb3"; # Some wikis may use different char |
19 | 24 | $FS = $wgFieldSeparator ; |
20 | 25 | $FS1 = $FS."1" ; |
— | — | @@ -252,13 +257,36 @@ |
253 | 258 | |
254 | 259 | # Whee! |
/**
 * Convert a chunk of imported wiki text to UTF-8.
 *
 * Steps, in order:
 *  1. CRLF line endings are normalised to LF.
 *  2. The text is converted from $wgImportEncoding to UTF-8 with iconv().
 *  3. Numeric character references (&#1234; / &#x4d2;) are expanded to raw
 *     UTF-8 by wfMungeToUtf8().
 *
 * @param string $string Raw text in the encoding named by $wgImportEncoding
 * @return string The text as UTF-8
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	$string = str_replace( "\r\n", "\n", $string );

	$converted = iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted === false ) {
		# iconv() rejects the whole string when it meets a byte that is not
		# valid in the source encoding. Passing that false on would import
		# the text as an empty string, so complain loudly and fall back to
		# ISO-8859-1, where every byte value is defined.
		trigger_error( "recodeText: iconv() could not convert text from " .
			"$wgImportEncoding; treating it as ISO-8859-1", E_USER_WARNING );
		$converted = iconv( "ISO-8859-1", "UTF-8", $string );
	}

	return wfMungeToUtf8( $converted ); # Any old &#1234; stuff
}
261 | 268 | |
/**
 * Encode a single Unicode code point as a UTF-8 byte sequence.
 *
 * Values that cannot be represented in UTF-8 (the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF) are not encoded; they are
 * returned as a decimal numeric character reference instead.
 *
 * @param int $codepoint Code point number
 * @return string One to four UTF-8 bytes, or "&#N;" when not encodable
 */
function wfUtf8Sequence($codepoint) {
	# One byte: plain ASCII
	if($codepoint < 0x80) return chr($codepoint);

	# Two bytes: 110xxxxx 10xxxxxx
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x1f | 0xc0) .
	                              chr($codepoint      & 0x3f | 0x80);

	# Surrogates are reserved for UTF-16 and have no valid UTF-8 form
	if($codepoint >= 0xd800 && $codepoint <= 0xdfff) return "&#$codepoint;";

	# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
	                                chr($codepoint >>  6 & 0x3f | 0x80) .
	                                chr($codepoint       & 0x3f | 0x80);

	# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, up to U+10FFFF
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
	                                 chr($codepoint >> 12 & 0x3f | 0x80) .
	                                 chr($codepoint >>  6 & 0x3f | 0x80) .
	                                 chr($codepoint       & 0x3f | 0x80);

	# Beyond the Unicode range: leave it as a character reference
	return "&#$codepoint;";
}
262 | 283 | |
/**
 * Expand numeric character references in $string to raw UTF-8.
 *
 * Both decimal (&#228;) and hexadecimal (&#xe4; / &#XE4;) references are
 * handled; each one is replaced by the result of wfUtf8Sequence().
 * Named entities (&auml; etc.) are left alone.
 *
 * The digits are parsed as plain numbers rather than being evaluated as
 * PHP source, so leading zeros (&#0228;) are not misread as octal and no
 * code is ever built from the input text.
 *
 * @param string $string Text possibly containing numeric references
 * @return string Text with those references replaced
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/**
 * preg_replace_callback() helper for wfMungeToUtf8(): decimal references.
 *
 * @param array $matches [0] is the whole reference, [1] the decimal digits
 * @return string Replacement text
 */
function wfMungeDecimalEntity($matches) {
	$digits = ltrim( $matches[1], '0' );
	# More than 7 significant digits is far beyond U+10FFFF (1114111) and
	# could overflow an integer; keep the reference exactly as written.
	if( strlen( $digits ) > 7 ) return $matches[0];
	return wfUtf8Sequence( intval( $digits, 10 ) );
}

/**
 * preg_replace_callback() helper for wfMungeToUtf8(): hexadecimal references.
 *
 * @param array $matches [0] is the whole reference, [1] the hex digits
 * @return string Replacement text
 */
function wfMungeHexEntity($matches) {
	$digits = ltrim( $matches[1], '0' );
	# More than 6 significant hex digits cannot be a Unicode code point;
	# keep the reference exactly as written.
	if( strlen( $digits ) > 6 ) return $matches[0];
	return wfUtf8Sequence( $digits === '' ? 0 : hexdec( $digits ) );
}
| 290 | + |
/**
 * Escape a string for use inside a quoted MySQL string literal.
 *
 * Uses mysql_escape_string() when the PHP build provides it. That function
 * is absent from newer PHP versions, so otherwise the same byte-wise
 * escaping is applied by hand: NUL, newline, carriage return, backslash,
 * single quote, double quote and Ctrl-Z are each prefixed/replaced with
 * their backslash escape.
 *
 * @param string $string Raw value
 * @return string Escaped value (without surrounding quotes)
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}

	static $escapes = array(
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"\\"   => "\\\\",
		"'"    => "\\'",
		"\""   => "\\\"",
		"\x1a" => "\\Z",
	);
	# strtr() with an array makes a single pass, so the backslashes it
	# inserts are never escaped a second time.
	return strtr( $string, $escapes );
}