Index: trunk/extensions/AntiSpoof/generateEquivset.php |
— | — | @@ -1,27 +1,28 @@ |
2 | 2 | <?php |
3 | 3 | |
4 | | -require_once ( getenv( 'MW_INSTALL_PATH' ) !== false |
5 | | - ? getenv( 'MW_INSTALL_PATH' ) . "/maintenance/commandLine.inc" |
6 | | - : dirname( __FILE__ ) . '/../../maintenance/commandLine.inc' ); |
| 4 | +$IP = getenv( 'MW_INSTALL_PATH' ); |
| 5 | +if ( $IP === false ) { |
| 6 | + $IP = dirname( __FILE__ ) . '/../..'; |
| 7 | +} |
| 8 | +require( "$IP/maintenance/Maintenance.php" ); |
7 | 9 | |
8 | | -$dir = dirname( __FILE__ ); |
| 10 | +class GenerateEquivset extends Maintenance { |
| 11 | + public function execute() { |
| 12 | + $dir = dirname( __FILE__ ); |
9 | 13 | |
10 | | -$endl = ' |
11 | | -'; |
| 14 | + $endl = "\n"; |
12 | 15 | |
13 | | -$lines = file( "$dir/equivset.in" ); |
14 | | -if ( !$lines ) { |
15 | | - print "Unable to open equivset.in\n"; |
16 | | - exit( 1 ); |
17 | | -} |
| 16 | + $lines = file( "$dir/equivset.in" ); |
| 17 | + if ( !$lines ) { |
| 18 | + $this->error( "Unable to open equivset.in\n", 1 ); |
| 19 | + } |
18 | 20 | |
19 | | -$setsFile = fopen( "$dir/equivset.txt", 'w' ); |
20 | | -if ( !$setsFile ) { |
21 | | - print "Unable to open equivset.txt for writing\n"; |
22 | | - exit( 1 ); |
23 | | -} |
| 21 | + $setsFile = fopen( "$dir/equivset.txt", 'w' ); |
| 22 | + if ( !$setsFile ) { |
| 23 | + $this->error( "Unable to open equivset.txt for writing\n", 1 ); |
| 24 | + } |
24 | 25 | |
25 | | -fwrite( $setsFile, <<<EOT |
| 26 | + fwrite( $setsFile, <<<EOT |
26 | 27 | # This file is generated by generateEquivset.php |
27 | 28 | # It shows sets of equivalent characters, one set per line, with characters |
28 | 29 | # separated by whitespace. This file is not used by MediaWiki, rather it is |
— | — | @@ -29,14 +30,14 @@ |
30 | 31 | # review purposes. |
31 | 32 | |
32 | 33 | EOT |
33 | | -); |
| 34 | + ); |
34 | 35 | |
35 | | -$outputFile = fopen( "$dir/equivset.php", 'w' ); |
36 | | -if ( !$outputFile ) { |
37 | | - print "Unable to open equivset.php for writing\n"; |
38 | | - exit( 1 ); |
39 | | -} |
40 | | -fwrite( $outputFile, "<?" . "php$endl" . <<<EOT |
| 36 | + $outputFile = fopen( "$dir/equivset.php", 'w' ); |
| 37 | + if ( !$outputFile ) { |
| 38 | + $this->error( "Unable to open equivset.php for writing\n", 1 ); |
| 39 | + } |
| 40 | + |
| 41 | + fwrite( $outputFile, "<?" . "php$endl" . <<<EOT |
41 | 42 | # This file is generated by generateEquivset.php |
42 | 43 | # It contains a map of characters, encoded in UTF-8, such that running strtr() |
43 | 44 | # on a string with this map will cause confusable characters to be reduced to |
— | — | @@ -44,104 +45,109 @@ |
45 | 46 | # form, in equivset.ser. |
46 | 47 | |
47 | 48 | EOT |
48 | | -); |
| 49 | + ); |
49 | 50 | |
50 | | -$serializedFile = fopen( "$dir/equivset.ser", 'w' ); |
51 | | -if ( !$serializedFile ) { |
52 | | - print "Unable to open equivset.ser for writing\n"; |
53 | | - exit( 1 ); |
54 | | -} |
| 51 | + $serializedFile = fopen( "$dir/equivset.ser", 'w' ); |
| 52 | + if ( !$serializedFile ) { |
| 53 | + $this->error( "Unable to open equivset.ser for writing\n", 1 ); |
| 54 | + } |
55 | 55 | |
56 | | -# \s matches \xa0 in non-unicode mode, which is not what we want |
57 | | -# So we need to make our own whitespace class |
58 | | -$sp = '[\ \t]'; |
| 56 | + # \s matches \xa0 in non-unicode mode, which is not what we want |
| 57 | + # So we need to make our own whitespace class |
| 58 | + $sp = '[\ \t]'; |
59 | 59 | |
60 | | -$lineNum = 0; |
61 | | -$setsByChar = array(); |
62 | | -$sets = array(); |
63 | | -$exitStatus = 0; |
64 | | -foreach ( $lines as $line ) { |
65 | | - ++$lineNum; |
66 | | - $line = trim( $line ); |
| 60 | + $lineNum = 0; |
| 61 | + $setsByChar = array(); |
| 62 | + $sets = array(); |
| 63 | + $exitStatus = 0; |
67 | 64 | |
68 | | - # Filter comments |
69 | | - if ( !$line || $line[0] == '#' ) { |
70 | | - continue; |
71 | | - } |
| 65 | + foreach ( $lines as $line ) { |
| 66 | + ++$lineNum; |
| 67 | + $line = trim( $line ); |
72 | 68 | |
73 | | - # Process line |
74 | | - if ( !preg_match( |
75 | | -"/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ (?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x", $line, $m ) ) |
76 | | - { |
77 | | - print "Error: invalid entry at line $lineNum: $line\n"; |
78 | | - $exitStatus = 1; |
79 | | - continue; |
80 | | - } |
81 | | - $error = false; |
82 | | - if ( codepointToUtf8( hexdec( $m['hexleft'] ) ) != $m['charleft'] ) { |
83 | | - $actual = utf8ToCodepoint( $m['charleft'] ); |
84 | | - if ( $actual === false ) { |
85 | | - print "Bytes: " . strlen( $m['charleft'] ) . "\n"; |
86 | | - print bin2hex( $line ) . "\n"; |
87 | | - $hexForm = bin2hex( $m['charleft'] ); |
88 | | - print "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n"; |
89 | | - } else { |
90 | | - print "Error: left number ({$m['hexleft']}) does not match left character ($actual) " . |
91 | | - "at line $lineNum: $line\n"; |
| 69 | + # Filter comments |
| 70 | + if ( !$line || $line[0] == '#' ) { |
| 71 | + continue; |
| 72 | + } |
| 73 | + |
| 74 | + # Process line |
| 75 | + if ( !preg_match( |
| 76 | + "/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ (?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x", $line, $m ) ) |
| 77 | + { |
| 78 | + $this->output( "Error: invalid entry at line $lineNum: $line\n" ); |
| 79 | + $exitStatus = 1; |
| 80 | + continue; |
| 81 | + } |
| 82 | + $error = false; |
| 83 | + if ( codepointToUtf8( hexdec( $m['hexleft'] ) ) != $m['charleft'] ) { |
| 84 | + $actual = utf8ToCodepoint( $m['charleft'] ); |
| 85 | + if ( $actual === false ) { |
| 86 | + $this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" ); |
| 87 | + $this->output( bin2hex( $line ) . "\n" ); |
| 88 | + $hexForm = bin2hex( $m['charleft'] ); |
| 89 | + $this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n" ); |
| 90 | + } else { |
| 91 | + $this->output( "Error: left number ({$m['hexleft']}) does not match left character ($actual) " . |
| 92 | + "at line $lineNum: $line\n" ); |
| 93 | + } |
| 94 | + $error = true; |
| 95 | + } |
| 96 | + if ( !empty( $m['hexright'] ) && codepointToUtf8( hexdec( $m['hexright'] ) ) != $m['charright'] ) { |
| 97 | + $actual = utf8ToCodepoint( $m['charright'] ); |
| 98 | + if ( $actual === false ) { |
| 99 | + $hexForm = bin2hex( $m['charright'] ); |
| 100 | + $this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n" ); |
| 101 | + } else { |
| 102 | + $this->output( "Error: right number ({$m['hexright']}) does not match right character ($actual) " . |
| 103 | + "at line $lineNum: $line\n" ); |
| 104 | + } |
| 105 | + $error = true; |
| 106 | + } |
| 107 | + if ( $error ) { |
| 108 | + $exitStatus = 1; |
| 109 | + continue; |
| 110 | + } |
| 111 | + |
| 112 | + # Find the set for the right character, add a new one if necessary |
| 113 | + if ( isset( $setsByChar[$m['charright']] ) ) { |
| 114 | + $setName = $setsByChar[$m['charright']]; |
| 115 | + } else { |
| 116 | + # New set |
| 117 | + $setName = $m['charright']; |
| 118 | + $sets[$setName] = array( $m['charright'] ); |
| 119 | + $setsByChar[$setName] = $setName; |
| 120 | + } |
| 121 | + |
| 122 | + # Add the left character to the set |
| 123 | + $sets[$setName][] = $m['charleft']; |
| 124 | + $setsByChar[$m['charleft']] = $setName; |
92 | 125 | } |
93 | | - $error = true; |
94 | | - } |
95 | | - if ( !empty( $m['hexright'] ) && codepointToUtf8( hexdec( $m['hexright'] ) ) != $m['charright'] ) { |
96 | | - $actual = utf8ToCodepoint( $m['charright'] ); |
97 | | - if ( $actual === false ) { |
98 | | - $hexForm = bin2hex( $m['charright'] ); |
99 | | - print "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n"; |
100 | | - } else { |
101 | | - print "Error: right number ({$m['hexright']}) does not match right character ($actual) " . |
102 | | - "at line $lineNum: $line\n"; |
| 126 | + |
| 127 | + # Sets output |
| 128 | + foreach ( $sets as $members ) { |
| 129 | + fwrite( $setsFile, implode( ' ', $members ) . $endl ); |
103 | 130 | } |
104 | | - $error = true; |
105 | | - } |
106 | | - if ( $error ) { |
107 | | - $exitStatus = 1; |
108 | | - continue; |
109 | | - } |
110 | 131 | |
111 | | - # Find the set for the right character, add a new one if necessary |
112 | | - if ( isset( $setsByChar[$m['charright']] ) ) { |
113 | | - $setName = $setsByChar[$m['charright']]; |
114 | | - } else { |
115 | | - # New set |
116 | | - $setName = $m['charright']; |
117 | | - $sets[$setName] = array( $m['charright'] ); |
118 | | - $setsByChar[$setName] = $setName; |
119 | | - } |
| 132 | + # Map output |
| 133 | + $output = var_export( $setsByChar, true ); |
| 134 | + $output = str_replace( "\n", $endl, $output ); |
| 135 | + fwrite( $outputFile, '$equivset = ' . "$output;$endl" ); |
120 | 136 | |
121 | | - # Add the left character to the set |
122 | | - $sets[$setName][] = $m['charleft']; |
123 | | - $setsByChar[$m['charleft']] = $setName; |
124 | | -} |
| 137 | + # Serialized codepoint map |
| 138 | + $codepointMap = array(); |
| 139 | + foreach ( $setsByChar as $char => $setName ) { |
| 140 | + $codepointMap[ utf8ToCodepoint( $char ) ] = utf8ToCodepoint( $setName ); |
| 141 | + } |
| 142 | + fwrite( $serializedFile, serialize( $codepointMap ) ); |
125 | 143 | |
126 | | -# Sets output |
127 | | -foreach ( $sets as $setName => $members ) { |
128 | | - fwrite( $setsFile, implode( ' ', $members ) . $endl ); |
129 | | -} |
| 144 | + fclose( $setsFile ); |
| 145 | + fclose( $outputFile ); |
| 146 | + fclose( $serializedFile ); |
130 | 147 | |
131 | | -# Map output |
132 | | -$output = var_export( $setsByChar, true ); |
133 | | -$output = str_replace( "\n", $endl, $output ); |
134 | | -fwrite( $outputFile, '$equivset = ' . "$output;$endl" ); |
135 | | - |
136 | | -# Serialized codepoint map |
137 | | -$codepointMap = array(); |
138 | | -foreach ( $setsByChar as $char => $setName ) { |
139 | | - $codepointMap[ utf8ToCodepoint( $char ) ] = utf8ToCodepoint( $setName ); |
| 148 | + $this->error( '', $exitStatus ); |
| 149 | + } |
140 | 150 | } |
141 | | -fwrite( $serializedFile, serialize( $codepointMap ) ); |
142 | 151 | |
143 | | -fclose( $setsFile ); |
144 | | -fclose( $outputFile ); |
145 | | -fclose( $serializedFile ); |
| 152 | +$maintClass = "GenerateEquivset"; |
| 153 | +require_once( DO_MAINTENANCE ); |
146 | 154 | |
147 | | -exit( $exitStatus ); |
148 | | - |