Index: trunk/extensions/Translate/scripts/groupStatistics.php |
— | — | @@ -9,7 +9,80 @@ |
10 | 10 | * @file |
11 | 11 | */ |
12 | 12 | |
13 | | -$optionsWithArgs = array( 'groups', 'output', 'skiplanguages', ); |
| 13 | +$mostSpokenLanguages = array( |
| 14 | + // 'language code' => array( position, ethnologue, encarta, average ), // Remark |
| 15 | + // Source: http://en.wikipedia.org/w/index.php?title=List_of_languages_by_number_of_native_speakers&oldid=317526109 |
| 16 | + 'zh-hans' => array( 1, 845000, 844700, 844850 ), |
| 17 | + 'zh-hant' => array( 1, 845000, 844700, 844850 ), |
| 18 | + 'es' => array( 2, 329000, 322000, 325500 ), |
| 19 | + 'en' => array( 3, 328000, 341000, 334500 ), |
| 20 | + 'hi' => array( 4, 182000, 366000, 274000 ), // Classified together with Urdu |
| 21 | + 'ur' => array( 4, 60600, 60290, 60445 ), // Classified together with Hindi |
| 22 | + 'ar' => array( 5, 221000, 422039, 321519 ), |
| 23 | + 'bn' => array( 6, 181000, 207000, 194000 ), |
| 24 | + 'pt' => array( 7, 178000, 176000, 177000 ), |
| 25 | + 'pt-br' => array( 7, 178000, 176000, 177000 ), |
| 26 | + 'ru' => array( 8, 144000, 167000, 155500 ), |
| 27 | + 'ja' => array( 9, 122000, 125000, 123500 ), |
| 28 | + 'de' => array( 10, 90300, 100130, 95215 ), |
| 29 | + 'jv' => array( 11, 84600, 75567, 80083 ), |
| 30 | + 'wuu' => array( 12, 77200, 77200, 77200 ), // No encarta data |
| 31 | + 'ko' => array( 13, 75000, 78000, 76500 ), |
| 32 | + 'pnb' => array( 14, 78300, 72188, 75244 ), // Most spoken variant |
| 33 | + 'fr' => array( 15, 67800, 78000, 72900 ), |
| 34 | + 'te' => array( 16, 69800, 69666, 69733 ), |
| 35 | + 'vi' => array( 17, 68600, 68000, 68300 ), |
| 36 | + 'mr' => array( 18, 68100, 68022, 68061 ), |
| 37 | + 'ta' => array( 19, 65700, 66000, 65850 ), |
| 38 | + 'it' => array( 20, 61700, 62000, 61850 ), |
| 39 | + 'tr' => array( 21, 59000, 61000, 60000 ), |
| 40 | + 'fa' => array( 22, 72000, 31300, 51650 ), |
| 41 | + 'yue' => array( 23, 55500, 55000, 55250 ), // No encarta data |
| 42 | + 'tl' => array( 24, 48900, 17000, 32950 ), |
| 43 | + 'gu' => array( 25, 46500, 46100, 46300 ), |
| 44 | + 'nan' => array( 26, 46200, 46200, 46200 ), // No encarta data, most spoken variant |
| 45 | + 'pl' => array( 27, 40000, 44000, 42000 ), |
| 46 | + 'uk' => array( 28, 39400, 47000, 43200 ), |
| 47 | + 'hsn' => array( 29, 36000, 36000, 36000 ), // No encarta data |
| 48 | + 'ml' => array( 30, 35706, 35706, 35706 ), |
| 49 | + 'kn' => array( 31, 35400, 35400, 35400 ), |
| 50 | + 'mai' => array( 32, 45000, 24191, 34595 ), |
| 51 | + 'bh' => array( 33, 38500, 26254, 32377 ), |
| 52 | + 'my' => array( 34, 32300, 32300, 32300 ), |
| 53 | + 'or' => array( 35, 31700, 32300, 32000 ), |
| 54 | + 'ms' => array( 36, 39100, 23600, 31350 ), |
| 55 | + 'su' => array( 37, 34000, 27000, 30500 ), |
| 56 | + 'hak' => array( 38, 30000, 30000, 30000 ), // No encarta data |
| 57 | + 'ro' => array( 39, 23400, 26265, 24832 ), |
| 58 | + 'az' => array( 40, 19100, 31400, 25250 ), |
| 59 | + 'ha' => array( 41, 24200, 24200, 24200 ), |
| 60 | + 'ps' => array( 42, 19000, 26811, 22905 ), |
| 61 | + 'gan-hans' => array( 43, 21000, 21000, 21000 ), |
| 62 | + 'gan-hant' => array( 43, 21000, 21000, 21000 ), |
| 63 | + 'id' => array( 44, 23200, 17100, 20150 ), |
| 64 | + 'th' => array( 45, 20050, 46100, 33075 ), |
| 65 | + 'nl' => array( 46, 21700, 20000, 20850 ), |
| 66 | + 'yo' => array( 47, 20000, 20000, 20000 ), |
| 67 | + 'sd' => array( 48, 19720, 19720, 19720 ), |
| 68 | + 'uz' => array( 49, 18466, 20100, 19283 ), |
| 69 | + 'sh' => array( 50, 16400, 21100, 18750 ), |
| 70 | +); |
| 71 | + |
| 72 | +$localisedWeights = array( |
| 73 | + 'wikimedia' => array( |
| 74 | + 'core-mostused' => 40, |
| 75 | + 'core' => 30, |
| 76 | + 'ext-0-wikimedia' => 30 |
| 77 | + ), |
| 78 | + 'mediawiki' => array( |
| 79 | + 'core-mostused' => 30, |
| 80 | + 'core' => 30, |
| 81 | + 'ext-0-wikimedia' => 20, |
| 82 | + 'ext-0-all' => 20 |
| 83 | + ) |
| 84 | +); |
| 85 | + |
| 86 | +$optionsWithArgs = array( 'groups', 'output', 'skiplanguages', 'most' ); |
14 | 87 | require( dirname( __FILE__ ) . '/cli.inc' ); |
15 | 88 | |
16 | 89 | class TranslateStatsOutput extends WikiStatsOutput { |
— | — | @@ -27,7 +100,7 @@ |
28 | 101 | $msg = <<<END |
29 | 102 | --help : this help message |
30 | 103 | --groups LIST: comma separated list of groups |
31 | | - --skiplanguages LIST: comma separated list of languages that should be skipped |
| 104 | + --skiplanguages LIST: comma separated list of skipped languages |
32 | 105 | --skipzero : skip languages that do not have any localisation at all |
33 | 106 | --fuzzy : add column for fuzzy counts |
34 | 107 | --output TYPE: select an another output engine |
— | — | @@ -35,6 +108,21 @@ |
36 | 109 | * 'wiki' : MediaWiki syntax. |
37 | 110 | * 'metawiki' : MediaWiki syntax used for Meta-Wiki. |
38 | 111 | * 'text' : Text with tabs. |
| 112 | + --most : [SCOPE]: report on the 50 most spoken languages. Skipzero is |
| 113 | + ignored. If a valid scope is defined, the group list is |
| 114 | + has been chosen, the localisation levels are weighted |
| 115 | + and reported. |
| 116 | + * mediawiki: |
| 117 | + core-mostused (30%) |
| 118 | + core (30%) |
| 119 | + ext-0-wikimedia (20%) |
| 120 | + ext-0-all (20%) |
| 121 | + * wikimedia: |
| 122 | + core-mostused (40%) |
| 123 | + core (30%) |
| 124 | + ext-0-wikimedia (30%) |
| 125 | + --speakers : add column for number of speakers (est.). Only valid when |
| 126 | + combined with --most. |
39 | 127 | |
40 | 128 | END; |
41 | 129 | STDERR( $msg ); |
— | — | @@ -87,15 +175,23 @@ |
88 | 176 | |
89 | 177 | // List of all languages. |
90 | 178 | $languages = Language::getLanguageNames( false ); |
91 | | -// Default sorting order by language code, users can sort wiki output by any |
92 | | -// column, if it is supported. |
| 179 | +// Default sort order is by language code; users can re-sort the wiki output. |
93 | 180 | ksort( $languages ); |
94 | 181 | |
95 | 182 | // Output headers |
96 | 183 | $out->heading(); |
97 | 184 | $out->blockstart(); |
| 185 | + |
| 186 | +// Add header column for the position in the most spoken languages ranking |
| 187 | +if( isset( $options['most'] ) ) { |
| 188 | + $out->element( 'Pos.', true ); |
| 189 | +} |
98 | 190 | $out->element( 'Code', true ); |
99 | 191 | $out->element( 'Language', true ); |
| 192 | + |
| 193 | +if( isset( $options['most'] ) && isset( $options['speakers'] ) ) { |
| 194 | + $out->element( 'Speakers', true ); |
| 195 | +} |
100 | 196 | foreach ( $groups as $g ) { |
101 | 197 | // Add unprocessed description of group as heading |
102 | 198 | $out->element( $g->getLabel(), true ); |
— | — | @@ -121,13 +217,19 @@ |
122 | 218 | // Perform the statistic calculations on every language |
123 | 219 | foreach ( $languages as $code => $name ) { |
124 | 220 | // Skip list |
125 | | - if ( in_array( $code, $skipLanguages ) ) continue; |
| 221 | + if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) { |
| 222 | + continue; |
| 223 | + } |
126 | 224 | |
| 225 | + // If --most is set, skip every language that is not in $mostSpokenLanguages |
| 226 | + if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) { |
| 227 | + continue; |
| 228 | + } |
| 229 | + |
127 | 230 | $incache = $cache->get( $groupName, $code ); |
128 | 231 | if ( $incache !== false ) { |
129 | 232 | list( $fuzzy, $translated, $total ) = $incache; |
130 | 233 | } else { |
131 | | - |
132 | 234 | $collection->resetForNewLanguage( $code ); |
133 | 235 | $collection->filter( 'ignored' ); |
134 | 236 | $collection->filter( 'optional' ); |
— | — | @@ -153,15 +255,22 @@ |
154 | 256 | |
155 | 257 | } |
156 | 258 | |
157 | | - $cache->commit(); // Don't keep open too long... to avoid concurrent access |
| 259 | + $cache->commit(); // Do not keep open too long to avoid concurrent access |
158 | 260 | |
159 | 261 | unset($collection); |
160 | 262 | } |
161 | 263 | |
162 | 264 | foreach ( $languages as $code => $name ) { |
163 | 265 | // Skip list |
164 | | - if ( in_array( $code, $skipLanguages ) ) continue; |
| 266 | + if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) { |
| 267 | + continue; |
| 268 | + } |
165 | 269 | |
| 270 | + // If --most is set, skip every language that is not in $mostSpokenLanguages |
| 271 | + if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) { |
| 272 | + continue; |
| 273 | + } |
| 274 | + |
166 | 275 | $columns = $rows[$code]; |
167 | 276 | |
168 | 277 | $allZero = true; |
— | — | @@ -174,8 +283,14 @@ |
175 | 284 | |
176 | 285 | // Output the row |
177 | 286 | $out->blockstart(); |
| 287 | + if( isset( $options['most'] ) ) { |
| 288 | + $out->element( $mostSpokenLanguages[$code][0] ); |
| 289 | + } |
178 | 290 | $out->element( $code ); |
179 | 291 | $out->element( $name ); |
| 292 | + if( isset( $options['most'] ) && isset( $options['speakers'] ) ) { |
| 293 | + $out->element( number_format( $mostSpokenLanguages[$code][3] ) ); |
| 294 | + } |
180 | 295 | foreach ( $columns as $fields ) { |
181 | 296 | list( $invert, $upper, $total ) = $fields; |
182 | 297 | $c = $out->formatPercent( $upper, $total, $invert, /* Decimals */ 2 ); |