r57311 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r57310‎ | r57311 | r57312 >
Date:23:33, 2 October 2009
Author:siebrand
Status:deferred
Tags:
Comment:
Work in progress.
* added a ranked list of 50 most spoken languages with speakers as reported by Ethnologue and Encarta (data from Wikipedia), and average number of speakers (in thousands)
* added an array with named keys containing arrays with keys of message groups and values that represent the weight of the message groups
* updated command line usage documentation

TODO:
* implement localisation scores
Modified paths:
  • /trunk/extensions/Translate/scripts/groupStatistics.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Translate/scripts/groupStatistics.php
@@ -9,7 +9,80 @@
1010 * @file
1111 */
1212
13 -$optionsWithArgs = array( 'groups', 'output', 'skiplanguages', );
 13+$mostSpokenLanguages = array(
 14+ // 'language code' => array( position, ethnologue, encarta, average ), // Remark
 15+ // Source: http://en.wikipedia.org/w/index.php?title=List_of_languages_by_number_of_native_speakers&oldid=317526109
 16+ 'zh-hans' => array( 1, 845000, 844700, 844850 ),
 17+ 'zh-hant' => array( 1, 845000, 844700, 844850 ),
 18+ 'es' => array( 2, 329000, 322000, 325500 ),
 19+ 'en' => array( 3, 328000, 341000, 334500 ),
 20+ 'hi' => array( 4, 182000, 366000, 274000 ), // Classified together with Urdu
 21+ 'ur' => array( 4, 60600, 60290, 60445 ), // Classified together with Hindi
 22+ 'ar' => array( 5, 221000, 422039, 321519 ),
 23+ 'bn' => array( 6, 181000, 207000, 194000 ),
 24+ 'pt' => array( 7, 178000, 176000, 177000 ),
 25+ 'pt-br' => array( 7, 178000, 176000, 177000 ),
 26+ 'ru' => array( 8, 144000, 167000, 155500 ),
 27+ 'ja' => array( 9, 122000, 125000, 123500 ),
 28+ 'de' => array( 10, 90300, 100130, 95215 ),
 29+ 'jv' => array( 11, 84600, 75567, 80083 ),
 30+ 'wuu' => array( 12, 77200, 77200, 77200 ), // No encarta data
 31+ 'ko' => array( 13, 75000, 78000, 76500 ),
 32+ 'pnb' => array( 14, 78300, 72188, 75244 ), // Most spoken variant
 33+ 'fr' => array( 15, 67800, 78000, 72900 ),
 34+ 'te' => array( 16, 69800, 69666, 69733 ),
 35+ 'vi' => array( 17, 68600, 68000, 68300 ),
 36+ 'mr' => array( 18, 68100, 68022, 68061 ),
 37+ 'ta' => array( 19, 65700, 66000, 65850 ),
 38+ 'it' => array( 20, 61700, 62000, 61850 ),
 39+ 'tr' => array( 21, 59000, 61000, 60000 ),
 40+ 'fa' => array( 22, 72000, 31300, 51650 ),
 41+ 'yue' => array( 23, 55500, 55000, 55250 ), // No encarta data
 42+ 'tl' => array( 24, 48900, 17000, 32950 ),
 43+ 'gu' => array( 25, 46500, 46100, 46300 ),
 44+ 'nan' => array( 26, 46200, 46200, 46200 ), // No encarta data, most spoken variant
 45+ 'pl' => array( 27, 40000, 44000, 42000 ),
 46+ 'uk' => array( 28, 39400, 47000, 43200 ),
 47+ 'hsn' => array( 29, 36000, 36000, 36000 ), // No encarta data
 48+ 'ml' => array( 30, 35706, 35706, 35706 ),
 49+ 'kn' => array( 31, 35400, 35400, 35400 ),
 50+ 'mai' => array( 32, 45000, 24191, 34595 ),
 51+ 'bh' => array( 33, 38500, 26254, 32377 ),
 52+ 'my' => array( 34, 32300, 32300, 32300 ),
 53+ 'or' => array( 35, 31700, 32300, 32000 ),
 54+ 'ms' => array( 36, 39100, 23600, 31350 ),
 55+ 'su' => array( 37, 34000, 27000, 30500 ),
 56+ 'hak' => array( 38, 30000, 30000, 30000 ), // No encarta data
 57+ 'ro' => array( 39, 23400, 26265, 24832 ),
 58+ 'az' => array( 40, 19100, 31400, 25250 ),
 59+ 'ha' => array( 41, 24200, 24200, 24200 ),
 60+ 'ps' => array( 42, 19000, 26811, 22905 ),
 61+ 'gan-hans' => array( 43, 21000, 21000, 21000 ),
 62+ 'gan-hant' => array( 43, 21000, 21000, 21000 ),
 63+ 'id' => array( 44, 23200, 17100, 20150 ),
 64+ 'th' => array( 45, 20050, 46100, 33075 ),
 65+ 'nl' => array( 46, 21700, 20000, 20850 ),
 66+ 'yo' => array( 47, 20000, 20000, 20000 ),
 67+ 'sd' => array( 48, 19720, 19720, 19720 ),
 68+ 'uz' => array( 49, 18466, 20100, 19283 ),
 69+ 'sh' => array( 50, 16400, 21100, 18750 ),
 70+);
 71+
 72+$localisedWeights = array(
 73+ 'wikimedia' => array(
 74+ 'core-mostused' => 40,
 75+ 'core' => 30,
 76+ 'ext-0-wikimedia' => 30
 77+ ),
 78+ 'mediawiki' => array(
 79+ 'core-mostused' => 30,
 80+ 'core' => 30,
 81+ 'ext-0-wikimedia' => 20,
 82+ 'ext-0-all' => 20
 83+ )
 84+);
 85+
 86+$optionsWithArgs = array( 'groups', 'output', 'skiplanguages', 'most' );
1487 require( dirname( __FILE__ ) . '/cli.inc' );
1588
1689 class TranslateStatsOutput extends WikiStatsOutput {
@@ -27,7 +100,7 @@
28101 $msg = <<<END
29102 --help : this help message
30103 --groups LIST: comma separated list of groups
31 - --skiplanguages LIST: comma separated list of languages that should be skipped
 104+ --skiplanguages LIST: comma separated list of skipped languages
32105 --skipzero : skip languages that do not have any localisation at all
33106 --fuzzy : add column for fuzzy counts
34107 --output TYPE: select an another output engine
@@ -35,6 +108,21 @@
36109 * 'wiki' : MediaWiki syntax.
37110 * 'metawiki' : MediaWiki syntax used for Meta-Wiki.
38111 * 'text' : Text with tabs.
 112+ --most : [SCOPE]: report on the 50 most spoken languages. Skipzero is
 113+ ignored. If a valid scope is defined, the group list is
 114+ has been chosen, the localisation levels are weighted
 115+ and reported.
 116+ * mediawiki:
 117+ core-mostused (30%)
 118+ core (30%)
 119+ ext-0-wikimedia (20%)
 120+ ext-0-all (20%)
 121+ * wikimedia:
 122+ core-mostused (40%)
 123+ core (30%)
 124+ ext-0-wikimedia (30%)
 125+ --speakers : add column for number of speakers (est.). Only valid when
 126+ combined with --most.
39127
40128 END;
41129 STDERR( $msg );
@@ -87,15 +175,23 @@
88176
89177 // List of all languages.
90178 $languages = Language::getLanguageNames( false );
91 -// Default sorting order by language code, users can sort wiki output by any
92 -// column, if it is supported.
 179+// Default sorting order by language code, users can sort wiki output.
93180 ksort( $languages );
94181
95182 // Output headers
96183 $out->heading();
97184 $out->blockstart();
 185+
 186+// Add header column for language size
 187+if( isset( $options['most'] ) ) {
 188+ $out->element( 'Pos.', true );
 189+}
98190 $out->element( 'Code', true );
99191 $out->element( 'Language', true );
 192+
 193+if( isset( $options['most'] ) && isset( $options['speakers'] ) ) {
 194+ $out->element( 'Speakers', true );
 195+}
100196 foreach ( $groups as $g ) {
101197 // Add unprocessed description of group as heading
102198 $out->element( $g->getLabel(), true );
@@ -121,13 +217,19 @@
122218 // Perform the statistic calculations on every language
123219 foreach ( $languages as $code => $name ) {
124220 // Skip list
125 - if ( in_array( $code, $skipLanguages ) ) continue;
 221+ if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) {
 222+ continue;
 223+ }
126224
 225+ // If --most is set, skip all other
 226+ if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) {
 227+ continue;
 228+ }
 229+
127230 $incache = $cache->get( $groupName, $code );
128231 if ( $incache !== false ) {
129232 list( $fuzzy, $translated, $total ) = $incache;
130233 } else {
131 -
132234 $collection->resetForNewLanguage( $code );
133235 $collection->filter( 'ignored' );
134236 $collection->filter( 'optional' );
@@ -153,15 +255,22 @@
154256
155257 }
156258
157 - $cache->commit(); // Don't keep open too long... to avoid concurrent access
 259+ $cache->commit(); // Do not keep open too long to avoid concurrent access
158260
159261 unset($collection);
160262 }
161263
162264 foreach ( $languages as $code => $name ) {
163265 // Skip list
164 - if ( in_array( $code, $skipLanguages ) ) continue;
 266+ if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) {
 267+ continue;
 268+ }
165269
 270+ // If --most is set, skip all other
 271+ if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) {
 272+ continue;
 273+ }
 274+
166275 $columns = $rows[$code];
167276
168277 $allZero = true;
@@ -174,8 +283,14 @@
175284
176285 // Output the row
177286 $out->blockstart();
 287+ if( isset( $options['most'] ) ) {
 288+ $out->element( $mostSpokenLanguages[$code][0] );
 289+ }
178290 $out->element( $code );
179291 $out->element( $name );
 292+ if( isset( $options['most'] ) && isset( $options['speakers'] ) ) {
 293+ $out->element( number_format( $mostSpokenLanguages[$code][3] ) );
 294+ }
180295 foreach ( $columns as $fields ) {
181296 list( $invert, $upper, $total ) = $fields;
182297 $c = $out->formatPercent( $upper, $total, $invert, /* Decimals */ 2 );

Status & tagging log