r69816 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r69815 | r69816 | r69817 >
Date:20:58, 23 July 2010
Author:simetrical
Status:ok (Comments)
Tags:
Comment:
Add non-identity collation, with migration script

It seemed to work correctly, with the newly-created page "bob" sorting
as "BOB", but then I nuked all my cl_sortkey by running the migration
script before refreshLinks.php had finished running, so I'll have to
wait a while to see if it works properly with a non-messed-up database.
It's possible there's something wrong with the display of section
letters in the categories, but otherwise I think this is working right.
Modified paths:
  • /trunk/phase3/includes/CategoryPage.php (modified) (history)
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/maintenance/updateCollation.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/updateCollation.php
@@ -0,0 +1,74 @@
 2+<?php
 3+/**
 4+ * @file
 5+ * @ingroup Maintenance
 6+ * @author Aryeh Gregor (Simetrical)
 7+ */
 8+
 9+#$optionsWithArgs = array( 'begin', 'max-slave-lag' );
 10+
 11+require_once( dirname( __FILE__ ) . '/Maintenance.php' );
 12+
 13+class UpdateCollation extends Maintenance {
 14+ const BATCH_SIZE = 1000;
 15+
 16+ public function __construct() {
 17+ parent::__construct();
 18+
 19+ global $wgCollationVersion;
 20+ $this->mDescription = <<<TEXT
 21+This script will find all rows in the categorylinks table whose collation is
 22+out-of-date (cl_collation < $wgCollationVersion) and repopulate cl_sortkey
 23+using cl_raw_sortkey. If everything's collation is up-to-date, it will do
 24+nothing.
 25+TEXT;
 26+
 27+ #$this->addOption( 'force', 'Run on all rows, even if the collation is supposed to be up-to-date.' );
 28+ }
 29+
 30+ public function execute() {
 31+ global $wgCollationVersion, $wgContLang;
 32+
 33+ $dbw = wfGetDB( DB_MASTER );
 34+ $count = $dbw->estimateRowCount(
 35+ 'categorylinks',
 36+ array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ),
 37+ 'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ),
 38+ __METHOD__
 39+ );
 40+
 41+ $this->output( "Fixing around $count rows (estimate might be wrong).\n" );
 42+
 43+ $count = 0;
 44+ do {
 45+ $res = $dbw->select(
 46+ 'categorylinks',
 47+ array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ),
 48+ 'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ),
 49+ __METHOD__,
 50+ array( 'LIMIT' => self::BATCH_SIZE )
 51+ );
 52+
 53+ $dbw->begin();
 54+ foreach ( $res as $row ) {
 55+ # TODO: Handle the case where cl_raw_sortkey is null.
 56+ $dbw->update(
 57+ 'categorylinks',
 58+ array(
 59+ 'cl_sortkey' => $wgContLang->convertToSortkey( $row->cl_raw_sortkey ),
 60+ 'cl_collation' => $wgCollationVersion
 61+ ),
 62+ array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
 63+ __METHOD__
 64+ );
 65+ }
 66+ $dbw->commit();
 67+
 68+ $count += self::BATCH_SIZE;
 69+ $this->output( "$count done.\n" );
 70+ } while ( $res->numRows() >= self::BATCH_SIZE );
 71+ }
 72+}
 73+
 74+$maintClass = "UpdateCollation";
 75+require_once( DO_MAINTENANCE );
Property changes on: trunk/phase3/maintenance/updateCollation.php
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/phase3/includes/CategoryPage.php
@@ -172,14 +172,20 @@
173173 * else use sortkey...
174174 */
175175 function getSubcategorySortChar( $title, $sortkey ) {
176 - global $wgContLang;
 176+ global $wgContLang, $wgExperimentalCategorySort;
177177
178178 if ( $title->getPrefixedText() == $sortkey ) {
179 - $firstChar = $wgContLang->firstChar( $title->getDBkey() );
 179+ $word = $title->getDBkey();
180180 } else {
181 - $firstChar = $wgContLang->firstChar( $sortkey );
 181+ $word = $sortkey;
182182 }
183183
 184+ if ( $wgExperimentalCategorySort ) {
 185+ $firstChar = $wgContLang->firstLetterForLists( $word );
 186+ } else {
 187+ $firstChar = $wgContLang->firstChar( $word );
 188+ }
 189+
184190 return $wgContLang->convert( $firstChar );
185191 }
186192
@@ -202,7 +208,7 @@
203209 * Add a miscellaneous page
204210 */
205211 function addPage( $title, $sortkey, $pageLength, $isRedirect = false ) {
206 - global $wgContLang;
 212+ global $wgContLang, $wgExperimentalCategorySort;
207213 $this->articles[] = $isRedirect
208214 ? '<span class="redirect-in-category">' .
209215 $this->getSkin()->link(
@@ -213,7 +219,12 @@
214220 array( 'known', 'noclasses' )
215221 ) . '</span>'
216222 : $this->getSkin()->makeSizeLinkObj( $pageLength, $title );
217 - $this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstChar( $sortkey ) );
 223+
 224+ if ( $wgExperimentalCategorySort ) {
 225+ $this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
 226+ } else {
 227+ $this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstChar( $sortkey ) );
 228+ }
218229 }
219230
220231 function finaliseCategoryState() {
@@ -259,7 +270,7 @@
260271 foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
261272 $res = $dbr->select(
262273 $tables,
263 - $fields,
 274+ array_merge( $fields, array( 'cl_raw_sortkey' ) ),
264275 $conds + array( 'cl_type' => $type ) + ( $type == 'page' ? array( $pageCondition ) : array() ),
265276 __METHOD__,
266277 $opts + ( $type == 'page' ? array( 'LIMIT' => $this->limit + 1 ) : array() ),
@@ -278,11 +289,11 @@
279290
280291 if ( $title->getNamespace() == NS_CATEGORY ) {
281292 $cat = Category::newFromRow( $row, $title );
282 - $this->addSubcategoryObject( $cat, $row->cl_sortkey, $row->page_len );
 293+ $this->addSubcategoryObject( $cat, $row->cl_raw_sortkey, $row->page_len );
283294 } elseif ( $this->showGallery && $title->getNamespace() == NS_FILE ) {
284 - $this->addImage( $title, $row->cl_sortkey, $row->page_len, $row->page_is_redirect );
 295+ $this->addImage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect );
285296 } else {
286 - $this->addPage( $title, $row->cl_sortkey, $row->page_len, $row->page_is_redirect );
 297+ $this->addPage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect );
287298 }
288299 }
289300 }
Index: trunk/phase3/includes/DefaultSettings.php
@@ -4474,7 +4474,7 @@
44754475 * for all rows where cl_collation < $wgCollationVersion and regenerates
44764476 * cl_sortkey based on cl_raw_sortkey.
44774477 */
4478 -$wgCollationVersion = 0;
 4478+$wgCollationVersion = 1;
44794479
44804480 /** @} */ # End categories }
44814481
Index: trunk/phase3/languages/Language.php
@@ -2945,8 +2945,8 @@
29462946 * @return string Binary sortkey
29472947 */
29482948 public function convertToSortkey( $string ) {
2949 - # Stub function for now
2950 - return $string;
 2949+ # Fake function for now
 2950+ return strtoupper( $string );
29512951 }
29522952
29532953 /**
@@ -2986,6 +2986,6 @@
29872987 * @return string UTF-8 string corresponding to the first letter of input
29882988 */
29892989 public function firstLetterForLists( $string ) {
2990 - return mb_substr( $string, 0, 1 );
 2990+ return strtoupper( mb_substr( $string, 0, 1 ) );
29912991 }
29922992 }

Follow-up revisions

Revision | Commit summary | Author | Date
r80436 | Change the default collation from strtoupper to Language::uc, so that non-asc... | bawolff | 06:27, 17 January 2011

Comments

#Comment by Bawolff (talk | contribs)   00:14, 6 January 2011

Why not use $wgContLang->uc instead of strtoupper for the default convertToSortkey, so that characters like ĉ get uppercased?

#Comment by Simetrical (talk | contribs)   20:41, 6 January 2011

It's just proof-of-concept code for testing. The idea is that someone would add real convertToSortkey() functions that actually compute sortkeys via CLDR or something.

#Comment by Simetrical (talk | contribs)   20:44, 6 January 2011

That said, go ahead and update the stub function to a slightly nicer stub if you like. You'll need to update $wgCategoryCollation to some different string for this to work (IIRC, this should update the categorylinks table next time you run update.php).

Status & tagging log