r63578 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r63577 | r63578 | r63579 >
Date:21:54, 10 March 2010
Author:mah
Status:resolved
Tags:
Comment:
Follow-up r61856
* Rename wordSegmentation() to segmentByWord().
* Consolidate search index locking and iteration to Maintenance.php
* Add maintenance/updateDoubleWidthSearch.php to take care of new
format for normalized double-width roman characters.
* Add error checking to updateSearchIndex.php for creating $posFile.
* Add note to UPGRADE about running updateDoubleWidthSearch.php.
Modified paths:
  • /trunk/phase3/UPGRADE (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageJa.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageYue.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageZh_hans.php (modified) (history)
  • /trunk/phase3/maintenance/Maintenance.php (modified) (history)
  • /trunk/phase3/maintenance/updateDoubleWidthSearch.php (added) (history)
  • /trunk/phase3/maintenance/updateSearchIndex.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/updateSearchIndex.php
@@ -63,9 +63,18 @@
6464 $lockTime = $this->getOption( 'l', 20 );
6565
6666 $this->doUpdateSearchIndex( $start, $end, $lockTime );
67 - $file = fopen( $posFile, 'w' );
68 - fwrite( $file, $end );
69 - fclose( $file );
 67+ if( is_writable( dirname( realpath( $posFile ) ) ) ) {
 68+ $file = fopen( $posFile, 'w' );
 69+ if( $file !== false ) {
 70+ fwrite( $file, $end );
 71+ fclose( $file );
 72+ } else {
 73+ echo posix_get_last_error();
 74+ $this->output( "*** Couldn't write to the $posFile!" );
 75+ }
 76+ } else {
 77+ $this->output( "*** Couldn't write to the $posFile!" );
 78+ }
7079 }
7180
7281 private function doUpdateSearchIndex( $start, $end, $maxLockTime ) {
@@ -89,84 +98,23 @@
9099 ";
91100 $res = $dbw->query( $sql, __METHOD__ );
92101
 102+ $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
93103
94 - # Lock searchindex
95 - if ( $maxLockTime ) {
96 - $this->output( " --- Waiting for lock ---" );
97 - $this->lockSearchindex( $dbw );
98 - $lockTime = time();
99 - $this->output( "\n" );
100 - }
 104+ $this->output( "Done\n" );
 105+ }
101106
102 - # Loop through the results and do a search update
103 - foreach ( $res as $row ) {
104 - # Allow reads to be processed
105 - if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
106 - $this->output( " --- Relocking ---" );
107 - $this->relockSearchindex( $dbw );
108 - $lockTime = time();
109 - $this->output( "\n" );
110 - }
111 - if ( $row->rc_type == RC_LOG ) {
112 - continue;
113 - } elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
114 - # Rename searchindex entry
115 - $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
116 - $title = $titleObj->getPrefixedDBkey();
117 - $this->output( "$title..." );
118 - $u = new SearchUpdate( $row->rc_cur_id, $title, false );
119 - $this->output( "\n" );
120 - } else {
121 - // Get current revision
122 - $rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id );
123 - if( $rev ) {
124 - $titleObj = $rev->getTitle();
125 - $title = $titleObj->getPrefixedDBkey();
126 - $this->output( $title );
127 - # Update searchindex
128 - $u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() );
129 - $u->doUpdate();
130 - $this->output( "\n" );
131 - }
132 - }
133 - }
134 -
135 - # Unlock searchindex
136 - if ( $maxLockTime ) {
137 - $this->output( " --- Unlocking --" );
138 - $this->unlockSearchindex( $dbw );
 107+ public function searchIndexUpdateCallback($dbw, $row) {
 108+ if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
 109+ # Rename searchindex entry
 110+ $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
 111+ $title = $titleObj->getPrefixedDBkey();
 112+ $this->output( "$title..." );
 113+ $u = new SearchUpdate( $row->rc_cur_id, $title, false );
139114 $this->output( "\n" );
 115+ } elseif ( $row->rc_type !== RC_LOG ) {
 116+ $this->updateSearchIndexForPage( $dbw, $row->rc_cur_id );
140117 }
141 - $this->output( "Done\n" );
142118 }
143 -
144 - /**
145 - * Lock the search index
146 - * @param &$db Database object
147 - */
148 - private function lockSearchindex( &$db ) {
149 - $write = array( 'searchindex' );
150 - $read = array( 'page', 'revision', 'text', 'interwiki' );
151 - $db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ );
152 - }
153 -
154 - /**
155 - * Unlock the tables
156 - * @param &$db Database object
157 - */
158 - private function unlockSearchindex( &$db ) {
159 - $db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ );
160 - }
161 -
162 - /**
163 - * Unlock and lock again
164 - * Since the lock is low-priority, queued reads will be able to complete
165 - * @param &$db Database object
166 - */
167 - private function relockSearchindex( &$db ) {
168 - $this->unlockSearchindex( $db );
169 - $this->lockSearchindex( $db );
170 - }
171119 }
172120
173121 $maintClass = "UpdateSearchIndex";
Index: trunk/phase3/maintenance/Maintenance.php
@@ -844,4 +844,91 @@
845845 }
846846 return self::$mCoreScripts;
847847 }
 848+
 849+ /**
 850+ * Lock the search index
 851+ * @param &$db Database object
 852+ */
 853+ private function lockSearchindex( &$db ) {
 854+ $write = array( 'searchindex' );
 855+ $read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' );
 856+ $db->lockTables( $read, $write, __CLASS__ . '::' . __METHOD__ );
 857+ }
 858+
 859+ /**
 860+ * Unlock the tables
 861+ * @param &$db Database object
 862+ */
 863+ private function unlockSearchindex( &$db ) {
 864+ $db->unlockTables( __CLASS__ . '::' . __METHOD__ );
 865+ }
 866+
 867+ /**
 868+ * Unlock and lock again
 869+ * Since the lock is low-priority, queued reads will be able to complete
 870+ * @param &$db Database object
 871+ */
 872+ private function relockSearchindex( &$db ) {
 873+ $this->unlockSearchindex( $db );
 874+ $this->lockSearchindex( $db );
 875+ }
 876+
 877+ /**
 878+ * Perform a search index update with locking
 879+ * @param $maxLockTime integer the maximum time to keep the search index locked.
 880+ * @param $updateFunction callback the function that will update the function.
 881+ */
 882+ public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) {
 883+ $lockTime = time();
 884+
 885+ # Lock searchindex
 886+ if ( $maxLockTime ) {
 887+ $this->output( " --- Waiting for lock ---" );
 888+ $this->lockSearchindex( $dbw );
 889+ $lockTime = time();
 890+ $this->output( "\n" );
 891+ }
 892+
 893+ # Loop through the results and do a search update
 894+ foreach ( $results as $row ) {
 895+ # Allow reads to be processed
 896+ if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
 897+ $this->output( " --- Relocking ---" );
 898+ $this->relockSearchindex( $dbw );
 899+ $lockTime = time();
 900+ $this->output( "\n" );
 901+ }
 902+ call_user_func( $callback, $dbw, $row );
 903+ }
 904+
 905+ # Unlock searchindex
 906+ if ( $maxLockTime ) {
 907+ $this->output( " --- Unlocking --" );
 908+ $this->unlockSearchindex( $dbw );
 909+ $this->output( "\n" );
 910+ }
 911+
 912+ }
 913+
 914+ /**
 915+ * Update the searchindex table for a given pageid
 916+ * @param $dbw Database a database write handle
 917+ * @param $pageId the page ID to update.
 918+ */
 919+ public function updateSearchIndexForPage( $dbw, $pageId ) {
 920+ // Get current revision
 921+ $rev = Revision::loadFromPageId( $dbw, $pageId );
 922+ $title = null;
 923+ if( $rev ) {
 924+ $titleObj = $rev->getTitle();
 925+ $title = $titleObj->getPrefixedDBkey();
 926+ $this->output( "$title..." );
 927+ # Update searchindex
 928+ $u = new SearchUpdate( $pageId, $titleObj->getText(), $rev->getText() );
 929+ $u->doUpdate();
 930+ $this->output( "\n" );
 931+ }
 932+ return $title;
 933+ }
 934+
848935 }
Index: trunk/phase3/maintenance/updateDoubleWidthSearch.php
@@ -0,0 +1,72 @@
 2+<?php
 3+/**
 4+ * Script to normalize double-byte latin UTF-8 characters
 5+ *
 6+ * Usage: php updateDoubleWidthSearch.php
 7+ *
 8+ * This program is free software; you can redistribute it and/or modify
 9+ * it under the terms of the GNU General Public License as published by
 10+ * the Free Software Foundation; either version 2 of the License, or
 11+ * (at your option) any later version.
 12+ *
 13+ * This program is distributed in the hope that it will be useful,
 14+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
 15+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 16+ * GNU General Public License for more details.
 17+ *
 18+ * You should have received a copy of the GNU General Public License along
 19+ * with this program; if not, write to the Free Software Foundation, Inc.,
 20+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 21+ * http://www.gnu.org/copyleft/gpl.html
 22+ *
 23+ * @ingroup Maintenance
 24+ */
 25+
 26+require_once( dirname(__FILE__) . '/Maintenance.php' );
 27+
 28+class UpdateDoubleWidthSearch extends Maintenance {
 29+
 30+ public function __construct() {
 31+ parent::__construct();
 32+ $this->mDescription = "Script to normalize double-byte latin UTF-8 characters";
 33+ $this->addOption( 'q', 'quiet', false, true );
 34+ $this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true );
 35+ }
 36+
 37+ public function getDbType() {
 38+ return Maintenance::DB_ADMIN;
 39+ }
 40+
 41+ public function execute() {
 42+ $quiet = $this->hasOption( 'q' );
 43+ $maxLockTime = $this->getOption( 'l', 20 );
 44+ $lockTime = time();
 45+
 46+ $dbw = wfGetDB( DB_MASTER );
 47+ if( $dbw->getType() !== 'mysql' ) {
 48+ $this->output( "This change is only needed on MySQL, quitting..." );
 49+ exit(1);
 50+ }
 51+
 52+ $res = $this->findRows($dbw);
 53+ $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
 54+
 55+ $this->output( "Done\n" );
 56+ }
 57+
 58+ public function searchIndexUpdateCallback($dbw, $row) {
 59+ return $this->updateSearchIndexForPage( $dbw, $row->si_page );
 60+ }
 61+
 62+ private function findRows($dbw) {
 63+ $searchindex = $dbw->tableName( 'searchindex' );
 64+ $regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]';
 65+ $sql = "SELECT si_page FROM $searchindex
 66+ WHERE ( si_text RLIKE '$regexp' )
 67+ OR ( si_title RLIKE '$regexp' )";
 68+ return $dbw->query( $sql, __METHOD__ );
 69+ }
 70+}
 71+
 72+$maintClass = "UpdateDoubleWidthSearch";
 73+require_once( DO_MAINTENANCE );
Property changes on: trunk/phase3/maintenance/updateDoubleWidthSearch.php
___________________________________________________________________
Name: svn:eol-syle
174 + native
Index: trunk/phase3/UPGRADE
@@ -53,11 +53,19 @@
5454 You will need to have $wgDBadminuser and $wgDBadminpass set in your
5555 LocalSettings.php, see there for more info.
5656
57 -From the command line, browse to the "maintenance" directory and run the
 57+From the command line, browse to the "maintenance" directory and run the
5858 update.php script to check and update the schema. This will insert missing
5959 tables, update existing tables, and move data around as needed. In most cases,
6060 this is successful and nothing further needs to be done.
6161
 62+If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one
 63+of "zh", "ja", or "yue") and you are using MySQL fulltext search, you
 64+will probably want to update the search index.
 65+
 66+In the "maintenance" directory, run the updateDoubleWidthSearch.php
 67+script. This will update the searchindex table for those pages that
 68+contain double-byte latin characters.
 69+
6270 === Check configuration settings ===
6371
6472 The names of configuration variables, and their default values and purposes,
@@ -67,6 +75,7 @@
6876 behaviour of MediaWiki.
6977
7078 === Check installed extensions ===
 79+
7180 In MediaWiki 1.14 some extensions are migrated into the core. Please see the
7281 HISTORY section "Migrated extensions" and disable these extensions in your
7382 LocalSettings.php
Index: trunk/phase3/languages/Language.php
@@ -1695,7 +1695,7 @@
16961696 * @param $string String
16971697 * @return String
16981698 */
1699 - function wordSegmentation( $string ) {
 1699+ function segmentByWord( $string ) {
17001700 return $string;
17011701 }
17021702
Index: trunk/phase3/languages/classes/LanguageZh_hans.php
@@ -13,7 +13,7 @@
1414 * for now just treat each character as a word.
1515 * @todo Fixme: only do this for Han characters...
1616 */
17 - function wordSegmentation( $string ) {
 17+ function segmentByWord( $string ) {
1818 $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
1919 $s = self::insertSpace( $string, $reg );
2020 return $s;
@@ -25,7 +25,7 @@
2626 // Double-width roman characters
2727 $s = self::convertDoubleWidth( $string );
2828 $s = trim( $s );
29 - $s = self::wordSegmentation( $s );
 29+ $s = self::segmentByWord( $s );
3030 $s = parent::normalizeForSearch( $s );
3131
3232 wfProfileOut( __METHOD__ );
Index: trunk/phase3/languages/classes/LanguageJa.php
@@ -6,7 +6,7 @@
77 * @ingroup Language
88 */
99 class LanguageJa extends Language {
10 - function wordSegmentation( $string ) {
 10+ function segmentByWord( $string ) {
1111 // Strip known punctuation ?
1212 // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
1313
Index: trunk/phase3/languages/classes/LanguageYue.php
@@ -12,7 +12,7 @@
1313 * for now just treat each character as a word.
1414 * @todo Fixme: only do this for Han characters...
1515 */
16 - function wordSegmentation( $string ) {
 16+ function segmentByWord( $string ) {
1717 $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
1818 $s = self::insertSpace( $string, $reg );
1919 return $s;

Follow-up revisions

Revision | Commit summary | Author | Date
r63598 | Fix for r63578: also change wordSegmentation() to segmentByWord() here | ialex | 12:27, 11 March 2010
r63601 | Further fixes for r63578, rename remaining wordSegmentation functions and cal... | conrad | 15:25, 11 March 2010
r63613 | follow up r63578 - remove stray debugging code. | mah | 20:05, 11 March 2010
r78378 | Cleanup for r63578: Use Maintenance::error(), that's what its for | demon | 13:37, 14 December 2010

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r61856 | Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearc... | philip | 15:09, 2 February 2010

Status & tagging log