r61390 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r61389‎ | r61390 | r61391 >
Date:20:36, 22 January 2010
Author:maxsem
Status:ok
Tags:
Comment:
Fixed r61214: moved MySQL munging to SearchEngine, updated calls. Can we kill $doStrip now?
Modified paths:
  • /trunk/extensions/AdvancedSearch/AdvancedSearchCategoryIntersector.php (modified) (history)
  • /trunk/extensions/AdvancedSearch/AdvancedSearchPager.php (modified) (history)
  • /trunk/phase3/includes/db/Database.php (modified) (history)
  • /trunk/phase3/includes/db/DatabaseMysql.php (modified) (history)
  • /trunk/phase3/includes/search/SearchEngine.php (modified) (history)
  • /trunk/phase3/includes/search/SearchMySQL.php (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/search/SearchEngine.php
@@ -48,6 +48,18 @@
4949 }
5050
5151 /**
 52+ * When overridden in derived class, performs database-specific conversions
 53+ * on text to be used for searching or updating search index.
 54+ * Default implementation does nothing (simply returns $string).
 55+ *
 56+ * @param $string string: String to process
 57+ * @return string
 58+ */
 59+ public function normalizeText( $string ) {
 60+ return $string;
 61+ }
 62+
 63+ /**
5264 * Transform search term in cases when parts of the query came as different GET params (when supported)
5365 * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
5466 */
Index: trunk/phase3/includes/search/SearchMySQL.php
@@ -28,6 +28,7 @@
2929 */
3030 class SearchMySQL extends SearchEngine {
3131 var $strictMatching = true;
 32+ static $mMinSearchLength;
3233
3334 /** @todo document */
3435 function __construct( $db ) {
@@ -91,6 +92,7 @@
9293 if( count( $strippedVariants) > 1 )
9394 $searchon .= '(';
9495 foreach( $strippedVariants as $stripped ) {
 96+ $stripped = $this->normalizeText( $stripped );
9597 if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
9698 // Hack for Chinese: we need to toss in quotes for
9799 // multiple-character phrases since stripForSearch()
@@ -292,8 +294,8 @@
293295 array( 'si_page' ),
294296 array(
295297 'si_page' => $id,
296 - 'si_title' => $title,
297 - 'si_text' => $text
 298+ 'si_title' => $this->normalizeText( $title ),
 299+ 'si_text' => $this->normalizeText( $text )
298300 ), __METHOD__ );
299301 }
300302
@@ -308,11 +310,88 @@
309311 $dbw = wfGetDB( DB_MASTER );
310312
311313 $dbw->update( 'searchindex',
312 - array( 'si_title' => $title ),
 314+ array( 'si_title' => $this->normalizeText( $title ) ),
313315 array( 'si_page' => $id ),
314316 __METHOD__,
315317 array( $dbw->lowPriorityOption() ) );
316318 }
 319+
 320+ /**
 321+ * Converts some characters for MySQL's indexing to grok it correctly,
 322+ * and pads short words to overcome limitations.
 323+ */
 324+ function normalizeText( $string ) {
 325+ global $wgContLang;
 326+
 327+ wfProfileIn( __METHOD__ );
 328+
 329+ // MySQL fulltext index doesn't grok utf-8, so we
 330+ // need to fold cases and convert to hex
 331+ $out = preg_replace_callback(
 332+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
 333+ array( $this, 'stripForSearchCallback' ),
 334+ $wgContLang->lc( $string ) );
 335+
 336+ // And to add insult to injury, the default indexing
 337+ // ignores short words... Pad them so we can pass them
 338+ // through without reconfiguring the server...
 339+ $minLength = $this->minSearchLength();
 340+ if( $minLength > 1 ) {
 341+ $n = $minLength - 1;
 342+ $out = preg_replace(
 343+ "/\b(\w{1,$n})\b/",
 344+ "$1u800",
 345+ $out );
 346+ }
 347+
 348+ // Periods within things like hostnames and IP addresses
 349+ // are also important -- we want a search for "example.com"
 350+ // or "192.168.1.1" to work sanely.
 351+ //
 352+ // MySQL's search seems to ignore them, so you'd match on
 353+ // "example.wikipedia.com" and "192.168.83.1" as well.
 354+ $out = preg_replace(
 355+ "/(\w)\.(\w|\*)/u",
 356+ "$1u82e$2",
 357+ $out );
 358+
 359+ wfProfileOut( __METHOD__ );
 360+
 361+ return $out;
 362+ }
 363+
 364+ /**
 365+ * Armor a case-folded UTF-8 string to get through MySQL's
 366+ * fulltext search without being mucked up by funny charset
 367+ * settings or anything else of the sort.
 368+ */
 369+ protected function stripForSearchCallback( $matches ) {
 370+ return 'u8' . bin2hex( $matches[1] );
 371+ }
 372+
 373+ /**
 374+ * Check MySQL server's ft_min_word_len setting so we know
 375+ * if we need to pad short words...
 376+ *
 377+ * @return int
 378+ */
 379+ protected function minSearchLength() {
 380+ if( is_null( self::$mMinSearchLength ) ) {
 381+ $sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";
 382+
 383+ $dbr = wfGetDB( DB_SLAVE );
 384+ $result = $dbr->query( $sql );
 385+ $row = $result->fetchObject();
 386+ $result->free();
 387+
 388+ if( $row && $row->Variable_name == 'ft_min_word_len' ) {
 389+ self::$mMinSearchLength = intval( $row->Value );
 390+ } else {
 391+ self::$mMinSearchLength = 0;
 392+ }
 393+ }
 394+ return self::$mMinSearchLength;
 395+ }
317396 }
318397
319398 /**
Index: trunk/phase3/includes/db/DatabaseMysql.php
@@ -7,8 +7,6 @@
88 * @see Database
99 */
1010 class DatabaseMysql extends DatabaseBase {
11 - static $mMinSearchLength;
12 -
1311 function getType() {
1412 return 'mysql';
1513 }
@@ -368,85 +366,7 @@
369367 public function unlockTables( $method ) {
370368 $this->query( "UNLOCK TABLES", $method );
371369 }
372 -
373 - /**
374 - * Converts some characters for MySQL's indexing to grok it correctly,
375 - * and pads short words to overcome limitations.
376 - */
377 - function stripForSearch( $string ) {
378 - global $wgContLang;
379370
380 - wfProfileIn( __METHOD__ );
381 -
382 - // MySQL fulltext index doesn't grok utf-8, so we
383 - // need to fold cases and convert to hex
384 - $out = preg_replace_callback(
385 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
386 - array( $this, 'stripForSearchCallback' ),
387 - $wgContLang->lc( $string ) );
388 -
389 - // And to add insult to injury, the default indexing
390 - // ignores short words... Pad them so we can pass them
391 - // through without reconfiguring the server...
392 - $minLength = $this->minSearchLength();
393 - if( $minLength > 1 ) {
394 - $n = $minLength - 1;
395 - $out = preg_replace(
396 - "/\b(\w{1,$n})\b/",
397 - "$1u800",
398 - $out );
399 - }
400 -
401 - // Periods within things like hostnames and IP addresses
402 - // are also important -- we want a search for "example.com"
403 - // or "192.168.1.1" to work sanely.
404 - //
405 - // MySQL's search seems to ignore them, so you'd match on
406 - // "example.wikipedia.com" and "192.168.83.1" as well.
407 - $out = preg_replace(
408 - "/(\w)\.(\w|\*)/u",
409 - "$1u82e$2",
410 - $out );
411 -
412 - wfProfileOut( __METHOD__ );
413 -
414 - return $out;
415 - }
416 -
417 - /**
418 - * Armor a case-folded UTF-8 string to get through MySQL's
419 - * fulltext search without being mucked up by funny charset
420 - * settings or anything else of the sort.
421 - */
422 - protected function stripForSearchCallback( $matches ) {
423 - return 'u8' . bin2hex( $matches[1] );
424 - }
425 -
426 - /**
427 - * Check MySQL server's ft_min_word_len setting so we know
428 - * if we need to pad short words...
429 - *
430 - * @return int
431 - */
432 - protected function minSearchLength() {
433 - if( is_null( self::$mMinSearchLength ) ) {
434 - $sql = "show global variables like 'ft\\_min\\_word\\_len'";
435 -
436 - // Even though this query is pretty fast, let's not overload the master
437 - $dbr = wfGetDB( DB_SLAVE );
438 - $result = $dbr->query( $sql );
439 - $row = $result->fetchObject();
440 - $result->free();
441 -
442 - if( $row && $row->Variable_name == 'ft_min_word_len' ) {
443 - self::$mMinSearchLength = intval( $row->Value );
444 - } else {
445 - self::$mMinSearchLength = 0;
446 - }
447 - }
448 - return self::$mMinSearchLength;
449 - }
450 -
451371 public function setBigSelects( $value = true ) {
452372 if ( $value === 'default' ) {
453373 if ( $this->mDefaultBigSelects === null ) {
Index: trunk/phase3/includes/db/Database.php
@@ -2367,18 +2367,6 @@
23682368 }
23692369
23702370 /**
2371 - * When overridden in derived class, performs database-specific conversions
2372 - * on text to be used for searching or updating search index.
2373 - * Default implementation does nothing (simply returns $string).
2374 - *
2375 - * @param $string string: String to strip
2376 - * @return string
2377 - */
2378 - public function stripForSearch( $string ) {
2379 - return $string;
2380 - }
2381 -
2382 - /**
23832371 * Allow or deny "big selects" for this session only. This is done by setting
23842372 * the sql_big_selects session variable.
23852373 *
Index: trunk/phase3/languages/Language.php
@@ -1695,12 +1695,7 @@
16961696 * @return String
16971697 */
16981698 function stripForSearch( $string, $doStrip = true ) {
1699 - if ( !$doStrip ) {
1700 - return $string;
1701 - }
1702 -
1703 - $dbr = wfGetDB( DB_SLAVE );
1704 - return $dbr->stripForSearch( $string );
 1699+ return $string;
17051700 }
17061701
17071702 /**
Index: trunk/extensions/AdvancedSearch/AdvancedSearchPager.php
@@ -387,6 +387,7 @@
388388 protected function getMatchString($arr)
389389 {
390390 $conds = array();
 391+ $searchEngine = SearchEngine::create();
391392 foreach($arr as $a)
392393 {
393394 $subconds = array();
@@ -402,6 +403,7 @@
403404 {
404405 global $wgContLang;
405406 $s = $wgContLang->stripForSearch($b);
 407+ $s = $searchEngine->normalizeText($s);
406408 $s = $this->mDb->strencode($s);
407409 # If $s contains spaces or ( ) :, quote it
408410 if(strpos($s, ' ') !== false
Index: trunk/extensions/AdvancedSearch/AdvancedSearchCategoryIntersector.php
@@ -28,6 +28,8 @@
2929 */
3030 class AdvancedSearchCategoryIntersector
3131 {
 32+ private static $searchEngine;
 33+
3234 /**
3335 * Update the categorysearch table
3436 * @param $pageid int Page ID
@@ -37,6 +39,7 @@
3840 {
3941 global $wgContLang;
4042 $ctext = $wgContLang->stripForSearch(implode(' ', $categories));
 43+ $ctext = self::getSearchEngine()->normalizeText($ctext);
4144 $dbw = wfGetDb(DB_MASTER);
4245 $dbw->replace('categorysearch', 'cs_page',
4346 array('cs_page' => $pageid, 'cs_categories' => $ctext),
@@ -68,4 +71,11 @@
6972 self::remove($article->getID());
7073 return true;
7174 }
 75+
 76+ static function getSearchEngine() {
 77+ if (!self::$searchEngine) {
 78+ self::$searchEngine = SearchEngine::create();
 79+ }
 80+ return self::$searchEngine;
 81+ }
7282 }

Follow-up revisions

Revision | Commit summary | Author | Date
r61856 | Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearc... | philip | 15:09, 2 February 2010

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r61214 | Factored MySQL-specific munging out of Language::stripForSearch() to Database... | maxsem | 20:54, 18 January 2010

Status & tagging log