Index: trunk/phase3/includes/search/SearchEngine.php |
— | — | @@ -48,6 +48,18 @@ |
49 | 49 | } |
50 | 50 | |
51 | 51 | /** |
| 52 | + * When overridden in a derived class, performs database-specific conversions |
| 53 | + * on text that is about to be searched for or written to the search index. |
| 54 | + * This default implementation is a no-op (it simply returns $string). |
| 55 | + * |
| 56 | + * @param $string string: String to process |
| 57 | + * @return string The processed text; identical to $string in this default implementation |
| 58 | + */ |
| 59 | + public function normalizeText( $string ) { |
| 60 | + return $string; |
| 61 | + } |
| 62 | + |
| 63 | + /** |
52 | 64 | * Transform search term in cases when parts of the query came as different GET params (when supported) |
53 | 65 | * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive |
54 | 66 | */ |
Index: trunk/phase3/includes/search/SearchMySQL.php |
— | — | @@ -28,6 +28,7 @@ |
29 | 29 | */ |
30 | 30 | class SearchMySQL extends SearchEngine { |
31 | 31 | var $strictMatching = true; |
| 32 | + static $mMinSearchLength; |
32 | 33 | |
33 | 34 | /** @todo document */ |
34 | 35 | function __construct( $db ) { |
— | — | @@ -91,6 +92,7 @@ |
92 | 93 | if( count( $strippedVariants) > 1 ) |
93 | 94 | $searchon .= '('; |
94 | 95 | foreach( $strippedVariants as $stripped ) { |
| 96 | + $stripped = $this->normalizeText( $stripped ); |
95 | 97 | if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { |
96 | 98 | // Hack for Chinese: we need to toss in quotes for |
97 | 99 | // multiple-character phrases since stripForSearch() |
— | — | @@ -292,8 +294,8 @@ |
293 | 295 | array( 'si_page' ), |
294 | 296 | array( |
295 | 297 | 'si_page' => $id, |
296 | | - 'si_title' => $title, |
297 | | - 'si_text' => $text |
| 298 | + 'si_title' => $this->normalizeText( $title ), |
| 299 | + 'si_text' => $this->normalizeText( $text ) |
298 | 300 | ), __METHOD__ ); |
299 | 301 | } |
300 | 302 | |
— | — | @@ -308,11 +310,88 @@ |
309 | 311 | $dbw = wfGetDB( DB_MASTER ); |
310 | 312 | |
311 | 313 | $dbw->update( 'searchindex', |
312 | | - array( 'si_title' => $title ), |
| 314 | + array( 'si_title' => $this->normalizeText( $title ) ), |
313 | 315 | array( 'si_page' => $id ), |
314 | 316 | __METHOD__, |
315 | 317 | array( $dbw->lowPriorityOption() ) ); |
316 | 318 | } |
| 319 | + |
| 320 | + /** |
| 321 | + * Prepares text for MySQL's fulltext index: case-folds it, hex-armors multibyte UTF-8 sequences as "u8...", |
| 322 | + * pads words shorter than the server's ft_min_word_len with "u800", and encodes periods inside words as "u82e". |
| 323 | + */ |
| 324 | + function normalizeText( $string ) { |
| 325 | + global $wgContLang; |
| 326 | + |
| 327 | + wfProfileIn( __METHOD__ ); |
| 328 | + |
| 329 | + // MySQL's fulltext index doesn't handle UTF-8, so case-fold the text and replace |
| 330 | + // every multibyte sequence with "u8" plus its hex bytes (see stripForSearchCallback) |
| 331 | + $out = preg_replace_callback( |
| 332 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
| 333 | + array( $this, 'stripForSearchCallback' ), |
| 334 | + $wgContLang->lc( $string ) ); |
| 335 | + |
| 336 | + // The default indexing also ignores short words, so append "u800" to every |
| 337 | + // word shorter than ft_min_word_len; that lets them pass through without |
| 338 | + // reconfiguring the server. Nothing is padded when the minimum is 1 or less. |
| 339 | + $minLength = $this->minSearchLength(); |
| 340 | + if( $minLength > 1 ) { |
| 341 | + $n = $minLength - 1; |
| 342 | + $out = preg_replace( |
| 343 | + "/\b(\w{1,$n})\b/", |
| 344 | + "$1u800", |
| 345 | + $out ); |
| 346 | + } |
| 347 | + |
| 348 | + // Periods within things like hostnames and IP addresses |
| 349 | + // are also important -- we want a search for "example.com" |
| 350 | + // or "192.168.1.1" to work sanely. |
| 351 | + // |
| 352 | + // MySQL's search seems to ignore them, so you'd also match "example.wikipedia.com" |
| 353 | + // and "192.168.83.1"; encode a period between word characters (or before "*") as "u82e". |
| 354 | + $out = preg_replace( |
| 355 | + "/(\w)\.(\w|\*)/u", |
| 356 | + "$1u82e$2", |
| 357 | + $out ); |
| 358 | + |
| 359 | + wfProfileOut( __METHOD__ ); |
| 360 | + |
| 361 | + return $out; |
| 362 | + } |
| 363 | + |
| 364 | + /** |
| 365 | + * preg_replace_callback() handler for normalizeText(): armors one matched UTF-8 |
| 366 | + * byte sequence ($matches[1]) as "u8" followed by its hex digits, so it gets through |
| 367 | + * MySQL's fulltext search without being mucked up by funny charset settings. |
| 368 | + */ |
| 369 | + protected function stripForSearchCallback( $matches ) { |
| 370 | + return 'u8' . bin2hex( $matches[1] ); |
| 371 | + } |
| 372 | + |
| 373 | + /** |
| 374 | + * Look up the MySQL server's ft_min_word_len setting so we know whether short words |
| 375 | + * need padding. The value is queried once and then cached in self::$mMinSearchLength. |
| 376 | + * |
| 377 | + * @return int The configured minimum word length, or 0 if the variable was not returned |
| 378 | + */ |
| 379 | + protected function minSearchLength() { |
| 380 | + if( is_null( self::$mMinSearchLength ) ) { |
| 381 | + $sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'"; |
| 382 | + |
| 383 | + $dbr = wfGetDB( DB_SLAVE ); |
| 384 | + $result = $dbr->query( $sql ); |
| 385 | + $row = $result->fetchObject(); |
| 386 | + $result->free(); |
| 387 | + |
| 388 | + if( $row && $row->Variable_name == 'ft_min_word_len' ) { |
| 389 | + self::$mMinSearchLength = intval( $row->Value ); |
| 390 | + } else { |
| 391 | + self::$mMinSearchLength = 0; |
| 392 | + } |
| 393 | + } |
| 394 | + return self::$mMinSearchLength; |
| 395 | + } |
317 | 396 | } |
318 | 397 | |
319 | 398 | /** |
Index: trunk/phase3/includes/db/DatabaseMysql.php |
— | — | @@ -7,8 +7,6 @@ |
8 | 8 | * @see Database |
9 | 9 | */ |
10 | 10 | class DatabaseMysql extends DatabaseBase { |
11 | | - static $mMinSearchLength; |
12 | | - |
13 | 11 | function getType() { |
14 | 12 | return 'mysql'; |
15 | 13 | } |
— | — | @@ -368,85 +366,7 @@ |
369 | 367 | public function unlockTables( $method ) { |
370 | 368 | $this->query( "UNLOCK TABLES", $method ); |
371 | 369 | } |
372 | | - |
373 | | - /** |
374 | | - * Converts some characters for MySQL's indexing to grok it correctly, |
375 | | - * and pads short words to overcome limitations. |
376 | | - */ |
377 | | - function stripForSearch( $string ) { |
378 | | - global $wgContLang; |
379 | 370 | |
380 | | - wfProfileIn( __METHOD__ ); |
381 | | - |
382 | | - // MySQL fulltext index doesn't grok utf-8, so we |
383 | | - // need to fold cases and convert to hex |
384 | | - $out = preg_replace_callback( |
385 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
386 | | - array( $this, 'stripForSearchCallback' ), |
387 | | - $wgContLang->lc( $string ) ); |
388 | | - |
389 | | - // And to add insult to injury, the default indexing |
390 | | - // ignores short words... Pad them so we can pass them |
391 | | - // through without reconfiguring the server... |
392 | | - $minLength = $this->minSearchLength(); |
393 | | - if( $minLength > 1 ) { |
394 | | - $n = $minLength - 1; |
395 | | - $out = preg_replace( |
396 | | - "/\b(\w{1,$n})\b/", |
397 | | - "$1u800", |
398 | | - $out ); |
399 | | - } |
400 | | - |
401 | | - // Periods within things like hostnames and IP addresses |
402 | | - // are also important -- we want a search for "example.com" |
403 | | - // or "192.168.1.1" to work sanely. |
404 | | - // |
405 | | - // MySQL's search seems to ignore them, so you'd match on |
406 | | - // "example.wikipedia.com" and "192.168.83.1" as well. |
407 | | - $out = preg_replace( |
408 | | - "/(\w)\.(\w|\*)/u", |
409 | | - "$1u82e$2", |
410 | | - $out ); |
411 | | - |
412 | | - wfProfileOut( __METHOD__ ); |
413 | | - |
414 | | - return $out; |
415 | | - } |
416 | | - |
417 | | - /** |
418 | | - * Armor a case-folded UTF-8 string to get through MySQL's |
419 | | - * fulltext search without being mucked up by funny charset |
420 | | - * settings or anything else of the sort. |
421 | | - */ |
422 | | - protected function stripForSearchCallback( $matches ) { |
423 | | - return 'u8' . bin2hex( $matches[1] ); |
424 | | - } |
425 | | - |
426 | | - /** |
427 | | - * Check MySQL server's ft_min_word_len setting so we know |
428 | | - * if we need to pad short words... |
429 | | - * |
430 | | - * @return int |
431 | | - */ |
432 | | - protected function minSearchLength() { |
433 | | - if( is_null( self::$mMinSearchLength ) ) { |
434 | | - $sql = "show global variables like 'ft\\_min\\_word\\_len'"; |
435 | | - |
436 | | - // Even though this query is pretty fast, let's not overload the master |
437 | | - $dbr = wfGetDB( DB_SLAVE ); |
438 | | - $result = $dbr->query( $sql ); |
439 | | - $row = $result->fetchObject(); |
440 | | - $result->free(); |
441 | | - |
442 | | - if( $row && $row->Variable_name == 'ft_min_word_len' ) { |
443 | | - self::$mMinSearchLength = intval( $row->Value ); |
444 | | - } else { |
445 | | - self::$mMinSearchLength = 0; |
446 | | - } |
447 | | - } |
448 | | - return self::$mMinSearchLength; |
449 | | - } |
450 | | - |
451 | 371 | public function setBigSelects( $value = true ) { |
452 | 372 | if ( $value === 'default' ) { |
453 | 373 | if ( $this->mDefaultBigSelects === null ) { |
Index: trunk/phase3/includes/db/Database.php |
— | — | @@ -2367,18 +2367,6 @@ |
2368 | 2368 | } |
2369 | 2369 | |
2370 | 2370 | /** |
2371 | | - * When overridden in derived class, performs database-specific conversions |
2372 | | - * on text to be used for searching or updating search index. |
2373 | | - * Default implementation does nothing (simply returns $string). |
2374 | | - * |
2375 | | - * @param $string string: String to strip |
2376 | | - * @return string |
2377 | | - */ |
2378 | | - public function stripForSearch( $string ) { |
2379 | | - return $string; |
2380 | | - } |
2381 | | - |
2382 | | - /** |
2383 | 2371 | * Allow or deny "big selects" for this session only. This is done by setting |
2384 | 2372 | * the sql_big_selects session variable. |
2385 | 2373 | * |
Index: trunk/phase3/languages/Language.php |
— | — | @@ -1695,12 +1695,7 @@ |
1696 | 1696 | * @return String |
1697 | 1697 | */ |
1698 | 1698 | function stripForSearch( $string, $doStrip = true ) { |
1699 | | - if ( !$doStrip ) { |
1700 | | - return $string; |
1701 | | - } |
1702 | | - |
1703 | | - $dbr = wfGetDB( DB_SLAVE ); |
1704 | | - return $dbr->stripForSearch( $string ); |
| 1699 | + return $string; |
1705 | 1700 | } |
1706 | 1701 | |
1707 | 1702 | /** |
Index: trunk/extensions/AdvancedSearch/AdvancedSearchPager.php |
— | — | @@ -387,6 +387,7 @@ |
388 | 388 | protected function getMatchString($arr) |
389 | 389 | { |
390 | 390 | $conds = array(); |
| 391 | + $searchEngine = SearchEngine::create(); |
391 | 392 | foreach($arr as $a) |
392 | 393 | { |
393 | 394 | $subconds = array(); |
— | — | @@ -402,6 +403,7 @@ |
403 | 404 | { |
404 | 405 | global $wgContLang; |
405 | 406 | $s = $wgContLang->stripForSearch($b); |
| 407 | + $s = $searchEngine->normalizeText($s); |
406 | 408 | $s = $this->mDb->strencode($s); |
407 | 409 | # If $s contains spaces or ( ) :, quote it |
408 | 410 | if(strpos($s, ' ') !== false |
Index: trunk/extensions/AdvancedSearch/AdvancedSearchCategoryIntersector.php |
— | — | @@ -28,6 +28,8 @@ |
29 | 29 | */ |
30 | 30 | class AdvancedSearchCategoryIntersector |
31 | 31 | { |
| 32 | + private static $searchEngine; |
| 33 | + |
32 | 34 | /** |
33 | 35 | * Update the categorysearch table |
34 | 36 | * @param $pageid int Page ID |
— | — | @@ -37,6 +39,7 @@ |
38 | 40 | { |
39 | 41 | global $wgContLang; |
40 | 42 | $ctext = $wgContLang->stripForSearch(implode(' ', $categories)); |
| 43 | + $ctext = self::getSearchEngine()->normalizeText($ctext); |
41 | 44 | $dbw = wfGetDb(DB_MASTER); |
42 | 45 | $dbw->replace('categorysearch', 'cs_page', |
43 | 46 | array('cs_page' => $pageid, 'cs_categories' => $ctext), |
— | — | @@ -68,4 +71,11 @@ |
69 | 72 | self::remove($article->getID()); |
70 | 73 | return true; |
71 | 74 | } |
| 75 | + |
| 76 | + static function getSearchEngine() { |
| 77 | + if (!self::$searchEngine) { |
| 78 | + self::$searchEngine = SearchEngine::create(); |
| 79 | + } |
| 80 | + return self::$searchEngine; |
| 81 | + } |
72 | 82 | } |