Index: trunk/phase3/includes/db/DatabaseMysql.php |
— | — | @@ -7,6 +7,8 @@ |
8 | 8 | * @see Database |
9 | 9 | */ |
10 | 10 | class DatabaseMysql extends DatabaseBase { |
| 11 | + static $mMinSearchLength; |
| 12 | + |
11 | 13 | function getType() { |
12 | 14 | return 'mysql'; |
13 | 15 | } |
— | — | @@ -367,6 +369,84 @@ |
368 | 370 | $this->query( "UNLOCK TABLES", $method ); |
369 | 371 | } |
370 | 372 | |
| 373 | + /** |
| 374 | + * Converts some characters so that MySQL's fulltext indexing handles them |
| 375 | + * correctly, and pads short words so the default indexing does not ignore them. |
| 376 | + * |
| 377 | + * @param $string string: String to strip |
| 378 | + * @return string |
| 379 | + */ |
| 380 | + public function stripForSearch( $string ) { |
| 381 | + global $wgContLang; |
| 382 | + |
| 383 | + wfProfileIn( __METHOD__ ); |
| 384 | + |
| 385 | + // MySQL fulltext index doesn't grok utf-8, so we |
| 386 | + // need to fold cases and convert to hex |
| 387 | + $out = preg_replace_callback( |
| 388 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
| 389 | + array( $this, 'stripForSearchCallback' ), |
| 390 | + $wgContLang->lc( $string ) ); |
| 391 | + |
| 392 | + // The default indexing also ignores short words, so pad them with |
| 393 | + // 'u800' rather than requiring the server to be reconfigured. |
| 394 | + $minLength = $this->minSearchLength(); |
| 395 | + if( $minLength > 1 ) { |
| 396 | + $n = $minLength - 1; |
| 397 | + $out = preg_replace( |
| 398 | + "/\b(\w{1,$n})\b/", |
| 399 | + "$1u800", |
| 400 | + $out ); |
| 401 | + } |
| 402 | + |
| 403 | + // Periods within things like hostnames and IP addresses |
| 404 | + // are also important -- we want a search for "example.com" |
| 405 | + // or "192.168.1.1" to work sanely. MySQL's search seems to |
| 406 | + // ignore them, so you'd otherwise match on |
| 407 | + // "example.wikipedia.com" and "192.168.83.1" as well. |
| 408 | + $out = preg_replace( |
| 409 | + "/(\w)\.(\w|\*)/u", |
| 410 | + "$1u82e$2", |
| 411 | + $out ); |
| 412 | + |
| 413 | + wfProfileOut( __METHOD__ ); |
| 414 | + return $out; |
| 415 | + } |
| 416 | + |
| 417 | + /** |
| 418 | + * Armors one matched run of non-ASCII bytes as 'u8' + hex for MySQL's fulltext search. |
| 419 | + * @param $matches array: Match from preg_replace_callback(); [1] is the byte run |
| 420 | + * @return string |
| 421 | + */ |
| 422 | + protected function stripForSearchCallback( $matches ) { |
| 423 | + return sprintf( 'u8%s', bin2hex( $matches[1] ) ); |
| 424 | + } |
| 425 | + |
| 426 | + /** |
| 427 | + * Look up the MySQL server's ft_min_word_len setting, caching it in a |
| 428 | + * static property, so callers know whether short words must be padded. |
| 429 | + * |
| 430 | + * @return int The setting's value, or 0 if the variable was not returned |
| 431 | + */ |
| 432 | + protected function minSearchLength() { |
| 433 | + if( !is_null( self::$mMinSearchLength ) ) { |
| 434 | + // Already looked up; reuse the cached value |
| 435 | + return self::$mMinSearchLength; |
| 436 | + } |
| 437 | + |
| 438 | + // Even though this query is pretty fast, let's not overload the master |
| 439 | + $slave = wfGetDB( DB_SLAVE ); |
| 440 | + $res = $slave->query( "show global variables like 'ft\\_min\\_word\\_len'" ); |
| 441 | + $setting = $res->fetchObject(); |
| 442 | + $res->free(); |
| 443 | + |
| 444 | + $found = $setting && $setting->Variable_name == 'ft_min_word_len'; |
| 445 | + // Fall back to 0 when the expected row did not come back |
| 446 | + self::$mMinSearchLength = $found ? intval( $setting->Value ) : 0; |
| 447 | + |
| 448 | + return self::$mMinSearchLength; |
| 449 | + } |
| 450 | + |
371 | 451 | public function setBigSelects( $value = true ) { |
372 | 452 | if ( $value === 'default' ) { |
373 | 453 | if ( $this->mDefaultBigSelects === null ) { |
Index: trunk/phase3/includes/db/Database.php |
— | — | @@ -2367,6 +2367,18 @@ |
2368 | 2368 | } |
2369 | 2369 | |
2370 | 2370 | /** |
| 2371 | + * Performs database-specific conversions on text that is about to be |
| 2372 | + * searched for or written to the search index. Derived classes override |
| 2373 | + * this; the default implementation simply returns $string unchanged. |
| 2374 | + * |
| 2375 | + * @param $string string: String to strip |
| 2376 | + * @return string |
| 2377 | + */ |
| 2378 | + public function stripForSearch( $string ) { |
| 2379 | + return $string; |
| 2380 | + } |
| 2381 | + |
| 2382 | + /** |
2371 | 2383 | * Allow or deny "big selects" for this session only. This is done by setting |
2372 | 2384 | * the sql_big_selects session variable. |
2373 | 2385 | * |
Index: trunk/phase3/languages/Language.php |
— | — | @@ -59,7 +59,6 @@ |
60 | 60 | |
61 | 61 | var $mNamespaceIds, $namespaceNames, $namespaceAliases; |
62 | 62 | var $dateFormatStrings = array(); |
63 | | - var $minSearchLength; |
64 | 63 | var $mExtendedSpecialPageAliases; |
65 | 64 | |
66 | 65 | /** |
— | — | @@ -1689,86 +1688,22 @@ |
1690 | 1689 | } |
1691 | 1690 | |
1692 | 1691 | /** |
1693 | | - * Some languages have special punctuation to strip out |
1694 | | - * or characters which need to be converted for MySQL's |
1695 | | - * indexing to grok it correctly. Make such changes here. |
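| 1692 | + * Some languages have special punctuation to strip out; make such changes here. |
| 1693 | + * Returns $string untouched when $doStrip is false, otherwise defers to the database's stripForSearch(). |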
1696 | 1694 | * |
1697 | 1695 | * @param $string String |
1698 | 1696 | * @return String |
1699 | 1697 | */ |
1700 | 1698 | function stripForSearch( $string, $doStrip = true ) { |
1701 | | - global $wgDBtype; |
1702 | | - if ( $wgDBtype != 'mysql' || $doStrip == false ) { |
| 1699 | + if ( !$doStrip ) { |
1703 | 1700 | return $string; |
1704 | 1701 | } |
1705 | 1702 | |
1706 | | - wfProfileIn( __METHOD__ ); |
1707 | | - |
1708 | | - // MySQL fulltext index doesn't grok utf-8, so we |
1709 | | - // need to fold cases and convert to hex |
1710 | | - $out = preg_replace_callback( |
1711 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
1712 | | - array( $this, 'stripForSearchCallback' ), |
1713 | | - $this->lc( $string ) ); |
1714 | | - |
1715 | | - // And to add insult to injury, the default indexing |
1716 | | - // ignores short words... Pad them so we can pass them |
1717 | | - // through without reconfiguring the server... |
1718 | | - $minLength = $this->minSearchLength(); |
1719 | | - if( $minLength > 1 ) { |
1720 | | - $n = $minLength-1; |
1721 | | - $out = preg_replace( |
1722 | | - "/\b(\w{1,$n})\b/", |
1723 | | - "$1u800", |
1724 | | - $out ); |
1725 | | - } |
1726 | | - |
1727 | | - // Periods within things like hostnames and IP addresses |
1728 | | - // are also important -- we want a search for "example.com" |
1729 | | - // or "192.168.1.1" to work sanely. |
1730 | | - // |
1731 | | - // MySQL's search seems to ignore them, so you'd match on |
1732 | | - // "example.wikipedia.com" and "192.168.83.1" as well. |
1733 | | - $out = preg_replace( |
1734 | | - "/(\w)\.(\w|\*)/u", |
1735 | | - "$1u82e$2", |
1736 | | - $out ); |
1737 | | - |
1738 | | - wfProfileOut( __METHOD__ ); |
1739 | | - return $out; |
| 1703 | + $dbr = wfGetDB( DB_SLAVE ); |
| 1704 | + return $dbr->stripForSearch( $string ); |
1740 | 1705 | } |
1741 | 1706 | |
1742 | 1707 | /** |
1743 | | - * Armor a case-folded UTF-8 string to get through MySQL's |
1744 | | - * fulltext search without being mucked up by funny charset |
1745 | | - * settings or anything else of the sort. |
1746 | | - */ |
1747 | | - protected function stripForSearchCallback( $matches ) { |
1748 | | - return 'u8' . bin2hex( $matches[1] ); |
1749 | | - } |
1750 | | - |
1751 | | - /** |
1752 | | - * Check MySQL server's ft_min_word_len setting so we know |
1753 | | - * if we need to pad short words... |
1754 | | - */ |
1755 | | - protected function minSearchLength() { |
1756 | | - if( is_null( $this->minSearchLength ) ) { |
1757 | | - $sql = "show global variables like 'ft\\_min\\_word\\_len'"; |
1758 | | - $dbr = wfGetDB( DB_SLAVE ); |
1759 | | - $result = $dbr->query( $sql ); |
1760 | | - $row = $result->fetchObject(); |
1761 | | - $result->free(); |
1762 | | - |
1763 | | - if( $row && $row->Variable_name == 'ft_min_word_len' ) { |
1764 | | - $this->minSearchLength = intval( $row->Value ); |
1765 | | - } else { |
1766 | | - $this->minSearchLength = 0; |
1767 | | - } |
1768 | | - } |
1769 | | - return $this->minSearchLength; |
1770 | | - } |
1771 | | - |
1772 | | - /** |
1773 | 1708 | * convert double-width roman characters to single-width. |
1774 | 1709 | * range: ff00-ff5f ~= 0020-007f |
1775 | 1710 | */ |