r61214 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r61213 | r61214 | r61215 >
Date: 20:54, 18 January 2010
Author: maxsem
Status: resolved (Comments)
Tags:
Comment:
Factored MySQL-specific munging out of Language::stripForSearch() into DatabaseMysql. This will also allow other backends to seamlessly provide their own munging algorithms in the future.
Modified paths:
  • /trunk/phase3/includes/db/Database.php (modified) (history)
  • /trunk/phase3/includes/db/DatabaseMysql.php (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/db/DatabaseMysql.php
@@ -7,6 +7,8 @@
88 * @see Database
99 */
1010 class DatabaseMysql extends DatabaseBase {
 11+ static $mMinSearchLength;
 12+
1113 function getType() {
1214 return 'mysql';
1315 }
@@ -367,6 +369,84 @@
368370 $this->query( "UNLOCK TABLES", $method );
369371 }
370372
 373+ /**
 374+ * Converts some characters for MySQL's indexing to grok it correctly,
 375+ * and pads short words to overcome limitations.
 376+ */
 377+ function stripForSearch( $string ) {
 378+ global $wgContLang;
 379+
 380+ wfProfileIn( __METHOD__ );
 381+
 382+ // MySQL fulltext index doesn't grok utf-8, so we
 383+ // need to fold cases and convert to hex
 384+ $out = preg_replace_callback(
 385+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
 386+ array( $this, 'stripForSearchCallback' ),
 387+ $wgContLang->lc( $string ) );
 388+
 389+ // And to add insult to injury, the default indexing
 390+ // ignores short words... Pad them so we can pass them
 391+ // through without reconfiguring the server...
 392+ $minLength = $this->minSearchLength();
 393+ if( $minLength > 1 ) {
 394+ $n = $minLength - 1;
 395+ $out = preg_replace(
 396+ "/\b(\w{1,$n})\b/",
 397+ "$1u800",
 398+ $out );
 399+ }
 400+
 401+ // Periods within things like hostnames and IP addresses
 402+ // are also important -- we want a search for "example.com"
 403+ // or "192.168.1.1" to work sanely.
 404+ //
 405+ // MySQL's search seems to ignore them, so you'd match on
 406+ // "example.wikipedia.com" and "192.168.83.1" as well.
 407+ $out = preg_replace(
 408+ "/(\w)\.(\w|\*)/u",
 409+ "$1u82e$2",
 410+ $out );
 411+
 412+ wfProfileOut( __METHOD__ );
 413+
 414+ return $out;
 415+ }
 416+
 417+ /**
 418+ * Armor a case-folded UTF-8 string to get through MySQL's
 419+ * fulltext search without being mucked up by funny charset
 420+ * settings or anything else of the sort.
 421+ */
 422+ protected function stripForSearchCallback( $matches ) {
 423+ return 'u8' . bin2hex( $matches[1] );
 424+ }
 425+
 426+ /**
 427+ * Check MySQL server's ft_min_word_len setting so we know
 428+ * if we need to pad short words...
 429+ *
 430+ * @return int
 431+ */
 432+ protected function minSearchLength() {
 433+ if( is_null( self::$mMinSearchLength ) ) {
 434+ $sql = "show global variables like 'ft\\_min\\_word\\_len'";
 435+
 436+ // Even though this query is pretty fast, let's not overload the master
 437+ $dbr = wfGetDB( DB_SLAVE );
 438+ $result = $dbr->query( $sql );
 439+ $row = $result->fetchObject();
 440+ $result->free();
 441+
 442+ if( $row && $row->Variable_name == 'ft_min_word_len' ) {
 443+ self::$mMinSearchLength = intval( $row->Value );
 444+ } else {
 445+ self::$mMinSearchLength = 0;
 446+ }
 447+ }
 448+ return self::$mMinSearchLength;
 449+ }
 450+
371451 public function setBigSelects( $value = true ) {
372452 if ( $value === 'default' ) {
373453 if ( $this->mDefaultBigSelects === null ) {
Index: trunk/phase3/includes/db/Database.php
@@ -2367,6 +2367,18 @@
23682368 }
23692369
23702370 /**
 2371+ * When overridden in derived class, performs database-specific conversions
 2372+ * on text to be used for searching or updating search index.
 2373+ * Default implementation does nothing (simply returns $string).
 2374+ *
 2375+ * @param $string string: String to strip
 2376+ * @return string
 2377+ */
 2378+ public function stripForSearch( $string ) {
 2379+ return $string;
 2380+ }
 2381+
 2382+ /**
23712383 * Allow or deny "big selects" for this session only. This is done by setting
23722384 * the sql_big_selects session variable.
23732385 *
Index: trunk/phase3/languages/Language.php
@@ -59,7 +59,6 @@
6060
6161 var $mNamespaceIds, $namespaceNames, $namespaceAliases;
6262 var $dateFormatStrings = array();
63 - var $minSearchLength;
6463 var $mExtendedSpecialPageAliases;
6564
6665 /**
@@ -1689,86 +1688,22 @@
16901689 }
16911690
16921691 /**
1693 - * Some languages have special punctuation to strip out
1694 - * or characters which need to be converted for MySQL's
1695 - * indexing to grok it correctly. Make such changes here.
 1692+ * Some languages have special punctuation to strip out.
 1693+ * Make such changes here.
16961694 *
16971695 * @param $string String
16981696 * @return String
16991697 */
17001698 function stripForSearch( $string, $doStrip = true ) {
1701 - global $wgDBtype;
1702 - if ( $wgDBtype != 'mysql' || $doStrip == false ) {
 1699+ if ( !$doStrip ) {
17031700 return $string;
17041701 }
17051702
1706 - wfProfileIn( __METHOD__ );
1707 -
1708 - // MySQL fulltext index doesn't grok utf-8, so we
1709 - // need to fold cases and convert to hex
1710 - $out = preg_replace_callback(
1711 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
1712 - array( $this, 'stripForSearchCallback' ),
1713 - $this->lc( $string ) );
1714 -
1715 - // And to add insult to injury, the default indexing
1716 - // ignores short words... Pad them so we can pass them
1717 - // through without reconfiguring the server...
1718 - $minLength = $this->minSearchLength();
1719 - if( $minLength > 1 ) {
1720 - $n = $minLength-1;
1721 - $out = preg_replace(
1722 - "/\b(\w{1,$n})\b/",
1723 - "$1u800",
1724 - $out );
1725 - }
1726 -
1727 - // Periods within things like hostnames and IP addresses
1728 - // are also important -- we want a search for "example.com"
1729 - // or "192.168.1.1" to work sanely.
1730 - //
1731 - // MySQL's search seems to ignore them, so you'd match on
1732 - // "example.wikipedia.com" and "192.168.83.1" as well.
1733 - $out = preg_replace(
1734 - "/(\w)\.(\w|\*)/u",
1735 - "$1u82e$2",
1736 - $out );
1737 -
1738 - wfProfileOut( __METHOD__ );
1739 - return $out;
 1703+ $dbr = wfGetDB( DB_SLAVE );
 1704+ return $dbr->stripForSearch( $string );
17401705 }
17411706
17421707 /**
1743 - * Armor a case-folded UTF-8 string to get through MySQL's
1744 - * fulltext search without being mucked up by funny charset
1745 - * settings or anything else of the sort.
1746 - */
1747 - protected function stripForSearchCallback( $matches ) {
1748 - return 'u8' . bin2hex( $matches[1] );
1749 - }
1750 -
1751 - /**
1752 - * Check MySQL server's ft_min_word_len setting so we know
1753 - * if we need to pad short words...
1754 - */
1755 - protected function minSearchLength() {
1756 - if( is_null( $this->minSearchLength ) ) {
1757 - $sql = "show global variables like 'ft\\_min\\_word\\_len'";
1758 - $dbr = wfGetDB( DB_SLAVE );
1759 - $result = $dbr->query( $sql );
1760 - $row = $result->fetchObject();
1761 - $result->free();
1762 -
1763 - if( $row && $row->Variable_name == 'ft_min_word_len' ) {
1764 - $this->minSearchLength = intval( $row->Value );
1765 - } else {
1766 - $this->minSearchLength = 0;
1767 - }
1768 - }
1769 - return $this->minSearchLength;
1770 - }
1771 -
1772 - /**
17731708 * convert double-width roman characters to single-width.
17741709 * range: ff00-ff5f ~= 0020-007f
17751710 */

Follow-up revisions

Revision | Commit summary | Author | Date
r61390 | Fixed r61214: moved MySQL munging to SearchEngine, updated calls. Can we kill... | maxsem | 20:36, 22 January 2010
r61856 | Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearc... | philip | 15:09, 2 February 2010

Comments

#Comment by Tim Starling (talk | contribs)   07:05, 20 January 2010

Why is this in DatabaseMysql instead of SearchMySQL? It seems like it is search engine dependent, not DBMS dependent. Also see my comments on r60743.

Status & tagging log