r60764 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r60763‎ | r60764 | r60765 >
Date:04:50, 7 January 2010
Author:philip
Status:ok (Comments)
Tags:
Comment:
follow-up r60743.
1. Generalized the conditions so that they apply not only to LuceneSearch but to other search types as well.
2. Reduced code duplication.
Modified paths:
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageGan.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageJa.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageYue.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageZh.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageZh_hans.php (modified) (history)

Diff [purge]

Index: trunk/phase3/languages/Language.php
@@ -1695,9 +1695,9 @@
16961696 * @param $string String
16971697 * @return String
16981698 */
1699 - function stripForSearch( $string ) {
1700 - global $wgDBtype, $wgSearchType;
1701 - if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {
 1699+ function stripForSearch( $string, $doStrip = true ) {
 1700+ global $wgDBtype;
 1701+ if ( $wgDBtype != 'mysql' || $doStrip == false ) {
17021702 return $string;
17031703 }
17041704
@@ -1767,6 +1767,22 @@
17681768 return $this->minSearchLength;
17691769 }
17701770
 1771+ /**
 1772+ * convert double-width roman characters to single-width.
 1773+ * range: ff00-ff5f ~= 0020-007f
 1774+ */
 1775+ protected static function convertDoubleWidth( $string ) {
 1776+ $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
 1777+ $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
 1778+ return $string;
 1779+ }
 1780+
 1781+ protected static function wordSegmentation( $string, $pattern ) {
 1782+ $string = preg_replace( $pattern, " $1 ", $string );
 1783+ $string = preg_replace( '/ +/', ' ', $string );
 1784+ return $string;
 1785+ }
 1786+
17711787 function convertForSearchResult( $termsArray ) {
17721788 # some languages, e.g. Chinese, need to do a conversion
17731789 # in order for search results to be displayed correctly
Index: trunk/phase3/languages/classes/LanguageZh_hans.php
@@ -8,33 +8,25 @@
99 return false;
1010 }
1111
12 - function stripForSearch( $string ) {
 12+ function stripForSearch( $string, $doStrip = true ) {
1313 wfProfileIn( __METHOD__ );
14 - global $wgSearchType;
1514
16 - $s = $string;
 15+ // Double-width roman characters
 16+ $s = self::convertDoubleWidth( $string );
1717
18 - // Double-width roman characters: ff00-ff5f ~= 0020-007f
19 - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
20 - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
21 -
22 - if ( $wgSearchType != 'LuceneSearch' ) {
 18+ if ( $doStrip == true ) {
2319 // Eventually this should be a word segmentation;
2420 // for now just treat each character as a word.
25 - // Not for LuceneSearch, because LSearch will
26 - // split the text to words itself.
2721 // @todo Fixme: only do this for Han characters...
28 - $s = preg_replace(
29 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
30 - " $1 ", $s);
31 - $s = preg_replace( '/ +/', ' ', $s );
 22+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
 23+ $s = self::wordSegmentation( $s, $reg );
3224 }
3325
3426 $s = trim( $s );
3527
3628 // Do general case folding and UTF-8 armoring
37 - $s = parent::stripForSearch( $s );
 29+ $s = parent::stripForSearch( $s, $doStrip );
3830 wfProfileOut( __METHOD__ );
3931 return $s;
4032 }
41 -}
 33+}
\ No newline at end of file
Index: trunk/phase3/languages/classes/LanguageJa.php
@@ -6,18 +6,15 @@
77 * @ingroup Language
88 */
99 class LanguageJa extends Language {
10 - function stripForSearch( $string ) {
11 - # MySQL fulltext index doesn't grok utf-8, so we
12 - # need to fold cases and convert to hex
 10+ function stripForSearch( $string, $doStrip = true ) {
 11+
1312 $s = $string;
1413
15 - # not for LuceneSearch, because LSearch
16 - # will split the text to words itself
17 - if ( $wgSearchType != 'LuceneSearch' ) {
18 - # Strip known punctuation ?
19 - #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 14+ if ( $doStrip == true ) {
 15+ // Strip known punctuation ?
 16+ // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
2017
21 - # Space strings of like hiragana/katakana/kanji
 18+ // Space strings of like hiragana/katakana/kanji
2219 $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
2320 $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
2421 $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
@@ -25,14 +22,14 @@
2623 . '|\xe9[\x80-\xa5][\x80-\xbf]'
2724 . '|\xe9\xa6[\x80-\x99])';
2825 # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
29 - $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
 26+ $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
 27+ $s = self::wordSegmentation( $s, $reg );
3028 }
31 - # Double-width roman characters: ff00-ff5f ~= 0020-007f
32 - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
33 - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
 29+ // Double-width roman characters
 30+ $s = self::convertDoubleWidth( $s );
3431
3532 # Do general case folding and UTF-8 armoring
36 - return parent::stripForSearch( $s );
 33+ return parent::stripForSearch( $s, $doStrip );
3734 }
3835
3936 # Italic is not appropriate for Japanese script
Index: trunk/phase3/languages/classes/LanguageGan.php
@@ -137,43 +137,14 @@
138138 }
139139
140140 // word segmentation
141 - function stripForSearch( $string ) {
142 - wfProfileIn( __METHOD__ );
143 - global $wgSearchType;
144 -
145 - // always convert to gan-hans before indexing. it should be
146 - // better to use gan-hans for search, since conversion from
147 - // Traditional to Simplified is less ambiguous than the
148 - // other way around
149 - $s = $this->mConverter->autoConvert($string, 'gan-hans');
150 -
151 - // Double-width roman characters: ff00-ff5f ~= 0020-007f
152 - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
153 - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
154 -
155 - if ( $wgSearchType != 'LuceneSearch' ) {
156 - // eventually this should be a word segmentation;
157 - // for now just treat each character as a word.
158 - // Not for LuceneSearch, because LSearch will
159 - // split the text to words itself.
160 - // @todo Fixme: only do this for Han characters...
161 - $s = preg_replace(
162 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
163 - " $1 ", $s);
164 - $s = preg_replace( '/ +/', ' ', $s );
165 - }
166 -
167 - $s = trim( $s );
168 -
169 - // Do general case folding and UTF-8 armoring
170 - $s = parent::stripForSearch( $s );
171 - wfProfileOut( __METHOD__ );
172 - return $s;
173 -
 141+ function stripForSearch( $string, $doStrip = true ) {
 142+ // LanguageZh::stripForSearch
 143+ return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
174144 }
175145
176146 function convertForSearchResult( $termsArray ) {
177147 $terms = implode( '|', $termsArray );
 148+ $terms = self::convertDoubleWidth( $terms );
178149 $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
179150 $ret = array_unique( explode('|', $terms) );
180151 return $ret;
Index: trunk/phase3/languages/classes/LanguageZh.php
@@ -173,15 +173,16 @@
174174 }
175175
176176 // word segmentation
177 - function stripForSearch( $string ) {
 177+ function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
178178 wfProfileIn( __METHOD__ );
179179
180180 // always convert to zh-hans before indexing. it should be
181181 // better to use zh-hans for search, since conversion from
182182 // Traditional to Simplified is less ambiguous than the
183183 // other way around
184 - $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
185 - $s = parent::stripForSearch( $s );
 184+ $s = $this->mConverter->autoConvert( $string, $autoVariant );
 185+ // LanguageZh_hans::stripForSearch
 186+ $s = parent::stripForSearch( $s, $doStrip );
186187 wfProfileOut( __METHOD__ );
187188 return $s;
188189
@@ -189,6 +190,7 @@
190191
191192 function convertForSearchResult( $termsArray ) {
192193 $terms = implode( '|', $termsArray );
 194+ $terms = self::convertDoubleWidth( $terms );
193195 $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
194196 $ret = array_unique( explode('|', $terms) );
195197 return $ret;
Index: trunk/phase3/languages/classes/LanguageYue.php
@@ -3,32 +3,24 @@
44 * @ingroup Language
55 */
66 class LanguageYue extends Language {
7 - function stripForSearch( $string ) {
 7+ function stripForSearch( $string, $doStrip = true ) {
88 wfProfileIn( __METHOD__ );
9 - global $wgSearchType;
109
11 - $s = $string;
 10+ // Double-width roman characters
 11+ $s = self::convertDoubleWidth( $string );
1212
13 - // Double-width roman characters: ff00-ff5f ~= 0020-007f
14 - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
15 - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
16 -
17 - if ( $wgSearchType != 'LuceneSearch' ) {
 13+ if ( $doStrip == true ) {
1814 // eventually this should be a word segmentation;
1915 // for now just treat each character as a word.
20 - // Not for LuceneSearch, because LSearch will
21 - // split the text to words itself.
2216 // @todo Fixme: only do this for Han characters...
23 - $s = preg_replace(
24 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
25 - " $1 ", $s);
26 - $s = preg_replace( '/ +/', ' ', $s );
 17+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
 18+ $s = self::wordSegmentation( $s, $reg );
2719 }
2820
2921 $s = trim( $s );
3022
3123 // Do general case folding and UTF-8 armoring
32 - $s = parent::stripForSearch( $s );
 24+ $s = parent::stripForSearch( $s, $doStrip );
3325 wfProfileOut( __METHOD__ );
3426 return $s;
3527 }

Follow-up revisions

Revision | Commit summary | Author | Date
r60766 | follow-up r60742. adapt to the code changes made in r60764. | philip | 04:53, 7 January 2010
r60796 | follow-up r60764. compatible fix. | philip | 17:48, 7 January 2010
r61856 | Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearc... | philip | 15:09, 2 February 2010

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r60743 | 1. Add conditions to stripForSearch for LuceneSearch / MWSearch.... | philip | 19:51, 6 January 2010

Comments

#Comment by Raymond (talk | contribs)   17:40, 7 January 2010

Seen at translatewiki:

PHP Strict Standards: Declaration of LanguageGan::stripForSearch() should be compatible with that of LanguageZh::stripForSearch() in /var/www/w/languages/classes/LanguageGan.php on line 102
#Comment by PhiLiP (talk | contribs)   17:49, 7 January 2010

Fixed in r60796.

Status & tagging log