r60764 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r60763 | r60764 | r60765 >
Date:	04:50, 7 January 2010
Author:	philip
Status:	ok (Comments)
Tags:
Comment:	Follow-up to r60743. 1. Changed the conditions so that they apply not only to LuceneSearch but more generally to other search types. 2. Reduced code duplication.
Modified paths:	/trunk/phase3/languages/Language.php (modified) (history) /trunk/phase3/languages/classes/LanguageGan.php (modified) (history) /trunk/phase3/languages/classes/LanguageJa.php (modified) (history) /trunk/phase3/languages/classes/LanguageYue.php (modified) (history) /trunk/phase3/languages/classes/LanguageZh.php (modified) (history) /trunk/phase3/languages/classes/LanguageZh_hans.php (modified) (history)

Diff [purge]

Index: trunk/phase3/languages/Language.php
—	—	@@ -1695,9 +1695,9 @@
1696	1696	* @param $string String
1697	1697	* @return String
1698	1698	*/
1699		~~- function stripForSearch( $string ) {~~
1700		~~- global $wgDBtype, $wgSearchType;~~
1701		~~- if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {~~
	1699	+ function stripForSearch( $string, $doStrip = true ) {
	1700	+ global $wgDBtype;
	1701	+ if ( $wgDBtype != 'mysql' || $doStrip == false ) {
1702	1702	return $string;
1703	1703	}
1704	1704
—	—	@@ -1767,6 +1767,22 @@
1768	1768	return $this->minSearchLength;
1769	1769	}
1770	1770
	1771	+ /**
	1772	+ * convert double-width roman characters to single-width.
	1773	+ * range: ff00-ff5f ~= 0020-007f
	1774	+ */
	1775	+ protected static function convertDoubleWidth( $string ) {
	1776	+ $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
	1777	+ $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
	1778	+ return $string;
	1779	+ }
	1780	+
	1781	+ protected static function wordSegmentation( $string, $pattern ) {
	1782	+ $string = preg_replace( $pattern, " $1 ", $string );
	1783	+ $string = preg_replace( '/ +/', ' ', $string );
	1784	+ return $string;
	1785	+ }
	1786	+
1771	1787	function convertForSearchResult( $termsArray ) {
1772	1788	# some languages, e.g. Chinese, need to do a conversion
1773	1789	# in order for search results to be displayed correctly
Index: trunk/phase3/languages/classes/LanguageZh_hans.php
—	—	@@ -8,33 +8,25 @@
9	9	return false;
10	10	}
11	11
12		~~- function stripForSearch( $string ) {~~
	12	+ function stripForSearch( $string, $doStrip = true ) {
13	13	wfProfileIn( __METHOD__ );
14		~~- global $wgSearchType;~~
15	14
16		~~- $s = $string;~~
	15	+ // Double-width roman characters
	16	+ $s = self::convertDoubleWidth( $string );
17	17
18		~~- // Double-width roman characters: ff00-ff5f ~= 0020-007f~~
19		~~- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );~~
20		~~- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );~~
21		-
22		~~- if ( $wgSearchType != 'LuceneSearch' ) {~~
	18	+ if ( $doStrip == true ) {
23	19	// Eventually this should be a word segmentation;
24	20	// for now just treat each character as a word.
25		~~- // Not for LuceneSearch, because LSearch will~~
26		~~- // split the text to words itself.~~
27	21	// @todo Fixme: only do this for Han characters...
28		~~- $s = preg_replace(~~
29		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",~~
30		~~- " $1 ", $s);~~
31		~~- $s = preg_replace( '/ +/', ' ', $s );~~
	22	+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
	23	+ $s = self::wordSegmentation( $s, $reg );
32	24	}
33	25
34	26	$s = trim( $s );
35	27
36	28	// Do general case folding and UTF-8 armoring
37		~~- $s = parent::stripForSearch( $s );~~
	29	+ $s = parent::stripForSearch( $s, $doStrip );
38	30	wfProfileOut( __METHOD__ );
39	31	return $s;
40	32	}
41		-}
	33	+}
\ No newline at end of file
Index: trunk/phase3/languages/classes/LanguageJa.php
—	—	@@ -6,18 +6,15 @@
7	7	* @ingroup Language
8	8	*/
9	9	class LanguageJa extends Language {
10		~~- function stripForSearch( $string ) {~~
11		~~- # MySQL fulltext index doesn't grok utf-8, so we~~
12		~~- # need to fold cases and convert to hex~~
	10	+ function stripForSearch( $string, $doStrip = true ) {
	11	+
13	12	$s = $string;
14	13
15		~~- # not for LuceneSearch, because LSearch~~
16		~~- # will split the text to words itself~~
17		~~- if ( $wgSearchType != 'LuceneSearch' ) {~~
18		~~- # Strip known punctuation ?~~
19		~~- #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f~~
	14	+ if ( $doStrip == true ) {
	15	+ // Strip known punctuation ?
	16	+ // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
20	17
21		~~- # Space strings of like hiragana/katakana/kanji~~
	18	+ // Space strings of like hiragana/katakana/kanji
22	19	$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
23	20	$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
24	21	$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
—	—	@@ -25,14 +22,14 @@
26	23	. '|\xe9[\x80-\xa5][\x80-\xbf]'
27	24	. '|\xe9\xa6[\x80-\x99])';
28	25	# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
29		~~- $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );~~
	26	+ $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
	27	+ $s = self::wordSegmentation( $s, $reg );
30	28	}
31		~~- # Double-width roman characters: ff00-ff5f ~= 0020-007f~~
32		~~- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );~~
33		~~- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );~~
	29	+ // Double-width roman characters
	30	+ $s = self::convertDoubleWidth( $s );
34	31
35	32	# Do general case folding and UTF-8 armoring
36		~~- return parent::stripForSearch( $s );~~
	33	+ return parent::stripForSearch( $s, $doStrip );
37	34	}
38	35
39	36	# Italic is not appropriate for Japanese script
Index: trunk/phase3/languages/classes/LanguageGan.php
—	—	@@ -137,43 +137,14 @@
138	138	}
139	139
140	140	// word segmentation
141		~~- function stripForSearch( $string ) {~~
142		~~- wfProfileIn( __METHOD__ );~~
143		~~- global $wgSearchType;~~
144		-
145		~~- // always convert to gan-hans before indexing. it should be~~
146		~~- // better to use gan-hans for search, since conversion from~~
147		~~- // Traditional to Simplified is less ambiguous than the~~
148		~~- // other way around~~
149		~~- $s = $this->mConverter->autoConvert($string, 'gan-hans');~~
150		-
151		~~- // Double-width roman characters: ff00-ff5f ~= 0020-007f~~
152		~~- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );~~
153		~~- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );~~
154		-
155		~~- if ( $wgSearchType != 'LuceneSearch' ) {~~
156		~~- // eventually this should be a word segmentation;~~
157		~~- // for now just treat each character as a word.~~
158		~~- // Not for LuceneSearch, because LSearch will~~
159		~~- // split the text to words itself.~~
160		~~- // @todo Fixme: only do this for Han characters...~~
161		~~- $s = preg_replace(~~
162		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",~~
163		~~- " $1 ", $s);~~
164		~~- $s = preg_replace( '/ +/', ' ', $s );~~
165		~~- }~~
166		-
167		~~- $s = trim( $s );~~
168		-
169		~~- // Do general case folding and UTF-8 armoring~~
170		~~- $s = parent::stripForSearch( $s );~~
171		~~- wfProfileOut( __METHOD__ );~~
172		~~- return $s;~~
173		-
	141	+ function stripForSearch( $string, $doStrip = true ) {
	142	+ // LanguageZh::stripForSearch
	143	+ return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
174	144	}
175	145
176	146	function convertForSearchResult( $termsArray ) {
177	147	$terms = implode( '|', $termsArray );
	148	+ $terms = self::convertDoubleWidth( $terms );
178	149	$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
179	150	$ret = array_unique( explode('|', $terms) );
180	151	return $ret;
Index: trunk/phase3/languages/classes/LanguageZh.php
—	—	@@ -173,15 +173,16 @@
174	174	}
175	175
176	176	// word segmentation
177		~~- function stripForSearch( $string ) {~~
	177	+ function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
178	178	wfProfileIn( __METHOD__ );
179	179
180	180	// always convert to zh-hans before indexing. it should be
181	181	// better to use zh-hans for search, since conversion from
182	182	// Traditional to Simplified is less ambiguous than the
183	183	// other way around
184		~~- $s = $this->mConverter->autoConvert( $string, 'zh-hans' );~~
185		~~- $s = parent::stripForSearch( $s );~~
	184	+ $s = $this->mConverter->autoConvert( $string, $autoVariant );
	185	+ // LanguageZh_hans::stripForSearch
	186	+ $s = parent::stripForSearch( $s, $doStrip );
186	187	wfProfileOut( __METHOD__ );
187	188	return $s;
188	189
—	—	@@ -189,6 +190,7 @@
190	191
191	192	function convertForSearchResult( $termsArray ) {
192	193	$terms = implode( '|', $termsArray );
	194	+ $terms = self::convertDoubleWidth( $terms );
193	195	$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
194	196	$ret = array_unique( explode('|', $terms) );
195	197	return $ret;
Index: trunk/phase3/languages/classes/LanguageYue.php
—	—	@@ -3,32 +3,24 @@
4	4	* @ingroup Language
5	5	*/
6	6	class LanguageYue extends Language {
7		~~- function stripForSearch( $string ) {~~
	7	+ function stripForSearch( $string, $doStrip = true ) {
8	8	wfProfileIn( __METHOD__ );
9		~~- global $wgSearchType;~~
10	9
11		~~- $s = $string;~~
	10	+ // Double-width roman characters
	11	+ $s = self::convertDoubleWidth( $string );
12	12
13		~~- // Double-width roman characters: ff00-ff5f ~= 0020-007f~~
14		~~- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );~~
15		~~- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );~~
16		-
17		~~- if ( $wgSearchType != 'LuceneSearch' ) {~~
	13	+ if ( $doStrip == true ) {
18	14	// eventually this should be a word segmentation;
19	15	// for now just treat each character as a word.
20		~~- // Not for LuceneSearch, because LSearch will~~
21		~~- // split the text to words itself.~~
22	16	// @todo Fixme: only do this for Han characters...
23		~~- $s = preg_replace(~~
24		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",~~
25		~~- " $1 ", $s);~~
26		~~- $s = preg_replace( '/ +/', ' ', $s );~~
	17	+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
	18	+ $s = self::wordSegmentation( $s, $reg );
27	19	}
28	20
29	21	$s = trim( $s );
30	22
31	23	// Do general case folding and UTF-8 armoring
32		~~- $s = parent::stripForSearch( $s );~~
	24	+ $s = parent::stripForSearch( $s, $doStrip );
33	25	wfProfileOut( __METHOD__ );
34	26	return $s;
35	27	}

Follow-up revisions

Revision	Commit summary	Author	Date
r60766	follow-up r60742. adapt to the code changes made in r60764.	philip	04:53, 7 January 2010
r60796	follow-up r60764. compatible fix.	philip	17:48, 7 January 2010
r61856	Follow up r60742, r60743, r60764, r60766, r61214, r61390. Split stripForSearc...	philip	15:09, 2 February 2010

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r60743	1. Add conditions to stripForSearch for LuceneSearch / MWSearch....	philip	19:51, 6 January 2010

Comments

#Comment by Raymond (talk | contribs) 17:40, 7 January 2010

Seen at translatewiki:

PHP Strict Standards: Declaration of LanguageGan::stripForSearch() should be compatible with that of LanguageZh::stripForSearch() in /var/www/w/languages/classes/LanguageGan.php on line 102

#Comment by PhiLiP (talk | contribs) 17:49, 7 January 2010

fixed on r60796.

Status & tagging log

20:18, 22 February 2010 MarkAHershberger (talk | contribs) changed the status of r60764 [removed: new added: ok]
19:26, 7 January 2010 Raymond (talk | contribs) changed the status of r60764 [removed: fixme added: new]
17:40, 7 January 2010 Raymond (talk | contribs) changed the status of r60764 [removed: new added: fixme]