Index: trunk/phase3/languages/Language.php |
— | — | @@ -1695,9 +1695,9 @@ |
1696 | 1696 | * @param $string String |
1697 | 1697 | * @return String |
1698 | 1698 | */ |
1699 | | - function stripForSearch( $string ) { |
1700 | | - global $wgDBtype, $wgSearchType; |
1701 | | - if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) { |
| 1699 | + function stripForSearch( $string, $doStrip = true ) { |
| 1700 | + global $wgDBtype; |
| 1701 | + if ( $wgDBtype != 'mysql' || $doStrip == false ) { |
1702 | 1702 | return $string; |
1703 | 1703 | } |
1704 | 1704 | |
— | — | @@ -1767,6 +1767,22 @@ |
1768 | 1768 | return $this->minSearchLength; |
1769 | 1769 | } |
1770 | 1770 | |
| 1771 | + /** |
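| 1772 | + * Convert double-width (fullwidth) roman characters to their single-width ASCII equivalents. |
| 1773 | + * Nominal range: U+FF00-U+FF5F ~= U+0020-U+007F. NOTE(review): the second pattern stops at \x99 (U+FF59), so U+FF5A and above (e.g. fullwidth "z") pass through unconverted - confirm intent. |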
| 1774 | + */ |
| 1775 | + protected static function convertDoubleWidth( $string ) { |
| 1776 | + $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string ); |
| 1777 | + $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string ); |
| 1778 | + return $string; |
| 1779 | + } |
| 1780 | + |
| 1781 | + protected static function wordSegmentation( $string, $pattern ) { |
| 1782 | + $string = preg_replace( $pattern, " $1 ", $string ); |
| 1783 | + $string = preg_replace( '/ +/', ' ', $string ); |
| 1784 | + return $string; |
| 1785 | + } |
| 1786 | + |
1771 | 1787 | function convertForSearchResult( $termsArray ) { |
1772 | 1788 | # some languages, e.g. Chinese, need to do a conversion |
1773 | 1789 | # in order for search results to be displayed correctly |
Index: trunk/phase3/languages/classes/LanguageZh_hans.php |
— | — | @@ -8,33 +8,25 @@ |
9 | 9 | return false; |
10 | 10 | } |
11 | 11 | |
12 | | - function stripForSearch( $string ) { |
| 12 | + function stripForSearch( $string, $doStrip = true ) { |
13 | 13 | wfProfileIn( __METHOD__ ); |
14 | | - global $wgSearchType; |
15 | 14 | |
16 | | - $s = $string; |
| 15 | + // Double-width roman characters |
| 16 | + $s = self::convertDoubleWidth( $string ); |
17 | 17 | |
18 | | - // Double-width roman characters: ff00-ff5f ~= 0020-007f |
19 | | - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
20 | | - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
21 | | - |
22 | | - if ( $wgSearchType != 'LuceneSearch' ) { |
| 18 | + if ( $doStrip == true ) { |
23 | 19 | // Eventually this should be a word segmentation; |
24 | 20 | // for now just treat each character as a word. |
25 | | - // Not for LuceneSearch, because LSearch will |
26 | | - // split the text to words itself. |
27 | 21 | // @todo Fixme: only do this for Han characters... |
28 | | - $s = preg_replace( |
29 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
30 | | - " $1 ", $s); |
31 | | - $s = preg_replace( '/ +/', ' ', $s ); |
| 22 | + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
| 23 | + $s = self::wordSegmentation( $s, $reg ); |
32 | 24 | } |
33 | 25 | |
34 | 26 | $s = trim( $s ); |
35 | 27 | |
36 | 28 | // Do general case folding and UTF-8 armoring |
37 | | - $s = parent::stripForSearch( $s ); |
| 29 | + $s = parent::stripForSearch( $s, $doStrip ); |
38 | 30 | wfProfileOut( __METHOD__ ); |
39 | 31 | return $s; |
40 | 32 | } |
41 | | -} |
| 33 | +} |
\ No newline at end of file |
Index: trunk/phase3/languages/classes/LanguageJa.php |
— | — | @@ -6,18 +6,15 @@ |
7 | 7 | * @ingroup Language |
8 | 8 | */ |
9 | 9 | class LanguageJa extends Language { |
10 | | - function stripForSearch( $string ) { |
11 | | - # MySQL fulltext index doesn't grok utf-8, so we |
12 | | - # need to fold cases and convert to hex |
| 10 | + function stripForSearch( $string, $doStrip = true ) { |
| 11 | + |
13 | 12 | $s = $string; |
14 | 13 | |
15 | | - # not for LuceneSearch, because LSearch |
16 | | - # will split the text to words itself |
17 | | - if ( $wgSearchType != 'LuceneSearch' ) { |
18 | | - # Strip known punctuation ? |
19 | | - #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
| 14 | + if ( $doStrip == true ) { |
| 15 | + // Strip known punctuation ? |
| 16 | + // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
20 | 17 | |
21 | | - # Space strings of like hiragana/katakana/kanji |
| 18 | + // Put spaces around each run of same-script characters (hiragana, katakana or kanji) |
22 | 19 | $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f |
23 | 20 | $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff |
24 | 21 | $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' |
— | — | @@ -25,14 +22,14 @@ |
26 | 23 | . '|\xe9[\x80-\xa5][\x80-\xbf]' |
27 | 24 | . '|\xe9\xa6[\x80-\x99])'; |
28 | 25 | # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 |
29 | | - $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); |
| 26 | + $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; |
| 27 | + $s = self::wordSegmentation( $s, $reg ); |
30 | 28 | } |
31 | | - # Double-width roman characters: ff00-ff5f ~= 0020-007f |
32 | | - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
33 | | - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
| 29 | + // Double-width roman characters |
| 30 | + $s = self::convertDoubleWidth( $s ); |
34 | 31 | |
35 | 32 | # Do general case folding and UTF-8 armoring |
36 | | - return parent::stripForSearch( $s ); |
| 33 | + return parent::stripForSearch( $s, $doStrip ); |
37 | 34 | } |
38 | 35 | |
39 | 36 | # Italic is not appropriate for Japanese script |
Index: trunk/phase3/languages/classes/LanguageGan.php |
— | — | @@ -137,43 +137,14 @@ |
138 | 138 | } |
139 | 139 | |
140 | 140 | // word segmentation |
141 | | - function stripForSearch( $string ) { |
142 | | - wfProfileIn( __METHOD__ ); |
143 | | - global $wgSearchType; |
144 | | - |
145 | | - // always convert to gan-hans before indexing. it should be |
146 | | - // better to use gan-hans for search, since conversion from |
147 | | - // Traditional to Simplified is less ambiguous than the |
148 | | - // other way around |
149 | | - $s = $this->mConverter->autoConvert($string, 'gan-hans'); |
150 | | - |
151 | | - // Double-width roman characters: ff00-ff5f ~= 0020-007f |
152 | | - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
153 | | - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
154 | | - |
155 | | - if ( $wgSearchType != 'LuceneSearch' ) { |
156 | | - // eventually this should be a word segmentation; |
157 | | - // for now just treat each character as a word. |
158 | | - // Not for LuceneSearch, because LSearch will |
159 | | - // split the text to words itself. |
160 | | - // @todo Fixme: only do this for Han characters... |
161 | | - $s = preg_replace( |
162 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
163 | | - " $1 ", $s); |
164 | | - $s = preg_replace( '/ +/', ' ', $s ); |
165 | | - } |
166 | | - |
167 | | - $s = trim( $s ); |
168 | | - |
169 | | - // Do general case folding and UTF-8 armoring |
170 | | - $s = parent::stripForSearch( $s ); |
171 | | - wfProfileOut( __METHOD__ ); |
172 | | - return $s; |
173 | | - |
| 141 | + function stripForSearch( $string, $doStrip = true ) { |
| 142 | + // LanguageZh::stripForSearch |
| 143 | + return parent::stripForSearch( $string, $doStrip, 'gan-hans' ); |
174 | 144 | } |
175 | 145 | |
176 | 146 | function convertForSearchResult( $termsArray ) { |
177 | 147 | $terms = implode( '|', $termsArray ); |
| 148 | + $terms = self::convertDoubleWidth( $terms ); |
178 | 149 | $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) ); |
179 | 150 | $ret = array_unique( explode('|', $terms) ); |
180 | 151 | return $ret; |
Index: trunk/phase3/languages/classes/LanguageZh.php |
— | — | @@ -173,15 +173,16 @@ |
174 | 174 | } |
175 | 175 | |
176 | 176 | // word segmentation |
177 | | - function stripForSearch( $string ) { |
| 177 | + function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) { |
178 | 178 | wfProfileIn( __METHOD__ ); |
179 | 179 | |
180 | 180 | // always convert to zh-hans before indexing. it should be |
181 | 181 | // better to use zh-hans for search, since conversion from |
182 | 182 | // Traditional to Simplified is less ambiguous than the |
183 | 183 | // other way around |
184 | | - $s = $this->mConverter->autoConvert( $string, 'zh-hans' ); |
185 | | - $s = parent::stripForSearch( $s ); |
| 184 | + $s = $this->mConverter->autoConvert( $string, $autoVariant ); |
| 185 | + // LanguageZh_hans::stripForSearch |
| 186 | + $s = parent::stripForSearch( $s, $doStrip ); |
186 | 187 | wfProfileOut( __METHOD__ ); |
187 | 188 | return $s; |
188 | 189 | |
— | — | @@ -189,6 +190,7 @@ |
190 | 191 | |
191 | 192 | function convertForSearchResult( $termsArray ) { |
192 | 193 | $terms = implode( '|', $termsArray ); |
| 194 | + $terms = self::convertDoubleWidth( $terms ); |
193 | 195 | $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) ); |
194 | 196 | $ret = array_unique( explode('|', $terms) ); |
195 | 197 | return $ret; |
Index: trunk/phase3/languages/classes/LanguageYue.php |
— | — | @@ -3,32 +3,24 @@ |
4 | 4 | * @ingroup Language |
5 | 5 | */ |
6 | 6 | class LanguageYue extends Language { |
7 | | - function stripForSearch( $string ) { |
| 7 | + function stripForSearch( $string, $doStrip = true ) { |
8 | 8 | wfProfileIn( __METHOD__ ); |
9 | | - global $wgSearchType; |
10 | 9 | |
11 | | - $s = $string; |
| 10 | + // Double-width roman characters |
| 11 | + $s = self::convertDoubleWidth( $string ); |
12 | 12 | |
13 | | - // Double-width roman characters: ff00-ff5f ~= 0020-007f |
14 | | - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
15 | | - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
16 | | - |
17 | | - if ( $wgSearchType != 'LuceneSearch' ) { |
| 13 | + if ( $doStrip == true ) { |
18 | 14 | // eventually this should be a word segmentation; |
19 | 15 | // for now just treat each character as a word. |
20 | | - // Not for LuceneSearch, because LSearch will |
21 | | - // split the text to words itself. |
22 | 16 | // @todo Fixme: only do this for Han characters... |
23 | | - $s = preg_replace( |
24 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
25 | | - " $1 ", $s); |
26 | | - $s = preg_replace( '/ +/', ' ', $s ); |
| 17 | + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
| 18 | + $s = self::wordSegmentation( $s, $reg ); |
27 | 19 | } |
28 | 20 | |
29 | 21 | $s = trim( $s ); |
30 | 22 | |
31 | 23 | // Do general case folding and UTF-8 armoring |
32 | | - $s = parent::stripForSearch( $s ); |
| 24 | + $s = parent::stripForSearch( $s, $doStrip ); |
33 | 25 | wfProfileOut( __METHOD__ ); |
34 | 26 | return $s; |
35 | 27 | } |