Index: trunk/phase3/languages/Language.php |
— | — | @@ -1697,12 +1697,11 @@ |
1698 | 1698 | * @return String |
1699 | 1699 | */ |
1700 | 1700 | function stripForSearch( $string ) { |
1701 | | - global $wgDBtype; |
1702 | | - if ( $wgDBtype != 'mysql' ) { |
| 1701 | + global $wgDBtype, $wgSearchType; |
| 1702 | + if ( $wgDBtype != 'mysql' or $wgSearchType == 'LuceneSearch' ) { |
1703 | 1703 | return $string; |
1704 | 1704 | } |
1705 | 1705 | |
1706 | | - |
1707 | 1706 | wfProfileIn( __METHOD__ ); |
1708 | 1707 | |
1709 | 1708 | // MySQL fulltext index doesn't grok utf-8, so we |
Index: trunk/phase3/languages/classes/LanguageZh_hans.php |
— | — | @@ -9,18 +9,32 @@ |
10 | 10 | } |
11 | 11 | |
12 | 12 | function stripForSearch( $string ) { |
13 | | - // Eventually this should be a word segmentation; |
14 | | - // for now just treat each character as a word. |
15 | | - // |
16 | | - // Note we put a space on both sides to cover cases |
17 | | - // where a number or Latin char follows a Han char. |
18 | | - // |
19 | | - // @todo Fixme: only do this for Han characters... |
20 | | - $t = preg_replace( |
21 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
22 | | - " $1 ", $string); |
23 | | - $t = preg_replace( '/ +/', ' ', $t ); |
24 | | - $t = trim( $t ); |
25 | | - return parent::stripForSearch( $t ); |
| 13 | + wfProfileIn( __METHOD__ ); |
| 14 | + global $wgSearchType; |
| 15 | + |
| 16 | + $s = $string; |
| 17 | + |
| 18 | + // Double-width roman characters: ff00-ff5f ~= 0020-007f |
| 19 | + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
| 20 | + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
| 21 | + |
| 22 | + if ( $wgSearchType != 'LuceneSearch' ) { |
| 23 | + // Eventually this should be a word segmentation; |
| 24 | + // for now just treat each character as a word. |
| 25 | + // Not needed for LuceneSearch, because it |
| 26 | + // splits the text into words itself. |
| 27 | + // @todo Fixme: only do this for Han characters... |
| 28 | + $s = preg_replace( |
| 29 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
| 30 | + " $1 ", $s); |
| 31 | + $s = preg_replace( '/ +/', ' ', $s ); |
| 32 | + } |
| 33 | + |
| 34 | + $s = trim( $s ); |
| 35 | + |
| 36 | + // Do general case folding and UTF-8 armoring |
| 37 | + $s = parent::stripForSearch( $s ); |
| 38 | + wfProfileOut( __METHOD__ ); |
| 39 | + return $s; |
26 | 40 | } |
27 | 41 | } |
Index: trunk/phase3/languages/classes/LanguageGan.php |
— | — | @@ -139,23 +139,36 @@ |
140 | 140 | // word segmentation |
141 | 141 | function stripForSearch( $string ) { |
142 | 142 | wfProfileIn( __METHOD__ ); |
| 143 | + global $wgSearchType; |
143 | 144 | |
144 | | - // eventually this should be a word segmentation |
145 | | - // for now just treat each character as a word |
146 | | - // @todo Fixme: only do this for Han characters... |
147 | | - $t = preg_replace( |
148 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
149 | | - " $1", $string); |
| 145 | + // Always convert to gan-hans before indexing. It should be |
| 146 | + // better to use gan-hans for search, since conversion from |
| 147 | + // Traditional to Simplified is less ambiguous than the |
| 148 | + // other way around. |
| 149 | + $s = $this->mConverter->autoConvert($string, 'gan-hans'); |
150 | 150 | |
151 | | - //always convert to gan-hans before indexing. it should be |
152 | | - //better to use gan-hans for search, since conversion from |
153 | | - //Traditional to Simplified is less ambiguous than the |
154 | | - //other way around |
| 151 | + // Double-width roman characters: ff00-ff5f ~= 0020-007f |
| 152 | + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
| 153 | + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
155 | 154 | |
156 | | - $t = $this->mConverter->autoConvert($t, 'gan-hans'); |
157 | | - $t = parent::stripForSearch( $t ); |
| 155 | + if ( $wgSearchType != 'LuceneSearch' ) { |
| 156 | + // eventually this should be a word segmentation; |
| 157 | + // for now just treat each character as a word. |
| 158 | + // Not needed for LuceneSearch, because it |
| 159 | + // splits the text into words itself. |
| 160 | + // @todo Fixme: only do this for Han characters... |
| 161 | + $s = preg_replace( |
| 162 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
| 163 | + " $1 ", $s); |
| 164 | + $s = preg_replace( '/ +/', ' ', $s ); |
| 165 | + } |
| 166 | + |
| 167 | + $s = trim( $s ); |
| 168 | + |
| 169 | + // Do general case folding and UTF-8 armoring |
| 170 | + $s = parent::stripForSearch( $s ); |
158 | 171 | wfProfileOut( __METHOD__ ); |
159 | | - return $t; |
| 172 | + return $s; |
160 | 173 | |
161 | 174 | } |
162 | 175 | |
Index: trunk/phase3/languages/classes/LanguageJa.php |
— | — | @@ -11,23 +11,26 @@ |
12 | 12 | # need to fold cases and convert to hex |
13 | 13 | $s = $string; |
14 | 14 | |
15 | | - # Strip known punctuation ? |
16 | | - #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
| 15 | + # Not needed for LuceneSearch, because it |
| 16 | + # splits the text into words itself |
| 17 | + if ( $wgSearchType != 'LuceneSearch' ) { |
| 18 | + # Strip known punctuation ? |
| 19 | + #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
17 | 20 | |
18 | | - # Space strings of like hiragana/katakana/kanji |
19 | | - $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f |
20 | | - $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff |
21 | | - $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' |
22 | | - . '|[\xe4-\xe8][\x80-\xbf]{2}' |
23 | | - . '|\xe9[\x80-\xa5][\x80-\xbf]' |
24 | | - . '|\xe9\xa6[\x80-\x99])'; |
25 | | - # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 |
26 | | - $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); |
27 | | - |
| 21 | + # Space strings of like hiragana/katakana/kanji |
| 22 | + $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f |
| 23 | + $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff |
| 24 | + $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' |
| 25 | + . '|[\xe4-\xe8][\x80-\xbf]{2}' |
| 26 | + . '|\xe9[\x80-\xa5][\x80-\xbf]' |
| 27 | + . '|\xe9\xa6[\x80-\x99])'; |
| 28 | + # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 |
| 29 | + $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); |
| 30 | + } |
28 | 31 | # Double-width roman characters: ff00-ff5f ~= 0020-007f |
29 | 32 | $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
30 | 33 | $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
31 | | - |
| 34 | + |
32 | 35 | # Do general case folding and UTF-8 armoring |
33 | 36 | return parent::stripForSearch( $s ); |
34 | 37 | } |
Index: trunk/phase3/languages/classes/LanguageYue.php |
— | — | @@ -5,17 +5,31 @@ |
6 | 6 | class LanguageYue extends Language { |
7 | 7 | function stripForSearch( $string ) { |
8 | 8 | wfProfileIn( __METHOD__ ); |
| 9 | + global $wgSearchType; |
9 | 10 | |
10 | | - // eventually this should be a word segmentation |
11 | | - // for now just treat each character as a word |
12 | | - // @todo Fixme: only do this for Han characters... |
13 | | - $t = preg_replace( |
14 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
15 | | - " $1", $string); |
| 11 | + $s = $string; |
16 | 12 | |
| 13 | + // Double-width roman characters: ff00-ff5f ~= 0020-007f |
| 14 | + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); |
| 15 | + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); |
| 16 | + |
| 17 | + if ( $wgSearchType != 'LuceneSearch' ) { |
| 18 | + // eventually this should be a word segmentation; |
| 19 | + // for now just treat each character as a word. |
| 20 | + // Not needed for LuceneSearch, because it |
| 21 | + // splits the text into words itself. |
| 22 | + // @todo Fixme: only do this for Han characters... |
| 23 | + $s = preg_replace( |
| 24 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
| 25 | + " $1 ", $s); |
| 26 | + $s = preg_replace( '/ +/', ' ', $s ); |
| 27 | + } |
| 28 | + |
| 29 | + $s = trim( $s ); |
| 30 | + |
17 | 31 | // Do general case folding and UTF-8 armoring |
18 | | - $t = parent::stripForSearch( $t ); |
| 32 | + $s = parent::stripForSearch( $s ); |
19 | 33 | wfProfileOut( __METHOD__ ); |
20 | | - return $t; |
| 34 | + return $s; |
21 | 35 | } |
22 | 36 | } |
Index: trunk/phase3/languages/classes/LanguageZh.php |
— | — | @@ -176,15 +176,14 @@ |
177 | 177 | function stripForSearch( $string ) { |
178 | 178 | wfProfileIn( __METHOD__ ); |
179 | 179 | |
180 | | - //always convert to zh-hans before indexing. it should be |
181 | | - //better to use zh-hans for search, since conversion from |
182 | | - //Traditional to Simplified is less ambiguous than the |
183 | | - //other way around |
184 | | - |
185 | | - $t = $this->mConverter->autoConvert( $string, 'zh-hans' ); |
186 | | - $t = parent::stripForSearch( $t ); |
| 180 | + // Always convert to zh-hans before indexing. It should be |
| 181 | + // better to use zh-hans for search, since conversion from |
| 182 | + // Traditional to Simplified is less ambiguous than the |
| 183 | + // other way around. |
| 184 | + $s = $this->mConverter->autoConvert( $string, 'zh-hans' ); |
| 185 | + $s = parent::stripForSearch( $s ); |
187 | 186 | wfProfileOut( __METHOD__ ); |
188 | | - return $t; |
| 187 | + return $s; |
189 | 188 | |
190 | 189 | } |
191 | 190 | |