Index: trunk/phase3/includes/search/SearchMySQL.php |
— | — | @@ -80,7 +80,7 @@ |
81 | 81 | // fulltext engine. |
82 | 82 | // For Chinese this also inserts spaces between adjacent Han characters. |
83 | 83 | $strippedVariants = array_map( |
84 | | - array( $wgContLang, 'stripForSearch' ), |
| 84 | + array( $wgContLang, 'normalizeForSearch' ), |
85 | 85 | $variants ); |
86 | 86 | |
87 | 87 | // Some languages such as Chinese force all variants to a canonical |
— | — | @@ -95,7 +95,7 @@ |
96 | 96 | $stripped = $this->normalizeText( $stripped ); |
97 | 97 | if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { |
98 | 98 | // Hack for Chinese: we need to toss in quotes for |
99 | | - // multiple-character phrases since stripForSearch() |
| 99 | + // multiple-character phrases since normalizeForSearch() |
100 | 100 | // added spaces between them to make word breaks. |
101 | 101 | $stripped = '"' . trim( $stripped ) . '"'; |
102 | 102 | } |
— | — | @@ -324,13 +324,16 @@ |
325 | 325 | global $wgContLang; |
326 | 326 | |
327 | 327 | wfProfileIn( __METHOD__ ); |
| 328 | + |
| 329 | + // Some languages such as Chinese require word segmentation |
| 330 | + $out = $wgContLang->wordSegmentation( $string ); |
328 | 331 | |
329 | 332 | // MySQL fulltext index doesn't grok utf-8, so we |
330 | 333 | // need to fold cases and convert to hex |
331 | 334 | $out = preg_replace_callback( |
332 | 335 | "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
333 | 336 | array( $this, 'stripForSearchCallback' ), |
334 | | - $wgContLang->lc( $string ) ); |
| 337 | + $wgContLang->lc( $out ) ); |
335 | 338 | |
336 | 339 | // And to add insult to injury, the default indexing |
337 | 340 | // ignores short words... Pad them so we can pass them |
Index: trunk/phase3/includes/search/SearchOracle.php |
— | — | @@ -217,7 +217,7 @@ |
218 | 218 | |
219 | 219 | private function escapeTerm($t) { |
220 | 220 | global $wgContLang; |
221 | | - $t = $wgContLang->stripForSearch($t); |
| 221 | + $t = $wgContLang->normalizeForSearch($t); |
222 | 222 | $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t; |
223 | 223 | $t = preg_replace('/^"(.*)"$/', '($1)', $t); |
224 | 224 | $t = preg_replace('/([-&|])/', '\\\\$1', $t); |
Index: trunk/phase3/includes/search/SearchIBM_DB2.php |
— | — | @@ -158,10 +158,10 @@ |
159 | 159 | if( is_array( $temp_terms )) { |
160 | 160 | $temp_terms = array_unique( array_values( $temp_terms )); |
161 | 161 | foreach( $temp_terms as $t ) |
162 | | - $q[] = $terms[1] . $wgContLang->stripForSearch( $t ); |
| 162 | + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t ); |
163 | 163 | } |
164 | 164 | else |
165 | | - $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] ); |
| 165 | + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] ); |
166 | 166 | |
167 | 167 | if (!empty($terms[3])) { |
168 | 168 | $regexp = preg_quote( $terms[3], '/' ); |
Index: trunk/phase3/includes/search/SearchSqlite.php |
— | — | @@ -92,7 +92,7 @@ |
93 | 93 | // fulltext engine. |
94 | 94 | // For Chinese this also inserts spaces between adjacent Han characters. |
95 | 95 | $strippedVariants = array_map( |
96 | | - array( $wgContLang, 'stripForSearch' ), |
| 96 | + array( $wgContLang, 'normalizeForSearch' ), |
97 | 97 | $variants ); |
98 | 98 | |
99 | 99 | // Some languages such as Chinese force all variants to a canonical |
— | — | @@ -106,7 +106,7 @@ |
107 | 107 | foreach( $strippedVariants as $stripped ) { |
108 | 108 | if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { |
109 | 109 | // Hack for Chinese: we need to toss in quotes for |
110 | | - // multiple-character phrases since stripForSearch() |
| 110 | + // multiple-character phrases since normalizeForSearch() |
111 | 111 | // added spaces between them to make word breaks. |
112 | 112 | $stripped = '"' . trim( $stripped ) . '"'; |
113 | 113 | } |
Index: trunk/phase3/includes/search/SearchUpdate.php |
— | — | @@ -43,7 +43,7 @@ |
44 | 44 | } |
45 | 45 | |
46 | 46 | # Language-specific strip/conversion |
47 | | - $text = $wgContLang->stripForSearch( $this->mText ); |
| 47 | + $text = $wgContLang->normalizeForSearch( $this->mText ); |
48 | 48 | |
49 | 49 | wfProfileIn( $fname.'-regexps' ); |
50 | 50 | $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", |
Index: trunk/phase3/includes/Title.php |
— | — | @@ -435,7 +435,7 @@ |
436 | 436 | global $wgContLang; |
437 | 437 | |
438 | 438 | $lc = SearchEngine::legalSearchChars() . '&#;'; |
439 | | - $t = $wgContLang->stripForSearch( $title ); |
| 439 | + $t = $wgContLang->normalizeForSearch( $title ); |
440 | 440 | $t = preg_replace( "/[^{$lc}]+/", ' ', $t ); |
441 | 441 | $t = $wgContLang->lc( $t ); |
442 | 442 | |
Index: trunk/phase3/languages/Language.php |
— | — | @@ -1686,15 +1686,26 @@ |
1687 | 1687 | function hasWordBreaks() { |
1688 | 1688 | return true; |
1689 | 1689 | } |
| 1690 | + |
| 1691 | + /** |
| 1692 | + * Some languages such as Chinese require word segmentation. |
| 1693 | + * Specify such segmentation by overriding this method in a derived class. |
| 1694 | + * |
| 1695 | + * @param $string String |
| 1696 | + * @return String |
| 1697 | + */ |
| 1698 | + function wordSegmentation( $string ) { |
| 1699 | + return $string; |
| 1700 | + } |
1690 | 1701 | |
1691 | 1702 | /** |
1692 | | - * Some languages have special punctuation to strip out. |
| 1703 | + * Some languages have special punctuation that needs to be normalized. |
1693 | 1704 | * Make such changes here. |
1694 | 1705 | * |
1695 | 1706 | * @param $string String |
1696 | 1707 | * @return String |
1697 | 1708 | */ |
1698 | | - function stripForSearch( $string, $doStrip = true ) { |
| 1709 | + function normalizeForSearch( $string ) { |
1699 | 1710 | return $string; |
1700 | 1711 | } |
1701 | 1712 | |
— | — | @@ -1708,7 +1719,7 @@ |
1709 | 1720 | return $string; |
1710 | 1721 | } |
1711 | 1722 | |
1712 | | - protected static function wordSegmentation( $string, $pattern ) { |
| 1723 | + protected static function insertSpace( $string, $pattern ) { |
1713 | 1724 | $string = preg_replace( $pattern, " $1 ", $string ); |
1714 | 1725 | $string = preg_replace( '/ +/', ' ', $string ); |
1715 | 1726 | return $string; |
Index: trunk/phase3/languages/classes/LanguageZh_hans.php |
— | — | @@ -7,25 +7,26 @@ |
8 | 8 | function hasWordBreaks() { |
9 | 9 | return false; |
10 | 10 | } |
11 | | - |
12 | | - function stripForSearch( $string, $doStrip = true ) { |
| 11 | + |
| 12 | + /** |
| 13 | + * Eventually this should be a word segmentation; |
| 14 | + * for now just treat each character as a word. |
| 15 | + * @todo Fixme: only do this for Han characters... |
| 16 | + */ |
| 17 | + function wordSegmentation( $string ) { |
| 18 | + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
| 19 | + $s = self::insertSpace( $string, $reg ); |
| 20 | + return $s; |
| 21 | + } |
| 22 | + |
| 23 | + function normalizeForSearch( $string ) { |
13 | 24 | wfProfileIn( __METHOD__ ); |
14 | 25 | |
15 | 26 | // Double-width roman characters |
16 | 27 | $s = self::convertDoubleWidth( $string ); |
17 | | - |
18 | | - if ( $doStrip == true ) { |
19 | | - // Eventually this should be a word segmentation; |
20 | | - // for now just treat each character as a word. |
21 | | - // @todo Fixme: only do this for Han characters... |
22 | | - $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
23 | | - $s = self::wordSegmentation( $s, $reg ); |
24 | | - } |
25 | | - |
26 | 28 | $s = trim( $s ); |
| 29 | + $s = parent::normalizeForSearch( $s ); |
27 | 30 | |
28 | | - // Do general case folding and UTF-8 armoring |
29 | | - $s = parent::stripForSearch( $s, $doStrip ); |
30 | 31 | wfProfileOut( __METHOD__ ); |
31 | 32 | return $s; |
32 | 33 | } |
Index: trunk/phase3/languages/classes/LanguageJa.php |
— | — | @@ -6,30 +6,29 @@ |
7 | 7 | * @ingroup Language |
8 | 8 | */ |
9 | 9 | class LanguageJa extends Language { |
10 | | - function stripForSearch( $string, $doStrip = true ) { |
| 10 | + function wordSegmentation( $string ) { |
| 11 | + // Strip known punctuation ? |
| 12 | + // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
11 | 13 | |
12 | | - $s = $string; |
| 14 | + // Space strings of like hiragana/katakana/kanji |
| 15 | + $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f |
| 16 | + $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff |
| 17 | + $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' |
| 18 | + . '|[\xe4-\xe8][\x80-\xbf]{2}' |
| 19 | + . '|\xe9[\x80-\xa5][\x80-\xbf]' |
| 20 | + . '|\xe9\xa6[\x80-\x99])'; |
| 21 | + # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 |
| 22 | + $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; |
| 23 | + $s = self::insertSpace( $string, $reg ); |
| 24 | + return $s; |
| 25 | + } |
13 | 26 | |
14 | | - if ( $doStrip == true ) { |
15 | | - // Strip known punctuation ? |
16 | | - // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f |
17 | | - |
18 | | - // Space strings of like hiragana/katakana/kanji |
19 | | - $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f |
20 | | - $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff |
21 | | - $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' |
22 | | - . '|[\xe4-\xe8][\x80-\xbf]{2}' |
23 | | - . '|\xe9[\x80-\xa5][\x80-\xbf]' |
24 | | - . '|\xe9\xa6[\x80-\x99])'; |
25 | | - # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 |
26 | | - $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; |
27 | | - $s = self::wordSegmentation( $s, $reg ); |
28 | | - } |
| 27 | + function normalizeForSearch( $string ) { |
29 | 28 | // Double-width roman characters |
30 | | - $s = self::convertDoubleWidth( $s ); |
| 29 | + $s = self::convertDoubleWidth( $string ); |
31 | 30 | |
32 | 31 | # Do general case folding and UTF-8 armoring |
33 | | - return parent::stripForSearch( $s, $doStrip ); |
| 32 | + return parent::normalizeForSearch( $s ); |
34 | 33 | } |
35 | 34 | |
36 | 35 | # Italic is not appropriate for Japanese script |
Index: trunk/phase3/languages/classes/LanguageGan.php |
— | — | @@ -135,9 +135,9 @@ |
136 | 136 | } |
137 | 137 | |
138 | 138 | // word segmentation |
139 | | - function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) { |
140 | | - // LanguageZh::stripForSearch |
141 | | - return parent::stripForSearch( $string, $doStrip, $autoVariant ); |
| 139 | + function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) { |
| 140 | + // LanguageZh::normalizeForSearch |
| 141 | + return parent::normalizeForSearch( $string, $autoVariant ); |
142 | 142 | } |
143 | 143 | |
144 | 144 | function convertForSearchResult( $termsArray ) { |
Index: trunk/phase3/languages/classes/LanguageZh.php |
— | — | @@ -170,8 +170,23 @@ |
171 | 171 | "\"$1\"", $text); |
172 | 172 | } |
173 | 173 | |
174 | | - // word segmentation |
175 | | - function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) { |
| 174 | + /** |
| 175 | + * word segmentation |
| 176 | + */ |
| 177 | + function wordSegmentation( $string ) { |
| 178 | + // LanguageZh_hans::wordSegmentation |
| 179 | + $s = parent::wordSegmentation( $string ); |
| 180 | + return $s; |
| 181 | + } |
| 182 | + |
| 183 | + /** |
| 184 | + * Automatically convert to zh-hans and normalize special characters. |
| 185 | + * |
| 186 | + * @param $string String |
| 187 | + * @param $autoVariant String, defaults to 'zh-hans' |
| 188 | + * @return String |
| 189 | + */ |
| 190 | + function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) { |
176 | 191 | wfProfileIn( __METHOD__ ); |
177 | 192 | |
178 | 193 | // always convert to zh-hans before indexing. it should be |
— | — | @@ -179,8 +194,8 @@ |
180 | 195 | // Traditional to Simplified is less ambiguous than the |
181 | 196 | // other way around |
182 | 197 | $s = $this->mConverter->autoConvert( $string, $autoVariant ); |
183 | | - // LanguageZh_hans::stripForSearch |
184 | | - $s = parent::stripForSearch( $s, $doStrip ); |
| 198 | + // LanguageZh_hans::normalizeForSearch |
| 199 | + $s = parent::normalizeForSearch( $s ); |
185 | 200 | wfProfileOut( __METHOD__ ); |
186 | 201 | return $s; |
187 | 202 | |
Index: trunk/phase3/languages/classes/LanguageYue.php |
— | — | @@ -3,24 +3,29 @@ |
4 | 4 | * @ingroup Language |
5 | 5 | */ |
6 | 6 | class LanguageYue extends Language { |
7 | | - function stripForSearch( $string, $doStrip = true ) { |
| 7 | + function hasWordBreaks() { |
| 8 | + return false; |
| 9 | + } |
| 10 | + |
| 11 | + /** |
| 12 | + * Eventually this should be a word segmentation; |
| 13 | + * for now just treat each character as a word. |
| 14 | + * @todo Fixme: only do this for Han characters... |
| 15 | + */ |
| 16 | + function wordSegmentation( $string ) { |
| 17 | + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
| 18 | + $s = self::insertSpace( $string, $reg ); |
| 19 | + return $s; |
| 20 | + } |
| 21 | + |
| 22 | + function normalizeForSearch( $string ) { |
8 | 23 | wfProfileIn( __METHOD__ ); |
9 | 24 | |
10 | 25 | // Double-width roman characters |
11 | 26 | $s = self::convertDoubleWidth( $string ); |
12 | | - |
13 | | - if ( $doStrip == true ) { |
14 | | - // eventually this should be a word segmentation; |
15 | | - // for now just treat each character as a word. |
16 | | - // @todo Fixme: only do this for Han characters... |
17 | | - $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; |
18 | | - $s = self::wordSegmentation( $s, $reg ); |
19 | | - } |
20 | | - |
21 | 27 | $s = trim( $s ); |
| 28 | + $s = parent::normalizeForSearch( $s ); |
22 | 29 | |
23 | | - // Do general case folding and UTF-8 armoring |
24 | | - $s = parent::stripForSearch( $s, $doStrip ); |
25 | 30 | wfProfileOut( __METHOD__ ); |
26 | 31 | return $s; |
27 | 32 | } |