Index: trunk/phase3/languages/Language.php |
— | — | @@ -1595,6 +1595,16 @@ |
1596 | 1596 | } |
1597 | 1597 | |
1598 | 1598 | /** |
| 1599 | + * Most writing systems use whitespace to break up words. |
| 1600 | + * Some languages such as Chinese don't conventionally do this, |
| 1601 | + * which requires special handling when breaking up words for |
| 1602 | + * searching etc. |
| 1603 | + */ |
| 1604 | + function hasWordBreaks() { |
| 1605 | + return true; /* base default: whitespace separates words; LanguageZh_hans overrides this to return false */ |
| 1606 | + } |
| 1607 | + |
| 1608 | + /** |
1599 | 1609 | * Some languages have special punctuation to strip out |
1600 | 1610 | * or characters which need to be converted for MySQL's |
1601 | 1611 | * indexing to grok it correctly. Make such changes here. |
Index: trunk/phase3/languages/classes/LanguageZh.php |
— | — | @@ -175,19 +175,12 @@ |
176 | 176 | function stripForSearch( $string ) { |
177 | 177 | wfProfileIn( __METHOD__ ); |
178 | 178 | |
179 | | - // eventually this should be a word segmentation |
180 | | - // for now just treat each character as a word |
181 | | - // @fixme only do this for Han characters... |
182 | | - $t = preg_replace( |
183 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", |
184 | | - " $1", $string); |
185 | | - |
186 | 179 | //always convert to zh-hans before indexing. it should be |
187 | 180 | //better to use zh-hans for search, since conversion from |
188 | 181 | //Traditional to Simplified is less ambiguous than the |
189 | 182 | //other way around |
190 | 183 | |
191 | | - $t = $this->mConverter->autoConvert($t, 'zh-hans'); |
| 184 | + $t = $this->mConverter->autoConvert( $string, 'zh-hans' ); |
192 | 185 | $t = parent::stripForSearch( $t ); |
193 | 186 | wfProfileOut( __METHOD__ ); |
194 | 187 | return $t; |
Index: trunk/phase3/languages/classes/LanguageZh_hans.php |
— | — | @@ -4,21 +4,23 @@ |
5 | 5 | * @ingroup Language |
6 | 6 | */ |
7 | 7 | class LanguageZh_hans extends Language { |
| 8 | + function hasWordBreaks() { |
| 9 | + return false; /* Chinese does not conventionally separate words with whitespace */ |
| 10 | + } |
| 11 | + |
8 | 12 | function stripForSearch( $string ) { |
9 | | - # MySQL fulltext index doesn't grok utf-8, so we |
10 | | - # need to fold cases and convert to hex |
11 | | - # we also separate characters as "words" |
12 | | - if( function_exists( 'mb_strtolower' ) ) { |
13 | | - return preg_replace( |
14 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
15 | | - "' U8' . bin2hex( \"$1\" )", |
16 | | - mb_strtolower( $string ) ); |
17 | | - } else { |
18 | | - list( , $wikiLowerChars ) = Language::getCaseMaps(); |
19 | | - return preg_replace( |
20 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
21 | | - "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", |
22 | | - $string ); |
23 | | - } |
| 13 | + // Eventually this should be a word segmentation; |
| 14 | + // for now just treat each character as a word. |
| 15 | + // |
| 16 | + // Note we put a space on both sides to cover cases |
| 17 | + // where a number or Latin char follows a Han char. |
| 18 | + // |
| 19 | + // @fixme only do this for Han characters... |
| 20 | + $t = preg_replace( |
| 21 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", /* a UTF-8 lead byte plus its continuation bytes, i.e. any multibyte character */ |
| 22 | + " $1 ", $string); |
| 23 | + $t = preg_replace( '/ +/', ' ', $t ); /* collapse the runs of spaces created by the padding above */ |
| 24 | + $t = trim( $t ); /* drop the leading and trailing padding space */ |
| 25 | + return parent::stripForSearch( $t ); /* then apply the base Language stripping */ |
24 | 26 | } |
25 | 27 | } |
Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -206,6 +206,7 @@ |
207 | 207 | via extensions not using the userCan hook and via $wgRevokePermissions now work. |
208 | 208 | * (bug 19157) createAndPromote error on bad password |
209 | 209 | * (bug 18768) Remove AdminSettings.php from MediaWiki core |
| 210 | +* (bug 8445) Multiple-character search terms are now handled properly for Chinese |
210 | 211 | |
211 | 212 | == API changes in 1.16 == |
212 | 213 | |
Index: trunk/phase3/includes/SearchMySQL.php |
— | — | @@ -48,45 +48,94 @@ |
49 | 49 | $m = array(); |
50 | 50 | if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', |
51 | 51 | $filteredText, $m, PREG_SET_ORDER ) ) { |
52 | | - foreach( $m as $terms ) { |
| 52 | + foreach( $m as $bits ) { |
| 53 | + @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits; |
| 54 | + |
| 55 | + if( $nonQuoted != '' ) { |
| 56 | + $term = $nonQuoted; |
| 57 | + $quote = ''; |
| 58 | + } else { |
| 59 | + $term = str_replace( '"', '', $term ); |
| 60 | + $quote = '"'; |
| 61 | + } |
| 62 | + |
53 | 63 | if( $searchon !== '' ) $searchon .= ' '; |
54 | | - if( $this->strictMatching && ($terms[1] == '') ) { |
55 | | - $terms[1] = '+'; |
| 64 | + if( $this->strictMatching && ($modifier == '') ) { |
| 65 | + // If we leave this out, boolean op defaults to OR which is rarely helpful. |
| 66 | + $modifier = '+'; |
56 | 67 | } |
57 | | - // Search terms in all variant forms, only |
58 | | - // apply on wiki with LanguageConverter |
59 | | - $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] ); |
60 | | - if( is_array( $temp_terms )) { |
61 | | - $temp_terms = array_unique( array_values( $temp_terms )); |
62 | | - foreach( $temp_terms as $t ) |
63 | | - $searchon .= $terms[1] . $wgContLang->stripForSearch( $t ) . ' '; |
| 68 | + |
| 69 | + // Some languages such as Serbian store the input form in the search index, |
| 70 | + // so we may need to search for matches in multiple writing system variants. |
| 71 | + $convertedVariants = $wgContLang->autoConvertToAllVariants( $term ); |
| 72 | + if( is_array( $convertedVariants ) ) { |
| 73 | + $variants = array_unique( array_values( $convertedVariants ) ); |
| 74 | + } else { |
| 75 | + $variants = array( $term ); |
64 | 76 | } |
65 | | - else |
66 | | - $searchon .= $terms[1] . $wgContLang->stripForSearch( $terms[2] ); |
67 | | - if( !empty( $terms[3] ) ) { |
68 | | - // Match individual terms in result highlighting... |
69 | | - $regexp = preg_quote( $terms[3], '/' ); |
70 | | - if( $terms[4] ) { |
71 | | - $regexp = "\b$regexp"; // foo* |
72 | | - } else { |
73 | | - $regexp = "\b$regexp\b"; |
| 77 | + |
| 78 | + // The low-level search index does some processing on input to work |
| 79 | + // around problems with minimum lengths and encoding in MySQL's |
| 80 | + // fulltext engine. |
| 81 | + // For Chinese this also inserts spaces between adjacent Han characters. |
| 82 | + $strippedVariants = array_map( |
| 83 | + array( $wgContLang, 'stripForSearch' ), |
| 84 | + $variants ); |
| 85 | + |
| 86 | + // Some languages such as Chinese force all variants to a canonical |
| 87 | + // form when stripping to the low-level search index, so to be sure |
| 88 | + // let's check our variants list for unique items after stripping. |
| 89 | + $strippedVariants = array_unique( $strippedVariants ); |
| 90 | + |
| 91 | + $searchon .= $modifier; |
| 92 | + if( count( $strippedVariants) > 1 ) |
| 93 | + $searchon .= '('; |
| 94 | + foreach( $strippedVariants as $stripped ) { |
| 95 | + if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { |
| 96 | + // Hack for Chinese: we need to toss in quotes for |
| 97 | + // multiple-character phrases since stripForSearch() |
| 98 | + // added spaces between them to make word breaks. |
| 99 | + $stripped = '"' . trim( $stripped ) . '"'; |
74 | 100 | } |
75 | | - } else { |
76 | | - // Match the quoted term in result highlighting... |
77 | | - $regexp = preg_quote( str_replace( '"', '', $terms[2] ), '/' ); |
| 101 | + $searchon .= "$quote$stripped$quote$wildcard "; |
78 | 102 | } |
| 103 | + if( count( $strippedVariants) > 1 ) |
| 104 | + $searchon .= ')'; |
| 105 | + |
| 106 | + // Match individual terms or quoted phrase in result highlighting... |
| 107 | + // Note that variants will be introduced in a later stage for highlighting! |
| 108 | + $regexp = $this->regexTerm( $term, $wildcard ); |
79 | 109 | $this->searchTerms[] = $regexp; |
80 | 110 | } |
81 | | - wfDebug( "Would search with '$searchon'\n" ); |
82 | | - wfDebug( 'Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); |
| 111 | + wfDebug( __METHOD__ . ": Would search with '$searchon'\n" ); |
| 112 | + wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); |
83 | 113 | } else { |
84 | | - wfDebug( "Can't understand search query '{$filteredText}'\n" ); |
| 114 | + wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); |
85 | 115 | } |
86 | 116 | |
87 | 117 | $searchon = $this->db->strencode( $searchon ); |
88 | 118 | $field = $this->getIndexField( $fulltext ); |
89 | 119 | return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) "; |
90 | 120 | } |
| 121 | + |
| 122 | + function regexTerm( $string, $wildcard ) { /* Build the result-highlighting regex fragment for one search term. */ |
| 123 | + global $wgContLang; |
| 124 | + |
| 125 | + $regex = preg_quote( $string, '/' ); /* escape regex metacharacters and the '/' delimiter */ |
| 126 | + if( $wgContLang->hasWordBreaks() ) { |
| 127 | + if( $wildcard ) { |
| 128 | + // Prefix search (term followed by *): anchor only the start so the rest of the word can still match. |
| 129 | + $regex = "\b$regex"; |
| 130 | + } else { |
| 131 | + $regex = "\b$regex\b"; /* whole-word match: word boundary on both sides */ |
| 132 | + } |
| 133 | + } else { |
| 134 | + // For Chinese, words may legitimately abut other words in the text literal. |
| 135 | + // Don't add \b boundary checks... note this could cause false positives |
| 136 | + // for Latin chars. |
| 137 | + } |
| 138 | + return $regex; /* undelimited fragment; the caller appends it to $this->searchTerms */ |
| 139 | + } |
91 | 140 | |
92 | 141 | public static function legalSearchChars() { |
93 | 142 | return "\"*" . parent::legalSearchChars(); |