r96534 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r96533‎ | r96534 | r96535 >
Date:00:55, 8 September 2011
Author:svemir
Status:deferred (Comments)
Tags:
Comment:
Added support for intitle:, incategory:, and prefix: searches
Updated default matching mode to SPH_MATCH_EXTENDED2, cleaned up
Modified paths:
  • /trunk/extensions/SphinxSearch/SphinxMWSearch.php (modified) (history)
  • /trunk/extensions/SphinxSearch/SphinxSearch.php (modified) (history)

Diff [purge]

Index: trunk/extensions/SphinxSearch/SphinxSearch.php
@@ -65,7 +65,7 @@
6666 $wgSphinxSearch_index_weights = null;
6767
6868 # Default Sphinx search mode
69 -$wgSphinxSearch_mode = SPH_MATCH_EXTENDED;
 69+$wgSphinxSearch_mode = SPH_MATCH_EXTENDED2;
7070
7171 # Default sort mode
7272 $wgSphinxSearch_sortmode = SPH_SORT_RELEVANCE;
Index: trunk/extensions/SphinxSearch/SphinxMWSearch.php
@@ -18,6 +18,12 @@
1919 var $exc_categories = array();
2020 var $db;
2121 var $sphinx_client = null;
 22+ var $prefix_handlers = array(
 23+ 'all' => 'searchAllNamespaces',
 24+ 'intitle' => 'filterByTitle',
 25+ 'incategory' => 'filterByCategory',
 26+ 'prefix' => 'filterByPrefix',
 27+ );
2228
2329 /**
2430 * Do not go to a near match if query prefixed with ~
@@ -117,29 +123,26 @@
118124
119125 $cl = new SphinxClient();
120126
121 - // setup the options for searching
122 - if ( isset( $wgSphinxSearch_host ) && isset( $wgSphinxSearch_port ) ) {
123 - $cl->SetServer( $wgSphinxSearch_host, $wgSphinxSearch_port );
124 - }
125 - if ( count( $wgSphinxSearch_weights ) ) {
 127+ $cl->SetServer( $wgSphinxSearch_host, $wgSphinxSearch_port );
 128+ if ( $wgSphinxSearch_weights && count( $wgSphinxSearch_weights ) ) {
126129 $cl->SetFieldWeights( $wgSphinxSearch_weights );
127130 }
128131 if ( is_array( $wgSphinxSearch_index_weights ) ) {
129132 $cl->SetIndexWeights( $wgSphinxSearch_index_weights );
130133 }
131 - if ( isset( $wgSphinxSearch_mode ) ) {
 134+ if ( $wgSphinxSearch_mode ) {
132135 $cl->SetMatchMode( $wgSphinxSearch_mode );
133136 }
134 - if ( count( $this->namespaces ) ) {
 137+ if ( $this->namespaces && count( $this->namespaces ) ) {
135138 $cl->SetFilter( 'page_namespace', $this->namespaces );
136139 }
137140 if( !$this->showRedirects ) {
138141 $cl->SetFilter( 'page_is_redirect', array( 0 ) );
139142 }
140 - if ( count( $this->categories ) ) {
 143+ if ( $this->categories && count( $this->categories ) ) {
141144 $cl->SetFilter( 'category', $this->categories );
142145 }
143 - if ( count( $this->exc_categories ) ) {
 146+ if ( $this->exc_categories && count( $this->exc_categories ) ) {
144147 $cl->SetFilter( 'category', $this->exc_categories, true );
145148 }
146149 $cl->SetSortMode( $wgSphinxSearch_sortmode, $wgSphinxSearch_sortby );
@@ -167,6 +170,136 @@
168171 return array( $contextlines, $contextchars );
169172 }
170173
 174+ /**
 175+ * Prepare query for sphinx search daemon
 176+ *
 177+ * @param string $query
 178+ * @return string rewritten query
 179+ */
 180+ function replacePrefixes( $query ) {
 181+ // ~ prefix is used to avoid near-term search, remove it now
 182+ if ( $query[ 0 ] === '~' ) {
 183+ $query = substr( $query, 1 );
 184+ }
 185+
 186+ $parts = preg_split( '/(")/', $query, -1, PREG_SPLIT_DELIM_CAPTURE );
 187+ $inquotes = false;
 188+ $rewritten = '';
 189+ foreach ( $parts as $part ) {
 190+ if ( $part == '"' ) { // stuff in quotes doesn't get rewritten
 191+ $rewritten .= $part;
 192+ $inquotes = !$inquotes;
 193+ } elseif ( $inquotes ) {
 194+ $rewritten .= $part;
 195+ } else {
 196+ if ( strpos( $query, ':' ) !== false ) {
 197+ $regexp = $this->preparePrefixRegexp();
 198+ $part = preg_replace_callback(
 199+ '/(^|[| :])(' . $regexp . '):([^ ]+)/i',
 200+ array( $this, 'replaceQueryPrefix' ),
 201+ $part
 202+ );
 203+ }
 204+ $rewritten .= str_replace(
 205+ array( ' OR ', ' AND ' ),
 206+ array( ' | ', ' & ' ),
 207+ $part
 208+ );
 209+ }
 210+ }
 211+ return $rewritten;
 212+ }
 213+
 214+ /**
 215+ * @return string Regexp to match namespaces and other prefixes
 216+ */
 217+ function preparePrefixRegexp() {
 218+ global $wgContLang, $wgCanonicalNamespaceNames, $wgNamespaceAliases;
 219+
 220+ $nsNamesRaw = array_merge(
 221+ $wgContLang->getNamespaces(),
 222+ $wgCanonicalNamespaceNames,
 223+ array_keys( array_merge( $wgNamespaceAliases, $wgContLang->getNamespaceAliases() ) )
 224+ );
 225+
 226+ // add all namespace names w/o spaces
 227+ $nsNames = array();
 228+ foreach ( $nsNamesRaw as $ns ) {
 229+ if ( $ns != '' ) {
 230+ $nsNames[] = str_replace( ' ', '_', $ns );
 231+ }
 232+ }
 233+
 234+ // "search everything" keyword
 235+ $allkeyword = wfMsgForContent( 'searchall' );
 236+ $this->prefix_handlers[ $allkeyword ] = 'searchAllNamespaces';
 237+
 238+ // add other kinds of prefixes we support
 239+ $nsNames = array_merge( $nsNames, array_keys( $this->prefix_handlers ) );
 240+
 241+ return implode( '|', array_unique( $nsNames ) );
 242+ }
 243+
 244+ /**
 245+ * preg callback to process foo: prefixes in the query
 246+ *
 247+ * @param array $matches
 248+ * @return string
 249+ */
 250+ function replaceQueryPrefix( $matches ) {
 251+ if ( isset( $this->prefix_handlers[ $matches[ 2 ] ] ) ) {
 252+ $callback = $this->prefix_handlers[ $matches[ 2 ] ];
 253+ return $this->$callback( $matches );
 254+ } else {
 255+ return $this->filterByNamespace( $matches );
 256+ }
 257+ }
 258+
 259+ function filterByNamespace( $matches ) {
 260+ global $wgContLang;
 261+ $inx = $wgContLang->getNsIndex( str_replace( ' ', '_', $matches[ 2 ] ) );
 262+ if ( $inx === false ) {
 263+ return $matches[ 0 ];
 264+ } else {
 265+ $this->namespaces[] = $inx;
 266+ return $matches[ 3 ];
 267+ }
 268+ }
 269+
 270+ function searchAllNamespaces( $matches ) {
 271+ $this->namespaces = null;
 272+ return $matches[ 3 ];
 273+ }
 274+
 275+ function filterByTitle( $matches ) {
 276+ return '@page_title ' . $matches[ 3 ];
 277+ }
 278+
 279+ function filterByPrefix( $matches ) {
 280+ $prefix = $matches[ 3 ];
 281+ if ( strpos( $matches[ 3 ], ':' ) !== false ) {
 282+ global $wgContLang;
 283+ list( $ns, $prefix ) = explode( ':', $matches[ 3 ] );
 284+ $inx = $wgContLang->getNsIndex( str_replace( ' ', '_', $ns ) );
 285+ if ( $inx !== false ) {
 286+ $this->namespaces = array( $inx );
 287+ }
 288+ }
 289+ return '@page_title ^' . $prefix . '*';
 290+ }
 291+
 292+ function filterByCategory( $matches ) {
 293+ $page_id = $this->db->selectField( 'page', 'page_id',
 294+ array(
 295+ 'page_title' => $matches[ 3 ],
 296+ 'page_namespace' => NS_CATEGORY
 297+ ),
 298+ __METHOD__
 299+ );
 300+ $this->categories[] = intval( $page_id );
 301+ return '';
 302+ }
 303+
171304 }
172305
173306 class SphinxMWSearchResultSet extends SearchResultSet {

Follow-up revisions

Revision | Commit summary | Author | Date
r96555 | streamlined preparePrefixRegexp and used preg_quote when building a regexp | svemir | 12:23, 8 September 2011

Comments

#Comment by Nikerabbit (talk | contribs)   05:38, 8 September 2011

Don't you need to preg_quote nsNames in preparePrefixRegexp?

#Comment by Svemir Brkic (talk | contribs)   12:24, 8 September 2011

Good point. I rewrote that method (var names were not good because it is not handling namespaces only) and added preg_quote in r96555

Status & tagging log