r73342 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r73341‎ | r73342 | r73343 >
Date:21:24, 19 September 2010
Author:svemir
Status:deferred
Tags:
Comment:
initial support for new model: SphinxMWSearch (extends SearchEngine and uses standard MW search interface) - old model is still supported (at least until version 0.8)
Modified paths:
  • /trunk/extensions/SphinxSearch/SphinxMWSearch.php (added) (history)
  • /trunk/extensions/SphinxSearch/SphinxSearch.php (modified) (history)

Diff [purge]

Index: trunk/extensions/SphinxSearch/SphinxSearch.php
@@ -11,7 +11,7 @@
1212
1313 $wgExtensionCredits['specialpage'][] = array(
1414 'path' => __FILE__,
15 - 'version' => '0.7.1',
 15+ 'version' => '0.7.2',
1616 'name' => 'SphinxSearch',
1717 'author' => array( 'Svemir Brkic', 'Paul Grinberg' ),
1818 'email' => 'svemir at deveblog dot com, gri6507 at yahoo dot com',
@@ -21,29 +21,35 @@
2222
2323 $dir = dirname( __FILE__ ) . '/';
2424
25 -$wgAutoloadClasses['SphinxSearch'] = $dir . 'SphinxSearch_body.php';
2625 $wgExtensionMessagesFiles['SphinxSearch'] = $dir . 'SphinxSearch.i18n.php';
27 -$wgExtensionAliasesFiles['SphinxSearch'] = $dir . 'SphinxSearch.alias.php';
2826
2927 # To completely disable the default search and replace it with SphinxSearch,
3028 # set this BEFORE including SphinxSearch.php in LocalSettings.php
3129 # $wgSearchType = 'SphinxSearch';
32 -
33 -if ( $wgSearchType == 'SphinxSearch' ) {
34 - $wgDisableInternalSearch = true;
35 - $wgDisableSearchUpdate = true;
36 - $wgSpecialPages['Search'] = 'SphinxSearch';
 30+# To use the new approach (added in 0.7.2) set it to SphinxMWSearch
 31+if ( $wgSearchType == 'SphinxMWSearch' ) {
 32+ $wgAutoloadClasses['SphinxMWSearch'] = $dir . 'SphinxMWSearch.php';
3733 } else {
38 - $wgSpecialPages['SphinxSearch'] = 'SphinxSearch';
 34+ if ( $wgSearchType == 'SphinxSearch' ) {
 35+ $wgAutoloadClasses['SphinxSearch'] = $dir . 'SphinxSearch_body.php';
 36+ $wgDisableInternalSearch = true;
 37+ $wgDisableSearchUpdate = true;
 38+ $wgSpecialPages['Search'] = 'SphinxSearch';
 39+ $wgDisableSearchUpdate = true;
 40+ } else {
 41+ $wgExtensionAliasesFiles['SphinxSearch'] = $dir . 'SphinxSearch.alias.php';
 42+ $wgSpecialPages['SphinxSearch'] = 'SphinxSearch';
 43+ }
3944 }
4045
4146 # this assumes you have copied sphinxapi.php from your Sphinx
4247 # installation folder to your SphinxSearch extension folder
 48+# not needed if you install http://pecl.php.net/package/sphinx
4349 if ( !class_exists( 'SphinxClient' ) ) {
4450 require_once ( $dir . "sphinxapi.php" );
4551 }
4652
47 -# Host and port on which searchd deamon is tunning
 53+# Host and port on which searchd daemon is running
4854 $wgSphinxSearch_host = 'localhost';
4955 $wgSphinxSearch_port = 9312;
5056
@@ -70,44 +76,43 @@
7177 $wgSphinxSearch_sortmode = SPH_SORT_RELEVANCE;
7278 $wgSphinxSearch_sortby = '';
7379
74 -# By default, search will return articles that match any of the words in the search
75 -# To change that to require all words to match by default, set the following to true
76 -$wgSphinxMatchAll = false;
 80+if ( $wgSearchType == 'SphinxMWSearch' ) {
 81+ # Following settings apply only in the new search model
7782
78 -# Number of matches to display at once
79 -$wgSphinxSearch_matches = 10;
80 -# How many matches searchd will keep in RAM while searching
81 -$wgSphinxSearch_maxmatches = 1000;
82 -# When to stop searching all together (if not zero)
83 -$wgSphinxSearch_cutoff = 0;
 83+ # Set to true to use MW's default search snippets and highlighting
 84+ $wgSphinxSearchMWHighlighter = false;
 85+} else {
 86+ # Following settings apply only in the old search model
8487
85 -# Weights of individual indexed columns. This gives page titles extra weight
86 -$wgSphinxSearch_weights = array(
87 - 'old_text' => 1,
88 - 'page_title' => 100
89 -);
 88+ # By default, search will return articles that match any of the words in the search
 89+ # To change that to require all words to match by default, set the following to true
 90+ $wgSphinxMatchAll = false;
 91+
 92+ # Number of matches to display at once
 93+ $wgSphinxSearch_matches = 10;
9094
91 -# To enable hierarchical category search, specify the top category of your hierarchy
92 -$wgSphinxTopSearchableCategory = '';
 95+ # To enable hierarchical category search, specify the top category of your hierarchy
 96+ $wgSphinxTopSearchableCategory = '';
 97+
 98+ # This will fetch sub-categories as parent categories are checked
 99+ # Requires $wgUseAjax to be true
 100+ $wgAjaxExportList[] = 'SphinxSearch::ajaxGetCategoryChildren';
 101+
 102+ # Allow excluding selected categories when filtering
 103+ $wgUseExcludes = false;
93104
94 -# This will fetch sub-categories as parent categories are checked
95 -# Requires $wgUseAjax to be true
96 -$wgAjaxExportList[] = 'SphinxSearch::ajaxGetCategoryChildren';
 105+ # Web-accessible path to the extension's folder
 106+ $wgSphinxSearchExtPath = $wgScriptPath . '/extensions/SphinxSearch';
 107+
 108+ # Web-accessible path to the folder with SphinxSearch.js file (if different from $wgSphinxSearchExtPath)
 109+ $wgSphinxSearchJSPath = '';
 110+}
97111
98 -# EXPERIMENTAL: allow excluding selected categories when filtering
99 -$wgUseExcludes = false;
100 -
101 -# Web-accessible path to the extension's folder
102 -$wgSphinxSearchExtPath = $wgScriptPath . '/extensions/SphinxSearch';
103 -
104 -# Web-accessible path to the folder with SphinxSearch.js file (if different from $wgSphinxSearchExtPath)
105 -$wgSphinxSearchJSPath = '';
106 -
107112 # #########################################################
108113 # Use Aspell to suggest possible misspellings. This can be provided via
109114 # PHP pspell module (http://www.php.net/manual/en/ref.pspell.php)
110115 # or command line interface to ASpell
111 -
 116+
112117 # Should the suggestion mode be enabled?
113118 $wgSphinxSuggestMode = false;
114119
@@ -119,3 +124,15 @@
120125
121126 # Path to aspell location and language data files. Do not set if not sure.
122127 $wgSphinxSearchPspellDictionaryDir = '';
 128+
 129+# How many matches searchd will keep in RAM while searching
 130+$wgSphinxSearch_maxmatches = 1000;
 131+
 132+# When to stop searching altogether (if not zero)
 133+$wgSphinxSearch_cutoff = 0;
 134+
 135+# Weights of individual indexed columns. This gives page titles extra weight
 136+$wgSphinxSearch_weights = array(
 137+ 'old_text' => 1,
 138+ 'page_title' => 100
 139+);
Index: trunk/extensions/SphinxSearch/SphinxMWSearch.php
@@ -0,0 +1,340 @@
 2+<?php
 3+
 4+/**
 5+ * Class file for the SphinxMWSearch extension
 6+ *
 7+ * http://www.mediawiki.org/wiki/Extension:SphinxSearch
 8+ *
 9+ * Released under GNU General Public License (see http://www.fsf.org/licenses/gpl.html)
 10+ *
 11+ * @file
 12+ * @ingroup Extensions
 13+ * @author Svemir Brkic <svemir@deveblog.com>
 14+ */
 15+
 16+ class SphinxMWSearch extends SearchEngine {
 17+
 18+ var $categories = array();
 19+ var $exc_categories = array();
 20+ var $db;
 21+ var $sphinx_client = null;
 22+
 23+ function __construct( $db ) {
 24+ $this->db = $db;
 25+ }
 26+
 27+ /**
 28+ * Perform a full text search query and return a result set.
 29+ *
 30+ * @param string $term - Raw search term
 31+ * @return SphinxMWSearchResultSet
 32+ * @access public
 33+ */
 34+ function searchText( $term ) {
 35+ global $wgSphinxSearch_index_list;
 36+
 37+ if ( !$this->sphinx_client ) {
 38+ $this->sphinx_client = $this->prepareSphinxClient( $term );
 39+ }
 40+
 41+ if ( $this->sphinx_client ) {
 42+ $this->searchTerms = $term;
 43+ $escape = '/';
 44+ $delims = array(
 45+ '(' => ')',
 46+ '[' => ']',
 47+ '"' => '',
 48+ );
 49+ // temporarily replace already escaped characters
 50+ $placeholders = array(
 51+ '\\(' => '_PLC_O_PAR_',
 52+ '\\)' => '_PLC_C_PAR_',
 53+ '\\[' => '_PLC_O_BRA_',
 54+ '\\]' => '_PLC_C_BRA_',
 55+ '\\"' => '_PLC_QUOTE_',
 56+ );
 57+ $term = str_replace(array_keys($placeholders), $placeholders, $term);
 58+ foreach ($delims as $open => $close) {
 59+ $open_cnt = substr_count( $term, $open );
 60+ if ($close) {
 61+ // if counts do not match, escape them all
 62+ $close_cnt = substr_count( $term, $close );
 63+ if ($open_cnt != $close_cnt) {
 64+ $escape .= $open . $close;
 65+ }
 66+ } elseif ($open_cnt % 2 == 1) {
 67+ // if there is no closing symbol, count should be even
 68+ $escape .= $open;
 69+ }
 70+ }
 71+ $term = str_replace($placeholders, array_keys($placeholders), $term);
 72+ $resultSet = $this->sphinx_client->Query(
 73+ addcslashes( $term, $escape ),
 74+ $wgSphinxSearch_index_list
 75+ );
 76+ } else {
 77+ $resultSet = false;
 78+ }
 79+
 80+ if ( $resultSet === false ) {
 81+ return null;
 82+ } else {
 83+ return new SphinxMWSearchResultSet( $resultSet, $term, $this->sphinx_client, $this->db );
 84+ }
 85+ }
 86+
 87+ /**
 88+ * We do a weighted title/body search, no need to return titles separately
 89+ *
 90+ * @param string $term - Raw search term
 91+ * @return null - always; titles are matched as part of the weighted searchText() query
 92+ * @access public
 93+ */
 94+ function searchTitle( $term ) {
 95+ return null;
 96+ }
 97+
 98+ /**
 99+ * @return SphinxClient: ready to run or false if term is empty
 100+ */
 101+ function prepareSphinxClient( &$term ) {
 102+ global $wgSphinxSearch_sortmode, $wgSphinxSearch_sortby, $wgSphinxSearch_host,
 103+ $wgSphinxSearch_port, $wgSphinxSearch_index_weights, $wgSphinxSearch_index,
 104+ $wgSphinxSearch_mode, $wgSphinxMatchAll, $wgSphinxSearch_maxmatches,
 105+ $wgSphinxSearch_cutoff, $wgSphinxSearch_weights;
 106+
 107+ // don't do anything for blank searches
 108+ if ( trim( $term ) === '' ) {
 109+ return false;
 110+ }
 111+
 112+ wfRunHooks( 'SphinxSearchBeforeResults', array(
 113+ &$term,
 114+ &$this->offset,
 115+ &$this->namespaces,
 116+ &$this->categories,
 117+ &$this->exc_categories
 118+ ) );
 119+
 120+ $cl = new SphinxClient();
 121+
 122+ // setup the options for searching
 123+ if ( isset( $wgSphinxSearch_host ) && isset( $wgSphinxSearch_port ) ) {
 124+ $cl->SetServer( $wgSphinxSearch_host, $wgSphinxSearch_port );
 125+ }
 126+ if ( count( $wgSphinxSearch_weights ) ) {
 127+ $cl->SetFieldWeights( $wgSphinxSearch_weights );
 128+ }
 129+ if ( is_array( $wgSphinxSearch_index_weights ) ) {
 130+ $cl->SetIndexWeights( $wgSphinxSearch_index_weights );
 131+ }
 132+ if ( isset( $wgSphinxSearch_mode ) ) {
 133+ $cl->SetMatchMode( $wgSphinxSearch_mode );
 134+ }
 135+ if ( count( $this->namespaces ) ) {
 136+ $cl->SetFilter( 'page_namespace', $this->namespaces );
 137+ }
 138+ if( !$this->showRedirects ) {
 139+ $cl->SetFilter( 'page_is_redirect', array( 0 ) );
 140+ }
 141+ if ( count( $this->categories ) ) {
 142+ $cl->SetFilter( 'category', $this->categories );
 143+ }
 144+ if ( count( $this->exc_categories ) ) {
 145+ $cl->SetFilter( 'category', $this->exc_categories, true );
 146+ }
 147+ $cl->SetSortMode( $wgSphinxSearch_sortmode, $wgSphinxSearch_sortby );
 148+ $cl->SetLimits(
 149+ $this->offset,
 150+ $this->limit,
 151+ $wgSphinxSearch_maxmatches,
 152+ $wgSphinxSearch_cutoff
 153+ );
 154+
 155+ wfRunHooks( 'SphinxSearchBeforeQuery', array( &$term, &$cl ) );
 156+
 157+ return $cl;
 158+ }
 159+
 160+ /**
 161+ * @return Boolean: can we list/unlist redirects
 162+ */
 163+ function acceptListRedirects() {
 164+ return true;
 165+ }
 166+
 167+ /**
 168+ * @return String: allowed query characters
 169+ */
 170+ public static function legalSearchChars() {
 171+ return "A-Za-z_'./\"!~0-9\\x80-\\xFF\\-";
 172+ }
 173+
 174+ }
 175+
 176+/**
 177+ * @ingroup Search
 178+ */
 179+class SphinxMWSearchResultSet extends SearchResultSet {
 180+ var $mNdx = 0;
 181+ var $sphinx_client = null;
 182+ var $mSuggestion = '';
 183+
 184+ function __construct( $resultSet, $terms, $sphinx_client, $dbr ) {
 185+ global $wgSphinxSearch_index;
 186+
 187+ $this->sphinx_client = $sphinx_client;
 188+ $this->mResultSet = array();
 189+
 190+ if ( is_array( $resultSet ) && is_array( $resultSet['matches'] ) ) {
 191+ foreach ( $resultSet['matches'] as $id => $docinfo ) {
 192+ $res = $dbr->select(
 193+ 'page',
 194+ array( 'page_id', 'page_title', 'page_namespace' ),
 195+ array( 'page_id' => $id ),
 196+ __METHOD__,
 197+ array()
 198+ );
 199+ if ( $dbr->numRows( $res ) > 0 ) {
 200+ $this->mResultSet[] = $dbr->fetchObject( $res );
 201+ }
 202+ }
 203+ }
 204+ $this->mNdx = 0;
 205+ $this->mTerms = preg_split('/\W+/', $terms);
 206+ }
 207+
 208+ /**
 209+ * Some search modes return a suggested alternate term if there are
 210+ * no exact hits. Returns true if there is one on this set.
 211+ *
 212+ * @return Boolean
 213+ */
 214+ function hasSuggestion() {
 215+ global $wgSphinxSuggestMode;
 216+
 217+ if ( $wgSphinxSuggestMode ) {
 218+ // Initial (weak) implementation - will be replaced
 219+ $dbr = wfGetDB( DB_SLAVE );
 220+ $res = $dbr->select(
 221+ array( 'page' ),
 222+ array( 'page_title' ),
 223+ array( "page_title SOUNDS LIKE " . $dbr->addQuotes($this->mTerms[0]) ),
 224+ __METHOD__,
 225+ array(
 226+ 'ORDER BY' => 'page_counter desc',
 227+ 'LIMIT' => 1
 228+ )
 229+ );
 230+ $suggestion = $dbr->fetchObject ( $res );
 231+ $this->mSuggestion = $suggestion->page_title;
 232+ if ($this->mSuggestion) {
 233+ return true;
 234+ }
 235+ }
 236+ return false;
 237+ }
 238+
 239+ /**
 240+ * @return String: suggested query, null if none
 241+ */
 242+ function getSuggestionQuery(){
 243+ return $this->mSuggestion;
 244+ }
 245+
 246+ /**
 247+ * @return String: HTML highlighted suggested query, '' if none
 248+ */
 249+ function getSuggestionSnippet(){
 250+ return $this->mSuggestion;
 251+ }
 252+
 253+ /**
 254+ * @return Array: search terms
 255+ */
 256+ function termMatches() {
 257+ return $this->mTerms;
 258+ }
 259+
 260+ /**
 261+ * @return Integer: number of results
 262+ */
 263+ function numRows() {
 264+ return count( $this->mResultSet );
 265+ }
 266+
 267+ /**
 268+ * @return SphinxMWSearchResult: next result, false if none
 269+ */
 270+ function next() {
 271+ if ( isset( $this->mResultSet[$this->mNdx] ) ) {
 272+ $row = $this->mResultSet[$this->mNdx];
 273+ ++$this->mNdx;
 274+ return new SphinxMWSearchResult( $row, $this->sphinx_client );
 275+ } else {
 276+ return false;
 277+ }
 278+ }
 279+
 280+ function free() {
 281+ unset( $this->mResultSet );
 282+ }
 283+
 284+}
 285+
 286+class SphinxMWSearchResult extends SearchResult {
 287+
 288+ var $sphinx_client = null;
 289+
 290+ function __construct( $row, $sphinx_client ) {
 291+ $this->sphinx_client = $sphinx_client;
 292+ parent::__construct( $row );
 293+ }
 294+
 295+ /**
 296+ * @param $terms Array: terms to highlight
 297+ * @return String: highlighted text snippet, null (and not '') if not supported
 298+ */
 299+ function getTextSnippet( $terms ){
 300+ global $wgUser, $wgSphinxSearchMWHighlighter, $wgSphinxSearch_index;
 301+
 302+ if ( $wgSphinxSearchMWHighlighter ) {
 303+ return parent::getTextSnippet( $terms );
 304+ }
 305+
 306+ $this->initText();
 307+
 308+ list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
 309+
 310+ $excerpts_opt = array(
 311+ "before_match" => "<span class='searchmatch'>",
 312+ "after_match" => "</span>",
 313+ "chunk_separator" => " ... ",
 314+ "limit" => $contextlines * $contextchars,
 315+ "around" => $contextchars
 316+ );
 317+
 318+ $excerpts = $this->sphinx_client->BuildExcerpts(
 319+ array( $this->mText ),
 320+ $wgSphinxSearch_index,
 321+ join(' ', $terms),
 322+ $excerpts_opt
 323+ );
 324+
 325+ if ( is_array( $excerpts ) ) {
 326+ $ret = '';
 327+ foreach ( $excerpts as $entry ) {
 328+ // remove some wiki markup
 329+ $entry = preg_replace( '/([\[\]\{\}\*\#\|\!]+|==+)/',
 330+ ' ',
 331+ strip_tags( $entry, '<span><br>' )
 332+ );
 333+ $ret .= "<div style='margin: 0.2em 1em 0.2em 1em;'>$entry</div>\n";
 334+ }
 335+ } else {
 336+ $ret = wfMsg( 'sphinxSearchWarning', $this->sphinx_client->GetLastError() );
 337+ }
 338+ return $ret;
 339+ }
 340+
 341+}
\ No newline at end of file

Status & tagging log