r76659 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r76658‎ | r76659 | r76660 >
Date:15:18, 14 November 2010
Author:svemir
Status:deferred
Tags:
Comment:
refactored hasSuggestion, added suggestWithEnchant
avoid E_ALL warnings when no matches found
added script to create myspell dictionary for enchant
sphinx.conf port parameter merged into listen
Modified paths:
  • /trunk/extensions/SphinxSearch/SphinxMWSearch.php (modified) (history)
  • /trunk/extensions/SphinxSearch/SphinxSearch_setup.php (added) (history)
  • /trunk/extensions/SphinxSearch/sphinx.conf (modified) (history)

Diff [purge]

Index: trunk/extensions/SphinxSearch/SphinxSearch_setup.php
@@ -0,0 +1,60 @@
 2+<?php
 3+/**
 4+ * Sets up myspell dictionary for search suggestions
 5+ *
 6+ * Run without any arguments to see instructions.
 7+ *
 8+ * @author Svemir Brkic
 9+ * @file
 10+ * @ingroup extensions
 11+ */
 12+
 13+require_once( '../../maintenance/Maintenance.php' );
 14+
 15+class SphinxSearch_setup extends Maintenance {
 16+
 17+ public function __construct() {
 18+ parent::__construct();
 19+
 20+ $this->mDescription = "Sets up myspell dictionary (sphinx.dic and sphinx.aff) ";
 21+ $this->mDescription .= "for for search suggestions (suggestWithEnchant method.)\n";
 22+ $this->mDescription .= "Uses Sphinx indexer to create a list ";
 23+ $this->mDescription .= "of all indexed words, sorted by frequency.";
 24+ }
 25+
 26+ /* Override parameters setup because we do not need some of the default ones */
 27+ protected function addDefaultParams() {
 28+ $this->addOption( 'spinxconf', 'Location of Sphinx configuration file', true, true );
 29+ $this->addOption( 'indexer', 'Full path to Sphinx indexer if not in the path', false, true );
 30+ $this->addOption( 'useindex', 'Sphinx index to use (defaults to wiki_main)', false, true );
 31+ $this->addOption( 'maxwords', 'Maximum number of words to extract (defaults to 10000)', false, true );
 32+ $this->addOption( 'help', "Display this help message" );
 33+ $this->addOption( 'quiet', "Whether to supress non-error output" );
 34+ }
 35+
 36+ public function execute() {
 37+ $max_words = intval( $this->getOption( 'maxwords', 10000 ) );
 38+ $indexer = wfEscapeShellArg( $this->getOption( 'indexer', 'indexer' ) );
 39+ $index = wfEscapeShellArg( $this->getOption( 'useindex', 'wiki_main' ) );
 40+ $conf = wfEscapeShellArg( $this->getOption( 'spinxconf' ) );
 41+
 42+ $cmd = "$indexer --config $conf $index --buildstops sphinx.dic $max_words";
 43+ $this->output( wfShellExec( $cmd, $retval ) );
 44+ if ( file_exists( 'sphinx.dic' ) ) {
 45+ $words = file('sphinx.dic');
 46+ $cnt = count($words);
 47+ if ($cnt) {
 48+ file_put_contents( 'sphinx.dic', $cnt . "\n" . join( '', $words ) );
 49+ file_put_contents( 'sphinx.aff', "SET UTF-8\n" );
 50+ }
 51+ }
 52+ }
 53+
 54+}
 55+
 56+$maintClass = "SphinxSearch_setup";
 57+
 58+// Avoid E_ALL notice caused by ob_end_flush() in Maintenance::setup()
 59+ob_start();
 60+
 61+require_once( DO_MAINTENANCE );
Property changes on: trunk/extensions/SphinxSearch/SphinxSearch_setup.php
___________________________________________________________________
Added: svn:eol-style
162 + native
Index: trunk/extensions/SphinxSearch/SphinxMWSearch.php
@@ -18,7 +18,7 @@
1919 var $exc_categories = array();
2020 var $db;
2121 var $sphinx_client = null;
22 -
 22+
2323 function __construct( $db ) {
2424 $this->db = $db;
2525 }
@@ -75,7 +75,7 @@
7676 } else {
7777 $resultSet = false;
7878 }
79 -
 79+
8080 if ( $resultSet === false ) {
8181 return null;
8282 } else {
@@ -85,12 +85,8 @@
8686
8787 /**
8888 * We do a weighted title/body search, no need to return titles separately
89 - *
90 - * @param string $term - Raw search term
91 - * @return SphinxMWSearchResultSet
92 - * @access public
9389 */
94 - function searchTitle( $term ) {
 90+ function searchTitle() {
9591 return null;
9692 }
9793
@@ -170,23 +166,20 @@
171167 return "A-Za-z_'./\"!~0-9\\x80-\\xFF\\-";
172168 }
173169
174 - }
 170+}
175171
176 -/**
177 - * @ingroup Search
178 - */
179172 class SphinxMWSearchResultSet extends SearchResultSet {
180173 var $mNdx = 0;
181174 var $sphinx_client = null;
182175 var $mSuggestion = '';
183 -
 176+
184177 function __construct( $resultSet, $terms, $sphinx_client, $dbr ) {
185178 global $wgSphinxSearch_index;
186179
187180 $this->sphinx_client = $sphinx_client;
188181 $this->mResultSet = array();
189182
190 - if ( is_array( $resultSet ) && is_array( $resultSet['matches'] ) ) {
 183+ if ( is_array( $resultSet ) && isset( $resultSet['matches'] ) ) {
191184 foreach ( $resultSet['matches'] as $id => $docinfo ) {
192185 $res = $dbr->select(
193186 'page',
@@ -212,22 +205,14 @@
213206 */
214207 function hasSuggestion() {
215208 global $wgSphinxSuggestMode;
216 -
 209+
217210 if ( $wgSphinxSuggestMode ) {
218 - // Initial (weak) implementation - will be replaced
219 - $dbr = wfGetDB( DB_SLAVE );
220 - $res = $dbr->select(
221 - array( 'page' ),
222 - array( 'page_title' ),
223 - array( "page_title SOUNDS LIKE " . $dbr->addQuotes($this->mTerms[0]) ),
224 - __METHOD__,
225 - array(
226 - 'ORDER BY' => 'page_counter desc',
227 - 'LIMIT' => 1
228 - )
229 - );
230 - $suggestion = $dbr->fetchObject ( $res );
231 - $this->mSuggestion = $suggestion->page_title;
 211+ $this->mSuggestion = '';
 212+ if ( $wgSphinxSuggestMode == 'enchant' ) {
 213+ $this->suggestWithEnchant();
 214+ } else {
 215+ $this->suggestWithSoundex();
 216+ }
232217 if ($this->mSuggestion) {
233218 return true;
234219 }
@@ -236,6 +221,66 @@
237222 }
238223
239224 /**
 225+ * Wiki-specific search suggestions using enchant library.
 226+ * Use SphinxSearch_setup.php to create the dictionary
 227+ */
 228+ function suggestWithEnchant() {
 229+ $broker = enchant_broker_init();
 230+ enchant_broker_set_dict_path($broker, ENCHANT_MYSPELL, dirname( __FILE__ ));
 231+ if ( enchant_broker_dict_exists( $broker, 'sphinx' ) ) {
 232+ $dict = enchant_broker_request_dict( $broker, 'sphinx' );
 233+ $suggestion_found = false;
 234+ $full_suggestion = '';
 235+ foreach ( $this->mTerms as $word ) {
 236+ $suggestions = array();
 237+ if ( !enchant_dict_check($dict, $word) ) {
 238+ $suggestions = enchant_dict_suggest($dict, $word);
 239+ while ( count( $suggestions ) ) {
 240+ $candidate = array_shift( $suggestions );
 241+ if ( strtolower($candidate) != strtolower($word) ) {
 242+ $word = $candidate;
 243+ $suggestion_found = true;
 244+ break;
 245+ }
 246+ }
 247+ }
 248+ $full_suggestion .= $word . ' ';
 249+ }
 250+ enchant_broker_free_dict( $dict );
 251+ if ($suggestion_found) {
 252+ $this->mSuggestion = trim( $full_suggestion );
 253+ }
 254+ }
 255+ enchant_broker_free( $broker );
 256+ }
 257+
 258+ /**
 259+ * Default (weak) suggestions implementation relies on MySQL soundex
 260+ */
 261+ function suggestWithSoundex() {
 262+ $dbr = wfGetDB( DB_SLAVE );
 263+ $joined_terms = $dbr->addQuotes( join( ' ', $this->mTerms ) );
 264+ $res = $dbr->select(
 265+ array( 'page' ),
 266+ array( 'page_title' ),
 267+ array(
 268+ "page_title SOUNDS LIKE " . $joined_terms,
 269+ // avoid (re)recommending the search string
 270+ "page_title NOT LIKE " . $joined_terms
 271+ ),
 272+ __METHOD__,
 273+ array(
 274+ 'ORDER BY' => 'page_counter desc',
 275+ 'LIMIT' => 1
 276+ )
 277+ );
 278+ $suggestion = $dbr->fetchObject( $res );
 279+ if ( is_object( $suggestion ) ) {
 280+ $this->mSuggestion = trim( $suggestion->page_title );
 281+ }
 282+ }
 283+
 284+ /**
240285 * @return String: suggested query, null if none
241286 */
242287 function getSuggestionQuery(){
@@ -285,12 +330,12 @@
286331 class SphinxMWSearchResult extends SearchResult {
287332
288333 var $sphinx_client = null;
289 -
 334+
290335 function __construct( $row, $sphinx_client ) {
291336 $this->sphinx_client = $sphinx_client;
292337 parent::__construct( $row );
293338 }
294 -
 339+
295340 /**
296341 * @param $terms Array: terms to highlight
297342 * @return String: highlighted text snippet, null (and not '') if not supported
@@ -333,7 +378,7 @@
334379 }
335380 } else {
336381 $ret = wfMsg( 'sphinxSearchWarning', $this->sphinx_client->GetLastError() );
337 - }
 382+ }
338383 return $ret;
339384 }
340385
Index: trunk/extensions/SphinxSearch/sphinx.conf
@@ -45,7 +45,7 @@
4646 source src_wiki_incremental : src_wiki_main
4747 {
4848 # adjust this query based on the time you run the full index
49 - # in this case, full index runs at 3 AM (server time) which translates to 7 AM UTC
 49+ # in this case, full index runs at 7 AM UTC
5050 sql_query = SELECT page_id, page_title, page_namespace, page_is_redirect, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id AND page_touched>=DATE_FORMAT(CURDATE(), '%Y%m%d070000')
5151
5252 # all other parameters are copied from the parent source,
@@ -113,14 +113,9 @@
114114 # searchd settings
115115 searchd
116116 {
117 - # IP address on which search daemon will bind and accept
118 - # optional, default is to listen on all addresses,
119 - # ie. listen = 0.0.0.0
120 - listen = 127.0.0.1
 117+ # IP address and port on which search daemon will bind and accept connections
 118+ listen = 127.0.0.1:9312
121119
122 - # port on which search daemon will listen
123 - port = 9312
124 -
125120 # searchd run info is logged here - create or change the folder
126121 log = /var/log/sphinx/searchd.log
127122

Status & tagging log