r2466 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r2465‎ | r2466 | r2467 >
Date:01:48, 11 February 2004
Author:e23
Status:old
Tags:
Comment:
Fuzzy search feature (replaces full text search as default action when exact title search fails)
Modified paths:
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/includes/SearchEngine.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/DefaultSettings.php
@@ -187,6 +187,7 @@
188188
189189 $wgDisableCounters = false;
190190 $wgDisableTextSearch = false;
 191+$wgDisableFuzzySearch = false;
191192 $wgDisableSearchUpdate = false; # If you've disabled search semi-permanently, this also disables updates to the table. If you ever re-enable, be sure to rebuild the search table.
192193 $wgDisableUploads = true; # Uploads have to be specially set up to be secure
193194 $wgRemoteUploads = false; # Set to true to enable the upload _link_ while local uploads are disabled. Assumes that the special page link will be bounced to another server where uploads do work.
Index: trunk/phase3/includes/SearchEngine.php
@@ -458,30 +458,109 @@
459459 $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
460460 return;
461461 }
 462+ $wgOut->addHTML( wfMsg("nogomatch",
 463+ htmlspecialchars( wfLocalUrl( ucfirst($this->mUsertext), "action=edit") ) )
 464+ . "\n<p>" );
462465
463 - # Try a near match
464 - #
465 - if( !$wgDisableTextSearch ) {
466 - $this->parseQuery();
467 - $sql = "SELECT cur_id,cur_title,cur_namespace,si_page FROM cur,searchindex " .
468 - "WHERE cur_id=si_page AND {$this->mTitlecond} ORDER BY cur_namespace LIMIT 1";
469 -
470 - if ( "" != $this->mTitlecond ) {
471 - $res = wfQuery( $sql, DB_READ, $fname );
472 - }
473 - if ( isset( $res ) && 0 != wfNumRows( $res ) ) {
474 - $s = wfFetchObject( $res );
475 -
476 - $t = Title::makeTitle( $s->cur_namespace, $s->cur_title );
477 - $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
478 - return;
 466+ # Try a fuzzy title search
 467+ $anyhit = false;
 468+ global $wgDisableFuzzySearch;
 469+ if(! $wgDisableFuzzySearch ){
 470+ foreach( array(NS_MAIN, NS_WP, NS_USER, NS_IMAGE, NS_MEDIAWIKI) as $namespace){
 471+ $anyhit |= SearchEngine::doFuzzyTitleSearch( $search, $namespace );
479472 }
 473+ }
 474+ if( ! $anyhit ){
 475+ $wgOut->addHTML( wfMsg("notitlematches") );
480476 }
481 - $wgOut->addHTML( wfMsg("nogomatch",
482 - htmlspecialchars( wfLocalUrl( ucfirst($this->mUsertext), "action=edit") ) )
483 - . "\n<p>" );
484 - $this->showResults();
485477 }
 478+
 479+ /* static */ function doFuzzyTitleSearch( $search, $namespace ){
 480+ global $wgLang, $wgOut;
 481+ $sstr = ucfirst($search);
 482+ $sstr = str_replace(" ", "_", $sstr);
 483+ $fuzzymatches = SearchEngine::fuzzyTitles( $sstr, $namespace );
 484+ $fuzzymatches = array_slice($fuzzymatches, 0, 10);
 485+ $slen = strlen( $search );
 486+ $wikitext = "";
 487+ foreach($fuzzymatches as $res){
 488+ $t = str_replace("_", " ", $res[1]);
 489+ $tfull = $wgLang->getNsText( $namespace ) . ":$t|$t";
 490+ if( $namespace == NS_MAIN )
 491+ $tfull = "$t";
 492+ $distance = $res[0];
 493+ $closeness = (strlen( $search ) - $distance) / strlen( $search );
 494+ $percent = intval( $closeness * 100 ) . "%";
 495+ $stars = str_repeat("*", ceil(5 * $closeness) );
 496+ $wikitext .= "* [[$tfull]] $percent ($stars)\n";
 497+ }
 498+ if( $wikitext ){
 499+ if( $namespace != NS_MAIN )
 500+ $wikitext = "=== " . $wgLang->getNsText( $namespace ) . " ===\n" . $wikitext;
 501+ $wgOut->addWikiText( $wikitext );
 502+ return true;
 503+ }
 504+ return false;
 505+ }
 506+
 507+ /* static */ function fuzzyTitles( $sstr, $namespace = NS_MAIN ){
 508+ $span = 0.10; // weed on title length before doing levenshtein.
 509+ $tolerance = 0.35; // allowed percentage of erronous characters
 510+ $slen = strlen($sstr);
 511+ $tolerance_count = ceil($tolerance * $slen);
 512+ $spanabs = ceil($slen * (1 + $span)) - $slen;
 513+ # print "Word: $sstr, len = $slen, range = [$min, $max], tolerance_count = $tolerance_count<BR>\n";
 514+ $result = array();
 515+ for( $i=0; $i <= $spanabs; $i++ ){
 516+ $titles = SearchEngine::getTitlesByLength( $slen + $i, $namespace );
 517+ if( $i != 0)
 518+ $titles = array_merge($titles, SearchEngine::getTitlesByLength( $slen - $i, $namespace ) );
 519+ foreach($titles as $t){
 520+ $d = levenshtein($sstr, $t);
 521+ if($d < $tolerance_count)
 522+ $result[] = array($d, $t);
 523+ $cnt++;
 524+ }
 525+ }
 526+ usort($result, "SearchEngine_pcmp");
 527+ return $result;
 528+ }
 529+
 530+ /* static */ function getTitlesByLength($aLength, $aNamespace = 0){
 531+ global $wgMemc, $wgDBname;
 532+
 533+ $mkey = "$wgDBname:titlesbylength:$aLength:$aNamespace";
 534+ $mkeyts = "$wgDBname:titlesbylength:createtime";
 535+ $ts = $wgMemc->get( $mkeyts );
 536+ $result = $wgMemc->get( $mkey );
 537+
 538+ if( time() - $ts < 3600 ){
 539+ // note: in case of insufficient memcached space, we return
 540+ // an empty list instead of starting to hit the DB.
 541+ return is_array( $result ) ? $result : array();
 542+ }
 543+
 544+ $wgMemc->set( $mkeyts, time() );
 545+ $res = wfQuery("SELECT cur_title, cur_namespace FROM cur", DB_READ);
 546+ $titles = array(); // length, ns, [titles]
 547+ while( $obj = wfFetchObject( $res ) ){
 548+ $title = $obj->cur_title;
 549+ $ns = $obj->cur_namespace;
 550+ $len = strlen( $title );
 551+ $titles[$len][$ns][] = $title;
 552+ }
 553+ foreach($titles as $length => $length_arr){
 554+ foreach($length_arr as $ns => $title_arr){
 555+ $mkey = "$wgDBname:titlesbylength:$length:$ns";
 556+ $wgMemc->set( $mkey, $title_arr, 3600 * 24 );
 557+ }
 558+ }
 559+ return $titles[$aLength][$aNamespace];
 560+ }
486561 }
487562
 563+/* private static */ function SearchEngine_pcmp($a, $b){ return $a[0] - $b[0]; }
 564+
 565+
 566+
488567 ?>

Status & tagging log