r34072 MediaWiki - Code Review archive

Revision:r34071‎ | r34072 | r34073 >
Date:13:36, 1 May 2008
New class SearchHighlighter handles highlighting of search terms and
snippet extraction:
* prefer text hits over matches on images/templates/tables, making the
snippets more readable and relevant
* cleanup wikitext
* prefer snippets with exact query match - works only for whole phrases
* drop the old context calculation and replace it with a more flexible one
that does a better job keeping snippets of constant width
* if the first line of the article matches the whole query, show only one snippet
* manually lower/uppercase non-ascii chars so that words in e.g. cyrillic
are also case-insensitive
* workaround for PHP's limited UTF-8 support so that snippets end up being of
constant char-size over single and multiple byte text
* if there is no text match for some reason, show beginning of the article
* haven't done performance testing, might not be safe to go live, although
I don't see any immediate problems with it
Modified paths:
  • /trunk/phase3/includes/SearchEngine.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/SearchEngine.php
@@ -250,8 +250,9 @@
251251 */
252252 public static function userHighlightPrefs( &$user ){
253253 //$contextlines = $user->getOption( 'contextlines', 5 );
 254+ //$contextchars = $user->getOption( 'contextchars', 50 );
254255 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
255 - $contextchars = $user->getOption( 'contextchars', 50 );
 256+ $contextchars = 75; // same as above.... :P
256257 return array($contextlines, $contextchars);
257258 }
@@ -553,68 +554,11 @@
554555 global $wgUser;
555556 $this->initText();
556557 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
557 - return $this->extractText( $this->mText, $terms, $contextlines, $contextchars);
 558+ $h = new SearchHighlighter();
 559+ return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars);
558560 }
560562 /**
561 - * Default implementation of snippet extraction
562 - *
563 - * @param string $text
564 - * @param array $terms Terms to highlight (unescaped)
565 - * @param int $contextlines
566 - * @param int $contextchars
567 - * @return string
568 - */
569 - protected function extractText( $text, $terms, $contextlines, $contextchars ) {
570 - global $wgLang, $wgContLang;
571 - $fname = __METHOD__;
572 -
573 - $lines = explode( "\n", $text );
574 -
575 - foreach( $terms as $index => $term ) {
576 - $terms[$index] = preg_quote( $term, '/' );
577 - }
578 - $terms = implode( '|', $terms );
579 - $max = intval( $contextchars ) + 1;
580 - $pat1 = "/(.*)($terms)(.{0,$max})/i";
581 -
582 - $lineno = 0;
583 -
584 - $extract = "";
585 - wfProfileIn( "$fname-extract" );
586 - foreach ( $lines as $line ) {
587 - if ( 0 == $contextlines ) {
588 - break;
589 - }
590 - ++$lineno;
591 - $m = array();
592 - if ( ! preg_match( $pat1, $line, $m ) ) {
593 - continue;
594 - }
595 - --$contextlines;
596 - $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
597 -
598 - if ( count( $m ) < 3 ) {
599 - $post = '';
600 - } else {
601 - $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
602 - }
603 -
604 - $found = $m[2];
605 -
606 - $line = htmlspecialchars( $pre . $found . $post );
607 - $pat2 = '/(' . $terms . ")/i";
608 - $line = preg_replace( $pat2,
609 - "<span class='searchmatch'>\\1</span>", $line );
610 -
611 - $extract .= "${line}\n";
612 - }
613 - wfProfileOut( "$fname-extract" );
614 -
615 - return $extract;
616 - }
617 -
618 - /**
619563 * @param array $terms terms to highlight
620564 * @return string highlighted title, '' if not supported
621565 */
@@ -690,8 +634,407 @@
691635 }
693637 /**
 638+ * Highlight bits of wikitext
 639+ *
694640 * @addtogroup Search
695641 */
 642+class SearchHighlighter {
 643+ var $mCleanWikitext = true; // when true, splitAndAdd() passes text through removeWiki() before splitting it into lines
 645+ function SearchHighlighter($cleanupWikitext = true){ // PHP 4-style constructor (method named after the class)
 646+ $this->mCleanWikitext = $cleanupWikitext;
 647+ }
 649+ /**
 650+ * Default implementation of wikitext highlighting: split the text into plain-text and template/image/table extracts, pick snippets around matches and wrap matched terms in a searchmatch span
 651+ *
 652+ * @param string $text wikitext to take snippets from
 653+ * @param array $terms Terms to highlight (unescaped)
 654+ * @param int $contextlines maximum number of matching lines to make snippets from
 655+ * @param int $contextchars target snippet length per line (rescaled below to approximate a byte length)
 656+ * @return string HTML-escaped snippets with matches wrapped in <span class='searchmatch'>, '' if $text is empty
 657+ */
 658+ public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 659+ global $wgLang, $wgContLang;
 660+ $fname = __METHOD__;
 662+ if($text == '')
 663+ return '';
 665+ // split text into text + templates/links/tables
 666+ $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/";
 667+ // first capture group is for detecting nested templates/links/tables
 668+ $endPatterns = array(
 669+ 1 => '/(\{\{)|(\}\})/', // template
 670+ 2 => '/(\[\[)|(\]\])/', // image
 671+ 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 672+ $textExt = array(); // text extracts
 673+ $otherExt = array(); // other extracts
 674+ wfProfileIn( "$fname-split" );
 675+ $start = 0;
 676+ $textLen = strlen($text);
 677+ $count = 0; // sequence number to maintain ordering
 678+ while( $start < $textLen ){
 679+ // find start of template/image/table
 680+ if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 681+ $epat = '';
 682+ foreach($matches as $key => $val){
 683+ if($key > 0 && $val[1] != -1){
 684+ if($key == 2){
 685+ // see if this is an image link
 686+ $ns = substr($val[0],2,-1);
 687+ if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 688+ break;
 690+ }
 691+ $epat = $endPatterns[$key];
 692+ $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 693+ $start = $val[1];
 694+ break;
 695+ }
 696+ }
 697+ if( $epat ){
 698+ // find end (and detect any nested elements)
 699+ $level = 0;
 700+ $offset = $start + 1;
 701+ $found = false;
 702+ while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 703+ if( array_key_exists(2,$endMatches) ){
 704+ // found end
 705+ if($level == 0){
 706+ $len = strlen($endMatches[2][0]);
 707+ $off = $endMatches[2][1];
 708+ $this->splitAndAdd( $otherExt, $count,
 709+ substr( $text, $start, $off + $len - $start ) );
 710+ $start = $off + $len;
 711+ $found = true;
 712+ break;
 713+ } else{
 714+ // end of nested element
 715+ $level -= 1;
 716+ }
 717+ } else{
 718+ // nested
 719+ $level += 1;
 720+ }
 721+ $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 722+ }
 723+ if( ! $found ){
 724+ // couldn't find appropriate closing tag: add the opening marker as text and continue after it
 725+ $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 726+ $start += strlen($matches[0][0]);
 727+ }
 728+ continue;
 729+ }
 730+ }
 731+ // else: add as text extract
 732+ $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 733+ break;
 734+ }
 736+ $all = $textExt + $otherExt; // these have disjoint key sets
 738+ wfProfileOut( "$fname-split" );
 740+ // prepare regexps
 741+ foreach( $terms as $index => $term ) {
 742+ $terms[$index] = preg_quote( $term, '/' );
 743+ // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 744+ if(preg_match('/[\x80-\xff]/', $term) ){
 745+ $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 746+ }
 749+ }
 750+ $anyterm = implode( '|', $terms );
 751+ $phrase = implode('[, .:;\(\)"\'\-\+]+', $terms );
 753+ // FIXME: a hack to scale contextchars, a correct solution
 754+ // would be to have contextchars actually be char and not byte
 755+ // length, and do proper utf-8 substrings and lengths everywhere,
 756+ // but PHP is making that very hard and unclean to implement :(
 757+ $scale = strlen($anyterm) / mb_strlen($anyterm); // NOTE(review): divides by zero when $terms is empty ($anyterm == '')
 758+ $contextchars = intval( $contextchars * $scale );
 760+ $pat1 = '/('.$phrase.')/ui';
 761+ $pat2 = '/('.$anyterm.')/ui';
 763+ wfProfileIn( "$fname-extract" );
 765+ $left = $contextlines;
 767+ $snippets = array();
 768+ $offsets = array();
 769+ // match whole query on text
 770+ $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 771+ // match whole query on templates/tables/images
 772+ $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 773+ // match any words on text
 774+ $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 775+ // match any words on templates/tables/images
 776+ $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 778+ ksort($snippets);
 780+ $first = array_keys($textExt);
 781+ if( isset($first[0]))
 782+ $first = $first[0];
 783+ else
 784+ $first = 0;
 786+ // add extra chars to each snippet to make snippets constant size
 787+ $extended = array();
 788+ if( count( $snippets ) == 0){
 789+ // couldn't find the target words, just show beginning of article
 790+ $targetchars = $contextchars * $contextlines;
 791+ $snippets[$first] = '';
 792+ $offsets[$first] = 0;
 793+ } else{
 794+ // if the beginning of the article contains the whole phrase, show only that snippet
 795+ if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 796+ && $offsets[$first] < $contextchars * 2 ){
 797+ $snippets = array ($first => $snippets[$first]);
 798+ }
 800+ // calc by how much to extend existing snippets
 801+ $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 802+ }
 804+ foreach($snippets as $index => $line){
 805+ $extended[$index] = $line;
 806+ $len = strlen($line);
 807+ if( $len < $targetchars - 20 ){
 808+ // complete this line
 809+ if($len < strlen( $all[$index] )){
 810+ $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 811+ $len = strlen( $extended[$index] );
 812+ }
 814+ // add more lines
 815+ $add = $index + 1;
 816+ while( $len < $targetchars - 20
 817+ && array_key_exists($add,$all)
 818+ && !array_key_exists($add,$snippets) ){
 819+ $offsets[$add] = 0;
 820+ $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 821+ $extended[$add] = $tt;
 822+ $len += strlen( $tt );
 823+ $add++;
 824+ }
 825+ }
 826+ }
 828+ $snippets = array_map('htmlspecialchars', $extended);
 829+ $last = -1;
 830+ $extract = '';
 831+ foreach($snippets as $index => $line){
 832+ if($last == -1)
 833+ $extract .= $line; // first line
 834+ elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 835+ $extract .= " ".$line; // continuous lines
 836+ else
 837+ $extract .= '<b> ... </b>' . $line;
 839+ $last = $index;
 840+ }
 841+ if( $extract )
 842+ $extract .= '<b> ... </b>';
 844+ // highlight words
 845+ $pat3 = '/(' . $anyterm . ")/ui";
 846+ $extract = preg_replace( $pat3,
 847+ "<span class='searchmatch'>\\1</span>", $extract );
 849+ wfProfileOut( "$fname-extract" );
 851+ return $extract;
 852+ }
 854+ /**
 855+ * Split text into lines and add each non-empty trimmed line to the extracts array
 856+ *
 857+ * @param array $extracts (in/out) index -> $line
 858+ * @param int $count (in/out) next index to use; incremented once per line added
 859+ * @param string $text text to split; run through removeWiki() first when mCleanWikitext is set
 860+ */
 861+ function splitAndAdd(&$extracts, &$count, $text){
 862+ $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 863+ foreach($split as $line){
 864+ $tt = trim($line);
 865+ if( $tt ) // skips empty lines; NOTE: a line that is exactly "0" is falsy and is skipped too
 866+ $extracts[$count++] = $tt;
 867+ }
 868+ }
 870+ /**
 871+ * Do manual case conversion for non-ascii chars: a multi-byte match becomes a regex character class holding its lower- and uppercase forms, a single-byte match is returned unchanged
 872+ *
 873+ * @param array $matches preg_replace_callback match array; only element 0 is used
 874+ */
 875+ function caseCallback($matches){
 876+ global $wgContLang;
 877+ if( strlen($matches[0]) > 1 ){
 878+ return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 879+ } else
 880+ return $matches[0];
 881+ }
 883+ /**
 884+ * Extract part of the text from start to end (byte offsets), moving both
 885+ * ends via position() to avoid chopping up words; returns '' if the adjusted end is not after the adjusted start
 886+ * @param string $text
 887+ * @param int $start requested start offset; 0 is used as-is
 888+ * @param int $end requested end offset; clamped to strlen($text)
 889+ * @param int $posStart (out) actual start position, written only if the caller passes a non-null value
 890+ * @param int $posEnd (out) actual end position, written only if the caller passes a non-null value
 891+ * @return string
 892+ */
 893+ function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 894+ global $wgContLang;
 896+ if( $start != 0)
 897+ $start = $this->position( $text, $start, 1 );
 898+ if( $end >= strlen($text) )
 899+ $end = strlen($text);
 900+ else
 901+ $end = $this->position( $text, $end );
 903+ if(!is_null($posStart))
 904+ $posStart = $start;
 905+ if(!is_null($posEnd))
 906+ $posEnd = $end;
 908+ if($end > $start)
 909+ return substr($text, $start, $end-$start);
 910+ else
 911+ return '';
 912+ }
 914+ /**
 915+ * Find a nonletter near a point (byte index) in the text, searching a window of 10 bytes on either side
 916+ *
 917+ * @param string $text
 918+ * @param int $point byte index to search around
 919+ * @param int $offset value added to the found nonletter index (not applied in the fallback branch)
 920+ * @return int index of the first nonletter inside the window (leftmost, not necessarily the closest to $point), or beginning of utf8 char at/after $point if none
 921+ */
 922+ function position($text, $point, $offset=0 ){
 923+ $tolerance = 10;
 924+ $s = max( 0, $point - $tolerance );
 925+ $l = min( strlen($text), $point + $tolerance ) - $s;
 926+ $m = array();
 927+ if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
 928+ return $m[0][1] + $s + $offset;
 929+ } else{
 930+ // check if point is on a valid first UTF8 char (assumes $point < strlen($text) -- TODO confirm for all callers)
 931+ $char = ord( $text[$point] );
 932+ while( $char >= 0x80 && $char < 0xc0 ) {
 933+ // skip trailing (continuation) bytes, i.e. 0x80-0xBF
 934+ $point++;
 935+ if($point >= strlen($text))
 936+ return strlen($text);
 937+ $char = ord( $text[$point] );
 938+ }
 939+ return $point;
 941+ }
 942+ }
 944+ /**
 945+ * Search extracts for a pattern, and store one snippet per matching line until no lines are left
 946+ *
 947+ * @param string $pattern regexp for matching lines
 948+ * @param array $extracts extracts to search (index -> line)
 949+ * @param int $linesleft (in/out) number of extracts still to make, decremented per snippet
 950+ * @param int $contextchars length of snippet, compared against strlen() byte lengths
 951+ * @param array $out (in/out) map index -> snippet; indexes already present are skipped
 952+ * @param array $offsets (out) map index -> starting point of the snippet within its line
 953+ * @protected
 954+ */
 955+ function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
 956+ if($linesleft == 0)
 957+ return; // nothing to do
 958+ foreach($extracts as $index => $line){
 959+ if( array_key_exists($index,$out) )
 960+ continue; // this line already highlighted
 962+ $m = array();
 963+ if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
 964+ continue;
 966+ $offset = $m[0][1];
 967+ $len = strlen($m[0][0]);
 968+ if($offset + $len < $contextchars)
 969+ $begin = 0;
 970+ elseif( $len > $contextchars)
 971+ $begin = $offset;
 972+ else
 973+ $begin = $offset + intval( ($len - $contextchars) / 2 );
 975+ $end = $begin + $contextchars;
 977+ $posBegin = $begin;
 978+ // basic snippet from this line; $posBegin receives the word-aligned start chosen by extract()
 979+ $out[$index] = $this->extract($line,$begin,$end,$posBegin);
 980+ $offsets[$index] = $posBegin;
 981+ $linesleft--;
 982+ if($linesleft == 0)
 983+ return;
 984+ }
 985+ }
 987+ /**
 988+ * Basic wikitext removal: drops or unwraps {{templates}}, unwraps [[links]], strips HTML-like tags and bold/italic quote markup
 989+ * @protected
 990+ */
 991+ function removeWiki($text) {
 992+ $fname = __METHOD__;
 993+ wfProfileIn( $fname );
 995+ //$text = preg_replace("/'{2,5}/", "", $text);
 996+ //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
 997+ //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
 998+ //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
 999+ //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
 1000+ //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
 1001+ $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
 1002+ $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
 1003+ $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
 1004+ $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
 1005+ //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
 1006+ $text = preg_replace("/<\/?[^>]+>/", "", $text);
 1007+ $text = preg_replace("/'''''/", "", $text);
 1008+ $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); // the tag alternative is redundant here: all tags were already stripped two lines above
 1009+ $text = preg_replace("/''/", "", $text);
 1011+ wfProfileOut( $fname );
 1012+ return $text;
 1013+ }
 1015+ /**
 1016+ * callback to replace [[target|caption]] kind of links with the caption; if
 1017+ * the target is category or image, return the whole link unchanged
 1018+ *
 1019+ * @param array $matches 0 => whole link, 1 => target including the trailing pipe, 2 => caption
 1020+ */
 1021+ function linkReplace($matches){
 1022+ $colon = strpos( $matches[1], ':' );
 1023+ if( $colon === false )
 1024+ return $matches[2]; // replace with caption
 1025+ global $wgContLang;
 1026+ $ns = substr( $matches[1], 0, $colon );
 1027+ $index = $wgContLang->getNsIndex($ns);
 1028+ if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
 1029+ return $matches[0]; // return the whole thing
 1030+ else
 1031+ return $matches[2];
 1033+ }
 1037+ * @addtogroup Search
 1038+ */
6961039 class SearchEngineDummy {
6971040 function search( $term ) {
6981041 return null;

Follow-up revisions

RevisionCommit summaryAuthorDate
r34086Revert for now:...brion20:55, 1 May 2008
r34210Re-commit r34072 with some modifications:...rainman15:31, 4 May 2008

Status & tagging log