r34086 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r34085 | r34086 | r34087 >
Date:	20:55, 1 May 2008
Author:	brion
Status:	old
Tags:
Comment:	Revert for now: * r34072 -- new highlighter code; looks a bit expensive, not fully tested yet. * r33489 -- broke search result highlighting all around * Part of r32350 -- bring the color back to search highlighting so we can see our results again. Why was this removed without comment?
Modified paths:	/trunk/phase3/includes/SearchEngine.php (modified) (history); /trunk/phase3/skins/monobook/main.css (modified) (history)

Diff [purge]

Index: trunk/phase3/skins/monobook/main.css
—	—	@@ -1564,6 +1564,7 @@
1565	1565
1566	1566	span.searchmatch {
1567	1567	font-weight: bold;
	1568	+ color: red;
1568	1569	}
1569	1570
1570	1571	/* God-damned hack for the crappy layout */
Index: trunk/phase3/includes/SearchEngine.php
—	—	@@ -250,9 +250,8 @@
251	251	*/
252	252	public static function userHighlightPrefs( &$user ){
253	253	//$contextlines = $user->getOption( 'contextlines', 5 );
254		~~- //$contextchars = $user->getOption( 'contextchars', 50 );~~
255	254	$contextlines = 2; // Hardcode this. Old defaults sucked. :)
256		~~- $contextchars = 75; // same as above.... :P~~
	255	+ $contextchars = $user->getOption( 'contextchars', 50 );
257	256	return array($contextlines, $contextchars);
258	257	}
259	258
—	—	@@ -547,18 +546,73 @@
548	547	}
549	548
550	549	/**
551		~~- * @param array $terms Terms to highlight (unescaped)~~
	550	+ * @param array $terms terms to highlight
552	551	* @return string highlighted text snippet, null (and not '') if not supported
553	552	*/
554	553	function getTextSnippet($terms){
555	554	global $wgUser;
556	555	$this->initText();
557	556	list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
558		~~- $h = new SearchHighlighter();~~
559		~~- return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars);~~
	557	+ return $this->extractText( $this->mText, $terms, $contextlines, $contextchars);
560	558	}
561	559
562	560	/**
	561	+ * Default implementation of snippet extraction
	562	+ *
	563	+ * @param string $text
	564	+ * @param array $terms
	565	+ * @param int $contextlines
	566	+ * @param int $contextchars
	567	+ * @return string
	568	+ */
	569	+ protected function extractText( $text, $terms, $contextlines, $contextchars ) {
	570	+ global $wgLang, $wgContLang;
	571	+ $fname = __METHOD__;
	572	+
	573	+ $lines = explode( "\n", $text );
	574	+
	575	+ $terms = implode( '\|', $terms );
	576	+ $terms = str_replace( '/', "\\/", $terms);
	577	+ $max = intval( $contextchars ) + 1;
	578	+ $pat1 = "/(.*)($terms)(.{0,$max})/i";
	579	+
	580	+ $lineno = 0;
	581	+
	582	+ $extract = "";
	583	+ wfProfileIn( "$fname-extract" );
	584	+ foreach ( $lines as $line ) {
	585	+ if ( 0 == $contextlines ) {
	586	+ break;
	587	+ }
	588	+ ++$lineno;
	589	+ $m = array();
	590	+ if ( ! preg_match( $pat1, $line, $m ) ) {
	591	+ continue;
	592	+ }
	593	+ --$contextlines;
	594	+ $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
	595	+
	596	+ if ( count( $m ) < 3 ) {
	597	+ $post = '';
	598	+ } else {
	599	+ $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
	600	+ }
	601	+
	602	+ $found = $m[2];
	603	+
	604	+ $line = htmlspecialchars( $pre . $found . $post );
	605	+ $pat2 = '/(' . $terms . ")/i";
	606	+ $line = preg_replace( $pat2,
	607	+ "<span class='searchmatch'>\\1</span>", $line );
	608	+
	609	+ $extract .= "${line}\n";
	610	+ }
	611	+ wfProfileOut( "$fname-extract" );
	612	+
	613	+ return $extract;
	614	+ }
	615	+
	616	+ /**
563	617	* @param array $terms terms to highlight
564	618	* @return string highlighted title, '' if not supported
565	619	*/
—	—	@@ -634,407 +688,8 @@
635	689	}
636	690
637	691	/**
638		~~- * Highlight bits of wikitext~~
639		- *
640	692	* @addtogroup Search
641	693	*/
642		~~-class SearchHighlighter {~~
643		~~- var $mCleanWikitext = true;~~
644		-
645		~~- function SearchHighlighter($cleanupWikitext = true){~~
646		~~- $this->mCleanWikitext = $cleanupWikitext;~~
647		~~- }~~
648		-
649		- /**
650		~~- * Default implementation of wikitext highlighting~~
651		- *
652		~~- * @param string $text~~
653		~~- * @param array $terms Terms to highlight (unescaped)~~
654		~~- * @param int $contextlines~~
655		~~- * @param int $contextchars~~
656		~~- * @return string~~
657		~~- */~~
658		~~- public function highlightText( $text, $terms, $contextlines, $contextchars ) {~~
659		~~- global $wgLang, $wgContLang;~~
660		~~- $fname = __METHOD__;~~
661		-
662		~~- if($text == '')~~
663		~~- return '';~~
664		-
665		~~- // spli text into text + templates/links/tables~~
666		~~- $spat = "/(\\{\\{)\|(\\[\\[[^\\]:]+:)\|(\n\\{\\\|)/";~~
667		~~- // first capture group is for detecting nested templates/links/tables~~
668		~~- $endPatterns = array(~~
669		~~- 1 => '/(\{\{)\|(\}\})/', // template~~
670		~~- 2 => '/(\[\[)\|(\]\])/', // image~~
671		~~- 3 => "/(\n\\{\\\|)\|(\n\\\|\\})/"); // table~~
672		~~- $textExt = array(); // text extracts~~
673		~~- $otherExt = array(); // other extracts~~
674		~~- wfProfileIn( "$fname-split" );~~
675		~~- $start = 0;~~
676		~~- $textLen = strlen($text);~~
677		~~- $count = 0; // sequence number to maintain ordering~~
678		~~- while( $start < $textLen ){~~
679		~~- // find start of template/image/table~~
680		~~- if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){~~
681		~~- $epat = '';~~
682		~~- foreach($matches as $key => $val){~~
683		~~- if($key > 0 && $val[1] != -1){~~
684		~~- if($key == 2){~~
685		~~- // see if this is an image link~~
686		~~- $ns = substr($val[0],2,-1);~~
687		~~- if( $wgContLang->getNsIndex($ns) != NS_IMAGE )~~
688		~~- break;~~
689		-
690		~~- }~~
691		~~- $epat = $endPatterns[$key];~~
692		~~- $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );~~
693		~~- $start = $val[1];~~
694		~~- break;~~
695		~~- }~~
696		~~- }~~
697		~~- if( $epat ){~~
698		~~- // find end (and detect any nested elements)~~
699		~~- $level = 0;~~
700		~~- $offset = $start + 1;~~
701		~~- $found = false;~~
702		~~- while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){~~
703		~~- if( array_key_exists(2,$endMatches) ){~~
704		~~- // found end~~
705		~~- if($level == 0){~~
706		~~- $len = strlen($endMatches[2][0]);~~
707		~~- $off = $endMatches[2][1];~~
708		~~- $this->splitAndAdd( $otherExt, $count,~~
709		~~- substr( $text, $start, $off + $len - $start ) );~~
710		~~- $start = $off + $len;~~
711		~~- $found = true;~~
712		~~- break;~~
713		~~- } else{~~
714		~~- // end of nested element~~
715		~~- $level -= 1;~~
716		~~- }~~
717		~~- } else{~~
718		~~- // nested~~
719		~~- $level += 1;~~
720		~~- }~~
721		~~- $offset = $endMatches[0][1] + strlen($endMatches[0][0]);~~
722		~~- }~~
723		~~- if( ! $found ){~~
724		~~- // couldn't find appropriate closing tag, skip~~
725		~~- $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );~~
726		~~- $start += strlen($matches[0][0]);~~
727		~~- }~~
728		~~- continue;~~
729		~~- }~~
730		~~- }~~
731		~~- // else: add as text extract~~
732		~~- $this->splitAndAdd( $textExt, $count, substr($text,$start) );~~
733		~~- break;~~
734		~~- }~~
735		-
736		~~- $all = $textExt + $otherExt; // these have disjunct key sets~~
737		-
738		~~- wfProfileOut( "$fname-split" );~~
739		-
740		~~- // prepare regexps~~
741		~~- foreach( $terms as $index => $term ) {~~
742		~~- $terms[$index] = preg_quote( $term, '/' );~~
743		~~- // manually do upper/lowercase stuff for utf-8 since PHP won't do it~~
744		~~- if(preg_match('/[\x80-\xff]/', $term) ){~~
745		~~- $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);~~
746		~~- }~~
747		-
748		-
749		~~- }~~
750		~~- $anyterm = implode( '\|', $terms );~~
751		~~- $phrase = implode('[, .:;"\'\-\+]+', $terms );~~
752		-
753		~~- // FIXME: a hack to scale contextchars, a correct solution~~
754		~~- // would be to have contextchars actually be char and not byte~~
755		~~- // length, and do proper utf-8 substrings and lengths everywhere,~~
756		~~- // but PHP is making that very hard and unclean to implement :(~~
757		~~- $scale = strlen($anyterm) / mb_strlen($anyterm);~~
758		~~- $contextchars = intval( $contextchars * $scale );~~
759		-
760		~~- $pat1 = '/('.$phrase.')/ui';~~
761		~~- $pat2 = '/('.$anyterm.')/ui';~~
762		-
763		~~- wfProfileIn( "$fname-extract" );~~
764		-
765		~~- $left = $contextlines;~~
766		-
767		~~- $snippets = array();~~
768		~~- $offsets = array();~~
769		~~- // match whole query on text~~
770		~~- $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);~~
771		~~- // match whole query on templates/tables/images~~
772		~~- $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);~~
773		~~- // match any words on text~~
774		~~- $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);~~
775		~~- // match any words on templates/tables/images~~
776		~~- $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);~~
777		-
778		~~- ksort($snippets);~~
779		-
780		~~- $first = array_keys($textExt);~~
781		~~- if( isset($first[0]))~~
782		~~- $first = $first[0];~~
783		~~- else~~
784		~~- $first = 0;~~
785		-
786		~~- // add extra chars to each snippet to make snippets constant size~~
787		~~- $extended = array();~~
788		~~- if( count( $snippets ) == 0){~~
789		~~- // couldn't find the target words, just show beginning of article~~
790		~~- $targetchars = $contextchars * $contextlines;~~
791		~~- $snippets[$first] = '';~~
792		~~- $offsets[$first] = 0;~~
793		~~- } else{~~
794		~~- // if begin of the article contains the whole phrase, show only that !!~~
795		~~- if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])~~
796		~~- && $offsets[$first] < $contextchars * 2 ){~~
797		~~- $snippets = array ($first => $snippets[$first]);~~
798		~~- }~~
799		-
800		~~- // calc by how much to extend existing snippets~~
801		~~- $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );~~
802		~~- }~~
803		-
804		~~- foreach($snippets as $index => $line){~~
805		~~- $extended[$index] = $line;~~
806		~~- $len = strlen($line);~~
807		~~- if( $len < $targetchars - 20 ){~~
808		~~- // complete this line~~
809		~~- if($len < strlen( $all[$index] )){~~
810		~~- $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);~~
811		~~- $len = strlen( $extended[$index] );~~
812		~~- }~~
813		-
814		~~- // add more lines~~
815		~~- $add = $index + 1;~~
816		~~- while( $len < $targetchars - 20~~
817		~~- && array_key_exists($add,$all)~~
818		~~- && !array_key_exists($add,$snippets) ){~~
819		~~- $offsets[$add] = 0;~~
820		~~- $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );~~
821		~~- $extended[$add] = $tt;~~
822		~~- $len += strlen( $tt );~~
823		~~- $add++;~~
824		~~- }~~
825		~~- }~~
826		~~- }~~
827		-
828		~~- $snippets = array_map('htmlspecialchars', $extended);~~
829		~~- $last = -1;~~
830		~~- $extract = '';~~
831		~~- foreach($snippets as $index => $line){~~
832		~~- if($last == -1)~~
833		~~- $extract .= $line; // first line~~
834		~~- elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))~~
835		~~- $extract .= " ".$line; // continous lines~~
836		~~- else~~
837		~~- $extract .= '<b> ... </b>' . $line;~~
838		-
839		~~- $last = $index;~~
840		~~- }~~
841		~~- if( $extract )~~
842		~~- $extract .= '<b> ... </b>';~~
843		-
844		~~- // highlight words~~
845		~~- $pat3 = '/(' . $anyterm . ")/ui";~~
846		~~- $extract = preg_replace( $pat3,~~
847		~~- "<span class='searchmatch'>\\1</span>", $extract );~~
848		-
849		~~- wfProfileOut( "$fname-extract" );~~
850		-
851		~~- return $extract;~~
852		~~- }~~
853		-
854		- /**
855		~~- * Split text into lines and add it to extracts array~~
856		- *
857		~~- * @param array $extracts index -> $line~~
858		~~- * @param int $count~~
859		~~- * @param string $text~~
860		~~- */~~
861		~~- function splitAndAdd(&$extracts, &$count, $text){~~
862		~~- $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );~~
863		~~- foreach($split as $line){~~
864		~~- $tt = trim($line);~~
865		~~- if( $tt )~~
866		~~- $extracts[$count++] = $tt;~~
867		~~- }~~
868		~~- }~~
869		-
870		- /**
871		~~- * Do manual case conversion for non-ascii chars~~
872		- *
873		~~- * @param unknown_type $matches~~
874		~~- */~~
875		~~- function caseCallback($matches){~~
876		~~- global $wgContLang;~~
877		~~- if( strlen($matches[0]) > 1 ){~~
878		~~- return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';~~
879		~~- } else~~
880		~~- return $matches[0];~~
881		~~- }~~
882		-
883		- /**
884		~~- * Extract part of the text from start to end, but by~~
885		~~- * not chopping up words~~
886		~~- * @param string $text~~
887		~~- * @param int $start~~
888		~~- * @param int $end~~
889		~~- * @param int $posStart (out) actual start position~~
890		~~- * @param int $posEnd (out) actual end position~~
891		~~- * @return string~~
892		~~- */~~
893		~~- function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){~~
894		~~- global $wgContLang;~~
895		-
896		~~- if( $start != 0)~~
897		~~- $start = $this->position( $text, $start, 1 );~~
898		~~- if( $end >= strlen($text) )~~
899		~~- $end = strlen($text);~~
900		~~- else~~
901		~~- $end = $this->position( $text, $end );~~
902		-
903		~~- if(!is_null($posStart))~~
904		~~- $posStart = $start;~~
905		~~- if(!is_null($posEnd))~~
906		~~- $posEnd = $end;~~
907		-
908		~~- if($end > $start)~~
909		~~- return substr($text, $start, $end-$start);~~
910		~~- else~~
911		~~- return '';~~
912		~~- }~~
913		-
914		- /**
915		~~- * Find a nonletter near a point (index) in the text~~
916		- *
917		~~- * @param string $text~~
918		~~- * @param int $point~~
919		~~- * @param int $offset to found index~~
920		~~- * @return int nearest nonletter index, or beginning of utf8 char if none~~
921		~~- */~~
922		~~- function position($text, $point, $offset=0 ){~~
923		~~- $tolerance = 10;~~
924		~~- $s = max( 0, $point - $tolerance );~~
925		~~- $l = min( strlen($text), $point + $tolerance ) - $s;~~
926		~~- $m = array();~~
927		~~- if( preg_match('/[ ,.!?~!@#$%^&*+=\-\\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){~~
928		~~- return $m[0][1] + $s + $offset;~~
929		~~- } else{~~
930		~~- // check if point is on a valid first UTF8 char~~
931		~~- $char = ord( $text[$point] );~~
932		~~- while( $char >= 0x80 && $char < 0xc0 ) {~~
933		~~- // skip trailing bytes~~
934		~~- $point++;~~
935		~~- if($point >= strlen($text))~~
936		~~- return strlen($text);~~
937		~~- $char = ord( $text[$point] );~~
938		~~- }~~
939		~~- return $point;~~
940		-
941		~~- }~~
942		~~- }~~
943		-
944		- /**
945		~~- * Search extracts for a pattern, and return snippets~~
946		- *
947		~~- * @param string $pattern regexp for matching lines~~
948		~~- * @param array $extracts extracts to search~~
949		~~- * @param int $linesleft number of extracts to make~~
950		~~- * @param int $contextchars length of snippet~~
951		~~- * @param array $out map for highlighted snippets~~
952		~~- * @param array $offsets map of starting points of snippets~~
953		~~- * @protected~~
954		~~- */~~
955		~~- function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){~~
956		~~- if($linesleft == 0)~~
957		~~- return; // nothing to do~~
958		~~- foreach($extracts as $index => $line){~~
959		~~- if( array_key_exists($index,$out) )~~
960		~~- continue; // this line already highlighted~~
961		-
962		~~- $m = array();~~
963		~~- if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )~~
964		~~- continue;~~
965		-
966		~~- $offset = $m[0][1];~~
967		~~- $len = strlen($m[0][0]);~~
968		~~- if($offset + $len < $contextchars)~~
969		~~- $begin = 0;~~
970		~~- elseif( $len > $contextchars)~~
971		~~- $begin = $offset;~~
972		~~- else~~
973		~~- $begin = $offset + intval( ($len - $contextchars) / 2 );~~
974		-
975		~~- $end = $begin + $contextchars;~~
976		-
977		~~- $posBegin = $begin;~~
978		~~- // basic snippet from this line~~
979		~~- $out[$index] = $this->extract($line,$begin,$end,$posBegin);~~
980		~~- $offsets[$index] = $posBegin;~~
981		~~- $linesleft--;~~
982		~~- if($linesleft == 0)~~
983		~~- return;~~
984		~~- }~~
985		~~- }~~
986		-
987		- /**
988		~~- * Basic wikitext removal~~
989		~~- * @protected~~
990		~~- */~~
991		~~- function removeWiki($text) {~~
992		~~- $fname = __METHOD__;~~
993		~~- wfProfileIn( $fname );~~
994		-
995		~~- //$text = preg_replace("/'{2,5}/", "", $text);~~
996		~~- //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);~~
997		~~- //$text = preg_replace("/\[\[([^]\|]+)\]\]/", "\\1", $text);~~
998		~~- //$text = preg_replace("/\[\[([^]]+\\|)?([^\|]]+)\]\]/", "\\2", $text);~~
999		~~- //$text = preg_replace("/\\{\\\|(.*?)\\\|\\}/", "", $text);~~
1000		~~- //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^\|]+?)\\]\\]/", "", $text);~~
1001		~~- $text = preg_replace("/\\{\\{([^\|]+?)\\}\\}/", "", $text);~~
1002		~~- $text = preg_replace("/\\{\\{([^\|]+\\\|)(.*?)\\}\\}/", "\\2", $text);~~
1003		~~- $text = preg_replace("/\\[\\[([^\|]+?)\\]\\]/", "\\1", $text);~~
1004		~~- $text = preg_replace_callback("/\\[\\[([^\|]+\\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);~~
1005		~~- //$text = preg_replace("/\\[\\[([^\|]+\\\|)(.*?)\\]\\]/", "\\2", $text);~~
1006		~~- $text = preg_replace("/<\/?[^>]+>/", "", $text);~~
1007		~~- $text = preg_replace("/'''''/", "", $text);~~
1008		~~- $text = preg_replace("/('''\|<\/?[iIuUbB]>)/", "", $text);~~
1009		~~- $text = preg_replace("/''/", "", $text);~~
1010		-
1011		~~- wfProfileOut( $fname );~~
1012		~~- return $text;~~
1013		~~- }~~
1014		-
1015		- /**
1016		~~- * callback to replace [[target\|caption]] kind of links, if~~
1017		~~- * the target is category or image, leave it~~
1018		- *
1019		~~- * @param array $matches~~
1020		~~- */~~
1021		~~- function linkReplace($matches){~~
1022		~~- $colon = strpos( $matches[1], ':' );~~
1023		~~- if( $colon === false )~~
1024		~~- return $matches[2]; // replace with caption~~
1025		~~- global $wgContLang;~~
1026		~~- $ns = substr( $matches[1], 0, $colon );~~
1027		~~- $index = $wgContLang->getNsIndex($ns);~~
1028		~~- if( $index !== false && ($index == NS_IMAGE \|\| $index == NS_CATEGORY) )~~
1029		~~- return $matches[0]; // return the whole thing~~
1030		~~- else~~
1031		~~- return $matches[2];~~
1032		-
1033		~~- }~~
1034		-}
1035		-
1036		-/**
1037		~~- * @addtogroup Search~~
1038		~~- */~~
1039	694	class SearchEngineDummy {
1040	695	function search( $term ) {
1041	696	return null;

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r32350	Search frontend:...	rainman	13:43, 23 March 2008
r33489	The problem also applies to all the other regex special chars: try it out wit...	simetrical	15:59, 17 April 2008
r34072	New class SearchHighlighter handles highlighting of search terms and...	rainman	13:36, 1 May 2008

Status & tagging log

15:26, 12 September 2011 Meno25 (talk | contribs) changed the status of r34086 [removed: ok added: old]