r34086 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r34085 | r34086 | r34087 >
Date: 20:55, 1 May 2008
Author: brion
Status: old
Tags:
Comment:
Revert for now:
* r34072 -- new highlighter code; looks a bit expensive, not fully tested yet.
* r33489 -- broke search result highlighting all around
* Part of r32350 -- bring the color back to search highlighting so we can see our results again. Why was this removed without comment?
Modified paths:
  • /trunk/phase3/includes/SearchEngine.php (modified) (history)
  • /trunk/phase3/skins/monobook/main.css (modified) (history)

Diff [purge]

Index: trunk/phase3/skins/monobook/main.css
@@ -1564,6 +1564,7 @@
15651565
15661566 span.searchmatch {
15671567 font-weight: bold;
 1568+ color: red;
15681569 }
15691570
15701571 /* God-damned hack for the crappy layout */
Index: trunk/phase3/includes/SearchEngine.php
@@ -250,9 +250,8 @@
251251 */
252252 public static function userHighlightPrefs( &$user ){
253253 //$contextlines = $user->getOption( 'contextlines', 5 );
254 - //$contextchars = $user->getOption( 'contextchars', 50 );
255254 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
256 - $contextchars = 75; // same as above.... :P
 255+ $contextchars = $user->getOption( 'contextchars', 50 );
257256 return array($contextlines, $contextchars);
258257 }
259258
@@ -547,18 +546,73 @@
548547 }
549548
550549 /**
551 - * @param array $terms Terms to highlight (unescaped)
 550+ * @param array $terms terms to highlight
552551 * @return string highlighted text snippet, null (and not '') if not supported
553552 */
554553 function getTextSnippet($terms){
555554 global $wgUser;
556555 $this->initText();
557556 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
558 - $h = new SearchHighlighter();
559 - return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars);
 557+ return $this->extractText( $this->mText, $terms, $contextlines, $contextchars);
560558 }
561559
562560 /**
 561+ * Default implementation of snippet extraction
 562+ *
 563+ * @param string $text
 564+ * @param array $terms
 565+ * @param int $contextlines
 566+ * @param int $contextchars
 567+ * @return string
 568+ */
 569+ protected function extractText( $text, $terms, $contextlines, $contextchars ) {
 570+ global $wgLang, $wgContLang;
 571+ $fname = __METHOD__;
 572+
 573+ $lines = explode( "\n", $text );
 574+
 575+ $terms = implode( '|', $terms );
 576+ $terms = str_replace( '/', "\\/", $terms);
 577+ $max = intval( $contextchars ) + 1;
 578+ $pat1 = "/(.*)($terms)(.{0,$max})/i";
 579+
 580+ $lineno = 0;
 581+
 582+ $extract = "";
 583+ wfProfileIn( "$fname-extract" );
 584+ foreach ( $lines as $line ) {
 585+ if ( 0 == $contextlines ) {
 586+ break;
 587+ }
 588+ ++$lineno;
 589+ $m = array();
 590+ if ( ! preg_match( $pat1, $line, $m ) ) {
 591+ continue;
 592+ }
 593+ --$contextlines;
 594+ $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
 595+
 596+ if ( count( $m ) < 3 ) {
 597+ $post = '';
 598+ } else {
 599+ $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
 600+ }
 601+
 602+ $found = $m[2];
 603+
 604+ $line = htmlspecialchars( $pre . $found . $post );
 605+ $pat2 = '/(' . $terms . ")/i";
 606+ $line = preg_replace( $pat2,
 607+ "<span class='searchmatch'>\\1</span>", $line );
 608+
 609+ $extract .= "${line}\n";
 610+ }
 611+ wfProfileOut( "$fname-extract" );
 612+
 613+ return $extract;
 614+ }
 615+
 616+ /**
563617 * @param array $terms terms to highlight
564618 * @return string highlighted title, '' if not supported
565619 */
@@ -634,407 +688,8 @@
635689 }
636690
637691 /**
638 - * Highlight bits of wikitext
639 - *
640692 * @addtogroup Search
641693 */
642 -class SearchHighlighter {
643 - var $mCleanWikitext = true;
644 -
645 - function SearchHighlighter($cleanupWikitext = true){
646 - $this->mCleanWikitext = $cleanupWikitext;
647 - }
648 -
649 - /**
650 - * Default implementation of wikitext highlighting
651 - *
652 - * @param string $text
653 - * @param array $terms Terms to highlight (unescaped)
654 - * @param int $contextlines
655 - * @param int $contextchars
656 - * @return string
657 - */
658 - public function highlightText( $text, $terms, $contextlines, $contextchars ) {
659 - global $wgLang, $wgContLang;
660 - $fname = __METHOD__;
661 -
662 - if($text == '')
663 - return '';
664 -
665 - // spli text into text + templates/links/tables
666 - $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/";
667 - // first capture group is for detecting nested templates/links/tables
668 - $endPatterns = array(
669 - 1 => '/(\{\{)|(\}\})/', // template
670 - 2 => '/(\[\[)|(\]\])/', // image
671 - 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
672 - $textExt = array(); // text extracts
673 - $otherExt = array(); // other extracts
674 - wfProfileIn( "$fname-split" );
675 - $start = 0;
676 - $textLen = strlen($text);
677 - $count = 0; // sequence number to maintain ordering
678 - while( $start < $textLen ){
679 - // find start of template/image/table
680 - if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
681 - $epat = '';
682 - foreach($matches as $key => $val){
683 - if($key > 0 && $val[1] != -1){
684 - if($key == 2){
685 - // see if this is an image link
686 - $ns = substr($val[0],2,-1);
687 - if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
688 - break;
689 -
690 - }
691 - $epat = $endPatterns[$key];
692 - $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
693 - $start = $val[1];
694 - break;
695 - }
696 - }
697 - if( $epat ){
698 - // find end (and detect any nested elements)
699 - $level = 0;
700 - $offset = $start + 1;
701 - $found = false;
702 - while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
703 - if( array_key_exists(2,$endMatches) ){
704 - // found end
705 - if($level == 0){
706 - $len = strlen($endMatches[2][0]);
707 - $off = $endMatches[2][1];
708 - $this->splitAndAdd( $otherExt, $count,
709 - substr( $text, $start, $off + $len - $start ) );
710 - $start = $off + $len;
711 - $found = true;
712 - break;
713 - } else{
714 - // end of nested element
715 - $level -= 1;
716 - }
717 - } else{
718 - // nested
719 - $level += 1;
720 - }
721 - $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
722 - }
723 - if( ! $found ){
724 - // couldn't find appropriate closing tag, skip
725 - $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
726 - $start += strlen($matches[0][0]);
727 - }
728 - continue;
729 - }
730 - }
731 - // else: add as text extract
732 - $this->splitAndAdd( $textExt, $count, substr($text,$start) );
733 - break;
734 - }
735 -
736 - $all = $textExt + $otherExt; // these have disjunct key sets
737 -
738 - wfProfileOut( "$fname-split" );
739 -
740 - // prepare regexps
741 - foreach( $terms as $index => $term ) {
742 - $terms[$index] = preg_quote( $term, '/' );
743 - // manually do upper/lowercase stuff for utf-8 since PHP won't do it
744 - if(preg_match('/[\x80-\xff]/', $term) ){
745 - $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
746 - }
747 -
748 -
749 - }
750 - $anyterm = implode( '|', $terms );
751 - $phrase = implode('[, .:;\(\)"\'\-\+]+', $terms );
752 -
753 - // FIXME: a hack to scale contextchars, a correct solution
754 - // would be to have contextchars actually be char and not byte
755 - // length, and do proper utf-8 substrings and lengths everywhere,
756 - // but PHP is making that very hard and unclean to implement :(
757 - $scale = strlen($anyterm) / mb_strlen($anyterm);
758 - $contextchars = intval( $contextchars * $scale );
759 -
760 - $pat1 = '/('.$phrase.')/ui';
761 - $pat2 = '/('.$anyterm.')/ui';
762 -
763 - wfProfileIn( "$fname-extract" );
764 -
765 - $left = $contextlines;
766 -
767 - $snippets = array();
768 - $offsets = array();
769 - // match whole query on text
770 - $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
771 - // match whole query on templates/tables/images
772 - $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
773 - // match any words on text
774 - $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
775 - // match any words on templates/tables/images
776 - $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
777 -
778 - ksort($snippets);
779 -
780 - $first = array_keys($textExt);
781 - if( isset($first[0]))
782 - $first = $first[0];
783 - else
784 - $first = 0;
785 -
786 - // add extra chars to each snippet to make snippets constant size
787 - $extended = array();
788 - if( count( $snippets ) == 0){
789 - // couldn't find the target words, just show beginning of article
790 - $targetchars = $contextchars * $contextlines;
791 - $snippets[$first] = '';
792 - $offsets[$first] = 0;
793 - } else{
794 - // if begin of the article contains the whole phrase, show only that !!
795 - if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
796 - && $offsets[$first] < $contextchars * 2 ){
797 - $snippets = array ($first => $snippets[$first]);
798 - }
799 -
800 - // calc by how much to extend existing snippets
801 - $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
802 - }
803 -
804 - foreach($snippets as $index => $line){
805 - $extended[$index] = $line;
806 - $len = strlen($line);
807 - if( $len < $targetchars - 20 ){
808 - // complete this line
809 - if($len < strlen( $all[$index] )){
810 - $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
811 - $len = strlen( $extended[$index] );
812 - }
813 -
814 - // add more lines
815 - $add = $index + 1;
816 - while( $len < $targetchars - 20
817 - && array_key_exists($add,$all)
818 - && !array_key_exists($add,$snippets) ){
819 - $offsets[$add] = 0;
820 - $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
821 - $extended[$add] = $tt;
822 - $len += strlen( $tt );
823 - $add++;
824 - }
825 - }
826 - }
827 -
828 - $snippets = array_map('htmlspecialchars', $extended);
829 - $last = -1;
830 - $extract = '';
831 - foreach($snippets as $index => $line){
832 - if($last == -1)
833 - $extract .= $line; // first line
834 - elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
835 - $extract .= " ".$line; // continous lines
836 - else
837 - $extract .= '<b> ... </b>' . $line;
838 -
839 - $last = $index;
840 - }
841 - if( $extract )
842 - $extract .= '<b> ... </b>';
843 -
844 - // highlight words
845 - $pat3 = '/(' . $anyterm . ")/ui";
846 - $extract = preg_replace( $pat3,
847 - "<span class='searchmatch'>\\1</span>", $extract );
848 -
849 - wfProfileOut( "$fname-extract" );
850 -
851 - return $extract;
852 - }
853 -
854 - /**
855 - * Split text into lines and add it to extracts array
856 - *
857 - * @param array $extracts index -> $line
858 - * @param int $count
859 - * @param string $text
860 - */
861 - function splitAndAdd(&$extracts, &$count, $text){
862 - $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
863 - foreach($split as $line){
864 - $tt = trim($line);
865 - if( $tt )
866 - $extracts[$count++] = $tt;
867 - }
868 - }
869 -
870 - /**
871 - * Do manual case conversion for non-ascii chars
872 - *
873 - * @param unknown_type $matches
874 - */
875 - function caseCallback($matches){
876 - global $wgContLang;
877 - if( strlen($matches[0]) > 1 ){
878 - return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
879 - } else
880 - return $matches[0];
881 - }
882 -
883 - /**
884 - * Extract part of the text from start to end, but by
885 - * not chopping up words
886 - * @param string $text
887 - * @param int $start
888 - * @param int $end
889 - * @param int $posStart (out) actual start position
890 - * @param int $posEnd (out) actual end position
891 - * @return string
892 - */
893 - function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
894 - global $wgContLang;
895 -
896 - if( $start != 0)
897 - $start = $this->position( $text, $start, 1 );
898 - if( $end >= strlen($text) )
899 - $end = strlen($text);
900 - else
901 - $end = $this->position( $text, $end );
902 -
903 - if(!is_null($posStart))
904 - $posStart = $start;
905 - if(!is_null($posEnd))
906 - $posEnd = $end;
907 -
908 - if($end > $start)
909 - return substr($text, $start, $end-$start);
910 - else
911 - return '';
912 - }
913 -
914 - /**
915 - * Find a nonletter near a point (index) in the text
916 - *
917 - * @param string $text
918 - * @param int $point
919 - * @param int $offset to found index
920 - * @return int nearest nonletter index, or beginning of utf8 char if none
921 - */
922 - function position($text, $point, $offset=0 ){
923 - $tolerance = 10;
924 - $s = max( 0, $point - $tolerance );
925 - $l = min( strlen($text), $point + $tolerance ) - $s;
926 - $m = array();
927 - if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
928 - return $m[0][1] + $s + $offset;
929 - } else{
930 - // check if point is on a valid first UTF8 char
931 - $char = ord( $text[$point] );
932 - while( $char >= 0x80 && $char < 0xc0 ) {
933 - // skip trailing bytes
934 - $point++;
935 - if($point >= strlen($text))
936 - return strlen($text);
937 - $char = ord( $text[$point] );
938 - }
939 - return $point;
940 -
941 - }
942 - }
943 -
944 - /**
945 - * Search extracts for a pattern, and return snippets
946 - *
947 - * @param string $pattern regexp for matching lines
948 - * @param array $extracts extracts to search
949 - * @param int $linesleft number of extracts to make
950 - * @param int $contextchars length of snippet
951 - * @param array $out map for highlighted snippets
952 - * @param array $offsets map of starting points of snippets
953 - * @protected
954 - */
955 - function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
956 - if($linesleft == 0)
957 - return; // nothing to do
958 - foreach($extracts as $index => $line){
959 - if( array_key_exists($index,$out) )
960 - continue; // this line already highlighted
961 -
962 - $m = array();
963 - if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
964 - continue;
965 -
966 - $offset = $m[0][1];
967 - $len = strlen($m[0][0]);
968 - if($offset + $len < $contextchars)
969 - $begin = 0;
970 - elseif( $len > $contextchars)
971 - $begin = $offset;
972 - else
973 - $begin = $offset + intval( ($len - $contextchars) / 2 );
974 -
975 - $end = $begin + $contextchars;
976 -
977 - $posBegin = $begin;
978 - // basic snippet from this line
979 - $out[$index] = $this->extract($line,$begin,$end,$posBegin);
980 - $offsets[$index] = $posBegin;
981 - $linesleft--;
982 - if($linesleft == 0)
983 - return;
984 - }
985 - }
986 -
987 - /**
988 - * Basic wikitext removal
989 - * @protected
990 - */
991 - function removeWiki($text) {
992 - $fname = __METHOD__;
993 - wfProfileIn( $fname );
994 -
995 - //$text = preg_replace("/'{2,5}/", "", $text);
996 - //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
997 - //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
998 - //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
999 - //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1000 - //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1001 - $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1002 - $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1003 - $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1004 - $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1005 - //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1006 - $text = preg_replace("/<\/?[^>]+>/", "", $text);
1007 - $text = preg_replace("/'''''/", "", $text);
1008 - $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1009 - $text = preg_replace("/''/", "", $text);
1010 -
1011 - wfProfileOut( $fname );
1012 - return $text;
1013 - }
1014 -
1015 - /**
1016 - * callback to replace [[target|caption]] kind of links, if
1017 - * the target is category or image, leave it
1018 - *
1019 - * @param array $matches
1020 - */
1021 - function linkReplace($matches){
1022 - $colon = strpos( $matches[1], ':' );
1023 - if( $colon === false )
1024 - return $matches[2]; // replace with caption
1025 - global $wgContLang;
1026 - $ns = substr( $matches[1], 0, $colon );
1027 - $index = $wgContLang->getNsIndex($ns);
1028 - if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1029 - return $matches[0]; // return the whole thing
1030 - else
1031 - return $matches[2];
1032 -
1033 - }
1034 -}
1035 -
1036 -/**
1037 - * @addtogroup Search
1038 - */
1039694 class SearchEngineDummy {
1040695 function search( $term ) {
1041696 return null;

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r32350 | Search frontend:... | rainman | 13:43, 23 March 2008
r33489 | The problem also applies to all the other regex special chars: try it out wit... | simetrical | 15:59, 17 April 2008
r34072 | New class SearchHighlighter handles highlighting of search terms and... | rainman | 13:36, 1 May 2008

Status & tagging log