Index: trunk/phase3/skins/monobook/main.css |
— | — | @@ -1564,6 +1564,7 @@ |
1565 | 1565 | |
1566 | 1566 | span.searchmatch { |
1567 | 1567 | font-weight: bold; |
| 1568 | + color: red; |
1568 | 1569 | } |
1569 | 1570 | |
1570 | 1571 | /* God-damned hack for the crappy layout */ |
Index: trunk/phase3/includes/SearchEngine.php |
— | — | @@ -250,9 +250,8 @@ |
251 | 251 | */ |
public static function userHighlightPrefs( &$user ){
	// The number of context lines is deliberately fixed; the per-user
	// 'contextlines' option is not consulted.
	$contextlines = 2;

	// The snippet width stays a user preference (default 50). The option
	// value is normalised to an integer because the snippet code does
	// arithmetic on it and interpolates it into a regular expression.
	$contextchars = intval( $user->getOption( 'contextchars', 50 ) );

	return array( $contextlines, $contextchars );
}
259 | 258 | |
— | — | @@ -547,18 +546,73 @@ |
548 | 547 | } |
549 | 548 | |
/**
 * Build the highlighted text snippet for this search result, using the
 * current user's highlighting preferences.
 *
 * @param array $terms terms to highlight
 * @return string highlighted text snippet, null (and not '') if not supported
 */
function getTextSnippet($terms){
	global $wgUser;

	// presumably loads $this->mText, which is read below -- verify in initText()
	$this->initText();

	$prefs = SearchEngine::userHighlightPrefs($wgUser);
	$maxLines = $prefs[0];
	$charsPerSide = $prefs[1];

	$snippet = $this->extractText( $this->mText, $terms, $maxLines, $charsPerSide );
	return $snippet;
}
561 | 559 | |
/**
 * Default implementation of snippet extraction.
 *
 * Scans $text line by line and, for up to $contextlines lines that
 * contain one of the terms, emits the match surrounded by roughly
 * $contextchars of context on either side. The result is HTML: the text
 * is escaped and every term occurrence is wrapped in
 * <span class='searchmatch'>.
 *
 * The terms are interpolated into the pattern as they are (only the '/'
 * delimiter is escaped), i.e. they are treated as regular-expression
 * fragments. Callers must pass terms that are already safe to use that way.
 *
 * @param string $text
 * @param array $terms regular-expression fragments to look for
 * @param int $contextlines maximum number of matching lines to emit
 * @param int $contextchars amount of context kept before and after a match
 * @return string HTML snippet, one extracted line per "\n"-terminated row;
 *  '' when there is nothing to search for or nothing matched
 */
protected function extractText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	$fname = __METHOD__;

	// An empty alternative would match at every position of every line,
	// so empty terms are dropped and an empty term list yields no snippet.
	$usable = array();
	foreach ( $terms as $term ) {
		if ( (string)$term !== '' ) {
			$usable[] = $term;
		}
	}
	if ( count( $usable ) == 0 ) {
		return '';
	}

	$alternation = str_replace( '/', "\\/", implode( '|', $usable ) );
	// Clamp so that a negative preference cannot produce an invalid
	// {0,-n} quantifier.
	$max = max( 0, intval( $contextchars ) ) + 1;
	// Greedy (.*) means the last occurrence on the line is the one captured.
	$linePattern = "/(.*)($alternation)(.{0,$max})/i";
	$termPattern = "/($alternation)/i";

	$extract = '';
	wfProfileIn( "$fname-extract" );
	foreach ( explode( "\n", $text ) as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( !preg_match( $linePattern, $line, $m ) ) {
			continue;
		}
		--$contextlines;

		$pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
		$post = isset( $m[3] )
			? $wgContLang->truncate( $m[3], $contextchars, ' ... ' )
			: '';

		$extract .= $this->escapeAndHighlight( $pre . $m[2] . $post, $termPattern ) . "\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}

/**
 * HTML-escape a line and wrap every match of $termPattern in a
 * searchmatch span.
 *
 * Matching is done on the raw text and each piece is escaped separately,
 * so a term can never match inside an entity produced by escaping
 * (e.g. "amp" inside "&amp;").
 *
 * @param string $line raw (unescaped) text
 * @param string $termPattern complete PCRE pattern, delimiters included
 * @return string HTML
 */
private function escapeAndHighlight( $line, $termPattern ) {
	$found = array();
	if ( !preg_match_all( $termPattern, $line, $found, PREG_OFFSET_CAPTURE ) ) {
		return htmlspecialchars( $line );
	}

	$html = '';
	$pos = 0;
	foreach ( $found[0] as $hit ) {
		$matched = $hit[0];
		$offset = $hit[1];
		if ( $matched === '' ) {
			// zero-length match: nothing to highlight
			continue;
		}
		$html .= htmlspecialchars( (string)substr( $line, $pos, $offset - $pos ) );
		$html .= "<span class='searchmatch'>" . htmlspecialchars( $matched ) . '</span>';
		$pos = $offset + strlen( $matched );
	}

	return $html . htmlspecialchars( (string)substr( $line, $pos ) );
}
| 615 | + |
| 616 | + /** |
563 | 617 | * @param array $terms terms to highlight |
564 | 618 | * @return string highlighted title, '' if not supported |
565 | 619 | */ |
— | — | @@ -634,407 +688,8 @@ |
635 | 689 | } |
636 | 690 | |
637 | 691 | /** |
638 | | - * Highlight bits of wikitext |
639 | | - * |
640 | 692 | * @addtogroup Search |
641 | 693 | */ |
class SearchHighlighter {
	/** Whether splitAndAdd() runs text through removeWiki() before splitting */
	public $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext strip wiki markup from extracts
	 */
	function __construct($cleanupWikitext = true){
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Old-style constructor name, kept so explicit calls keep working.
	 * PHP 8 no longer treats a method named after the class as the
	 * constructor, hence __construct() above.
	 *
	 * @deprecated use the constructor
	 * @param bool $cleanupWikitext
	 */
	function SearchHighlighter($cleanupWikitext = true){
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting
	 *
	 * Splits the text into plain-text extracts and "other" extracts
	 * (templates, image links, tables), looks for the whole phrase and then
	 * for any single term in each group, pads the hits to a roughly constant
	 * size, escapes them and wraps term occurrences in
	 * <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (unescaped)
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		if($text == '')
			return '';

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/";
		// first capture group is for detecting nested templates/links/tables
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen($text);
		$count = 0; // sequence number to maintain ordering
		while( $start < $textLen ){
			// find start of template/image/table
			if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
				$epat = '';
				foreach($matches as $key => $val){
					if($key > 0 && $val[1] != -1){
						if($key == 2){
							// see if this is an image link
							$ns = substr($val[0],2,-1);
							if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
								break;

						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if( $epat ){
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
						if( array_key_exists(2,$endMatches) ){
							// found end
							if($level == 0){
								$len = strlen($endMatches[2][0]);
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else{
								// end of nested element
								$level -= 1;
							}
						} else{
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen($endMatches[0][0]);
					}
					if( ! $found ){
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
						$start += strlen($matches[0][0]);
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr($text,$start) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjoint key sets

		wfProfileOut( "$fname-split" );

		if( count( $all ) == 0 ){
			// nothing but whitespace/markup: there is no line to build a snippet from
			return '';
		}

		// prepare regexps; empty terms are skipped because an empty
		// alternative would match at every position
		$quoted = array();
		foreach( $terms as $term ) {
			if( (string)$term === '' )
				continue;
			$q = preg_quote( $term, '/' );
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if(preg_match('/[\x80-\xff]/', $term) ){
				$q = preg_replace_callback('/./us',array($this,'caseCallback'),$q);
			}
			$quoted[] = $q;
		}
		$anyterm = implode( '|', $quoted );
		$phrase = implode('[, .:;\(\)"\'\-\+]+', $quoted );
		$haveTerms = ( $anyterm !== '' );

		// FIXME: a hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		$termChars = mb_strlen($anyterm);
		// without usable terms there is nothing to scale by (and no division by zero)
		$scale = $termChars > 0 ? strlen($anyterm) / $termChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$pat1 = '/('.$phrase.')/ui';
		$pat2 = '/('.$anyterm.')/ui';

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();
		if( $haveTerms ){
			// match whole query on text
			$this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
			// match whole query on templates/tables/images
			$this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
			// match any words on text
			$this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
			// match any words on templates/tables/images
			$this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
		}

		ksort($snippets);

		$first = array_keys($textExt);
		if( isset($first[0]))
			$first = $first[0];
		else
			$first = 0;

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if( count( $snippets ) == 0){
			// couldn't find the target words, just show beginning of article
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		} else{
			// if begin of the article contains the whole phrase, show only that !!
			if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
				&& $offsets[$first] < $contextchars * 2 ){
				$snippets = array ($first => $snippets[$first]);
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
		}

		foreach($snippets as $index => $line){
			$extended[$index] = $line;
			$len = strlen($line);
			if( $len < $targetchars - 20 ){
				// complete this line
				if($len < strlen( $all[$index] )){
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while( $len < $targetchars - 20
						&& array_key_exists($add,$all)
						&& !array_key_exists($add,$snippets) ){
					$offsets[$add] = 0;
					$tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = array_map('htmlspecialchars', $extended);
		$last = -1;
		$extract = '';
		foreach($snippets as $index => $line){
			if($last == -1)
				$extract .= $line; // first line
			elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
				$extract .= " ".$line; // continuous lines
			else
				$extract .= '<b> ... </b>' . $line;

			$last = $index;
		}
		if( $extract )
			$extract .= '<b> ... </b>';

		// highlight words
		// NOTE(review): this runs on already-escaped text, so a term such as
		// "amp" or "quot" can match inside an entity; fixing that needs the
		// highlighting to happen before/while escaping.
		if( $haveTerms ){
			$pat3 = '/(' . $anyterm . ")/ui";
			$extract = preg_replace( $pat3,
				"<span class='searchmatch'>\\1</span>", $extract );
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Lines are trimmed; lines that are empty after trimming are skipped.
	 *
	 * @param array $extracts index -> $line
	 * @param int $count next index to use, advanced for every line added
	 * @param string $text
	 */
	function splitAndAdd(&$extracts, &$count, $text){
		$split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
		foreach($split as $line){
			$tt = trim($line);
			// compare against '' explicitly: a line consisting of "0" is not empty
			if( $tt !== '' )
				$extracts[$count++] = $tt;
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * @param array $matches regexp match of a single character
	 * @return string a [lowerUPPER] character class for multi-byte
	 *  characters, the character itself otherwise
	 */
	function caseCallback($matches){
		global $wgContLang;
		if( strlen($matches[0]) > 1 ){
			return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
		} else
			return $matches[0];
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int $posStart (out) actual start position
	 * @param int $posEnd (out) actual end position
	 * @return string
	 */
	function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
		if( $start != 0)
			$start = $this->position( $text, $start, 1 );
		if( $end >= strlen($text) )
			$end = strlen($text);
		else
			$end = $this->position( $text, $end );

		// the out parameters are only written when the caller passed a non-null value
		if(!is_null($posStart))
			$posStart = $start;
		if(!is_null($posEnd))
			$posEnd = $end;

		if($end > $start)
			return substr($text, $start, $end-$start);
		else
			return '';
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset to found index
	 * @return int nearest nonletter index, or beginning of utf8 char if none
	 */
	function position($text, $point, $offset=0 ){
		$tolerance = 10;
		$textLen = strlen($text);
		$s = max( 0, $point - $tolerance );
		$l = min( $textLen, $point + $tolerance ) - $s;
		$m = array();
		if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
			return $m[0][1] + $s + $offset;
		} else{
			// a point at or past the end has no byte to inspect
			if( $point >= $textLen )
				return $textLen;
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if($point >= $textLen)
					return $textLen;
				$char = ord( $text[$point] );
			}
			return $point;

		}
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * @param string $pattern regexp for matching lines
	 * @param array $extracts extracts to search
	 * @param int $linesleft number of extracts to make
	 * @param int $contextchars length of snippet
	 * @param array $out map for highlighted snippets
	 * @param array $offsets map of starting points of snippets
	 * @protected
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
		if($linesleft == 0)
			return; // nothing to do
		foreach($extracts as $index => $line){
			if( array_key_exists($index,$out) )
				continue; // this line already highlighted

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
				continue;

			$offset = $m[0][1];
			$len = strlen($m[0][0]);
			if($offset + $len < $contextchars)
				$begin = 0;
			elseif( $len > $contextchars)
				$begin = $offset;
			else
				$begin = $offset + intval( ($len - $contextchars) / 2 );

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract($line,$begin,$end,$posBegin);
			$offsets[$index] = $posBegin;
			$linesleft--;
			if($linesleft == 0)
				return;
		}
	}

	/**
	 * Basic wikitext removal
	 * @protected
	 * @param string $text
	 * @return string text with templates, link markup, tags and quote markup reduced
	 */
	function removeWiki($text) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		// templates without parameters are dropped, templates with
		// parameters are reduced to the text after the first pipe
		$text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
		$text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
		// [[target]] becomes target; [[target|caption]] goes through linkReplace()
		$text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
		$text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
		// strip tags and bold/italic quote markup
		$text = preg_replace("/<\/?[^>]+>/", "", $text);
		$text = preg_replace("/'''''/", "", $text);
		$text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
		$text = preg_replace("/''/", "", $text);

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string the caption, or the whole link for image/category targets
	 */
	function linkReplace($matches){
		$colon = strpos( $matches[1], ':' );
		if( $colon === false )
			return $matches[2]; // replace with caption
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex($ns);
		if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
			return $matches[0]; // return the whole thing
		else
			return $matches[2];

	}
}
1035 | | - |
1036 | | -/** |
1037 | | - * @addtogroup Search |
1038 | | - */ |
1039 | 694 | class SearchEngineDummy { |
1040 | 695 | function search( $term ) { |
1041 | 696 | return null; |