Index: trunk/phase3/includes/SearchEngine.php |
— | — | @@ -250,8 +250,9 @@ |
251 | 251 | */ |
252 | 252 | public static function userHighlightPrefs( &$user ){ |
253 | 253 | //$contextlines = $user->getOption( 'contextlines', 5 ); |
| 254 | + //$contextchars = $user->getOption( 'contextchars', 50 ); |
254 | 255 | $contextlines = 2; // Hardcode this. Old defaults sucked. :) |
255 | | - $contextchars = $user->getOption( 'contextchars', 50 ); |
| 256 | + $contextchars = 75; // same as above.... :P |
256 | 257 | return array($contextlines, $contextchars); |
257 | 258 | } |
258 | 259 | |
— | — | @@ -553,68 +554,11 @@ |
554 | 555 | global $wgUser; |
555 | 556 | $this->initText(); |
556 | 557 | list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser); |
557 | | - return $this->extractText( $this->mText, $terms, $contextlines, $contextchars); |
| 558 | + $h = new SearchHighlighter(); |
| 559 | + return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars); |
558 | 560 | } |
559 | 561 | |
560 | 562 | /** |
561 | | - * Default implementation of snippet extraction |
562 | | - * |
563 | | - * @param string $text |
564 | | - * @param array $terms Terms to highlight (unescaped) |
565 | | - * @param int $contextlines |
566 | | - * @param int $contextchars |
567 | | - * @return string |
568 | | - */ |
569 | | - protected function extractText( $text, $terms, $contextlines, $contextchars ) { |
570 | | - global $wgLang, $wgContLang; |
571 | | - $fname = __METHOD__; |
572 | | - |
573 | | - $lines = explode( "\n", $text ); |
574 | | - |
575 | | - foreach( $terms as $index => $term ) { |
576 | | - $terms[$index] = preg_quote( $term, '/' ); |
577 | | - } |
578 | | - $terms = implode( '|', $terms ); |
579 | | - $max = intval( $contextchars ) + 1; |
580 | | - $pat1 = "/(.*)($terms)(.{0,$max})/i"; |
581 | | - |
582 | | - $lineno = 0; |
583 | | - |
584 | | - $extract = ""; |
585 | | - wfProfileIn( "$fname-extract" ); |
586 | | - foreach ( $lines as $line ) { |
587 | | - if ( 0 == $contextlines ) { |
588 | | - break; |
589 | | - } |
590 | | - ++$lineno; |
591 | | - $m = array(); |
592 | | - if ( ! preg_match( $pat1, $line, $m ) ) { |
593 | | - continue; |
594 | | - } |
595 | | - --$contextlines; |
596 | | - $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' ); |
597 | | - |
598 | | - if ( count( $m ) < 3 ) { |
599 | | - $post = ''; |
600 | | - } else { |
601 | | - $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' ); |
602 | | - } |
603 | | - |
604 | | - $found = $m[2]; |
605 | | - |
606 | | - $line = htmlspecialchars( $pre . $found . $post ); |
607 | | - $pat2 = '/(' . $terms . ")/i"; |
608 | | - $line = preg_replace( $pat2, |
609 | | - "<span class='searchmatch'>\\1</span>", $line ); |
610 | | - |
611 | | - $extract .= "${line}\n"; |
612 | | - } |
613 | | - wfProfileOut( "$fname-extract" ); |
614 | | - |
615 | | - return $extract; |
616 | | - } |
617 | | - |
618 | | - /** |
619 | 563 | * @param array $terms terms to highlight |
620 | 564 | * @return string highlighted title, '' if not supported |
621 | 565 | */ |
— | — | @@ -690,8 +634,407 @@ |
691 | 635 | } |
692 | 636 | |
693 | 637 | /** |
| 638 | + * Highlight bits of wikitext |
| 639 | + * |
694 | 640 | * @addtogroup Search |
695 | 641 | */ |
| 642 | +class SearchHighlighter { |
| 643 | + var $mCleanWikitext = true; |
| 644 | + |
| 645 | + function SearchHighlighter($cleanupWikitext = true){ |
| 646 | + $this->mCleanWikitext = $cleanupWikitext; |
| 647 | + } |
| 648 | + |
| 649 | + /** |
| 650 | + * Default implementation of wikitext highlighting |
| 651 | + * |
| 652 | + * @param string $text |
| 653 | + * @param array $terms Terms to highlight (unescaped) |
| 654 | + * @param int $contextlines |
| 655 | + * @param int $contextchars |
| 656 | + * @return string |
| 657 | + */ |
| 658 | + public function highlightText( $text, $terms, $contextlines, $contextchars ) { |
| 659 | + global $wgLang, $wgContLang; |
| 660 | + $fname = __METHOD__; |
| 661 | + |
| 662 | + if($text == '') |
| 663 | + return ''; |
| 664 | + |
| 665 | + // split input into plain text + templates/links/tables |
| 666 | + $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/"; |
| 667 | + // first capture group is for detecting nested templates/links/tables |
| 668 | + $endPatterns = array( |
| 669 | + 1 => '/(\{\{)|(\}\})/', // template |
| 670 | + 2 => '/(\[\[)|(\]\])/', // image |
| 671 | + 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table |
| 672 | + $textExt = array(); // text extracts |
| 673 | + $otherExt = array(); // other extracts |
| 674 | + wfProfileIn( "$fname-split" ); |
| 675 | + $start = 0; |
| 676 | + $textLen = strlen($text); |
| 677 | + $count = 0; // sequence number to maintain ordering |
| 678 | + while( $start < $textLen ){ |
| 679 | + // find start of template/image/table |
| 680 | + if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){ |
| 681 | + $epat = ''; |
| 682 | + foreach($matches as $key => $val){ |
| 683 | + if($key > 0 && $val[1] != -1){ |
| 684 | + if($key == 2){ |
| 685 | + // see if this is an image link |
| 686 | + $ns = substr($val[0],2,-1); |
| 687 | + if( $wgContLang->getNsIndex($ns) != NS_IMAGE ) |
| 688 | + break; |
| 689 | + |
| 690 | + } |
| 691 | + $epat = $endPatterns[$key]; |
| 692 | + $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) ); |
| 693 | + $start = $val[1]; |
| 694 | + break; |
| 695 | + } |
| 696 | + } |
| 697 | + if( $epat ){ |
| 698 | + // find end (and detect any nested elements) |
| 699 | + $level = 0; |
| 700 | + $offset = $start + 1; |
| 701 | + $found = false; |
| 702 | + while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){ |
| 703 | + if( array_key_exists(2,$endMatches) ){ |
| 704 | + // found end |
| 705 | + if($level == 0){ |
| 706 | + $len = strlen($endMatches[2][0]); |
| 707 | + $off = $endMatches[2][1]; |
| 708 | + $this->splitAndAdd( $otherExt, $count, |
| 709 | + substr( $text, $start, $off + $len - $start ) ); |
| 710 | + $start = $off + $len; |
| 711 | + $found = true; |
| 712 | + break; |
| 713 | + } else{ |
| 714 | + // end of nested element |
| 715 | + $level -= 1; |
| 716 | + } |
| 717 | + } else{ |
| 718 | + // nested |
| 719 | + $level += 1; |
| 720 | + } |
| 721 | + $offset = $endMatches[0][1] + strlen($endMatches[0][0]); |
| 722 | + } |
| 723 | + if( ! $found ){ |
| 724 | + // couldn't find appropriate closing tag, skip |
| 725 | + $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) ); |
| 726 | + $start += strlen($matches[0][0]); |
| 727 | + } |
| 728 | + continue; |
| 729 | + } |
| 730 | + } |
| 731 | + // else: add as text extract |
| 732 | + $this->splitAndAdd( $textExt, $count, substr($text,$start) ); |
| 733 | + break; |
| 734 | + } |
| 735 | + |
| 736 | + $all = $textExt + $otherExt; // these have disjunct key sets |
| 737 | + |
| 738 | + wfProfileOut( "$fname-split" ); |
| 739 | + |
| 740 | + // prepare regexps |
| 741 | + foreach( $terms as $index => $term ) { |
| 742 | + $terms[$index] = preg_quote( $term, '/' ); |
| 743 | + // manually do upper/lowercase stuff for utf-8 since PHP won't do it |
| 744 | + if(preg_match('/[\x80-\xff]/', $term) ){ |
| 745 | + $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]); |
| 746 | + } |
| 747 | + |
| 748 | + |
| 749 | + } |
| 750 | + $anyterm = implode( '|', $terms ); |
| 751 | + $phrase = implode('[, .:;\(\)"\'\-\+]+', $terms ); |
| 752 | + |
| 753 | + // FIXME: a hack to scale contextchars, a correct solution |
| 754 | + // would be to have contextchars actually be char and not byte |
| 755 | + // length, and do proper utf-8 substrings and lengths everywhere, |
| 756 | + // but PHP is making that very hard and unclean to implement :( |
| 757 | + $scale = strlen($anyterm) / mb_strlen($anyterm); |
| 758 | + $contextchars = intval( $contextchars * $scale ); |
| 759 | + |
| 760 | + $pat1 = '/('.$phrase.')/ui'; |
| 761 | + $pat2 = '/('.$anyterm.')/ui'; |
| 762 | + |
| 763 | + wfProfileIn( "$fname-extract" ); |
| 764 | + |
| 765 | + $left = $contextlines; |
| 766 | + |
| 767 | + $snippets = array(); |
| 768 | + $offsets = array(); |
| 769 | + // match whole query on text |
| 770 | + $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets); |
| 771 | + // match whole query on templates/tables/images |
| 772 | + $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets); |
| 773 | + // match any words on text |
| 774 | + $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets); |
| 775 | + // match any words on templates/tables/images |
| 776 | + $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets); |
| 777 | + |
| 778 | + ksort($snippets); |
| 779 | + |
| 780 | + $first = array_keys($textExt); |
| 781 | + if( isset($first[0])) |
| 782 | + $first = $first[0]; |
| 783 | + else |
| 784 | + $first = 0; |
| 785 | + |
| 786 | + // add extra chars to each snippet to make snippets constant size |
| 787 | + $extended = array(); |
| 788 | + if( count( $snippets ) == 0){ |
| 789 | + // couldn't find the target words, just show beginning of article |
| 790 | + $targetchars = $contextchars * $contextlines; |
| 791 | + $snippets[$first] = ''; |
| 792 | + $offsets[$first] = 0; |
| 793 | + } else{ |
| 794 | + // if the beginning of the article contains the whole phrase, show only that snippet |
| 795 | + if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first]) |
| 796 | + && $offsets[$first] < $contextchars * 2 ){ |
| 797 | + $snippets = array ($first => $snippets[$first]); |
| 798 | + } |
| 799 | + |
| 800 | + // calc by how much to extend existing snippets |
| 801 | + $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) ); |
| 802 | + } |
| 803 | + |
| 804 | + foreach($snippets as $index => $line){ |
| 805 | + $extended[$index] = $line; |
| 806 | + $len = strlen($line); |
| 807 | + if( $len < $targetchars - 20 ){ |
| 808 | + // complete this line |
| 809 | + if($len < strlen( $all[$index] )){ |
| 810 | + $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]); |
| 811 | + $len = strlen( $extended[$index] ); |
| 812 | + } |
| 813 | + |
| 814 | + // add more lines |
| 815 | + $add = $index + 1; |
| 816 | + while( $len < $targetchars - 20 |
| 817 | + && array_key_exists($add,$all) |
| 818 | + && !array_key_exists($add,$snippets) ){ |
| 819 | + $offsets[$add] = 0; |
| 820 | + $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] ); |
| 821 | + $extended[$add] = $tt; |
| 822 | + $len += strlen( $tt ); |
| 823 | + $add++; |
| 824 | + } |
| 825 | + } |
| 826 | + } |
| 827 | + |
| 828 | + $snippets = array_map('htmlspecialchars', $extended); |
| 829 | + $last = -1; |
| 830 | + $extract = ''; |
| 831 | + foreach($snippets as $index => $line){ |
| 832 | + if($last == -1) |
| 833 | + $extract .= $line; // first line |
| 834 | + elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last])) |
| 835 | + $extract .= " ".$line; // continous lines |
| 836 | + else |
| 837 | + $extract .= '<b> ... </b>' . $line; |
| 838 | + |
| 839 | + $last = $index; |
| 840 | + } |
| 841 | + if( $extract ) |
| 842 | + $extract .= '<b> ... </b>'; |
| 843 | + |
| 844 | + // highlight words |
| 845 | + $pat3 = '/(' . $anyterm . ")/ui"; |
| 846 | + $extract = preg_replace( $pat3, |
| 847 | + "<span class='searchmatch'>\\1</span>", $extract ); |
| 848 | + |
| 849 | + wfProfileOut( "$fname-extract" ); |
| 850 | + |
| 851 | + return $extract; |
| 852 | + } |
| 853 | + |
| 854 | + /** |
| 855 | + * Split text into lines and add it to extracts array |
| 856 | + * |
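| 857 | + * @param array $extracts map of sequence number -> line, appended to in place |
| 858 | + * @param int $count next sequence number, incremented for every line added |
| 859 | + * @param string $text text to split; lines that are empty after trimming are skipped |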
| 860 | + */ |
| 861 | + function splitAndAdd(&$extracts, &$count, $text){ |
| 862 | + $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text ); |
| 863 | + foreach($split as $line){ |
| 864 | + $tt = trim($line); |
| 865 | + if( $tt ) |
| 866 | + $extracts[$count++] = $tt; |
| 867 | + } |
| 868 | + } |
| 869 | + |
| 870 | + /** |
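| 871 | + * Build a [lowercaseUPPERCASE] character class for a multi-byte (non-ascii) char; single-byte chars are returned unchanged |
| 872 | + * |
| 873 | + * @param array $matches regexp match; $matches[0] is the matched character |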
| 874 | + */ |
| 875 | + function caseCallback($matches){ |
| 876 | + global $wgContLang; |
| 877 | + if( strlen($matches[0]) > 1 ){ |
| 878 | + return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']'; |
| 879 | + } else |
| 880 | + return $matches[0]; |
| 881 | + } |
| 882 | + |
| 883 | + /** |
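| 884 | + * Extract part of the text from start to end, adjusting the |
| 885 | + * boundaries so that words are not chopped up |
| 886 | + * @param string $text |
| 887 | + * @param int $start requested start index (left untouched when 0) |
| 888 | + * @param int $end requested end index (clamped to the length of $text) |
| 889 | + * @param int $posStart (out) actual start position, only written when a non-null variable is passed |
| 890 | + * @param int $posEnd (out) actual end position, only written when a non-null variable is passed |
| 891 | + * @return string the extracted part, or '' if the adjusted end is not after the adjusted start |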
| 892 | + */ |
| 893 | + function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){ |
| 894 | + global $wgContLang; |
| 895 | + |
| 896 | + if( $start != 0) |
| 897 | + $start = $this->position( $text, $start, 1 ); |
| 898 | + if( $end >= strlen($text) ) |
| 899 | + $end = strlen($text); |
| 900 | + else |
| 901 | + $end = $this->position( $text, $end ); |
| 902 | + |
| 903 | + if(!is_null($posStart)) |
| 904 | + $posStart = $start; |
| 905 | + if(!is_null($posEnd)) |
| 906 | + $posEnd = $end; |
| 907 | + |
| 908 | + if($end > $start) |
| 909 | + return substr($text, $start, $end-$start); |
| 910 | + else |
| 911 | + return ''; |
| 912 | + } |
| 913 | + |
| 914 | + /** |
| 915 | + * Find the first nonletter within 10 bytes of a point (index) in the text |
| 916 | + * |
| 917 | + * @param string $text |
| 918 | + * @param int $point byte index to search around |
| 919 | + * @param int $offset value added to the found index |
| 920 | + * @return int first nonletter index in the window (plus $offset), or start of the next utf8 char at/after $point if none |
| 921 | + */ |
| 922 | + function position($text, $point, $offset=0 ){ |
| 923 | + $tolerance = 10; |
| 924 | + $s = max( 0, $point - $tolerance ); |
| 925 | + $l = min( strlen($text), $point + $tolerance ) - $s; |
| 926 | + $m = array(); |
| 927 | + if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){ |
| 928 | + return $m[0][1] + $s + $offset; |
| 929 | + } else{ |
| 930 | + // check if point is on a valid first UTF8 char |
| 931 | + $char = ord( $text[$point] ); |
| 932 | + while( $char >= 0x80 && $char < 0xc0 ) { |
| 933 | + // skip trailing bytes |
| 934 | + $point++; |
| 935 | + if($point >= strlen($text)) |
| 936 | + return strlen($text); |
| 937 | + $char = ord( $text[$point] ); |
| 938 | + } |
| 939 | + return $point; |
| 940 | + |
| 941 | + } |
| 942 | + } |
| 943 | + |
| 944 | + /** |
| 945 | + * Search extracts for a pattern, and collect snippets into $out |
| 946 | + * |
| 947 | + * @param string $pattern regexp for matching lines |
| 948 | + * @param array $extracts extracts to search |
| 949 | + * @param int $linesleft number of extracts still to make, decremented for every snippet added |
| 950 | + * @param int $contextchars length of snippet |
| 951 | + * @param array $out map of extract index -> snippet, filled in place |
| 952 | + * @param array $offsets map of extract index -> starting point of snippet, filled in place |
| 953 | + * @protected |
| 954 | + */ |
| 955 | + function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){ |
| 956 | + if($linesleft == 0) |
| 957 | + return; // nothing to do |
| 958 | + foreach($extracts as $index => $line){ |
| 959 | + if( array_key_exists($index,$out) ) |
| 960 | + continue; // this line already highlighted |
| 961 | + |
| 962 | + $m = array(); |
| 963 | + if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) |
| 964 | + continue; |
| 965 | + |
| 966 | + $offset = $m[0][1]; |
| 967 | + $len = strlen($m[0][0]); |
| 968 | + if($offset + $len < $contextchars) |
| 969 | + $begin = 0; |
| 970 | + elseif( $len > $contextchars) |
| 971 | + $begin = $offset; |
| 972 | + else |
| 973 | + $begin = $offset + intval( ($len - $contextchars) / 2 ); |
| 974 | + |
| 975 | + $end = $begin + $contextchars; |
| 976 | + |
| 977 | + $posBegin = $begin; |
| 978 | + // basic snippet from this line |
| 979 | + $out[$index] = $this->extract($line,$begin,$end,$posBegin); |
| 980 | + $offsets[$index] = $posBegin; |
| 981 | + $linesleft--; |
| 982 | + if($linesleft == 0) |
| 983 | + return; |
| 984 | + } |
| 985 | + } |
| 986 | + |
| 987 | + /** |
| 988 | + * Basic wikitext removal |
| 989 | + * @protected |
| 990 | + */ |
| 991 | + function removeWiki($text) { |
| 992 | + $fname = __METHOD__; |
| 993 | + wfProfileIn( $fname ); |
| 994 | + |
| 995 | + //$text = preg_replace("/'{2,5}/", "", $text); |
| 996 | + //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text); |
| 997 | + //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text); |
| 998 | + //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text); |
| 999 | + //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text); |
| 1000 | + //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text); |
| 1001 | + $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text); |
| 1002 | + $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text); |
| 1003 | + $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text); |
| 1004 | + $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text); |
| 1005 | + //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text); |
| 1006 | + $text = preg_replace("/<\/?[^>]+>/", "", $text); |
| 1007 | + $text = preg_replace("/'''''/", "", $text); |
| 1008 | + $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); |
| 1009 | + $text = preg_replace("/''/", "", $text); |
| 1010 | + |
| 1011 | + wfProfileOut( $fname ); |
| 1012 | + return $text; |
| 1013 | + } |
| 1014 | + |
| 1015 | + /** |
| 1016 | + * Callback to replace [[target|caption]] links with their caption; if |
| 1017 | + * the target is in the category or image namespace, leave the link as is |
| 1018 | + * |
| 1019 | + * @param array $matches |
| 1020 | + */ |
| 1021 | + function linkReplace($matches){ |
| 1022 | + $colon = strpos( $matches[1], ':' ); |
| 1023 | + if( $colon === false ) |
| 1024 | + return $matches[2]; // replace with caption |
| 1025 | + global $wgContLang; |
| 1026 | + $ns = substr( $matches[1], 0, $colon ); |
| 1027 | + $index = $wgContLang->getNsIndex($ns); |
| 1028 | + if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) ) |
| 1029 | + return $matches[0]; // return the whole thing |
| 1030 | + else |
| 1031 | + return $matches[2]; |
| 1032 | + |
| 1033 | + } |
| 1034 | +} |
| 1035 | + |
| 1036 | +/** |
| 1037 | + * @addtogroup Search |
| 1038 | + */ |
696 | 1039 | class SearchEngineDummy { |
697 | 1040 | function search( $term ) { |
698 | 1041 | return null; |