Index: trunk/extensions/IndexFunction/IndexFunction.php |
— | — | @@ -42,7 +42,16 @@ |
43 | 43 | $wgExtensionMessagesFiles['IndexFunction'] = $dir . 'IndexFunction.i18n.php'; |
44 | 44 | $wgAutoloadClasses['IndexFunctionHooks'] = $dir . 'IndexFunction_body.php'; |
45 | 45 | $wgAutoloadClasses['IndexFunction'] = $dir . 'IndexFunction_body.php'; |
| 46 | +$wgAutoloadClasses['IndexAbstracts'] = $dir . 'IndexAbstracts.php'; |
46 | 47 | |
| 48 | +/* |
| 49 | + * Used to set the context given on Special:Index auto-disambig pages |
| 50 | + * Can be one of two options (any other value shows only the page link): |
| 51 | + * 'extract' (default) - Show an extract from the start of the article |
| 52 | + * 'categories' - Show a comma-separated list of categories the article is in |
| 53 | +*/ |
| 54 | +$wgSpecialIndexContext = 'extract'; |
| 55 | + |
47 | 56 | function efIndexSetup( &$parser ) { |
48 | 57 | $parser->setFunctionHook( 'index-func', array( 'IndexFunctionHooks', 'indexRender' ) ); |
49 | 58 | return true; |
Index: trunk/extensions/IndexFunction/IndexAbstracts.php |
— | — | @@ -0,0 +1,139 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +/* |
| 5 | + * Class to extract the first bit of text from an article |
| 6 | + * Adapted from the OpenSearchXML extension, by Brion Vibber |
| 7 | +*/ |
| 8 | + |
/**
 * Produces a short plain-text abstract (roughly the first sentence) of a
 * wiki page from the wikitext of its current revision.
 *
 * NOTE: the strings returned by this class are plain text with character
 * references already decoded. They are NOT HTML-safe; callers must escape
 * them before inserting them into HTML output.
 */
class IndexAbstracts {
	/**
	 * Strip wiki and HTML markup from the start of a text to get plaintext.
	 *
	 * Only the first 4096 bytes are looked at. Bold/italic quotes, magic
	 * words, internal links, external links, HTML-style tags, tables and
	 * colon-indented lines are removed or reduced to their visible text,
	 * then character references are decoded.
	 *
	 * @param string $text Wikitext (templates etc. already removed)
	 * @return string Trimmed plain text (not HTML-escaped)
	 * @access private
	 */
	public function _stripMarkup( $text ) {
		$text = $this->truncateUtf8( $text, 4096 ); // don't bother with long text...

		$text = str_replace( "'''", "", $text );
		$text = str_replace( "''", "", $text );

		$text = preg_replace( '#__[a-z0-9_]+__#i', '', $text ); // magic words

		// Internal links, allowing one level of nested [[...]] inside the
		// piped parts (e.g. image captions that contain links).
		$cleanChar = "[^|\[\]]";
		$subLink = "\[\[$cleanChar*(?:\|$cleanChar*)*\]\]";
		$pipeContents = "(?:$cleanChar|$subLink)*";
		$text = preg_replace_callback( "#
			\[\[
				($cleanChar*)
				(?:\|($pipeContents))?
				(?:\|$pipeContents)*
			\]\]
			#six", array( $this, '_stripLink' ), $text );

		// URL links: keep only the label. The protocol list has to be
		// concatenated into the pattern; inside a single-quoted string
		// "$protocols" would be taken literally and never match.
		// NOTE(review): assumes wfUrlProtocols() returns a regex-ready
		// alternation that contains no '#' - confirm.
		$protocols = wfUrlProtocols();
		$text = preg_replace( '#\\[(?:' . $protocols . ').*? (.*?)\\]#s', '$1', $text );
		$text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text ); // HTML-style tags
		$text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text ); // tables

		$text = preg_replace( '#^:.*$#m', '', $text ); // indented lines near start are usually disambigs or notices
		$text = Sanitizer::decodeCharReferences( $text );
		return trim( $text );
	}

	/**
	 * preg_replace_callback() handler for internal links found by
	 * _stripMarkup().
	 *
	 * @param array $matches [0] whole link, [1] target, [2] first piped
	 *  part (only set when the link has a pipe)
	 * @return string Empty string for interwiki, image and category links;
	 *  the link text for other valid targets; the untouched match when the
	 *  target is not a valid title
	 * @access private
	 */
	public function _stripLink( $matches ) {
		$target = trim( $matches[1] );
		if ( isset( $matches[2] ) ) {
			$text = trim( $matches[2] );
		} else {
			$text = $target;
		}

		$title = Title::newFromText( $target );
		if ( !$title ) {
			// Not a parseable title: leave the original markup alone
			return $matches[0];
		}

		$ns = $title->getNamespace();
		if ( $title->getInterwiki() || $ns == NS_IMAGE || $ns == NS_CATEGORY ) {
			return "";
		}
		return $text;
	}

	/**
	 * Extract the first sentence, if detectable, from the text.
	 *
	 * A sentence ends at ". " not preceded by a digit, at "! " or "? ", or
	 * at an ideographic / fullwidth terminator. The terminator (including
	 * the whitespace after an ASCII one) is part of the returned string.
	 * If no sentence end is found, or the text is not valid UTF-8, the
	 * trimmed first line is returned instead.
	 *
	 * @param string $text Plain text
	 * @return string
	 * @access private
	 */
	public function _extractStart( $text ) {
		// The non-ASCII terminators are written as code point escapes: a
		// bare "." or "?" in this alternation would match any character or
		// make the whole pattern invalid.
		$endchars = array(
			'[^\d]\.\s', '\!\s', '\?\s', // regular ASCII
			'\x{3002}', // full-width ideographic full-stop
			'\x{FF0E}', '\x{FF01}', '\x{FF1F}', // double-width roman forms
			'\x{FF61}', // half-width ideographic full stop
		);

		$endgroup = implode( '|', $endchars );
		$end = "(?:$endgroup)";
		$sentence = ".*?$end+";
		$firstone = "/^($sentence)/u";
		if ( preg_match( $firstone, $text, $matches ) ) {
			return $matches[1];
		}

		// Just return the first line
		$lines = explode( "\n", $text );
		return trim( $lines[0] );
	}

	/**
	 * Get a plain-text extract from the start of a page.
	 *
	 * Side effect: resets the state of the global parser and replaces its
	 * options with a fresh ParserOptions object.
	 *
	 * @param Title $title Page to extract from
	 * @param int $chars Currently unused; kept so existing callers keep working
	 * @return string Extract, or '' if the page has no revision
	 */
	public function getExtract( $title, $chars = 50 ) {
		global $wgParser;

		$rev = Revision::newFromTitle( $title );
		if ( !$rev ) {
			return '';
		}

		$text = $this->truncateUtf8( (string)$rev->getText(), 16384 );

		// Ok, first note this is a TERRIBLE HACK. :D
		//
		// First, we use the system preprocessor to break down the text
		// into text, templates, extensions, and comments:
		$wgParser->clearState();
		$wgParser->mOptions = new ParserOptions();
		$frame = $wgParser->getPreprocessor()->newFrame();
		$dom = $wgParser->preprocessToDom( $text );

		// Now, we strip out everything that's not text.
		// This works with both DOM and Hash parsers, but feels fragile.
		$out = '';
		for ( $node = $dom->getFirstChild(); $node; $node = $node->getNextSibling() ) {
			if ( $node->getName() == '#text' ) {
				$out .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
			}
		}

		// The remaining text may still contain wiki and HTML markup, so
		// strip most of that from the beginning of the text and then keep
		// just the first sentence.
		return $this->_extractStart( $this->_stripMarkup( $out ) );
	}

	/**
	 * Cut a UTF-8 string down to at most $maxBytes bytes without leaving
	 * half of a multibyte character at the end. A dangling partial
	 * character would make the /u match in _extractStart() fail.
	 *
	 * @param string $text
	 * @param int $maxBytes
	 * @return string $text itself if it is short enough, else the cut text
	 */
	private function truncateUtf8( $text, $maxBytes ) {
		if ( strlen( $text ) <= $maxBytes ) {
			return $text;
		}
		$cut = substr( $text, 0, $maxBytes );
		// Drop a trailing multibyte sequence (lead byte plus any
		// continuation bytes); it may have been cut in the middle.
		return preg_replace( '/[\xC0-\xFF][\x80-\xBF]*\z/', '', $cut );
	}

}
Index: trunk/extensions/IndexFunction/SpecialIndex.php |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | } |
37 | 37 | |
38 | 38 | function showDabPage( Title $t1 ) { |
39 | | - global $wgOut, $wgUser; |
| 39 | + global $wgOut, $wgUser, $wgSpecialIndexContext; |
40 | 40 | $sk = $wgUser->getSkin(); |
41 | 41 | $wgOut->setPagetitle( $t1->getPrefixedText() ); |
42 | 42 | $dbr = wfGetDB( DB_SLAVE ); |
— | — | @@ -112,9 +112,8 @@ |
113 | 113 | $grouphtml .= Xml::openElement( 'ul' ); |
114 | 114 | foreach( $group as $pageid ) { |
115 | 115 | $t = $list[$pageid]['title']; |
116 | | - $cats = $list[$pageid]['cats']; |
117 | | - $link = $sk->link( $t, null, array(), array(), array( 'known', 'noclasses' ) ); |
118 | | - $grouphtml .= Xml::tags( 'li', array(), $link . ' &ndash ' . implode( ', ', $cats ) ); |
| 116 | + $cats = $list[$pageid]['cats']; |
| 117 | + $grouphtml .= $this->makeContextLine( $t, $cats ); |
119 | 118 | unset( $list[$pageid] ); |
120 | 119 | ksort($list); |
121 | 120 | foreach($catlist as $remaining) { |
— | — | @@ -137,8 +136,7 @@ |
138 | 137 | if (count($list) != 0) { //Pages w/ no cats |
139 | 138 | $grouphtml = Xml::openElement( 'ul' ); |
140 | 139 | foreach( $list as $pageid => $info ) { |
141 | | - $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) ); |
142 | | - $grouphtml .= Xml::tags( 'li', array(), $link ); |
| 140 | + $grouphtml .= $this->makeContextLine( $info['title'], array() ); |
143 | 141 | } |
144 | 142 | $grouphtml .= Xml::closeElement('ul'); |
145 | 143 | $groups = array_merge( array($grouphtml), $groups); |
— | — | @@ -147,19 +145,44 @@ |
148 | 146 | } else { |
149 | 147 | $out = Xml::openElement( 'ul' ); |
150 | 148 | foreach( $list as $pageid => $info ) { |
151 | | - $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) ); |
152 | | - if ( $info['cats'] ) { |
153 | | - $line = $link . ' &ndash ' . implode( ', ', $info['cats'] ); |
154 | | - $line = Xml::tags( 'li', array(), $line ); |
155 | | - } else { |
156 | | - $line = Xml::tags( 'li', array(), $link ); |
157 | | - } |
158 | | - $out .= $line; |
| 149 | + $out .= $this->makeContextLine( $info['title'], $info['cats'] ); |
159 | 150 | } |
160 | 151 | $out .= Xml::closeElement('ul'); |
161 | 152 | } |
162 | 153 | |
163 | 154 | $wgOut->addHtml($out); |
164 | 155 | } |
| 156 | + |
| 157 | + private function makeContextLine( $title, $cats ) { |
| 158 | + global $wgUser, $wgSpecialIndexContext; |
| 159 | + $sk = $wgUser->getSkin(); |
| 160 | + $link = $sk->link( $title, null, array(), array(), array( 'known', 'noclasses' ) ); |
| 161 | + if ( $wgSpecialIndexContext == 'extract' ) { |
| 162 | + $extracter = new IndexAbstracts(); |
| 163 | + $text = $extracter->getExtract( $title ); |
| 164 | + if ( $text != '' ) { |
| 165 | + if ( stripos( $text, $title->getPrefixedText() ) !== false ) { |
| 166 | + $search = preg_quote( $title->getPrefixedText(), '/' ); |
| 167 | + $line = preg_replace( "/$search/i", $link, $text, 1 ); |
| 168 | + } else { |
| 169 | + $line = $link . ' &ndash ' . $text; |
| 170 | + } |
| 171 | + } else { |
| 172 | + $line = $link; |
| 173 | + } |
| 174 | + $line = Xml::tags( 'li', array(), $line ); |
| 175 | + } elseif ( $wgSpecialIndexContext == 'categories' ) { |
| 176 | + if ( $cats ) { |
| 177 | + $line = $link . ' &ndash ' . implode( ', ', $cats ); |
| 178 | + $line = Xml::tags( 'li', array(), $line ); |
| 179 | + } else { |
| 180 | + $line = Xml::tags( 'li', array(), $link ); |
| 181 | + } |
| 182 | + } else { |
| 183 | + $line = Xml::tags( 'li', array(), $link ); |
| 184 | + } |
| 185 | + return $line; |
| 186 | + } |
| 187 | + |
165 | 188 | } |
166 | 189 | |