r53910 MediaWiki - Code Review archive

Revision: < r53909 | r53910 | r53911 >
Date: 22:35, 28 July 2009
Status: deferred (Comments)
Add an option to show an abstract from the article text,
rather than a list of categories, as the context on the
auto-disambig pages. Abstract code borrowed from the
OpenSearchXML extension by Brion Vibber.
Modified paths:
  • /trunk/extensions/IndexFunction/IndexAbstracts.php (added) (history)
  • /trunk/extensions/IndexFunction/IndexFunction.php (modified) (history)
  • /trunk/extensions/IndexFunction/SpecialIndex.php (modified) (history)

Diff [purge]

Index: trunk/extensions/IndexFunction/IndexFunction.php
@@ -42,7 +42,16 @@
4343 $wgExtensionMessagesFiles['IndexFunction'] = $dir . 'IndexFunction.i18n.php';
4444 $wgAutoloadClasses['IndexFunctionHooks'] = $dir . 'IndexFunction_body.php';
4545 $wgAutoloadClasses['IndexFunction'] = $dir . 'IndexFunction_body.php';
 46+$wgAutoloadClasses['IndexAbstracts'] = $dir . 'IndexAbstracts.php';
 49+ * Used to set the context given on Special:Index auto-disambig pages
 50+ * Can be 1 of 2 options:
 51+ * 'extract' (default) - Show an extract from the start of the article
 52+ * 'categories' - Show a comma-separated list of categories the article is in
 54+$wgSpecialIndexContext = 'extract';
4756 function efIndexSetup( &$parser ) {
4857 $parser->setFunctionHook( 'index-func', array( 'IndexFunctionHooks', 'indexRender' ) );
4958 return true;
Index: trunk/extensions/IndexFunction/IndexAbstracts.php
@@ -0,0 +1,139 @@
 5+ * Class to extract the first bit of text from an article
 6+ * Adapted from the OpenSearchXML extension, by Brion Vibber
 9+class IndexAbstracts {
 10+ /**
 11+ * Strip markup to show plaintext
 12+ * @param string $text
 13+ * @return string
 14+ * @access private
 15+ */
 16+ function _stripMarkup( $text ) {
 17+ global $wgContLang;
 19+ $text = substr( $text, 0, 4096 ); // don't bother with long text...
 21+ $text = str_replace( "'''", "", $text );
 22+ $text = str_replace( "''", "", $text );
 24+ $text = preg_replace( '#__[a-z0-9_]+__#i', '', $text ); // magic words
 26+ $cleanChar = "[^|\[\]]";
 27+ $subLink = "\[\[$cleanChar*(?:\|$cleanChar*)*\]\]";
 28+ $pipeContents = "(?:$cleanChar|$subLink)*";
 29+ $text = preg_replace_callback( "#
 30+ \[\[
 31+ ($cleanChar*)
 32+ (?:\|($pipeContents))?
 33+ (?:\|$pipeContents)*
 34+ \]\]
 35+ #six", array( $this, '_stripLink' ), $text );
 37+ $protocols = wfUrlProtocols();
 38+ $text = preg_replace( '#\\[(?:$protocols).*? (.*?)\\]#s', '$1', $text ); // URL links
 39+ $text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text ); // HTML-style tags
 40+ $text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text ); // tables
 42+ $text = preg_replace( '#^:.*$#m', '', $text ); // indented lines near start are usually disambigs or notices
 43+ $text = Sanitizer::decodeCharReferences( $text );
 44+ return trim( $text );
 45+ }
 47+ function _stripLink( $matches ) {
 48+ $target = trim( $matches[1] );
 49+ if( isset( $matches[2] ) ) {
 50+ $text = trim( $matches[2] );
 51+ } else {
 52+ $text = $target;
 53+ }
 55+ $title = Title::newFromText( $target );
 56+ if( $title ) {
 57+ $ns = $title->getNamespace();
 58+ if( $title->getInterwiki() || $ns == NS_IMAGE || $ns == NS_CATEGORY ) {
 59+ return "";
 60+ } else {
 61+ return $text;
 62+ }
 63+ } else {
 64+ return $matches[0];
 65+ }
 66+ }
 68+ /**
 69+ * Extract the first two sentences, if detectable, from the text.
 70+ * @param string $text
 71+ * @return string
 72+ * @access private
 73+ */
 74+ function _extractStart( $text ) {
 75+ $endchars = array(
 76+ '([^\d])\.\s', '\!\s', '\?\s', // regular ASCII
 77+ '。', // full-width ideographic full-stop
 78+ '.', '!', '?', // double-width roman forms
 79+ '。', // half-width ideographic full stop
 80+ );
 82+ $endgroup = implode( '|', $endchars );
 83+ $end = "(?:$endgroup)";
 84+ $sentence = ".*?$end+";
 85+ $firstone = "/^($sentence)/u";
 86+ if( preg_match( $firstone, $text, $matches ) ) {
 87+ return $matches[1];
 88+ } else {
 89+ // Just return the first line
 90+ $lines = explode( "\n", $text );
 91+ return trim( $lines[0] );
 92+ }
 93+ }
 95+ public function getExtract( $title, $chars=50 ) {
 96+ $rev = Revision::newFromTitle( $title );
 97+ if( $rev ) {
 98+ $text = substr( $rev->getText(), 0, 16384 );
 100+ // Ok, first note this is a TERRIBLE HACK. :D
 101+ //
 102+ // First, we use the system preprocessor to break down the text
 103+ // into text, templates, extensions, and comments:
 104+ global $wgParser;
 105+ $wgParser->clearState();
 106+ $wgParser->mOptions = new ParserOptions();
 107+ $frame = $wgParser->getPreprocessor()->newFrame();
 108+ $dom = $wgParser->preprocessToDom( $text );
 110+ $imageArgs = array(
 111+ 'image',
 112+ 'image_skyline',
 113+ 'img',
 114+ 'Img',
 115+ );
 117+ // Now, we strip out everything that's not text.
 118+ // This works with both DOM and Hash parsers, but feels fragile.
 119+ $node = $dom->getFirstChild();
 120+ $out = '';
 121+ while( $node ) {
 122+ if( $node->getName() == '#text' ) {
 123+ $out .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
 124+ }
 125+ $node = $node->getNextSibling();
 126+ }
 128+ // The remaining text may still contain wiki and HTML markup.
 129+ // We'll use our shitty hand parser to strip most of those from
 130+ // the beginning of the text.
 131+ $stripped = $this->_stripMarkup( $out );
 133+ // And now, we'll grab just the first sentence as text, and
 134+ // also try to rip out a badge image.
 135+ return $this->_extractStart( $stripped );
 136+ }
 137+ return '';
 138+ }
Index: trunk/extensions/IndexFunction/SpecialIndex.php
@@ -35,7 +35,7 @@
3636 }
3838 function showDabPage( Title $t1 ) {
39 - global $wgOut, $wgUser;
 39+ global $wgOut, $wgUser, $wgSpecialIndexContext;
4040 $sk = $wgUser->getSkin();
4141 $wgOut->setPagetitle( $t1->getPrefixedText() );
4242 $dbr = wfGetDB( DB_SLAVE );
@@ -112,9 +112,8 @@
113113 $grouphtml .= Xml::openElement( 'ul' );
114114 foreach( $group as $pageid ) {
115115 $t = $list[$pageid]['title'];
116 - $cats = $list[$pageid]['cats'];
117 - $link = $sk->link( $t, null, array(), array(), array( 'known', 'noclasses' ) );
118 - $grouphtml .= Xml::tags( 'li', array(), $link . '&nbsp;&ndash&nbsp;' . implode( ', ', $cats ) );
 116+ $cats = $list[$pageid]['cats'];
 117+ $grouphtml .= $this->makeContextLine( $t, $cats );
119118 unset( $list[$pageid] );
120119 ksort($list);
121120 foreach($catlist as $remaining) {
@@ -137,8 +136,7 @@
138137 if (count($list) != 0) { //Pages w/ no cats
139138 $grouphtml = Xml::openElement( 'ul' );
140139 foreach( $list as $pageid => $info ) {
141 - $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) );
142 - $grouphtml .= Xml::tags( 'li', array(), $link );
 140+ $grouphtml .= $this->makeContextLine( $info['title'], array() );
143141 }
144142 $grouphtml .= Xml::closeElement('ul');
145143 $groups = array_merge( array($grouphtml), $groups);
@@ -147,19 +145,44 @@
148146 } else {
149147 $out = Xml::openElement( 'ul' );
150148 foreach( $list as $pageid => $info ) {
151 - $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) );
152 - if ( $info['cats'] ) {
153 - $line = $link . '&nbsp;&ndash&nbsp;' . implode( ', ', $info['cats'] );
154 - $line = Xml::tags( 'li', array(), $line );
155 - } else {
156 - $line = Xml::tags( 'li', array(), $link );
157 - }
158 - $out .= $line;
 149+ $out .= $this->makeContextLine( $info['title'], $info['cats'] );
159150 }
160151 $out .= Xml::closeElement('ul');
161152 }
163154 $wgOut->addHtml($out);
164155 }
 157+ private function makeContextLine( $title, $cats ) {
 158+ global $wgUser, $wgSpecialIndexContext;
 159+ $sk = $wgUser->getSkin();
 160+ $link = $sk->link( $title, null, array(), array(), array( 'known', 'noclasses' ) );
 161+ if ( $wgSpecialIndexContext == 'extract' ) {
 162+ $extracter = new IndexAbstracts();
 163+ $text = $extracter->getExtract( $title );
 164+ if ( $text != '' ) {
 165+ if ( stripos( $text, $title->getPrefixedText() ) !== false ) {
 166+ $search = preg_quote( $title->getPrefixedText(), '/' );
 167+ $line = preg_replace( "/$search/i", $link, $text, 1 );
 168+ } else {
 169+ $line = $link . '&nbsp;&ndash&nbsp;' . $text;
 170+ }
 171+ } else {
 172+ $line = $link;
 173+ }
 174+ $line = Xml::tags( 'li', array(), $line );
 175+ } elseif ( $wgSpecialIndexContext == 'categories' ) {
 176+ if ( $cats ) {
 177+ $line = $link . '&nbsp;&ndash&nbsp;' . implode( ', ', $cats );
 178+ $line = Xml::tags( 'li', array(), $line );
 179+ } else {
 180+ $line = Xml::tags( 'li', array(), $link );
 181+ }
 182+ } else {
 183+ $line = Xml::tags( 'li', array(), $link );
 184+ }
 185+ return $line;
 186+ }
165188 }


#Comment by Platonides (talk | contribs)   16:01, 8 February 2011

What about making OpenSearchXML use this class?

Status & tagging log