r53910 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r53909 | r53910 | r53911 >
Date:	22:35, 28 July 2009
Author:	mrzman
Status:	deferred (Comments)
Tags:
Comment:	Add an option to get an abstract from the article text, rather than a list of categories for the context in the auto-disambig pages. Abstract code borrowed from OpenSearchXML extension by Brion Vibber
Modified paths:	/trunk/extensions/IndexFunction/IndexAbstracts.php (added) (history) /trunk/extensions/IndexFunction/IndexFunction.php (modified) (history) /trunk/extensions/IndexFunction/SpecialIndex.php (modified) (history)

Diff [purge]

Index: trunk/extensions/IndexFunction/IndexFunction.php
—	—	@@ -42,7 +42,16 @@
43	43	$wgExtensionMessagesFiles['IndexFunction'] = $dir . 'IndexFunction.i18n.php';
44	44	$wgAutoloadClasses['IndexFunctionHooks'] = $dir . 'IndexFunction_body.php';
45	45	$wgAutoloadClasses['IndexFunction'] = $dir . 'IndexFunction_body.php';
	46	+$wgAutoloadClasses['IndexAbstracts'] = $dir . 'IndexAbstracts.php';
46	47
	48	+/*
	49	+ * Context shown next to each link on Special:Index auto-disambig pages.
	50	+ * 'extract' (default) - Show an extract from the start of the article
	51	+ * 'categories' - Show a comma-separated list of categories the article is in
	52	+ * Any other value shows the bare link with no context.
	53	+*/
	54	+$wgSpecialIndexContext = 'extract';
	55	+
47	56	function efIndexSetup( &$parser ) {
48	57	$parser->setFunctionHook( 'index-func', array( 'IndexFunctionHooks', 'indexRender' ) );
49	58	return true;
Index: trunk/extensions/IndexFunction/IndexAbstracts.php
—	—	@@ -0,0 +1,139 @@
	2	+<?php
	3	+
	4	+/*
	5	+ * Class to extract the first bit of text from an article
	6	+ * Adapted from the OpenSearchXML extension, by Brion Vibber
	7	+*/
	8	+
	9	+class IndexAbstracts {
	10	+ /**
	11	+ * Reduce the start of a page's wikitext to approximate plain text
	12	+ * @param string $text Wikitext; only the first 4096 bytes are processed
	13	+ * @return string Trimmed text with common wiki and HTML markup removed
	14	+ * @access private
	15	+ */
	16	+ function _stripMarkup( $text ) {
	17	+ // Byte-based cut, so the last character of multibyte text may be truncated
	18	+ $text = substr( $text, 0, 4096 ); // don't bother with long text...
	19	+
	20	+ // Bold and italic quote markup
	21	+ $text = str_replace( "'''", "", $text );
	22	+ $text = str_replace( "''", "", $text );
	23	+
	24	+ $text = preg_replace( '#__[a-z0-9_]+__#i', '', $text ); // magic words
	25	+ // Internal links, whose text may itself contain links, go through _stripLink()
	26	+ $cleanChar = "[^|\[\]]";
	27	+ $subLink = "\[\[$cleanChar*(?:\|$cleanChar*)*\]\]";
	28	+ $pipeContents = "(?:$cleanChar|$subLink)*";
	29	+ $text = preg_replace_callback( "#
	30	+ \[\[
	31	+ ($cleanChar*)
	32	+ (?:\|($pipeContents))?
	33	+ (?:\|$pipeContents)*
	34	+ \]\]
	35	+ #six", array( $this, '_stripLink' ), $text );
	36	+ // Double-quoted so that $protocols is interpolated into the pattern
	37	+ $protocols = wfUrlProtocols();
	38	+ $text = preg_replace( "#\\[(?:$protocols).*? (.*?)\\]#s", '$1', $text ); // URL links
	39	+ $text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text ); // HTML-style tags
	40	+ $text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text ); // tables
	41	+
	42	+ $text = preg_replace( '#^:.*$#m', '', $text ); // indented lines near start are usually disambigs or notices
	43	+ $text = Sanitizer::decodeCharReferences( $text );
	44	+ return trim( $text );
	45	+ }
	46	+
	47	+ /**
	48	+ * preg_replace_callback() handler used by _stripMarkup() for [[...]] links
	49	+ * @param array $matches 0: whole link, 1: target, 2: link text (if piped)
	50	+ * @return string Replacement text
	51	+ */
	52	+ function _stripLink( $matches ) {
	53	+ $target = trim( $matches[1] );
	54	+ $text = isset( $matches[2] ) ? trim( $matches[2] ) : $target;
	55	+
	56	+ $title = Title::newFromText( $target );
	57	+ if( !$title ) {
	58	+ return $matches[0]; // unparseable target: leave the markup untouched
	59	+ }
	60	+ // Interwiki, image and category links contribute no readable text
	61	+ $ns = $title->getNamespace();
	62	+ if( $title->getInterwiki() || $ns == NS_IMAGE || $ns == NS_CATEGORY ) {
	63	+ return "";
	64	+ }
	65	+ return $text;
	66	+ }
	67	+
	68	+ /**
	69	+ * Extract the first sentence, if detectable, from the text.
	70	+ * @param string $text
	71	+ * @return string The first sentence, or the trimmed first line if none is found
	72	+ * @access private
	73	+ */
	74	+ function _extractStart( $text ) {
	75	+ $endchars = array(
	76	+ '([^\d])\.\s', '\!\s', '\?\s', // regular ASCII
	77	+ '。', // full-width ideographic full-stop
	78	+ '．', '！', '？', // double-width roman forms
	79	+ '｡', // half-width ideographic full stop
	80	+ );
	81	+ // A period directly after a digit is not treated as a sentence end
	82	+ $endgroup = implode( '|', $endchars );
	83	+ $end = "(?:$endgroup)";
	84	+ $sentence = ".*?$end+";
	85	+ $firstone = "/^($sentence)/u";
	86	+ if( preg_match( $firstone, $text, $matches ) ) {
	87	+ return $matches[1];
	88	+ } else {
	89	+ // Just return the first line
	90	+ $lines = explode( "\n", $text );
	91	+ return trim( $lines[0] );
	92	+ }
	93	+ }
	94	+
	95	+ /**
	96	+ * Get a short plain-text extract from the start of a page's text.
	97	+ * Templates, extension tags and comments are dropped before markup is stripped.
	98	+ *
	99	+ * @param Title $title Page to extract from
	100	+ * @param int $chars Currently unused; kept so existing callers keep working
	101	+ * @return string First sentence (or first line) of the text, '' if no revision is found
	102	+ */
	103	+ public function getExtract( $title, $chars=50 ) {
	104	+ $rev = Revision::newFromTitle( $title );
	105	+ if( !$rev ) {
	106	+ return '';
	107	+ }
	108	+ $text = substr( $rev->getText(), 0, 16384 );
	109	+
	110	+ // Ok, first note this is a TERRIBLE HACK. :D
	111	+ //
	112	+ // First, we use the system preprocessor to break down the text
	113	+ // into text, templates, extensions, and comments:
	114	+ global $wgParser;
	115	+ $wgParser->clearState();
	116	+ $wgParser->mOptions = new ParserOptions();
	117	+ $frame = $wgParser->getPreprocessor()->newFrame();
	118	+ $dom = $wgParser->preprocessToDom( $text );
	119	+
	120	+ // Now, we strip out everything that's not text.
	121	+ // This works with both DOM and Hash parsers, but feels fragile.
	122	+ $node = $dom->getFirstChild();
	123	+ $out = '';
	124	+ while( $node ) {
	125	+ if( $node->getName() == '#text' ) {
	126	+ $out .= $frame->expand( $node, PPFrame::RECOVER_ORIG );
	127	+ }
	128	+ $node = $node->getNextSibling();
	129	+ }
	130	+
	131	+ // The remaining text may still contain wiki and HTML markup.
	132	+ // We'll use our hand-written parser to strip most of those from
	133	+ // the beginning of the text.
	134	+ $stripped = $this->_stripMarkup( $out );
	135	+
	136	+ // And now, we'll grab just the first sentence as text.
	137	+ return $this->_extractStart( $stripped );
	138	+ }
	139	+
	140	+}
Index: trunk/extensions/IndexFunction/SpecialIndex.php
—	—	@@ -35,7 +35,7 @@
36	36	}
37	37
38	38	function showDabPage( Title $t1 ) {
39		~~- global $wgOut, $wgUser;~~
	39	+ global $wgOut, $wgUser, $wgSpecialIndexContext;
40	40	$sk = $wgUser->getSkin();
41	41	$wgOut->setPagetitle( $t1->getPrefixedText() );
42	42	$dbr = wfGetDB( DB_SLAVE );
—	—	@@ -112,9 +112,8 @@
113	113	$grouphtml .= Xml::openElement( 'ul' );
114	114	foreach( $group as $pageid ) {
115	115	$t = $list[$pageid]['title'];
116		~~- $cats = $list[$pageid]['cats'];~~
117		~~- $link = $sk->link( $t, null, array(), array(), array( 'known', 'noclasses' ) );~~
118		~~- $grouphtml .= Xml::tags( 'li', array(), $link . ' &ndash ' . implode( ', ', $cats ) );~~
	116	+ $cats = $list[$pageid]['cats'];
	117	+ $grouphtml .= $this->makeContextLine( $t, $cats );
119	118	unset( $list[$pageid] );
120	119	ksort($list);
121	120	foreach($catlist as $remaining) {
—	—	@@ -137,8 +136,7 @@
138	137	if (count($list) != 0) { //Pages w/ no cats
139	138	$grouphtml = Xml::openElement( 'ul' );
140	139	foreach( $list as $pageid => $info ) {
141		~~- $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) );~~
142		~~- $grouphtml .= Xml::tags( 'li', array(), $link );~~
	140	+ $grouphtml .= $this->makeContextLine( $info['title'], array() );
143	141	}
144	142	$grouphtml .= Xml::closeElement('ul');
145	143	$groups = array_merge( array($grouphtml), $groups);
—	—	@@ -147,19 +145,44 @@
148	146	} else {
149	147	$out = Xml::openElement( 'ul' );
150	148	foreach( $list as $pageid => $info ) {
151		~~- $link = $sk->link( $info['title'], null, array(), array(), array( 'known', 'noclasses' ) );~~
152		~~- if ( $info['cats'] ) {~~
153		~~- $line = $link . ' &ndash ' . implode( ', ', $info['cats'] );~~
154		~~- $line = Xml::tags( 'li', array(), $line );~~
155		~~- } else {~~
156		~~- $line = Xml::tags( 'li', array(), $link );~~
157		~~- }~~
158		~~- $out .= $line;~~
	149	+ $out .= $this->makeContextLine( $info['title'], $info['cats'] );
159	150	}
160	151	$out .= Xml::closeElement('ul');
161	152	}
162	153
163	154	$wgOut->addHtml($out);
164	155	}
	156	+
	157	+ /**
	158	+ * Build the list item for one page on the auto-disambig page: the page
	159	+ * link plus the context selected by $wgSpecialIndexContext.
	160	+ *
	161	+ * @param Title $title Page to link to
	162	+ * @param array $cats Strings listed after the link in 'categories' mode (inserted as-is)
	163	+ * @return string HTML <li> element
	164	+ */
	165	+ private function makeContextLine( $title, $cats ) {
	166	+ global $wgUser, $wgSpecialIndexContext;
	167	+ $sk = $wgUser->getSkin();
	168	+ $link = $sk->link( $title, null, array(), array(), array( 'known', 'noclasses' ) );
	169	+ $line = $link; // fallback: bare link (unknown mode, empty extract, no categories)
	170	+ if ( $wgSpecialIndexContext == 'extract' ) {
	171	+ $extracter = new IndexAbstracts();
	172	+ $text = $extracter->getExtract( $title );
	173	+ $name = $title->getPrefixedText();
	174	+ $pos = ( $text != '' ) ? stripos( $text, $name ) : false;
	175	+ if ( $pos !== false ) {
	176	+ // Put the link over the first mention of the title. The extract is plain text, so escape it
	177	+ $line = htmlspecialchars( substr( $text, 0, $pos ) ) . $link
	178	+ . htmlspecialchars( substr( $text, $pos + strlen( $name ) ) );
	179	+ } elseif ( $text != '' ) {
	180	+ $line = $link . ' &ndash; ' . htmlspecialchars( $text );
	181	+ }
	182	+ } elseif ( $wgSpecialIndexContext == 'categories' && $cats ) {
	183	+ $line = $link . ' &ndash; ' . implode( ', ', $cats );
	184	+ }
	185	+ return Xml::tags( 'li', array(), $line );
	186	+ }
	187	+
165	188	}
166	189

Comments

#Comment by Platonides (talk | contribs) 16:01, 8 February 2011

What about making OpenSearchXML use this class?

Status & tagging log

15:36, 29 July 2009 😂 (talk | contribs) changed the status of r53910 [removed: new added: deferred]