r60233 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r60232‎ | r60233 | r60234 >
Date:21:36, 19 December 2009
Author:juliano
Status:deferred
Tags:
Comment:
Article summary extraction improvements.
Now a "--more--" marker alone on a line separates the summary (above) from the
rest of the article (below). The <summary> tag takes precedence over this new
marker. If neither is found, the first section is used as the summary.
Modified paths:
  • /trunk/extensions/Wikilog/RELEASE-NOTES (modified) (history)
  • /trunk/extensions/Wikilog/Wikilog.i18n.magic.php (modified) (history)
  • /trunk/extensions/Wikilog/Wikilog.php (modified) (history)
  • /trunk/extensions/Wikilog/WikilogParser.php (modified) (history)
  • /trunk/extensions/Wikilog/WikilogUtils.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Wikilog/Wikilog.i18n.magic.php
@@ -21,6 +21,7 @@
2222 'wlk-subtitle' => array( 0, 'subtitle' ),
2323 'wlk-summary' => array( 1, 'summary' ),
2424 'wlk-hidden' => array( 0, 'hidden' ),
 25+ 'wlk-more' => array( 0, 'more' ),
2526 );
2627
2728 $magicWords['ml'] = array(
@@ -40,6 +41,7 @@
4142 'wlk-subtitle' => array( '0', 'subtítulo', 'subtitle' ),
4243 'wlk-summary' => array( '1', 'resumo', 'summary' ),
4344 'wlk-hidden' => array( '0', 'oculto', 'hidden' ),
 45+ 'wlk-more' => array( '0', 'mais', 'more' ),
4446 );
4547
4648 /**
@@ -54,4 +56,5 @@
5557 'wlk-subtitle' => array( '0', 'subtítulo', 'subtitle' ),
5658 'wlk-summary' => array( '1', 'resumo', 'summary' ),
5759 'wlk-hidden' => array( '0', 'oculto', 'hidden' ),
 60+ 'wlk-more' => array( '0', 'mais', 'more' ),
5861 );
Index: trunk/extensions/Wikilog/Wikilog.php
@@ -127,8 +127,9 @@
128128 // WikilogParser hooks
129129 $wgHooks['ParserFirstCallInit'][] = 'WikilogParser::FirstCallInit';
130130 $wgHooks['ParserClearState'][] = 'WikilogParser::ClearState';
131 -$wgHooks['ParserBeforeInternalParse'][] = 'WikilogParser::BeforeInternalParse';
 131+$wgHooks['ParserBeforeStrip'][] = 'WikilogParser::BeforeStrip';
132132 $wgHooks['ParserAfterTidy'][] = 'WikilogParser::AfterTidy';
 133+$wgHooks['InternalParseBeforeLinks'][] = 'WikilogParser::InternalParseBeforeLinks';
133134 $wgHooks['GetLocalURL'][] = 'WikilogParser::GetLocalURL';
134135 $wgHooks['GetFullURL'][] = 'WikilogParser::GetFullURL';
135136
Index: trunk/extensions/Wikilog/RELEASE-NOTES
@@ -13,6 +13,10 @@
1414 others replaced by wikilog-summary-header/footer, wikilog-entry-header/footer
1515 and others, with better support for plural and gender inflection, separate
1616 date and time, and also support to category and tag lists.
 17+* (wl:issue 2) Improved article summary extraction. Now a "--more--" marker
 18+ alone in a line breaks the summary (above) from the rest of the article
 19+ (below). The <summary> tag has precedence over this new marker. If neither
 20+ is found, the first section is used.
1721
1822 === Bug fixes ===
1923
Index: trunk/extensions/Wikilog/WikilogParser.php
@@ -35,6 +35,11 @@
3636 class WikilogParser
3737 {
3838 /**
 39+ * Anchor printed when a --more-- separator is substituted.
 40+ */
 41+ const MORE_ANCHOR = "<span id=\"wl-more\"></span>";
 42+
 43+ /**
3944 * True if parsing articles with feed output specific settings.
4045 * This is an horrible hack needed because of many MediaWiki misdesigns.
4146 */
@@ -73,7 +78,10 @@
7479 * ParserClearState hook handler function.
7580 */
7681 public static function ClearState( &$parser ) {
 82+ # These two parser attributes contain our private information.
 83+ # They take a piggyback ride on the parser object.
7784 $parser->mExtWikilog = new WikilogParserOutput;
 85+ $parser->mExtWikilogInfo = NULL;
7886
7987 # Disable TOC in feeds.
8088 if ( self::$feedParsing ) {
@@ -83,9 +91,9 @@
8492 }
8593
8694 /**
87 - * ParserBeforeInternalParse hook handler function.
 95+ * ParserBeforeStrip hook handler function.
8896 */
89 - public static function BeforeInternalParse( &$parser, &$text, &$stripState ) {
 97+ public static function BeforeStrip( &$parser, &$text, &$stripState ) {
9098 global $wgUser;
9199
92100 # Do nothing if a title is not set.
@@ -93,13 +101,13 @@
94102 return true;
95103
96104 # Do nothing if it is not a wikilog article.
97 - if ( ! ( $wi = Wikilog::getWikilogInfo( $parser->getTitle() ) ) )
 105+ if ( ! ( $parser->mExtWikilogInfo = Wikilog::getWikilogInfo( $title ) ) )
98106 return true;
99107
100 - if ( $wi->isItem() ) {
 108+ if ( $parser->mExtWikilogInfo->isItem() ) {
101109 # By default, use the item name as the default sort in categories.
102110 # This can be overriden by {{DEFAULTSORT:...}} if the user wants.
103 - $parser->setDefaultSort( $wi->getItemName() );
 111+ $parser->setDefaultSort( $parser->mExtWikilogInfo->getItemName() );
104112 }
105113
106114 return true;
@@ -114,10 +122,52 @@
115123 }
116124
117125 /**
 126+ * InternalParseBeforeLinks hook handler function. Called after nowiki,
 127+ * comments and templates are treated.
 128+ * For wikilog pages, look for the "--more--" marker and extract the
 129+ * article summary before it. If not found, look for the first heading
 130+ * and use the text before it (intro section).
 131+ */
 132+ public static function InternalParseBeforeLinks( &$parser, &$text, &$stripState ) {
 133+ if ( $parser->mExtWikilogInfo && $parser->mExtWikilogInfo->isItem() ) {
 134+ static $moreRegex = false;
 135+ if ( $moreRegex === false ) {
 136+ $mwMore =& MagicWord::get( 'wlk-more' );
 137+ $words = $mwMore->getBaseRegex();
 138+ $flags = $mwMore->getRegexCase();
 139+ $moreRegex = "/(?<=^|\\n)--+ *(?:$words) *--+\s*/$flags";
 140+ }
 141+
 142+ # Find and replace the --more-- marker. Extract summary.
 143+ # We do it anyway even if the summary is already set, in order
 144+ # to replace the marker with an invisible anchor.
 145+ $p = preg_split( $moreRegex, $text, 2 );
 146+ if ( count( $p ) > 1 ) {
 147+ self::trySetSummary( $parser, trim( $p[0] ) );
 148+ $anchor = $parser->insertStripItem( self::MORE_ANCHOR );
 149+ $text = $p[0] . $anchor . $p[1];
 150+ } else if ( !$parser->mExtWikilog->mSummary ) {
 151+ # Otherwise, make a summary from the intro section.
 152+ # Why we don't use $parser->getSection()? Because it has the
 153+ # side-effect of clearing the parser state, which is bad here
 154+ # since this hook happens during parsing. Instead, we
 155+ # anticipate the $parser->doHeadings() call and extract the
 156+ # text before the first heading.
 157+ $text = $parser->doHeadings( $text );
 158+ $p = preg_split( '/<(h[1-6])\\b.*?>.*?<\\/\\1\\s*>/i', $text, 2 );
 159+ if ( count( $p ) > 1 ) {
 160+ self::trySetSummary( $parser, trim( $p[0] ) );
 161+ }
 162+ }
 163+ }
 164+ return true;
 165+ }
 166+
 167+ /**
118168 * GetLocalURL hook handler function.
119169 * Expands local URL @a $url if self::$expandingUrls is true.
120170 */
121 - static function GetLocalURL( &$title, &$url, $query ) {
 171+ public static function GetLocalURL( &$title, &$url, $query ) {
122172 if ( self::$expandingUrls ) {
123173 $url = wfExpandUrl( $url );
124174 }
@@ -132,7 +182,7 @@
133183 * from Title::getLocalURL() in situations where action != 'render'.
134184 * @todo Report this bug to MediaWiki bugzilla.
135185 */
136 - static function GetFullURL( &$title, &$url, $query ) {
 186+ public static function GetFullURL( &$title, &$url, $query ) {
137187 global $wgServer;
138188 if ( self::$expandingUrls ) {
139189 $l = strlen( $wgServer );
@@ -155,14 +205,8 @@
156206
157207 # Remove extra space to make block rendering easier.
158208 $text = trim( $text );
 209+ self::trySetSummary( $parser, $text );
159210
160 - if ( !$parser->mExtWikilog->mSummary ) {
161 - $popt = $parser->getOptions();
162 - $popt->enableLimitReport( false );
163 - $output = $parser->parse( $text, $parser->getTitle(), $popt, true, false );
164 - $parser->mExtWikilog->mSummary = $output->getText();
165 - }
166 -
167211 $hidden = WikilogUtils::arrayMagicKeyGet( $params, $mwHidden );
168212 return $hidden ? '<!-- -->' : $parser->recursiveTagParse( $text );
169213 }
@@ -401,6 +445,23 @@
402446 #
403447
404448 /**
 449+ * Set the article summary, ignore if already set.
 450+ * @return True if set, false otherwise.
 451+ */
 452+ private static function trySetSummary( &$parser, $text ) {
 453+ if ( !$parser->mExtWikilog->mSummary ) {
 454+ $popt = clone $parser->getOptions();
 455+ $popt->enableLimitReport( false );
 456+ $output = $parser->parse( $text, $parser->getTitle(), $popt, true, false );
 457+ $parser->mExtWikilog->mSummary = $output->getText();
 458+// wfDebug( "Wikilog summary set to:\n----\n" . $parser->mExtWikilog->mSummary . "\n----\n" );
 459+ return true;
 460+ } else {
 461+ return false;
 462+ }
 463+ }
 464+
 465+ /**
405466 * Adds an author to the current article. If too many authors, warns.
406467 * @return False on overflow, true otherwise.
407468 */
Index: trunk/extensions/Wikilog/WikilogUtils.php
@@ -237,31 +237,39 @@
238238 }
239239
240240 /**
241 - * Split summary of a wikilog post from the contents.
242 - * If summary was provided in <summary>...</summary> tags, use it,
243 - * otherwise, use some heuristics to find it in the content.
 241+ * Split summary of a wikilog article from the contents.
 242+ * If summary is part of the parser output, use it; otherwise, try to
 243+ * extract it from the content text (section zero, before the first
 244+ * heading).
 245+ *
 246+ * @param $parserOutput ParserOutput object.
 247+ * @return Two-element array with summary and content. Summary may be
 248+ * NULL if nonexistent.
244249 */
245250 public static function splitSummaryContent( $parserOutput ) {
 251+ global $wgUseTidy;
 252+
246253 $content = Sanitizer::removeHTMLcomments( $parserOutput->getText() );
247254
248255 if ( isset( $parserOutput->mExtWikilog ) && $parserOutput->mExtWikilog->mSummary ) {
 256+ # Parser output contains wikilog output and summary, use it.
249257 $summary = Sanitizer::removeHTMLcomments( $parserOutput->mExtWikilog->mSummary );
250258 } else {
251 - $blocks = preg_split( '/< (h[1-6]) .*? > .*? <\\/\\1>/ix', $content );
252 -
 259+ # Try to extract summary from the content text.
 260+ $blocks = preg_split( '/<(h[1-6]).*?>.*?<\\/\\1>/i', $content, 2 );
253261 if ( count( $blocks ) > 1 ) {
254 - # Long article, get only the first paragraph.
255 - $pextr = '/<(p)
256 - ( \\s+ (?: [^\'"\\/>] | \'[^\']*\' | "[^"]*" )* )?
257 - (?: > .*? <\\/\\1\\s*> | \\/> )/isx';
258 -
259 - if ( preg_match_all( $pextr, $blocks[0], $m ) ) {
260 - $summary = implode( "\n", $m[0] );
261 - } else {
262 - $summary = NULL;
 262+ # Long article with multiple sections, use only the first one.
 263+ $summary = $blocks[0];
 264+ # It is possible for the regex to split on a heading that is
 265+ # not a child of the root element (e.g. <div><h2>...</h2>
 266+ # </div> leaving an open <div> tag). In order to handle such
 267+ # cases, we pass the summary through tidy if it is available.
 268+ if ( $wgUseTidy ) {
 269+ $summary = MWTidy::tidy( $summary );
263270 }
264271 } else {
265 - # Short article, no summary.
 272+ # Short article with a single section, use no summary and
 273+ # leave to the caller to decide what to do.
266274 $summary = NULL;
267275 }
268276 }

Status & tagging log