r114129 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r114128 | r114129 | r114130 >
Date:12:19, 19 March 2012
Author:maxsem
Status:ok
Tags:
Comment:
Text extraction rewrite:
* Renamed prop=excerpts --> prop=extracts
* Made it optionally return whole page extracts
* More reasonably structured output: no more dummy 1-element arrays just because of API's awkward past. Looks good both in XML and sane formats.
Will rename the file in the next commit.
Modified paths:
  • /trunk/extensions/MobileFrontend/MobileFrontend.php (modified) (history)
  • /trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php (modified) (history)

Diff [purge]

Index: trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php
@@ -1,10 +1,14 @@
22 <?php
33
4 -class ApiQueryExcerpts extends ApiQueryBase {
 4+class ApiQueryExtracts extends ApiQueryBase {
 5+ const SECTION_MARKER_START = "\1\2";
 6+ const SECTION_MARKER_END = "\2\1";
 7+
58 /**
69 * @var ParserOptions
710 */
811 private $parserOptions;
 12+ private $params;
913
1014 public function __construct( $query, $moduleName ) {
1115 parent::__construct( $query, $moduleName, 'ex' );
@@ -17,8 +21,16 @@
1822 wfProfileOut( __METHOD__ );
1923 return;
2024 }
21 - $params = $this->extractRequestParams();
 25+ $isXml = $this->getMain()->getPrinter()->getFormat() == 'XML';
 26+ $result = $this->getResult();
 27+ $params = $this->params = $this->extractRequestParams();
2228 $continue = 0;
 29+ $limit = intval( $params['limit'] );
 30+ if ( $limit > 1 && !$params['intro'] ) {
 31+ $limit = 1;
 32+ ///@todo:
 33+ //$result->setWarning( "Provided limit was too large for requests for whole article extracts, lowered to $limit" );
 34+ }
2335 if ( isset( $params['continue'] ) ) {
2436 $continue = intval( $params['continue'] );
2537 if ( $continue < 0 || $continue > count( $titles ) ) {
@@ -28,15 +40,19 @@
2941 }
3042 $count = 0;
3143 foreach ( $titles as $id => $t ) {
32 - if ( ++$count > $params['limit'] ) {
 44+ if ( ++$count > $limit ) {
3345 $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
3446 break;
3547 }
36 - $text = $this->getExcerpt( $t, $params['plaintext'] );
 48+ $text = $this->getExtract( $t );
3749 if ( isset( $params['length'] ) ) {
38 - $text = $this->trimText( $text, $params['length'], $params['plaintext'] );
 50+ $text = $this->trimText( $text );
3951 }
40 - $fit = $this->addPageSubItem( $id, $text );
 52+ if ( $isXml ) {
 53+ $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', array( '*' => $text ) );
 54+ } else {
 55+ $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', $text );
 56+ }
4157 if ( !$fit ) {
4258 $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
4359 break;
@@ -68,7 +84,7 @@
6985 $data = $api->getResultData();
7086 foreach ( $pageIds as $id ) {
7187 if ( isset( $data['query']['pages'][$id]['excerpts'][0] ) ) {
72 - $results[$id]['extract'] = $data['query']['pages'][$id]['excerpts'][0];
 88+ $results[$id]['extract'] = $data['query']['pages'][$id]['extract'][0];
7389 $results[$id]['extract trimmed'] = false;
7490 }
7591 }
@@ -78,28 +94,63 @@
7995 /**
8096 * Returns a processed, but not trimmed excerpt
8197 * @param Title $title
82 - * @return string
 98+ * @return string
8399 */
84 - private function getExcerpt( Title $title, $plainText ) {
85 - global $wgMemc;
86 -
 100+ private function getExtract( Title $title ) {
87101 wfProfileIn( __METHOD__ );
88102 $page = WikiPage::factory( $title );
89 - $key = wfMemcKey( 'mf', 'excerpt', $plainText, $title->getArticleID(), $page->getLatest() );
90 - $text = $wgMemc->get( $key );
91 - if ( $text !== false ) {
92 - wfProfileOut( __METHOD__ );
93 - return $text;
 103+
 104+ $introOnly = $this->params['intro'];
 105+ $text = $this->getFromCache( $page, $introOnly );
 106+ // if we need just first section, try retrieving full page and getting first section out of it
 107+ if ( $text === false && $introOnly ) {
 108+ $text = $this->getFromCache( $page, false );
 109+ if ( $text !== false ) {
 110+ $text = $this->getFirstSection( $text, $this->params['plaintext'] );
 111+ }
94112 }
95 - $text = $this->parse( $page );
96 - $text = $this->convertText( $text, $title, $plainText );
97 - $wgMemc->set( $key, $text );
 113+ if ( $text === false ) {
 114+ $text = $this->parse( $page );
 115+ $text = $this->convertText( $text, $title, $this->params['plaintext'] );
 116+ $this->setCache( $page, $text );
 117+ }
98118 wfProfileOut( __METHOD__ );
99119 return $text;
100120 }
101121
 122+ private function cacheKey( WikiPage $page, $introOnly ) {
 123+ return wfMemcKey( 'mf', 'extract', $page->getLatest(), $this->params['plaintext'], $introOnly );
 124+ }
 125+
 126+ private function getFromCache( WikiPage $page, $introOnly ) {
 127+ global $wgMemc;
 128+
 129+ $key = $this->cacheKey( $page, $introOnly );
 130+ return $wgMemc->get( $key );
 131+ }
 132+
 133+ private function setCache( WikiPage $page, $text ) {
 134+ global $wgMemc;
 135+
 136+ $key = $this->cacheKey( $page, $this->params['intro'] );
 137+ $wgMemc->set( $key, $text );
 138+ }
 139+
 140+ private function getFirstSection( $text, $plainText ) {
 141+ if ( $plainText ) {
 142+ $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START . ')/s';
 143+ } else {
 144+ $regexp = '/^(.*?)(?=<h[1-6]\b)/s';
 145+ }
 146+ if ( preg_match( $regexp, $text, $matches ) ) {
 147+ wfDebugDieBacktrace();
 148+ $text = $matches[0];
 149+ }
 150+ return $text;
 151+ }
 152+
102153 /**
103 - * Returns HTML of page's zeroth section
 154+ * Returns page HTML
104155 * @param WikiPage $page
105156 * @return string
106157 */
@@ -113,20 +164,23 @@
114165 $pout = ParserCache::singleton()->get( $page, $this->parserOptions );
115166 if ( $pout ) {
116167 $text = $pout->getText();
117 - $s = preg_replace( '/<h[1-6].*$/s', '', $text );
 168+ if ( $this->params['intro'] ) {
 169+ $text = $this->getFirstSection( $text, false );
 170+ }
118171 wfProfileOut( __METHOD__ );
119 - return $s;
 172+ return $text;
120173 }
121174 }
 175+ $request = array(
 176+ 'action' => 'parse',
 177+ 'page' => $page->getTitle()->getPrefixedText(),
 178+ 'prop' => 'text'
 179+ );
 180+ if ( $this->params['intro'] ) {
 181+ $request['section'] = 0;
 182+ }
122183 // in case of cache miss, render just the needed section
123 - $api = new ApiMain( new FauxRequest(
124 - array(
125 - 'action' => 'parse',
126 - 'page' => $page->getTitle()->getPrefixedText(),
127 - 'section' => 0,
128 - 'prop' => 'text'
129 - ) )
130 - );
 184+ $api = new ApiMain( new FauxRequest( $request ) );
131185 $api->execute();
132186 $data = $api->getResultData();
133187 wfProfileOut( __METHOD__ );
@@ -140,23 +194,11 @@
141195 * @param bool $plainText
142196 * @return string
143197 */
144 - private function convertText( $text, Title $title, $plainText ) {
 198+ private function convertText( $text ) {
145199 wfProfileIn( __METHOD__ );
146 - $fmt = new HtmlFormatter( HtmlFormatter::wrapHTML( $text, false ), $title, 'XHTML' );
147 - $fmt->removeImages();
148 - $fmt->remove( array( 'table', 'div', 'sup.reference', 'span.coordinates',
149 - 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' )
150 - );
151 - if ( $plainText ) {
152 - $fmt->flattenAllTags();
153 - } else {
154 - $fmt->flatten( array( 'span', 'a' ) );
155 - }
156 - $fmt->filterContent();
 200+ $fmt = new ExtractFormatter( $text, $this->params['plaintext'], $this->params['sectionformat'] );
157201 $text = $fmt->getText();
158 - if ( $plainText ) {
159 - $text = html_entity_decode( $text );
160 - }
 202+
161203 wfProfileOut( __METHOD__ );
162204 return trim( $text );
163205 }
@@ -202,7 +244,12 @@
203245 ApiBase::PARAM_MAX => 20,
204246 ApiBase::PARAM_MAX2 => 20,
205247 ),
 248+ 'intro' => false,
206249 'plaintext' => false,
 250+ 'sectionformat' => array(
 251+ ApiBase::PARAM_TYPE => ExtractFormatter::$sectionFormats,
 252+ ApiBase::PARAM_DFLT => 'wiki',
 253+ ),
207254 'continue' => array(
208255 ApiBase::PARAM_TYPE => 'integer',
209256 ),
@@ -212,14 +259,21 @@
213260 public function getParamDescription() {
214261 return array(
215262 'length' => 'How many characters to return, actual text returned might be slightly longer.',
216 - 'limit' => 'How many excerpts to return',
217 - 'plaintext' => 'Return excerpts as plaintext instead of limited HTML',
 263+ 'limit' => 'How many extracts to return. ',
 264+ 'intro' => 'Return only content before the first section',
 265+ 'plaintext' => 'Return extracts as plaintext instead of limited HTML',
 266+ 'sectionformat' => array(
 267+ 'How to format sections in plaintext mode:',
 268+ ' none - No formatting',
 269+ ' wiki - Wikitext-style formatting == like this ==',
 270+ " raw - Return in this module's internal representation (secton titles prefixed with <ASCII 1><ASCII 2><section level><ASCII 2><ASCII 1>",
 271+ ),
218272 'continue' => 'When more results are available, use this to continue',
219273 );
220274 }
221275
222276 public function getDescription() {
223 - return 'Returns excerpts of the given page(s)';
 277+ return 'Returns plain-text or limited HTML extracts of the given page(s)';
224278 }
225279
226280 public function getPossibleErrors() {
@@ -230,7 +284,7 @@
231285
232286 public function getExamples() {
233287 return array(
234 - 'api.php?action=query&prop=excerpts&exlength=175&titles=Therion' => 'Get a 175-character excerpt',
 288+ 'api.php?action=query&prop=extracts&exlength=175&titles=Therion' => 'Get a 175-character extract',
235289 );
236290 }
237291
@@ -244,4 +298,72 @@
245299 }
246300 }
247301
 302+class ExtractFormatter extends HtmlFormatter {
 303+ private $plainText;
 304+ private $sectionFormat;
248305
 306+ public static $sectionFormats = array(
 307+ 'none',
 308+ 'wiki',
 309+ 'raw',
 310+ );
 311+
 312+ public function __construct( $text, $plainText, $sectionFormat ) {
 313+ parent::__construct( HtmlFormatter::wrapHTML( $text ) );
 314+ $this->plainText = $plainText;
 315+ $this->sectionFormat = $sectionFormat;
 316+
 317+ $this->removeImages();
 318+ $this->remove( array( 'table', 'div', '.editsection', 'sup.reference', 'span.coordinates',
 319+ 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' )
 320+ );
 321+ if ( $plainText ) {
 322+ $this->flattenAllTags();
 323+ } else {
 324+ $this->flatten( array( 'span', 'a' ) );
 325+ }
 326+ }
 327+
 328+ public function getText( $dummy = null ) {
 329+ $this->filterContent();
 330+ $text = parent::getText();
 331+ if ( $this->plainText ) {
 332+ $text = html_entity_decode( $text );
 333+ $text = str_replace( "\r", "\n", $text );
 334+ $text = preg_replace( "/\n{3,}/", "\n\n", $text );
 335+ $text = preg_replace_callback(
 336+ "/" . ApiQueryExtracts::SECTION_MARKER_START . '(\d)'. ApiQueryExtracts::SECTION_MARKER_END . "(.*?)$/m",
 337+ array( $this, 'sectionCallback' ),
 338+ $text
 339+ );
 340+ }
 341+ return $text;
 342+ }
 343+
 344+ public function onHtmlReady( $html ) {
 345+ if ( $this->plainText ) {
 346+ $html = preg_replace( '/\s*(<h([1-6])\b)/i',
 347+ ApiQueryExtracts::SECTION_MARKER_START . '$2' . ApiQueryExtracts::SECTION_MARKER_END . '$1' ,
 348+ $html
 349+ );
 350+ }
 351+ return $html;
 352+ }
 353+
 354+ private function sectionCallback( $matches ) {
 355+ if ( $this->sectionFormat == 'raw' ) {
 356+ return $matches[0];
 357+ }
 358+ $func = "ExtractFormatter::doSection_{$this->sectionFormat}";
 359+ return call_user_func( $func, $matches[1], trim( $matches[2] ) );
 360+ }
 361+
 362+ private static function doSection_wiki( $level, $text ) {
 363+ $bars = str_repeat( '=', $level );
 364+ return "\n$bars $text $bars";
 365+ }
 366+
 367+ private static function doSection_none( $level, $text ) {
 368+ return "\n$text";
 369+ }
 370+}
\ No newline at end of file
Index: trunk/extensions/MobileFrontend/MobileFrontend.php
@@ -52,7 +52,7 @@
5353
5454 'ApiMobileView' => 'api/ApiMobileView',
5555 'ApiParseExtender' => 'api/ApiParseExtender',
56 - 'ApiQueryExcerpts' => 'api/ApiQueryExcerpts',
 56+ 'ApiQueryExtracts' => 'api/ApiQueryExcerpts',
5757
5858 'MobileFrontendTemplate' => 'templates/MobileFrontendTemplate',
5959 'ApplicationTemplate' => 'templates/ApplicationTemplate',
@@ -125,7 +125,7 @@
126126
127127 $wgExtensionFunctions[] = 'efMobileFrontend_Setup';
128128
129 -$wgAPIPropModules['excerpts'] = 'ApiQueryExcerpts';
 129+$wgAPIPropModules['extracts'] = 'ApiQueryExtracts';
130130 $wgAPIModules['mobileview'] = 'ApiMobileView';
131131
132132 $wgHooks['APIGetAllowedParams'][] = 'ApiParseExtender::onAPIGetAllowedParams';

Follow-up revisions

Revision | Commit summary | Author | Date
r114130 | Follow-up r114129: rename file | maxsem | 12:24, 19 March 2012
r114161 | Follow-up r114129:... | maxsem | 18:17, 19 March 2012

Status & tagging log