r114161 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r114160‎ | r114161 | r114162 >
Date:18:17, 19 March 2012
Author:maxsem
Status:ok
Tags:
Comment:
Follow-up r114129:
* Fixed leftovers from excerpts --> extracts rename
* ...including broken OpenSearchXml hook handler
* Split exlength to exchars and exsentences for better control over stuff being returned
Modified paths:
  • /trunk/extensions/MobileFrontend/MobileFrontend.php (modified) (history)
  • /trunk/extensions/MobileFrontend/api/ApiQueryExtracts.php (modified) (history)

Diff [purge]

Index: trunk/extensions/MobileFrontend/api/ApiQueryExtracts.php
@@ -21,9 +21,10 @@
2222 wfProfileOut( __METHOD__ );
2323 return;
2424 }
25 - $isXml = $this->getMain()->getPrinter()->getFormat() == 'XML';
 25+ $isXml = $this->getMain()->isInternalMode() || $this->getMain()->getPrinter()->getFormat() == 'XML';
2626 $result = $this->getResult();
2727 $params = $this->params = $this->extractRequestParams();
 28+ $this->requireMaxOneParameter( $params, 'chars', 'sentences' );
2829 $continue = 0;
2930 $limit = intval( $params['limit'] );
3031 if ( $limit > 1 && !$params['intro'] ) {
@@ -45,9 +46,8 @@
4647 break;
4748 }
4849 $text = $this->getExtract( $t );
49 - if ( isset( $params['length'] ) ) {
50 - $text = $this->trimText( $text );
51 - }
 50+ $text = $this->truncate( $text );
 51+
5252 if ( $isXml ) {
5353 $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', array( '*' => $text ) );
5454 } else {
@@ -74,7 +74,7 @@
7575 $api = new ApiMain( new FauxRequest(
7676 array(
7777 'action' => 'query',
78 - 'prop' => 'excerpts',
 78+ 'prop' => 'extracts',
7979 'explaintext' => true,
8080 'exlimit' => count( $results ),
8181 'pageids' => implode( '|', $pageIds ),
@@ -83,8 +83,8 @@
8484 $api->execute();
8585 $data = $api->getResultData();
8686 foreach ( $pageIds as $id ) {
87 - if ( isset( $data['query']['pages'][$id]['excerpts'][0] ) ) {
88 - $results[$id]['extract'] = $data['query']['pages'][$id]['extract'][0];
 87+ if ( isset( $data['query']['pages'][$id]['extract']['*'] ) ) {
 88+ $results[$id]['extract'] = $data['query']['pages'][$id]['extract']['*'];
8989 $results[$id]['extract trimmed'] = false;
9090 }
9191 }
@@ -92,7 +92,7 @@
9393 }
9494
9595 /**
96 - * Returns a processed, but not trimmed excerpt
 96+ * Returns a processed, but not trimmed extract
9797 * @param Title $title
9898 * @return string
9999 */
@@ -188,10 +188,8 @@
189189 }
190190
191191 /**
192 - * Converts page HTML into an excerpt
 192+ * Converts page HTML into an extract
193193 * @param string $text
194 - * @param Title $title
195 - * @param bool $plainText
196194 * @return string
197195 */
198196 private function convertText( $text ) {
@@ -203,16 +201,22 @@
204202 return trim( $text );
205203 }
206204
/**
 * Shortens an extract according to the request parameters.
 *
 * A non-empty 'chars' parameter takes precedence and limits the text by
 * character count; otherwise a non-empty 'sentences' parameter limits it
 * by sentence count. With neither set, the text is returned untouched.
 *
 * @param string $text
 * @return string
 */
private function truncate( $text ) {
    $charLimit = $this->params['chars'];
    if ( $charLimit ) {
        return $this->getFirstChars( $text, $charLimit );
    }

    $sentenceLimit = $this->params['sentences'];
    if ( $sentenceLimit ) {
        return $this->getFirstSentences( $text, $sentenceLimit );
    }

    return $text;
}
 213+
207214 /**
208215 *
209216 * @param string $text
210217 * @param int $requestedLength
211 - * @param bool $plainText
212218 * @return string
213219 */
214 - private function trimText( $text, $requestedLength, $plainText ) {
215 - global $wgUseTidy;
216 -
 220+ private function getFirstChars( $text, $requestedLength ) {
217221 wfProfileIn( __METHOD__ );
218222 $length = mb_strlen( $text );
219223 if ( $length <= $requestedLength ) {
@@ -223,20 +227,69 @@
224228 preg_match( $pattern, $text, $m );
225229 $text = $m[0];
226230 // Fix possibly unclosed tags
227 - if ( $wgUseTidy && !$plainText ) {
 231+ $text = $this->tidy( $text );
 232+ $text .= wfMessage( 'ellipsis' )->inContentLanguage()->text();
 233+ wfProfileOut( __METHOD__ );
 234+ return $text;
 235+ }
 236+
/**
 * Returns the first $requestedSentenceCount sentences of the given text.
 *
 * If that many sentence terminators cannot be matched, the first line of
 * the text (trimmed) is used instead. The result is passed through
 * tidy() before being returned.
 *
 * @param string $text
 * @param int $requestedSentenceCount
 * @return string
 */
private function getFirstSentences( $text, $requestedSentenceCount ) {
    wfProfileIn( __METHOD__ );
    // Based on code from OpenSearchXml by Brion Vibber
    // Non-ASCII terminators are written as \x{...} escapes (valid under the
    // /u modifier) so they survive any re-encoding of this file; a literal
    // "." or "?" here would act as a regex metacharacter.
    $endchars = array(
        '([^\d])\.\s', '\!\s', '\?\s', // regular ASCII
        '\x{3002}', // full-width ideographic full stop
        '\x{FF0E}', '\x{FF01}', '\x{FF1F}', // double-width roman forms of . ! ?
        '\x{FF61}', // half-width ideographic full stop
    );

    $endgroup = implode( '|', $endchars );
    $end = "(?:$endgroup)";
    $sentence = ".+?$end+";
    // Force an integer so the value is always a valid {n} quantifier
    $count = intval( $requestedSentenceCount );
    $regexp = "/^($sentence){" . $count . "}/u";
    $matches = array();
    if ( preg_match( $regexp, $text, $matches ) ) {
        $text = $matches[0];
    } else {
        // Just return the first line
        $lines = explode( "\n", $text );
        $text = trim( $lines[0] );
    }
    // Single exit point: previously both branches returned early, so the
    // tidy() call and wfProfileOut() below were never reached.
    $text = $this->tidy( $text );
    wfProfileOut( __METHOD__ );
    return $text;
}
 268+
 269+ /**
 270+ * A simple wrapper around tidy
 271+ * @param string $text
 272+ */
 273+ private function tidy( $text ) {
 274+ global $wgUseTidy;
 275+
 276+ wfProfileIn( __METHOD__ );
 277+ if ( $wgUseTidy && !$this->params['plaintext'] ) {
228278 $text = trim ( MWTidy::tidy( $text ) );
229279 }
230 - $text .= wfMessage( 'ellipsis' )->inContentLanguage()->text();
231280 wfProfileOut( __METHOD__ );
232281 return $text;
233282 }
234283
235284 public function getAllowedParams() {
236285 return array(
237 - 'length' => array(
 286+ 'chars' => array(
238287 ApiBase::PARAM_TYPE => 'integer',
239288 ApiBase::PARAM_MIN => 1,
240289 ),
 290+ 'sentences' => array(
 291+ ApiBase::PARAM_TYPE => 'integer',
 292+ ApiBase::PARAM_MIN => 1,
 293+ ),
241294 'limit' => array(
242295 ApiBase::PARAM_DFLT => 1,
243296 ApiBase::PARAM_TYPE => 'limit',
@@ -258,7 +311,8 @@
259312
260313 public function getParamDescription() {
261314 return array(
262 - 'length' => 'How many characters to return, actual text returned might be slightly longer.',
 315+ 'chars' => 'How many characters to return, actual text returned might be slightly longer.',
 316+ 'sentences' => 'How many sentences to return',
263317 'limit' => 'How many extracts to return. ',
264318 'intro' => 'Return only content before the first section',
265319 'plaintext' => 'Return extracts as plaintext instead of limited HTML',
@@ -284,7 +338,7 @@
285339
286340 public function getExamples() {
287341 return array(
288 - 'api.php?action=query&prop=extracts&exlength=175&titles=Therion' => 'Get a 175-character extract',
 342+ 'api.php?action=query&prop=extracts&exchars=175&titles=Therion' => 'Get a 175-character extract',
289343 );
290344 }
291345
@@ -329,8 +383,8 @@
330384 $text = parent::getText();
331385 if ( $this->plainText ) {
332386 $text = html_entity_decode( $text );
333 - $text = str_replace( "\r", "\n", $text );
334 - $text = preg_replace( "/\n{3,}/", "\n\n", $text );
 387+ $text = str_replace( "\r", "\n", $text ); // for Windows
 388+ $text = preg_replace( "/\n{3,}/", "\n\n", $text ); // normalise newlines
335389 $text = preg_replace_callback(
336390 "/" . ApiQueryExtracts::SECTION_MARKER_START . '(\d)'. ApiQueryExtracts::SECTION_MARKER_END . "(.*?)$/m",
337391 array( $this, 'sectionCallback' ),
Index: trunk/extensions/MobileFrontend/MobileFrontend.php
@@ -132,7 +132,7 @@
133133 $wgHooks['APIAfterExecute'][] = 'ApiParseExtender::onAPIAfterExecute';
134134 $wgHooks['APIGetParamDescription'][] = 'ApiParseExtender::onAPIGetParamDescription';
135135 $wgHooks['APIGetDescription'][] = 'ApiParseExtender::onAPIGetDescription';
136 -$wgHooks['OpenSearchXml'][] = 'ApiQueryExcerpts::onOpenSearchXml';
 136+$wgHooks['OpenSearchXml'][] = 'ApiQueryExtracts::onOpenSearchXml';
137137
138138 function efMobileFrontend_Setup() {
139139 global $wgExtMobileFrontend, $wgHooks;
@@ -174,6 +174,6 @@
175175 }
176176
177177 /**
178 - * Whether this extension should provide its excerpts to OpenSearchXml extension
 178+ * Whether this extension should provide its extracts to OpenSearchXml extension
179179 */
180180 $wgMFExtendOpenSearchXml = false;

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r114129Text extraction rewrite:...maxsem12:19, 19 March 2012

Status & tagging log