Index: trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php |
— | — | @@ -1,10 +1,14 @@ |
2 | 2 | <?php |
3 | 3 | |
4 | | -class ApiQueryExcerpts extends ApiQueryBase { |
| 4 | +class ApiQueryExtracts extends ApiQueryBase { |
| 5 | + const SECTION_MARKER_START = "\1\2"; |
| 6 | + const SECTION_MARKER_END = "\2\1"; |
| 7 | + |
5 | 8 | /** |
6 | 9 | * @var ParserOptions |
7 | 10 | */ |
8 | 11 | private $parserOptions; |
| 12 | + private $params; |
9 | 13 | |
10 | 14 | public function __construct( $query, $moduleName ) { |
11 | 15 | parent::__construct( $query, $moduleName, 'ex' ); |
— | — | @@ -17,8 +21,16 @@ |
18 | 22 | wfProfileOut( __METHOD__ ); |
19 | 23 | return; |
20 | 24 | } |
21 | | - $params = $this->extractRequestParams(); |
| 25 | + $isXml = $this->getMain()->getPrinter()->getFormat() == 'XML'; |
| 26 | + $result = $this->getResult(); |
| 27 | + $params = $this->params = $this->extractRequestParams(); |
22 | 28 | $continue = 0; |
| 29 | + $limit = intval( $params['limit'] ); |
| 30 | + if ( $limit > 1 && !$params['intro'] ) { |
| 31 | + $limit = 1; |
| 32 | + ///@todo: |
| 33 | + //$result->setWarning( "Provided limit was too large for requests for whole article extracts, lowered to $limit" ); |
| 34 | + } |
23 | 35 | if ( isset( $params['continue'] ) ) { |
24 | 36 | $continue = intval( $params['continue'] ); |
25 | 37 | if ( $continue < 0 || $continue > count( $titles ) ) { |
— | — | @@ -28,15 +40,19 @@ |
29 | 41 | } |
30 | 42 | $count = 0; |
31 | 43 | foreach ( $titles as $id => $t ) { |
32 | | - if ( ++$count > $params['limit'] ) { |
| 44 | + if ( ++$count > $limit ) { |
33 | 45 | $this->setContinueEnumParameter( 'continue', $continue + $count - 1 ); |
34 | 46 | break; |
35 | 47 | } |
36 | | - $text = $this->getExcerpt( $t, $params['plaintext'] ); |
| 48 | + $text = $this->getExtract( $t ); |
37 | 49 | if ( isset( $params['length'] ) ) { |
38 | | - $text = $this->trimText( $text, $params['length'], $params['plaintext'] ); |
| 50 | + $text = $this->trimText( $text ); |
39 | 51 | } |
40 | | - $fit = $this->addPageSubItem( $id, $text ); |
| 52 | + if ( $isXml ) { |
| 53 | + $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', array( '*' => $text ) ); |
| 54 | + } else { |
| 55 | + $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', $text ); |
| 56 | + } |
41 | 57 | if ( !$fit ) { |
42 | 58 | $this->setContinueEnumParameter( 'continue', $continue + $count - 1 ); |
43 | 59 | break; |
— | — | @@ -68,7 +84,7 @@ |
69 | 85 | $data = $api->getResultData(); |
70 | 86 | foreach ( $pageIds as $id ) { |
71 | 87 | if ( isset( $data['query']['pages'][$id]['excerpts'][0] ) ) { |
72 | | - $results[$id]['extract'] = $data['query']['pages'][$id]['excerpts'][0]; |
| 88 | + $results[$id]['extract'] = $data['query']['pages'][$id]['extract'][0]; |
73 | 89 | $results[$id]['extract trimmed'] = false; |
74 | 90 | } |
75 | 91 | } |
— | — | @@ -78,28 +94,63 @@ |
79 | 95 | /** |
80 | 96 | * Returns a processed, but not trimmed excerpt |
| | + * |
| | + * Tries the cache entry for the requested variant first. For intro-only requests it then |
| | + * tries to cut the intro out of a cached full-page extract, and only parses and converts |
| | + * the page (caching the result) when neither is available. |
81 | 97 | * @param Title $title |
82 | | - * @return string |
| 98 | + * @return string |
83 | 99 | */ |
84 | | - private function getExcerpt( Title $title, $plainText ) { |
85 | | - global $wgMemc; |
86 | | - |
| 100 | + private function getExtract( Title $title ) { |
87 | 101 | wfProfileIn( __METHOD__ ); |
88 | 102 | $page = WikiPage::factory( $title ); |
89 | | - $key = wfMemcKey( 'mf', 'excerpt', $plainText, $title->getArticleID(), $page->getLatest() ); |
90 | | - $text = $wgMemc->get( $key ); |
91 | | - if ( $text !== false ) { |
92 | | - wfProfileOut( __METHOD__ ); |
93 | | - return $text; |
| 103 | + |
| 104 | + $introOnly = $this->params['intro']; |
| | + $plainText = $this->params['plaintext']; |
| 105 | + $text = $this->getFromCache( $page, $introOnly ); |
| | + // Plaintext extracts keep their section markers only in 'raw' format. In the other formats |
| | + // getFirstSection() cannot find where the intro ends and would hand back the whole page. |
| | + $canSplitCached = !$plainText || $this->params['sectionformat'] == 'raw'; |
| 106 | + // if we need just first section, try retrieving full page and getting first section out of it |
| 107 | + if ( $text === false && $introOnly && $canSplitCached ) { |
| 108 | + $text = $this->getFromCache( $page, false ); |
| 109 | + if ( $text !== false ) { |
| | + // trim() to match what convertText() returns on the non-cached path |
| 110 | + $text = trim( $this->getFirstSection( $text, $plainText ) ); |
| 111 | + } |
94 | 112 | } |
95 | | - $text = $this->parse( $page ); |
96 | | - $text = $this->convertText( $text, $title, $plainText ); |
97 | | - $wgMemc->set( $key, $text ); |
| 113 | + if ( $text === false ) { |
| 114 | + $text = $this->parse( $page ); |
| | + // convertText() reads plaintext/sectionformat from $this->params, it only takes the text |
| 115 | + $text = $this->convertText( $text ); |
| 116 | + $this->setCache( $page, $text ); |
| 117 | + } |
98 | 118 | wfProfileOut( __METHOD__ ); |
99 | 119 | return $text; |
100 | 120 | } |
101 | 121 | |
| | + /** |
| | + * Builds the cache key for an extract of the given page |
| | + * |
| | + * The key covers everything the cached text depends on: the value of $page->getLatest(), |
| | + * plaintext vs. HTML mode, the section format (plaintext only) and whether only the intro is stored. |
| | + * @param WikiPage $page |
| | + * @param bool $introOnly |
| | + * @return string |
| | + */ |
| 122 | + private function cacheKey( WikiPage $page, $introOnly ) { |
| | + $plainText = $this->params['plaintext']; |
| | + // Section headings are rendered per sectionformat, but only in plaintext mode, |
| | + // so HTML extracts share one entry regardless of the requested format |
| | + $sectionFormat = $plainText ? $this->params['sectionformat'] : ''; |
| 123 | + return wfMemcKey( 'mf', 'extract', $page->getLatest(), $plainText, $sectionFormat, $introOnly ); |
| 124 | + } |
| 125 | + |
| | + /** |
| | + * Looks up a cached extract of the given page |
| | + * @param WikiPage $page |
| | + * @param bool $introOnly Whether to look for the intro-only variant |
| | + * @return mixed Whatever $wgMemc->get() returns; callers in this class treat false as a miss |
| | + */ |
| 126 | + private function getFromCache( WikiPage $page, $introOnly ) { |
| 127 | + global $wgMemc; |
| 128 | + |
| | + return $wgMemc->get( $this->cacheKey( $page, $introOnly ) ); |
| 131 | + } |
| 132 | + |
| | + /** |
| | + * Stores an extract of the given page in the cache |
| | + * |
| | + * The entry is filed under the variant (intro-only or whole page) of the current request. |
| | + * @param WikiPage $page |
| | + * @param string $text |
| | + */ |
| 133 | + private function setCache( WikiPage $page, $text ) { |
| 134 | + global $wgMemc; |
| 135 | + |
| | + $introOnly = $this->params['intro']; |
| | + $wgMemc->set( $this->cacheKey( $page, $introOnly ), $text ); |
| 138 | + } |
| 139 | + |
| | + /** |
| | + * Returns the part of the text that precedes the first section heading |
| | + * |
| | + * @param string $text |
| | + * @param bool $plainText If true, a heading is recognised by SECTION_MARKER_START, |
| | + * otherwise by an opening <h1>..<h6> tag |
| | + * @return string Text before the first heading, or $text unchanged if no heading is found |
| | + */ |
| 140 | + private function getFirstSection( $text, $plainText ) { |
| 141 | + if ( $plainText ) { |
| 142 | + $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START . ')/s'; |
| 143 | + } else { |
| 144 | + $regexp = '/^(.*?)(?=<h[1-6]\b)/s'; |
| 145 | + } |
| | + // The lookahead keeps the heading itself out of the match, so $matches[0] is just the intro |
| 146 | + if ( preg_match( $regexp, $text, $matches ) ) { |
| 148 | + $text = $matches[0]; |
| 149 | + } |
| 150 | + return $text; |
| 151 | + } |
| 152 | + |
102 | 153 | /** |
103 | | - * Returns HTML of page's zeroth section |
| 154 | + * Returns page HTML |
104 | 155 | * @param WikiPage $page |
105 | 156 | * @return string |
106 | 157 | */ |
— | — | @@ -113,20 +164,23 @@ |
114 | 165 | $pout = ParserCache::singleton()->get( $page, $this->parserOptions ); |
115 | 166 | if ( $pout ) { |
116 | 167 | $text = $pout->getText(); |
117 | | - $s = preg_replace( '/<h[1-6].*$/s', '', $text ); |
| 168 | + if ( $this->params['intro'] ) { |
| 169 | + $text = $this->getFirstSection( $text, false ); |
| 170 | + } |
118 | 171 | wfProfileOut( __METHOD__ ); |
119 | | - return $s; |
| 172 | + return $text; |
120 | 173 | } |
121 | 174 | } |
| 175 | + $request = array( |
| 176 | + 'action' => 'parse', |
| 177 | + 'page' => $page->getTitle()->getPrefixedText(), |
| 178 | + 'prop' => 'text' |
| 179 | + ); |
| 180 | + if ( $this->params['intro'] ) { |
| 181 | + $request['section'] = 0; |
| 182 | + } |
122 | 183 | // in case of cache miss, render just the needed section |
123 | | - $api = new ApiMain( new FauxRequest( |
124 | | - array( |
125 | | - 'action' => 'parse', |
126 | | - 'page' => $page->getTitle()->getPrefixedText(), |
127 | | - 'section' => 0, |
128 | | - 'prop' => 'text' |
129 | | - ) ) |
130 | | - ); |
| 184 | + $api = new ApiMain( new FauxRequest( $request ) ); |
131 | 185 | $api->execute(); |
132 | 186 | $data = $api->getResultData(); |
133 | 187 | wfProfileOut( __METHOD__ ); |
— | — | @@ -140,23 +194,11 @@ |
141 | 195 | * @param bool $plainText |
142 | 196 | * @return string |
143 | 197 | */ |
144 | | - private function convertText( $text, Title $title, $plainText ) { |
| 198 | + private function convertText( $text ) { |
145 | 199 | wfProfileIn( __METHOD__ ); |
146 | | - $fmt = new HtmlFormatter( HtmlFormatter::wrapHTML( $text, false ), $title, 'XHTML' ); |
147 | | - $fmt->removeImages(); |
148 | | - $fmt->remove( array( 'table', 'div', 'sup.reference', 'span.coordinates', |
149 | | - 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' ) |
150 | | - ); |
151 | | - if ( $plainText ) { |
152 | | - $fmt->flattenAllTags(); |
153 | | - } else { |
154 | | - $fmt->flatten( array( 'span', 'a' ) ); |
155 | | - } |
156 | | - $fmt->filterContent(); |
| | + // ExtractFormatter's constructor sets up the image/element removal and tag flattening; |
| | + // plaintext mode and section format are taken from the request parameters |
| 200 | + $fmt = new ExtractFormatter( $text, $this->params['plaintext'], $this->params['sectionformat'] ); |
| | + // getText() runs filterContent() and, in plaintext mode, decodes entities and formats section headings |
157 | 201 | $text = $fmt->getText(); |
158 | | - if ( $plainText ) { |
159 | | - $text = html_entity_decode( $text ); |
160 | | - } |
| 202 | + |
161 | 203 | wfProfileOut( __METHOD__ ); |
162 | 204 | return trim( $text ); |
163 | 205 | } |
— | — | @@ -202,7 +244,12 @@ |
203 | 245 | ApiBase::PARAM_MAX => 20, |
204 | 246 | ApiBase::PARAM_MAX2 => 20, |
205 | 247 | ), |
| 248 | + 'intro' => false, |
206 | 249 | 'plaintext' => false, |
| 250 | + 'sectionformat' => array( |
| 251 | + ApiBase::PARAM_TYPE => ExtractFormatter::$sectionFormats, |
| 252 | + ApiBase::PARAM_DFLT => 'wiki', |
| 253 | + ), |
207 | 254 | 'continue' => array( |
208 | 255 | ApiBase::PARAM_TYPE => 'integer', |
209 | 256 | ), |
— | — | @@ -212,14 +259,21 @@ |
| | + /** |
| | + * Returns the help text for each parameter of this module, keyed by parameter name |
| | + * @return array |
| | + */ |
213 | 260 | public function getParamDescription() { |
214 | 261 | return array( |
215 | 262 | 'length' => 'How many characters to return, actual text returned might be slightly longer.', |
216 | | - 'limit' => 'How many excerpts to return', |
217 | | - 'plaintext' => 'Return excerpts as plaintext instead of limited HTML', |
| 263 | + 'limit' => 'How many extracts to return. Requests for whole-article extracts (without exintro) are limited to 1', |
| 264 | + 'intro' => 'Return only content before the first section', |
| 265 | + 'plaintext' => 'Return extracts as plaintext instead of limited HTML', |
| 266 | + 'sectionformat' => array( |
| 267 | + 'How to format sections in plaintext mode:', |
| 268 | + ' none - No formatting', |
| 269 | + ' wiki - Wikitext-style formatting == like this ==', |
| 270 | + " raw - Return in this module's internal representation (section titles prefixed with <ASCII 1><ASCII 2><section level><ASCII 2><ASCII 1>)", |
| 271 | + ), |
218 | 272 | 'continue' => 'When more results are available, use this to continue', |
219 | 273 | ); |
220 | 274 | } |
221 | 275 | |
| | + /** |
| | + * Returns the one-line description of this module |
| | + * (presumably shown in the generated API help -- confirm against ApiBase) |
| | + * @return string |
| | + */ |
222 | 276 | public function getDescription() { |
223 | | - return 'Returns excerpts of the given page(s)'; |
| 277 | + return 'Returns plain-text or limited HTML extracts of the given page(s)'; |
224 | 278 | } |
225 | 279 | |
226 | 280 | public function getPossibleErrors() { |
— | — | @@ -230,7 +284,7 @@ |
231 | 285 | |
| | + /** |
| | + * Returns example requests for this module, as a map of request URL to its explanation |
| | + * @return array |
| | + */ |
232 | 286 | public function getExamples() { |
233 | 287 | return array( |
234 | | - 'api.php?action=query&prop=excerpts&exlength=175&titles=Therion' => 'Get a 175-character excerpt', |
| 288 | + 'api.php?action=query&prop=extracts&exlength=175&titles=Therion' => 'Get a 175-character extract', |
235 | 289 | ); |
236 | 290 | } |
237 | 291 | |
— | — | @@ -244,4 +298,72 @@ |
245 | 299 | } |
246 | 300 | } |
247 | 301 | |
| | +/** |
| | + * Turns page HTML into an extract: either limited HTML, or plain text in which |
| | + * section headings are rendered according to the requested section format. |
| | + */ |
| 302 | +class ExtractFormatter extends HtmlFormatter { |
| 303 | + private $plainText; |
| 304 | + private $sectionFormat; |
248 | 305 | |
| | + /** Section formats accepted for plaintext output */ |
| 306 | + public static $sectionFormats = array( |
| 307 | + 'none', |
| 308 | + 'wiki', |
| 309 | + 'raw', |
| 310 | + ); |
| 311 | + |
| | + /** |
| | + * @param string $text HTML to make the extract from |
| | + * @param bool $plainText Whether to produce plain text instead of limited HTML |
| | + * @param string $sectionFormat One of self::$sectionFormats, used in plaintext mode only |
| | + */ |
| 312 | + public function __construct( $text, $plainText, $sectionFormat ) { |
| 313 | + parent::__construct( HtmlFormatter::wrapHTML( $text ) ); |
| 314 | + $this->plainText = $plainText; |
| 315 | + $this->sectionFormat = $sectionFormat; |
| 316 | + |
| 317 | + $this->removeImages(); |
| 318 | + $this->remove( array( 'table', 'div', '.editsection', 'sup.reference', 'span.coordinates', |
| 319 | + 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' ) |
| 320 | + ); |
| 321 | + if ( $plainText ) { |
| 322 | + $this->flattenAllTags(); |
| 323 | + } else { |
| 324 | + $this->flatten( array( 'span', 'a' ) ); |
| 325 | + } |
| 326 | + } |
| 327 | + |
| | + /** |
| | + * Returns the extract text. In plaintext mode entities are decoded, line breaks |
| | + * are normalised and section markers are replaced according to the section format. |
| | + * @param mixed $dummy Unused |
| | + * @return string |
| | + */ |
| 328 | + public function getText( $dummy = null ) { |
| 329 | + $this->filterContent(); |
| 330 | + $text = parent::getText(); |
| 331 | + if ( $this->plainText ) { |
| | + // Decode explicitly as UTF-8 (the default charset was ISO-8859-1 before PHP 5.4, |
| | + // which yields invalid UTF-8 for e.g. &nbsp;) and decode single-quote entities too |
| 332 | + $text = html_entity_decode( $text, ENT_QUOTES, 'UTF-8' ); |
| 333 | + $text = str_replace( "\r", "\n", $text ); |
| | + // Collapse runs of blank lines into a single blank line |
| 334 | + $text = preg_replace( "/\n{3,}/", "\n\n", $text ); |
| | + // A heading is <marker start><level><marker end> followed by the title up to the end of the line |
| 335 | + $text = preg_replace_callback( |
| 336 | + "/" . ApiQueryExtracts::SECTION_MARKER_START . '(\d)'. ApiQueryExtracts::SECTION_MARKER_END . "(.*?)$/m", |
| 337 | + array( $this, 'sectionCallback' ), |
| 338 | + $text |
| 339 | + ); |
| 340 | + } |
| 341 | + return $text; |
| 342 | + } |
| 343 | + |
| | + /** |
| | + * In plaintext mode, puts a section marker carrying the heading level in front of every |
| | + * <h1>..<h6> tag (dropping whitespace before it) so that headings can still be found in the text. |
| | + * @param string $html |
| | + * @return string |
| | + */ |
| 344 | + public function onHtmlReady( $html ) { |
| 345 | + if ( $this->plainText ) { |
| 346 | + $html = preg_replace( '/\s*(<h([1-6])\b)/i', |
| 347 | + ApiQueryExtracts::SECTION_MARKER_START . '$2' . ApiQueryExtracts::SECTION_MARKER_END . '$1' , |
| 348 | + $html |
| 349 | + ); |
| 350 | + } |
| 351 | + return $html; |
| 352 | + } |
| 353 | + |
| | + /** |
| | + * preg_replace_callback() handler that renders one marked section heading |
| | + * @param array $matches 0: whole match, 1: heading level, 2: heading text |
| | + * @return string |
| | + */ |
| 354 | + private function sectionCallback( $matches ) { |
| | + $level = intval( $matches[1] ); |
| | + $title = trim( $matches[2] ); |
| | + // Explicit dispatch instead of building a function name from the format string; |
| | + // an unexpected format falls back to plain heading text instead of a failed call |
| | + switch ( $this->sectionFormat ) { |
| | + case 'raw': |
| | + return $matches[0]; |
| | + case 'wiki': |
| | + return self::doSection_wiki( $level, $title ); |
| | + default: |
| | + return self::doSection_none( $level, $title ); |
| | + } |
| 360 | + } |
| 361 | + |
| | + /** |
| | + * Wikitext-style heading: the title wrapped in as many '=' as the heading level |
| | + * @param int $level |
| | + * @param string $text |
| | + * @return string |
| | + */ |
| 362 | + private static function doSection_wiki( $level, $text ) { |
| 363 | + $bars = str_repeat( '=', $level ); |
| 364 | + return "\n$bars $text $bars"; |
| 365 | + } |
| 366 | + |
| | + /** |
| | + * Unformatted heading: just the title on its own line |
| | + * @param int $level Unused |
| | + * @param string $text |
| | + * @return string |
| | + */ |
| 367 | + private static function doSection_none( $level, $text ) { |
| 368 | + return "\n$text"; |
| 369 | + } |
| 370 | +} |
\ No newline at end of file |
Index: trunk/extensions/MobileFrontend/MobileFrontend.php |
— | — | @@ -52,7 +52,7 @@ |
53 | 53 | |
54 | 54 | 'ApiMobileView' => 'api/ApiMobileView', |
55 | 55 | 'ApiParseExtender' => 'api/ApiParseExtender', |
56 | | - 'ApiQueryExcerpts' => 'api/ApiQueryExcerpts', |
| 56 | + 'ApiQueryExtracts' => 'api/ApiQueryExcerpts', |
57 | 57 | |
58 | 58 | 'MobileFrontendTemplate' => 'templates/MobileFrontendTemplate', |
59 | 59 | 'ApplicationTemplate' => 'templates/ApplicationTemplate', |
— | — | @@ -125,7 +125,7 @@ |
126 | 126 | |
127 | 127 | $wgExtensionFunctions[] = 'efMobileFrontend_Setup'; |
128 | 128 | |
129 | | -$wgAPIPropModules['excerpts'] = 'ApiQueryExcerpts'; |
| 129 | +$wgAPIPropModules['extracts'] = 'ApiQueryExtracts'; |
130 | 130 | $wgAPIModules['mobileview'] = 'ApiMobileView'; |
131 | 131 | |
132 | 132 | $wgHooks['APIGetAllowedParams'][] = 'ApiParseExtender::onAPIGetAllowedParams'; |