r82550 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r82549‎ | r82550 | r82551 >
Date:14:17, 21 February 2011
Author:tstarling
Status:ok
Tags:
Comment:
Rewrote LanguageConverter::autoConvert() to make it use preg_match() with an offset instead of preg_split(). Reduces memory usage for my test case ([[台灣演員列表]]) to a negligible amount. This should eliminate the most common cause of OOMs on Wikimedia. Produces the exact same output for that test case for the zh -> zh-tw, parser tests pass, seems to work.
Modified paths:
  • /trunk/phase3/languages/LanguageConverter.php (modified) (history)

Diff [purge]

Index: trunk/phase3/languages/LanguageConverter.php
@@ -300,34 +300,6 @@
301301 }
302302
303303 /**
304 - * Caption convert, base on preg_replace_callback.
305 - *
306 - * To convert text in "title" or "alt", like '<img alt="text" ... '
307 - * or '<span title="text" ... '
308 - *
309 - * @return String like ' alt="yyyy"' or ' title="yyyy"'
310 - */
311 - protected function captionConvert( $matches ) {
312 - // TODO: cache the preferred variant in every autoConvert() process,
313 - // this helps improve performance in a way.
314 - $toVariant = $this->getPreferredVariant();
315 - $title = $matches[1];
316 - $text = $matches[2];
317 -
318 - // we convert captions except URL
319 - if ( !strpos( $text, '://' ) ) {
320 - $text = $this->translate( $text, $toVariant );
321 - }
322 -
323 - // remove HTML tags to prevent disrupting the layout
324 - $text = preg_replace( '/<[^>]+>/', '', $text );
325 - // escape HTML special chars to prevent disrupting the layout
326 - $text = htmlspecialchars( $text );
327 -
328 - return " {$title}=\"{$text}\"";
329 - }
330 -
331 - /**
332304 * Dictionary-based conversion.
333305 * This function would not parse the conversion rules.
334306 * If you want to parse rules, try to use convert() or
@@ -374,41 +346,75 @@
375347
376348 $reg = '/' . $codefix . $scriptfix . $prefix .
377349 '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
 350+ $startPos = 0;
 351+ $sourceBlob = '';
 352+ $literalBlob = '';
378353
379 - $matches = preg_split( $reg, $text, - 1, PREG_SPLIT_OFFSET_CAPTURE );
 354+ // Guard against delimiter nulls in the input
 355+ $text = str_replace( "\000", '', $text );
380356
381 - $m = array_shift( $matches );
 357+ while ( $startPos < strlen( $text ) ) {
 358+ if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
 359+ $elementPos = $markupMatches[0][1];
 360+ $element = $markupMatches[0][0];
 361+ } else {
 362+ $elementPos = strlen( $text );
 363+ $element = '';
 364+ }
382365
383 - $ret = $this->translate( $m[0], $toVariant );
384 - $mstart = $m[1] + strlen( $m[0] );
 366+ // Queue the part before the markup for translation in a batch
 367+ $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
385368
386 - // enable convertsion of '<img alt="xxxx" ... '
387 - // or '<span title="xxxx" ... '
388 - $captionpattern = '/\s(title|alt)\s*=\s*"([\s\S]*?)"/';
 369+ // Advance to the next position
 370+ $startPos = $elementPos + strlen( $element );
389371
390 - $trtext = '';
391 - $trtextmark = "\0";
392 - $notrtext = array();
393 - foreach ( $matches as $m ) {
394 - $mark = substr( $text, $mstart, $m[1] - $mstart );
395 - $mark = preg_replace_callback( $captionpattern,
396 - array( &$this, 'captionConvert' ),
397 - $mark );
398 - // Let's convert the trtext only once,
399 - // it would give us more performance improvement
400 - $notrtext[] = $mark;
401 - $trtext .= $m[0] . $trtextmark;
402 - $mstart = $m[1] + strlen( $m[0] );
 372+ // Translate any alt or title attributes inside the matched element
 373+ if ( $element !== '' && preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element,
 374+ $elementMatches ) )
 375+ {
 376+ $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
 377+ $changed = false;
 378+ foreach ( array( 'title', 'alt' ) as $attrName ) {
 379+ if ( !isset( $attrs[$attrName] ) ) {
 380+ continue;
 381+ }
 382+ $attr = $attrs[$attrName];
 383+ // Don't convert URLs
 384+ if ( !strpos( $attr, '://' ) ) {
 385+ $attr = $this->translate( $attr, $toVariant );
 386+ }
 387+
 388+ // Remove HTML tags to avoid disrupting the layout
 389+ $attr = preg_replace( '/<[^>]+>/', '', $attr );
 390+ if ( $attr !== $attrs[$attrName] ) {
 391+ $attrs[$attrName] = $attr;
 392+ $changed = true;
 393+ }
 394+ }
 395+ if ( $changed ) {
 396+ $element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
 397+ $elementMatches[3];
 398+ }
 399+ }
 400+ $literalBlob .= $element . "\000";
403401 }
404 - $notrtext[] = '';
405 - $trtext = $this->translate( $trtext, $toVariant );
406 - $trtext = StringUtils::explode( $trtextmark, $trtext );
407 - foreach ( $trtext as $t ) {
408 - $ret .= array_shift( $notrtext );
409 - $ret .= $t;
 402+
 403+ // Do the main translation batch
 404+ $translatedBlob = $this->translate( $sourceBlob, $toVariant );
 405+
 406+ // Put the output back together
 407+ $translatedIter = StringUtils::explode( "\000", $translatedBlob );
 408+ $literalIter = StringUtils::explode( "\000", $literalBlob );
 409+ $output = '';
 410+ while ( $translatedIter->valid() && $literalIter->valid() ) {
 411+ $output .= $translatedIter->current();
 412+ $output .= $literalIter->current();
 413+ $translatedIter->next();
 414+ $literalIter->next();
410415 }
 416+
411417 wfProfileOut( __METHOD__ );
412 - return $ret;
 418+ return $output;
413419 }
414420
415421 /**

Follow-up revisions

RevisionCommit summaryAuthorDate
r825761.17wmf1: MFT r82475, r82480, r82538, r82547, r82550, r82552, r82554, r82555,...catrope21:37, 21 February 2011
r82673Followup r82550: define variables passed by reference to preg_match() beforeh...catrope15:54, 23 February 2011
r85354MFT r82518, r82530, r82538, r82547, r82550, r82565, r82572, r82608, r82696, r...demon18:25, 4 April 2011

Status & tagging log