r61554 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r61553‎ | r61554 | r61555 >
Date:02:55, 27 January 2010
Author:tstarling
Status:ok
Tags:
Comment:
Merged in local changes, by reverse merging r61551 except without the unrelated style changes throughout Parser.php.
Modified paths:
  • /branches/platonides/phase3/RELEASE-NOTES (modified) (history)
  • /branches/platonides/phase3/includes/StringUtils.php (modified) (history)
  • /branches/platonides/phase3/includes/parser/Parser.php (modified) (history)
  • /branches/platonides/phase3/maintenance/parserTests.txt (modified) (history)
  • /branches/platonides/phase3/tests/preg_split_test.php (added) (history)

Diff [purge]

Index: branches/platonides/phase3/maintenance/parserTests.txt
@@ -116,7 +116,7 @@
117117 </li><li> plain<b><i>bold-italic</i>bold</b>plain
118118 </li><li> plain<i>italic<b>bold-italic</b></i>plain
119119 </li><li> plain<b>bold<i>bold-italic</i></b>plain
120 -</li><li> plain l'<i>italic</i>plain
 120+</li><li> plain l&#39;<i>italic</i>plain
121121 </li><li> plain l'<b>bold</b> plain
122122 </li></ul>
123123
@@ -5253,19 +5253,17 @@
52545254 </p>
52555255 !! end
52565256
5257 -# Original result was this:
5258 -# <p><b>bold</b><b>bold<i>bolditalics</i></b>
 5257+# This was the original html, but it has also been
 5258+# <p>'<i>bold'</i><b>bold<i>bolditalics</i></b>
52595259 # </p>
5260 -# While that might be marginally more intuitive, maybe, the six-apostrophe
5261 -# construct is clearly pathological and the result stated here (which is what
5262 -# the parser actually does) is about as reasonable as anything.
 5260+# See bug 18765.
52635261 !!test
52645262 Mixing markup for italics and bold
52655263 !! options
52665264 !! input
52675265 '''bold''''''bold''bolditalics'''''
52685266 !! result
5269 -<p>'<i>bold'</i><b>bold<i>bolditalics</i></b>
 5267+<p><b>bold</b><b>bold<i>bolditalics</i></b>
52705268 </p>
52715269 !! end
52725270
@@ -6417,7 +6415,7 @@
64186416 !! input
64196417 ''' ''x'
64206418 !! result
6421 -<pre>'<i> </i>x'
 6419+<pre>&#39;<i> </i>x'
64226420 </pre>
64236421 !!end
64246422
@@ -7560,6 +7558,82 @@
75617559 <a href="https://www.mediawiki.org/wiki/Main_Page#section" title="Main Page">#section</a>
75627560 !! end
75637561
 7562+!! test
 7563+Bold/italic markup handled differently depending on leading whitespace (bug 18765)
 7564+!!input
 7565+'''Look at ''this edit'''s complicated bold/italic markup!'''
 7566+
 7567+<!-- Comment -->'''Look at ''this edit'''s complicated bold/italic markup!'''
 7568+
 7569+<span> '''Look at ''this edit'''s complicated bold/italic markup!'''</span>
 7570+
 7571+<nowiki></nowiki> '''Look at ''this edit'''s complicated bold/italic markup!'''
 7572+
 7573+<!-- Hello world---> '''Look at ''this edit'''s complicated bold/italic markup!'''
 7574+
 7575+{|
 7576+| '''Look at ''this edit'''s complicated bold/italic markup!'''
 7577+|}
 7578+
 7579+'''This was Italic'' this was plain''' and this was bold'''
 7580+but '''This is bold'' this is bold italic''' and this is bold'''
 7581+
 7582+<!-- Wishlist: Breaking because <span> and | are treated as text
 7583+<span>'''Look at ''this edit'''s complicated bold/italic markup!'''</span>
 7584+{|
 7585+|'''Look at ''this edit'''s complicated bold/italic markup!'''
 7586+|}
 7587+-->
 7588+!! result
 7589+<p><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 7590+</p><p><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 7591+</p><p><span> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b></span>
 7592+</p><p> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 7593+</p>
 7594+<pre><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 7595+</pre>
 7596+<table>
 7597+<tr>
 7598+<td> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 7599+</td></tr></table>
 7600+<p><b>This was Italic<i> this was plain&#39;</i> and this was bold</b>
 7601+but <b>This is bold<i> this is bold italic&#39;</i> and this is bold</b>
 7602+</p><p><br />
 7603+</p>
 7604+!! end
 7605+
 7606+!! test
 7607+Six quotes
 7608+!!input
 7609+''Italic''''''Bold
 7610+
 7611+'''Bold''BoldItalic''''''Normal
 7612+
 7613+''Italic'''BoldItalic''''''Normal'''''
 7614+
 7615+'''''BoldItalic''''''MoreBoldItalic''
 7616+
 7617+''''''Normal
 7618+!!result
 7619+<p><i>Italic'</i><b>Bold</b>
 7620+</p><p><b>Bold<i>BoldItalic'</i></b>Normal
 7621+</p><p><i>Italic<b>BoldItalic'</b></i>Normal
 7622+</p><p><i><b>BoldItalic</b><b>MoreBoldItalic</b></i>
 7623+</p><p>Normal
 7624+</p>
 7625+!!end
 7626+
 7627+
 7628+!! test
 7629+Too many quotes
 7630+!!input
 7631+I '''like'''''quotes'''''''''''
 7632+!! result
 7633+<p>I <b>like</b><i>quotes''''''</i><b> </b>
 7634+</p>
 7635+!! end
 7636+
 7637+
75647638 Note: some elements used in these Microdata examples don't work, like <img>
75657639 and <time>.
75667640 !! test
Index: branches/platonides/phase3/tests/preg_split_test.php
@@ -0,0 +1,24 @@
 2+<?php
 3+include "../includes/StringUtils.php";
 4+
 5+$pattern = "/('')+/";
 6+$subject = str_repeat("'' ", 1024*1024 + 7);
 7+
 8+$m = memory_get_usage();
 9+
 10+$ps1 = preg_split($pattern, $subject);
 11+
 12+$r = "";
 13+foreach ($ps1 as $c) {
 14+ $r .= $c . "|";
 15+}
 16+echo "Original preg_split: " . md5($r) . " " . (memory_get_usage()-$m) . "\n";
 17+
 18+unset($ps1);
 19+
 20+$r = "";
 21+$ps2 = StringUtils::preg_split($pattern, $subject);
 22+foreach ($ps2 as $c) {
 23+ $r .= $c . "|";
 24+}
 25+echo "StringUtils preg_split: " . md5($r) . " " . (memory_get_usage()-$m) . "\n";
Index: branches/platonides/phase3/includes/parser/Parser.php
@@ -1108,100 +1108,59 @@
11091109 }
11101110
11111111 /**
 1112+ * Processes bolds and italics on a single line.
11121113 * Helper function for doAllQuotes()
11131114 */
11141115 public function doQuotes( $text ) {
1115 - $arr = preg_split( "/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
1116 - if ( count( $arr ) == 1 )
 1116+ # Counts the number of occurrences of bold and italics mark-ups.
 1117+ self::countBoldAndItalic($text, $numbold, $numitalics);
 1118+
 1119+ if ( ( $numbold == 0 ) && ( $numitalics == 0 ) )
11171120 return $text;
11181121 else
11191122 {
1120 - # First, do some preliminary work. This may shift some apostrophes from
1121 - # being mark-up to being text. It also counts the number of occurrences
1122 - # of bold and italics mark-ups.
1123 - $i = 0;
1124 - $numbold = 0;
1125 - $numitalics = 0;
1126 - foreach ( $arr as $r )
1127 - {
1128 - if ( ( $i % 2 ) == 1 )
1129 - {
1130 - # If there are ever four apostrophes, assume the first is supposed to
1131 - # be text, and the remaining three constitute mark-up for bold text.
1132 - if ( strlen( $arr[$i] ) == 4 )
1133 - {
1134 - $arr[$i-1] .= "'";
1135 - $arr[$i] = "'''";
1136 - }
1137 - # If there are more than 5 apostrophes in a row, assume they're all
1138 - # text except for the last 5.
1139 - else if ( strlen( $arr[$i] ) > 5 )
1140 - {
1141 - $arr[$i-1] .= str_repeat( "'", strlen( $arr[$i] ) - 5 );
1142 - $arr[$i] = "'''''";
1143 - }
1144 - # Count the number of occurrences of bold and italics mark-ups.
1145 - # We are not counting sequences of five apostrophes.
1146 - if ( strlen( $arr[$i] ) == 2 ) { $numitalics++; }
1147 - else if ( strlen( $arr[$i] ) == 3 ) { $numbold++; }
1148 - else if ( strlen( $arr[$i] ) == 5 ) { $numitalics++; $numbold++; }
1149 - }
1150 - $i++;
1151 - }
1152 -
11531123 # If there is an odd number of both bold and italics, it is likely
11541124 # that one of the bold ones was meant to be an apostrophe followed
11551125 # by italics. Which one we cannot know for certain, but it is more
11561126 # likely to be one that has a single-letter word before it.
11571127 if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) )
11581128 {
1159 - $i = 0;
1160 - $firstsingleletterword = -1;
1161 - $firstmultiletterword = -1;
1162 - $firstspace = -1;
1163 - foreach ( $arr as $r )
1164 - {
1165 - if ( ( $i % 2 == 1 ) and ( strlen( $r ) == 3 ) )
1166 - {
1167 - $x1 = substr ($arr[$i-1], -1);
1168 - $x2 = substr ($arr[$i-1], -2, 1);
1169 - if ($x1 === ' ') {
1170 - if ($firstspace == -1) $firstspace = $i;
1171 - } else if ($x2 === ' ') {
1172 - if ($firstsingleletterword == -1) $firstsingleletterword = $i;
1173 - } else {
1174 - if ($firstmultiletterword == -1) $firstmultiletterword = $i;
1175 - }
1176 - }
1177 - $i++;
1178 - }
11791129
1180 - # If there is a single-letter word, use it!
1181 - if ($firstsingleletterword > -1)
1182 - {
1183 - $arr [ $firstsingleletterword ] = "''";
1184 - $arr [ $firstsingleletterword-1 ] .= "'";
 1130+ # This algorithm moves the literal quote at the
 1131+ # right of a single word, at the right of a
 1132+ # multiletter word or at the right of a space.
 1133+ # Otherwise, it does nothing.
 1134+ #
 1135+ # The original if-based version can be found at
 1136+ # http://svn.wikimedia.org/viewvc/mediawiki/trunk/phase3/includes/parser/Parser.php?revision=61519&view=markup
 1137+ #
 1138+ # Unlike the original one, here we convert the
 1139+ # texty quotes to &#39; which shouldn't matter.
 1140+
 1141+ $quoteBalancerReplacements = array(
 1142+ "/(?<= [^ ])'''(?!')/"=>"&#39;''",
 1143+ "/(?<=[^ '])'''(?!')/"=>"&#39;''",
 1144+ "/(^|(?<=[^'])) '''(?!')/"=>" &#39;''");
 1145+
 1146+ foreach( $quoteBalancerReplacements as $k => $v) {
 1147+ $text = preg_replace($k, $v, $text, 1, $count);
 1148+ if ($count != 0)
 1149+ break;
11851150 }
1186 - # If not, but there's a multi-letter word, use that one.
1187 - else if ($firstmultiletterword > -1)
1188 - {
1189 - $arr [ $firstmultiletterword ] = "''";
1190 - $arr [ $firstmultiletterword-1 ] .= "'";
1191 - }
1192 - # ... otherwise use the first one that has neither.
1193 - # (notice that it is possible for all three to be -1 if, for example,
1194 - # there is only one pentuple-apostrophe in the line)
1195 - else if ($firstspace > -1)
1196 - {
1197 - $arr [ $firstspace ] = "''";
1198 - $arr [ $firstspace-1 ] .= "'";
1199 - }
12001151 }
12011152
 1153+ # Split in groups of 2, 3, 5 or 6 apostrophes.
 1154+ # If there are ever four apostrophes, assume the first is supposed to
 1155+ # be text, and the remaining three constitute mark-up for bold text.
 1156+ # If there are more than 6 apostrophes in a row, assume they're all
 1157+ # text except for the last 6.
 1158+ $arr = Stringutils::preg_split( "/('{2,3}(?:''')?)(?!')/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
 1159+
 1160+
12021161 # Now let's actually convert our apostrophic mush to HTML!
1203 - $output = '';
1204 - $buffer = '';
1205 - $state = '';
 1162+ $output = ''; # Processed text
 1163+ $buffer = ''; # Content if $state is 'both'
 1164+ $state = ''; # Flags with the order of open tags: '|b|i|bi|ib|both'
12061165 $i = 0;
12071166 foreach ($arr as $r)
12081167 {
@@ -1218,43 +1177,58 @@
12191178 {
12201179 if ($state === 'i')
12211180 { $output .= '</i>'; $state = ''; }
1222 - else if ($state === 'bi')
 1181+ elseif ($state === 'bi')
12231182 { $output .= '</i>'; $state = 'b'; }
1224 - else if ($state === 'ib')
 1183+ elseif ($state === 'ib')
12251184 { $output .= '</b></i><b>'; $state = 'b'; }
1226 - else if ($state === 'both')
 1185+ elseif ($state === 'both')
12271186 { $output .= '<b><i>'.$buffer.'</i>'; $state = 'b'; }
12281187 else # $state can be 'b' or ''
12291188 { $output .= '<i>'; $state .= 'i'; }
12301189 }
1231 - else if (strlen ($r) == 3)
 1190+ elseif (strlen ($r) == 3)
12321191 {
12331192 if ($state === 'b')
12341193 { $output .= '</b>'; $state = ''; }
1235 - else if ($state === 'bi')
 1194+ elseif ($state === 'bi')
12361195 { $output .= '</i></b><i>'; $state = 'i'; }
1237 - else if ($state === 'ib')
 1196+ elseif ($state === 'ib')
12381197 { $output .= '</b>'; $state = 'i'; }
1239 - else if ($state === 'both')
 1198+ elseif ($state === 'both')
12401199 { $output .= '<i><b>'.$buffer.'</b>'; $state = 'i'; }
12411200 else # $state can be 'i' or ''
12421201 { $output .= '<b>'; $state .= 'b'; }
12431202 }
1244 - else if (strlen ($r) == 5)
 1203+ elseif (strlen ($r) == 5)
12451204 {
12461205 if ($state === 'b')
12471206 { $output .= '</b><i>'; $state = 'i'; }
1248 - else if ($state === 'i')
 1207+ elseif ($state === 'i')
12491208 { $output .= '</i><b>'; $state = 'b'; }
1250 - else if ($state === 'bi')
 1209+ elseif ($state === 'bi')
12511210 { $output .= '</i></b>'; $state = ''; }
1252 - else if ($state === 'ib')
 1211+ elseif ($state === 'ib')
12531212 { $output .= '</b></i>'; $state = ''; }
1254 - else if ($state === 'both')
 1213+ elseif ($state === 'both')
12551214 { $output .= '<i><b>'.$buffer.'</b></i>'; $state = ''; }
12561215 else # ($state == '')
12571216 { $buffer = ''; $state = 'both'; }
12581217 }
 1218+ elseif (strlen ($r) == 6)
 1219+ {
 1220+ if ($state === 'b')
 1221+ { $output .= '</b><b>'; $state = 'b'; }
 1222+ elseif ($state === 'i')
 1223+ { $output .= '\'</i><b>'; $state = 'b'; }
 1224+ elseif ($state === 'bi')
 1225+ { $output .= '\'</i></b>'; $state = ''; }
 1226+ elseif ($state === 'ib')
 1227+ { $output .= '\'</b></i>'; $state = ''; }
 1228+ elseif ($state === 'both')
 1229+ { $output .= '<i><b>'.$buffer.'</b><b>'; $state = 'ib'; }
 1230+ else # ($state == '')
 1231+ { $buffer = ''; $state = ''; }
 1232+ }
12591233 }
12601234 $i++;
12611235 }
@@ -1273,6 +1247,57 @@
12741248 }
12751249
12761250 /**
 1251+ * Counts the number of bold and italic items from a line of text.
 1252+ * Helper function for doQuotes()
 1253+ */
 1254+ private static function countBoldAndItalic($text, &$numBold, &$numItalics) {
 1255+ $numBold = 0;
 1256+ $numItalics = 0;
 1257+ $offset = 0;
 1258+
 1259+ do {
 1260+ $offset = strpos($text, "'", $offset);
 1261+ if ($offset === false)
 1262+ return;
 1263+
 1264+ $quoteLen = strspn($text, "'", $offset);
 1265+ $offset += $quoteLen;
 1266+
 1267+ switch ($quoteLen) {
 1268+ case 0:
 1269+ case 1:
 1270+ break;
 1271+
 1272+ case 2:
 1273+ $numItalics++;
 1274+ break;
 1275+
 1276+ case 3:
 1277+ $numBold++;
 1278+ break;
 1279+
 1280+ case 4:
 1281+ # If there are ever four apostrophes, assume the first is supposed to
 1282+ # be text, and the remaining three constitute mark-up for bold text.
 1283+ $numBold++;
 1284+ $numItalics++;
 1285+ break;
 1286+
 1287+ case 5:
 1288+ $numItalics++;
 1289+ $numBold++;
 1290+ break;
 1291+
 1292+ case 6:
 1293+ default:
 1294+ # If there are more than 6 apostrophes in a row, assume they're all
 1295+ # text except for the last 6.
 1296+ $numBold+=2;
 1297+ }
 1298+ } while (true);
 1299+ }
 1300+
 1301+ /**
12771302 * Replace external links (REL)
12781303 *
12791304 * Note: this is all very hackish and the order of execution matters a lot.
Index: branches/platonides/phase3/includes/StringUtils.php
@@ -179,6 +179,14 @@
180180 return new ArrayIterator( explode( $separator, $subject ) );
181181 }
182182 }
 183+
 184+ /**
 185+ * Workalike for preg_split() with limited memory usage.
 186+ * Returns an Iterator
 187+ */
 188+ static function preg_split( $pattern, $subject, $limit = -1, $flags = 0 ) {
 189+ return new PregSplitIterator( $pattern, $subject, $limit, $flags );
 190+ }
183191 }
184192
185193 /**
@@ -409,3 +417,82 @@
410418 }
411419 }
412420
 421+
 422+/**
 423+ * An iterator which works exactly like:
 424+ *
 425+ * foreach ( preg_split( $pattern, $s, $limit, $flags ) as $element ) {
 426+ * ...
 427+ * }
 428+ *
 429+ * Except it doesn't use huge amounts of memory when $limit is -1
 430+ *
 431+ * The flag PREG_SPLIT_OFFSET_CAPTURE isn't supported.
 432+ */
 433+class PregSplitIterator implements Iterator {
 434+ // The subject string
 435+ var $pattern, $subject, $originalLimit, $flags;
 436+
 437+ // The last extracted group of items.
 438+ var $smallArray;
 439+
 440+ // The position on the iterator.
 441+ var $curPos;
 442+
 443+ const MAX_LIMIT = 100;
 444+
 445+ /**
 446+ * Construct a PregSplitIterator
 447+ */
 448+ function __construct( $pattern, $s, $limit, $flags) {
 449+ $this->pattern = $pattern;
 450+ $this->subject = $s;
 451+ $this->originalLimit = $limit;
 452+ $this->flags = $flags;
 453+
 454+ $this->rewind();
 455+ }
 456+
 457+ private function effectiveLimit() {
 458+ if ($this->originalLimit == -1) {
 459+ return self::MAX_LIMIT + 1;
 460+ } else if ($this->limit > self::MAX_LIMIT) {
 461+ $this->limit -= self::MAX_LIMIT;
 462+ return self::MAX_LIMIT + 1;
 463+ } else {
 464+ $old = $this->limit;
 465+ $this->limit = 0;
 466+ return $old;
 467+ }
 468+ }
 469+
 470+ function rewind() {
 471+ $this->curPos = 0;
 472+ $this->limit = $this->originalLimit;
 473+ if ($this->limit == -1) $this->limit = self::MAX_LIMIT;
 474+ $this->smallArray = preg_split( $this->pattern, $this->subject, $this->effectiveLimit(), $this->flags);
 475+ }
 476+
 477+ function current() {
 478+ return $this->smallArray[$this->curPos % self::MAX_LIMIT];
 479+ }
 480+
 481+ function key() {
 482+ return $this->curPos;
 483+ }
 484+
 485+ function next() {
 486+ $this->curPos++;
 487+ if ( $this->curPos % self::MAX_LIMIT == 0 ) {
 488+ # Last item contains the rest unsplitted.
 489+ if ($this->limit > 0) {
 490+ $this->smallArray = preg_split( $this->pattern, $this->smallArray[self::MAX_LIMIT], $this->effectiveLimit(), $this->flags);
 491+ }
 492+ }
 493+ return;
 494+ }
 495+
 496+ function valid() {
 497+ return $this->curPos % self::MAX_LIMIT < count($this->smallArray);
 498+ }
 499+}
Index: branches/platonides/phase3/RELEASE-NOTES
@@ -711,6 +711,8 @@
712712 * (bug 9794) User rights log entries for foreign user now links to the foreign
713713 user's page if possible
714714 * (bug 14717) Don't load nonexistent CSS fix files for non-Monobook skins
 715+* (bug 18765) Increased consistency of bold-italic markup for unbalanced quotes.
 716+ Improved representation of six quotes (may break existing markup).
715717 * (bug 22034) Use wfClientAcceptsGzip() in wfGzipHandler instead of
716718 reimplementing it.
717719 * (bug 19226) First line renders differently on many UI messages.

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r61551Revert r61528, r61527, r61526, r61525, r61519, r61515, r61053, r61052 (Parser...tstarling02:41, 27 January 2010

Status & tagging log