r85922 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r85921‎ | r85922 | r85923 >
Date:21:27, 12 April 2011
Author:diebuche
Status:reverted (Comments)
Tags:
Comment:
Modified paths:
  • /trunk/phase3/includes/Sanitizer.php (modified) (history)
  • /trunk/phase3/includes/parser/Parser.php (modified) (history)
  • /trunk/phase3/skins/common/wikibits.js (modified) (history)
  • /trunk/phase3/tests/parser/parserTests.txt (modified) (history)

Diff [purge]

Index: trunk/phase3/skins/common/wikibits.js
@@ -551,14 +551,7 @@
552552 };
553553
554554 window.ts_makeSortable = function( table ) {
555 - var firstRow;
556 - if ( table.rows && table.rows.length > 0 ) {
557 - if ( table.tHead && table.tHead.rows.length > 0 ) {
558 - firstRow = table.tHead.rows[table.tHead.rows.length-1];
559 - } else {
560 - firstRow = table.rows[0];
561 - }
562 - }
 555+ var firstRow = table.rows[0];
563556 if ( !firstRow ) {
564557 return;
565558 }
Index: trunk/phase3/tests/parser/parserTests.txt
@@ -1237,8 +1237,9 @@
12381238 |}
12391239 !! result
12401240 <table>
1241 -<caption> caption
1242 -</caption><tr><td></td></tr></table>
 1241+<caption>caption</caption>
 1242+<tr><td></td></tr>
 1243+</table>
12431244
12441245 !! end
12451246
@@ -1253,13 +1254,14 @@
12541255 !! result
12551256 <table>
12561257 <tr>
1257 -<td> 1 </td>
1258 -<td> 2
1259 -</td></tr>
 1258+<td>1</td>
 1259+<td>2</td>
 1260+</tr>
12601261 <tr>
1261 -<td> 3 </td>
1262 -<td> 4
1263 -</td></tr></table>
 1262+<td>3</td>
 1263+<td>4</td>
 1264+</tr>
 1265+</table>
12641266
12651267 !! end
12661268
@@ -1288,49 +1290,46 @@
12891291 |}
12901292 !! result
12911293 <table border="1" cellpadding="2">
1292 -<caption>Multiplication table
1293 -</caption>
 1294+<caption>Multiplication table</caption>
 1295+<thead>
12941296 <tr>
1295 -<th> &#215; </th>
1296 -<th> 1 </th>
1297 -<th> 2 </th>
1298 -<th> 3
1299 -</th></tr>
 1297+<th>&#215;</th>
 1298+<th>1</th>
 1299+<th>2</th>
 1300+<th>3</th>
 1301+</tr></thead>
 1302+<tbody>
13001303 <tr>
1301 -<th> 1
1302 -</th>
1303 -<td> 1 </td>
1304 -<td> 2 </td>
1305 -<td> 3
1306 -</td></tr>
 1304+<th>1</th>
 1305+<td>1</td>
 1306+<td>2</td>
 1307+<td>3</td>
 1308+</tr>
13071309 <tr>
1308 -<th> 2
1309 -</th>
1310 -<td> 2 </td>
1311 -<td> 4 </td>
1312 -<td> 6
1313 -</td></tr>
 1310+<th>2</th>
 1311+<td>2</td>
 1312+<td>4</td>
 1313+<td>6</td>
 1314+</tr>
13141315 <tr>
1315 -<th> 3
1316 -</th>
1317 -<td> 3 </td>
1318 -<td> 6 </td>
1319 -<td> 9
1320 -</td></tr>
 1316+<th>3</th>
 1317+<td>3</td>
 1318+<td>6</td>
 1319+<td>9</td>
 1320+</tr>
13211321 <tr>
1322 -<th> 4
1323 -</th>
1324 -<td> 4 </td>
1325 -<td> 8 </td>
1326 -<td> 12
1327 -</td></tr>
 1322+<th>4</th>
 1323+<td>4</td>
 1324+<td>8</td>
 1325+<td>12</td>
 1326+</tr>
13281327 <tr>
1329 -<th> 5
1330 -</th>
1331 -<td> 5 </td>
1332 -<td> 10 </td>
1333 -<td> 15
1334 -</td></tr></table>
 1328+<th>5</th>
 1329+<td>5</td>
 1330+<td>10</td>
 1331+<td>15</td>
 1332+</tr></tbody>
 1333+</table>
13351334
13361335 !! end
13371336
@@ -1348,17 +1347,15 @@
13491348 !! result
13501349 <table align="right" border="1">
13511350 <tr>
1352 -<td> Cell 1, row 1
1353 -</td>
1354 -<td rowspan="2"> Cell 2, row 1 (and 2)
1355 -</td>
1356 -<td> Cell 3, row 1
1357 -</td></tr>
 1351+<td>Cell 1, row 1</td>
 1352+<td rowspan="2">Cell 2, row 1 (and 2)</td>
 1353+<td>Cell 3, row 1</td>
 1354+</tr>
13581355 <tr>
1359 -<td> Cell 1, row 2
1360 -</td>
1361 -<td> Cell 3, row 2
1362 -</td></tr></table>
 1356+<td>Cell 1, row 2</td>
 1357+<td>Cell 3, row 2</td>
 1358+</tr>
 1359+</table>
13631360
13641361 !! end
13651362
@@ -1378,19 +1375,19 @@
13791376 !! result
13801377 <table border="1">
13811378 <tr>
1382 -<td> &#945;
1383 -</td>
 1379+<td>&#945;</td>
13841380 <td>
13851381 <table bgcolor="#ABCDEF" border="2">
13861382 <tr>
1387 -<td>nested
1388 -</td></tr>
 1383+<td>nested</td>
 1384+</tr>
13891385 <tr>
1390 -<td>table
1391 -</td></tr></table>
1392 -</td>
1393 -<td>the original table again
1394 -</td></tr></table>
 1386+<td>table</td>
 1387+</tr>
 1388+</table></td>
 1389+<td>the original table again</td>
 1390+</tr>
 1391+</table>
13951392
13961393 !! end
13971394
@@ -1403,8 +1400,9 @@
14041401 !! result
14051402 <table>
14061403 <tr>
1407 -<td>broken
1408 -</td></tr></table>
 1404+<td>broken</td>
 1405+</tr>
 1406+</table>
14091407
14101408 !! end
14111409
@@ -1418,8 +1416,7 @@
14191417 <table>
14201418 <tr>
14211419 <td>[<a rel="nofollow" class="external free" href="ftp://%7Cx">ftp://%7Cx</a></td>
1422 -<td>]" onmouseover="alert(document.cookie)"&gt;test
1423 -</td>
 1420+<td>]" onmouseover="alert(document.cookie)"&gt;test</td>
14241421 </tr>
14251422 </table>
14261423
@@ -2685,8 +2682,9 @@
26862683 !! result
26872684 <table>
26882685 <tr>
2689 -<td>[[{{{1}}}|{{{2}}}]]
2690 -</td></tr></table>
 2686+<td>[[{{{1}}}|{{{2}}}]]</td>
 2687+</tr>
 2688+</table>
26912689
26922690 !! end
26932691
@@ -2795,13 +2793,14 @@
27962794 </p>
27972795 <table>
27982796 <tr>
2799 -<td> 1 </td>
2800 -<td> 2
2801 -</td></tr>
 2797+<td>1</td>
 2798+<td>2</td>
 2799+</tr>
28022800 <tr>
2803 -<td> 3 </td>
2804 -<td> 4
2805 -</td></tr></table>
 2801+<td>3</td>
 2802+<td>4</td>
 2803+</tr>
 2804+</table>
28062805
28072806 !! end
28082807
@@ -2815,13 +2814,14 @@
28162815 </p>
28172816 <table>
28182817 <tr>
2819 -<td> 1 </td>
2820 -<td> 2
2821 -</td></tr>
 2818+<td>1</td>
 2819+<td>2</td>
 2820+</tr>
28222821 <tr>
2823 -<td> 3 </td>
2824 -<td> 4
2825 -</td></tr></table>
 2822+<td>3</td>
 2823+<td>4</td>
 2824+</tr>
 2825+</table>
28262826
28272827 !! end
28282828
@@ -4370,8 +4370,9 @@
43714371 !! result
43724372 <table>
43734373 <tr>
4374 -<th class="awesome"> status
4375 -</th></tr></table>
 4374+<th class="awesome">status</th>
 4375+</tr>
 4376+</table>
43764377
43774378 !!end
43784379
@@ -4815,8 +4816,9 @@
48164817 !! result
48174818 <table>
48184819 <tr>
4819 -<th style="color:blue"> status
4820 -</th></tr></table>
 4820+<th style="color:blue">status</th>
 4821+</tr>
 4822+</table>
48214823
48224824 !!end
48234825
@@ -4829,8 +4831,9 @@
48304832 !! result
48314833 <table>
48324834 <tr>
4833 -<th style="/* insecure input */"> status
4834 -</th></tr></table>
 4835+<th style="/* insecure input */">status</th>
 4836+</tr>
 4837+</table>
48354838
48364839 !! end
48374840
@@ -5452,8 +5455,7 @@
54535456 !! result
54545457 <table>
54555458 <tr>
5456 -<td>
5457 -</td>
 5459+<td></td>
54585460 </tr>
54595461 </table>
54605462
@@ -5501,8 +5503,7 @@
55025504 <th>https://</th>
55035505 <th></th>
55045506 <th></th>
5505 -<th>
5506 -</td>
 5507+<th></th>
55075508 </tr>
55085509 </table>
55095510
@@ -5517,10 +5518,8 @@
55185519 !! result
55195520 <table>
55205521 <tr>
5521 -<th> <a rel="nofollow" class="external free" href="irc://{{ftp://a">irc://{{ftp://a</a>" onmouseover="alert('hello world');"
5522 -</th>
5523 -<td>
5524 -</td>
 5522+<th><a rel="nofollow" class="external free" href="irc://{{ftp://a">irc://{{ftp://a</a>" onmouseover="alert('hello world');"</th>
 5523+<td></td>
55255524 </tr>
55265525 </table>
55275526
@@ -5542,6 +5541,22 @@
55435542 !! end
55445543
55455544 # Known to produce bad XML for now
 5545+
 5546+# Note: the current result listed for this is not what the original one was,
 5547+# but the original bug was JavaScript injection, which is fixed in any case.
 5548+# It's not clear that the original result listed was any more correct than the
 5549+# current one. Original result:
 5550+# <table>
 5551+# {{{|
 5552+# <u class="&#124;">}}}} &gt;
 5553+# <br style="onmouseover=&#39;alert(document.cookie);&#39;" />
 5554+#
 5555+# MOVE YOUR MOUSE CURSOR OVER THIS TEXT
 5556+# <tr>
 5557+# <td></u>
 5558+# </td>
 5559+# </tr>
 5560+# </table>
55465561 !! test
55475562 Fuzz testing: Parser24
55485563 !! options
@@ -5556,15 +5571,14 @@
55575572 MOVE YOUR MOUSE CURSOR OVER THIS TEXT
55585573 |
55595574 !! result
5560 -<table>
5561 -{{{|
 5575+<p>{{{|
55625576 <u class="&#124;">}}}} &gt;
55635577 <br style="onmouseover=&#39;alert(document.cookie);&#39;" />
5564 -
5565 -MOVE YOUR MOUSE CURSOR OVER THIS TEXT
 5578+</p><p>MOVE YOUR MOUSE CURSOR OVER THIS TEXT
 5579+</p>
 5580+<table>
55665581 <tr>
5567 -<td></u>
5568 -</td>
 5582+<td></u></td>
55695583 </tr>
55705584 </table>
55715585
@@ -7735,13 +7749,13 @@
77367750 </p>
77377751 <table>
77387752 <tr>
7739 -<td> 1 </td>
7740 -<td> 2
7741 -</td></tr>
 7753+<td>1</td>
 7754+<td>2</td>
 7755+</tr>
77427756 <tr>
7743 -<td> 3 </td>
7744 -<td> 4
7745 -</td></tr></table>
 7757+<td>3</td>
 7758+<td>4</td>
 7759+</tr></table>
77467760 <p>y
77477761 </p>
77487762 !! end
Index: trunk/phase3/includes/parser/Parser.php
@@ -824,189 +824,283 @@
825825
826826 $lines = StringUtils::explode( "\n", $text );
827827 $out = '';
828 - $td_history = array(); # Is currently a td tag open?
829 - $last_tag_history = array(); # Save history of last lag activated (td, th or caption)
830 - $tr_history = array(); # Is currently a tr tag open?
831 - $tr_attributes = array(); # history of tr attributes
832 - $has_opened_tr = array(); # Did this table open a <tr> element?
833 - $indent_level = 0; # indent level of the table
 828+ $output =& $out;
834829
835830 foreach ( $lines as $outLine ) {
836831 $line = trim( $outLine );
837832
838 - if ( $line === '' ) { # empty line, go to next line
 833+ if ( $line == '') { //empty line, go to next line
839834 $out .= $outLine."\n";
840835 continue;
841836 }
842 -
843 - $first_character = $line[0];
 837+ $first_chars = $line[0];
 838+ if ( strlen($line) > 1) {
 839+ $first_chars .= in_array($line[1], array('}', '+', '-')) ? $line[1] : '';
 840+ }
844841 $matches = array();
845842
846843 if ( preg_match( '/^(:*)\{\|(.*)$/', $line , $matches ) ) {
847 - # First check if we are starting a new table
848 - $indent_level = strlen( $matches[1] );
 844+ $tables[] = array();
 845+ $table =& $this->last($tables);
 846+ $table[0] = array(); //first row
 847+ $current_row =& $table[0];
849848
 849+ $table['indent'] = strlen( $matches[1] );
 850+
850851 $attributes = $this->mStripState->unstripBoth( $matches[2] );
851852 $attributes = Sanitizer::fixTagAttributes( $attributes , 'table' );
852853
853 - $outLine = str_repeat( '<dl><dd>' , $indent_level ) . "<table{$attributes}>";
854 - array_push( $td_history , false );
855 - array_push( $last_tag_history , '' );
856 - array_push( $tr_history , false );
857 - array_push( $tr_attributes , '' );
858 - array_push( $has_opened_tr , false );
859 - } elseif ( count( $td_history ) == 0 ) {
860 - # Don't do any of the following
 854+ if ( $attributes !== '' ) {
 855+ $table['attributes'] = $attributes;
 856+ }
 857+ } else if ( !isset($tables[0]) ) {
 858+ // we're outside the table
 859+
861860 $out .= $outLine."\n";
862 - continue;
863 - } elseif ( substr( $line , 0 , 2 ) === '|}' ) {
864 - # We are ending a table
865 - $line = '</table>' . substr( $line , 2 );
866 - $last_tag = array_pop( $last_tag_history );
 861+ } else if ( $first_chars === '|}' ) {
 862+ // trim the |} code from the line
 863+ $line = substr ( $line , 2 );
867864
868 - if ( !array_pop( $has_opened_tr ) ) {
869 - $line = "<tr><td></td></tr>{$line}";
 865+ // Shorthand for last row
 866+ $last_row =& $this->last($table);
 867+
 868+ // a thead at the end becomes a tfoot, unless there is only one row
 869+ // Do this before deleting empty last lines to allow headers at the bottom of tables
 870+ if ( isset($last_row['type'] ) && $last_row['type'] == 'thead' && isset($table[1])) {
 871+ $last_row['type'] = 'tfoot';
 872+ for($i = 0; isset($last_row[$i]); $i++ ) {
 873+ $last_row[$i]['type'] = 'td';
 874+ }
870875 }
871876
872 - if ( array_pop( $tr_history ) ) {
873 - $line = "</tr>{$line}";
 877+ // Delete empty last lines
 878+ if ( empty($last_row) ) {
 879+ $last_row = NULL;
874880 }
 881+ $o = $this->printTableHtml( array_pop($tables) ) . $line;
875882
876 - if ( array_pop( $td_history ) ) {
877 - $line = "</{$last_tag}>{$line}";
 883+ if ( count($tables) > 0 ) {
 884+ $table =& $this->last($tables);
 885+ $current_row =& $this->last($table);
 886+ $current_element =& $this->last($current_row);
 887+
 888+ $output =& $current_element['content'];
 889+ } else {
 890+ $output =& $out;
878891 }
879 - array_pop( $tr_attributes );
880 - $outLine = $line . str_repeat( '</dd></dl>' , $indent_level );
881 - } elseif ( substr( $line , 0 , 2 ) === '|-' ) {
882 - # Now we have a table row
883 - $line = preg_replace( '#^\|-+#', '', $line );
884892
885 - # Whats after the tag is now only attributes
 893+ $output .= $o;
 894+
 895+ } else if ( $first_chars === '|-' ) {
 896+ // start a new row element
 897+ // but only when we haven't started one already
 898+ if( count($current_row) != 0 ) {
 899+ $table[] = array();
 900+ $current_row =& $this->last($table);
 901+ }
 902+ // Get the attributes, there's nothing else useful in $line now
 903+ $line = substr ( $line , 2 );
886904 $attributes = $this->mStripState->unstripBoth( $line );
887905 $attributes = Sanitizer::fixTagAttributes( $attributes, 'tr' );
888 - array_pop( $tr_attributes );
889 - array_push( $tr_attributes, $attributes );
 906+ if( $attributes !== '') {
 907+ $current_row['attributes'] = $attributes;
 908+ }
 909+
 910+ } else if ( $first_chars === '|+' ) {
 911+ // a table caption
 912+ $line = substr ( $line , 2 );
 913+
 914+ $c = $this->getCellAttr($line , 'caption');
 915+ $table['caption'] = array();
 916+ $table['caption']['content'] = $c[0];
 917+ if(isset($c[1])) $table['caption']['attributes'] = $c[1];
 918+ unset($c);
890919
891 - $line = '';
892 - $last_tag = array_pop( $last_tag_history );
893 - array_pop( $has_opened_tr );
894 - array_push( $has_opened_tr , true );
 920+ $output =& $table['caption'];
 921+ } else if ( $first_chars === '|' || $first_chars === '!' || $first_chars === '!+' ) {
 922+ // Which kind of cells are we dealing with
 923+ $this_tag = 'td';
 924+ $line = substr ( $line , 1 );
895925
896 - if ( array_pop( $tr_history ) ) {
897 - $line = '</tr>';
 926+ if ( $first_chars === '!' || $first_chars === '!+' ) {
 927+ $line = str_replace ( '!!' , '||' , $line );
 928+ $this_tag = 'th';
898929 }
899930
900 - if ( array_pop( $td_history ) ) {
901 - $line = "</{$last_tag}>{$line}";
902 - }
 931+ // Split up multiple cells on the same line.
 932+ $cells = StringUtils::explodeMarkup( '||' , $line );
 933+ $line = ''; // save memory
903934
904 - $outLine = $line;
905 - array_push( $tr_history , false );
906 - array_push( $td_history , false );
907 - array_push( $last_tag_history , '' );
908 - } elseif ( $first_character === '|' || $first_character === '!' || substr( $line , 0 , 2 ) === '|+' ) {
909 - # This might be cell elements, td, th or captions
910 - if ( substr( $line , 0 , 2 ) === '|+' ) {
911 - $first_character = '+';
912 - $line = substr( $line , 1 );
 935+ // decide whether thead to tbody
 936+ if ( !array_key_exists('type', $current_row) ) {
 937+ $current_row['type'] = ( $first_chars === '!' ) ? 'thead' : 'tbody' ;
 938+ } else if( $first_chars === '|' ) {
 939+ $current_row['type'] = 'tbody';
913940 }
914941
915 - $line = substr( $line , 1 );
 942+ // Loop through each table cell
 943+ foreach ( $cells as $cell ) {
 944+ // a new cell
 945+ $current_row[] = array();
 946+ $current_element =& $this->last($current_row);
916947
917 - if ( $first_character === '!' ) {
918 - $line = str_replace( '!!' , '||' , $line );
 948+ $current_element['type'] = $this_tag;
 949+
 950+ $c = $this->getCellAttr($cell , $this_tag);
 951+ $current_element['content'] = $c[0];
 952+ if(isset($c[1])) $current_element['attributes'] = $c[1];
 953+ unset($c);
919954 }
 955+ $output =& $current_element['content'];
 956+
 957+ } else {
 958+ $output .= $outLine."\n";
 959+ }
 960+ }
 961+
 962+ # Remove trailing line-ending (b/c)
 963+ if ( substr( $out, -1 ) === "\n" ) {
 964+ $out = substr( $out, 0, -1 );
 965+ }
 966+
 967+ #Close any unclosed tables
 968+ if (isset($tables) && count($tables) > 0 ) {
 969+ for ($i = 0; $i < count($tables); $i++) {
 970+ $out .= $this->printTableHtml( array_pop($tables) );
 971+ }
 972+ }
 973+
 974+ wfProfileOut( __METHOD__ );
920975
921 - # Split up multiple cells on the same line.
922 - # FIXME : This can result in improper nesting of tags processed
923 - # by earlier parser steps, but should avoid splitting up eg
924 - # attribute values containing literal "||".
925 - $cells = StringUtils::explodeMarkup( '||' , $line );
 976+ return $out;
 977+ }
926978
927 - $outLine = '';
928979
929 - # Loop through each table cell
930 - foreach ( $cells as $cell ) {
931 - $previous = '';
932 - if ( $first_character !== '+' ) {
933 - $tr_after = array_pop( $tr_attributes );
934 - if ( !array_pop( $tr_history ) ) {
935 - $previous = "<tr{$tr_after}>\n";
936 - }
937 - array_push( $tr_history , true );
938 - array_push( $tr_attributes , '' );
939 - array_pop( $has_opened_tr );
940 - array_push( $has_opened_tr , true );
941 - }
 980+ /**
 981+ * Helper function for doTableStuff() separating the contents of cells from
 982+ * attributes. Particularly useful as there's a possible bug and this action
 983+ * is repeated twice.
 984+ *
 985+ * @private
 986+ */
 987+ function getCellAttr ($cell , $tag_name) {
 988+ $content = null;
 989+ $attributes = null;
942990
943 - $last_tag = array_pop( $last_tag_history );
 991+ $cell = trim ( $cell );
944992
945 - if ( array_pop( $td_history ) ) {
946 - $previous = "</{$last_tag}>\n{$previous}";
947 - }
 993+ // A cell could contain both parameters and data
 994+ $cell_data = explode ( '|' , $cell , 2 );
948995
949 - if ( $first_character === '|' ) {
950 - $last_tag = 'td';
951 - } elseif ( $first_character === '!' ) {
952 - $last_tag = 'th';
953 - } elseif ( $first_character === '+' ) {
954 - $last_tag = 'caption';
955 - } else {
956 - $last_tag = '';
957 - }
 996+ // Bug 553: Note that a '|' inside an invalid link should not
 997+ // be mistaken as delimiting cell parameters
 998+ if ( strpos( $cell_data[0], '[[' ) !== false ) {
 999+ $content = trim ( $cell );
 1000+ }
 1001+ else if ( count ( $cell_data ) == 1 ) {
 1002+ $content = trim ( $cell_data[0] );
 1003+ }
 1004+ else {
 1005+ $attributes = $this->mStripState->unstripBoth( $cell_data[0] );
 1006+ $attributes = Sanitizer::fixTagAttributes( $attributes , $tag_name );
9581007
959 - array_push( $last_tag_history , $last_tag );
 1008+ $content = trim ( $cell_data[1] );
 1009+ }
 1010+ return array($content, $attributes);
 1011+ }
9601012
961 - # A cell could contain both parameters and data
962 - $cell_data = explode( '|' , $cell , 2 );
9631013
964 - # Bug 553: Note that a '|' inside an invalid link should not
965 - # be mistaken as delimiting cell parameters
966 - if ( strpos( $cell_data[0], '[[' ) !== false ) {
967 - $cell = "{$previous}<{$last_tag}>{$cell}";
968 - } elseif ( count( $cell_data ) == 1 ) {
969 - $cell = "{$previous}<{$last_tag}>{$cell_data[0]}";
970 - } else {
971 - $attributes = $this->mStripState->unstripBoth( $cell_data[0] );
972 - $attributes = Sanitizer::fixTagAttributes( $attributes , $last_tag );
973 - $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}";
974 - }
 1014+ /**
 1015+ * Helper function for doTableStuff(). This converts the structured array into html.
 1016+ *
 1017+ * @private
 1018+ */
 1019+ function printTableHtml (&$t) {
 1020+ $r = "\n";
 1021+ $r .= str_repeat( '<dl><dd>' , $t['indent'] );
 1022+ $r .= '<table';
 1023+ $r .= isset($t['attributes']) ? $t['attributes'] : '';
 1024+ $r .= '>';
 1025+ unset($t['attributes']);
9751026
976 - $outLine .= $cell;
977 - array_push( $td_history , true );
978 - }
979 - }
980 - $out .= $outLine . "\n";
 1027+ if ( isset($t['caption']) ) {
 1028+ $r .= "\n<caption";
 1029+ $r .= isset($t['caption']['attributes']) ? $t['caption']['attributes'] : '';
 1030+ $r .= '>';
 1031+ $r .= $t['caption']['content'];
 1032+ $r .= '</caption>';
9811033 }
982 -
983 - # Closing open td, tr && table
984 - while ( count( $td_history ) > 0 ) {
985 - if ( array_pop( $td_history ) ) {
986 - $out .= "</td>\n";
 1034+ $last_section = '';
 1035+ $empty = true;
 1036+ $simple = true;
 1037+
 1038+ //If we only have tbodies, mark table as simple
 1039+ for($i = 0; isset($t[$i]); $i++ ) {
 1040+ if ( !count( $t[$i]) ) continue;
 1041+ if ( !$last_section ) {
 1042+ $last_section = $t[$i]['type'];
 1043+ } else if ($last_section != $t[$i]['type']) {
 1044+ $simple = false;
 1045+ break;
 1046+ }
 1047+ }
 1048+ $last_section = '';
 1049+ for($i = 0; isset($t[$i]); $i++ ) {
 1050+ // Check for empty tables
 1051+ if ( count( $t[$i]) ) {
 1052+ $empty = false;
 1053+ } else {
 1054+ continue;
9871055 }
988 - if ( array_pop( $tr_history ) ) {
989 - $out .= "</tr>\n";
 1056+ if( $t[$i]['type'] != $last_section && !$simple ) {
 1057+ $r .= "\n<" . $t[$i]['type'] . '>';
9901058 }
991 - if ( !array_pop( $has_opened_tr ) ) {
992 - $out .= "<tr><td></td></tr>\n" ;
 1059+
 1060+ $r .= "\n<tr";
 1061+ $r .= isset($t[$i]['attributes']) ? $t[$i]['attributes'] : '';
 1062+ $r .= '>';
 1063+ for($j = 0; isset($t[$i][$j]); $j++ ) {
 1064+ $r .= "\n<" . $t[$i][$j]['type'];
 1065+ $r .= isset($t[$i][$j]['attributes']) ? $t[$i][$j]['attributes'] : '';
 1066+ $r .= '>';
 1067+
 1068+ $r .= $t[$i][$j]['content'];
 1069+
 1070+ $r .= '</' . $t[$i][$j]['type'] . '>';
 1071+ unset($t[$i][$j]);
9931072 }
 1073+ $r .= "\n</tr>";
9941074
995 - $out .= "</table>\n";
 1075+ if( ( !isset($t[$i+1]) && !$simple )|| ( isset($t[$i+1]) && ($t[$i]['type'] != $t[$i+1]['type'])) ) {
 1076+ $r .= '</' . $t[$i]['type'] . '>';
 1077+ }
 1078+ $last_section = $t[$i]['type'];
 1079+ unset($t[$i]);
9961080 }
997 -
998 - # Remove trailing line-ending (b/c)
999 - if ( substr( $out, -1 ) === "\n" ) {
1000 - $out = substr( $out, 0, -1 );
 1081+ if ( $empty ) {
 1082+ if ( isset($t['caption']) ) {
 1083+ $r .= "\n<tr><td></td></tr>";
 1084+ } else {
 1085+ return '';
 1086+ }
10011087 }
 1088+ $r .= "\n</table>";
 1089+ $r .= str_repeat( '</dd></dl>' , $t['indent'] );
10021090
1003 - # special case: don't return empty table
1004 - if ( $out === "<table>\n<tr><td></td></tr>\n</table>" ) {
1005 - $out = '';
1006 - }
 1091+ return $r;
 1092+ }
10071093
1008 - wfProfileOut( __METHOD__ );
1009 -
1010 - return $out;
 1094+ /**
 1095+ * like end() but only works on the numeric array index and php's internal pointers
 1096+ * returns a reference to the last element of an array much like "\$arr[-1]" in perl
 1097+ * ignores associative elements and will create a 0 key will a NULL value if there were
 1098+ * no numric elements and an array itself if not previously defined.
 1099+ *
 1100+ * @private
 1101+ */
 1102+ function &last (&$arr) {
 1103+ for($i = count($arr); (!isset($arr[$i]) && $i > 0); $i--) { }
 1104+ return $arr[$i];
10111105 }
10121106
10131107 /**
@@ -2239,7 +2333,7 @@
22402334 '<td|<th|<\\/?div|<hr|<\\/pre|<\\/p|'.$this->mUniqPrefix.'-pre|<\\/li|<\\/ul|<\\/ol|<\\/?center)/iS', $t );
22412335 if ( $openmatch or $closematch ) {
22422336 $paragraphStack = false;
2243 - # TODO bug 5718: paragraph closed
 2337+ # TODO bug 5718: paragraph closed
22442338 $output .= $this->closeParagraph();
22452339 if ( $preOpenMatch and !$preCloseMatch ) {
22462340 $this->mInPre = true;
Index: trunk/phase3/includes/Sanitizer.php
@@ -369,7 +369,7 @@
370370 'strike', 'strong', 'tt', 'var', 'div', 'center',
371371 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
372372 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
373 - 'kbd', 'samp'
 373+ 'kbd', 'samp', 'thead', 'tbody', 'tfoot'
374374 );
375375 $htmlsingle = array(
376376 'br', 'hr', 'li', 'dt', 'dd'

Follow-up revisions

RevisionCommit summaryAuthorDate
r85941Followup to r85922: Fix STRICT error, rmv redundant whitespacediebuche11:24, 13 April 2011
r85943Followup to r85922: Output th instead of td for tfootdiebuche12:44, 13 April 2011
r85985Followup to r85922: Adapt to MW-coding stylediebuche19:46, 13 April 2011
r85991Readd support for headings inside tables, broken in r85922 and reported on it...platonides22:20, 13 April 2011
r86002Follow up r85991. For some reason I committed the test for r85921 parser, not...platonides23:12, 13 April 2011
r86005Follow up r85922 moving the </caption> to another line, fixing one of the tes...platonides23:19, 13 April 2011
r86010Another evil test to join r86004 (didn't produce malformed output in the pre-...platonides23:36, 13 April 2011
r86039Fixing line-break issues with r85922diebuche10:02, 14 April 2011
r86144r85922: Fixing another table bordercasediebuche22:36, 15 April 2011
r87490Fix PHP notice in r85922. Sorry for the holdup, I kinda forgot about itdiebuche09:12, 5 May 2011
r89599add credits to bluehairedlawyer for thead/tbody...hashar20:09, 6 June 2011
r89600test thead / tfooter contains th elements...hashar20:10, 6 June 2011
r97145Reverted r85922 and related: new doTableStuff(). I copied in the old doTableS...tstarling12:10, 15 September 2011
r97173Merged revisions 97087,97091-97092,97094,97096-97098,97100-97101,97103,97136,...dantman16:19, 15 September 2011

Comments

#Comment by Happy-melon (talk | contribs)   21:28, 12 April 2011

Commit summary? What is this?

#Comment by DieBuche (talk | contribs)   21:32, 12 April 2011

Sorry for the comment less commit.

Implement tbody, thead & tfoot. Fixes Bug 4740 Original patch by bluehairedlawyer@gmail.com, rewritten at some places by me. ParserTests are changed to accommodate the new elements, but no parserTest `logic` is changed

#Comment by Aaron Schulz (talk | contribs)   02:33, 13 April 2011

Strict Standards: Only variables should be passed by reference in D:\www\MW_trunk\phase3\includes\parser\Parser.php on line 880

Please use E_ALL | E_STRICT for PHP errors.

#Comment by Aaron Schulz (talk | contribs)   02:35, 13 April 2011

Oh, and welcome to SVN committer land :)

#Comment by DieBuche (talk | contribs)   11:25, 13 April 2011

Thanks! Strict fixed in r85941

#Comment by Raymond (talk | contribs)   11:59, 13 April 2011

Could you please create a page with a few examples of the new possibilities at Translatewiki?

#Comment by DieBuche (talk | contribs)   12:25, 13 April 2011

It doesn't add any new syntax, but simply renders stuff like http://translatewiki.net/wiki/TablesSandbox semantically correct. (Headers are actually theads, etc.) This makes it possible to switch to a better tablesorting script in the future.

#Comment by Raymond (talk | contribs)   12:29, 13 April 2011

Thanks for the example. I was confused by the proposal for new syntax of the bug reporter.

#Comment by Bryan (talk | contribs)   12:47, 13 April 2011

Welcome to commit land!

I have some advise for you, mostly related to code style. See also Manual:Coding conventions.

You should follow code conventions for variables. Thus $lastSection instead of $last_section. Your variables should be more descriptive. Use $tableDescriptor rather than $t.

Check your whitespace use: array_pop( $a ) instead of array_pop($a). You can use stylize.php to enfore this style.

Function names should really indicate what it does: a function called printTableHtml is expected to print/echo something to the output. It does not do that, it makes a string containing the table html. It should thus be named build/create/makeTableHtml; choose your favorite.


Personally I don't really like mixed key arrays, where numeric keys indicate data elements, and string keys indicate attributes/options. I'd rather use a nested array solution, like array( 'attributes' => array( ... ), 'caption => '...', 'data' => array( $row1, $row2, ... ) ). Then you can use foreach on $t['data'] which is much cleaner than for ( $i = 0; isset( $t[i] ); $i++ ).

#Comment by Svippong (talk | contribs)   20:52, 13 April 2011

After these changes, tables are behaving incorrectly.

For instance, if you have a header in a table cell, it won't render correctly if it is at the beginning of the cell.

e.g.

| 

=== Foo ===

Will not render correctly, but

|

----

=== Foo ===

will.

In addition, tables which appear further down on pages seems to have a margin-top: 50% behaviour. This is probably overwriteable with CSS, but it seems to be an odd default behaviour.

Some examples:

#Comment by Platonides (talk | contribs)   21:13, 13 April 2011

I think that's due to the trim(). As the goal of this revision was to add thead/tfoot, I think it could be stripped back.

#Comment by DieBuche (talk | contribs)   10:03, 14 April 2011

This is fixed r86039

#Comment by Platonides (talk | contribs)   21:12, 13 April 2011
-			if ( $line ===  ) { # empty line, go to next line
+			if ( $line == ) { //empty line, go to next line

Why are you changing the strict equality to a weak one? Probably introduces some bug.

Oh, it was fixed in r85943


You're changing the tag history to using a stack. I'd appreciate some docs on the way it is done. It's probably ok, but would be easier to review it with an explanations.


Also, the new way looks more memory hungry.

#Comment by Platonides (talk | contribs)   21:25, 13 April 2011

Did you run the parserTests after your changes? You also broke other 3 parserTests:

  • Fuzz testing: Parser14-table...
  • Fuzz testing: Parser22...
  • Bug 529: Uncovered table already at line-start...

I don't think it's important that there's no longer a table in those Fuzz testing tests, but in such case I'd expect the {| to appear as a literal, not to magically disappear.

The change to 'Uncovered table already at line-start' is wrong.

#Comment by DieBuche (talk | contribs)   10:06, 14 April 2011

But didn't it practically disappear previously as well (besides leaving an invisible table) ? Parser test for Bug 529 works again in r86039. And thanks for the followups!

#Comment by Nikerabbit (talk | contribs)   14:14, 15 April 2011
[15-Apr-2011 09:05:06] PHP Notice:  Undefined index: type in /www/w/includes/parser/Parser.php on line 1056
[15-Apr-2011 09:05:06] PHP Notice:  Undefined index: type in /www/w/includes/parser/Parser.php on line 1070
[15-Apr-2011 09:05:06] PHP Notice:  Undefined index: type in /www/w/includes/parser/Parser.php on line 1078
[15-Apr-2011 09:05:06] PHP Notice:  Undefined index: type in /www/w/includes/parser/Parser.php on line 1086
[15-Apr-2011 09:05:06] PHP Notice:  Undefined index: type in /www/w/includes/parser/Parser.php on line 1094
#Comment by DieBuche (talk | contribs)   14:16, 15 April 2011

Do you have the offending code?

#Comment by DieBuche (talk | contribs)   14:16, 15 April 2011

(Clarify: The wikitext that triggered it)

#Comment by Nikerabbit (talk | contribs)   14:21, 15 April 2011

Of course not, since debugging PHP is such a pain. However, if it helps you I can try to match the error timestamps into requests and find the code that way.

#Comment by DieBuche (talk | contribs)   14:31, 15 April 2011

I think that would be helpful. I'd guess it's something like an empty row or cell, but I can't seem to find something that triggers it

#Comment by Nikerabbit (talk | contribs)   10:15, 22 April 2011

Still happening: [22-Apr-2011 09:02:57] PHP Notice: Undefined index: type in /www/w/includes/parser/Parser.php on line 1091

Source code at http://translatewiki.net/sandwiki/i.php?title=Translating:Group_statistics/Header7&action=edit

#Comment by Nikerabbit (talk | contribs)   07:38, 5 May 2011

Sigh, it's been almost two weeks now.

#Comment by Bryan (talk | contribs)   08:23, 5 May 2011

Revert?

#Comment by Nikerabbit (talk | contribs)   08:44, 5 May 2011

Doesn't look worth it given the number of follow-ups. I imagine the fix is easy for someone who has wrapped their head around the code.

#Comment by DieBuche (talk | contribs)   09:14, 5 May 2011

It was, just forgot to commit it :(

#Comment by 😂 (talk | contribs)   22:48, 25 May 2011

Was this all resolved? If not, we should back it out of REL1_18.

#Comment by DieBuche (talk | contribs)   08:58, 26 May 2011

I'd say it's resolved. (At least I know nothing that's still broken)

#Comment by Nikerabbit (talk | contribs)   09:04, 26 May 2011

+1, no errors from twn. I'd expect that Wikipedia finds some but it's impossible to say without testing.

#Comment by Hashar (talk | contribs)   19:52, 6 June 2011

Assuming this is resolved.

#Comment by Aaron Schulz (talk | contribs)   00:13, 14 September 2011

After running many versions of Parser, I can confirm that this causes bug 30746.

#Comment by Tim Starling (talk | contribs)   12:31, 14 September 2011

This version of doTableStuff() takes 20 times longer and uses 25 times as much memory as the old code for this test case: "{|\n" . str_repeat( "|-\n| x\n", 10000) . "|}\n"

#Comment by Tim Starling (talk | contribs)   12:10, 15 September 2011

I'm backing this out for 1.18.

Constructing a structured representation of the input text and then transforming it to HTML as a second step is an approach which has a very high memory overhead, and a moderately high CPU overhead. It should only be used as a last resort, where on-the-fly generation of output text is not feasible. It does appear to be feasible in this case -- in fact the main desired feature, adding of <thead> elements, should be possible with just a few extra lines in the old doTableStuff().

I'm also concerned about the number of unintended changes in the input syntax and output. Many were fixed, but at least bug 30746 remains. The lack of attention to backwards compatibility implied by these changes indicates that deployment may uncover further issues which affect our large existing text corpus.

A rewrite is fine in principle, since the code is old and ugly. But the behaviour should be made as close as possible to the old code, to enable differential fuzz testing to be done, and the overall design should take performance into account.

Status & tagging log