r30109 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r30108‎ | r30109 | r30110 >
Date:09:07, 24 January 2008
Author:tstarling
Status:old
Tags:
Comment:
* Make lc and uc parser functions skip strip markers
* Made ==foo==<!----> create a valid section edit link
* Changed header processing heuristics -- now double-equals signs are generally respected as header starts, and will break template invocations, and single equals signs are respected as header syntax but might not generate a section edit link.
Modified paths:
  • /trunk/phase3/includes/CoreParserFunctions.php (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)
  • /trunk/phase3/includes/Preprocessor_DOM.php (modified) (history)
  • /trunk/phase3/maintenance/parserTests.txt (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/parserTests.txt
@@ -5348,11 +5348,10 @@
53495349 section=1
53505350 !! input
53515351 ==a==
5352 -==unmarked== <!-- an unmarked section -->
5353 -==b==
 5352+==b== <!-- -->
 5353+==c==
53545354 !! result
53555355 ==a==
5356 -==unmarked== <!-- an unmarked section -->
53575356 !! end
53585357
53595358 !! test
@@ -5361,10 +5360,10 @@
53625361 section=2
53635362 !! input
53645363 ==a==
5365 -==unmarked== <!-- an unmarked section -->
5366 -==b==
 5364+==b== <!-- -->
 5365+==c==
53675366 !! result
5368 -==b==
 5367+==b== <!-- -->
53695368 !! end
53705369
53715370 !! test
@@ -6712,59 +6711,29 @@
67136712 !! end
67146713
67156714 !! test
6716 -HHP1: Heuristics for headings in preprocessor parenthetical structures
 6715+HHP2.1: Heuristics for headings in preprocessor parenthetical structures
67176716 !! input
6718 -{{foo
6719 -==heading==
6720 -!! result
6721 -<p>{{foo
6722 -</p>
6723 -<a name="heading"></a><h2><span class="editsection">[<a href="https://www.mediawiki.org/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
6724 -
6725 -!! end
6726 -
6727 -!! test
6728 -HHP2: Heuristics for headings in preprocessor parenthetical structures
6729 -!! input
67306717 {{foo|
6731 -==heading==
 6718+=heading=
67326719 !! result
67336720 <p>{{foo|
67346721 </p>
6735 -<a name="heading"></a><h2> <span class="mw-headline">heading</span></h2>
 6722+<a name="heading"></a><h1> <span class="mw-headline">heading</span></h1>
67366723
67376724 !! end
67386725
67396726 !! test
6740 -HHP3: Heuristics for headings in preprocessor parenthetical structures
 6727+HHP2.2: Heuristics for headings in preprocessor parenthetical structures
67416728 !! input
67426729 {{foo|
6743 -==heading 1==
6744 -==heading 2==
 6730+==heading==
67456731 !! result
67466732 <p>{{foo|
67476733 </p>
6748 -<a name="heading_1"></a><h2> <span class="mw-headline">heading 1</span></h2>
6749 -<a name="heading_2"></a><h2><span class="editsection">[<a href="https://www.mediawiki.org/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading 2">edit</a>]</span> <span class="mw-headline">heading 2</span></h2>
 6734+<a name="heading"></a><h2><span class="editsection">[<a href="https://www.mediawiki.org/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
67506735
67516736 !! end
67526737
6753 -# Note that heading 2 is counted, so heading 3 gets section=2 not section=1
6754 -!! test
6755 -HHP4: Heuristics for headings in preprocessor parenthetical structures
6756 -!! input
6757 -{{foo|
6758 -==heading 1==
6759 -==heading 2==
6760 -}}
6761 -==heading 3==
6762 -!! result
6763 -<p>FOO
6764 -</p>
6765 -<a name="heading_3"></a><h2><span class="editsection">[<a href="https://www.mediawiki.org/index.php?title=Parser_test&amp;action=edit&amp;section=2" title="Edit section: heading 3">edit</a>]</span> <span class="mw-headline">heading 3</span></h2>
6766 -
6767 -!! end
6768 -
67696738 #
67706739 #
67716740 #
Index: trunk/phase3/includes/Preprocessor_DOM.php
@@ -99,7 +99,7 @@
100100
101101 $stack = new PPDStack;
102102
103 - $searchBase = '[{<'; #}
 103+ $searchBase = "[{<\n"; #}
104104 $revText = strrev( $text ); // For fast reverse searches
105105
106106 $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start
@@ -148,17 +148,6 @@
149149 if ( $findEquals ) {
150150 // First equals will be for the template
151151 $search .= '=';
152 - } else {
153 - // Look for headings
154 - // We can't look for headings when $findEquals is true, because the ambiguity
155 - // between template name/value separators and heading starts would be unresolved
156 - // until the closing double-brace is found. This would mean either infinite
157 - // backtrack, or creating and updating two separate tree structures until the
158 - // end of the ambiguity -- one tree structure assuming a heading, and the other
159 - // assuming a template argument.
160 - //
161 - // Easier to just break some section edit links.
162 - $search .= "\n";
163152 }
164153 $rule = null;
165154 # Output literal section, advance input counter
@@ -240,7 +229,7 @@
241230 $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
242231 // Eat the line if possible
243232 // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
244 - // the overall start. That's not how Sanitizer::removeHTMLcomments() does it, but
 233+ // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
245234 // it's a possible beneficial b/c break.
246235 if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
247236 && substr( $text, $wsEnd + 1, 1 ) == "\n" )
@@ -253,28 +242,24 @@
254243 if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
255244 $accum = substr( $accum, 0, -$wsLength );
256245 }
257 - // Do a line-start run next time to look for headings after the comment,
258 - // but only if stack->top===false, because headings don't exist at deeper levels.
259 - if ( $stack->top === false ) {
260 - $fakeLineStart = true;
261 - }
 246+ // Do a line-start run next time to look for headings after the comment
 247+ $fakeLineStart = true;
262248 } else {
263249 // No line to eat, just take the comment itself
264250 $startPos = $i;
265251 $endPos += 2;
266252 }
267253
268 - /*
269254 if ( $stack->top ) {
270 - if ( $stack->top->commentEndPos !== false && $stack->top->commentEndPos == $wsStart ) {
 255+ $part = $stack->top->getCurrentPart();
 256+ if ( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) {
271257 // Comments abutting, no change in visual end
272 - $stack->top->commentEndPos = $wsEnd;
 258+ $part->commentEnd = $wsEnd;
273259 } else {
274 - $stack->top->visualEndPos = $wsStart;
275 - $stack->top->commentEndPos = $wsEnd;
 260+ $part->visualEnd = $wsStart;
 261+ $part->commentEnd = $endPos;
276262 }
277263 }
278 - */
279264 $i = $endPos + 1;
280265 $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
281266 $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
@@ -356,7 +341,11 @@
357342 }
358343
359344 $count = strspn( $text, '=', $i, 6 );
360 - if ( $count > 0 ) {
 345+ if ( $count == 1 && $findEquals ) {
 346+ // DWIM: This looks kind of like a name/value separator
 347+ // Let's let the equals handler have it and break the potential heading
 348+ // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
 349+ } elseif ( $count > 0 ) {
361350 $piece = array(
362351 'open' => "\n",
363352 'close' => "\n",
@@ -374,23 +363,32 @@
375364 $piece = $stack->top;
376365 // A heading must be open, otherwise \n wouldn't have been in the search list
377366 assert( $piece->open == "\n" );
 367+ $part = $piece->getCurrentPart();
378368 // Search back through the input to see if it has a proper close
379369 // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
380 - $m = false;
 370+ $wsLength = strspn( $revText, " \t", strlen( $text ) - $i );
 371+ $searchStart = $i - $wsLength;
 372+ if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
 373+ // Comment found at line end
 374+ // Search for equals signs before the comment
 375+ $searchStart = $part->visualEnd;
 376+ $searchStart -= strspn( $revText, " \t", strlen( $text ) - $searchStart );
 377+ }
381378 $count = $piece->count;
382 - if ( preg_match( "/\s*(=+)/A", $revText, $m, 0, strlen( $text ) - $i ) ) {
383 - if ( $i - strlen( $m[0] ) == $piece->startPos ) {
 379+ $equalsLength = strspn( $revText, '=', strlen( $text ) - $searchStart );
 380+ if ( $equalsLength > 0 ) {
 381+ if ( $i - $equalsLength == $piece->startPos ) {
384382 // This is just a single string of equals signs on its own line
385383 // Replicate the doHeadings behaviour /={count}(.+)={count}/
386384 // First find out how many equals signs there really are (don't stop at 6)
387 - $count = strlen( $m[1] );
 385+ $count = $equalsLength;
388386 if ( $count < 3 ) {
389387 $count = 0;
390388 } else {
391389 $count = min( 6, intval( ( $count - 1 ) / 2 ) );
392390 }
393391 } else {
394 - $count = min( strlen( $m[1] ), $count );
 392+ $count = min( $equalsLength, $count );
395393 }
396394 if ( $count > 0 ) {
397395 // Normal match, output <h>
@@ -869,13 +867,6 @@
870868 } elseif ( is_array( $contextNode ) || $contextNode instanceof DOMNodeList ) {
871869 $newIterator = $contextNode;
872870 } elseif ( $contextNode instanceof DOMNode ) {
873 - /*
874 - print str_repeat( '&nbsp;', count( debug_backtrace() ) ) . $contextNode->nodeName;
875 - if ( $contextNode->nodeName == 'title' ) {
876 - print ' = ' . $contextNode->textContent;
877 - }
878 - print "<br/>\n";
879 - */
880871 if ( $contextNode->nodeType == XML_TEXT_NODE ) {
881872 $out .= $contextNode->nodeValue;
882873 } elseif ( $contextNode->nodeName == 'template' ) {
Index: trunk/phase3/includes/Parser.php
@@ -4815,6 +4815,30 @@
48164816 }
48174817 return $this->testSrvus( $text, $title, $options, self::OT_PREPROCESS );
48184818 }
 4819+
 4820+ function markerSkipCallback( $s, $callback ) {
 4821+ $i = 0;
 4822+ $out = '';
 4823+ while ( $i < strlen( $s ) ) {
 4824+ $markerStart = strpos( $s, $this->mUniqPrefix, $i );
 4825+ if ( $markerStart === false ) {
 4826+ $out .= call_user_func( $callback, substr( $s, $i ) );
 4827+ break;
 4828+ } else {
 4829+ $out .= call_user_func( $callback, substr( $s, $i, $markerStart - $i ) );
 4830+ $markerEnd = strpos( $s, $this->mMarkerSuffix, $markerStart );
 4831+ if ( $markerEnd === false ) {
 4832+ $out .= substr( $s, $markerStart );
 4833+ break;
 4834+ } else {
 4835+ $markerEnd += strlen( $this->mMarkerSuffix );
 4836+ $out .= substr( $s, $markerStart, $markerEnd - $markerStart );
 4837+ $i = $markerEnd;
 4838+ }
 4839+ }
 4840+ }
 4841+ return $out;
 4842+ }
48194843 }
48204844
48214845 /**
Index: trunk/phase3/includes/CoreParserFunctions.php
@@ -51,12 +51,20 @@
5252
5353 static function lc( $parser, $s = '' ) {
5454 global $wgContLang;
55 - return $wgContLang->lc( $s );
 55+ if ( is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
 56+ return $parser->markerSkipCallback( $s, array( $wgContLang, 'lc' ) );
 57+ } else {
 58+ return $wgContLang->lc( $s );
 59+ }
5660 }
5761
5862 static function uc( $parser, $s = '' ) {
5963 global $wgContLang;
60 - return $wgContLang->uc( $s );
 64+ if ( is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
 65+ return $parser->markerSkipCallback( $s, array( $wgContLang, 'uc' ) );
 66+ } else {
 67+ return $wgContLang->uc( $s );
 68+ }
6169 }
6270
6371 static function localurl( $parser, $s = '', $arg = null ) { return self::urlFunction( 'getLocalURL', $s, $arg ); }

Follow-up revisions

Revision | Commit summary | Author | Date
r114231 | Fixed a few "strip tag exposed" bugs.... | tstarling | 04:39, 20 March 2012

Status & tagging log