r45487 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r45486 | r45487 | r45488 >
Date:	04:23, 7 January 2009
Author:	brion
Status:	ok
Tags:
Comment:	Pulling r45473 back for now "Reduce code duplication correctly this time" Let's hold off on further section anchor generation changes until we have decent test cases covering the different ways we stuff through...
Modified paths:	/trunk/phase3/includes/parser/Parser.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/parser/Parser.php
—	—	@@ -3448,7 +3448,7 @@
3449	3449	* @private
3450	3450	*/
3451	3451	function formatHeadings( $text, $isMain=true ) {
3452		~~- global $wgMaxTocLevel, $wgContLang;~~
	3452	+ global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
3453	3453
3454	3454	$doNumberHeadings = $this->mOptions->getNumberHeadings();
3455	3455	$showEditLink = $this->mOptions->getEditSection();
—	—	@@ -3593,17 +3593,71 @@
3594	3594	}
3595	3595	}
3596	3596
3597		~~- list( $anchor, $legacyAnchor, $tocline, $headlineHint ) =~~
3598		~~- $this->processHeadingText( $headline );~~
	3597	+ # The safe header is a version of the header text safe to use for links
	3598	+ # Avoid insertion of weird stuff like <math> by expanding the relevant sections
	3599	+ $safeHeadline = $this->mStripState->unstripBoth( $headline );
3599	3600
	3601	+ # Remove link placeholders by the link text.
	3602	+ # <!--LINK number-->
	3603	+ # turns into
	3604	+ # link text with suffix
	3605	+ $safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );
	3606	+
	3607	+ # Strip out HTML (other than plain <sup> and <sub>: bug 8393)
	3608	+ $tocline = preg_replace(
	3609	+ array( '#<(?!/?(sup\|sub)).?'.'>#', '#<(/?(sup\|sub)).?'.'>#' ),
	3610	+ array( '', '<$1>'),
	3611	+ $safeHeadline
	3612	+ );
	3613	+ $tocline = trim( $tocline );
	3614	+
	3615	+ # For the anchor, strip out HTML-y stuff period
	3616	+ $safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );
	3617	+ $safeHeadline = trim( $safeHeadline );
	3618	+
	3619	+ # Save headline for section edit hint before it's escaped
	3620	+ $headlineHint = $safeHeadline;
	3621	+
	3622	+ if ( $wgEnforceHtmlIds ) {
	3623	+ $legacyHeadline = false;
	3624	+ $safeHeadline = Sanitizer::escapeId( $safeHeadline,
	3625	+ 'noninitial' );
	3626	+ } else {
	3627	+ # For reverse compatibility, provide an id that's
	3628	+ # HTML4-compatible, like we used to.
	3629	+ #
	3630	+ # It may be worth noting, academically, that it's possible for
	3631	+ # the legacy anchor to conflict with a non-legacy headline
	3632	+ # anchor on the page. In this case likely the "correct" thing
	3633	+ # would be to either drop the legacy anchors or make sure
	3634	+ # they're numbered first. However, this would require people
	3635	+ # to type in section names like "abc_.D7.93.D7.90.D7.A4"
	3636	+ # manually, so let's not bother worrying about it.
	3637	+ $legacyHeadline = Sanitizer::escapeId( $safeHeadline,
	3638	+ 'noninitial' );
	3639	+ $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
	3640	+
	3641	+ if ( $legacyHeadline == $safeHeadline ) {
	3642	+ # No reason to have both (in fact, we can't)
	3643	+ $legacyHeadline = false;
	3644	+ } elseif ( $legacyHeadline != Sanitizer::escapeId(
	3645	+ $legacyHeadline, 'xml' ) ) {
	3646	+ # The legacy id is invalid XML. We used to allow this, but
	3647	+ # there's no reason to do so anymore. Backward
	3648	+ # compatibility will fail slightly in this case, but it's
	3649	+ # no big deal.
	3650	+ $legacyHeadline = false;
	3651	+ }
	3652	+ }
	3653	+
3600	3654	# HTML names must be case-insensitively unique (bug 10721). FIXME:
3601	3655	# Does this apply to Unicode characters? Because we aren't
3602	3656	# handling those here.
3603		~~- $arrayKey = strtolower( $anchor );~~
3604		~~- if ( $legacyAnchor === false ) {~~
	3657	+ $arrayKey = strtolower( $safeHeadline );
	3658	+ if ( $legacyHeadline === false ) {
3605	3659	$legacyArrayKey = false;
3606	3660	} else {
3607		~~- $legacyArrayKey = strtolower( $legacyAnchor );~~
	3661	+ $legacyArrayKey = strtolower( $legacyHeadline );
3608	3662	}
3609	3663
3610	3664	# count how many in assoc. array so we can track dupes in anchors
—	—	@@ -3625,10 +3679,12 @@
3626	3680	}
3627	3681
3628	3682	# Create the anchor for linking from the TOC to the section
	3683	+ $anchor = $safeHeadline;
	3684	+ $legacyAnchor = $legacyHeadline;
3629	3685	if ( $refers[$arrayKey] > 1 ) {
3630	3686	$anchor .= '_' . $refers[$arrayKey];
3631	3687	}
3632		~~- if ( $legacyAnchor !== false && $refers[$legacyArrayKey] > 1 ) {~~
	3688	+ if ( $legacyHeadline !== false && $refers[$legacyArrayKey] > 1 ) {
3633	3689	$legacyAnchor .= '_' . $refers[$legacyArrayKey];
3634	3690	}
3635	3691	if( $enoughToc && ( !isset($wgMaxTocLevel) \|\| $toclevel<$wgMaxTocLevel ) ) {
—	—	@@ -3700,70 +3756,6 @@
3701	3757	}
3702	3758	}
3703	3759
3704		~~- private function processHeadingText( $headline ) {~~
3705		~~- global $wgEnforceHtmlIds;~~
3706		-
3707		~~- # The safe header is a version of the header text safe to use for links~~
3708		~~- # Avoid insertion of weird stuff like <math> by expanding the relevant sections~~
3709		~~- $safeHeadline = $this->mStripState->unstripBoth( $headline );~~
3710		-
3711		~~- # Remove link placeholders by the link text.~~
3712		~~- # <!--LINK number-->~~
3713		~~- # turns into~~
3714		~~- # link text with suffix~~
3715		~~- $safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );~~
3716		-
3717		~~- # Strip out HTML (other than plain <sup> and <sub>: bug 8393)~~
3718		~~- $tocline = preg_replace(~~
3719		~~- array( '#<(?!/?(sup\|sub)).?'.'>#', '#<(/?(sup\|sub)).?'.'>#' ),~~
3720		~~- array( '', '<$1>'),~~
3721		~~- $safeHeadline~~
3722		~~- );~~
3723		~~- $tocline = trim( $tocline );~~
3724		-
3725		~~- # For the anchor, strip out HTML-y stuff period~~
3726		~~- $safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );~~
3727		~~- $safeHeadline = trim( $safeHeadline );~~
3728		-
3729		~~- # Save headline for section edit hint before it's escaped~~
3730		~~- $headlineHint = $safeHeadline;~~
3731		-
3732		~~- if ( $wgEnforceHtmlIds ) {~~
3733		~~- $legacyHeadline = false;~~
3734		~~- $safeHeadline = Sanitizer::escapeId( $safeHeadline,~~
3735		~~- 'noninitial' );~~
3736		~~- } else {~~
3737		~~- # For reverse compatibility, provide an id that's~~
3738		~~- # HTML4-compatible, like we used to.~~
3739		~~- #~~
3740		~~- # It may be worth noting, academically, that it's possible for~~
3741		~~- # the legacy anchor to conflict with a non-legacy headline~~
3742		~~- # anchor on the page. In this case likely the "correct" thing~~
3743		~~- # would be to either drop the legacy anchors or make sure~~
3744		~~- # they're numbered first. However, this would require people~~
3745		~~- # to type in section names like "abc_.D7.93.D7.90.D7.A4"~~
3746		~~- # manually, so let's not bother worrying about it.~~
3747		~~- $legacyHeadline = Sanitizer::escapeId( $safeHeadline,~~
3748		~~- 'noninitial' );~~
3749		~~- $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );~~
3750		-
3751		~~- if ( $legacyHeadline == $safeHeadline ) {~~
3752		~~- # No reason to have both (in fact, we can't)~~
3753		~~- $legacyHeadline = false;~~
3754		~~- } elseif ( $legacyHeadline != Sanitizer::escapeId(~~
3755		~~- $legacyHeadline, 'xml' ) ) {~~
3756		~~- # The legacy id is invalid XML. We used to allow this, but~~
3757		~~- # there's no reason to do so anymore. Backward~~
3758		~~- # compatibility will fail slightly in this case, but it's~~
3759		~~- # no big deal.~~
3760		~~- $legacyHeadline = false;~~
3761		~~- }~~
3762		~~- }~~
3763		-
3764		~~- return array( $safeHeadline, $legacyHeadline, $tocline,~~
3765		~~- $headlineHint );~~
3766		~~- }~~
3767		-
3768	3760	/**
3769	3761	* Transform wiki markup when saving a page by doing \r\n -> \n
3770	3762	* conversion, substitting signatures, {{subst:}} templates, etc.
—	—	@@ -4744,9 +4736,21 @@
4745	4737	* "== Header ==".
4746	4738	*/
4747	4739	public function guessSectionNameFromWikiText( $text ) {
	4740	+ # Strip out wikitext links(they break the anchor)
4748	4741	$text = $this->stripSectionName( $text );
4749		~~- list( $text, /* unneeded here */ ) = $this->processHeadingText( $text );~~
4750		~~- return "#$text";~~
	4742	+ $headline = Sanitizer::decodeCharReferences( $text );
	4743	+ # strip out HTML
	4744	+ $headline = StringUtils::delimiterReplace( '<', '>', '', $headline );
	4745	+ $headline = trim( $headline );
	4746	+ $sectionanchor = '#' . urlencode( str_replace( ' ', '_', $headline ) );
	4747	+ $replacearray = array(
	4748	+ '%3A' => ':',
	4749	+ '%' => '.'
	4750	+ );
	4751	+ return str_replace(
	4752	+ array_keys( $replacearray ),
	4753	+ array_values( $replacearray ),
	4754	+ $sectionanchor );
4751	4755	}
4752	4756
4753	4757	/**

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r45473	Reduce code duplication correctly this time...	simetrical	00:41, 7 January 2009

Status & tagging log

04:26, 7 January 2009 Brion VIBBER (talk | contribs) changed the status of r45487 [removed: new added: ok]