r62488 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r62487‎ \| r62488 \| r62489 >
Date:	03:07, 15 February 2010
Author:	aaron
Status:	ok (Comments)
Tags:
Comment:	* Fixed bug 22326: "Truncated urls in summaries lead to different places" * Broke long lines
Modified paths:	/trunk/extensions/CodeReview/backend/CodeCommentLinker.php (modified) (history) /trunk/extensions/CodeReview/ui/SpecialCode.php (modified) (history)

Diff [purge]

Index: trunk/extensions/CodeReview/backend/CodeCommentLinker.php
—	—	@@ -11,11 +11,103 @@
12	12	function link( $text ) {
13	13	# Catch links like http://www.mediawiki.org/wiki/Special:Code/MediaWiki/44245#c829
14	14	# Ended by space or brackets (like those pesky <br /> tags)
15		~~- $text = preg_replace_callback( '/(^\|[^\w[])(' . wfUrlProtocols() . ')([^ <>]+)(\b)/', array( $this, 'generalLink' ), $text );~~
16		~~- $text = preg_replace_callback( '/\br(\d+)\b/', array( $this, 'messageRevLink' ), $text );~~
17		~~- $text = preg_replace_callback( '/\bbug #?(\d+)\b/i', array( $this, 'messageBugLink' ), $text );~~
	15	+ $text = preg_replace_callback( '/(^\|[^\w[])(' . wfUrlProtocols() . ')([^ <>]+)(\b)/',
	16	+ array( $this, 'generalLink' ), $text );
	17	+ $text = preg_replace_callback( '/\br(\d+)\b/',
	18	+ array( $this, 'messageRevLink' ), $text );
	19	+ $text = preg_replace_callback( '/\bbug #?(\d+)\b/i',
	20	+ array( $this, 'messageBugLink' ), $text );
18	21	return $text;
19	22	}
	23	+
	24	+ // truncate() for valid HTML with self-contained tags only
	25	+ // Note: tries to fix broken HTML with MWTidy
	26	+ // @TODO: cleanup and move to language.php
	27	+ function truncateHtml( $text, $maxLen, $ellipsis = '...' ) {
	28	+ global $wgLang;
	29	+ if( strlen($text) <= $maxLen ) {
	30	+ return $text; // string short enough even with HTML
	31	+ }
	32	+ $text = MWTidy::tidy( $text ); // fix tags
	33	+ $displayLen = 0;
	34	+ $doTruncate = true; // truncated string plus '...' shorter than original?
	35	+ $tagType = 0; // 0-open, 1-close
	36	+ $bracketState = 0; // 1-tag start, 2-tag name, 3-tag params, 0-neither
	37	+ $entityState = 0; // 0-not entity, 1-entity
	38	+ $tag = $ret = $ch = $lastCh = '';
	39	+ $openTags = array();
	40	+ for( $pos = 0; $pos < strlen($text); $pos++ ) {
	41	+ $lastCh = $ch;
	42	+ $ch = $text[$pos];
	43	+ if ( $ch == '<' ) {
	44	+ self::onEndBracket( $tag, $tagType, $lastCh, $openTags ); // for bad HTML
	45	+ $entityState = 0; // for bad HTML
	46	+ $bracketState = 1; // tag started (checking for backslash)
	47	+ } elseif ( $ch == '>' ) {
	48	+ self::onEndBracket( $tag, $tagType, $lastCh, $openTags );
	49	+ $entityState = 0; // for bad HTML
	50	+ $bracketState = 0; // out of brackets
	51	+ } elseif ( $bracketState == 1 ) {
	52	+ if ( $ch == '/' ) {
	53	+ $tagType = 1; // close tag
	54	+ } else {
	55	+ $tagType = 0; // open tag
	56	+ $tag .= $ch;
	57	+ }
	58	+ $bracketState = 2; // building tag name
	59	+ } elseif ( $bracketState == 2 ) {
	60	+ if ( $ch != ' ' ) {
	61	+ $tag .= $ch;
	62	+ } else {
	63	+ $bracketState = 3; // name found (e.g. "<a href=...")
	64	+ }
	65	+ } elseif ( $bracketState == 0 ) {
	66	+ if ( $entityState ) {
	67	+ if ( $ch == ';' ) {
	68	+ $entityState = 0;
	69	+ $displayLen++; // entity is one char
	70	+ }
	71	+ } elseif ( $ch == '&' ) {
	72	+ $entityState = 1; // entity found, (e.g. " ")
	73	+ } else {
	74	+ $displayLen++; // not in brackets
	75	+ }
	76	+ }
	77	+ $ret .= $ch;
	78	+ if( !$doTruncate ) continue;
	79	+ # Truncate if not in the middle of a bracket/entity...
	80	+ if ( $bracketState == 0 && $entityState == 0 && $displayLen >= $maxLen ) {
	81	+ $left = substr( $text, $pos + 1 ); // remaining string
	82	+ $left = StringUtils::delimiterReplace( '<', '>', '', $left ); // rm tags
	83	+ $left = StringUtils::delimiterReplace( '&', ';', '', $left ); // rm entities
	84	+ $doTruncate = ( strlen($left) > strlen($ellipsis) );
	85	+ if ( $doTruncate ) {
	86	+ # Hack: go one char over so truncate() will handle multi-byte chars
	87	+ $ret = $wgLang->truncate( $ret . 'x', strlen($ret), '' ) . $ellipsis;
	88	+ break;
	89	+ }
	90	+ }
	91	+ }
	92	+ self::onEndBracket( $tag, $lastCh, $tagType, $openTags ); // for bad HTML
	93	+ while ( count( $openTags ) > 0 ) {
	94	+ $ret .= '</' . array_pop($openTags) . '>'; // close open tags
	95	+ }
	96	+ return $ret;
	97	+ }
	98	+
	99	+ protected function onEndBracket( &$tag, $tagType, $lastCh, &$openTags ) {
	100	+ $tag = ltrim( $tag );
	101	+ if( $tag != '' ) {
	102	+ if( $tagType == 0 && $lastCh != '/' ) {
	103	+ $openTags[] = $tag; // tag opened (didn't close itself)
	104	+ } else if( $tagType == 1 ) {
	105	+ if( $openTags && $tag == $openTags[count($openTags)-1] ) {
	106	+ array_pop( $openTags ); // tag closed
	107	+ }
	108	+ }
	109	+ $tag = '';
	110	+ }
	111	+ }
20	112
21	113	function generalLink( $arr ) {
22	114	$url = $arr[2] . $arr[3];
Index: trunk/extensions/CodeReview/ui/SpecialCode.php
—	—	@@ -151,8 +151,9 @@
152	152	$message = trim( $value );
153	153	$lines = explode( "\n", $message, 2 );
154	154	$first = $lines[0];
155		~~- $trimmed = $wgLang->truncate( $first, 80 );~~
156		~~- return $this->formatMessage( $trimmed );~~
	155	+ $html = $this->formatMessage( $first );
	156	+ $linker = new CodeCommentLinkerHtml( $this->mRepo );
	157	+ return $linker->truncateHtml( $html, 80 );
157	158	}
158	159	/*
159	160	* Formatted HTML array for properties display

Follow-up revisions

Revision	Commit summary	Author	Date
r62506	CodeReview: MWTidy was only introduced in v1.15, so I've fixed the new CodeCo...	happydog	09:51, 15 February 2010
r62588	Follow up r62488, r62538: consistency fix for $maxLen=0 case; no display text...	aaron	19:48, 16 February 2010

Comments

#Comment by Tim Starling (talk | contribs) 08:02, 19 February 2010

Very nice. You'll be rewriting Parser.php in no time.

Next time you write one of these, you should try using strcspn() to skip across long stretches of non-markup characters, it'll be faster for typical test cases.

I probably would have used $remaining instead of $left. It took me a while to figure out that you didn't mean the left hand side of the string.

# Hack: go one char over so truncate() will handle multi-byte chars
$ret = $wgLang->truncate( $ret . 'x', strlen($ret),  ) . $ellipsis;

Obviously this could be done in Language::truncate() itself.

#Comment by Tim Starling (talk | contribs) 08:14, 19 February 2010

By the way, since you're in this area, there's a bug in the general link regex:

> print $c->link("http://foo\nbar")
[http://foo
bar http://foo
bar]

I wish CodeCommentLinkerWiki could just leave URLs alone, letting the parser link them, so that things like <nowiki> would work.

#Comment by Aaron Schulz (talk | contribs) 23:38, 19 February 2010

strcspn() looks like a great way to speed up + simplifiy...working on this now.

#Comment by Aaron Schulz (talk | contribs) 23:39, 19 February 2010

simplify

Status & tagging log

08:02, 19 February 2010 Tim Starling (talk | contribs) changed the status of r62488 [removed: new added: ok]