r71627 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r71626‎ | r71627 | r71628 >
Date:15:17, 25 August 2010
Author:thomasv
Status:resolved (Comments)
Tags:
Comment:
Major refactoring :
*new tag hook where users provide text alignment information.
*simplified matching algorithm : give up on variable nesting levels, this was too complicated.
*improved matching of paragraphs : always match balanced paragraphs.
Modified paths:
  • /trunk/extensions/DoubleWiki/DoubleWiki_body.php (modified) (history)

Diff [purge]

Index: trunk/extensions/DoubleWiki/DoubleWiki_body.php
@@ -19,16 +19,46 @@
2020
2121 class DoubleWiki {
2222
 23+ /*
 24+ * Tags that must be closed. (list copied from Sanitizer.php)
 25+ */
 26+ var $tags = "/<\/?(b|del|i|ins|u|font|big|small|sub|sup|h1|h2|h3|h4|h5|h6|cite|code|em|s|strike|strong|tt|tr|td|var|div|center|blockquote|ol|ul|dl|table|caption|pre|ruby|rt|rb|rp|p|span)([\s](.*?)>|>)/i";
 27+
2328 /**
2429 * Constructor
2530 */
2631 function DoubleWiki() {
2732 global $wgParser, $wgHooks;
 33+ $wgParser->setHook( 'iw_align' , array( &$this, 'iw_align' ) );
2834 $wgHooks['OutputPageBeforeHTML'][] = array( &$this, 'addMatchedText' );
2935 }
3036
 37+ /*
 38+ * Wrap the list of matched phrases into a hidden element.
 39+ */
 40+ function iw_align( $input, $args, $parser ) {
 41+ if ( isset( $args['lang'] ) ) {
 42+ $lang = $args['lang'];
 43+ return "<div id=\"align-$lang\" style=\"display:none;\">\n" . trim( $input ). "\n</div>";
 44+ }
 45+ return '';
 46+ }
3147
3248 /*
 49+ * Read the list of matched phrases and add tags to the html output.
 50+ */
 51+ function addMatchingTags ( &$text, $lang ) {
 52+ $pattern = "/<div id=\"align-$lang\" style=\"display:none;\">\n<p>([^<]*?)<\/p>\n<\/div>/is";
 53+ if( ! preg_match( $pattern, $text, $m ) ) return ;
 54+ $text = str_replace( $m[1], '', $text );
 55+ $line_pattern = "/\s*([^\|\n]*?)\s*\|\s*([^\|\n]*?)\s*\n/i";
 56+ preg_match_all( $line_pattern, $m[1], $items, PREG_SET_ORDER );
 57+ foreach( $items as $n => $i ) {
 58+ $text = str_replace( $i[1], "<span id=\"dw-$n\" title=\"{$i[2]}\"/>".$i[1], $text );
 59+ }
 60+ }
 61+
 62+ /*
3363 * Hook function called with &match=lang
3464 * Transform $text into a bilingual version
3565 */
@@ -40,6 +70,7 @@
4171 if ( $match_request === '' ) {
4272 return true;
4373 }
 74+ $this->addMatchingTags ( &$text, $match_request );
4475
4576 foreach( $parserOutput->mLanguageLinks as $l ) {
4677 $nt = Title::newFromText( $l );
@@ -76,14 +107,6 @@
77108 $text = preg_replace("/<a href=\"#(.*?)\"/i","<a href=\"#r_\\1\"", $text );
78109 $text = preg_replace("/<li id=\"(.*?)\"/i","<li id=\"r_\\1\"", $text );
79110
80 - #add tags before h2 and h3 sections
81 - $translation = preg_replace("/<h2>/i","<div title=\"@@h2\"></div>\n<h2>",
82 - $translation );
83 - $translation = preg_replace("/<h3>/i","<div title=\"@@h3\"></div>\n<h3>",
84 - $translation );
85 - $text = preg_replace("/<h2>/i","<div title=\"@@h2\"></div>\n<h2>", $text );
86 - $text = preg_replace("/<h3>/i","<div title=\"@@h3\"></div>\n<h3>", $text );
87 -
88111 #add ?match= to local links of the local wiki
89112 $text = preg_replace( "/<a href=\"\/([^\"\?]*)\"/i",
90113 "<a href=\"/\\1?match={$match_request}\"", $text );
@@ -99,237 +122,179 @@
100123 }
101124
102125
103 - /**
104 - * Return table with two columns of text
105 - * Text is split into slices based on title tags
 126+ /*
 127+ * Format the text as a two-column table with aligned paragraphs
106128 */
 129+ function matchColumns( $left_text, $left_title, $left_url, $left_lang,
 130+ $right_text, $right_title, $right_url, $right_lang ) {
107131
108 - function matchColumns( $left_text, $left_title, $left_url, $left_lang_code,
109 - $right_text, $right_title, $right_url, $right_lang_code ) {
 132+ list( $left_slices, $left_tags ) = $this->find_slices( $left_text );
110133
111 - # note about emdedding:
112 - # text is split only at a single level.
113 - # initially we assume that this level is zero
114 - # if nesting is encountered before the
115 - # first paragraph, then this split level is increased
116 - # we keep track of the current nesting level during processing
117 - # if (current level != split level) then we do not split the text
118 -
119 - # the current level of embedding (stack depth)
120 - $left_nesting = 0;
121 - $right_nesting = 0;
122 -
123 - #the level of embedding where the text is split
124 - #initial value is -1 until actual value is known
125 - $left_splitlevel = -1;
126 - $right_splitlevel = -1;
127 -
128 - # split text
129 - $tag_pattern = "/<div title=\"([^\"]*)\"><\/div>/i";
130 - $left_slices = preg_split( $tag_pattern, $left_text );
131 - $right_slices = preg_split( $tag_pattern, $right_text );
132 - preg_match_all( $tag_pattern, $left_text, $left_tags, PREG_PATTERN_ORDER );
133 - preg_match_all( $tag_pattern, $right_text, $right_tags, PREG_PATTERN_ORDER );
134 -
135 - /**
136 - * Order slices in a two-column array.
137 - * slices that are surrounded by the same tag belong in the same line
138 - * $i indexes the left column, $j the right column.
139 - */
140134 $body = '';
141135 $left_chunk = '';
142136 $right_chunk = '';
143 -
144 - $j=0;
145 - $max_i = count( $left_slices );
146 - for ( $i=0 ; $i < $max_i ; $i++ ) {
 137+
 138+ for ( $i=0 ; $i < count($left_slices) ; $i++ ) {
 139+
 140+ // some slices might be empty
 141+ if( $left_slices[$i] == '' ) {
 142+ continue;
 143+ }
 144+
147145 $found = false;
 146+ $tag = $left_tags[1][$i];
148147 $left_chunk .= $left_slices[$i];
149 -
150 - $max_k = count( $right_slices );
151148
152149 # if we are at the end of the loop, finish quickly
153 - if ( $i==$max_i - 1 ) {
154 - for ( $k=$j ; $k < $max_k ; $k++ ) $right_chunk .= $right_slices[$k];
 150+ if ( $i== count( $left_slices ) - 1 ) {
 151+ $right_chunk .= $right_text;
155152 $found = true;
156 - }
157 - else for ( $k=$j ; $k < $max_k ; $k++ ) {
158 -
 153+ } else {
159154 #look for requested tag in the text
160 - $a = strpos ( $right_slices[$k], $left_tags[1][$i] );
 155+ $a = strpos ( $right_text, $tag );
161156 if( $a ) {
162 - #go to beginning of paragraph
163 - #this regexp matches the rightmost delimiter
164 - $sub = substr( $right_slices[$k], 0, $a);
165 - if ( preg_match("/(.*)<(p|dl)>/is", $sub, $matches ) ){
166 - $right_chunk .= $matches[1];
167 - $right_slices[$k] = substr( $right_slices[$k], strlen($matches[1]) );
 157+ $found = true;
 158+ $sub = substr( $right_text, 0, $a);
 159+ // detect the end of previous paragraph
 160+ // regexp matches the rightmost delimiter
 161+ if ( preg_match("/(.*)<\/(p|dl)>/is", $sub, $m ) ) {
 162+ $right_chunk .= $m[0];
 163+ $right_text = substr( $right_text, strlen($m[0]) );
168164 }
169 - else {
170 - $right_chunk .= $sub;
171 - $right_slices[$k] = substr( $right_slices[$k], $a );
172 - }
173 -
174 - $found = true;
175 - $j = $k;
176 - break;
 165+ #} else {
 166+ # print "<br/>tag not found ".$tag;
177167 }
 168+ }
178169
179 - $right_chunk .= $right_slices[$k];
 170+ if( $found && $right_chunk ) {
 171+ // Detect paragraphs
 172+ $left_bits = $this->find_paragraphs( $left_chunk );
 173+ $right_bits = $this->find_paragraphs( $right_chunk );
180174
181 - if( $k < $max_k - 1 ) {
182 - if( $left_tags[0][$i] == $right_tags[0][$k] ) {
183 - $found = true;
184 - $j = $k+1;
185 - break;
186 - }
 175+ // $body .= "<tr style=\"background-color:#ffdddd;\"><td>".count($left_bits)."</td><td>".count($right_bits)."</td></tr>\n";
 176+ // Do not align paragraphs if counts are different
 177+ if ( count( $left_bits ) != count( $right_bits ) ) {
 178+ $left_bits = Array( $left_chunk );
 179+ $right_bits = Array( $right_chunk );
187180 }
188 - }
189 - if( $found ) {
190181
191 - #split chunks into smaller units (paragraphs)
192 - $paragraph_tags = "/<(p|dl)>/i";
193 - $left_bits = preg_split( $paragraph_tags, $left_chunk );
194 - $right_bits = preg_split( $paragraph_tags, $right_chunk );
195 - preg_match_all( $paragraph_tags, $left_chunk, $left_seps, PREG_PATTERN_ORDER );
196 - preg_match_all( $paragraph_tags, $right_chunk, $right_seps, PREG_PATTERN_ORDER );
197 -
198182 $left_chunk = '';
199183 $right_chunk = '';
200 -
201 - # add separators that were cut off
202 - for($l=1; $l < count( $left_bits ); $l++ ) {
203 - $left_bits[$l] = $left_seps[0][$l-1].$left_bits[$l];
 184+ for($l=0; $l < count( $left_bits ) ; $l++ ) {
 185+ $body .=
 186+ "<tr><td valign=\"top\" style=\"vertical-align:100%;padding-right: 0.5em\" lang=\"{$left_lang}\">"
 187+ ."<div style=\"width:35em; margin:0px auto\">\n".$left_bits[$l]."</div>"
 188+ ."</td>\n<td valign=\"top\" style=\"padding-left: 0.5em\" lang=\"{$right_lang}\">"
 189+ ."<div style=\"width:35em; margin:0px auto\">\n".$right_bits[$l]."</div>"
 190+ ."</td></tr>\n";
204191 }
205 - for($l=1; $l < count( $right_bits ); $l++ ) {
206 - $right_bits[$l] = $right_seps[0][$l-1].$right_bits[$l];
207 - }
208 -
209 - $max = max( count( $left_bits ) , count( $right_bits ));
210 - # initialize missing elements
211 - for($l= count( $left_bits ); $l<$max; $l++) $left_bits[$l]='';
212 - for($l= count( $right_bits ); $l<$max; $l++) $right_bits[$l]='';
213 -
214 - for($l=0; $l < $max; $l++ ) {
215 -
216 - list($left_delta,$left_o,$left_c) = $this->nesting_delta( $left_bits[$l] );
217 - list($right_delta,$right_o,$right_c) = $this->nesting_delta( $right_bits[$l] );
218 -
219 - $left_nesting = $left_nesting + $left_delta;
220 - $right_nesting = $right_nesting + $right_delta;
221 -
222 - #are we at the end?
223 - $the_end = ($l == $max-1) && ($i == $max_i -1 );
224 -
225 - if(( $left_splitlevel == -1) && ($right_splitlevel == -1)) {
226 - $left_splitlevel = $left_nesting;
227 - $right_splitlevel = $right_nesting;
228 - $left_opening = $left_o;
229 - $right_opening = $right_o;
230 - $left_closure = $left_c;
231 - $right_closure = $right_c;
232 -
233 - $left_prefix = '';
234 - $right_prefix = '';
235 - $left_suffix = $left_closure;
236 - $right_suffix = $right_closure;
237 - }
238 - else if($the_end) {
239 - $left_prefix = $left_opening;
240 - $right_prefix = $right_opening;
241 - $left_suffix = '';
242 - $right_suffix = '';
243 - }
244 - else {
245 - $left_prefix = $left_opening;
246 - $right_prefix = $right_opening;
247 - $left_suffix = $left_closure;
248 - $right_suffix = $right_closure;
249 - }
250 -
251 - if( ( ($left_nesting == $left_splitlevel)
252 - && ($right_nesting == $right_splitlevel) ) || $the_end) {
253 - $body .=
254 - "<tr><td valign=\"top\" style=\"padding-right: 0.5em\" lang=\"{$left_lang_code}\">"
255 - ."<div style=\"width:35em; margin:0px auto\">\n"
256 - .$left_prefix.$left_bits[$l].$left_suffix
257 - ."</div>"
258 -
259 - ."</td>\n<td valign=\"top\" style=\"padding-left: 0.5em\" lang=\"{$right_lang_code}\">"
260 - ."<div style=\"width:35em; margin:0px auto\">\n"
261 - .$right_prefix.$right_bits[$l].$right_suffix
262 - ."</div>"
263 - ."</td></tr>\n";
264 - }
265 - else {
266 - # procrastinate
267 - $left_nesting = $left_nesting - $left_delta;
268 - $right_nesting = $right_nesting - $right_delta;
269 - if ($l < $max-1) {
270 - $left_bits[$l+1] = $left_bits[$l] . $left_bits[$l+1];
271 - $right_bits[$l+1] = $right_bits[$l] . $right_bits[$l+1];
272 - } else {
273 - $left_chunk = $left_bits[$l] ;
274 - $right_chunk = $right_bits[$l];
275 - }
276 - }
277 - }
278192 }
279 - else{ $right_chunk='';}
280193 }
281194
282 -
283 - # format table head and return results
 195+ // format table head and return results
284196 $left_url = htmlspecialchars( $left_url );
285197 $right_url = htmlspecialchars( $right_url );
286198 $head =
287 -"<table width=\"100%\" border=\"0\" bgcolor=\"white\" rules=\"cols\" cellpadding=\"0\">
 199+ "<table id=\"doubleWikiTable\" width=\"100%\" border=\"0\" bgcolor=\"white\" rules=\"cols\" cellpadding=\"0\">
288200 <colgroup><col width=\"50%\"/><col width=\"50%\"/></colgroup><thead>
289 -<tr><td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$left_lang_code}\">
 201+<tr><td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$left_lang}\">
290202 <a href=\"{$left_url}\">{$left_title}</a></td>
291 -<td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$right_lang_code}\">
 203+<td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$right_lang}\">
292204 <a href=\"{$right_url}\" class='extiw'>{$right_title}</a>
293205 </td></tr></thead>\n";
294 - return $head.$body."</table>" ;
 206+ return $head . $body . "</table>" ;
295207 }
296208
297209
 210+
298211 /*
299 - * returns how much the stack is changed
300 - * also returns opening and closing sequences of tag
 212+ * Split text and return a set of html-balanced paragraphs
301213 */
302 - function nesting_delta ( $text ) {
303 - #tags that must be closed. (list copied from Sanitizer.php)
304 - $tags = "/<\/?(b|del|i|ins|u|font|big|small|sub|sup|h1|h2|h3|h4|h5|h6|"
305 - ."cite|code|em|s|strike|strong|tt|tr|td|var|div|center|blockquote|ol|ul|dl|"
306 - ."table|caption|pre|ruby|rt|rb|rp|p|span)([\s](.*?)>|>)/i";
307 - preg_match_all( $tags, $text, $m, PREG_SET_ORDER);
308 -
309 - $stack = array();
 214+ function find_paragraphs( $text ) {
 215+ $result = Array();
 216+ $bits = preg_split( $this->tags, $text );
 217+ preg_match_all( $this->tags, $text, $m, PREG_SET_ORDER);
310218 $counter = 0;
311 - $opening = '';
312 - $closure = '';
 219+ $out = '';
313220 for($i=0; $i < count($m); $i++){
314 - $t = $m[$i];
315 - if( substr( $t[0], 0, 2) != "</" ){
 221+ $t = $m[$i][0];
 222+ if( substr( $t, 0, 2) != "</" ) {
316223 $counter++;
317 - array_push($stack, $t);
318224 } else {
319 - $tt = array_pop($stack);
320225 $counter--;
321 - #if( ($tt != null) && ($tt[1] != $t[1]) ) {
322 - # #input html is buggy...
323 - # echo "Warning: ".$t[1]." encountered, expected ".$tt[1]."<br />\n";
324 - #}
325226 }
 227+ $out .= $bits[$i] . $t;
 228+ if( ($t == "</p>" || $t == "</dl>" ) && $counter==0 ) {
 229+ $result[] = $out;
 230+ $out = '';
 231+ }
326232 }
327 - for($i=0; $i<$counter; $i++){
328 - $opening .= $stack[$i][0];
329 - $closure = "</".$stack[$i][1].">".$closure;
 233+ if($out) {
 234+ $result[] = $out;
330235 }
 236+ return $result;
 237+ }
331238
332 - return array($counter, $opening, $closure);
333239
 240+ /*
 241+ * Split text and return a set of html-balanced slices
 242+ */
 243+ function find_slices( $left_text ) {
 244+
 245+ $tag_pattern = "/<span id=\"dw-[^\"]*\" title=\"([^\"]*)\"\/>/i";
 246+ $left_slices = preg_split( $tag_pattern, $left_text );
 247+ preg_match_all( $tag_pattern, $left_text, $left_tags, PREG_PATTERN_ORDER );
 248+ $n = count( $left_slices);
 249+
 250+ /*
 251+ * Make slices that are full paragraphs
 252+ * If two slices correspond to the same paragraph, the second one will be empty
 253+ */
 254+ for ( $i=0 ; $i < $n - 1 ; $i++ ) {
 255+ $str = $left_slices[$i];
 256+ if ( preg_match("/(.*)<(p|dl)>/is", $str, $m ) ) {
 257+ $left_slices[$i] = $m[1];
 258+ $left_slices[$i+1] = substr( $str, strlen($m[1]) ) . $left_slices[$i+1];
 259+ }
 260+ }
 261+
 262+ /*
 263+ * Keep only slices that contain balanced html
 264+ * If a slice is unbalanced, we merge it with the next one.
 265+ * The first and last slices are compensated.
 266+ */
 267+ $stack = array();
 268+ $counter = 0;
 269+ for( $i=0 ; $i < $n ; $i++) {
 270+ $bits = preg_split( $this->tags, $left_slices[$i] );
 271+ preg_match_all( $this->tags, $left_slices[$i], $m, PREG_SET_ORDER);
 272+ $counter = 0;
 273+ for($k=0 ; $k < count($m) ; $k++) {
 274+ $t = $m[$k];
 275+ if( substr( $t[0], 0, 2) != "</" ) {
 276+ $counter++;
 277+ array_push($stack, $t);
 278+ } else {
 279+ $tt = array_pop($stack);
 280+ $counter--;
 281+ }
 282+ }
 283+ if( $i==0 ) {
 284+ $opening = '';
 285+ $closure = '';
 286+ for( $k=0; $k < $counter ; $k++ ) {
 287+ $opening .= "<".$stack[$k][1].">";
 288+ $closure = "</".$stack[$k][1].">" . $closure;
 289+ }
 290+ $left_slices[$i] = $left_slices[$i] . $closure;
 291+ } else if( $i == $n - 1 ) {
 292+ $left_slices[$i] = $opening . $left_slices[$i];
 293+ } else if( $counter != 0 ) {
 294+ $left_slices[$i+1] = $left_slices[$i] . $left_slices[$i+1];
 295+ $left_slices[$i] = '';
 296+ }
 297+ }
 298+ return array($left_slices, $left_tags);
334299 }
335300
336301 }

Follow-up revisions

Revision | Commit summary | Author | Date
r81674 | Fixup fixme set by me on r71627... | reedy | 01:06, 8 February 2011

Comments

#Comment by Reedy (talk | contribs)   00:52, 8 February 2011
			if( $i==0 ) {
				$opening = '';
				$closure = '';
				for( $k=0; $k < $counter ; $k++ ) {
					$opening .= "<".$stack[$k][1].">";
					$closure = "</".$stack[$k][1].">" . $closure;
				}
				$left_slices[$i] = $left_slices[$i] . $closure;
			} else if( $i == $n - 1 ) {
				$left_slices[$i] = $opening . $left_slices[$i];
			} else if( $counter != 0 ) {
				$left_slices[$i + 1] = $left_slices[$i] . $left_slices[$i+1];
				$left_slices[$i] = '';
			}

$opening is undefined in the else if, and then what you assign to $opening is never assigned anywhere

#Comment by MarkAHershberger (talk | contribs)   00:58, 8 February 2011

$opening is assigned the first time through the loop, so it is assigned by the time $i == $n - 1

This is very gross, but probably not a FIXME.

#Comment by Reedy (talk | contribs)   01:04, 8 February 2011
for ( $i = 0; $i < 2; $i++ ) {

if ( $i == 0 ) {
$blah = 'foo';
} else {
echo $blah;
}
}

That echoes foo once, so you're right

/me goes to find a corner to be sick in

Status & tagging log