Index: trunk/extensions/DoubleWiki/DoubleWiki_body.php |
— | — | @@ -19,16 +19,46 @@ |
20 | 20 | |
21 | 21 | class DoubleWiki { |
22 | 22 | |
| 23 | + /* |
| 24 | + * Tags that must be closed. (list copied from Sanitizer.php) |
| 25 | + */ |
| 26 | + var $tags = "/<\/?(b|del|i|ins|u|font|big|small|sub|sup|h1|h2|h3|h4|h5|h6|cite|code|em|s|strike|strong|tt|tr|td|var|div|center|blockquote|ol|ul|dl|table|caption|pre|ruby|rt|rb|rp|p|span)([\s](.*?)>|>)/i"; |
| 27 | + |
/**
 * Constructor.
 *
 * Registers the <iw_align> parser tag (handled by iw_align()) and
 * attaches addMatchedText() to the OutputPageBeforeHTML hook.
 *
 * The callbacks pass the plain object handle: taking a reference to
 * $this was a PHP 4 idiom and is not needed for objects in PHP 5+.
 */
function DoubleWiki() {
    global $wgParser, $wgHooks;
    $wgParser->setHook( 'iw_align', array( $this, 'iw_align' ) );
    $wgHooks['OutputPageBeforeHTML'][] = array( $this, 'addMatchedText' );
}
30 | 36 | |
/*
 * Parser tag hook: wrap the list of matched phrases into a hidden element.
 *
 * The element is a <div id="align-LANG" style="display:none;"> whose
 * content is the trimmed tag body.
 *
 * Both the language code and the tag body are HTML-escaped, because the
 * returned string is emitted as markup. The body is escaped without
 * touching quotes and without double-encoding existing entities, so a
 * phrase written as plain text keeps the form it has in rendered HTML.
 *
 * $input  - raw text found between the opening and closing tag
 * $args   - tag attributes; only 'lang' is read
 * $parser - parser instance handed over by the hook (unused here)
 *
 * Returns the hidden <div> markup, or '' when no lang attribute is given.
 */
function iw_align( $input, $args, $parser ) {
    if ( !isset( $args['lang'] ) ) {
        return '';
    }

    // The language code ends up inside a double-quoted attribute value.
    $lang = htmlspecialchars( $args['lang'], ENT_QUOTES, 'UTF-8' );

    // Neutralise markup in the phrase list; leave quotes and entities as typed.
    $list = htmlspecialchars( trim( $input ), ENT_NOQUOTES, 'UTF-8', false );

    return "<div id=\"align-$lang\" style=\"display:none;\">\n" . $list . "\n</div>";
}
31 | 47 | |
/*
 * Read the list of matched phrases and add tags to the html output.
 *
 * Looks in $text for the hidden <div id="align-LANG"> block, removes the
 * phrase list it contains, and then inserts an empty marker
 *   <span id="dw-N" title="TRANSLATION"/>
 * in front of every occurrence of each listed phrase. Each list line has
 * the form "phrase | translation".
 *
 * $text - page HTML, modified in place
 * $lang - language code selecting which align block to read
 *
 * Returns nothing; $text is left untouched when no align block matches.
 */
function addMatchingTags ( &$text, $lang ) {
    // $lang comes from the request: quote it so it is matched literally
    // and cannot break or alter the regular expression.
    $quoted_lang = preg_quote( $lang, '/' );
    $pattern = "/<div id=\"align-$quoted_lang\" style=\"display:none;\">\n<p>([^<]*?)<\/p>\n<\/div>/is";
    if ( !preg_match( $pattern, $text, $m ) ) {
        return;
    }

    // Empty the hidden block so the phrases listed there are not tagged too.
    $list = $m[1];
    if ( $list !== '' ) {
        $text = str_replace( $list, '', $text );
    }

    // One "phrase | translation" pair per line; the last line may lack
    // its trailing newline.
    $line_pattern = "/\s*([^\|\n]*?)\s*\|\s*([^\|\n]*?)\s*(?:\n|$)/i";
    preg_match_all( $line_pattern, $list, $items, PREG_SET_ORDER );

    foreach ( $items as $n => $item ) {
        $phrase = $item[1];
        if ( $phrase === '' ) {
            // Nothing to anchor the marker to.
            continue;
        }
        // A raw double quote would terminate the title attribute early.
        $title = str_replace( '"', '&quot;', $item[2] );
        $text = str_replace( $phrase, "<span id=\"dw-$n\" title=\"$title\"/>" . $phrase, $text );
    }
}
| 61 | + |
| 62 | + /* |
33 | 63 | * Hook function called with &match=lang |
34 | 64 | * Transform $text into a bilingual version |
35 | 65 | */ |
— | — | @@ -40,6 +70,7 @@ |
41 | 71 | if ( $match_request === '' ) { |
42 | 72 | return true; |
43 | 73 | } |
| 74 | + $this->addMatchingTags ( &$text, $match_request ); |
44 | 75 | |
45 | 76 | foreach( $parserOutput->mLanguageLinks as $l ) { |
46 | 77 | $nt = Title::newFromText( $l ); |
— | — | @@ -76,14 +107,6 @@ |
77 | 108 | $text = preg_replace("/<a href=\"#(.*?)\"/i","<a href=\"#r_\\1\"", $text ); |
78 | 109 | $text = preg_replace("/<li id=\"(.*?)\"/i","<li id=\"r_\\1\"", $text ); |
79 | 110 | |
80 | | - #add tags before h2 and h3 sections |
81 | | - $translation = preg_replace("/<h2>/i","<div title=\"@@h2\"></div>\n<h2>", |
82 | | - $translation ); |
83 | | - $translation = preg_replace("/<h3>/i","<div title=\"@@h3\"></div>\n<h3>", |
84 | | - $translation ); |
85 | | - $text = preg_replace("/<h2>/i","<div title=\"@@h2\"></div>\n<h2>", $text ); |
86 | | - $text = preg_replace("/<h3>/i","<div title=\"@@h3\"></div>\n<h3>", $text ); |
87 | | - |
88 | 111 | #add ?match= to local links of the local wiki |
89 | 112 | $text = preg_replace( "/<a href=\"\/([^\"\?]*)\"/i", |
90 | 113 | "<a href=\"/\\1?match={$match_request}\"", $text ); |
— | — | @@ -99,237 +122,179 @@ |
100 | 123 | } |
101 | 124 | |
102 | 125 | |
103 | | - /** |
104 | | - * Return table with two columns of text |
105 | | - * Text is split into slices based on title tags |
/*
 * Format the text as a two-column table with aligned paragraphs.
 *
 * The left text is cut into slices by find_slices(); each slice but the
 * last is followed by a marker whose title holds the phrase to look for
 * in the right text. Text is accumulated on both sides until such a
 * phrase is found, then both accumulated chunks are split into
 * paragraphs by find_paragraphs() and written out as table rows. If the
 * two sides do not yield the same number of paragraphs, the chunks are
 * written as one single row instead.
 *
 * $left_text,  $right_text  - HTML of the two pages
 * $left_title, $right_title - link captions for the table head
 * $left_url,   $right_url   - link targets (escaped here)
 * $left_lang,  $right_lang  - values for the lang attributes
 *
 * Returns the complete <table> markup.
 *
 * NOTE(review): titles and language codes are inserted as given; this
 * assumes the caller passes HTML-safe values - confirm against caller.
 */
function matchColumns( $left_text, $left_title, $left_url, $left_lang,
    $right_text, $right_title, $right_url, $right_lang ) {

    list( $left_slices, $left_tags ) = $this->find_slices( $left_text );

    $body = '';
    $left_chunk = '';
    $right_chunk = '';
    $last = count( $left_slices ) - 1;

    for ( $i = 0; $i <= $last; $i++ ) {
        $is_last = ( $i == $last );

        // Some slices might be empty. The final one is still processed,
        // so that whatever has been accumulated gets flushed.
        if ( (string)$left_slices[$i] === '' && !$is_last ) {
            continue;
        }

        $found = false;
        $left_chunk .= $left_slices[$i];

        if ( $is_last ) {
            // End of the left text: everything that remains goes along.
            $right_chunk .= $right_text;
            $found = true;
        } else {
            // Look for the requested phrase in the right text. There is
            // one marker less than there are slices, hence the isset().
            $tag = isset( $left_tags[1][$i] ) ? $left_tags[1][$i] : '';
            $a = ( $tag === '' ) ? false : strpos( $right_text, $tag );

            // strpos() returns 0 for a hit at the very start, so compare
            // against false explicitly.
            if ( $a !== false ) {
                $found = true;
                $sub = substr( $right_text, 0, $a );
                // Detect the end of the previous paragraph;
                // the greedy regexp matches the rightmost delimiter.
                if ( preg_match( "/(.*)<\/(p|dl)>/is", $sub, $m ) ) {
                    $right_chunk .= $m[0];
                    $right_text = (string)substr( $right_text, strlen( $m[0] ) );
                }
            }
        }

        // Write rows once there is right-hand text to pair with, or at
        // the very end when left-hand text is still pending.
        $pending = ( $right_chunk !== '' ) || ( $is_last && $left_chunk !== '' );
        if ( $found && $pending ) {
            // Detect paragraphs
            $left_bits = $this->find_paragraphs( $left_chunk );
            $right_bits = $this->find_paragraphs( $right_chunk );

            // Do not align paragraphs if counts are different
            if ( count( $left_bits ) != count( $right_bits ) ) {
                $left_bits = array( $left_chunk );
                $right_bits = array( $right_chunk );
            }

            $left_chunk = '';
            $right_chunk = '';
            $rows = count( $left_bits );
            for ( $l = 0; $l < $rows; $l++ ) {
                $body .=
                    "<tr><td valign=\"top\" style=\"vertical-align:100%;padding-right: 0.5em\" lang=\"{$left_lang}\">"
                    ."<div style=\"width:35em; margin:0px auto\">\n".$left_bits[$l]."</div>"
                    ."</td>\n<td valign=\"top\" style=\"padding-left: 0.5em\" lang=\"{$right_lang}\">"
                    ."<div style=\"width:35em; margin:0px auto\">\n".$right_bits[$l]."</div>"
                    ."</td></tr>\n";
            }
        }
    }

    // format table head and return results
    $left_url = htmlspecialchars( $left_url );
    $right_url = htmlspecialchars( $right_url );
    $head =
        "<table id=\"doubleWikiTable\" width=\"100%\" border=\"0\" bgcolor=\"white\" rules=\"cols\" cellpadding=\"0\">
<colgroup><col width=\"50%\"/><col width=\"50%\"/></colgroup><thead>
<tr><td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$left_lang}\">
<a href=\"{$left_url}\">{$left_title}</a></td>
<td bgcolor=\"#cfcfff\" align=\"center\" lang=\"{$right_lang}\">
<a href=\"{$right_url}\" class='extiw'>{$right_title}</a>
</td></tr></thead>\n";
    return $head . $body . "</table>" ;
}
296 | 208 | |
297 | 209 | |
| 210 | + |
/*
 * Split text and return a set of html-balanced paragraphs.
 *
 * The text is scanned for the tags matched by $this->tags while a
 * nesting counter is kept. A piece is closed whenever a </p> or </dl>
 * brings the counter back to zero. Whatever is left after the last such
 * point is returned as a final piece, including text that follows the
 * last tag; a left-over made of whitespace only is dropped.
 *
 * $text - HTML fragment
 *
 * Returns an array of HTML strings (empty for empty input).
 */
function find_paragraphs( $text ) {
    $result = array();
    $bits = preg_split( $this->tags, $text );
    preg_match_all( $this->tags, $text, $m, PREG_SET_ORDER );
    $total = count( $m );

    $counter = 0;
    $out = '';
    for ( $i = 0; $i < $total; $i++ ) {
        $t = $m[$i][0];

        if ( substr( $t, -2 ) == '/>' ) {
            // Self-closing tag: opens nothing, nesting depth unchanged.
        } elseif ( substr( $t, 0, 2 ) != '</' ) {
            $counter++;
        } else {
            $counter--;
        }

        $out .= $bits[$i] . $t;

        // The tag pattern is case-insensitive, so compare in lower case.
        $closing = strtolower( $t );
        if ( $counter == 0 && ( $closing == '</p>' || $closing == '</dl>' ) ) {
            $result[] = $out;
            $out = '';
        }
    }

    // preg_split() returns one more piece than there are tags: the text
    // behind the last tag. Keep it unless it is nothing but whitespace.
    $tail = isset( $bits[$total] ) ? $bits[$total] : '';
    if ( $out !== '' || trim( $tail ) !== '' ) {
        $result[] = $out . $tail;
    }
    return $result;
}
331 | 238 | |
332 | | - return array($counter, $opening, $closure); |
333 | 239 | |
/*
 * Split text and return a set of html-balanced slices.
 *
 * The text is cut at every <span id="dw-..." title="..."/> marker.
 * Slice $i is the text in front of marker $i; the last slice is the text
 * behind the last marker.
 *
 * $left_text - HTML containing the markers
 *
 * Returns array( $slices, $tags ): $slices is the list of HTML strings
 * (some may be empty), $tags is the preg_match_all() result for the
 * markers in PREG_PATTERN_ORDER, so $tags[1] holds the title values.
 */
function find_slices( $left_text ) {

    $tag_pattern = "/<span id=\"dw-[^\"]*\" title=\"([^\"]*)\"\/>/i";
    $left_slices = preg_split( $tag_pattern, $left_text );
    preg_match_all( $tag_pattern, $left_text, $left_tags, PREG_PATTERN_ORDER );
    $n = count( $left_slices );

    /*
     * Make slices that are full paragraphs: the text from the last <p>
     * or <dl> of a slice onwards is moved to the front of the next one.
     * If two slices correspond to the same paragraph, the second one
     * will be empty.
     */
    for ( $i = 0; $i < $n - 1; $i++ ) {
        $str = $left_slices[$i];
        if ( preg_match( "/(.*)<(p|dl)>/is", $str, $m ) ) {
            $left_slices[$i] = $m[1];
            $left_slices[$i+1] = substr( $str, strlen( $m[1] ) ) . $left_slices[$i+1];
        }
    }

    /*
     * Keep only slices that contain balanced html.
     * If a slice is unbalanced, we merge it with the next one.
     * The first and last slices are compensated: tags left open by the
     * first slice are closed at its end and re-opened (without their
     * attributes) in front of the last slice.
     */
    $opening = '';
    for ( $i = 0; $i < $n; $i++ ) {
        preg_match_all( $this->tags, $left_slices[$i], $m, PREG_SET_ORDER );

        $counter = 0;       // opening minus closing tags in this slice
        $stack = array();   // tags currently open; only read for slice 0
        foreach ( $m as $t ) {
            if ( substr( $t[0], -2 ) == '/>' ) {
                // Self-closing tag: does not affect the balance.
                continue;
            }
            if ( substr( $t[0], 0, 2 ) != '</' ) {
                $counter++;
                array_push( $stack, $t );
            } else {
                array_pop( $stack );
                $counter--;
            }
        }

        if ( $i == 0 ) {
            $closure = '';
            for ( $k = 0; $k < $counter; $k++ ) {
                $opening .= "<" . $stack[$k][1] . ">";
                $closure = "</" . $stack[$k][1] . ">" . $closure;
            }
            $left_slices[$i] = $left_slices[$i] . $closure;
        } else if ( $i == $n - 1 ) {
            $left_slices[$i] = $opening . $left_slices[$i];
        } else if ( $counter != 0 ) {
            $left_slices[$i+1] = $left_slices[$i] . $left_slices[$i+1];
            $left_slices[$i] = '';
        }
    }
    return array( $left_slices, $left_tags );
}
335 | 300 | |
336 | 301 | } |