r39401 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r39400‎ | r39401 | r39402 >
Date:10:45, 15 August 2008
Author:guyvdb
Status:old
Tags:
Comment:
New diff implementation
Modified paths:
  • /branches/visual_diff/phase3/includes/Diff.php (modified) (history)
  • /branches/visual_diff/phase3/includes/DifferenceEngine.php (modified) (history)

Diff [purge]

Index: branches/visual_diff/phase3/includes/Diff.php
@@ -18,454 +18,487 @@
1919 */
2020
2121 /**
22 - * Implementation of the Diff algorithm.
23 - *
24 - * DO NOT USE ON PRODUCTION SYSTEMS
25 - *
26 - * The algorithm is based on the O(NP) LCS algorithm as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
27 - * and extended with my own ideas.
 22+ * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which
 23+ * in turn is based on Myers' "An O(ND) difference algorithm and its variations"
 24+ * (http://citeseer.ist.psu.edu/myers86ond.html) with range compression (see Wu et al.'s
 25+ * "An O(NP) Sequence Comparison Algorithm").
2826 *
29 - * @return array($from_removed, $to_added)
30 - * TRUE if the token was removed or added.
 27+ * This implementation supports an upper bound on the excution time.
3128 *
 29+ * Complexity: O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M + N) space
 30+ *
3231 * @author Guy Van den Broeck
 32+ *
3333 */
34 -function wikidiff3_diff(/*array*/ $from, /*array*/ $to, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
35 - wfProfileIn( __METHOD__ );
 34+class WikiDiff3{
3635
37 - $m = sizeof($from);
38 - $n = sizeof($to);
 36+ //Input variables
 37+ private $from;
 38+ private $to;
 39+ private $m;
 40+ private $n;
3941
40 - $result_from = array_fill(0,$m,TRUE);
41 - $result_to = array_fill(0,$n,TRUE);
 42+ private $tooLong;
 43+ private $powLimit;
4244
43 - //reduce the complexity for the next step (intentionally done twice)
44 - //remove common tokens at the start
45 - $i=0;
46 - while($i<$m && $i<$n && $from[$i]===$to[$i]){
47 - $result_from[$i] = $result_to[$i] = FALSE;
48 - unset($from[$i],$to[$i]);
49 - ++$i;
 45+ //State variables
 46+ private $maxDifferences;
 47+
 48+ //Output variables
 49+ public $length;
 50+ public $added;
 51+ public $removed;
 52+ public $heuristicUsed;
 53+
 54+ function __construct($tooLong = 2000000, $powLimit = 1.45){
 55+ $this->tooLong = $tooLong;
 56+ $this->powLimit = $powLimit;
5057 }
5158
52 - //remove common tokens at the end
53 - $j=1;
54 - while($i+$j<=$m && $i+$j<=$n && $from[$m-$j]===$to[$n-$j]){
55 - $result_from[$m-$j] = $result_to[$n-$j] = FALSE;
56 - unset($from[$m-$j],$to[$n-$j]);
57 - ++$j;
58 - }
 59+ public function diff(/*array*/ $from, /*array*/ $to){
 60+ $m = sizeof($from);
 61+ $n = sizeof($to);
 62+
 63+ $this->heuristicUsed = FALSE;
5964
60 - $newFrom = $newFromIndex = $newTo = $newToIndex = array();
 65+ $result_from = $m>0?array_fill(0,$m,true):array();
 66+ $result_to = $n>0?array_fill(0,$n,true):array();
6167
62 - //remove tokens not in both sequences
63 - $shared = array_fill_keys($from,FALSE);
64 - foreach($to as $index => $el){
65 - if(array_key_exists($el,$shared)){
66 - //keep it
67 - $shared[$el] = TRUE;
68 - $newTo[] = $el;
69 - $newToIndex[] = $index;
 68+ //reduce the complexity for the next step (intentionally done twice)
 69+ //remove common tokens at the start
 70+ $i=0;
 71+ while($i<$m && $i<$n && $from[$i]===$to[$i]){
 72+ $result_from[$i] = $result_to[$i] = false;
 73+ unset($from[$i],$to[$i]);
 74+ ++$i;
7075 }
71 - }
72 - foreach($from as $index => $el){
73 - if($shared[$el]){
74 - //keep it
75 - $newFrom[] = $el;
76 - $newFromIndex[] = $index;
 76+
 77+ //remove common tokens at the end
 78+ $j=1;
 79+ while($i+$j<=$m && $i+$j<=$n && $from[$m-$j]===$to[$n-$j]){
 80+ $result_from[$m-$j] = $result_to[$n-$j] = false;
 81+ unset($from[$m-$j],$to[$n-$j]);
 82+ ++$j;
7783 }
78 - }
7984
80 - unset($from, $to, $shared);
 85+ $newFrom = $newFromIndex = $newTo = $newToIndex = array();
8186
82 - $m2 = sizeof($newFrom);
83 - $n2 = sizeof($newTo);
84 - $offsetx = $offsety = 0;
85 - $from_inLcs = new InLcs($m2);
86 - $to_inLcs = new InLcs($n2);
 87+ //remove tokens not in both sequences
 88+ $shared = array_fill_keys($from,false);
 89+ foreach($to as $index => $el){
 90+ if(array_key_exists($el,$shared)){
 91+ //keep it
 92+ $this->to[] = $el;
 93+ $shared[$el] = true;
 94+ $newToIndex[] = $index;
 95+ }
 96+ }
 97+ foreach($from as $index => $el){
 98+ if($shared[$el]){
 99+ //keep it
 100+ $this->from[] = $el;
 101+ $newFromIndex[] = $index;
 102+ }
 103+ }
87104
88 - wikidiff3_diffPart($newFrom, $newTo, $from_inLcs, $to_inLcs, $m2, $n2, $offsetx, $offsety, $m2, $boundRunningTime, $max_NP_before_bound);
 105+ unset($shared);
89106
90 - foreach($from_inLcs->inLcs as $key => $in){
91 - if($in){
92 - $result_from[$newFromIndex[$key]] = FALSE;
 107+ $this->m = sizeof($this->from);
 108+ $this->n = sizeof($this->to);
 109+ $this->removed = $this->m>0?array_fill(0,$this->m,true):array();
 110+ $this->added = $this->n>0?array_fill(0,$this->n,true):array();
 111+
 112+ $this->longestCommonSubsequence();
 113+
 114+ $this->m = $m;
 115+ $this->n = $n;
 116+ $this->length += $i+$j-1;
 117+
 118+ foreach($this->removed as $key => $removed){
 119+ if(!$removed){
 120+ $result_from[$newFromIndex[$key]] = false;
 121+ }
93122 }
94 - }
95 - foreach($to_inLcs->inLcs as $key => $in){
96 - if($in){
97 - $result_to[$newToIndex[$key]] = FALSE;
 123+ foreach($this->added as $key => $added){
 124+ if(!$added){
 125+ $result_to[$newToIndex[$key]] = false;
 126+ }
98127 }
 128+ unset($this->added, $this->removed);
 129+ return array($result_from, $result_to);
99130 }
100 -
101 - wfProfileOut( __METHOD__ );
102 - return array($result_from, $result_to);
103 -}
104131
105 -function wikidiff3_diffPart(array $a, array $b, InLcs $a_inLcs, InLcs $b_inLcs, $m, $n, $offsetx, $offsety, $bestKnownLcs, $boundRunningTime=FALSE, $max_NP_before_bound = 800000){
106 - if($bestKnownLcs==0 || $m==0 || $n==0){
107 - return;
108 - }
109 -
110 - if($m>$n){
111 - return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
112 - }
 132+ public function longestCommonSubsequence() {
 133+ if ($this->m == 0 || $this->n == 0) {
 134+ $this->length = 0;
 135+ return;
 136+ }
113137
114 - $a_inLcs_sym = &$a_inLcs->inLcs;
115 - $b_inLcs_sym = &$b_inLcs->inLcs;
 138+ $this->maxDifferences = ceil(($this->m + $this->n) / 2.0);
 139+ if ($this->m * $this->n > $this->tooLong) {
 140+ // limit complexity to D^POW_LIMIT for long sequences
 141+ $this->maxDifferences = floor(pow($this->maxDifferences, $this->powLimit - 1.0));
 142+ wfDebug("Limiting max number of differences to $this->maxDifferences");
 143+ }
116144
117 - //remove common tokens at the start
118 - while($m>0 && $n>0 && $a[0]===$b[0]){
119 - $a_inLcs_sym[$offsetx] = $b_inLcs_sym[$offsety] = TRUE;
120 - ++$offsetx; ++$offsety;
121 - --$m; --$n;
122 - --$bestKnownLcs;
123 - array_shift($a);
124 - array_shift($b);
125 - }
 145+ /*
 146+ * The common prefixes and suffixes are always part of some LCS, include
 147+ * them now to reduce our search space
 148+ */
 149+ $max = min($this->m, $this->n);
 150+ for ($forwardBound = 0; $forwardBound < $max
 151+ && $this->from[$forwardBound]===$this->to[$forwardBound]; ++$forwardBound) {
 152+ $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
 153+ }
126154
127 - //remove common tokens at the end
128 - while($m>0 && $n>0 && $a[$m-1]===$b[$n-1]){
129 - $a_inLcs_sym[$offsetx+$m-1] = $b_inLcs_sym[$offsety+$n-1] = TRUE;
130 - --$m; --$n;
131 - --$bestKnownLcs;
132 - array_pop($a);
133 - array_pop($b);
134 - }
 155+ $backBoundL1 = $this->m - 1;
 156+ $backBoundL2 = $this->n - 1;
135157
136 - if($bestKnownLcs==0 || $m==0 || $n==0){
137 - return;
 158+ while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
 159+ && $this->from[$backBoundL1] === $this->to[$backBoundL2]) {
 160+ $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
 161+ }
 162+
 163+ $temp = array_fill(0,$this->m + $this->n + 1, 0);
 164+ $V = array($temp,$temp);
 165+ $snake = array(0,0,0);
 166+
 167+ $this->length = $forwardBound + $this->m - $backBoundL1 - 1
 168+ + $this->lcs_rec($forwardBound, $backBoundL1, $forwardBound, $backBoundL2,
 169+ $V, $snake);
138170 }
139 - if($m>$n){
140 - return wikidiff3_diffPart($b, $a, $b_inLcs, $a_inLcs, $n, $m, $offsety, $offsetx, $bestKnownLcs, $boundRunningTime, $max_NP_before_bound);
141 - }
142171
143 - $delta = $n-$m;
144 - $delta_plus_1 = $delta+1;
145 - $delta_min_1 = $delta-1;
 172+ private function lcs_rec($bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake) {
 173+ // check that both sequences are non-empty
 174+ if ($bottoml1 > $topl1 || $bottoml2 > $topl2) {
 175+ return 0;
 176+ }
146177
147 - $fpForw = $snakeBeginForw = $snakeEndForw = array_fill( 0, $delta_plus_1, -1);
148 - $lcsSizeForw = $snakekForw = $lcsSizeBack = $snakekBack = array_fill( 0, $delta_plus_1, 0);
149 - $fpBack = $snakeBeginBack = $snakeEndBack = array_fill( 0, $delta_plus_1, $n);
150 - $overlap = $delta>1 ? array_fill( 1, $delta_min_1, FALSE) : array();
151 - $bestLcsLength = $bestLcsLengthTop = $bestLcsLengthBottom = -1;
 178+ $d = $this->find_middle_snake($bottoml1, $topl1, $bottoml2, $topl2, $V, $snake);
152179
153 - if($boundRunningTime){
154 - $maxp_before_bound = max((-$delta+sqrt($delta*$delta+($max_NP_before_bound<<2)))>>1,2);
155 - if($maxp_before_bound>=$m){
156 - $boundRunningTime = false;
157 - unset($maxp_before_bound);
 180+ // need to store these so we don't lose them when they're overwritten by
 181+ // the recursion
 182+ $len = $snake[2];
 183+ $startx = $snake[0];
 184+ $starty = $snake[1];
 185+
 186+ // the middle snake is part of the LCS, store it
 187+ for ($i = 0; $i < $len; ++$i) {
 188+ $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
158189 }
 190+
 191+ if ($d > 1) {
 192+ return $len
 193+ + $this->lcs_rec($bottoml1, $startx - 1, $bottoml2, $starty - 1, $V, $snake)
 194+ + $this->lcs_rec($startx + $len, $topl1, $starty + $len, $topl2, $V, $snake);
 195+ } else if ($d == 1) {
 196+ /*
 197+ * In this case the sequences differ by exactly 1 line. We have
 198+ * already saved all the lines after the difference in the for loop
 199+ * above, now we need to save all the lines before the difference.
 200+ */
 201+ $max = min($startx - $bottoml1, $starty - $bottoml2);
 202+ for ($i = 0; $i < $max; ++$i) {
 203+ $this->removed[$bottoml1 + $i] = $this->added[$bottoml2 + $i] = false;
 204+ }
 205+ return $max + $len;
 206+ }
 207+ return $len;
159208 }
160209
161 - $p=-1;
162 - $m_min_1 = $m-1;
163 - $maxp=$m_min_1-$bestLcsLength;
 210+ private function find_middle_snake($bottoml1, $topl1, $bottoml2,$topl2, &$V, &$snake) {
 211+ $from = &$this->from;
 212+ $to = &$this->to;
 213+ $V0 = &$V[0];
 214+ $V1 = &$V[1];
 215+ $snake0 = &$snake[0];
 216+ $snake1 = &$snake[1];
 217+ $snake2 = &$snake[2];
 218+ $bottoml1_min_1 = $bottoml1-1;
 219+ $bottoml2_min_1 = $bottoml2-1;
 220+ $N = $topl1 - $bottoml1_min_1;
 221+ $M = $topl2 - $bottoml2_min_1;
 222+ $delta = $N - $M;
 223+ $maxabsx = $N+$bottoml1;
 224+ $maxabsy = $M+$bottoml2;
 225+ $limit = min($this->maxDifferences, ceil(($N + $M ) / 2)); // ceil((N+M)/2)
164226
165 - while($p<$maxp){
 227+ //value_to_add_forward: a 0 or 1 that we add to the start
 228+ // offset
 229+ // to make it odd/even
 230+ if (($M & 1) == 1) {
 231+ $value_to_add_forward = 1;
 232+ } else {
 233+ $value_to_add_forward = 0;
 234+ }
166235
167 - if($boundRunningTime && $p>$maxp_before_bound){
168 - // bound the running time by stopping early
169 - if($bestLcsLength>=0){
170 - // accept the best LCS thusfar
171 - break;
172 - }else{
173 - $bestLcsProgressForw = $bestkForw = 0;
174 - foreach($lcsSizeForw as $k => $localLcsProgress){
175 - if($localLcsProgress>$bestLcsProgressForw || ($localLcsProgress==$bestLcsProgressForw && $bestkForw > $k)){
176 - $bestLcsProgressForw = $localLcsProgress;
177 - $bestkForw = $k;
178 - }
179 - }
180 - $bestLcsProgressBack = $bestkBack = 0;
181 - foreach($lcsSizeBack as $k => $localLcsProgress){
182 - if($localLcsProgress>$bestLcsProgressBack || ($localLcsProgress==$bestLcsProgressBack && $bestkBack < $k)){
183 - $bestLcsProgressBack = $localLcsProgress;
184 - $bestkBack = $k;
185 - }
186 - }
187 - if($fpForw[$bestkForw]-$bestkForw>$fpBack[$bestkBack]-$bestkBack){
188 - // This is hard, maybe try some more? Can this even happen? A solution will be found soon anyway.
189 - }else{
190 - // lets do this
191 - $topSnakeStart = $snakeBeginForw[$bestkForw];
192 - $topSnakeEnd = $snakeEndForw[$bestkForw];
193 - $topSnakek = $snakekForw[$bestkForw];
194 - $bestLcsLengthTop = $lcsSizeForw[$bestkBack] + $topSnakeStart - $topSnakeEnd;
 236+ if (($N & 1) == 1) {
 237+ $value_to_add_backward = 1;
 238+ } else {
 239+ $value_to_add_backward = 0;
 240+ }
195241
196 - $bottomSnakeStart = $snakeEndBack[$bestkBack]+1;
197 - $bottomSnakeEnd = $snakeBeginBack[$bestkBack]+1;
198 - $bottomSnakek = $snakekBack[$bestkBack];
199 - $bestLcsLengthBottom = $lcsSizeBack[$bestkBack] + $bottomSnakeStart - $bottomSnakeEnd;
 242+ $start_forward = -$M;
 243+ $end_forward = $N;
 244+ $start_backward = -$N;
 245+ $end_backward = $M;
200246
201 - if($bottomSnakeStart-$topSnakeEnd>$n*0.6 && ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek)>$m*0.6){
202 - // cut the sequence from both sides (there isn't enough progress, 60% of the sequence is not covered)
203 - // also diff the middle now
204 - if($bottomSnakeEnd>($fpForw[$bestkForw]>>1)){
205 - // do the middle diff between the snakes
206 - wfDebug(" Limiting diff run time from both sides, middle between snakes\n");
207 - $m_t = ($bottomSnakeStart-$bottomSnakek)-($topSnakeEnd-$topSnakek);
208 - $n_t = $bottomSnakeStart-$topSnakeEnd;
209 - $a_t = array_slice($a, $topSnakeEnd-$topSnakek, $m_t);
210 - $b_t = array_slice($b, $topSnakeEnd, $n_t);
211 - $offsetx_t = $offsetx+($topSnakeEnd-$topSnakek);
212 - $offsety_t = $offsety+$topSnakeEnd;
213 - }else{
214 - // the snake is too early in the sequence, do the middle diff between endpoints of progress
215 - wfDebug(" Limiting diff run time from both sides, middle between endpoints\n");
216 - $m_t = ($fpBack[$bestkBack]+1-$bestkBack)-($fpForw[$bestkForw]-$bestkForw);
217 - $n_t = $fpBack[$bestkBack]+1-$fpForw[$bestkForw];
218 - $a_t = array_slice($a, $fpForw[$bestkForw]-$bestkForw, $m_t);
219 - $b_t = array_slice($b, $fpForw[$bestkForw], $n_t);
220 - $offsetx_t = $offsetx+($fpForw[$bestkForw]-$bestkForw);
221 - $offsety_t = $offsety+$fpForw[$bestkForw];
222 - }
223 - wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $n_t, $offsetx_t, $offsety_t, $m_t, $boundRunningTime, $max_NP_before_bound);
224 - }else if(min($n-$bottomSnakeStart, $m-($bottomSnakeStart-$bottomSnakek))>min($topSnakeEnd, $topSnakeEnd-$topSnakek)){
225 - // the bottom snake is more interesting
226 - wfDebug("Limiting diff run time from bottom side\n");
227 - $topSnakeStart = $bottomSnakeStart;
228 - $topSnakeEnd = $bottomSnakeEnd;
229 - $topSnakek = $bottomSnakek;
230 - $bestLcsLengthTop = $topSnakeEnd-$topSnakek;
231 - $bottomSnakeStart = $bottomSnakeEnd;
232 - }else{
233 - // the top snake is more interesting
234 - wfDebug(" Limiting diff run time from top side\n");
235 - $bottomSnakeStart = $topSnakeEnd;
236 - $bottomSnakeEnd = $topSnakeEnd;
237 - $bottomSnakek = $topSnakek;
238 - $bestLcsLengthBottom = $m-($bottomSnakeEnd-$bottomSnakek);
239 - }
240 - break;
241 - }
242 - }
243 - }
244 - ++$p;
245 - $overlap[-$p] = $overlap[$delta+$p] = FALSE;
 247+ $limit_min_1 = $limit-1;
 248+ $limit_plus_1 = $limit+1;
246249
247 - $min_p_min_1 = -$p-1;
248 - $delta_plus_1_plus_p = $delta_plus_1+$p;
 250+ $V0[$limit_plus_1] = 0;
 251+ $V1[$limit_min_1] = $N;
 252+ $limit = min($this->maxDifferences, ceil(($N + $M ) / 2)); // ceil((N+M)/2)
 253+
 254+ if (($delta & 1) == 1) {
 255+ for ($d = 0; $d <= $limit; ++$d) {
 256+ $start_diag = max($value_to_add_forward + $start_forward, -$d);
 257+ $end_diag = min($end_forward, $d);
 258+ $value_to_add_forward = 1 - $value_to_add_forward;
249259
250 - // forward
251 - $fpForw[$min_p_min_1] = $snakeEndForw[$min_p_min_1] = $snakekForw[$min_p_min_1] = $snakeBeginForw[$min_p_min_1] = $fpForw[$delta_plus_1_plus_p] =
252 - $snakeBeginForw[$delta_plus_1_plus_p] = $snakeEndForw[$delta_plus_1_plus_p] = $snakekForw[$delta_plus_1_plus_p] = -1;
253 - $lcsSizeForw[$min_p_min_1] = $lcsSizeForw[$delta_plus_1_plus_p] = 0;
 260+ // compute forward furthest reaching paths
 261+ for ($k = $start_diag; $k <= $end_diag; $k += 2) {
 262+ if ($k == -$d
 263+ || ($k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k])) {
 264+ $x = $V0[$limit_plus_1 + $k];
 265+ } else {
 266+ $x = $V0[$limit_min_1 + $k] + 1;
 267+ }
254268
255 - $k=-$p;
256 - do {
257 - $k_plus_1 = $k+1;
258 - $k_min_1 = $k-1;
 269+ $absx = $snake0 = $x + $bottoml1;
 270+ $absy = $snake1 = $x - $k + $bottoml2;
 271+
 272+ while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
 273+ ++$absx;
 274+ ++$absy;
 275+ }
 276+ $x = $absx-$bottoml1;
 277+
 278+ $snake2 = $absx -$snake0;
 279+ $V0[$limit + $k] = $x;
 280+ if ($k >= $delta - $d + 1 && $k <= $delta + $d - 1
 281+ && $x >= $V1[$limit + $k - $delta]) {
 282+ return 2 * $d - 1;
 283+ }
259284
260 - $fpBelow = $fpForw[$k_min_1]+1;
261 - $fpAbove = $fpForw[$k_plus_1];
262 - $y = &$fpForw[$k];
263 - if($fpBelow>$fpAbove){
264 - $oldy = $y = $fpBelow;
265 - $lcsSizeForw[$k] = $lcsSizeForw[$k_min_1];
266 - $snakeBeginForw[$k] = $snakeBeginForw[$k_min_1];
267 - $snakeEndForw[$k] = $snakeEndForw[$k_min_1];
268 - $snakekForw[$k] = $snakekForw[$k_min_1];
269 - }else{
270 - $oldy = $y = $fpAbove;
271 - $lcsSizeForw[$k] = $lcsSizeForw[$k_plus_1];
272 - $snakeBeginForw[$k] = $snakeBeginForw[$k_plus_1];
273 - $snakeEndForw[$k] = $snakeEndForw[$k_plus_1];
274 - $snakekForw[$k] = $snakekForw[$k_plus_1];
275 - }
276 - $x = $y-$k;
277 - while($x < $m && $y < $n && $a[$x]===$b[$y]){
278 - ++$x;
279 - ++$y;
280 - }
281 - if($y-$oldy>0){
282 - $lcsSizeForw[$k] += $y-$oldy;
283 - $snakeBeginForw[$k] = $oldy;
284 - $snakeEndForw[$k]= $y;
285 - $snakekForw[$k] = $k;
286 - }
287 - if($y>=$fpBack[$k] && !$overlap[$k]){
288 - // there is overlap
289 - $overlap[$k]=TRUE;
290 - $lcsLength = $lcsSizeForw[$k]+$lcsSizeBack[$k];
291 - if($y>$fpBack[$k]+1){
292 - $snakeoverlap = $y-$fpBack[$k]-1;
293 - $lcsLength -= $snakeoverlap;
 285+ // check to see if we can cut down the diagonal range
 286+ if ($x >= $N && $end_forward > $k - 1) {
 287+ $end_forward = $k - 1;
 288+ } else if ($absy-$bottoml2 >= $M) {
 289+ $start_forward = $k + 1;
 290+ $value_to_add_forward = 0;
 291+ }
294292 }
295 - if($lcsLength>$bestLcsLength){
296 - // a better sequence found!
297 - $bestLcsLength = $lcsLength;
298293
299 - $topSnakeStart = $snakeBeginForw[$k];
300 - $topSnakeEnd = $snakeEndForw[$k];
301 - $topSnakek = $snakekForw[$k];
 294+ $start_diag = max($value_to_add_backward + $start_backward, -$d);
 295+ $end_diag = min($end_backward, $d);
 296+ $value_to_add_backward = 1 - $value_to_add_backward;
302297
303 - // aligned snakes bite (\ )
304 - // ( \ \)
305 - $bottomSnakeStart = max($snakeEndBack[$k]+1, $topSnakeEnd+max(0,$snakekBack[$k]-$snakekForw[$k]));
306 - $bottomSnakeEnd = max($snakeBeginBack[$k]+1, $bottomSnakeStart);
307 - $bottomSnakek = $snakekBack[$k];
 298+ // compute backward furthest reaching paths
 299+ for ($k = $start_diag; $k <= $end_diag; $k += 2) {
 300+ if ($k == $d
 301+ || ($k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k])) {
 302+ $x = $V1[$limit_min_1 + $k];
 303+ } else {
 304+ $x = $V1[$limit_plus_1 + $k] - 1;
 305+ }
308306
309 - if($bottomSnakeEnd<$y){
310 - $bottomSnakeStart = $y;
311 - $bottomSnakeEnd = $y;
312 - $bottomSnakek = $k;
 307+ $y = $x - $k - $delta;
 308+
 309+ $snake2 = 0;
 310+ while ($x > 0 && $y > 0
 311+ && $from[$x +$bottoml1_min_1] === $to[$y + $bottoml2_min_1]) {
 312+ --$x;
 313+ --$y;
 314+ ++$snake2;
313315 }
 316+ $V1[$limit + $k] = $x;
314317
315 - $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
316 - $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
317 - if($bestKnownLcs==$lcsLength){
318 - $fpForw[$k]=$y;
319 - break 2;
 318+ // check to see if we can cut down our diagonal range
 319+ if ($x <= 0) {
 320+ $start_backward = $k + 1;
 321+ $value_to_add_backward = 0;
 322+ } else if ($y <= 0 && $end_backward > $k - 1) {
 323+ $end_backward = $k - 1;
320324 }
321 - $maxp=$m_min_1-$bestLcsLength;
322325 }
323326 }
324 - if($k<$delta_min_1){
325 - ++$k;
326 - }else if($k>$delta){
327 - --$k;
328 - }else if($k==$delta_min_1){
329 - $k = $delta+$p;
330 - }else{
331 - break;
332 - }
333 - } while(TRUE);
 327+ } else {
 328+ for ($d = 0; $d <= $limit; ++$d) {
 329+ $start_diag = max($value_to_add_forward + $start_forward, -$d);
 330+ $end_diag = min($end_forward, $d);
 331+ $value_to_add_forward = 1 - $value_to_add_forward;
334332
335 - // backward
336 - $fpBack[$min_p_min_1] = $snakeBeginBack[$min_p_min_1] = $snakeEndBack[$min_p_min_1] = $snakekBack[$min_p_min_1] =
337 - $fpBack[$delta_plus_1_plus_p] = $snakeBeginBack[$delta_plus_1_plus_p] = $snakeEndBack[$delta_plus_1_plus_p] = $snakekBack[$delta_plus_1_plus_p] = $n;
338 - $lcsSizeBack[$min_p_min_1] = $lcsSizeBack[$delta_plus_1_plus_p] = 0;
 333+ // compute forward furthest reaching paths
 334+ for ($k = $start_diag; $k <= $end_diag; $k += 2) {
 335+ if ($k == -$d
 336+ || ($k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k])) {
 337+ $x = $V0[$limit_plus_1 + $k];
 338+ } else {
 339+ $x = $V0[$limit_min_1 + $k] + 1;
 340+ }
339341
340 - $k=$delta+$p;
341 - do {
342 - $k_plus_1 = $k+1;
343 - $k_min_1 = $k-1;
 342+ $absx = $snake0 = $x + $bottoml1;
 343+ $absy = $snake1 = $x - $k + $bottoml2;
 344+
 345+ while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
 346+ ++$absx;
 347+ ++$absy;
 348+ }
 349+ $x = $absx-$bottoml1;
 350+ $snake2 = $absx -$snake0;
 351+ $V0[$limit + $k] = $x;
344352
345 - $fpBelow = $fpBack[$k_min_1];
346 - $fpAbove = $fpBack[$k_plus_1]-1;
347 - $y = &$fpBack[$k];
348 - if($fpBelow<$fpAbove){
349 - $oldy = $y = $fpBelow;
350 - $lcsSizeBack[$k] = $lcsSizeBack[$k_min_1];
351 - $snakeBeginBack[$k] = $snakeBeginBack[$k_min_1];
352 - $snakeEndBack[$k] = $snakeEndBack[$k_min_1];
353 - $snakekBack[$k] = $snakekBack[$k_min_1];
354 - }else{
355 - $oldy = $y = $fpAbove;
356 - $lcsSizeBack[$k] = $lcsSizeBack[$k_plus_1];
357 - $snakeBeginBack[$k] = $snakeBeginBack[$k_plus_1];
358 - $snakeEndBack[$k] = $snakeEndBack[$k_plus_1];
359 - $snakekBack[$k] = $snakekBack[$k_plus_1];
360 - }
361 - $x = $y-$k;
362 - while($x > -1 && $y > -1 && $a[$x]===$b[$y]){
363 - --$x;
364 - --$y;
365 - }
366 - if($oldy-$y>0){
367 - $lcsSizeBack[$k] += $oldy-$y;
368 - $snakeBeginBack[$k] = $oldy;
369 - $snakeEndBack[$k] = $y;
370 - $snakekBack[$k] = $k;
371 - }
372 - if($fpForw[$k]>=$y && !$overlap[$k]){
373 - // there is overlap
374 - $overlap[$k]=TRUE;
375 - $lcsLength = $lcsSizeForw[$k]+$lcsSizeBack[$k];
376 - if($fpForw[$k]>$y+1){
377 - $snakeoverlap = $fpForw[$k]-$y-1;
378 - $lcsLength -= $snakeoverlap;
 353+ // check to see if we can cut down the diagonal range
 354+ if ($x >= $N && $end_forward > $k - 1) {
 355+ $end_forward = $k - 1;
 356+ } else if ($absy-$bottoml2 >= $M) {
 357+ $start_forward = $k + 1;
 358+ $value_to_add_forward = 0;
 359+ }
379360 }
380 - if($lcsLength>$bestLcsLength){
381 - // a better sequence found!
382 - $bestLcsLength = $lcsLength;
383361
384 - $topSnakeStart = $snakeBeginForw[$k];
385 - $topSnakeEnd = $snakeEndForw[$k];
386 - $topSnakek = $snakekForw[$k];
 362+ $start_diag = max($value_to_add_backward + $start_backward, -$d);
 363+ $end_diag = min($end_backward, $d);
 364+ $value_to_add_backward = 1 - $value_to_add_backward;
387365
388 - // aligned snakes bite (\ )
389 - // ( \ \)
390 - $bottomSnakeStart = max($snakeEndBack[$k]+1, $topSnakeEnd+max(0,$snakekBack[$k]-$snakekForw[$k]));
391 - $bottomSnakeEnd = max($snakeBeginBack[$k]+1, $bottomSnakeStart);
392 - $bottomSnakek = $snakekBack[$k];
 366+ // compute backward furthest reaching paths
 367+ for ($k = $start_diag; $k <= $end_diag; $k += 2) {
 368+ if ($k == $d
 369+ || ($k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k])) {
 370+ $x = $V1[$limit_min_1 + $k];
 371+ } else {
 372+ $x = $V1[$limit_plus_1 + $k] - 1;
 373+ }
393374
394 - if($bottomSnakeEnd<$fpForw[$k]){
395 - $bottomSnakeStart = $fpForw[$k];
396 - $bottomSnakeEnd = $fpForw[$k];
397 - $bottomSnakek = $k;
 375+ $y = $x - $k - $delta;
 376+
 377+ $snake2 = 0;
 378+ while ($x > 0 && $y > 0
 379+ && $from[$x +$bottoml1_min_1] === $to[$y + $bottoml2_min_1]) {
 380+ --$x;
 381+ --$y;
 382+ ++$snake2;
398383 }
 384+ $V1[$limit + $k] = $x;
399385
400 - $bestLcsLengthTop = $lcsSizeForw[$k]-($snakeEndForw[$k]-$snakeBeginForw[$k]);
401 - $bestLcsLengthBottom = $lcsSizeBack[$k]-($snakeBeginBack[$k]-$snakeEndBack[$k]);
402 - if($bestKnownLcs==$lcsLength){
403 - $fpBack[$k] = $y;
404 - break 2;
 386+ if ($k >= -$delta - $d && $k <= $d - $delta
 387+ && $x <= $V0[$limit + $k + $delta]) {
 388+ $snake0 = $bottoml1 + $x;
 389+ $snake1 = $bottoml2 + $y;
 390+ return 2 * $d;
405391 }
406 - $maxp=$m_min_1-$bestLcsLength;
 392+
 393+ // check to see if we can cut down our diagonal range
 394+ if ($x <= 0) {
 395+ $start_backward = $k + 1;
 396+ $value_to_add_backward = 0;
 397+ } else if ($y <= 0 && $end_backward > $k - 1) {
 398+ $end_backward = $k - 1;
 399+ }
407400 }
408401 }
409 - if($k>1){
410 - --$k;
411 - }else if($k<0){
412 - ++$k;
413 - }else if($k==1){
414 - $k = -$p;
415 - }else{
416 - break;
417 - }
418 - } while(TRUE);
419 - }
 402+ }
 403+ /*
 404+ * computing the true LCS is too expensive, instead find the diagonal
 405+ * with the most progress and pretend a midle snake of length 0 occurs
 406+ * there.
 407+ */
420408
421 - unset($fpForw, $fpBack, $lcsSizeForw, $lcsSizeBack);
422 - unset($snakeBeginForw, $snakeBeginBack, $snakeEndForw, $snakeEndBack, $snakekForw, $snakekBack);
423 - unset($overlap);
 409+ $most_progress = self::findMostProgress($M, $N, $limit, $V);
424410
425 - // Mark snakes as in LCS
426 - $maxi = $offsetx+$topSnakeEnd-$topSnakek;
427 - for($i=$offsetx+$topSnakeStart-$topSnakek;$i<$maxi;++$i){
428 - $a_inLcs_sym[$i] = TRUE;
 411+ $snake0 = $bottoml1 + $most_progress[0];
 412+ $snake1 = $bottoml2 + $most_progress[1];
 413+ $snake2 = 0;
 414+ wfDebug('Computing the LCS is too expensive. Using a heuristic.');
 415+ $this->heuristicUsed = true;
 416+ return 5; /*
 417+ * HACK: since we didn't really finish the LCS computation
 418+ * we don't really know the length of the SES. We don't do
 419+ * anything with the result anyway, unless it's <=1. We know
 420+ * for a fact SES > 1 so 5 is as good a number as any to
 421+ * return here
 422+ */
429423 }
430 - $maxi = $offsety+$topSnakeEnd;
431 - for($i=$offsety+$topSnakeStart;$i<$maxi;++$i){
432 - $b_inLcs_sym[$i] = TRUE;
433 - }
434 - $maxi = $offsetx+$bottomSnakeEnd-$bottomSnakek;
435 - for($i=$offsetx+$bottomSnakeStart-$bottomSnakek;$i<$maxi;++$i){
436 - $a_inLcs_sym[$i] = TRUE;
437 - }
438 - $maxi = $offsety+$bottomSnakeEnd;
439 - for($i=$offsety+$bottomSnakeStart;$i<$maxi;++$i){
440 - $b_inLcs_sym[$i] = TRUE;
441 - }
442424
443 - $m_t = $topSnakeStart-$topSnakek;
444 - $a_t = array_slice($a, 0, $m_t);
445 - $b_t = array_slice($b, 0, $topSnakeStart);
 425+ private static function findMostProgress($M, $N, $limit, $V) {
 426+ $delta = $N - $M;
446427
447 - $m_b = $m-($bottomSnakeEnd-$bottomSnakek);
448 - $n_b = $n-$bottomSnakeEnd;
449 - $a_b = array_slice($a, $bottomSnakeEnd-$bottomSnakek, $m_b);
450 - $b_b = array_slice($b, $bottomSnakeEnd, $n_b);
 428+ if (($M & 1) == ($limit & 1)) {
 429+ $forward_start_diag = max(-$M, -$limit);
 430+ } else {
 431+ $forward_start_diag = max(1 - $M, -$limit);
 432+ }
451433
452 - wikidiff3_diffPart($a_t, $b_t, $a_inLcs, $b_inLcs, $m_t, $topSnakeStart, $offsetx, $offsety, $bestLcsLengthTop, $boundRunningTime, $max_NP_before_bound);
 434+ $forward_end_diag = min($N, $limit);
453435
454 - wikidiff3_diffPart($a_b, $b_b, $a_inLcs, $b_inLcs, $m_b, $n_b, $offsetx+($bottomSnakeEnd-$bottomSnakek), $offsety+$bottomSnakeEnd, $bestLcsLengthBottom, $boundRunningTime, $max_NP_before_bound);
455 -}
 436+ if (($N & 1) == ($limit & 1)) {
 437+ $backward_start_diag = max(-$N, -$limit);
 438+ } else {
 439+ $backward_start_diag = max(1 - $N, -$limit);
 440+ }
456441
457 -class InLcs {
 442+ $backward_end_diag = -min($M, $limit);
458443
459 - public $inLcs;
 444+ $temp = array(0,0,0);
460445
461 - function __construct($length){
462 - $this->inLcs = $length>0 ? array_fill(0,$length,FALSE): array();
463 - }
464446
465 - /**
466 - * Get the length of the Longest Common Subsequence (computed)
467 - */
468 - public function getLcsLength(){
469 - return array_sum($this->inLcs);
 447+ $max_progress = array_fill(0,ceil(max($forward_end_diag
 448+ - $forward_start_diag, $backward_end_diag - $backward_start_diag) / 2), $temp);
 449+ $num_progress = 0; // the 1st entry is current, it is initialized
 450+ // with 0s
 451+
 452+ // first search the forward diagonals
 453+ for ($k = $forward_start_diag; $k <= $forward_end_diag; $k += 2) {
 454+ $x = $V[0][$limit + $k];
 455+ $y = $x - $k;
 456+ if ($x > $N || $y > $M) {
 457+ continue;
 458+ }
 459+
 460+ $progress = $x + $y;
 461+ if ($progress > $max_progress[0][2]) {
 462+ $num_progress = 0;
 463+ $max_progress[0][0] = $x;
 464+ $max_progress[0][1] = $y;
 465+ $max_progress[0][2] = $progress;
 466+ } else if ($progress == $max_progress[0][2]) {
 467+ ++$num_progress;
 468+ $max_progress[$num_progress][0] = $x;
 469+ $max_progress[$num_progress][1] = $y;
 470+ $max_progress[$num_progress][2] = $progress;
 471+ }
 472+ }
 473+
 474+ $max_progress_forward = true; // initially the maximum
 475+ // progress is in the forward
 476+ // direction
 477+
 478+ // now search the backward diagonals
 479+ for ($k = $backward_start_diag; $k <= $backward_end_diag; $k += 2) {
 480+ $x = $V[1][$limit + $k];
 481+ $y = $x - $k - $delta;
 482+ if ($x < 0 || $y < 0) {
 483+ continue;
 484+ }
 485+
 486+ $progress = $N - $x + $M - $y;
 487+ if ($progress > $max_progress[0][2]) {
 488+ $num_progress = 0;
 489+ $max_progress_forward = false;
 490+ $max_progress[0][0] = $x;
 491+ $max_progress[0][1] = $y;
 492+ $max_progress[0][2] = $progress;
 493+ } else if ($progress == $max_progress[0][2] && !$max_progress_forward) {
 494+ ++$num_progress;
 495+ $max_progress[$num_progress][0] = $x;
 496+ $max_progress[$num_progress][1] = $y;
 497+ $max_progress[$num_progress][2] = $progress;
 498+ }
 499+ }
 500+
 501+ // return the middle diagonal with maximal progress.
 502+ return $max_progress[floor($num_progress / 2)];
470503 }
471504
472505 }
Index: branches/visual_diff/phase3/includes/DifferenceEngine.php
@@ -74,9 +74,10 @@
7575 }
7676
7777 function showDiffPage( $diffOnly = false ) {
78 - global $wgUser, $wgOut, $wgUseExternalEditor, $wgUseRCPatrol;
 78+ global $wgUser, $wgOut, $wgUseExternalEditor, $wgUseRCPatrol, $wgEnableHtmlDiff;
7979 wfProfileIn( __METHOD__ );
8080
 81+
8182 # If external diffs are enabled both globally and for the user,
8283 # we'll use the application/x-external-editor interface to call
8384 # an external diff tool like kompare, kdiff3, etc.
@@ -265,17 +266,16 @@
266267 '<div id="mw-diff-ntitle3">' . $newminor . $sk->revComment( $this->mNewRev, !$diffOnly, true ) . $rdel . "</div>" .
267268 '<div id="mw-diff-ntitle4">' . $nextlink . $patrol . '</div>';
268269
269 - $this->showDiff( $oldHeader, $newHeader );
 270+ if($wgEnableHtmlDiff){
 271+ $this->renderHtmlDiff();
 272+ }else{
270273
271 - if ( !$diffOnly ){
272 - global $wgEnableHtmlDiff;
273 - if($wgEnableHtmlDiff){
274 - $this->renderHtmlDiff();
275 - }else{
 274+ $this->showDiff( $oldHeader, $newHeader );
 275+
 276+ if ( !$diffOnly ){
276277 $this->renderNewRevision();
277278 }
278279 }
279 -
280280 wfProfileOut( __METHOD__ );
281281 }
282282
@@ -329,10 +329,11 @@
330330 function renderHtmlDiff() {
331331 global $wgOut, $IP;
332332 wfProfileIn( __METHOD__ );
333 -
 333+
 334+ $this->showDiffStyle();
 335+
334336 require_once( "$IP/includes/HTMLDiff.php" );
335337
336 - $wgOut->addHTML( "<hr /><h2>HTML Diff</h2>\n" );
337338 #add deleted rev tag if needed
338339 if( !$this->mNewRev->userCan(Revision::DELETED_TEXT) ) {
339340 $wgOut->addWikiMsg( 'rev-deleted-text-permission' );
@@ -1064,10 +1065,6 @@
10651066 // Diff and store locally
10661067 $this->diff_local($from_lines, $to_lines);
10671068
1068 - // Merge edits when possible
1069 - $this->_shift_boundaries($from_lines, $this->xchanged, $this->ychanged);
1070 - $this->_shift_boundaries($to_lines, $this->ychanged, $this->xchanged);
1071 -
10721069 // Compute the ranges operations.
10731070 $n_from = sizeof($from_lines);
10741071 $n_to = sizeof($to_lines);
@@ -1102,18 +1099,18 @@
11031100
11041101 function diff_local ($from_lines, $to_lines) {
11051102 global $wgExternalDiffEngine;
 1103+ wfProfileIn( __METHOD__);
11061104
1107 - $n_from = sizeof($from_lines);
1108 - $n_to = sizeof($to_lines);
1109 -
11101105 if($wgExternalDiffEngine == 'wikidiff3'){
11111106 // wikidiff3
11121107 global $IP;
11131108 require_once( "$IP/includes/Diff.php" );
1114 - list($this->xchanged, $this->ychanged) = wikidiff3_diff($from_lines, $to_lines, TRUE, 100000);
 1109+ $wikidiff3 = new WikiDiff3();
 1110+ list($this->xchanged, $this->ychanged) = $wikidiff3->diff($from_lines, $to_lines);
11151111 }else{
11161112 // old diff
1117 - wfProfileIn( __METHOD__ ." - basicdiff");
 1113+ $n_from = sizeof($from_lines);
 1114+ $n_to = sizeof($to_lines);
11181115 $this->xchanged = $this->ychanged = array();
11191116 $this->xv = $this->yv = array();
11201117 $this->xind = $this->yind = array();
@@ -1158,8 +1155,8 @@
11591156
11601157 // Find the LCS.
11611158 $this->_compareseq(0, sizeof($this->xv), 0, sizeof($this->yv));
1162 - wfProfileOut( __METHOD__ ." - basicdiff");
11631159 }
 1160+ wfProfileOut( __METHOD__ );
11641161 }
11651162
11661163 /**
@@ -1215,7 +1212,6 @@
12161213 $numer = $xlim - $xoff + $nchunks - 1;
12171214 $x = $xoff;
12181215 for ($chunk = 0; $chunk < $nchunks; $chunk++) {
1219 - wfProfileIn( __METHOD__ . "-chunk" );
12201216 if ($chunk > 0)
12211217 for ($i = 0; $i <= $this->lcs; $i++)
12221218 $ymids[$i][$chunk-1] = $this->seq[$i];
@@ -1268,7 +1264,6 @@
12691265 if ($end == 0 || $ypos > $this->seq[$end]) {
12701266 $this->seq[++$this->lcs] = $ypos;
12711267 $this->in_seq[$ypos] = 1;
1272 - wfProfileOut( __METHOD__ );
12731268 return $this->lcs;
12741269 }
12751270

Status & tagging log