r37074 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r37073‎ | r37074 | r37075 >
Date:22:26, 4 July 2008
Author:guyvdb
Status:old
Tags:
Comment:
O(NP) diffing, first attempt
Modified paths:
  • /branches/visual_diff/phase3/includes/Diff.php (modified) (history)

Diff [purge]

Index: branches/visual_diff/phase3/includes/Diff.php
@@ -13,6 +13,8 @@
1414 protected $b;
1515 protected $n;
1616
 17+ protected $swapped;
 18+
1719 /**
1820 * The sequences need to be ordered by length.
1921 */
@@ -22,27 +24,50 @@
2325 $this->m = sizeof($to);
2426 $this->b = $from;
2527 $this->n = sizeof($from);
 28+ $this->swapped = TRUE;
2629 }else{
2730 $this->a = $from;
2831 $this->m = sizeof($from);
2932 $this->b = $to;
3033 $this->n = sizeof($to);
 34+ $this->swapped = FALSE;
3135 }
3236 }
3337
3438 /**
3539 * Slide down the snake untill reaching an inequality.
3640 */
37 - protected function snake($k,$y){
38 - $x = $y-$k;
39 - while($x < $this->m && $y < $this->n && $this->equals($this->a[$x],$this->b[$y])){
40 - ++$x;
41 - ++$y;
 41+ protected function forward_snake( $k, $y, $maxx, $maxy, $offsetx=0, $offsety=0, $swapped=FALSE){
 42+ if($swapped){
 43+ return $this->forward_snake( $k, $y-$k, $maxy, $maxx, $offsety, $offsetx, $swapped );
 44+ }else{
 45+ $x = $y-$k;
 46+ while($x < $maxx && $y < $maxy && $this->equals($this->a[$offsetx+$x],$this->b[$offsety+$y])){
 47+ ++$x;
 48+ ++$y;
 49+ }
 50+
 51+ return $y;
4252 }
43 - return $y;
4453 }
4554
4655 /**
 56+ * Slide up the snake until reaching an inequality.
 57+ */
 58+ protected function backward_snake( $k, $y, $maxx, $maxy, $offsetx=0, $offsety=0, $swapped=FALSE){
 59+ if($swapped){
 60+ return $this->backward_snake( $k, $y-$k, $maxy, $maxx, $offsety, $offsetx, $swapped );
 61+ }else{
 62+ $x = $y-$k;
 63+ while($x > $maxx && $y > $maxy && $this->equals($this->a[$offsetx+$x],$this->b[$offsety+$y])){
 64+ --$x;
 65+ --$y;
 66+ }
 67+ return $y;
 68+ }
 69+ }
 70+
 71+ /**
4772 * Override this method to compute the LCS with different equality measures.
4873 */
4974 public function equals($left, $right){
@@ -51,12 +76,14 @@
5277 }
5378
5479 /**
55 - * Class implementing the Diff algorithm.
 80+ * Class implementing the LCS algorithm by backward propagation.
5681 *
5782 * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 83+ *
 84+ * @author Guy Van den Broeck
5885 */
59 -class LcsAlgorithm extends AbstractDiffAlgorithm {
60 -
 86+class BackwardLcs extends AbstractDiffAlgorithm {
 87+
6188 /*
6289 * Computers the length of the longest common subsequence of
6390 * the arrays $from and $to.
@@ -64,7 +91,7 @@
6592 public function lcs (array $from, array $to) {
6693 return (sizeof($from)+sizeof($to)-$this->ses( $from, $to))/2;
6794 }
68 -
 95+
6996 /*
7097 * Computers the length of the shortest edit script of
7198 * the arrays $from and $to.
@@ -72,34 +99,195 @@
73100 public function ses (array $from, array $to) {
74101 $this->order($from, $to);
75102
 103+ if($this->n==0){
 104+ return $this->m;
 105+ }else if($this->m==0){
 106+ return $this->n;
 107+ }
 108+
76109 $delta = $this->n-$this->m;
 110+ $fp = array_fill_keys( range( -($this->m+1), $this->n+1), $this->n);
 111+ $p = -1;
 112+
 113+ do{
 114+ ++$p;
 115+ for($k=$delta+$p;$k>0;$k--){
 116+ $fp[$k] = $this->backward_snake( $k, min( $fp[$k+1]-1, $fp[$k-1]), -1, -1);
 117+ }
 118+ for($k=-$p;$k<0;$k++){
 119+ $fp[$k] = $this->backward_snake( $k, min( $fp[$k+1]-1, $fp[$k-1]), -1, -1);
 120+ }
 121+ $fp[0] = $this->backward_snake( 0, min( $fp[1]-1, $fp[-1]), -1, -1);
 122+ }while($fp[0]!=-1);
 123+
 124+ echo "<br>$delta, $p";
 125+ return $delta+2*$p;
 126+ }
 127+
 128+}
 129+
 130+/**
 131+ * Class implementing the LCS algorithm by forward propagation.
 132+ *
 133+ * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 134+ *
 135+ * @author Guy Van den Broeck
 136+ */
 137+class ForwardLcs extends AbstractDiffAlgorithm {
 138+
 139+ /*
 140+ * Computers the length of the longest common subsequence of
 141+ * the arrays $from and $to.
 142+ */
 143+ public function lcs (array $from, array $to) {
 144+ return (sizeof($from)+sizeof($to)-$this->ses( $from, $to))/2;
 145+ }
 146+
 147+ /*
 148+ * Computers the length of the shortest edit script of
 149+ * the arrays $from and $to.
 150+ */
 151+ public function ses (array $from, array $to) {
 152+ $this->order($from, $to);
 153+
 154+ if($this->n==0){
 155+ return $this->m;
 156+ }else if($this->m==0){
 157+ return $this->n;
 158+ }
 159+
 160+ $delta = $this->n-$this->m;
77161 $fp = array_fill_keys( range( -($this->m+1), $this->n+1), -1);
78162 $p = -1;
79163
80164 do{
81165 ++$p;
82166 for($k=-$p;$k<$delta;$k++){
83 - $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1]));
 167+ $fp[$k] = $this->forward_snake( $k, max( $fp[$k-1]+1, $fp[$k+1]), $this->m, $this->n);
84168 }
85 -
86169 for($k=$delta+$p;$k>$delta;$k--){
87 - $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1]));
 170+ $fp[$k] = $this->forward_snake( $k, max( $fp[$k-1]+1, $fp[$k+1]), $this->m, $this->n);
88171 }
89 -
90 - $fp[$k] = $this->snake( $delta, max( $fp[$delta-1]+1, $fp[$delta+1]));
 172+ $fp[$delta] = $this->forward_snake( $delta, max( $fp[$delta-1]+1, $fp[$delta+1]), $this->m, $this->n);
91173 }while($fp[$delta]!=$this->n);
92174
 175+ echo "<br>$delta, $p";
93176 return $delta+2*$p;
94177 }
95 -
 178+
96179 }
97180
98181 /**
 182+ * Class implementing the LCS algorithm by bidirectional propagation.
 183+ *
 184+ * !!!!!!!!!!!!!!!!!!!!!!
 185+ * BEWARE HERE BE DRAGONS
 186+ * This algorithm is not guaranteed to find the exact size of the LCS.
 187+ * There are border cases where it will be 1 (or more) off.
 188+ * This implmentation is useful because it is very fast.
 189+ * !!!!!!!!!!!!!!!!!!!!!!
 190+ *
 191+ * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 192+ *
 193+ * @author Guy Van den Broeck
 194+ */
 195+class BidirectionalLcs extends AbstractDiffAlgorithm {
 196+
 197+ /*
 198+ * Computers the length of the longest common subsequence of
 199+ * the arrays $from and $to.
 200+ */
 201+ public function lcs (array $from, array $to) {
 202+ return (sizeof($from)+sizeof($to)-$this->ses( $from, $to))/2;
 203+ }
 204+
 205+ /*
 206+ * Computers the length of the shortest edit script of
 207+ * the arrays $from and $to.
 208+ */
 209+ public function ses (array $from, array $to) {
 210+ $this->order($from, $to);
 211+
 212+ if($this->n==0){
 213+ return $this->m;
 214+ }else if($this->m==0){
 215+ return $this->n;
 216+ }
 217+
 218+ $delta = $this->n-$this->m;
 219+
 220+ $fpkeys = range( -($this->m+1), $this->n+1);
 221+ $fpf = array_fill_keys( $fpkeys, -1);
 222+ $fpb = array_fill_keys( $fpkeys, $this->n);
 223+ $p = -1;
 224+
 225+ while(1){
 226+ ++$p;
 227+
 228+ //forward
 229+ for($k=-$p;$k<$delta;$k++){
 230+ $fpf[$k] = $this->forward_snake( $k, max( $fpf[$k-1]+1, $fpf[$k+1]), $this->m, $this->n);
 231+ if($fpf[$k]>$fpb[$k]+1){
 232+ echo "<br>0, $delta, $p";
 233+ return $delta+4*$p-4;
 234+ }
 235+ }
 236+ for($k=$delta+$p;$k>$delta;$k--){
 237+ $fpf[$k] = $this->forward_snake( $k, max( $fpf[$k-1]+1, $fpf[$k+1]), $this->m, $this->n);
 238+ if($fpf[$k]>$fpb[$k]+1){
 239+ echo "<br>0, $delta, $p";
 240+ return $delta+4*$p-4;
 241+ }
 242+ }
 243+ $fpf[$delta] = $this->forward_snake( $delta, max( $fpf[$delta-1]+1, $fpf[$delta+1]), $this->m, $this->n);
 244+
 245+ if($fpf[$delta]>$fpb[$delta]+1){
 246+ echo "<br>0, $delta, $p";
 247+ return $delta+4*$p-4;
 248+ }
 249+
 250+ //backward
 251+ for($k=$delta+$p;$k>0;$k--){
 252+ $fpb[$k] = $this->backward_snake( $k, min( $fpb[$k+1]-1, $fpb[$k-1]), -1, -1);
 253+ if($fpf[$k]>$fpb[$k]+1){
 254+ echo "<br>1, $delta, $p";
 255+ return $delta+4*$p;
 256+ }
 257+ }
 258+ for($k=-$p;$k<0;$k++){
 259+ $fpb[$k] = $this->backward_snake( $k, min( $fpb[$k+1]-1, $fpb[$k-1]), -1, -1);
 260+ if($fpf[$k]>$fpb[$k]+1){
 261+ echo "<br>1, $delta, $p";
 262+ return $delta+4*$p;
 263+ }
 264+ }
 265+ $fpb[0] = $this->backward_snake( 0, min( $fpb[1]-1, $fpb[-1]), -1, -1);
 266+
 267+ if($fpf[0]>$fpb[0]+1){
 268+ echo "<br>1, $delta, $p";
 269+ return $delta+4*$p;
 270+ }
 271+
 272+ }
 273+ }
 274+
 275+}
 276+
 277+/**
99278 * Class implementing the Diff algorithm.
100279 *
 280+ * !!!!!!!!!!!!!!!!!!!!!!
 281+ * BEWARE HERE BE DRAGONS
 282+ * This algorithm is not guaranteed to find the exact size of the LCS.
 283+ * There are border cases where it will be 1 (or more) off.
 284+ * This implmentation is useful because it is very fast.
 285+ * !!!!!!!!!!!!!!!!!!!!!!
 286+ *
101287 * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 288+ *
 289+ * @author Guy Van den Broeck
102290 */
103 -class DiffAlgorithm extends AbstractDiffAlgorithm {
 291+class FastDiffAlgorithm extends AbstractDiffAlgorithm {
104292
105293 private $a_inLcs;
106294
@@ -114,12 +302,163 @@
115303 */
116304 public function diff (array $from, array $to) {
117305 $this->order($from, $to);
 306+ $this->a_inLcs = array_fill(0,$this->m,FALSE);
 307+ $this->b_inLcs = array_fill(0,$this->n,FALSE);
118308
119309 //There is no need to remove common tokens at the beginning and end, the first snake will catch them.
120310 //Hashing the tokens is not generally applicable.
121311 //Removing tokens that are not in both sequences is O(N*M)>=O(N*P). There is little to gain when most edits are small.
122312
123 - //TODO
 313+ $this->diff_recursive(0,$this->m,0,$this->n);
 314+
 315+ if($this->swapped){
 316+ return array($this->b_inLcs,$this->a_inLcs);
 317+ }else{
 318+ return array($this->a_inLcs,$this->b_inLcs);
 319+ }
124320 }
 321+
 322+ /**
 323+ * Calculate the diff with a divide and conquer method.
 324+ *
 325+ * Ends are exclusive
 326+ */
 327+ protected function diff_recursive($x_start, $x_end, $y_start, $y_end){
 328+ echo "<br>$x_start-$x_end , $y_start-$y_end";
 329+ if($x_end>$x_start && $y_end>$y_start){
 330+
 331+ if($x_end-$x_start>$y_end-$y_start){
 332+ $temp = $x_end;
 333+ $x_end = $y_end;
 334+ $y_end = $temp;
 335+ $temp = $x_start;
 336+ $x_start = $y_start;
 337+ $y_start = $temp;
 338+ $swapped = TRUE;
 339+ }else{
 340+ $swapped = FALSE;
 341+ }
 342+
 343+ $m = $x_end-$x_start;
 344+ $n = $y_end-$y_start;
 345+
 346+ $delta = $n-$m;
 347+
 348+ $fpkeys = range( -($m+1), $n+1);
 349+ $fpf = array_fill_keys( $fpkeys, -1);
 350+ $fpb = array_fill_keys( $fpkeys, $n);
 351+ $p = -1;
 352+
 353+ while(1){
 354+ ++$p;
 355+
 356+ echo "$p\n";
 357+ echo "forward:\n";
 358+ print_r($fpf);
 359+ echo "backward:\n";
 360+ print_r($fpb);
 361+
 362+ //forward
 363+ for($k=-$p;$k<$delta;$k++){
 364+ $fpf[$k] = $this->forward_snake( $k, max( $fpf[$k-1]+1, $fpf[$k+1]), $m, $n, $x_start, $y_start, $swapped);
 365+ if($fpf[$k]>$fpb[$k]+1){
 366+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[$k]-$k+1, $x_start+$fpf[$k]-$k, $y_start+$fpb[$k]+1, $y_start+$fpf[$k], $swapped);
 367+ return;
 368+ }
 369+ }
 370+ echo "forward:\n";
 371+ print_r($fpf);
 372+ echo "backward:\n";
 373+ print_r($fpb);
 374+ for($k=$delta+$p;$k>$delta;$k--){
 375+ $fpf[$k] = $this->forward_snake( $k, max( $fpf[$k-1]+1, $fpf[$k+1]), $m, $n, $x_start, $y_start, $swapped);
 376+ if($fpf[$k]>$fpb[$k]+1){
 377+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[$k]-$k+1, $x_start+$fpf[$k]-$k, $y_start+$fpb[$k]+1, $y_start+$fpf[$k], $swapped);
 378+ return;
 379+ }
 380+ }
 381+ echo "forward:\n";
 382+ print_r($fpf);
 383+ echo "backward:\n";
 384+ print_r($fpb);
 385+ $fpf[$delta] = $this->forward_snake( $delta, max( $fpf[$delta-1]+1, $fpf[$delta+1]), $m, $n, $x_start, $y_start, $swapped);
 386+ if($fpf[$delta]>$fpb[$delta]+1){
 387+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[$delta]-$delta+1, $x_start+$fpf[$delta]-$k, $y_start+$fpb[$delta]+1, $y_start+$fpf[$delta], $swapped);
 388+ return;
 389+ }
 390+ echo "forward:\n";
 391+ print_r($fpf);
 392+ echo "backward:\n";
 393+ print_r($fpb);
 394+
 395+ //backward
 396+ for($k=$delta+$p;$k>0;$k--){
 397+ $fpb[$k] = $this->backward_snake( $k, min( $fpb[$k+1]-1, $fpb[$k-1]), -1, -1, $x_start, $y_start, $swapped);
 398+ if($fpf[$k]>$fpb[$k]+1){
 399+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[$k]-$k+1, $x_start+$fpf[$k]-$k, $y_start+$fpb[$k]+1, $y_start+$fpf[$k], $swapped);
 400+ return;
 401+ }
 402+ }
 403+ echo "forward:\n";
 404+ print_r($fpf);
 405+ echo "backward:\n";
 406+ print_r($fpb);
 407+ for($k=-$p;$k<0;$k++){
 408+ $fpb[$k] = $this->backward_snake( $k, min( $fpb[$k+1]-1, $fpb[$k-1]), -1, -1, $x_start, $y_start, $swapped);
 409+ if($fpf[$k]>$fpb[$k]+1){
 410+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[$k]-$k+1, $x_start+$fpf[$k]-$k, $y_start+$fpb[$k]+1, $y_start+$fpf[$k], $swapped);
 411+ return;
 412+ }
 413+ }
 414+ echo "forward:\n";
 415+ print_r($fpf);
 416+ echo "backward:\n";
 417+ print_r($fpb);
 418+ $fpb[0] = $this->backward_snake( 0, min( $fpb[1]-1, $fpb[-1]), -1, -1, $x_start, $y_start, $swapped);
 419+ if($fpf[0]>$fpb[0]+1){
 420+ $this->foundSnake($x_start, $x_end, $y_start, $y_end, $x_start+$fpb[0]+1, $x_start+$fpf[0], $y_start+$fpb[0]+1, $y_start+$fpf[0], $swapped);
 421+ return;
 422+ }
 423+ echo "forward:\n";
 424+ print_r($fpf);
 425+ echo "backward:\n";
 426+ print_r($fpb);
 427+
 428+ }
 429+ }
 430+ }
 431+
 432+ /**
 433+ * Mark tokens $start until $end (exclusive) as in the LCS
 434+ * Diff the part before and after the snake.
 435+ */
 436+ protected function foundSnake($x_start, $x_end, $y_start, $y_end, $x_snakestart, $x_snakeend, $y_snakestart, $y_snakeend, $swapped){
 437+ echo "<br>$x_start, $x_end, $y_start, $y_end, $x_snakestart, $x_snakeend, $y_snakestart, $y_snakeend, $swapped";
 438+ if($swapped){
 439+ $temp = $x_snakestart;
 440+ $x_snakestart = $y_snakestart;
 441+ $y_snakestart = $temp;
 442+ $temp = $x_snakeend;
 443+ $x_snakeend = $y_snakeend;
 444+ $y_snakeend = $temp;
 445+
 446+ $temp = $x_start;
 447+ $x_start = $y_start;
 448+ $y_start = $temp;
 449+ $temp = $x_end;
 450+ $x_end = $y_end;
 451+ $y_end = $temp;
 452+ }
 453+ for($x=$x_snakestart;$x<$x_snakeend;$x++){
 454+ $this->a_inLcs[$x] = TRUE;
 455+ }
 456+ for($y=$y_snakestart;$y<$y_snakeend;$y++){
 457+ $this->b_inLcs[$y] = TRUE;
 458+ }
 459+ echo "<br>left:";
 460+ $this->diff_recursive($x_start, $x_snakestart, $y_start, $y_snakestart);
 461+ echo "<br>right:";
 462+ $this->diff_recursive($x_snakeend, $x_end, $y_snakeend, $y_end);
 463+ }
125464 }
126465 ?>
\ No newline at end of file

Status & tagging log