Index: branches/visual_diff/phase3/includes/Diff.php |
— | — | @@ -1,63 +1,125 @@ |
2 | 2 | <?php |
| 3 | + |
3 | 4 | /** |
4 | | - * Class implementing the Diff/LCS algorithm. |
| 5 | + * Class providing primitives for LCS or Diff. |
5 | 6 | * |
6 | | - * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm" |
7 | | - * |
8 | 7 | * @author Guy Van den Broeck |
9 | 8 | */ |
| 9 | +class AbstractDiffAlgorithm { |
10 | 10 | |
11 | | -class Diff { |
| 11 | + protected $a; |
| 12 | + protected $m; |
12 | 13 | |
13 | | - private $bigSequence; |
14 | | - private $n; |
15 | | - private $bigSequence_inLcs; |
| 14 | + protected $b; |
| 15 | + protected $n; |
16 | 16 | |
17 | | - private $smallSequence; |
18 | | - private $m; |
19 | | - private $smallSequence_inLcs; |
20 | | - |
21 | | - private $lcsLength; |
22 | | - |
23 | | - /* |
24 | | - * Diffs two arrays and returns array($from_inLcs, $to_inLcs) |
25 | | - * where {from,to}_inLcs is 0 if the token was {removed,added} |
26 | | - * and 1 if it is in the longest common subsequence. |
| 17 | + /** |
| 18 | + * The sequences need to be ordered by length. |
27 | 19 | */ |
28 | | - public function diff (array $from, array $to) { |
29 | | - |
| 20 | + protected function order(array $from, array $to){ |
30 | 21 | if(sizeof($from)>=sizeof($to)){ |
31 | | - $bigSequence = $from; |
32 | | - $n = sizeof($from); |
33 | | - |
34 | | - $smallSequence = $to; |
35 | | - $m = sizeof($to); |
| 22 | + $this->a = $to; |
| 23 | + $this->m = sizeof($to); |
| 24 | + $this->b = $from; |
| 25 | + $this->n = sizeof($from); |
36 | 26 | }else{ |
37 | | - $bigSequence = $to; |
38 | | - $n = sizeof($to); |
39 | | - |
40 | | - $smallSequence = $from; |
41 | | - $m = sizeof($from); |
| 27 | + $this->a = $from; |
| 28 | + $this->m = sizeof($from); |
| 29 | + $this->b = $to; |
| 30 | + $this->n = sizeof($to); |
42 | 31 | } |
| 32 | + } |
43 | 33 | |
44 | | - //There is no need to remove common tokens at the beginning and end, the first snake will catch them. |
45 | | - //Hashing the tokens is not generally applicable. |
46 | | - //Removing tokens that are not in both sequences is O(N*M)>=O(NP). There is little to gain when most edits are small. |
47 | | - |
| 34 | + /** |
| 35 | + * Slide down the snake untill reaching an inequality. |
| 36 | + */ |
| 37 | + protected function snake($k,$y){ |
| 38 | + $x = $y-$k; |
| 39 | + while($x < $this->m && $y < $this->n && $this->equals($this->a[$x],$this->b[$y])){ |
| 40 | + ++$x; |
| 41 | + ++$y; |
| 42 | + } |
| 43 | + return $y; |
48 | 44 | } |
49 | 45 | |
| 46 | + /** |
| 47 | + * Override this method to compute the LCS with different equality measures. |
| 48 | + */ |
| 49 | + public function equals($left, $right){ |
| 50 | + return $left == $right; |
| 51 | + } |
| 52 | +} |
| 53 | + |
| 54 | +/** |
| 55 | + * Class implementing the Diff algorithm. |
| 56 | + * |
| 57 | + * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm" |
| 58 | + */ |
| 59 | +class LcsAlgorithm extends AbstractDiffAlgorithm { |
| 60 | + |
50 | 61 | /* |
51 | | - * Computers the length of the longest common subsequence of |
| 62 | + * Computers the length of the longest common subsequence of |
52 | 63 | * the arrays $from and $to. |
53 | 64 | */ |
54 | 65 | public function lcs (array $from, array $to) { |
55 | | - $n = sizeof($from); |
56 | | - $m = sizeof($to); |
57 | | - //TODO |
| 66 | + return (sizeof($from)+sizeof($to)-$this->ses( $from, $to))/2; |
58 | 67 | } |
| 68 | + |
| 69 | + /* |
| 70 | + * Computers the length of the shortest edit script of |
| 71 | + * the arrays $from and $to. |
| 72 | + */ |
| 73 | + public function ses (array $from, array $to) { |
| 74 | + $this->order($from, $to); |
59 | 75 | |
60 | | - public function equals($left, $right){ |
61 | | - return $left == $right; |
| 76 | + $delta = $this->n-$this->m; |
| 77 | + $fp = array_fill_keys( range( -($this->m+1), $this->n+1), -1); |
| 78 | + $p = -1; |
| 79 | + |
| 80 | + do{ |
| 81 | + ++$p; |
| 82 | + for($k=-$p;$k<$delta;$k++){ |
| 83 | + $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1])); |
| 84 | + } |
| 85 | + |
| 86 | + for($k=$delta+$p;$k>$delta;$k--){ |
| 87 | + $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1])); |
| 88 | + } |
| 89 | + |
| 90 | + $fp[$k] = $this->snake( $delta, max( $fp[$delta-1]+1, $fp[$delta+1])); |
| 91 | + }while($fp[$delta]!=$this->n); |
| 92 | + |
| 93 | + return $delta+2*$p; |
62 | 94 | } |
| 95 | + |
63 | 96 | } |
| 97 | + |
| 98 | +/** |
| 99 | + * Class implementing the Diff algorithm. |
| 100 | + * |
| 101 | + * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm" |
| 102 | + */ |
| 103 | +class DiffAlgorithm extends AbstractDiffAlgorithm { |
| 104 | + |
| 105 | + private $a_inLcs; |
| 106 | + |
| 107 | + private $b_inLcs; |
| 108 | + |
| 109 | + private $lcsLength; |
| 110 | + |
| 111 | + /* |
| 112 | + * Diffs two arrays and returns array($from_inLcs, $to_inLcs) |
| 113 | + * where {from,to}_inLcs is FALSE if the token was {removed,added} |
| 114 | + * and TRUE if it is in the longest common subsequence. |
| 115 | + */ |
| 116 | + public function diff (array $from, array $to) { |
| 117 | + $this->order($from, $to); |
| 118 | + |
| 119 | + //There is no need to remove common tokens at the beginning and end, the first snake will catch them. |
| 120 | + //Hashing the tokens is not generally applicable. |
| 121 | + //Removing tokens that are not in both sequences is O(N*M)>=O(N*P). There is little to gain when most edits are small. |
| 122 | + |
| 123 | + //TODO |
| 124 | + } |
| 125 | +} |
64 | 126 | ?> |
\ No newline at end of file |