r37013 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r37012‎ | r37013 | r37014 >
Date:22:13, 3 July 2008
Author:guyvdb
Status:old
Tags:
Comment:
New LCS implementation - 2 to 3 times faster than diff
Modified paths:
  • /branches/visual_diff/phase3/includes/Diff.php (modified) (history)

Diff [purge]

Index: branches/visual_diff/phase3/includes/Diff.php
@@ -1,63 +1,125 @@
22 <?php
 3+
34 /**
4 - * Class implementing the Diff/LCS algorithm.
 5+ * Class providing primitives for LCS or Diff.
56 *
6 - * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
7 - *
87 * @author Guy Van den Broeck
98 */
 9+class AbstractDiffAlgorithm {
1010
11 -class Diff {
 11+ protected $a;
 12+ protected $m;
1213
13 - private $bigSequence;
14 - private $n;
15 - private $bigSequence_inLcs;
 14+ protected $b;
 15+ protected $n;
1616
17 - private $smallSequence;
18 - private $m;
19 - private $smallSequence_inLcs;
20 -
21 - private $lcsLength;
22 -
23 - /*
24 - * Diffs two arrays and returns array($from_inLcs, $to_inLcs)
25 - * where {from,to}_inLcs is 0 if the token was {removed,added}
26 - * and 1 if it is in the longest common subsequence.
 17+ /**
 18+ * The sequences need to be ordered by length.
2719 */
28 - public function diff (array $from, array $to) {
29 -
 20+ protected function order(array $from, array $to){
3021 if(sizeof($from)>=sizeof($to)){
31 - $bigSequence = $from;
32 - $n = sizeof($from);
33 -
34 - $smallSequence = $to;
35 - $m = sizeof($to);
 22+ $this->a = $to;
 23+ $this->m = sizeof($to);
 24+ $this->b = $from;
 25+ $this->n = sizeof($from);
3626 }else{
37 - $bigSequence = $to;
38 - $n = sizeof($to);
39 -
40 - $smallSequence = $from;
41 - $m = sizeof($from);
 27+ $this->a = $from;
 28+ $this->m = sizeof($from);
 29+ $this->b = $to;
 30+ $this->n = sizeof($to);
4231 }
 32+ }
4333
44 - //There is no need to remove common tokens at the beginning and end, the first snake will catch them.
45 - //Hashing the tokens is not generally applicable.
46 - //Removing tokens that are not in both sequences is O(N*M)>=O(NP). There is little to gain when most edits are small.
47 -
 34+ /**
 35+ * Slide down the snake untill reaching an inequality.
 36+ */
 37+ protected function snake($k,$y){
 38+ $x = $y-$k;
 39+ while($x < $this->m && $y < $this->n && $this->equals($this->a[$x],$this->b[$y])){
 40+ ++$x;
 41+ ++$y;
 42+ }
 43+ return $y;
4844 }
4945
 46+ /**
 47+ * Override this method to compute the LCS with different equality measures.
 48+ */
 49+ public function equals($left, $right){
 50+ return $left == $right;
 51+ }
 52+}
 53+
 54+/**
 55+ * Class implementing the Diff algorithm.
 56+ *
 57+ * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 58+ */
 59+class LcsAlgorithm extends AbstractDiffAlgorithm {
 60+
5061 /*
51 - * Computers the length of the longest common subsequence of
 62+ * Computers the length of the longest common subsequence of
5263 * the arrays $from and $to.
5364 */
5465 public function lcs (array $from, array $to) {
55 - $n = sizeof($from);
56 - $m = sizeof($to);
57 - //TODO
 66+ return (sizeof($from)+sizeof($to)-$this->ses( $from, $to))/2;
5867 }
 68+
 69+ /*
 70+ * Computers the length of the shortest edit script of
 71+ * the arrays $from and $to.
 72+ */
 73+ public function ses (array $from, array $to) {
 74+ $this->order($from, $to);
5975
60 - public function equals($left, $right){
61 - return $left == $right;
 76+ $delta = $this->n-$this->m;
 77+ $fp = array_fill_keys( range( -($this->m+1), $this->n+1), -1);
 78+ $p = -1;
 79+
 80+ do{
 81+ ++$p;
 82+ for($k=-$p;$k<$delta;$k++){
 83+ $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1]));
 84+ }
 85+
 86+ for($k=$delta+$p;$k>$delta;$k--){
 87+ $fp[$k] = $this->snake( $k, max( $fp[$k-1]+1, $fp[$k+1]));
 88+ }
 89+
 90+ $fp[$k] = $this->snake( $delta, max( $fp[$delta-1]+1, $fp[$delta+1]));
 91+ }while($fp[$delta]!=$this->n);
 92+
 93+ return $delta+2*$p;
6294 }
 95+
6396 }
 97+
 98+/**
 99+ * Class implementing the Diff algorithm.
 100+ *
 101+ * This is the O(NP) variant as descibed by Wu et al. in "An O(NP) Sequence Comparison Algorithm"
 102+ */
 103+class DiffAlgorithm extends AbstractDiffAlgorithm {
 104+
 105+ private $a_inLcs;
 106+
 107+ private $b_inLcs;
 108+
 109+ private $lcsLength;
 110+
 111+ /*
 112+ * Diffs two arrays and returns array($from_inLcs, $to_inLcs)
 113+ * where {from,to}_inLcs is FALSE if the token was {removed,added}
 114+ * and TRUE if it is in the longest common subsequence.
 115+ */
 116+ public function diff (array $from, array $to) {
 117+ $this->order($from, $to);
 118+
 119+ //There is no need to remove common tokens at the beginning and end, the first snake will catch them.
 120+ //Hashing the tokens is not generally applicable.
 121+ //Removing tokens that are not in both sequences is O(N*M)>=O(N*P). There is little to gain when most edits are small.
 122+
 123+ //TODO
 124+ }
 125+}
64126 ?>
\ No newline at end of file

Status & tagging log