r12986 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r12985‎ | r12986 | r12987 >
Date:10:36, 9 February 2006
Author:timstarling
Status:old
Tags:
Comment:
Our own diff engine, ported from PHP to C++. Makefile, interface file and spec file are copied from the original wikidiff, they are untested at the current time except for the standalone target in the makefile.
Modified paths:
  • /trunk/extensions/wikidiff2 (added) (history)
  • /trunk/extensions/wikidiff2/DiffEngine.h (added) (history)
  • /trunk/extensions/wikidiff2/Makefile (added) (history)
  • /trunk/extensions/wikidiff2/compile.sh (added) (history)
  • /trunk/extensions/wikidiff2/standalone.cpp (added) (history)
  • /trunk/extensions/wikidiff2/wikidiff2.cpp (added) (history)
  • /trunk/extensions/wikidiff2/wikidiff2.h (added) (history)
  • /trunk/extensions/wikidiff2/wikidiff2.i (added) (history)
  • /trunk/extensions/wikidiff2/wikidiff2.spec (added) (history)

Diff [purge]

Index: trunk/extensions/wikidiff2/wikidiff2.cpp
@@ -0,0 +1,343 @@
 2+/**
 3+ * Diff formatter, based on code by Steinar H. Gunderson, converted to work with the
 4+ * Dairiki diff engine by Tim Starling
 5+ *
 6+ * GPL.
 7+ */
 8+
 9+#include <stdio.h>
 10+
 11+#include "wikidiff2.h"
 12+
 13+void print_diff(std::vector<std::string> &text1, std::vector<std::string> &text2, int num_lines_context, std::string &ret)
 14+{
 15+ // first do line-level diff
 16+ Diff<std::string> linediff(text1, text2);
 17+
 18+ int ctx = 0;
 19+ int from_ind = 1, to_ind = 1;
 20+ int num_ops = linediff.size();
 21+
 22+ for (int i = 0; i < num_ops; ++i) {
 23+ int n, j, n1, n2;
 24+ // Line 1 changed, show heading with no leading context
 25+ if (linediff[i].op != DiffOp<std::string>::copy && i == 0) {
 26+ ret +=
 27+ "<tr>\n"
 28+ " <td colspan=\"2\" align=\"left\"><strong><!--LINE 1--></strong></td>\n"
 29+ " <td colspan=\"2\" align=\"left\"><strong><!--LINE 1--></strong></td>\n"
 30+ "</tr>\n";
 31+ }
 32+
 33+ switch (linediff[i].op) {
 34+ case DiffOp<std::string>::add:
 35+ // inserted lines
 36+ n = linediff[i].to.size();
 37+ for (j=0; j<n; j++) {
 38+ print_add(*linediff[i].to[j], ret);
 39+ }
 40+ to_ind += n;
 41+ break;
 42+ case DiffOp<std::string>::del:
 43+ // deleted lines
 44+ n = linediff[i].from.size();
 45+ for (j=0; j<n; j++) {
 46+ print_del(*linediff[i].from[j], ret);
 47+ }
 48+ from_ind += n;
 49+ break;
 50+ case DiffOp<std::string>::copy:
 51+ // copy/context
 52+ n = linediff[i].from.size();
 53+ for (j=0; j<n; j++) {
 54+ if ((i != 0 && j < num_lines_context) /*trailing*/
 55+ || (i != num_ops - 1 && j >= n - num_lines_context)) /*leading*/ {
 56+ if (j == n - num_lines_context || (j == 0 && n < num_lines_context)) {
 57+ // Print Line: heading
 58+ char buf[256]; // should be plenty
 59+ sprintf(buf,
 60+ "<tr>\n"
 61+ " <td colspan=\"2\" align=\"left\"><strong><!--LINE %u--></strong></td>\n"
 62+ " <td colspan=\"2\" align=\"left\"><strong><!--LINE %u--></strong></td>\n"
 63+ "</tr>\n",
 64+ from_ind, to_ind);
 65+ ret += buf;
 66+ }
 67+ // Print context
 68+ ret +=
 69+ "<tr>\n"
 70+ " <td> </td>\n"
 71+ " <td class=\"diff-context\">";
 72+ print_htmlspecialchars(*linediff[i].from[j], ret);
 73+ ret +=
 74+ "</td>\n"
 75+ " <td> </td>\n"
 76+ " <td class=\"diff-context\">";
 77+ print_htmlspecialchars(*linediff[i].from[j], ret);
 78+ ret += "</td>\n</tr>\n";
 79+ }
 80+ from_ind++;
 81+ to_ind++;
 82+ }
 83+ break;
 84+ case DiffOp<std::string>::change:
 85+ // replace, ie. we do a word diff between the two sets of lines
 86+ n1 = linediff[i].from.size();
 87+ n2 = linediff[i].to.size();
 88+ n = std::min(n1, n2);
 89+ for (j=0; j<n; j++) {
 90+ print_worddiff(*linediff[i].from[j], *linediff[i].to[j], ret);
 91+ }
 92+ from_ind += n;
 93+ to_ind += n;
 94+ if (n1 > n2) {
 95+ for (j=n2; j<n1; j++) {
 96+ print_del(*linediff[i].from[j], ret);
 97+ }
 98+ } else {
 99+ for (j=n1; j<n2; j++) {
 100+ print_add(*linediff[i].to[j], ret);
 101+ }
 102+ }
 103+ break;
 104+ }
 105+ }
 106+}
 107+
 108+void print_add(const std::string & line, std::string & ret)
 109+{
 110+ ret += "<tr>\n"
 111+ " <td colspan=\"2\">&nbsp;</td>\n"
 112+ " <td>+</td>\n"
 113+ " <td class=\"diff-addedline\">";
 114+ print_htmlspecialchars(line, ret);
 115+ ret += "</td>\n</tr>\n";
 116+}
 117+
 118+void print_del(const std::string & line, std::string & ret)
 119+{
 120+ ret += "<tr>\n"
 121+ " <td>-</td>\n"
 122+ " <td class=\"diff-deletedline\">";
 123+ print_htmlspecialchars(line, ret);
 124+ ret += "</td>\n"
 125+ " <td colspan=\"2\">&nbsp;</td>\n"
 126+ "</tr>\n";
 127+}
 128+
 129+void print_worddiff(const std::string & text1, const std::string & text2, std::string &ret)
 130+{
 131+ std::vector<Word> text1_words, text2_words;
 132+
 133+ split_tokens(text1.c_str(), text1_words);
 134+ split_tokens(text2.c_str(), text2_words);
 135+ Diff<Word> worddiff(text1_words, text2_words);
 136+
 137+ //debug_print_worddiff(worddiff, ret);
 138+
 139+ // print twice; first for left side, then for right side
 140+ ret += "<tr>\n"
 141+ " <td>-</td>\n"
 142+ " <td class=\"diff-deletedline\">\n";
 143+ print_worddiff_side(worddiff, false, ret);
 144+ ret += "\n </td>\n"
 145+ " <td>+</td>\n"
 146+ " <td class=\"diff-addedline\">\n";
 147+ print_worddiff_side(worddiff, true, ret);
 148+ ret += "\n </td>\n"
 149+ "</tr>\n";
 150+}
 151+
 152+void debug_print_worddiff(Diff<Word> &worddiff, std::string &ret)
 153+{
 154+ for (unsigned i = 0; i < worddiff.size(); ++i) {
 155+ DiffOp<Word> & op = worddiff[i];
 156+ switch (op.op) {
 157+ case DiffOp<Word>::copy:
 158+ ret += "Copy\n";
 159+ break;
 160+ case DiffOp<Word>::del:
 161+ ret += "Delete\n";
 162+ break;
 163+ case DiffOp<Word>::add:
 164+ ret += "Add\n";
 165+ break;
 166+ case DiffOp<Word>::change:
 167+ ret += "Change\n";
 168+ break;
 169+ }
 170+ ret += "From: ";
 171+ bool first = true;
 172+ for (int j=0; j<op.from.size(); j++) {
 173+ if (first) {
 174+ first = false;
 175+ } else {
 176+ ret += ", ";
 177+ }
 178+ ret += "(";
 179+ ret += op.from[j]->whole + ")";
 180+ }
 181+ ret += "\n";
 182+ ret += "To: ";
 183+ first = true;
 184+ for (int j=0; j<op.to.size(); j++) {
 185+ if (first) {
 186+ first = false;
 187+ } else {
 188+ ret += ", ";
 189+ }
 190+ ret += "(";
 191+ ret += op.to[j]->whole + ")";
 192+ }
 193+ ret += "\n\n";
 194+ }
 195+}
 196+
 197+void print_worddiff_side(Diff<Word> &worddiff, bool added, std::string &ret)
 198+{
 199+ for (unsigned i = 0; i < worddiff.size(); ++i) {
 200+ DiffOp<Word> & op = worddiff[i];
 201+ int n, j;
 202+ if (op.op == DiffOp<Word>::copy) {
 203+ n = op.from.size();
 204+ for (j=0; j<n; j++) {
 205+ print_htmlspecialchars(op.from[j]->whole, ret);
 206+ }
 207+ } else if (!added && (op.op == DiffOp<Word>::del || op.op == DiffOp<Word>::change)) {
 208+ n = op.from.size();
 209+ ret += "<span class=\"diffchange\">";
 210+ for (j=0; j<n; j++) {
 211+ print_htmlspecialchars(op.from[j]->whole, ret);
 212+ }
 213+ ret += "</span>";
 214+ } else if (added && (op.op == DiffOp<Word>::add || op.op == DiffOp<Word>::change)) {
 215+ n = op.to.size();
 216+ ret += "<span class=\"diffchange\">";
 217+ for (j=0; j<n; j++) {
 218+ print_htmlspecialchars(op.to[j]->whole, ret);
 219+ }
 220+ ret += "</span>";
 221+ }
 222+ }
 223+}
 224+
 225+void print_htmlspecialchars(const std::string & input, std::string & ret)
 226+{
 227+ size_t start = 0;
 228+ size_t end = input.find_first_of("<>&");
 229+ while (end != std::string::npos) {
 230+ if (end > start) {
 231+ ret.append(input, start, end - start);
 232+ }
 233+ switch (input[end]) {
 234+ case '<':
 235+ ret.append("&lt;");
 236+ break;
 237+ case '>':
 238+ ret.append("&gt;");
 239+ break;
 240+ default /*case '&'*/:
 241+ ret.append("&amp;");
 242+ }
 243+ start = end + 1;
 244+ end = input.find_first_of("<>&", start);
 245+ }
 246+ // Append the rest of the string after the last special character
 247+ if (start < input.size()) {
 248+ ret.append(input, start, input.size() - start);
 249+ }
 250+}
 251+
 252+
 253+inline bool my_istext(unsigned char ch)
 254+{
 255+ return (ch >= '0' && ch <= '9') ||
 256+ (ch == '_') ||
 257+ (ch >= 'A' && ch <= 'Z') ||
 258+ (ch >= 'a' && ch <= 'z') ||
 259+ (ch >= 0x80 /* && ch <= 0xFF */);
 260+}
 261+
 262+// split a string into multiple tokens, just like the monster regex in DifferenceEngine.php
 263+void split_tokens(const char *text, std::vector<Word> &tokens)
 264+{
 265+ if (strlen(text) > MAX_DIFF_LINE) {
 266+ std::string everything(text);
 267+ tokens.push_back(Word(everything, everything));
 268+ return;
 269+ }
 270+
 271+ const char *ptr = text;
 272+
 273+ while (*ptr) {
 274+ std::string body, suffix;
 275+ char ch = *ptr;
 276+
 277+ // first group has three different opportunities:
 278+ if (ch == ' ' || ch == '\t') {
 279+ // one or more whitespace characters (but not \n)
 280+ while (*ptr == ' ' || *ptr == '\t') {
 281+ body.push_back(*ptr++);
 282+ }
 283+ } else if (my_istext(ch)) {
 284+ // one or more text characters
 285+ while (my_istext(*ptr)) {
 286+ body.push_back(*ptr++);
 287+ }
 288+ } else {
 289+ // one character, no matter what it is
 290+ body.push_back(*ptr++);
 291+ }
 292+
 293+ // second group: if the first character was not \n,
 294+ // any whitespace character that is not \n (if any)
 295+ if (ch != '\n') {
 296+ if (*ptr == ' ' || *ptr == '\t') {
 297+ suffix.push_back(*ptr++);
 298+ }
 299+ }
 300+
 301+ tokens.push_back(Word(body, suffix));
 302+ }
 303+}
 304+
 305+void line_explode(const char *text, std::vector<std::string> &lines)
 306+{
 307+ const char *ptr = text;
 308+ while (*ptr) {
 309+ const char *ptr2 = strchr(ptr, '\n');
 310+ if (ptr2 == NULL)
 311+ ptr2 = ptr + strlen(ptr);
 312+
 313+ lines.push_back(std::string(ptr, ptr2));
 314+
 315+ ptr = ptr2;
 316+ if (*ptr)
 317+ ++ptr;
 318+ }
 319+}
 320+
 321+// Finally, the entry point for the PHP code.
 322+const char *wikidiff2_do_diff(const char *text1, const char *text2, int num_lines_context)
 323+{
 324+ try {
 325+ std::vector<std::string> lines1;
 326+ std::vector<std::string> lines2;
 327+ std::string ret;
 328+
 329+ // constant reallocation is bad for performance (note: we might want to reduce this
 330+ // later, it might be too much)
 331+ ret.reserve(strlen(text1) + strlen(text2) + 10000);
 332+
 333+ line_explode(text1, lines1);
 334+ line_explode(text2, lines2);
 335+ print_diff(lines1, lines2, num_lines_context, ret);
 336+
 337+ return strdup(ret.c_str());
 338+ } catch (std::bad_alloc &e) {
 339+ return strdup("Out of memory in diff.");
 340+ } catch (...) {
 341+ return strdup("Unknown exception in diff.");
 342+ }
 343+}
 344+
Property changes on: trunk/extensions/wikidiff2/wikidiff2.cpp
___________________________________________________________________
Added: svn:keywords
1345 + Author Date Id Revision
Added: svn:eol-style
2346 + native
Index: trunk/extensions/wikidiff2/compile.sh
@@ -0,0 +1,4 @@
 2+#! /bin/sh
 3+swig -php4 -c++ wikidiff2.i
 4+g++ -O2 `php-config --includes` -shared -o php_wikidiff2.so wikidiff2.cpp wikidiff2_wrap.cpp
 5+
Property changes on: trunk/extensions/wikidiff2/compile.sh
___________________________________________________________________
Added: svn:keywords
16 + Author Date Id Revision
Added: svn:eol-style
27 + native
Index: trunk/extensions/wikidiff2/DiffEngine.h
@@ -0,0 +1,531 @@
 2+/**
 3+ * GPL blah blah, see below for history
 4+ */
 5+
 6+#ifndef DIFFENGINE_H
 7+#define DIFFENGINE_H
 8+
 9+#include <vector>
 10+#include <map>
 11+#include <set>
 12+#include <utility>
 13+
 14+/**
 15+ * Diff operation
 16+ *
 17+ * from and to are vectors containing pointers to the objects passed in from_lines and to_lines
 18+ *
 19+ * op is one of the following
 20+ * copy: A sequence of lines (in from) which are the same in both files.
 21+ * del: A sequence of lines (in from) which were in the first file but not the second.
 22+ * add: A sequence of lines (in to) which were in the second file but not the first.
 23+ * change: A sequence of lines which are different between the two files. Lines from the
 24+ * first file are in from, lines from the second are in to. The two vectors need
 25+ * not be the same length.
 26+ */
 27+template<typename T>
 28+class DiffOp
 29+{
 30+ public:
 31+ DiffOp(int op_, const std::vector<const T*> & from_, const std::vector<const T*> & to_)
 32+ : op(op_), from(from_), to(to_) {}
 33+
 34+ enum {copy, del, add, change};
 35+ int op;
 36+ std::vector<const T*> from;
 37+ std::vector<const T*> to;
 38+};
 39+
 40+/**
 41+ * Basic diff template class. After construction, edits will contain a vector of DiffOp
 42+ * objects representing the diff
 43+ */
 44+template<typename T>
 45+class Diff
 46+{
 47+ public:
 48+ Diff(const std::vector<T> & from_lines, const std::vector<T> & to_lines);
 49+
 50+ virtual void add_edit(const DiffOp<T> & edit) {
 51+ edits.push_back(edit);
 52+ }
 53+ unsigned size() { return edits.size(); }
 54+ DiffOp<T> & operator[](int i) {return edits[i];}
 55+
 56+ std::vector<DiffOp<T> > edits;
 57+};
 58+/**
 59+ * Class used internally by Diff to actually compute the diffs.
 60+ *
 61+ * The algorithm used here is mostly lifted from the perl module
 62+ * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
 63+ * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
 64+ *
 65+ * More ideas are taken from:
 66+ * http://www.ics.uci.edu/~eppstein/161/960229.html
 67+ *
 68+ * Some ideas are (and a bit of code) are from from analyze.c, from GNU
 69+ * diffutils-2.7, which can be found at:
 70+ * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
 71+ *
 72+ * This implementation is largely due to Geoffrey T. Dairiki, who wrote this
 73+ * diff engine for phpwiki 1-3.3. It was then adopted by MediaWiki.
 74+ *
 75+ * Finally, it was ported to C++ by Tim Starling in February 2006
 76+ *
 77+ * @access private
 78+ * @package MediaWiki
 79+ * @subpackage DifferenceEngine
 80+ */
 81+
 82+template<typename T>
 83+class _DiffEngine
 84+{
 85+ public:
 86+ _DiffEngine() : done(false) {}
 87+ void clear();
 88+ void diff (const std::vector<T> & from_lines,
 89+ const std::vector<T> & to_lines, Diff<T> & diff);
 90+ int _lcs_pos (int ypos);
 91+ void _compareseq (int xoff, int xlim, int yoff, int ylim);
 92+ void _shift_boundaries (const std::vector<T> & lines, std::vector<bool> & changed,
 93+ const std::vector<bool> & other_changed);
 94+ protected:
 95+ int _diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
 96+ std::vector<std::pair<int, int> > & seps);
 97+
 98+ std::vector<bool> xchanged, ychanged;
 99+ std::vector<const T*> xv, yv;
 100+ std::vector<int> xind, yind;
 101+ std::map<int, int> seq;
 102+ std::set<int> in_seq;
 103+ int lcs;
 104+ bool done;
 105+};
 106+
 107+//-----------------------------------------------------------------------------
 108+// _DiffEngine implementation
 109+//-----------------------------------------------------------------------------
 110+template<typename T>
 111+void _DiffEngine<T>::clear()
 112+{
 113+ xchanged.clear();
 114+ ychanged.clear();
 115+ xv.clear();
 116+ yv.clear();
 117+ xind.clear();
 118+ yind.clear();
 119+ seq.clear();
 120+ in_seq.clear();
 121+ done = false;
 122+}
 123+
 124+template<typename T>
 125+void _DiffEngine<T>::diff (const std::vector<T> & from_lines,
 126+ const std::vector<T> & to_lines, Diff<T> & diff)
 127+{
 128+ int n_from = (int)from_lines.size();
 129+ int n_to = (int)to_lines.size();
 130+
 131+ // If this diff engine has been used before for a diff, clear the member variables
 132+ if (done) {
 133+ clear();
 134+ }
 135+ xchanged.resize(n_from);
 136+ ychanged.resize(n_to);
 137+
 138+ // Skip leading common lines.
 139+ int skip, endskip;
 140+ for (skip = 0; skip < n_from && skip < n_to; skip++) {
 141+ if (from_lines[skip] != to_lines[skip])
 142+ break;
 143+ xchanged[skip] = ychanged[skip] = false;
 144+ }
 145+ // Skip trailing common lines.
 146+ int xi = n_from, yi = n_to;
 147+ for (endskip = 0; --xi > skip && --yi > skip; endskip++) {
 148+ if (from_lines[xi] != to_lines[yi])
 149+ break;
 150+ xchanged[xi] = ychanged[yi] = false;
 151+ }
 152+
 153+ // Ignore lines which do not exist in both files.
 154+ std::set<T> xhash, yhash;
 155+ for (xi = skip; xi < n_from - endskip; xi++) {
 156+ xhash.insert(from_lines[xi]);
 157+ }
 158+
 159+ for (yi = skip; yi < n_to - endskip; yi++) {
 160+ const T & line = to_lines[yi];
 161+ if ( (ychanged[yi] = (xhash.find(line) == xhash.end())) )
 162+ continue;
 163+ yhash.insert(line);
 164+ yv.push_back(&line);
 165+ yind.push_back(yi);
 166+ }
 167+ for (xi = skip; xi < n_from - endskip; xi++) {
 168+ const T & line = from_lines[xi];
 169+ if ( (xchanged[xi] = (yhash.find(line) == yhash.end())) )
 170+ continue;
 171+ xv.push_back(&line);
 172+ xind.push_back(xi);
 173+ }
 174+
 175+ // Find the LCS.
 176+ _compareseq(0, xv.size(), 0, yv.size());
 177+
 178+ // Merge edits when possible
 179+ _shift_boundaries(from_lines, xchanged, ychanged);
 180+ _shift_boundaries(to_lines, ychanged, xchanged);
 181+
 182+ // Compute the edit operations.
 183+ xi = yi = 0;
 184+ while (xi < n_from || yi < n_to) {
 185+ assert(yi < n_to || xchanged[xi]);
 186+ assert(xi < n_from || ychanged[yi]);
 187+
 188+ // Skip matching "snake".
 189+ std::vector<const T*> copy;
 190+ std::vector<const T*> empty;
 191+ while (xi < n_from && yi < n_to && !xchanged[xi] && !ychanged[yi]) {
 192+ copy.push_back(&from_lines[xi]);
 193+ ++xi;
 194+ ++yi;
 195+ }
 196+ if (copy.size())
 197+ diff.add_edit(DiffOp<T>(DiffOp<T>::copy, copy, empty));
 198+
 199+ // Find deletes & adds.
 200+ std::vector<const T*> del;
 201+ while (xi < n_from && xchanged[xi])
 202+ del.push_back(&from_lines[xi++]);
 203+
 204+ std::vector<const T*> add;
 205+ while (yi < n_to && ychanged[yi])
 206+ add.push_back(&to_lines[yi++]);
 207+
 208+ if (del.size() && add.size())
 209+ diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
 210+ else if (del.size())
 211+ diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
 212+ else if (add.size())
 213+ diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
 214+ }
 215+
 216+ done = true;
 217+}
 218+
 219+/* Divide the Largest Common Subsequence (LCS) of the sequences
 220+ * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
 221+ * sized segments.
 222+ *
 223+ * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an
 224+ * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
 225+ * sub sequences. The first sub-sequence is contained in [X0, X1),
 226+ * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note
 227+ * that (X0, Y0) == (XOFF, YOFF) and
 228+ * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
 229+ *
 230+ * This function assumes that the first lines of the specified portions
 231+ * of the two files do not match, and likewise that the last lines do not
 232+ * match. The caller must trim matching lines from the beginning and end
 233+ * of the portions it is going to specify.
 234+ */
 235+template <typename T>
 236+int _DiffEngine<T>::_diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
 237+ std::vector<std::pair<int, int> > & seps)
 238+{
 239+ using std::vector;
 240+ using std::swap;
 241+ using std::make_pair;
 242+ bool flip = false;
 243+ std::map<T, vector<int> > ymatches;
 244+
 245+ if (xlim - xoff > ylim - yoff) {
 246+ // Things seems faster (I'm not sure I understand why)
 247+ // when the shortest sequence in X.
 248+ flip = true;
 249+ swap(xoff, yoff);
 250+ swap(xlim, ylim);
 251+ }
 252+
 253+ if (flip)
 254+ for (int i = ylim - 1; i >= yoff; i--)
 255+ ymatches[*xv[i]].push_back(i);
 256+ else
 257+ for (int i = ylim - 1; i >= yoff; i--)
 258+ ymatches[*yv[i]].push_back(i);
 259+
 260+ lcs = 0;
 261+ seq[0] = yoff - 1;
 262+ in_seq.clear();
 263+ vector<vector<int> > ymids(lcs+1);
 264+ for (int i = 0; i <= lcs; i++) {
 265+ ymids[i].resize(nchunks);
 266+ }
 267+
 268+ int numer = xlim - xoff + nchunks - 1;
 269+ int x = xoff, x1, y1;
 270+ for (int chunk = 0; chunk < nchunks; chunk++) {
 271+ if (chunk > 0)
 272+ for (int i = 0; i <= lcs; i++)
 273+ ymids[i][chunk-1] = seq[i];
 274+
 275+ x1 = xoff + (int)((numer + (xlim-xoff)*chunk) / nchunks);
 276+ for ( ; x < x1; x++) {
 277+ const T & line = flip ? *yv[x] : *xv[x];
 278+ if (ymatches.find(line) == ymatches.end())
 279+ continue;
 280+ vector<int> & matches = ymatches[line];
 281+ vector<int>::iterator y;
 282+ int k = 0;
 283+ for (y = matches.begin(); y != matches.end(); ++y) {
 284+ if (!in_seq.count(*y)) {
 285+ k = _lcs_pos(*y);
 286+ assert(k > 0);
 287+ ymids[k] = ymids[k-1];
 288+ break;
 289+ }
 290+ }
 291+ for (y = matches.begin(); y != matches.end(); ++y) {
 292+ if (*y > seq[k-1]) {
 293+ assert(*y < seq[k]);
 294+ // Optimization: this is a common case:
 295+ // next match is just replacing previous match.
 296+ in_seq.erase(seq[k]);
 297+ seq[k] = *y;
 298+ in_seq.insert(*y);
 299+ } else if (!in_seq.count(*y)) {
 300+ k = _lcs_pos(*y);
 301+ assert(k > 0);
 302+ ymids[k] = ymids[k-1];
 303+ }
 304+ }
 305+ }
 306+ }
 307+
 308+ seps.clear();
 309+ seps.resize(nchunks + 1);
 310+
 311+ seps[0] = flip ? make_pair(yoff, xoff) : make_pair(xoff, yoff);
 312+ vector<int> & ymid = ymids[lcs];
 313+ for (int n = 0; n < nchunks - 1; n++) {
 314+ x1 = xoff + (numer + (xlim - xoff) * n) / nchunks;
 315+ y1 = ymid[n] + 1;
 316+ seps[n+1] = flip ? make_pair(y1, x1) : make_pair(x1, y1);
 317+ }
 318+ seps[nchunks] = flip ? make_pair(ylim, xlim) : make_pair(xlim, ylim);
 319+ return lcs;
 320+}
 321+
 322+template <typename T>
 323+int _DiffEngine<T>::_lcs_pos (int ypos) {
 324+ int end = lcs;
 325+ if (end == 0 || ypos > seq[end]) {
 326+ seq[++lcs] = ypos;
 327+ in_seq.insert(ypos);
 328+ return lcs;
 329+ }
 330+
 331+ int beg = 1;
 332+ while (beg < end) {
 333+ int mid = (beg + end) / 2;
 334+ if ( ypos > seq[mid] )
 335+ beg = mid + 1;
 336+ else
 337+ end = mid;
 338+ }
 339+
 340+ assert(ypos != seq[end]);
 341+
 342+ in_seq.erase(seq[end]);
 343+ seq[end] = ypos;
 344+ in_seq.insert(ypos);
 345+ return end;
 346+}
 347+
 348+/* Find LCS of two sequences.
 349+ *
 350+ * The results are recorded in the vectors {x,y}changed[], by
 351+ * storing a 1 in the element for each line that is an insertion
 352+ * or deletion (ie. is not in the LCS).
 353+ *
 354+ * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
 355+ *
 356+ * Note that XLIM, YLIM are exclusive bounds.
 357+ * All line numbers are origin-0 and discarded lines are not counted.
 358+ */
 359+template <typename T>
 360+void _DiffEngine<T>::_compareseq (int xoff, int xlim, int yoff, int ylim) {
 361+ using std::vector;
 362+ using std::pair;
 363+
 364+ vector<pair<int, int> > seps;
 365+ int lcs;
 366+
 367+ // Slide down the bottom initial diagonal.
 368+ while (xoff < xlim && yoff < ylim && *xv[xoff] == *yv[yoff]) {
 369+ ++xoff;
 370+ ++yoff;
 371+ }
 372+
 373+ // Slide up the top initial diagonal.
 374+ while (xlim > xoff && ylim > yoff && *xv[xlim - 1] == *yv[ylim - 1]) {
 375+ --xlim;
 376+ --ylim;
 377+ }
 378+
 379+ if (xoff == xlim || yoff == ylim)
 380+ lcs = 0;
 381+ else {
 382+ // This is ad hoc but seems to work well.
 383+ //nchunks = sqrt(min(xlim - xoff, ylim - yoff) / 2.5);
 384+ //nchunks = max(2,min(8,(int)nchunks));
 385+ int nchunks = std::min(7, std::min(xlim - xoff, ylim - yoff)) + 1;
 386+ lcs = _diag(xoff, xlim, yoff, ylim, nchunks, seps);
 387+ }
 388+
 389+ if (lcs == 0) {
 390+ // X and Y sequences have no common subsequence:
 391+ // mark all changed.
 392+ while (yoff < ylim)
 393+ ychanged[yind[yoff++]] = true;
 394+ while (xoff < xlim)
 395+ xchanged[xind[xoff++]] = true;
 396+ } else {
 397+ // Use the partitions to split this problem into subproblems.
 398+ vector<pair<int, int> >::iterator pt1, pt2;
 399+ pt1 = pt2 = seps.begin();
 400+ while (++pt2 != seps.end()) {
 401+ _compareseq (pt1->first, pt2->first, pt1->second, pt2->second);
 402+ pt1 = pt2;
 403+ }
 404+ }
 405+}
 406+
 407+/* Adjust inserts/deletes of identical lines to join changes
 408+ * as much as possible.
 409+ *
 410+ * We do something when a run of changed lines include a
 411+ * line at one end and has an excluded, identical line at the other.
 412+ * We are free to choose which identical line is included.
 413+ * `compareseq' usually chooses the one at the beginning,
 414+ * but usually it is cleaner to consider the following identical line
 415+ * to be the "change".
 416+ *
 417+ * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
 418+ */
 419+template <typename T>
 420+void _DiffEngine<T>::_shift_boundaries (const std::vector<T> & lines, std::vector<bool> & changed,
 421+ const std::vector<bool> & other_changed)
 422+{
 423+ int i = 0;
 424+ int j = 0;
 425+
 426+ int len = (int)lines.size();
 427+ int other_len = (int)other_changed.size();
 428+
 429+ while (1) {
 430+ /*
 431+ * Scan forwards to find beginning of another run of changes.
 432+ * Also keep track of the corresponding point in the other file.
 433+ *
 434+ * Throughout this code, i and j are adjusted together so that
 435+ * the first i elements of changed and the first j elements
 436+ * of other_changed both contain the same number of zeros
 437+ * (unchanged lines).
 438+ * Furthermore, j is always kept so that j == other_len or
 439+ * other_changed[j] == false.
 440+ */
 441+ while (j < other_len && other_changed[j])
 442+ j++;
 443+
 444+ while (i < len && ! changed[i]) {
 445+ i++; j++;
 446+ while (j < other_len && other_changed[j])
 447+ j++;
 448+ }
 449+
 450+ if (i == len)
 451+ break;
 452+
 453+ int start = i, runlength, corresponding;
 454+
 455+ // Find the end of this run of changes.
 456+ while (++i < len && changed[i])
 457+ continue;
 458+
 459+ do {
 460+ /*
 461+ * Record the length of this run of changes, so that
 462+ * we can later determine whether the run has grown.
 463+ */
 464+ runlength = i - start;
 465+
 466+ /*
 467+ * Move the changed region back, so long as the
 468+ * previous unchanged line matches the last changed one.
 469+ * This merges with previous changed regions.
 470+ */
 471+ while (start > 0 && lines[start - 1] == lines[i - 1]) {
 472+ changed[--start] = true;
 473+ changed[--i] = false;
 474+ while (start > 0 && changed[start - 1])
 475+ start--;
 476+ while (other_changed[--j])
 477+ continue;
 478+ }
 479+
 480+ /*
 481+ * Set CORRESPONDING to the end of the changed run, at the last
 482+ * point where it corresponds to a changed run in the other file.
 483+ * CORRESPONDING == LEN means no such point has been found.
 484+ */
 485+ corresponding = j < other_len ? i : len;
 486+
 487+ /*
 488+ * Move the changed region forward, so long as the
 489+ * first changed line matches the following unchanged one.
 490+ * This merges with following changed regions.
 491+ * Do this second, so that if there are no merges,
 492+ * the changed region is moved forward as far as possible.
 493+ */
 494+ while (i < len && lines[start] == lines[i]) {
 495+ changed[start++] = false;
 496+ changed[i++] = true;
 497+ while (i < len && changed[i])
 498+ i++;
 499+
 500+ j++;
 501+ if (j < other_len && other_changed[j]) {
 502+ corresponding = i;
 503+ while (j < other_len && other_changed[j])
 504+ j++;
 505+ }
 506+ }
 507+ } while (runlength != i - start);
 508+
 509+ /*
 510+ * If possible, move the fully-merged run of changes
 511+ * back to a corresponding run in the other file.
 512+ */
 513+ while (corresponding < i) {
 514+ changed[--start] = 1;
 515+ changed[--i] = 0;
 516+ while (other_changed[--j])
 517+ continue;
 518+ }
 519+ }
 520+}
 521+//-----------------------------------------------------------------------------
 522+// Diff implementation
 523+//-----------------------------------------------------------------------------
 524+
 525+template<typename T>
 526+Diff<T>::Diff(const std::vector<T> & from_lines, const std::vector<T> & to_lines)
 527+{
 528+ _DiffEngine<T> engine;
 529+ engine.diff(from_lines, to_lines, *this);
 530+}
 531+
 532+#endif
Property changes on: trunk/extensions/wikidiff2/DiffEngine.h
___________________________________________________________________
Added: svn:keywords
1533 + Author Date Id Revision
Added: svn:eol-style
2534 + native
Index: trunk/extensions/wikidiff2/wikidiff2.h
@@ -0,0 +1,55 @@
 2+#ifndef WIKIDIFF2_H
 3+#define WIKIDIFF2_H
 4+
 5+#define MAX_DIFF_LINE 10000
 6+
 7+#include "DiffEngine.h"
 8+#include <string>
 9+
 10+// a small class to accomodate word-level diffs; basically, a body and an
 11+// optional suffix (the latter consisting of a single whitespace), where
 12+// only the bodies are compared on operator==.
 13+class Word {
 14+public:
 15+ std::string body;
 16+ std::string whole;
 17+
 18+ Word(std::string body, std::string suffix) : body(body), whole(body + suffix) {}
 19+ bool operator== (const Word &w) const {
 20+ return (body == w.body);
 21+ }
 22+ bool operator!=(const Word &w) const {
 23+ return !(body == w.body);
 24+ }
 25+ bool operator<(const Word &w) const {
 26+ return body < w.body;
 27+ }
 28+};
 29+
 30+// operations for the diff, as returned by do_diff
 31+template<class T>
 32+struct diff_op
 33+{
 34+ unsigned char op;
 35+ const T *from, *to;
 36+ unsigned from_ind, to_ind;
 37+
 38+ diff_op<T> () {}
 39+};
 40+
 41+template<class T>
 42+std::vector<diff_op<T> > do_diff(const std::vector<T> &text1, const std::vector<T> &text2);
 43+
 44+void print_diff(std::vector<std::string> &text1, std::vector<std::string> &text2, int num_lines_context, std::string &ret);
 45+void print_worddiff(const std::string & text1, const std::string & text2, std::string &ret);
 46+void print_worddiff_side(Diff<Word> &worddiff, bool added, std::string &ret);
 47+void split_tokens(const char *text, std::vector<Word> &tokens);
 48+void print_add(const std::string & line, std::string & ret);
 49+void print_del(const std::string & line, std::string & ret);
 50+void print_htmlspecialchars(const std::string & input, std::string & ret);
 51+void debug_print_worddiff(Diff<Word> &worddiff, std::string &ret);
 52+const char *wikidiff2_do_diff(const char *text1, const char *text2, int num_lines_context);
 53+
 54+
 55+#endif
 56+
Property changes on: trunk/extensions/wikidiff2/wikidiff2.h
___________________________________________________________________
Added: svn:keywords
157 + Author Date Id Revision
Added: svn:eol-style
258 + native
Index: trunk/extensions/wikidiff2/wikidiff2.i
@@ -0,0 +1,15 @@
 2+%module wikidiff2
 3+
 4+// Need to free the string to prevent memory leak
 5+%typemap(out) char * %{
 6+ if(!$1) {
 7+ ZVAL_NULL(return_value);
 8+ } else {
 9+ ZVAL_STRING(return_value,$1, 1);
 10+ free($1);
 11+ }
 12+%}
 13+
 14+%inline {
 15+ const char *wikidiff2_do_diff(const char *text1, const char *text2, int num_lines_context);
 16+}
Property changes on: trunk/extensions/wikidiff2/wikidiff2.i
___________________________________________________________________
Added: svn:keywords
117 + Author Date Id Revision
Added: svn:eol-style
218 + native
Index: trunk/extensions/wikidiff2/wikidiff2.spec
@@ -0,0 +1,32 @@
 2+Summary: PHP extension and standalone application to do fast word-level diffs
 3+Name: wikidiff2
 4+Version: 0.0.1
 5+Release: 2
 6+Copyright: GPL
 7+Group: Applications/Internet
 8+Source: wikidiff2-0.0.1.tar.gz
 9+BuildRoot: /var/tmp/%{name}-buildroot
 10+
 11+%description
 12+PHP extension and standalone application to do fast word-level diffs.
 13+(Packaged for Wikimedia local use!)
 14+
 15+%prep
 16+%setup -q
 17+
 18+%build
 19+make
 20+
 21+%install
 22+rm -rf $RPM_BUILD_ROOT
 23+INSTALL_TARGET="$RPM_BUILD_ROOT/usr/local/lib/php/extensions/no-debug-non-zts-20020429" make install
 24+
 25+%clean
 26+rm -rf $RPM_BUILD_ROOT
 27+
 28+%files
 29+%defattr(-,root,root)
 30+%dir /usr/local/lib/php/extensions/no-debug-non-zts-20020429
 31+
 32+/usr/local/lib/php/extensions/no-debug-non-zts-20020429/php_wikidiff2.so
 33+
Property changes on: trunk/extensions/wikidiff2/wikidiff2.spec
___________________________________________________________________
Added: svn:keywords
134 + Author Date Id Revision
Added: svn:eol-style
235 + native
Index: trunk/extensions/wikidiff2/standalone.cpp
@@ -0,0 +1,52 @@
 2+#include <stdio.h>
 3+#include <sys/stat.h>
 4+#include "wikidiff2.h"
 5+
 6+/**
 7+ * Standalone (i.e. PHP-free) application to produce HTML-formatted word-level diffs from two files
 8+ */
 9+
 10+
 11+void report_file_error(char* filename)
 12+{
 13+ char errorFormat[] = "Error opening file \"%s\"";
 14+ char * error = new char[strlen(filename) + sizeof(errorFormat)];
 15+ sprintf(error, errorFormat, filename);
 16+ perror(error);
 17+ delete[] error;
 18+ exit(1);
 19+}
 20+
 21+char* file_get_contents(char* filename)
 22+{
 23+ struct stat s;
 24+ char* buffer;
 25+ if (stat(filename, &s)) {
 26+ report_file_error(filename);
 27+ }
 28+ FILE * file = fopen(filename, "rb");
 29+ if (!file) {
 30+ report_file_error(filename);
 31+ }
 32+ buffer = new char[s.st_size + 1];
 33+ size_t bytes_read = fread(buffer, 1, s.st_size, file);
 34+ buffer[bytes_read] = '\0';
 35+ fclose(file);
 36+ return buffer;
 37+}
 38+
 39+int main(int argc, char** argv)
 40+{
 41+ if (argc != 3) {
 42+ printf("Usage: wikidiff2 <file1> <file2>\n");
 43+ exit(1);
 44+ }
 45+
 46+ char *buffer1 = file_get_contents(argv[1]);
 47+ char *buffer2 = file_get_contents(argv[2]);
 48+ const char *diff = wikidiff2_do_diff(buffer1, buffer2, 2);
 49+ fputs(diff, stdout);
 50+ return 0;
 51+}
 52+
 53+
Property changes on: trunk/extensions/wikidiff2/standalone.cpp
___________________________________________________________________
Added: svn:keywords
154 + Author Date Id Revision
Added: svn:eol-style
255 + native
Index: trunk/extensions/wikidiff2/Makefile
@@ -0,0 +1,62 @@
 2+INSTALL_TARGET?=`php-config --extension-dir`
 3+PRODUCT=wikidiff2
 4+VERSION=0.0.1
 5+CXX?=g++
 6+
 7+# For Linux
 8+SHARED = -shared
 9+
 10+# For Mac OS X
 11+# SHARED = -bundle
 12+
 13+OUTPUT=php_$(PRODUCT).so
 14+STANDALONE=$(PRODUCT)
 15+
 16+TMPDIST=$(PRODUCT)-$(VERSION)
 17+DISTFILES=Makefile \
 18+ $(PRODUCT).spec \
 19+ $(PRODUCT).cpp $(PRODUCT).i \
 20+ $(PRODUCT)_wrap.cpp php_$(PRODUCT).h \
 21+ test.php memleak.php t1.txt t2.txt
 22+
 23+$(OUTPUT) : $(PRODUCT).cpp $(PRODUCT)_wrap.cpp
 24+ $(CXX) -O3 `php-config --includes` $(SHARED) -o $@ $(PRODUCT).cpp $(PRODUCT)_wrap.cpp
 25+
 26+.PHONY: standalone
 27+standalone:
 28+ $(CXX) -o $(STANDALONE) -O3 $(PRODUCT).cpp standalone.cpp
 29+
 30+# The below _almost_ works. It gets unresolved symbol errors on load looking for _compiler_globals.
 31+# MACOSX_DEPLOYMENT_TARGET=10.3 g++ -O2 `php-config --includes` $(SHARED) -o php_wikidiff2.so wikidiff2.cpp wikidiff2_wrap.cpp -undefined dynamic_lookup
 32+
 33+$(PRODUCT)_wrap.cpp : $(PRODUCT).i
 34+ swig -php4 -c++ $(PRODUCT).i
 35+
 36+install : $(OUTPUT)
 37+ install -d "$(INSTALL_TARGET)"
 38+ install -m 0755 $(OUTPUT) "$(INSTALL_TARGET)"
 39+
 40+uninstall :
 41+ rm -f "$(INSTALL_TARGET)"/$(OUTPUT)
 42+
 43+clean :
 44+ rm -f $(OUTPUT)
 45+ rm -f $(PRODUCT)_wrap.cpp
 46+
 47+test : $(OUTPUT)
 48+ php test.php
 49+
 50+distclean : clean
 51+ rm -rf $(TMPDIST)
 52+ rm -f $(TMPDIST).tar.gz
 53+
 54+dist : $(DISTFILES) Makefile
 55+ rm -rf $(TMPDIST)
 56+ mkdir $(TMPDIST)
 57+ for x in $(DISTFILES); do cp -p $$x $(TMPDIST)/$$x; done
 58+ tar zcvf $(TMPDIST).tar.gz $(TMPDIST)
 59+
 60+rpm : dist
 61+ cp $(TMPDIST).tar.gz /usr/src/redhat/SOURCES
 62+ cp $(PRODUCT).spec /usr/src/redhat/SPECS/$(PRODUCT)-$(VERSION).spec
 63+ cd /usr/src/redhat/SPECS && rpmbuild -ba $(PRODUCT)-$(VERSION).spec
Property changes on: trunk/extensions/wikidiff2/Makefile
___________________________________________________________________
Added: svn:keywords
164 + Author Date Id Revision
Added: svn:eol-style
265 + native

Status & tagging log