r45851 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r45850‎ | r45851 | r45852 >
Date:21:35, 17 January 2009
Author:rarohde
Status:deferred (Comments)
Tags:
Comment:
Uploading EditSyntax conversion tools
Modified paths:
  • /trunk/tools/editsyntax (added) (history)
  • /trunk/tools/editsyntax/ConvertFromEditSyntax.py (added) (history)
  • /trunk/tools/editsyntax/ConvertToEditSyntax.py (added) (history)
  • /trunk/tools/editsyntax/EditSyntax.py (added) (history)

Diff [purge]

Index: trunk/tools/editsyntax/EditSyntax.py
@@ -0,0 +1,750 @@
 2+#-*- coding: utf-8 -*-
 3+import string, re, copy, hashlib
 4+
## Module-wide state shared by every function in this script.

## Upper bound on the number of characters kept in the revision buffer.
max_buffer_size = 15000000

revision_buffer = {}        ## revision id -> text, or False once evicted
revision_order = []         ## revision ids in the order they were seen
hash_table = {}             ## hash(text) -> list of revision ids
current_buffer_size = 0     ## characters currently held in revision_buffer
current_revision = -1       ## id of the newest revision (-1: none yet)
previous_revision = -1      ## id of the revision before it (-1: none)
 15+
 16+
## Start tracking a new article.  Call this with the first revision of
## each article: every internal buffer is discarded and the module state
## is rebuilt around that single revision.
##
##   text        -- full text of the article's first revision
##   revision_id -- id of that revision
def newArticle(text, revision_id):
    global revision_buffer, hash_table
    global current_revision, previous_revision
    global current_buffer_size, revision_order

    revision_buffer = {revision_id: text}
    hash_table = {hash(text): [revision_id]}
    revision_order = [revision_id]

    ## There is no earlier revision yet, hence the -1 marker.
    current_revision, previous_revision = revision_id, -1

    current_buffer_size = len(text)
 35+
 36+
## When reading from a full history dump, call this with each revision
## that follows the one passed to newArticle.
##
##   text        -- full text of the revision
##   revision_id -- id of the revision
##
## The revision becomes the current one, is indexed by the hash of its
## text (used for revert detection) and is stored in the revision buffer.
## To keep the buffer below max_buffer_size characters, the oldest stored
## texts are replaced by False until the new text fits.
def newRevision(text, revision_id):
    global revision_buffer, hash_table
    global current_revision, previous_revision
    global current_buffer_size, max_buffer_size, revision_order

    previous_revision = current_revision
    current_revision = revision_id
    revision_order.append(current_revision)

    hash_table.setdefault(hash(text), []).append(current_revision)

    ## Prevent memory overflow.  revision_order already ends with the new
    ## revision; the entry before it is the previous revision, whose text
    ## getXMLDifference still needs, so eviction stops short of both.
    evictable = len(revision_order) - 2
    pos = 0
    while current_buffer_size + len(text) > max_buffer_size \
          and pos < evictable:
        key = revision_order[pos]
        if revision_buffer[key] is not False:
            current_buffer_size -= len(revision_buffer[key])
            revision_buffer[key] = False
        pos += 1

    revision_buffer[revision_id] = text
    current_buffer_size += len(text)
 66+
 67+
## When reading from a dump generated using editSyntax, call this with
## the content of each new <changes> block, passed as a text string.
##
##   xml         -- the change nodes, as text
##   revision_id -- id of the revision those changes produce
##
## The text of the revision is rebuilt and then registered through
## newRevision.  A <revert> node restarts from the buffered text of the
## revision it names; any nodes after it are applied on top of that text.
## Without a <revert>, the changes are applied to the current revision.
##
## Raises ValueError when a <revert> names a revision whose text has
## already been evicted from the buffer.
def newChanges(xml, revision_id):
    global revision_buffer, current_revision
    changes = readXMLToChanges(xml)

    ## Index of the last <revert> node, or -1 when there is none.
    start = -1
    for k, ch in enumerate(changes):
        if ch['node'] == 'revert':
            start = k

    if start >= 0:
        ## Read the target from the revert node itself, not from the
        ## first node of the list.
        target = changes[start]['revision']
        text = revision_buffer[target]
        if text is False:
            raise ValueError("revert target %s is no longer buffered"
                             % (target,))
        if len(changes) > start + 1:
            text = differenceRestorer(text, changes[start + 1:])
    else:
        text = differenceRestorer(revision_buffer[current_revision], changes)

    newRevision(text, revision_id)
 87+
 88+
## Return the full text stored for the current revision.
def getCurrentText():
    newest = current_revision
    return revision_buffer[newest]
 93+
 94+
## Build the XML that describes the current revision relative to the
## previous one.
##
##   indent, indentstr -- passed straight through to XMLOutput
##
## With no previous revision the full text is emitted.  Otherwise an
## earlier buffered revision with identical text yields a <revert> node,
## and failing that the output of differenceGenerator is used.  If the
## resulting XML is more than 40 characters longer than the text itself,
## the full text is emitted instead.
def getXMLDifference(indent = 3, indentstr = " "):
    current_text = revision_buffer[current_revision]

    if previous_revision > -1:
        ## Revisions sharing a hash are only candidates; the texts are
        ## compared before one is accepted.  The last entry is the
        ## current revision itself and is skipped.
        revert_id = -1
        same_hash = hash_table[hash(current_text)]
        if len(same_hash) > 1:
            for candidate in same_hash[:-1]:
                if revision_buffer[candidate] == current_text:
                    revert_id = candidate
                    break

        if revert_id > -1:
            changes = [{'node': 'revert', 'revision': revert_id}]
        else:
            changes = differenceGenerator(revision_buffer[previous_revision],
                                          current_text)

        xml_out = XMLOutput(changes, indent, indentstr)
        if len(xml_out) <= len(current_text) + 40:
            return xml_out

    full_text = [{'node': 'new', 'value': current_text}]
    return XMLOutput(full_text, indent, indentstr)
 125+
 126+
## Takes a changes structure and produces a string with XML formatted
## output.
##
##   changes   -- list of change dicts; it is left unmodified
##   indent    -- number of times indentstr is repeated before the outer tag
##   indentstr -- one level of indentation; child nodes get one level more
##
## A single 'new' node is written as a <text> element.  Anything else is
## wrapped in <changes>, which carries xml:space="preserve" when any value
## contains a newline.  'line1'/'line2' are written as line="N" or
## lines="N-M", and 'pos_start'/'pos_end' as pos="N-M".
def XMLOutput(changes, indent = 3, indentstr = " "):
    ## Text conversion that works on both Python 2 and Python 3.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    in1 = indentstr * indent
    in2 = in1 + indentstr

    if len(changes) == 1 and changes[0]['node'] == 'new':
        value = changes[0]['value']
        if value != "":
            return in1 + '<text xml:space="preserve">' + value + "</text>\n"
        return in1 + '<text xml:space="preserve" />\n'

    need_breaks = False
    for ch in changes:
        if 'value' in ch and "\n" in ch['value']:
            need_breaks = True
            break

    if need_breaks:
        pieces = [in1 + "<changes xml:space=\"preserve\">\n"]
    else:
        pieces = [in1 + "<changes>\n"]

    for original in changes:
        ## Work on a shallow copy so the caller's dict keeps its
        ## 'line1'/'line2'/'pos_start'/'pos_end' keys and can be
        ## serialized again.
        ch = dict(original)
        node = ch['node']

        if node == 'replace' or node == 'delete':
            if ch['line1'] == ch['line2']:
                ch['line'] = ch['line1']
            else:
                ch['lines'] = to_text(ch['line1']) + "-" + to_text(ch['line2'])
            del ch['line1']
            del ch['line2']
        elif node == 'text_replace':
            ch['pos'] = to_text(ch['pos_start']) + "-" + to_text(ch['pos_end'])
            del ch['pos_start']
            del ch['pos_end']

        piece = in2 + "<" + node
        for key in ch:
            if key != 'node' and key != 'value':
                piece += " " + key + '="' + to_text(ch[key]) + '"'

        if 'value' in ch and ch['value'] != '':
            if node == 'permute':
                ## The permutation dict is written as "{k: v, k: v}" with
                ## its keys in ascending order.
                pairs = [to_text(k) + ": " + to_text(ch['value'][k])
                         for k in sorted(ch['value'])]
                body = "{" + ", ".join(pairs) + "}"
            else:
                body = to_text(ch['value'])
            piece += ">" + body + "</" + node + ">\n"
        else:
            piece += " />\n"
        pieces.append(piece)

    pieces.append(in1 + "</changes>\n")
    return "".join(pieces)
 187+
 188+
 189+## Creates a changes structure denoting the transformation from
 190+## old_line to new_line.
 ##
 ## old_line, new_line -- the two versions of one line of text
 ## line_number        -- stored in the 'line' (or 'line1'/'line2') key of
 ##                       every node produced
 ##
 ## Returns a list of change dicts.  When no run of min_char_block
 ## characters of old_line is found in new_line, the result is a single
 ## whole-line 'replace' node.  Otherwise both lines are cut around the
 ## matched runs and each pair of differing pieces becomes a
 ## 'text_replace' node.
 ##
 ## string.find is the Python 2 "string" module function.
 191+def textBlocking(old_line, new_line, line_number = False):
 192+
 193+ min_char_block = 40; ## Number of consecutive matching characters
 194+ ## within a block of changed text to justify
 195+ ## splitting one replacement into two.
 196+
 ## Each entry of matches is [start in old_line, start in new_line, length].
 197+ matches = [];
 198+ k1 = 0;
 199+ min_k2 = 0;
 200+
 201+ while k1 < len(old_line)-min_char_block:
 202+ k2 = string.find(new_line,old_line[k1:k1+min_char_block],min_k2);
 203+ if k2 >= 0:
 204+ if k1 > 0 and k2 > 0 and old_line[k1-1] == new_line[k2-1]:
 205+ break;
 ## Grow the match one character at a time while both lines still agree.
 206+ s = min_char_block;
 207+ while k1 + s < len(old_line) \
 208+ and k2 + s < len(new_line) \
 209+ and old_line[k1:k1+s+1] == new_line[k2:k2+s+1]:
 210+ s += 1;
 211+ matches.append([k1,k2,s]);
 212+ k1 += s;
 213+ min_k2 = k2 + s;
 214+ break;
 215+ k1 += 1;
 216+
 ## old and new receive the pieces of each line in the same order: the
 ## text before a match, the match itself, ..., and the text after the
 ## last match.
 217+ old = [];
 218+ new = [];
 219+ if len(matches) > 0:
 220+ mlast_o = 0;
 221+ mlast_n = 0;
 222+ for m in matches:
 223+ old.append(old_line[mlast_o:m[0]]);
 224+ old.append(old_line[m[0]:m[0]+m[2]]);
 225+ new.append(new_line[mlast_n:m[1]]);
 226+ new.append(new_line[m[1]:m[1]+m[2]]);
 227+ mlast_o = m[0]+m[2];
 228+ mlast_n = m[1]+m[2];
 229+ old.append(old_line[mlast_o:]);
 230+ new.append(new_line[mlast_n:]);
 231+ else:
 232+ changes = [dict({'node':'replace','line1':line_number,
 233+ 'line2':line_number,'value':new_line})];
 234+ return changes;
 235+
 236+ changes = [];
 237+ pos = 0;
 238+ for k in range(len(old)):
 239+ if old[k] != new[k]:
 ## p1 counts the characters shared at the start of the two pieces.
 240+ p1 = 0;
 ## p2 is a negative index that walks backwards over the shared ending.
 241+ p2 = -1;
 242+ while p1 < len(old[k]) \
 243+ and p1 < len(new[k]) \
 244+ and old[k][p1] == new[k][p1]:
 245+ p1 += 1;
 246+
 247+ while -p2-1 < len(old[k]) \
 248+ and -p2-1 < len(new[k]) \
 249+ and old[k][p2] == new[k][p2]:
 250+ p2 -= 1;
 251+
 ## p2 still at -1 means the last characters differ, so the replacement
 ## runs to the end of the piece.
 252+ if p2 == -1:
 253+ changes.append(dict({'node':'text_replace',
 254+ 'line':line_number,
 255+ 'pos_start':pos+p1,
 256+ 'pos_end':pos+len(old[k]),
 257+ 'value':new[k][p1:]}));
 258+ else:
 259+ changes.append(dict({'node':'text_replace',
 260+ 'line':line_number,
 261+ 'pos_start':pos+p1,
 262+ 'pos_end':pos+len(old[k])+p2+1,
 263+ 'value':new[k][p1:p2+1]}));
 264+ pos += len(new[k]);
 265+ else:
 266+ pos += len(old[k]);
 267+
 268+ return changes;
 269+
 270+
## Private ##
##
## Takes a changes structure and combines consecutive entries if
## the resulting code would be more compact.
##
## Must be called prior to calling normalizeLineNumbers
##
##   changes -- list of change dicts; it is edited in place and also
##              returned
##
## Only pairs that start with a 'replace' or an 'insert' are merged.  The
## current generators never emit a pair that starts with a 'delete', so
## there are no rules for that case.
def consolidateChanges(changes):
    k = 0

    while k < len(changes) - 1:
        cur = changes[k]
        nxt = changes[k + 1]

        if cur['node'] == 'replace':
            if nxt['node'] == 'replace' \
               and nxt['line1'] == cur['line2'] + 1:
                cur['value'] += "\n" + nxt['value']
                cur['line2'] = nxt['line2']
                del changes[k + 1]
            elif nxt['node'] == 'insert' \
                 and nxt['line'] == cur['line2'] + 1:
                cur['value'] += "\n" + nxt['value']
                del changes[k + 1]
            elif nxt['node'] == 'delete' \
                 and nxt['line1'] == cur['line2'] + 1:
                cur['line2'] = nxt['line2']
                del changes[k + 1]
            else:
                k += 1

        elif cur['node'] == 'insert':
            if nxt['node'] == 'replace' \
               and nxt['line1'] == cur['line']:
                ## The inserted text comes first, followed by the
                ## replacement text, each exactly once.
                merged = dict(nxt)
                merged['value'] = cur['value'] + "\n" + nxt['value']
                changes[k] = merged
                del changes[k + 1]
            elif nxt['node'] == 'insert' \
                 and nxt['line'] == cur['line']:
                cur['value'] += "\n" + nxt['value']
                del changes[k + 1]
            elif nxt['node'] == 'delete' \
                 and nxt['line1'] == cur['line']:
                ## An insert in front of a deleted range is the same as
                ## replacing that range with the inserted text.
                changes[k] = {'node': 'replace',
                              'line1': nxt['line1'],
                              'line2': nxt['line2'],
                              'value': cur['value']}
                del changes[k + 1]
            else:
                k += 1

        else:
            k += 1

    return changes
 347+
## Private ##
## Cleanup function addressing changes with embedded newlines.
##
## Walks the changes in order and adds a running offset to every 'line',
## 'line1' and 'line2'.  The offset drops by the number of lines a
## 'delete' removes and by the extra lines a 'replace' spans, rises by
## one per 'insert', and rises by the number of newlines in each value
## ('permute' values are excluded).
##
## The dicts are updated in place; the same list is returned.
def normalizeLineNumbers(changes):
    offset = 0
    for change in changes:
        if 'line' in change:
            change['line'] += offset
        if 'line1' in change:
            change['line1'] += offset
            change['line2'] += offset

        kind = change['node']
        if kind == 'delete':
            offset -= change['line2'] - change['line1'] + 1
        elif kind == 'insert':
            offset += 1
        elif kind == 'replace':
            offset -= change['line2'] - change['line1']

        if kind != 'permute' and 'value' in change:
            offset += change['value'].count("\n")

    return changes
 368+
## Private ##
## Changes zero indexed values to one indexed for easier human readability.
##
## Adds one to 'line', to 'line1' and 'line2', and to 'pos_start' and
## 'pos_end'.  For a 'permute' node, every key and every value of its map
## is raised by one.  The dicts are updated in place; the same list is
## returned.
def addOne(changes):
    ## The first name of each group decides whether the group is present.
    grouped_fields = (('line',), ('line1', 'line2'), ('pos_start', 'pos_end'))

    for change in changes:
        for group in grouped_fields:
            if group[0] in change:
                for field in group:
                    change[field] += 1

        if change['node'] == 'permute':
            change['value'] = dict((src + 1, dst + 1)
                                   for src, dst in change['value'].items())

    return changes
 388+
## Private ##
## Changes one indexed values back to zero indexed values.
##
## Subtracts one from 'line', from 'line1' and 'line2', and from
## 'pos_start' and 'pos_end'.  For a 'permute' node, every key and every
## value of its map is lowered by one.  The dicts are updated in place;
## the same list is returned.
def subtractOne(changes):
    ## Each leading key brings along the keys that are shifted with it.
    companions = {'line': (), 'line1': ('line2',), 'pos_start': ('pos_end',)}

    for change in changes:
        for lead in ('line', 'line1', 'pos_start'):
            if lead in change:
                change[lead] -= 1
                for other in companions[lead]:
                    change[other] -= 1

        if change['node'] == 'permute':
            lowered = {}
            for src, dst in change['value'].items():
                lowered[src - 1] = dst - 1
            change['value'] = lowered

    return changes
 408+
 409+
 410+## Private ##
 411+## Called by differenceGenerator for cases requiring permutations.
 ##
 ## lines1   -- lines of the old text
 ## lines2   -- lines of the new text
 ## line_map -- for each index of lines2, the index of the matching line in
 ##             lines1, or one of the negative markers -1 / -10 assigned by
 ##             differenceGenerator; entries may be overwritten here
 ##
 ## Returns a changes list that begins with a 'permute' node, already
 ## consolidated, normalized and converted to one-based numbers.
 412+def permutedDifferenceGenerator(lines1, lines2, line_map):
 413+ changes = [];
 414+
 ## rlnm is the condensed permutation: it only records the positions where
 ## the mapped old line is not the plain continuation of the previous
 ## recorded entry.
 415+ rlnm = dict();
 416+ rlnm[0] = 0;
 417+ last_k = 0;
 418+ for k in range(len(lines2)):
 419+ if line_map[k] != rlnm[last_k] + k-last_k and line_map[k] >= 0:
 420+ rlnm[k] = line_map[k];
 421+ last_k = k;
 ## An entry mapping 0 to 0 carries no information and is dropped.
 422+ if rlnm[0] == 0:
 423+ del rlnm[0];
 424+
 425+ changes.append(dict({'node':'permute','length':len(lines2),'value':rlnm}));
 ## line_map2 is the full map a reader rebuilds from the condensed data.
 426+ line_map2 = regeneratePermuteMap(len(lines2),rlnm);
 427+
 428+ for k in range(len(lines2)):
 429+
 ## -1: the new line had no match in the old text.
 430+ if line_map[k] == -1:
 431+ if line_map2[k] < len(lines1) and \
 432+ lines1[line_map2[k]] == lines2[k]:
 433+ line_map[k] = line_map2[k];
 434+ continue;
 435+ if line_map2[k] < len(lines1) \
 436+ and k < len(lines2) and line_map2[k] >= 0:
 437+ changes.extend(textBlocking(lines1[line_map2[k]],lines2[k],k));
 438+ continue;
 439+
 440+ changes.append(dict({'node':"replace",
 441+ 'line1':k,
 442+ 'line2':k,
 443+ 'value':lines2[k]}));
 ## -10: the marker differenceGenerator assigns to an empty new line.
 444+ if line_map[k] == -10:
 445+ if line_map2[k] < len(lines1) and \
 446+ lines1[line_map2[k]] == lines2[k]:
 447+ line_map[k] = line_map2[k];
 448+ continue;
 449+ if line_map2[k] >= len(lines1):
 450+ continue;
 451+ changes.append(dict({'node':"replace",
 452+ 'line1':k,
 453+ 'line2':k,
 454+ 'value':""}));
 455+
 456+ changes = consolidateChanges(changes);
 457+ changes = normalizeLineNumbers(changes);
 458+ changes = addOne(changes);
 459+
 460+ return changes;
 461+
 462+
 463+## Creates a change structure showing the differences between old_text
 464+## and new_text.
 ##
 ## old_text, new_text -- the two full texts; both are split on "\n"
 ##
 ## Returns a list of change dicts that has been consolidated, normalized
 ## and converted to one-based numbers.  When the matched lines do not
 ## appear in increasing order, the work is handed to
 ## permutedDifferenceGenerator and its result is returned instead.
 ##
 ## string.join is the Python 2 "string" module function.
 465+def differenceGenerator(old_text,new_text):
 466+ lines1 = old_text.split("\n");
 467+ lines2 = new_text.split("\n");
 468+
 ## line_map[k] is the index in lines1 matched to new line k, -1 when the
 ## line occurs nowhere in lines1, or -10 for an empty line that could
 ## not be aligned with an empty old line.
 469+ line_map = dict();
 470+ for k in range(len(lines2)):
 471+ ln = lines2[k];
 472+ if ln == "":
 ## Empty line: look back to the nearest earlier new line that has a
 ## match and check whether the old line at the same distance after that
 ## match is empty as well.
 473+ k2 = k - 1;
 474+ while k2 >= 0 and line_map[k2] < 0:
 475+ k2 = k2 - 1;
 476+ df = k - k2;
 477+ if k2 >= 0:
 478+ if line_map[k2] + df < len(lines1) and lines1[line_map[k2]+df] == "":
 479+ line_map[k] = line_map[k2] + df;
 480+ else:
 481+ line_map[k] = -10;
 482+ else:
 483+ line_map[k] = -10;
 484+ continue;
 485+ if ln in lines1:
 ## Prefer the old line directly after the previous match; otherwise use
 ## the first occurrence of the line in lines1.
 486+ if k > 0:
 487+ if line_map[k-1]+1 < len(lines1) and line_map[k-1] >= 0 \
 488+ and lines1[line_map[k-1]+1] == ln:
 489+ line_map[k] = line_map[k-1]+1;
 490+ continue;
 491+ line_map[k] = lines1.index(ln);
 492+ else:
 493+ line_map[k] = -1;
 ## A matched line shorter than 10 characters whose predecessor is
 ## unmatched is downgraded to unmatched.
 ## NOTE(review): the flattened indentation here does not show whether
 ## this check runs only when line k itself is unmatched -- confirm.
 494+ if k >= 2 and line_map[k-2] == -1:
 495+ if line_map[k-1] >= 0 and len(lines1[line_map[k-1]]) < 10:
 496+ line_map[k-1] = -1;
 497+
 ## A matched index that does not exceed the previous matched index means
 ## lines were reordered; that case is delegated.
 498+ k_last = 0;
 499+ for k in range(1,len(line_map)):
 500+ if line_map[k] <= line_map[k_last] and line_map[k] >= 0:
 501+ return permutedDifferenceGenerator(lines1,lines2,line_map);
 502+ if line_map[k] >= 0:
 503+ k_last = k;
 504+
 ## k walks the new lines, k2 walks the old lines.
 505+ changes = [];
 506+ k2 = 0;
 507+ k = 0;
 508+ while k < len(line_map):
 ## New line k is old line k2: nothing to record.
 509+ if line_map[k] == k2:
 510+ k2 = k2 + 1;
 511+ k = k + 1;
 512+ continue;
 513+ if line_map[k] == -1:
 ## ks scans forward for the next new line that can end this changed run.
 514+ ks = k + 1;
 515+ while (ks < len(line_map) and line_map[ks] < k2) or \
 516+ (ks+1 < len(line_map) and lines2[ks] == ""):
 517+ ks += 1;
 518+ if ks >= len(line_map):
 ## The run reaches the end of the new text.
 519+ if k2 > 0:
 520+ changes.append(dict({'node':"truncate",'line':k2}));
 521+ changes.append(dict({'node':"append",
 522+ 'value':string.join(lines2[k:],"\n")}));
 523+ else:
 524+ changes.append(dict({'node':"new",
 525+ 'value':string.join(lines2,"\n")}));
 526+ break;
 527+ else:
 528+ if line_map[ks] == k2:
 ## No old lines were consumed, so the run is a pure insertion.
 529+ changes.append(dict({'node':"insert",
 530+ 'line':k2,
 531+ 'value':string.join(lines2[k:ks],"\n")}));
 532+ k = ks;
 533+ continue;
 534+ else:
 ## Pair new lines of the run with old lines while old lines remain
 ## before line_map[ks]; new lines beyond that become inserts.
 535+ ch2 = [];
 536+ for j in range(k,ks):
 537+ if k2 + j - k < line_map[ks]:
 538+ if lines2[j] != "":
 539+ ch2.extend(textBlocking(lines1[k2+j-k],lines2[j],k2+j-k));
 540+ else:
 541+ if len(ch2) > 0 and ch2[-1]['node'] == 'replace':
 542+ ch2[-1]['line2'] += 1;
 543+ ch2[-1]['value'] += "\n";
 544+ else:
 545+ ch2.append(dict({'node':"replace",
 546+ 'line1':k2+j-k,
 547+ 'line2':k2+j-k,
 548+ 'value':""}));
 549+ else:
 550+ ch2.append(dict({'node':"insert",
 551+ 'line':line_map[ks],
 552+ 'value':lines2[j]}));
 553+
 ## Old lines left over before line_map[ks] are deleted.
 554+ if ks-k < line_map[ks]-k2:
 555+ ch2.append(dict({'node':"delete",
 556+ 'line1':k2+ks-k,
 557+ 'line2':line_map[ks]-1}));
 558+
 559+ changes.extend(ch2);
 560+
 561+ k2 = line_map[ks];
 562+ k = ks;
 563+ continue;
 564+ if line_map[k] == -10:
 ## Run of unaligned empty lines, ending at ks.
 565+ ks = k + 1;
 566+ while ks < len(line_map) and line_map[ks] == -10:
 567+ ks += 1;
 568+ if ks < len(line_map) and k2 == line_map[ks]:
 569+ temp = [];
 570+ for j in range(k,ks):
 571+ temp.append("");
 572+ changes.append(dict({'node':"insert",
 573+ 'line':k2,
 574+ 'value':string.join(temp,"\n")}));
 575+ k = ks;
 576+ continue;
 577+ else:
 578+ changes.append(dict({'node':"replace",
 579+ 'line1':k2,
 580+ 'line2':k2,
 581+ 'value':""}));
 582+ k2 += 1;
 583+ k += 1;
 584+ continue;
 ## The match lies further down the old text: the skipped old lines
 ## were deleted.
 585+ if line_map[k] > k2:
 586+ changes.append(dict({'node':'delete',
 587+ 'line1':k2,
 588+ 'line2':line_map[k]-1}));
 589+ k2 = line_map[k]+1;
 590+ k = k + 1;
 591+ continue;
 592+ if line_map[k] < k2 and line_map[k] > 0:
 593+ k2 = k2 -1;
 594+
 ## NOTE(review): this print writes k to standard output; it looks like
 ## leftover debugging output -- confirm before relying on it.
 595+ print k;
 596+ k = k + 1;
 597+ k2 = k2 + 1;
 598+
 ## Old lines remaining after the last new line are cut off with a
 ## 'truncate', unless the list already ends in 'append', 'truncate'
 ## or 'new'.
 599+ if k2 < len(lines1) and len(changes) > 0:
 600+ if changes[-1]['node'] != 'append' and changes[-1]['node'] != 'truncate' \
 601+ and changes[-1]['node'] != 'new':
 602+ changes.append(dict({'node':"truncate",'line':k2}));
 603+ if k2 < len(lines1) and len(changes) == 0:
 604+ changes.append(dict({'node':"truncate",'line':k2}));
 605+
 606+ changes = consolidateChanges(changes);
 607+ changes = normalizeLineNumbers(changes);
 608+ changes = addOne(changes);
 609+
 610+ return changes;
 611+
 612+
## Reconstruct a full line map from the condensed <permute> data.
##
##   length       -- number of positions to fill (0 .. length-1)
##   permute_data -- dict holding only the positions whose target is given
##                   explicitly
##
## A position missing from permute_data continues counting upwards from
## the nearest explicit position before it; counting starts at 0 -> 0.
## Returns a dict that always contains key 0, even when length is 0.
def regeneratePermuteMap(length, permute_data):
    line_map = {0: 0}
    anchor_pos, anchor_val = 0, 0

    for pos in range(length):
        if pos in permute_data:
            anchor_pos, anchor_val = pos, permute_data[pos]
            line_map[pos] = anchor_val
        else:
            line_map[pos] = anchor_val + pos - anchor_pos

    return line_map
 626+
 627+
 628+## Applies "changes" to "old_text" and returns new text
 ##
 ## old_text -- the text the changes start from
 ## changes  -- list of change dicts with one-based numbers; subtractOne
 ##             rewrites these dicts in place, so the caller's list comes
 ##             back zero-based
 ##
 ## string.join is the Python 2 "string" module function.
 629+def differenceRestorer(old_text,changes):
 630+ changes = subtractOne(changes);
 631+
 632+ lines1 = old_text.split("\n");
 633+ lines2 = copy.copy(lines1);
 634+
 635+ lshift = 0; ##Keep track of newline insertions, faster than
 636+ ##restructuring each time a newline is inserted.
 637+
 638+ last_line = -1;
 639+
 640+ for k in range(len(changes)):
 641+ ch = changes[k];
 642+
 643+ ##Detect out of order operations
 644+ current_line = -1;
 645+ if 'line' in ch:
 646+ current_line = ch['line'];
 647+ elif 'line2' in ch:
 648+ current_line = ch['line2'];
 ## A change addressing a line before last_line: re-split lines2 so every
 ## list element is a single line again, and reset the shift.
 649+ if current_line >= 0 and current_line < last_line:
 650+ lines2 = string.join(lines2,"\n");
 651+ lines2 = lines2.split("\n");
 652+ lshift = 0;
 653+
 654+ if ch['node'] == 'permute':
 ## The lines built so far become the source; the result is rebuilt by
 ## copying lines according to the regenerated map.
 655+ lines1 = lines2;
 656+ lines2 = [];
 657+
 658+ line_map = regeneratePermuteMap(ch['length'],
 659+ ch['value']);
 660+
 661+ for k in range(ch['length']):
 662+ lines2.append("");
 663+ for k in line_map:
 664+ if line_map[k] >= 0 and line_map[k] < len(lines1):
 665+ lines2[k] = copy.copy(lines1[line_map[k]]);
 666+
 667+ elif ch['node'] == 'insert':
 668+ lines2.insert(ch['line']-lshift,ch['value']);
 669+ elif ch['node'] == 'replace':
 ## The whole value, newlines included, goes in as one list element.
 670+ del lines2[ch['line1']-lshift:ch['line2']-lshift+1];
 671+ lines2.insert(ch['line1']-lshift,ch['value']);
 672+ elif ch['node'] == 'text_replace':
 673+ ln = lines2[ch['line']-lshift];
 674+ ln = ln[:ch['pos_start']] + ch['value'] + ln[ch['pos_end']:];
 675+ lines2[ch['line']-lshift] = ln;
 676+ elif ch['node'] == 'truncate':
 677+ del lines2[ch['line']-lshift:];
 678+ elif ch['node'] == 'append':
 679+ lines2.append(ch['value']);
 680+ elif ch['node'] == 'delete':
 681+ del lines2[ch['line1']-lshift:ch['line2']-lshift+1];
 682+ elif ch['node'] == 'new':
 683+ lines2 = ch['value'].split("\n");
 684+ lshift = 0;
 685+ last_line = -1;
 686+
 ## Every newline inside a value is a line that lines2 holds within a
 ## single element; lshift accumulates that count.
 687+ if 'value' in ch and ch['node'] != 'permute':
 688+ new_lines = ch['value'].count("\n");
 689+ lshift += new_lines;
 690+
 691+ if new_lines > 0:
 692+ if 'line' in ch:
 693+ last_line = ch['line'];
 694+ elif 'line2' in ch:
 695+ last_line = ch['line2'];
 696+ else:
 697+ last_line = len(lines2) + lshift;
 698+
 699+ return string.join(lines2,"\n");
 700+
 701+
## A simplified XML parser that translates a <changes> block, given as a
## string, into the changes structure used internally.
##
## Every tag becomes a dict with its name under 'node', its attributes
## (all-digit values converted to int) and its body under 'value'; a
## self-closing tag gets an empty 'value'.  The compact attributes are
## then expanded:
##   line  on 'replace'/'delete' -> 'line1' and 'line2'
##   lines "N-M"                 -> 'line1' and 'line2'
##   pos   "N-M"                 -> 'pos_start' and 'pos_end'
## and the body of a 'permute' tag is parsed into an int -> int dict.
def readXMLToChanges(xml):
    tagre = re.compile("<([^>]*)(?:>([^<]*)</[^>]*>| />)",re.I+re.S)
    kwre = re.compile("([^ ]*)=\"([^\"]*)\"",re.I+re.S)
    permutere = re.compile("([0-9]*): ([0-9]*)")

    ## First pass: one raw dict per tag.
    changes = []
    for header, body in tagre.findall(xml):
        tag = {'node': header.split(" ", 1)[0]}
        for name, raw in kwre.findall(header):
            if raw.isdigit():
                tag[name] = int(raw)
            else:
                tag[name] = raw
        tag['value'] = body
        changes.append(tag)

    ## Second pass: expand the compact attributes.
    for ch in changes:
        node = ch['node']

        if 'line' in ch and (node == 'replace' or node == 'delete'):
            ch['line1'] = ch['line2'] = ch.pop('line')

        if 'lines' in ch:
            dash = ch['lines'].find('-')
            ch['line1'] = int(ch['lines'][:dash])
            ch['line2'] = int(ch['lines'][dash+1:])
            del ch['lines']

        if 'pos' in ch:
            dash = ch['pos'].find('-')
            ch['pos_start'] = int(ch['pos'][:dash])
            ch['pos_end'] = int(ch['pos'][dash+1:])
            del ch['pos']

        if node == 'permute':
            map_values = {}
            for src, dst in permutere.findall(ch['value']):
                map_values[int(src)] = int(dst)
            ch['value'] = map_values

    return changes
 751+
Property changes on: trunk/tools/editsyntax/EditSyntax.py
___________________________________________________________________
Name: svn:eol-style
1752 + native
Index: trunk/tools/editsyntax/ConvertToEditSyntax.py
@@ -0,0 +1,167 @@
 2+#-*- coding: utf-8 -*-
 3+
 4+## This file will compress the revision history of a database dump or
 5+## export file into a more compact "edit syntax".
 6+##
 7+## Usage: ConvertToEditSyntax input_file output_file
 8+##
 9+## An optional "-v" flag enables a "verbose mode" with
 10+## rolling feedback on the script's progress.
 11+##
 12+## This process takes approximately 1 minute for every 250 MB of input.
 13+
 14+import os, string, re, time, sys;
 15+import codecs;
 16+import EditSyntax
 17+
## Read buffer shared with readRevision: the buffered text, the offset of
## the first unread character, and the number of characters per read.
blockdata, blockpos = "", 0
blocksize = 500000

## Running totals: characters of revision text seen, characters of XML
## written in their place, pages and revisions processed.
chars = compressed = 0
cnt = revcnt = 0

## Command line and the two file names taken from it.
argv = sys.argv
input_file = output_file = ""
 30+
## Return the next chunk of input, up to and including the next
## "</revision>" tag.
##
##   f -- the open input file; it is read blocksize characters at a time
##
## The text is taken from the module buffer blockdata starting at
## blockpos, and the buffer is refilled from f as needed.  At the end of
## the input "" is returned and blockdata holds whatever text followed the
## last "</revision>".
def readRevision(f):
    global blockdata, blockpos, blocksize
    end_tag = "</revision>"

    p2 = blockdata.find(end_tag, blockpos)
    while p2 == -1:
        ## Read from the file that was passed in, not from a global.
        new_block = f.read(blocksize)

        ## Drop the part of the buffer that has already been handed out.
        blockdata = blockdata[blockpos:] + new_block
        blockpos = 0
        if new_block == "":
            return ""
        p2 = blockdata.find(end_tag)

    st = blockdata[blockpos:p2 + len(end_tag)]
    blockpos = p2 + len(end_tag)
    return st
 49+
 50+
 ## "-v" anywhere on the command line turns on verbose progress output and
 ## is removed from argv before the file names are read.
 51+if '-v' in argv:
 52+ verbose = True;
 53+ argv.remove('-v');
 54+else:
 55+ verbose = False;
 56+
 57+##verbose = True;
 58+##fname = "cowiki-20081203";
 59+##input_file = "J:\\Wikidata\\" + fname + "-pages-meta-history.xml";
 60+##output_file = "J:\\Wikidata\\"+ fname + "-pages-shrunk.xml";
 61+
 62+if len(argv) >= 3:
 63+ input_file = argv[1];
 64+ output_file = argv[2];
 65+
 ## Without both file names, prompt for them and force verbose mode.
 66+if input_file == "" or output_file == "":
 67+ verbose = True;
 68+ input_file = raw_input("File to compress? ");
 69+ output_file = raw_input("Destination file? ");
 70+ print "\n";
 71+
 72+filesize = os.path.getsize(input_file);
 73+fin = codecs.open(input_file,'r','utf-8');
 74+fout = codecs.open(output_file,"w", 'utf-8');
 75+
 ## textre captures a revision's id and the body of its <text> element;
 ## revidre captures the id alone.
 76+textre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<text[^>]*>([^<]*)</text>",re.I+re.S);
 77+revidre = re.compile("<revision>\\s*<id>([0-9]*)</id>",re.I+re.S);
 78+
 ## Start time for the rate and time-remaining figures.
 79+ct = time.clock();
 80+
 81+A=readRevision(fin);
 82+
 83+if verbose:
 84+ print " Pages Revisions File Read Compression Time Rev/s";
 85+
 ## A is one chunk of input ending in "</revision>"; "" marks the end.
 86+while A != "":
 87+
 ## The first chunk, and any chunk containing "</page>", starts a new
 ## article: it is registered with newArticle and written out unchanged.
 88+ if A.find("</page>") > -1 or cnt == 0:
 89+ rev1 = A;
 90+ text1 = textre.findall(A);
 91+ if len(text1) > 0:
 92+ revid = int(text1[0][0]);
 93+ text1 = text1[0][1];
 94+ else:
 95+ revid = revidre.findall(A);
 96+ revid = int(revid[0]);
 97+ text1 = "";
 98+ EditSyntax.newArticle(text1,revid);
 99+ fout.write(rev1);
 100+
 101+ revcnt += 1;
 102+ cnt += 1;
 103+ else:
 ## Any other chunk is a further revision of the same article.
 104+ rev2 = A;
 105+ text2 = textre.findall(A);
 106+ if len(text2) > 0:
 107+ revid = int(text2[0][0]);
 108+ text2 = text2[0][1];
 109+ elif A.find("<text ") > -1:
 110+ revid = revidre.findall(A);
 111+ revid = int(revid[0]);
 112+ text2 = "";
 ## NOTE(review): if neither branch above runs, text2 is still the empty
 ## list returned by findall and revid keeps its earlier value -- confirm
 ## that such a chunk cannot occur.
 113+
 114+ EditSyntax.newRevision(text2,revid);
 115+ revcnt += 1;
 116+
 ## Non-empty text is replaced by the output of getXMLDifference.
 117+ if text2 != "":
 118+ output = EditSyntax.getXMLDifference();
 119+
 120+ if verbose:
 121+ chars += len(text2) + 43;
 122+ compressed += len(output);
 123+
 ## The cut starts 6 characters before "<text" and ends 8 characters past
 ## the start of "</text>".
 124+ p1 = rev2.find("<text")-6;
 125+ p2 = rev2.find("</text>")+8;
 126+ rev2 = rev2[:p1] + output + rev2[p2:];
 127+
 128+ fout.write(rev2);
 129+
 130+ rev1 = rev2;
 131+ text1 = text2;
 132+
 ## Progress line every 1000 revisions; the remaining time is scaled from
 ## the share of the input file read so far.
 133+ if revcnt % 1000 == 0 and verbose:
 134+ remaining = (filesize - fin.tell()) * (time.clock()-ct) / fin.tell();
 135+ hours = remaining / 3600;
 136+ minutes = (remaining % 3600) / 60;
 137+ seconds = remaining % 60;
 138+ print "%(1)6i %(2)8i %(3)5iM" % \
 139+ {'1': cnt, '2': revcnt, '3': fin.tell() / 2**20}, \
 140+ "(%(1)4.1f%%)" % {'1': fin.tell() / float(filesize) * 100}, \
 141+ " %(1)2.2f%%" % {'1': fout.tell() / float(fin.tell())*100}, \
 142+ "(%(1)2.2f%%)" % {'1': compressed / float(chars)*100}, \
 143+ " %(hour)i:%(min)02i:%(sec)02i" % \
 144+ {'hour' : hours, 'min' : minutes, 'sec': seconds}, \
 145+ " %(1)6.1f" % {'1': revcnt / (time.clock()-ct)};
 146+
 147+ A = readRevision(fin);
 148+
 ## Write out what readRevision left in its buffer after the last
 ## "</revision>".
 149+fout.write(blockdata);
 150+
 151+if verbose:
 152+ fout.flush();
 153+ elapsed = time.clock()-ct
 154+ hours = elapsed / 3600;
 155+ minutes = (elapsed % 3600) / 60;
 156+ seconds = elapsed % 60;
 157+ print "\n ", cnt, "Pages, ", revcnt, "Revisions, ", \
 158+ "%(hour)i:%(min)02i:%(sec)02i Processing Time\n" % \
 159+ {'hour' : hours, 'min' : minutes, 'sec': seconds};
 160+ print " Total File Size: ", fin.tell(), "-->", fout.tell(), \
 161+ "(%(1)2.2f%%)" % {'1': fout.tell() / float(fin.tell())*100};
 162+ print " Compressible Characters: ", chars, "-->", compressed, \
 163+ "(%(1)2.2f%%)" % {'1': compressed / float(chars)*100};
 164+
 165+
 166+fin.close();
 167+fout.close();
Property changes on: trunk/tools/editsyntax/ConvertToEditSyntax.py
___________________________________________________________________
Name: svn:eol-style
1169 + native
Index: trunk/tools/editsyntax/ConvertFromEditSyntax.py
@@ -0,0 +1,161 @@
 2+#-*- coding: utf-8 -*-
 3+
 4+## This file will reverse the compression created by ConvertToEditSyntax.
 5+##
 6+## Usage: ConvertFromEditSyntax input_file output_file
 7+##
 8+## An optional "-v" flag enables a "verbose mode" with
 9+## rolling feedback on the script's progress.
 10+##
 11+## This process takes approximately 1 minute for every 175 MB of input.
 12+
 13+import os, sys, string, re, time;
 14+import codecs;
 15+import EditSyntax
 16+
## Read buffer shared with readRevision: the buffered text, the offset of
## the first unread character, and the number of characters per read.
blockdata, blockpos = "", 0
blocksize = 500000

## Pages and revisions processed so far.
cnt = revcnt = 0

## Command line and the two file names taken from it.
argv = sys.argv
input_file = output_file = ""
 27+
## Return the next chunk of input, up to and including the next
## "</revision>" tag.
##
##   f -- the open input file; it is read blocksize characters at a time
##
## The text is taken from the module buffer blockdata starting at
## blockpos, and the buffer is refilled from f as needed.  At the end of
## the input "" is returned and blockdata holds whatever text followed the
## last "</revision>".
def readRevision(f):
    global blockdata, blockpos, blocksize
    end_tag = "</revision>"

    p2 = blockdata.find(end_tag, blockpos)
    while p2 == -1:
        ## Read from the file that was passed in, not from a global.
        new_block = f.read(blocksize)

        ## Drop the part of the buffer that has already been handed out.
        blockdata = blockdata[blockpos:] + new_block
        blockpos = 0
        if new_block == "":
            return ""
        p2 = blockdata.find(end_tag)

    st = blockdata[blockpos:p2 + len(end_tag)]
    blockpos = p2 + len(end_tag)
    return st
 46+
 ## "-v" anywhere on the command line turns on verbose progress output and
 ## is removed from argv before the file names are read.
 47+if '-v' in argv:
 48+ verbose = True;
 49+ argv.remove('-v');
 50+else:
 51+ verbose = False;
 52+
 53+##verbose = True;
 54+##fname = "cowiki-20081203";
 55+##input_file = "J:\\Wikidata\\" + fname + "-pages-shrunk.xml";
 56+##output_file = "J:\\Wikidata\\"+ fname + "-pages-restored.xml";
 57+
 58+if len(argv) >= 3:
 59+ input_file = argv[1];
 60+ output_file = argv[2];
 61+
 ## Without both file names, prompt for them and force verbose mode.
 62+if input_file == "" or output_file == "":
 63+ verbose = True;
 64+ input_file = raw_input("File to decompress? ");
 65+ output_file = raw_input("Destination file? ");
 66+ print "\n";
 67+
 68+filesize = os.path.getsize(input_file);
 69+fin = codecs.open(input_file,'r','utf-8');
 70+fout = codecs.open(output_file,"w", 'utf-8');
 71+
 ## textre captures a revision's id and the body of its <text> element,
 ## changesre its id and the content of its <changes> element, and
 ## revidre the id alone.
 72+textre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<text[^>]*>([^<]*)</text>",re.I+re.S);
 73+changesre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<changes[^>]*>(.*)</changes>",re.I+re.S);
 74+revidre = re.compile("<revision>\\s*<id>([0-9]*)</id>",re.I+re.S);
 75+
 ## Start time for the rate and time-remaining figures.
 76+ct = time.clock();
 77+
 78+A=readRevision(fin);
 79+
 80+if verbose:
 81+ print " Pages Revisions File Read Output Time Rev/s";
 82+
 ## A is one chunk of input ending in "</revision>"; "" marks the end.
 83+while A != "":
 84+
 ## The first chunk, and any chunk containing "</page>", starts a new
 ## article: it is registered with newArticle and written out unchanged.
 85+ if A.find("</page>") > -1 or cnt == 0:
 86+ rev1 = A;
 87+ text1 = textre.findall(A);
 88+ if len(text1) > 0:
 89+ revid = int(text1[0][0]);
 90+ text1 = text1[0][1];
 91+ else:
 92+ revid = revidre.findall(A);
 93+ revid = int(revid[0]);
 94+ text1 = "";
 95+ EditSyntax.newArticle(text1, revid);
 96+ revcnt += 1;
 97+ fout.write(rev1);
 98+ cnt += 1;
 99+
 100+ else:
 ## A later revision holds a <text> with content, an empty <text>, or a
 ## <changes> block from which the text has to be rebuilt.
 101+ rev2 = A;
 102+ text2 = textre.findall(A);
 103+ changes = "";
 104+ if len(text2) > 0:
 105+ revid = int(text2[0][0]);
 106+ text2 = text2[0][1];
 107+ EditSyntax.newRevision(text2, revid);
 108+ elif A.find("<text ") > -1:
 109+ revid = revidre.findall(A);
 110+ revid = int(revid[0]);
 111+ text2 = "";
 112+ EditSyntax.newRevision(text2, revid);
 113+ else:
 114+ changes = changesre.findall(A);
 115+ if len(changes) > 0:
 116+ EditSyntax.newChanges(changes[0][1],int(changes[0][0]));
 117+ text2 = EditSyntax.getCurrentText();
 118+ else:
 119+ text2 = "";
 120+
 121+ revcnt += 1;
 ## changes is "" unless the <changes> branch ran; in that case the block
 ## is replaced by a <text> element holding the restored text.
 122+ if changes != "":
 123+ p1 = rev2.find("<changes");
 124+ p2 = rev2.find("</changes>")+10;
 125+ rev2 = rev2[:p1] + '<text xml:space="preserve">' + \
 126+ text2 + "</text>" + rev2[p2:];
 127+
 128+ fout.write(rev2);
 129+
 130+ rev1 = rev2;
 131+ text1 = text2;
 132+
 ## Progress line every 5000 revisions; the remaining time is scaled from
 ## the share of the input file read so far.
 133+ if revcnt % 5000 == 0 and verbose:
 134+ remaining = (filesize - fin.tell()) * (time.clock()-ct) / fin.tell();
 135+ hours = remaining / 3600;
 136+ minutes = (remaining % 3600) / 60;
 137+ seconds = remaining % 60;
 138+ print "%(1)6i %(2)8i %(3)5iM" % \
 139+ {'1': cnt, '2': revcnt, '3': fin.tell() / 2**20}, \
 140+ "(%(1)4.1f%%)" % {'1': fin.tell() / float(filesize) * 100}, \
 141+ " %(1)5iM" % {'1': fout.tell() / 2**20}, \
 142+ " %(hour)i:%(min)02i:%(sec)02i" % \
 143+ {'hour' : hours, 'min' : minutes, 'sec': seconds}, \
 144+ " %(1)6.1f" % {'1': revcnt / (time.clock()-ct)};
 145+
 146+ A = readRevision(fin);
 147+
 ## Write out what readRevision left in its buffer after the last
 ## "</revision>".
 148+fout.write(blockdata);
 149+
 150+if verbose:
 151+ fout.flush();
 152+ elapsed = time.clock()-ct
 153+ hours = elapsed / 3600;
 154+ minutes = (elapsed % 3600) / 60;
 155+ seconds = elapsed % 60;
 156+ print "\n ", cnt, "Pages, ", revcnt, "Revisions, ", \
 157+ "%(hour)i:%(min)02i:%(sec)02i Processing Time\n" % \
 158+ {'hour' : hours, 'min' : minutes, 'sec': seconds};
 159+ print " Total File Size: ", fin.tell(), "-->", fout.tell()
 160+
 161+fout.close();
 162+fin.close();
Property changes on: trunk/tools/editsyntax/ConvertFromEditSyntax.py
___________________________________________________________________
Name: svn:eol-style
1163 + native

Comments

#Comment by Brion VIBBER (talk | contribs)   02:59, 21 January 2009

oooooh will look these over later :D

Status & tagging log