Index: trunk/tools/editsyntax/EditSyntax.py |
— | — | @@ -0,0 +1,750 @@ |
| 2 | +#-*- coding: utf-8 -*- |
| 3 | +import string, re, copy, hashlib |
| 4 | + |
| 5 | +## Global variables used by this script |
| 6 | +max_buffer_size = 15000000; ##Maximum number of characters to keep in |
| 7 | +                                 ##the revision buffer. |
| 8 | + |
| 9 | +revision_buffer = dict(); |
| 10 | +revision_order = [] |
| 11 | +hash_table = dict(); |
| 12 | +current_buffer_size = 0; |
| 13 | +current_revision = -1; |
| 14 | +previous_revision = -1; |
| 15 | + |
| 16 | + |
| 17 | +## This function should be called with the first revision of each |
| 18 | +## article. It clears the internal buffers and reinitializes the state. |
| 19 | +def newArticle(text, revision_id): |
| 20 | + global revision_buffer, hash_table |
| 21 | + global current_revision, previous_revision; |
| 22 | + global current_buffer_size, revision_order; |
| 23 | + |
| 24 | + revision_buffer = dict(); |
| 25 | + revision_buffer[revision_id] = text; |
| 26 | + |
| 27 | + hash_table = dict(); |
| 28 | + hash_table[hash(text)] = [revision_id]; |
| 29 | + |
| 30 | + current_revision = revision_id; |
| 31 | + previous_revision = -1; |
| 32 | + revision_order = [revision_id]; |
| 33 | + |
| 34 | + current_buffer_size = len(text); |
| 35 | + |
| 36 | + |
| 37 | +## When reading from a full history dump, this function should be called |
| 38 | +## with each subsequent revision of the article started with newArticle. |
| 39 | +def newRevision(text, revision_id): |
| 40 | + global revision_buffer, hash_table; |
| 41 | + global current_revision, previous_revision; |
| 42 | + global current_buffer_size, max_buffer_size, revision_order; |
| 43 | + |
| 44 | + previous_revision = current_revision; |
| 45 | + current_revision = revision_id; |
| 46 | + revision_order.append(current_revision); |
| 47 | + |
| 48 | + h = hash(text); |
| 49 | + if h not in hash_table: |
| 50 | + hash_table[h] = [current_revision]; |
| 51 | + else: |
| 52 | + hash_table[h].append(current_revision); |
| 53 | + |
| 54 | + ##Prevent Memory Overflow |
| 55 | + pos = 0; |
| 56 | + while current_buffer_size + len(text) > max_buffer_size \ |
| 57 | + and pos < len(revision_order) - 1: |
| 58 | + key = revision_order[pos]; |
| 59 | + if revision_buffer[key] != False: |
| 60 | + current_buffer_size -= len(revision_buffer[key]); |
| 61 | + revision_buffer[key] = False; |
| 62 | + pos += 1; |
| 63 | + |
| 64 | + revision_buffer[revision_id] = text; |
| 65 | + current_buffer_size += len(text); |
| 66 | + |
| 67 | + |
| 68 | +## When reading from a dump generated with ConvertToEditSyntax, this function |
| 69 | +## be called with each new <changes> block passed as a text string. |
| 70 | +def newChanges(xml, revision_id): |
| 71 | + global revision_buffer, current_revision; |
| 72 | + changes = readXMLToChanges(xml); |
| 73 | + |
| 74 | + start = -1; |
| 75 | + for k in range(len(changes)): |
| 76 | + if changes[k]['node'] == 'revert': |
| 77 | + text = revision_buffer[changes[0]['revision']]; |
| 78 | + start = k; |
| 79 | + |
| 80 | + if start >= 0: |
| 81 | + if len(changes) > start+1: |
| 82 | + text = differenceRestorer(text,changes[start+1:]); |
| 83 | + else: |
| 84 | + text = differenceRestorer(revision_buffer[current_revision],changes); |
| 85 | + |
| 86 | + newRevision(text, revision_id); |
| 87 | + |
| 88 | + |
| 89 | +## Returns the full text of the current revision. |
| 90 | +def getCurrentText(): |
| 91 | + global revision_buffer, current_revision; |
| 92 | + return revision_buffer[current_revision]; |
| 93 | + |
| 94 | + |
| 95 | +## Returns an XML formatted <changes> block comparing the current revision |
| 96 | +## to the previous one. |
| 97 | +def getXMLDifference(indent = 3, indentstr = " "): |
| 98 | + global revision_buffer, hash_table, current_revision, previous_revision; |
| 99 | + |
| 100 | + xml_out = ""; |
| 101 | + if previous_revision > -1: |
| 102 | + h = hash(revision_buffer[current_revision]); |
| 103 | + revert_id = -1; |
| 104 | + if len(hash_table[h]) > 1: |
| 105 | + choices = hash_table[h]; |
| 106 | + for c in choices[:-1]: |
| 107 | + if revision_buffer[c] == revision_buffer[current_revision]: |
| 108 | + revert_id = c; |
| 109 | + break; |
| 110 | + if revert_id > -1: |
| 111 | + changes = [dict({'node':'revert','revision':revert_id})]; |
| 112 | + else: |
| 113 | + changes = differenceGenerator(revision_buffer[previous_revision], |
| 114 | + revision_buffer[current_revision]); |
| 115 | + |
| 116 | + xml_out = XMLOutput(changes,indent,indentstr); |
| 117 | + if len(xml_out) > len(revision_buffer[current_revision]) + 40: |
| 118 | + changes = [dict({'node':'new','value':revision_buffer[current_revision]})]; |
| 119 | + xml_out = XMLOutput(changes,indent,indentstr); |
| 120 | + else: |
| 121 | + changes = [dict({'node':'new','value':revision_buffer[current_revision]})]; |
| 122 | + xml_out = XMLOutput(changes,indent,indentstr); |
| 123 | + |
| 124 | + return xml_out; |
| 125 | + |
| 126 | + |
| 127 | +## Takes a changes structure and produces a string with XML formatted output |
| 128 | +def XMLOutput(changes, indent = 3, indentstr = " "): |
| 129 | + in1 = ""; |
| 130 | + for k in range(indent): |
| 131 | + in1 += indentstr; |
| 132 | + in2 = in1 + indentstr; |
| 133 | + |
| 134 | + if len(changes) == 1 and changes[0]['node'] == 'new': |
| 135 | + st = in1 + '<text xml:space="preserve"'; |
| 136 | + if changes[0]['value'] != "": |
| 137 | + st += ">" + changes[0]['value'] + "</text>\n"; |
| 138 | + else: |
| 139 | + st += " />\n"; |
| 140 | + return st; |
| 141 | + |
| 142 | + need_breaks = False; |
| 143 | + for ch in changes: |
| 144 | + if 'value' in ch and "\n" in ch['value']: |
| 145 | + need_breaks = True; |
| 146 | + break; |
| 147 | + if need_breaks: |
| 148 | + st = in1 + "<changes xml:space=\"preserve\">\n"; |
| 149 | + else: |
| 150 | + st = in1 + "<changes>\n"; |
| 151 | + |
| 152 | + for ch in changes: |
| 153 | + if ch['node'] == 'replace' or ch['node'] == 'delete': |
| 154 | + if ch['line1'] == ch['line2']: |
| 155 | + ch['line'] = ch['line1']; |
| 156 | + else: |
| 157 | + ch['lines'] = unicode(ch['line1']) + "-" + \ |
| 158 | + unicode(ch['line2']); |
| 159 | + del ch['line1']; |
| 160 | + del ch['line2']; |
| 161 | + elif ch['node'] == 'text_replace': |
| 162 | + ch['pos'] = unicode(ch['pos_start']) + "-" + \ |
| 163 | + unicode(ch['pos_end']); |
| 164 | + del ch['pos_start']; |
| 165 | + del ch['pos_end']; |
| 166 | + |
| 167 | + st += in2 + "<" + ch['node']; |
| 168 | + for key in ch: |
| 169 | + if key != 'node' and key != 'value': |
| 170 | + st += " " + key + '="' + unicode(ch[key]) + '"'; |
| 171 | + if 'value' in ch and ch['value'] != '': |
| 172 | + st2 = ""; |
| 173 | + if ch['node'] == 'permute': |
| 174 | + keylist = ch['value'].keys(); |
| 175 | + keylist.sort(); |
| 176 | + for k in keylist: |
| 177 | + st2 += unicode(k) + ": " + unicode(ch['value'][k]) + ", "; |
| 178 | + st2 = "{" + st2[:-2] + "}"; |
| 179 | + else: |
| 180 | + st2 = unicode(ch['value']); |
| 181 | + st += ">" + st2 + "</" + ch['node'] + ">\n"; |
| 182 | + else: |
| 183 | + st += " />\n"; |
| 184 | + |
| 185 | + st += in1 + "</changes>\n"; |
| 186 | + return st; |
| 187 | + |
| 188 | + |
| 189 | +## Creates a changes structure denoting the transformation from |
| 190 | +## old_line to new_line. |
| 191 | +def textBlocking(old_line, new_line, line_number = False): |
| 192 | + |
| 193 | + min_char_block = 40; ## Number of consecutive matching characters |
| 194 | + ## within a block of changed text to justify |
| 195 | +                          ## splitting one replacement into two. |
| 196 | + |
| 197 | + matches = []; |
| 198 | + k1 = 0; |
| 199 | + min_k2 = 0; |
| 200 | + |
| 201 | + while k1 < len(old_line)-min_char_block: |
| 202 | + k2 = string.find(new_line,old_line[k1:k1+min_char_block],min_k2); |
| 203 | + if k2 >= 0: |
| 204 | + if k1 > 0 and k2 > 0 and old_line[k1-1] == new_line[k2-1]: |
| 205 | + break; |
| 206 | + s = min_char_block; |
| 207 | + while k1 + s < len(old_line) \ |
| 208 | + and k2 + s < len(new_line) \ |
| 209 | + and old_line[k1:k1+s+1] == new_line[k2:k2+s+1]: |
| 210 | + s += 1; |
| 211 | + matches.append([k1,k2,s]); |
| 212 | + k1 += s; |
| 213 | + min_k2 = k2 + s; |
| 214 | + break; |
| 215 | + k1 += 1; |
| 216 | + |
| 217 | + old = []; |
| 218 | + new = []; |
| 219 | + if len(matches) > 0: |
| 220 | + mlast_o = 0; |
| 221 | + mlast_n = 0; |
| 222 | + for m in matches: |
| 223 | + old.append(old_line[mlast_o:m[0]]); |
| 224 | + old.append(old_line[m[0]:m[0]+m[2]]); |
| 225 | + new.append(new_line[mlast_n:m[1]]); |
| 226 | + new.append(new_line[m[1]:m[1]+m[2]]); |
| 227 | + mlast_o = m[0]+m[2]; |
| 228 | + mlast_n = m[1]+m[2]; |
| 229 | + old.append(old_line[mlast_o:]); |
| 230 | + new.append(new_line[mlast_n:]); |
| 231 | + else: |
| 232 | + changes = [dict({'node':'replace','line1':line_number, |
| 233 | + 'line2':line_number,'value':new_line})]; |
| 234 | + return changes; |
| 235 | + |
| 236 | + changes = []; |
| 237 | + pos = 0; |
| 238 | + for k in range(len(old)): |
| 239 | + if old[k] != new[k]: |
| 240 | + p1 = 0; |
| 241 | + p2 = -1; |
| 242 | + while p1 < len(old[k]) \ |
| 243 | + and p1 < len(new[k]) \ |
| 244 | + and old[k][p1] == new[k][p1]: |
| 245 | + p1 += 1; |
| 246 | + |
| 247 | + while -p2-1 < len(old[k]) \ |
| 248 | + and -p2-1 < len(new[k]) \ |
| 249 | + and old[k][p2] == new[k][p2]: |
| 250 | + p2 -= 1; |
| 251 | + |
| 252 | + if p2 == -1: |
| 253 | + changes.append(dict({'node':'text_replace', |
| 254 | + 'line':line_number, |
| 255 | + 'pos_start':pos+p1, |
| 256 | + 'pos_end':pos+len(old[k]), |
| 257 | + 'value':new[k][p1:]})); |
| 258 | + else: |
| 259 | + changes.append(dict({'node':'text_replace', |
| 260 | + 'line':line_number, |
| 261 | + 'pos_start':pos+p1, |
| 262 | + 'pos_end':pos+len(old[k])+p2+1, |
| 263 | + 'value':new[k][p1:p2+1]})); |
| 264 | + pos += len(new[k]); |
| 265 | + else: |
| 266 | + pos += len(old[k]); |
| 267 | + |
| 268 | + return changes; |
| 269 | + |
| 270 | + |
| 271 | +## Private ## |
| 272 | +## |
| 273 | +## Takes a changes structure and combines consecutive entries if |
| 274 | +## the resulting output would be more compact. |
| 275 | +## |
| 276 | +## Must be called prior to calling normalizeLineNumbers |
| 277 | +def consolidateChanges(changes): |
| 278 | + k = 0; |
| 279 | + |
| 280 | + while k < len(changes) - 1: |
| 281 | + if changes[k]['node'] == 'replace': |
| 282 | + if changes[k+1]['node'] == 'replace' \ |
| 283 | + and changes[k+1]['line1'] == changes[k]['line2'] + 1: |
| 284 | + changes[k]['value'] += "\n" + changes[k+1]['value']; |
| 285 | + changes[k]['line2'] = changes[k+1]['line2']; |
| 286 | + del changes[k+1]; |
| 287 | + elif changes[k+1]['node'] == 'insert' \ |
| 288 | + and changes[k+1]['line'] == changes[k]['line2'] + 1: |
| 289 | + changes[k]['value'] += "\n" + changes[k+1]['value']; |
| 290 | + del changes[k+1]; |
| 291 | + elif changes[k+1]['node'] == 'delete' \ |
| 292 | + and changes[k+1]['line1'] == changes[k]['line2'] + 1: |
| 293 | + changes[k]['line2'] = changes[k+1]['line2']; |
| 294 | + del changes[k+1]; |
| 295 | + else: |
| 296 | + k += 1; |
| 297 | + elif changes[k]['node'] == 'insert': |
| 298 | + if changes[k+1]['node'] == 'replace' and \ |
| 299 | + changes[k+1]['line1'] == changes[k]['line']: |
| 300 | + ch = copy.copy(changes[k+1]); |
| 301 | +            ch['value'] = changes[k]['value'] + "\n" + ch['value']; |
| 302 | + changes[k] = ch; |
| 303 | + del changes[k+1]; |
| 304 | + elif changes[k+1]['node'] == 'insert' and \ |
| 305 | + changes[k+1]['line'] == changes[k]['line']: |
| 306 | + changes[k]['value'] += "\n" + changes[k+1]['value']; |
| 307 | + del changes[k+1]; |
| 308 | + elif changes[k+1]['node'] == 'delete' and \ |
| 309 | + changes[k+1]['line1'] == changes[k]['line']: |
| 310 | + ch = dict({'node':'replace'}); |
| 311 | + ch['line1'] = changes[k+1]['line1']; |
| 312 | + ch['line2'] = changes[k+1]['line2']; |
| 313 | + ch['value'] = changes[k]['value']; |
| 314 | + changes[k] = ch; |
| 315 | + del changes[k+1]; |
| 316 | + else: |
| 317 | + k += 1; |
| 318 | + |
| 319 | +## The current code never generates these cases; you |
| 320 | +## may want to uncomment this block if that changes. |
| 321 | +## |
| 322 | +## elif changes[k]['node'] == 'delete': |
| 323 | +## if changes[k+1]['node'] == 'replace' and \ |
| 324 | +## changes[k+1]['line1'] == changes[k]['line2']+1: |
| 325 | +## ch = copy.copy(changes[k+1]); |
| 326 | +## ch['line1'] = changes[k]['line1']; |
| 327 | +## changes[k] = ch; |
| 328 | +## del changes[k+1]; |
| 329 | +## elif changes[k+1]['node'] == 'insert' and changes[k+1]['line']+1 == changes[k]['line2']: |
| 330 | +## ch = dict({'node':'replace'}); |
| 331 | +## ch['line1'] = changes[k]['line1']; |
| 332 | +## ch['line2'] = changes[k]['line2']; |
| 333 | +## ch['value'] = changes[k+1]['value']; |
| 334 | +## changes[k] = ch; |
| 335 | +## del changes[k+1]; |
| 336 | +## elif changes[k+1]['node'] == 'delete' and \ |
| 337 | +## changes[k+1]['line1'] == changes[k]['line2']+1: |
| 338 | +## changes[k]['line2'] = changes[k+1]['line2']; |
| 339 | +## del changes[k+1]; |
| 340 | +## else: |
| 341 | +## k += 1; |
| 342 | + |
| 343 | + else: |
| 344 | + k += 1; |
| 345 | + |
| 346 | + return changes; |
| 347 | + |
| 348 | +## Private ## |
| 349 | +## Cleanup function addressing changes with embedded newlines. |
| 350 | +def normalizeLineNumbers(changes): |
| 351 | + lshift = 0; |
| 352 | + for ch in changes: |
| 353 | + if 'line' in ch: |
| 354 | + ch['line'] += lshift; |
| 355 | + if 'line1' in ch: |
| 356 | + ch['line1'] += lshift; |
| 357 | + ch['line2'] += lshift; |
| 358 | + if ch['node'] == 'delete': |
| 359 | + lshift -= ch['line2']-ch['line1']+1; |
| 360 | + if ch['node'] == 'insert': |
| 361 | + lshift += 1; |
| 362 | + if ch['node'] == 'replace': |
| 363 | + lshift -= ch['line2']-ch['line1']; |
| 364 | + if 'value' in ch and ch['node'] != 'permute': |
| 365 | + lshift += ch['value'].count("\n"); |
| 366 | + |
| 367 | + return changes; |
| 368 | + |
| 369 | +## Private ## |
| 370 | +## Converts zero-indexed values to one-indexed values for easier human readability. |
| 371 | +def addOne(changes): |
| 372 | + for ch in changes: |
| 373 | + if 'line' in ch: |
| 374 | + ch['line'] += 1; |
| 375 | + if 'line1' in ch: |
| 376 | + ch['line1'] += 1; |
| 377 | + ch['line2'] += 1; |
| 378 | + if 'pos_start' in ch: |
| 379 | + ch['pos_start'] += 1; |
| 380 | + ch['pos_end'] += 1; |
| 381 | + if ch['node'] == 'permute': |
| 382 | + per = dict(); |
| 383 | + for key in ch['value']: |
| 384 | + per[key+1] = ch['value'][key] + 1; |
| 385 | + ch['value'] = per; |
| 386 | + |
| 387 | + return changes; |
| 388 | + |
| 389 | +## Private ## |
| 390 | +## Converts one-indexed values back to zero-indexed values. |
| 391 | +def subtractOne(changes): |
| 392 | + for ch in changes: |
| 393 | + if 'line' in ch: |
| 394 | + ch['line'] -= 1; |
| 395 | + if 'line1' in ch: |
| 396 | + ch['line1'] -= 1; |
| 397 | + ch['line2'] -= 1; |
| 398 | + if 'pos_start' in ch: |
| 399 | + ch['pos_start'] -= 1; |
| 400 | + ch['pos_end'] -= 1; |
| 401 | + if ch['node'] == 'permute': |
| 402 | + per = dict(); |
| 403 | + for key in ch['value']: |
| 404 | + per[key-1] = ch['value'][key] - 1; |
| 405 | + ch['value'] = per; |
| 406 | + |
| 407 | + return changes; |
| 408 | + |
| 409 | + |
| 410 | +## Private ## |
| 411 | +## Called by differenceGenerator for cases requiring permutations. |
| 412 | +def permutedDifferenceGenerator(lines1, lines2, line_map): |
| 413 | + changes = []; |
| 414 | + |
| 415 | + rlnm = dict(); |
| 416 | + rlnm[0] = 0; |
| 417 | + last_k = 0; |
| 418 | + for k in range(len(lines2)): |
| 419 | + if line_map[k] != rlnm[last_k] + k-last_k and line_map[k] >= 0: |
| 420 | + rlnm[k] = line_map[k]; |
| 421 | + last_k = k; |
| 422 | + if rlnm[0] == 0: |
| 423 | + del rlnm[0]; |
| 424 | + |
| 425 | + changes.append(dict({'node':'permute','length':len(lines2),'value':rlnm})); |
| 426 | + line_map2 = regeneratePermuteMap(len(lines2),rlnm); |
| 427 | + |
| 428 | + for k in range(len(lines2)): |
| 429 | + |
| 430 | + if line_map[k] == -1: |
| 431 | + if line_map2[k] < len(lines1) and \ |
| 432 | + lines1[line_map2[k]] == lines2[k]: |
| 433 | + line_map[k] = line_map2[k]; |
| 434 | + continue; |
| 435 | + if line_map2[k] < len(lines1) \ |
| 436 | + and k < len(lines2) and line_map2[k] >= 0: |
| 437 | + changes.extend(textBlocking(lines1[line_map2[k]],lines2[k],k)); |
| 438 | + continue; |
| 439 | + |
| 440 | + changes.append(dict({'node':"replace", |
| 441 | + 'line1':k, |
| 442 | + 'line2':k, |
| 443 | + 'value':lines2[k]})); |
| 444 | + if line_map[k] == -10: |
| 445 | + if line_map2[k] < len(lines1) and \ |
| 446 | + lines1[line_map2[k]] == lines2[k]: |
| 447 | + line_map[k] = line_map2[k]; |
| 448 | + continue; |
| 449 | + if line_map2[k] >= len(lines1): |
| 450 | + continue; |
| 451 | + changes.append(dict({'node':"replace", |
| 452 | + 'line1':k, |
| 453 | + 'line2':k, |
| 454 | + 'value':""})); |
| 455 | + |
| 456 | + changes = consolidateChanges(changes); |
| 457 | + changes = normalizeLineNumbers(changes); |
| 458 | + changes = addOne(changes); |
| 459 | + |
| 460 | + return changes; |
| 461 | + |
| 462 | + |
| 463 | +## Creates a change structure showing the differences between old_text |
| 464 | +## and new_text. |
| 465 | +def differenceGenerator(old_text,new_text): |
| 466 | + lines1 = old_text.split("\n"); |
| 467 | + lines2 = new_text.split("\n"); |
| 468 | + |
| 469 | + line_map = dict(); |
| 470 | + for k in range(len(lines2)): |
| 471 | + ln = lines2[k]; |
| 472 | + if ln == "": |
| 473 | + k2 = k - 1; |
| 474 | + while k2 >= 0 and line_map[k2] < 0: |
| 475 | + k2 = k2 - 1; |
| 476 | + df = k - k2; |
| 477 | + if k2 >= 0: |
| 478 | + if line_map[k2] + df < len(lines1) and lines1[line_map[k2]+df] == "": |
| 479 | + line_map[k] = line_map[k2] + df; |
| 480 | + else: |
| 481 | + line_map[k] = -10; |
| 482 | + else: |
| 483 | + line_map[k] = -10; |
| 484 | + continue; |
| 485 | + if ln in lines1: |
| 486 | + if k > 0: |
| 487 | + if line_map[k-1]+1 < len(lines1) and line_map[k-1] >= 0 \ |
| 488 | + and lines1[line_map[k-1]+1] == ln: |
| 489 | + line_map[k] = line_map[k-1]+1; |
| 490 | + continue; |
| 491 | + line_map[k] = lines1.index(ln); |
| 492 | + else: |
| 493 | + line_map[k] = -1; |
| 494 | + if k >= 2 and line_map[k-2] == -1: |
| 495 | + if line_map[k-1] >= 0 and len(lines1[line_map[k-1]]) < 10: |
| 496 | + line_map[k-1] = -1; |
| 497 | + |
| 498 | + k_last = 0; |
| 499 | + for k in range(1,len(line_map)): |
| 500 | + if line_map[k] <= line_map[k_last] and line_map[k] >= 0: |
| 501 | + return permutedDifferenceGenerator(lines1,lines2,line_map); |
| 502 | + if line_map[k] >= 0: |
| 503 | + k_last = k; |
| 504 | + |
| 505 | + changes = []; |
| 506 | + k2 = 0; |
| 507 | + k = 0; |
| 508 | + while k < len(line_map): |
| 509 | + if line_map[k] == k2: |
| 510 | + k2 = k2 + 1; |
| 511 | + k = k + 1; |
| 512 | + continue; |
| 513 | + if line_map[k] == -1: |
| 514 | + ks = k + 1; |
| 515 | + while (ks < len(line_map) and line_map[ks] < k2) or \ |
| 516 | + (ks+1 < len(line_map) and lines2[ks] == ""): |
| 517 | + ks += 1; |
| 518 | + if ks >= len(line_map): |
| 519 | + if k2 > 0: |
| 520 | + changes.append(dict({'node':"truncate",'line':k2})); |
| 521 | + changes.append(dict({'node':"append", |
| 522 | + 'value':string.join(lines2[k:],"\n")})); |
| 523 | + else: |
| 524 | + changes.append(dict({'node':"new", |
| 525 | + 'value':string.join(lines2,"\n")})); |
| 526 | + break; |
| 527 | + else: |
| 528 | + if line_map[ks] == k2: |
| 529 | + changes.append(dict({'node':"insert", |
| 530 | + 'line':k2, |
| 531 | + 'value':string.join(lines2[k:ks],"\n")})); |
| 532 | + k = ks; |
| 533 | + continue; |
| 534 | + else: |
| 535 | + ch2 = []; |
| 536 | + for j in range(k,ks): |
| 537 | + if k2 + j - k < line_map[ks]: |
| 538 | + if lines2[j] != "": |
| 539 | + ch2.extend(textBlocking(lines1[k2+j-k],lines2[j],k2+j-k)); |
| 540 | + else: |
| 541 | + if len(ch2) > 0 and ch2[-1]['node'] == 'replace': |
| 542 | + ch2[-1]['line2'] += 1; |
| 543 | + ch2[-1]['value'] += "\n"; |
| 544 | + else: |
| 545 | + ch2.append(dict({'node':"replace", |
| 546 | + 'line1':k2+j-k, |
| 547 | + 'line2':k2+j-k, |
| 548 | + 'value':""})); |
| 549 | + else: |
| 550 | + ch2.append(dict({'node':"insert", |
| 551 | + 'line':line_map[ks], |
| 552 | + 'value':lines2[j]})); |
| 553 | + |
| 554 | + if ks-k < line_map[ks]-k2: |
| 555 | + ch2.append(dict({'node':"delete", |
| 556 | + 'line1':k2+ks-k, |
| 557 | + 'line2':line_map[ks]-1})); |
| 558 | + |
| 559 | + changes.extend(ch2); |
| 560 | + |
| 561 | + k2 = line_map[ks]; |
| 562 | + k = ks; |
| 563 | + continue; |
| 564 | + if line_map[k] == -10: |
| 565 | + ks = k + 1; |
| 566 | + while ks < len(line_map) and line_map[ks] == -10: |
| 567 | + ks += 1; |
| 568 | + if ks < len(line_map) and k2 == line_map[ks]: |
| 569 | + temp = []; |
| 570 | + for j in range(k,ks): |
| 571 | + temp.append(""); |
| 572 | + changes.append(dict({'node':"insert", |
| 573 | + 'line':k2, |
| 574 | + 'value':string.join(temp,"\n")})); |
| 575 | + k = ks; |
| 576 | + continue; |
| 577 | + else: |
| 578 | + changes.append(dict({'node':"replace", |
| 579 | + 'line1':k2, |
| 580 | + 'line2':k2, |
| 581 | + 'value':""})); |
| 582 | + k2 += 1; |
| 583 | + k += 1; |
| 584 | + continue; |
| 585 | + if line_map[k] > k2: |
| 586 | + changes.append(dict({'node':'delete', |
| 587 | + 'line1':k2, |
| 588 | + 'line2':line_map[k]-1})); |
| 589 | + k2 = line_map[k]+1; |
| 590 | + k = k + 1; |
| 591 | + continue; |
| 592 | + if line_map[k] < k2 and line_map[k] > 0: |
| 593 | + k2 = k2 -1; |
| 594 | + |
| 595 | + print k; |
| 596 | + k = k + 1; |
| 597 | + k2 = k2 + 1; |
| 598 | + |
| 599 | + if k2 < len(lines1) and len(changes) > 0: |
| 600 | + if changes[-1]['node'] != 'append' and changes[-1]['node'] != 'truncate' \ |
| 601 | + and changes[-1]['node'] != 'new': |
| 602 | + changes.append(dict({'node':"truncate",'line':k2})); |
| 603 | + if k2 < len(lines1) and len(changes) == 0: |
| 604 | + changes.append(dict({'node':"truncate",'line':k2})); |
| 605 | + |
| 606 | + changes = consolidateChanges(changes); |
| 607 | + changes = normalizeLineNumbers(changes); |
| 608 | + changes = addOne(changes); |
| 609 | + |
| 610 | + return changes; |
| 611 | + |
| 612 | + |
| 613 | +## Reconstructs a full line map from the condensed <permute> data. |
| 614 | +def regeneratePermuteMap(length,permute_data): |
| 615 | + line_map = dict(); |
| 616 | + last_k = 0; |
| 617 | + line_map[0] = 0; |
| 618 | + for k in range(length): |
| 619 | + if k in permute_data: |
| 620 | + line_map[k] = permute_data[k]; |
| 621 | + last_k = k; |
| 622 | + else: |
| 623 | + line_map[k] = line_map[last_k] + k-last_k; |
| 624 | + |
| 625 | + return line_map; |
| 626 | + |
| 627 | + |
| 628 | +## Applies "changes" to "old_text" and returns the new text. |
| 629 | +def differenceRestorer(old_text,changes): |
| 630 | + changes = subtractOne(changes); |
| 631 | + |
| 632 | + lines1 = old_text.split("\n"); |
| 633 | + lines2 = copy.copy(lines1); |
| 634 | + |
| 635 | + lshift = 0; ##Keep track of newline insertions, faster than |
| 636 | +                ##restructuring each time a newline is inserted. |
| 637 | + |
| 638 | + last_line = -1; |
| 639 | + |
| 640 | + for k in range(len(changes)): |
| 641 | + ch = changes[k]; |
| 642 | + |
| 643 | + ##Detect out of order operations |
| 644 | + current_line = -1; |
| 645 | + if 'line' in ch: |
| 646 | + current_line = ch['line']; |
| 647 | + elif 'line2' in ch: |
| 648 | + current_line = ch['line2']; |
| 649 | + if current_line >= 0 and current_line < last_line: |
| 650 | + lines2 = string.join(lines2,"\n"); |
| 651 | + lines2 = lines2.split("\n"); |
| 652 | + lshift = 0; |
| 653 | + |
| 654 | + if ch['node'] == 'permute': |
| 655 | + lines1 = lines2; |
| 656 | + lines2 = []; |
| 657 | + |
| 658 | + line_map = regeneratePermuteMap(ch['length'], |
| 659 | + ch['value']); |
| 660 | + |
| 661 | + for k in range(ch['length']): |
| 662 | + lines2.append(""); |
| 663 | + for k in line_map: |
| 664 | + if line_map[k] >= 0 and line_map[k] < len(lines1): |
| 665 | + lines2[k] = copy.copy(lines1[line_map[k]]); |
| 666 | + |
| 667 | + elif ch['node'] == 'insert': |
| 668 | + lines2.insert(ch['line']-lshift,ch['value']); |
| 669 | + elif ch['node'] == 'replace': |
| 670 | + del lines2[ch['line1']-lshift:ch['line2']-lshift+1]; |
| 671 | + lines2.insert(ch['line1']-lshift,ch['value']); |
| 672 | + elif ch['node'] == 'text_replace': |
| 673 | + ln = lines2[ch['line']-lshift]; |
| 674 | + ln = ln[:ch['pos_start']] + ch['value'] + ln[ch['pos_end']:]; |
| 675 | + lines2[ch['line']-lshift] = ln; |
| 676 | + elif ch['node'] == 'truncate': |
| 677 | + del lines2[ch['line']-lshift:]; |
| 678 | + elif ch['node'] == 'append': |
| 679 | + lines2.append(ch['value']); |
| 680 | + elif ch['node'] == 'delete': |
| 681 | + del lines2[ch['line1']-lshift:ch['line2']-lshift+1]; |
| 682 | + elif ch['node'] == 'new': |
| 683 | + lines2 = ch['value'].split("\n"); |
| 684 | + lshift = 0; |
| 685 | + last_line = -1; |
| 686 | + |
| 687 | + if 'value' in ch and ch['node'] != 'permute': |
| 688 | + new_lines = ch['value'].count("\n"); |
| 689 | + lshift += new_lines; |
| 690 | + |
| 691 | + if new_lines > 0: |
| 692 | + if 'line' in ch: |
| 693 | + last_line = ch['line']; |
| 694 | + elif 'line2' in ch: |
| 695 | + last_line = ch['line2']; |
| 696 | + else: |
| 697 | + last_line = len(lines2) + lshift; |
| 698 | + |
| 699 | + return string.join(lines2,"\n"); |
| 700 | + |
| 701 | + |
| 702 | +## A simplified XML parser that translates a <changes> block, given as a |
| 703 | +## string, into the changes structure used internally. |
| 704 | +def readXMLToChanges(xml): |
| 705 | + tagre = re.compile("<([^>]*)(?:>([^<]*)</[^>]*>| />)",re.I+re.S); |
| 706 | + kwre = re.compile("([^ ]*)=\"([^\"]*)\"",re.I+re.S); |
| 707 | + permutere = re.compile("([0-9]*): ([0-9]*)"); |
| 708 | + |
| 709 | + changes = [] |
| 710 | + tags = tagre.findall(xml); |
| 711 | + for tg in tags: |
| 712 | + tag = dict(); |
| 713 | + f1 = tg[0].find(" "); |
| 714 | + if f1 > -1: |
| 715 | + tag['node'] = tg[0][:f1]; |
| 716 | + else: |
| 717 | + tag['node'] = tg[0]; |
| 718 | + kws = kwre.findall(tg[0]); |
| 719 | + for kw in kws: |
| 720 | + if kw[1].isdigit(): |
| 721 | + tag[kw[0]] = int(kw[1]); |
| 722 | + else: |
| 723 | + tag[kw[0]] = kw[1]; |
| 724 | + tag['value'] = tg[1]; |
| 725 | + changes.append(tag); |
| 726 | + |
| 727 | + for ch in changes: |
| 728 | + if 'line' in ch: |
| 729 | + if ch['node'] == 'replace' or ch['node'] == 'delete': |
| 730 | + ch['line1'] = ch['line']; |
| 731 | + ch['line2'] = ch['line']; |
| 732 | + del ch['line']; |
| 733 | + if 'lines' in ch: |
| 734 | + f = ch['lines'].find('-'); |
| 735 | + ch['line1'] = int(ch['lines'][:f]); |
| 736 | + ch['line2'] = int(ch['lines'][f+1:]); |
| 737 | + del ch['lines']; |
| 738 | + if 'pos' in ch: |
| 739 | + f = ch['pos'].find('-'); |
| 740 | + ch['pos_start'] = int(ch['pos'][:f]); |
| 741 | + ch['pos_end'] = int(ch['pos'][f+1:]); |
| 742 | + del ch['pos']; |
| 743 | + if ch['node'] == 'permute': |
| 744 | + vals = permutere.findall(ch['value']); |
| 745 | + map_values = dict(); |
| 746 | + for v in vals: |
| 747 | + map_values[int(v[0])] = int(v[1]); |
| 748 | + ch['value'] = map_values; |
| 749 | + |
| 750 | + return changes; |
| 751 | + |
Property changes on: trunk/tools/editsyntax/EditSyntax.py |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 752 | + native |
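For reference, a minimal sketch of how the EditSyntax module above might be driven when compressing a page history. The revision IDs and texts are made-up examples, and the snippet follows the module's own Python 2 conventions.

    import EditSyntax

    # Made-up revision history for one page: (revision_id, full revision text).
    history = [
        (1001, "First line\nSecond line"),
        (1002, "First line\nSecond line, edited"),
        (1003, "First line\nSecond line"),   # identical to 1001, i.e. a revert
    ]

    # The first revision of each page resets the module's internal buffers.
    first_id, first_text = history[0]
    EditSyntax.newArticle(first_text, first_id)

    # Each later revision is registered with newRevision; getXMLDifference then
    # yields a <changes> block, a <revert /> element when the text hashes to an
    # earlier buffered revision, or a plain <text> element when the diff would
    # be longer than the new text itself.
    for rev_id, text in history[1:]:
        EditSyntax.newRevision(text, rev_id)
        print EditSyntax.getXMLDifference()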
Index: trunk/tools/editsyntax/ConvertToEditSyntax.py |
— | — | @@ -0,0 +1,167 @@ |
| 2 | +#-*- coding: utf-8 -*- |
| 3 | + |
| 4 | +## This script compresses the revision history of a database dump or |
| 5 | +## export file into a more compact "edit syntax". |
| 6 | +## |
| 7 | +## Usage: ConvertToEditSyntax input_file output_file |
| 8 | +## |
| 9 | +## An optional flag "-v" enables a verbose mode with |
| 10 | +## rolling feedback on the script's progress. |
| 11 | +## |
| 12 | +## This process takes approximately 1 minute for every 250 MB of input. |
| 13 | + |
| 14 | +import os, string, re, time, sys; |
| 15 | +import codecs; |
| 16 | +import EditSyntax |
| 17 | + |
| 18 | +blockdata = ""; |
| 19 | +blockpos = 0; |
| 20 | +blocksize = 500000; |
| 21 | + |
| 22 | +chars = 0; |
| 23 | +compressed = 0; |
| 24 | +cnt = 0; |
| 25 | +revcnt = 0; |
| 26 | + |
| 27 | +argv = sys.argv; |
| 28 | +input_file = ""; |
| 29 | +output_file = ""; |
| 30 | + |
| 31 | +def readRevision(f): |
| 32 | + global blockdata, blockpos, blocksize; |
| 33 | + p2 = blockdata.find("</revision>",blockpos); |
| 34 | + if p2 > -1: |
| 35 | + st = blockdata[blockpos:p2+11]; |
| 36 | + blockpos = p2+11; |
| 37 | + return st; |
| 38 | + else: |
| 39 | + while p2 == -1: |
| 40 | + new_block = fin.read(blocksize) |
| 41 | + blockdata = blockdata[blockpos:] + new_block; |
| 42 | + blockpos = 0; |
| 43 | + p2 = blockdata.find("</revision>",blockpos); |
| 44 | + if new_block == "": |
| 45 | + return ""; |
| 46 | + st = blockdata[blockpos:p2+11]; |
| 47 | + blockpos = p2+11; |
| 48 | + return st; |
| 49 | + |
| 50 | + |
| 51 | +if '-v' in argv: |
| 52 | + verbose = True; |
| 53 | + argv.remove('-v'); |
| 54 | +else: |
| 55 | + verbose = False; |
| 56 | + |
| 57 | +##verbose = True; |
| 58 | +##fname = "cowiki-20081203"; |
| 59 | +##input_file = "J:\\Wikidata\\" + fname + "-pages-meta-history.xml"; |
| 60 | +##output_file = "J:\\Wikidata\\"+ fname + "-pages-shrunk.xml"; |
| 61 | + |
| 62 | +if len(argv) >= 3: |
| 63 | + input_file = argv[1]; |
| 64 | + output_file = argv[2]; |
| 65 | + |
| 66 | +if input_file == "" or output_file == "": |
| 67 | + verbose = True; |
| 68 | + input_file = raw_input("File to compress? "); |
| 69 | + output_file = raw_input("Destination file? "); |
| 70 | + print "\n"; |
| 71 | + |
| 72 | +filesize = os.path.getsize(input_file); |
| 73 | +fin = codecs.open(input_file,'r','utf-8'); |
| 74 | +fout = codecs.open(output_file,"w", 'utf-8'); |
| 75 | + |
| 76 | +textre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<text[^>]*>([^<]*)</text>",re.I+re.S); |
| 77 | +revidre = re.compile("<revision>\\s*<id>([0-9]*)</id>",re.I+re.S); |
| 78 | + |
| 79 | +ct = time.clock(); |
| 80 | + |
| 81 | +A=readRevision(fin); |
| 82 | + |
| 83 | +if verbose: |
| 84 | + print " Pages Revisions File Read Compression Time Rev/s"; |
| 85 | + |
| 86 | +while A != "": |
| 87 | + |
| 88 | + if A.find("</page>") > -1 or cnt == 0: |
| 89 | + rev1 = A; |
| 90 | + text1 = textre.findall(A); |
| 91 | + if len(text1) > 0: |
| 92 | + revid = int(text1[0][0]); |
| 93 | + text1 = text1[0][1]; |
| 94 | + else: |
| 95 | + revid = revidre.findall(A); |
| 96 | + revid = int(revid[0]); |
| 97 | + text1 = ""; |
| 98 | + EditSyntax.newArticle(text1,revid); |
| 99 | + fout.write(rev1); |
| 100 | + |
| 101 | + revcnt += 1; |
| 102 | + cnt += 1; |
| 103 | + else: |
| 104 | + rev2 = A; |
| 105 | + text2 = textre.findall(A); |
| 106 | + if len(text2) > 0: |
| 107 | + revid = int(text2[0][0]); |
| 108 | + text2 = text2[0][1]; |
| 109 | + elif A.find("<text ") > -1: |
| 110 | + revid = revidre.findall(A); |
| 111 | + revid = int(revid[0]); |
| 112 | + text2 = ""; |
| 113 | + |
| 114 | + EditSyntax.newRevision(text2,revid); |
| 115 | + revcnt += 1; |
| 116 | + |
| 117 | + if text2 != "": |
| 118 | + output = EditSyntax.getXMLDifference(); |
| 119 | + |
| 120 | + if verbose: |
| 121 | + chars += len(text2) + 43; |
| 122 | + compressed += len(output); |
| 123 | + |
| 124 | + p1 = rev2.find("<text")-6; |
| 125 | + p2 = rev2.find("</text>")+8; |
| 126 | + rev2 = rev2[:p1] + output + rev2[p2:]; |
| 127 | + |
| 128 | + fout.write(rev2); |
| 129 | + |
| 130 | + rev1 = rev2; |
| 131 | + text1 = text2; |
| 132 | + |
| 133 | + if revcnt % 1000 == 0 and verbose: |
| 134 | + remaining = (filesize - fin.tell()) * (time.clock()-ct) / fin.tell(); |
| 135 | + hours = remaining / 3600; |
| 136 | + minutes = (remaining % 3600) / 60; |
| 137 | + seconds = remaining % 60; |
| 138 | + print "%(1)6i %(2)8i %(3)5iM" % \ |
| 139 | + {'1': cnt, '2': revcnt, '3': fin.tell() / 2**20}, \ |
| 140 | + "(%(1)4.1f%%)" % {'1': fin.tell() / float(filesize) * 100}, \ |
| 141 | + " %(1)2.2f%%" % {'1': fout.tell() / float(fin.tell())*100}, \ |
| 142 | + "(%(1)2.2f%%)" % {'1': compressed / float(chars)*100}, \ |
| 143 | + " %(hour)i:%(min)02i:%(sec)02i" % \ |
| 144 | + {'hour' : hours, 'min' : minutes, 'sec': seconds}, \ |
| 145 | + " %(1)6.1f" % {'1': revcnt / (time.clock()-ct)}; |
| 146 | + |
| 147 | + A = readRevision(fin); |
| 148 | + |
| 149 | +fout.write(blockdata); |
| 150 | + |
| 151 | +if verbose: |
| 152 | + fout.flush(); |
| 153 | + elapsed = time.clock()-ct |
| 154 | + hours = elapsed / 3600; |
| 155 | + minutes = (elapsed % 3600) / 60; |
| 156 | + seconds = elapsed % 60; |
| 157 | + print "\n ", cnt, "Pages, ", revcnt, "Revisions, ", \ |
| 158 | + "%(hour)i:%(min)02i:%(sec)02i Processing Time\n" % \ |
| 159 | + {'hour' : hours, 'min' : minutes, 'sec': seconds}; |
| 160 | + print " Total File Size: ", fin.tell(), "-->", fout.tell(), \ |
| 161 | + "(%(1)2.2f%%)" % {'1': fout.tell() / float(fin.tell())*100}; |
| 162 | + print " Compressible Characters: ", chars, "-->", compressed, \ |
| 163 | + "(%(1)2.2f%%)" % {'1': compressed / float(chars)*100}; |
| 164 | + |
| 165 | + |
| 166 | +fin.close(); |
| 167 | +fout.close(); |
| 168 | + |
Property changes on: trunk/tools/editsyntax/ConvertToEditSyntax.py |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 169 | + native |
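Conversely, a minimal sketch of the restore path that ConvertFromEditSyntax.py (below) relies on: replaying a stored change element against the buffered previous revision. The base text, revision IDs, and the hand-written <replace> element are illustrative only.

    import EditSyntax

    # Seed the buffer with an illustrative base revision.
    EditSyntax.newArticle("First line\nSecond line", 1001)

    # Body of a <changes> element, as ConvertFromEditSyntax extracts it from the
    # compressed dump; this one replaces line 2 (line numbers are 1-indexed).
    changes_body = '<replace line="2">Second line, edited</replace>'

    # newChanges parses the element(s), applies them to the current revision,
    # and registers the result as revision 1002.
    EditSyntax.newChanges(changes_body, 1002)
    print EditSyntax.getCurrentText()   # "First line\nSecond line, edited"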
Index: trunk/tools/editsyntax/ConvertFromEditSyntax.py |
— | — | @@ -0,0 +1,161 @@ |
| 2 | +#-*- coding: utf-8 -*- |
| 3 | + |
| 4 | +## This script reverses the compression created by ConvertToEditSyntax. |
| 5 | +## |
| 6 | +## Usage: ConvertFromEditSyntax input_file output_file |
| 7 | +## |
| 8 | +## An optional flag "-v" enables a verbose mode with |
| 9 | +## rolling feedback on the script's progress. |
| 10 | +## |
| 11 | +## This process takes approximately 1 minute for every 175 MB of input. |
| 12 | + |
| 13 | +import os, sys, string, re, time; |
| 14 | +import codecs; |
| 15 | +import EditSyntax |
| 16 | + |
| 17 | +blockdata = ""; |
| 18 | +blockpos = 0; |
| 19 | +blocksize = 500000; |
| 20 | + |
| 21 | +cnt = 0; |
| 22 | +revcnt = 0; |
| 23 | + |
| 24 | +argv = sys.argv; |
| 25 | +input_file = ""; |
| 26 | +output_file = ""; |
| 27 | + |
| 28 | +def readRevision(f): |
| 29 | + global blockdata, blockpos, blocksize; |
| 30 | + p2 = blockdata.find("</revision>",blockpos); |
| 31 | + if p2 > -1: |
| 32 | + st = blockdata[blockpos:p2+11]; |
| 33 | + blockpos = p2+11; |
| 34 | + return st; |
| 35 | + else: |
| 36 | + while p2 == -1: |
| 37 | + new_block = fin.read(blocksize) |
| 38 | + blockdata = blockdata[blockpos:] + new_block; |
| 39 | + blockpos = 0; |
| 40 | + p2 = blockdata.find("</revision>",blockpos); |
| 41 | + if new_block == "": |
| 42 | + return ""; |
| 43 | + st = blockdata[blockpos:p2+11]; |
| 44 | + blockpos = p2+11; |
| 45 | + return st; |
| 46 | + |
| 47 | +if '-v' in argv: |
| 48 | + verbose = True; |
| 49 | + argv.remove('-v'); |
| 50 | +else: |
| 51 | + verbose = False; |
| 52 | + |
| 53 | +##verbose = True; |
| 54 | +##fname = "cowiki-20081203"; |
| 55 | +##input_file = "J:\\Wikidata\\" + fname + "-pages-shrunk.xml"; |
| 56 | +##output_file = "J:\\Wikidata\\"+ fname + "-pages-restored.xml"; |
| 57 | + |
| 58 | +if len(argv) >= 3: |
| 59 | + input_file = argv[1]; |
| 60 | + output_file = argv[2]; |
| 61 | + |
| 62 | +if input_file == "" or output_file == "": |
| 63 | + verbose = True; |
| 64 | + input_file = raw_input("File to decompress? "); |
| 65 | + output_file = raw_input("Destination file? "); |
| 66 | + print "\n"; |
| 67 | + |
| 68 | +filesize = os.path.getsize(input_file); |
| 69 | +fin = codecs.open(input_file,'r','utf-8'); |
| 70 | +fout = codecs.open(output_file,"w", 'utf-8'); |
| 71 | + |
| 72 | +textre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<text[^>]*>([^<]*)</text>",re.I+re.S); |
| 73 | +changesre = re.compile("<revision>\\s*<id>([0-9]*)</id>.*<changes[^>]*>(.*)</changes>",re.I+re.S); |
| 74 | +revidre = re.compile("<revision>\\s*<id>([0-9]*)</id>",re.I+re.S); |
| 75 | + |
| 76 | +ct = time.clock(); |
| 77 | + |
| 78 | +A=readRevision(fin); |
| 79 | + |
| 80 | +if verbose: |
| 81 | + print " Pages Revisions File Read Output Time Rev/s"; |
| 82 | + |
| 83 | +while A != "": |
| 84 | + |
| 85 | + if A.find("</page>") > -1 or cnt == 0: |
| 86 | + rev1 = A; |
| 87 | + text1 = textre.findall(A); |
| 88 | + if len(text1) > 0: |
| 89 | + revid = int(text1[0][0]); |
| 90 | + text1 = text1[0][1]; |
| 91 | + else: |
| 92 | + revid = revidre.findall(A); |
| 93 | + revid = int(revid[0]); |
| 94 | + text1 = ""; |
| 95 | + EditSyntax.newArticle(text1, revid); |
| 96 | + revcnt += 1; |
| 97 | + fout.write(rev1); |
| 98 | + cnt += 1; |
| 99 | + |
| 100 | + else: |
| 101 | + rev2 = A; |
| 102 | + text2 = textre.findall(A); |
| 103 | + changes = ""; |
| 104 | + if len(text2) > 0: |
| 105 | + revid = int(text2[0][0]); |
| 106 | + text2 = text2[0][1]; |
| 107 | + EditSyntax.newRevision(text2, revid); |
| 108 | + elif A.find("<text ") > -1: |
| 109 | + revid = revidre.findall(A); |
| 110 | + revid = int(revid[0]); |
| 111 | + text2 = ""; |
| 112 | + EditSyntax.newRevision(text2, revid); |
| 113 | + else: |
| 114 | + changes = changesre.findall(A); |
| 115 | + if len(changes) > 0: |
| 116 | + EditSyntax.newChanges(changes[0][1],int(changes[0][0])); |
| 117 | + text2 = EditSyntax.getCurrentText(); |
| 118 | + else: |
| 119 | + text2 = ""; |
| 120 | + |
| 121 | + revcnt += 1; |
| 122 | + if changes != "": |
| 123 | + p1 = rev2.find("<changes"); |
| 124 | + p2 = rev2.find("</changes>")+10; |
| 125 | + rev2 = rev2[:p1] + '<text xml:space="preserve">' + \ |
| 126 | + text2 + "</text>" + rev2[p2:]; |
| 127 | + |
| 128 | + fout.write(rev2); |
| 129 | + |
| 130 | + rev1 = rev2; |
| 131 | + text1 = text2; |
| 132 | + |
| 133 | + if revcnt % 5000 == 0 and verbose: |
| 134 | + remaining = (filesize - fin.tell()) * (time.clock()-ct) / fin.tell(); |
| 135 | + hours = remaining / 3600; |
| 136 | + minutes = (remaining % 3600) / 60; |
| 137 | + seconds = remaining % 60; |
| 138 | + print "%(1)6i %(2)8i %(3)5iM" % \ |
| 139 | + {'1': cnt, '2': revcnt, '3': fin.tell() / 2**20}, \ |
| 140 | + "(%(1)4.1f%%)" % {'1': fin.tell() / float(filesize) * 100}, \ |
| 141 | + " %(1)5iM" % {'1': fout.tell() / 2**20}, \ |
| 142 | + " %(hour)i:%(min)02i:%(sec)02i" % \ |
| 143 | + {'hour' : hours, 'min' : minutes, 'sec': seconds}, \ |
| 144 | + " %(1)6.1f" % {'1': revcnt / (time.clock()-ct)}; |
| 145 | + |
| 146 | + A = readRevision(fin); |
| 147 | + |
| 148 | +fout.write(blockdata); |
| 149 | + |
| 150 | +if verbose: |
| 151 | + fout.flush(); |
| 152 | + elapsed = time.clock()-ct |
| 153 | + hours = elapsed / 3600; |
| 154 | + minutes = (elapsed % 3600) / 60; |
| 155 | + seconds = elapsed % 60; |
| 156 | + print "\n ", cnt, "Pages, ", revcnt, "Revisions, ", \ |
| 157 | + "%(hour)i:%(min)02i:%(sec)02i Processing Time\n" % \ |
| 158 | + {'hour' : hours, 'min' : minutes, 'sec': seconds}; |
| 159 | + print " Total File Size: ", fin.tell(), "-->", fout.tell() |
| 160 | + |
| 161 | +fout.close(); |
| 162 | +fin.close(); |
Property changes on: trunk/tools/editsyntax/ConvertFromEditSyntax.py |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 163 | + native |
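Finally, the generator/restorer pair at the core of EditSyntax.py can be sanity-checked in isolation by round-tripping two arbitrary texts; the strings below are made up for illustration.

    import EditSyntax

    old_text = "alpha\nbravo\ncharlie"
    new_text = "alpha\nbravo edited\ncharlie\ndelta"

    # differenceGenerator builds the internal changes structure (line numbers are
    # 1-indexed after addOne); differenceRestorer replays it against the old text
    # and returns the new text. differenceRestorer converts the structure back to
    # 0-indexed values as a side effect, so apply a given changes list only once.
    changes = EditSyntax.differenceGenerator(old_text, new_text)
    restored = EditSyntax.differenceRestorer(old_text, changes)

    assert restored == new_text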