Index: trunk/tools/wsor/diffs/revision_differ.py |
— | — | @@ -1,7 +1,24 @@ |
2 | 2 | #!/usr/local/bin/pypy |
3 | | - |
4 | | -import logging,traceback |
5 | | -import sys, re |
| 3 | +################################################################################ |
| 4 | +# Revision Differ |
| 5 | +# |
| 6 | +# This script was written to be a streaming mapper for wikihadoop |
| 7 | +# (see https://github.com/whym/wikihadoop). By default, this script runs under |
| 8 | +# pypy (much faster), but it can also be run under CPython 2.7+. |
| 9 | +# |
| 10 | +# Required to run this script are |
| 11 | +# - diff_match_patch.py (provided) |
| 12 | +# - xml_simulator.py (provided) |
| 13 | +# - wikimedia-utilities (https://bitbucket.org/halfak/wikimedia-utilities) |
| 14 | +# |
| 15 | +# Author: Aaron Halfaker (aaron.halfaker@gmail.com) |
| 16 | +# |
| 17 | +# This software is licensed as GPLv2 (http://www.gnu.org/licenses/gpl-2.0.html) and
| 18 | +# is provided WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| 19 | +# implied. |
| 20 | +# |
| 21 | +################################################################################ |
| 22 | +import logging, traceback, sys, re |
6 | 23 | from StringIO import StringIO |
7 | 24 | |
8 | 25 | from diff_match_patch import diff_match_patch |
— | — | @@ -12,12 +29,12 @@ |
13 | 30 | |
14 | 31 | def tokenize(content): |
15 | 32 | return re.findall( |
16 | | - r"[\w]+" + #Word |
17 | | - r"|\[\[" + #Opening internal link |
18 | | - r"|\]\]" + #Closing internal link |
19 | | - r"|\{\{" + #Opening template |
| 33 | + r"[\w]+" + #Word |
| 34 | + r"|\[\[" + #Opening internal link |
| 35 | + r"|\]\]" + #Closing internal link |
| 36 | + r"|\{\{" + #Opening template |
20 | 37 | r"|\}\}" + #Closing template |
21 | | - r"|\{\{\{" + #Opening template var |
| 38 | + r"|\{\{\{" + #Opening template var |
22 | 39 | r"|\}\}\}" + #Closing template var |
23 | 40 | r"|\n+" + #Line breaks |
24 | 41 | r"| +" + #Spaces |
— | — | @@ -29,8 +46,8 @@ |
30 | 47 | r"|\|\}" + #Closing table |
31 | 48 | r"|\|\-" + #Table row |
32 | 49 | r"|.", #Misc character |
33 | | - content |
34 | | - ) |
| 50 | + content |
| 51 | + ) |
35 | 52 | |
36 | 53 | def hashTokens(tokens, hash2Token=[], token2Hash={}): |
37 | 54 | hashBuffer = StringIO() |
— | — | @@ -56,15 +73,15 @@ |
57 | 74 | |
58 | 75 | dmp = diff_match_patch() |
59 | 76 | |
60 | | - diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
61 | | - |
62 | | - position = 0 |
63 | | - for (ar,hashes) in diffs: |
64 | | - content = unhash(hashes,h2t,sep=sep) |
65 | | - if ar in report: |
66 | | - yield position, ar, content |
67 | | - |
68 | | - if ar != -1: position += len(content) |
| 77 | + diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
| 78 | + |
| 79 | + position = 0 |
| 80 | + for (ar,hashes) in diffs: |
| 81 | + content = unhash(hashes,h2t,sep=sep) |
| 82 | + if ar in report: |
| 83 | + yield position, ar, content |
| 84 | + |
| 85 | + if ar != -1: position += len(content) |
69 | 86 | |
70 | 87 | |
71 | 88 | metaXML = """ |
— | — | @@ -100,6 +117,8 @@ |
101 | 118 | </namespaces> |
102 | 119 | </siteinfo> |
103 | 120 | """ |
| 121 | + |
| 122 | + |
104 | 123 | xmlSim = RecordingFileWrapper(sys.stdin, pre=metaXML, post='</mediawiki>') |
105 | 124 | |
106 | 125 | try: |
— | — | @@ -113,7 +132,9 @@ |
114 | 133 | sys.stderr.write('Processing: %s - %s\n' % (page.getId(), page.getTitle().encode('UTF-8'))) |
115 | 134 | try: |
116 | 135 | lastRev = None |
| 136 | + currRevId = None |
117 | 137 | for revision in page.readRevisions(): |
| 138 | + currRevId = revision.getId() |
118 | 139 | if lastRev == None: |
119 | 140 | lastRev = revision |
120 | 141 | else: |
— | — | @@ -135,16 +156,8 @@ |
136 | 157 | row.append(":".join(repr(v) for v in d)) |
137 | 158 | |
138 | 159 | print("\t".join(row)) |
| 160 | + sys.stderr.write('reporter:counter:SkippingTaskCounters,MapProcessedRecords,1\n') |
139 | 161 | |
140 | 162 | except Exception as e: |
141 | | - sys.stderr.write('%s' % e) |
142 | | - #fh.write('%s' % e) |
143 | | - #logging.error( |
144 | | - # "Failed to process page %s:%s - %s" % ( |
145 | | - # page.getId(), |
146 | | - # page.getTitle(), |
147 | | - # e |
148 | | - # )) |
149 | | - #logging.error(traceback.print_exc()) |
150 | | -#fh.close() |
151 | | -#sys.exit(0) |
| 163 | + sys.stderr.write('%s - while processing revId=%s\n' % (e, currRevId)) |
| 164 | + traceback.print_exc(file=sys.stderr) |
Index: trunk/tools/wsor/diffs/diff_match_patch.py |
— | — | @@ -1,5 +1,3 @@ |
2 | | -#!/usr/bin/env python |
3 | | - |
4 | 2 | """Diff Match and Patch |
5 | 3 | |
6 | 4 | Copyright 2006 Google Inc. |
Index: trunk/tools/wsor/diffs/README.txt |
— | — | @@ -0,0 +1,18 @@ |
| 2 | +Revision Differ |
| 3 | + |
| 4 | +This script was written to be a streaming mapper for wikihadoop |
| 5 | +(see https://github.com/whym/wikihadoop). By default, this script runs under |
| 6 | +pypy (much faster), but it can also be run under CPython 2.7+. |
| 7 | + |
| 8 | + |
| 9 | +Required to run this script are |
| 10 | + - revision_differ.py (provided) |
| 11 | + - diff_match_patch.py (provided) |
| 12 | + - xml_simulator.py (provided) |
| 13 | + - wikimedia-utilities (https://bitbucket.org/halfak/wikimedia-utilities) |
| 14 | + |
| 15 | +Author: Aaron Halfaker (aaron.halfaker@gmail.com) |
| 16 | + |
| 17 | +This software is licensed as GPLv2 (http://www.gnu.org/licenses/gpl-2.0.html) and
| 18 | +is provided WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| 19 | +implied. |