r94718 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r94717‎ | r94718 | r94719 >
Date:00:55, 17 August 2011
Author:halfak
Status:deferred
Tags:
Comment:
fixed diff files. Added docs.
Modified paths:
  • /trunk/tools/wsor/diffs/README.txt (added) (history)
  • /trunk/tools/wsor/diffs/diff_match_patch.py (modified) (history)
  • /trunk/tools/wsor/diffs/revision_differ.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/diffs/revision_differ.py
@@ -1,7 +1,24 @@
22 #!/usr/local/bin/pypy
3 -
4 -import logging,traceback
5 -import sys, re
 3+################################################################################
 4+# Revision Differ
 5+#
 6+# This script was written to be a streaming mapper for wikihadoop
 7+# (see https://github.com/whym/wikihadoop). By default, this script runs under
 8+# pypy (much faster), but it can also be run under CPython 2.7+.
 9+#
 10+# Required to run this script are
 11+# - diff_match_patch.py (provided)
 12+# - xml_simulator.py (provided)
 13+# - wikimedia-utilities (https://bitbucket.org/halfak/wikimedia-utilities)
 14+#
 15+# Author: Aaron Halfaker (aaron.halfaker@gmail.com)
 16+#
 17+# This software is licensed under GPLv2
 18+# (http://www.gnu.org/licenses/gpl-2.0.html) and is provided WITHOUT
 19+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20+#
 21+################################################################################
 22+import logging, traceback, sys, re
623 from StringIO import StringIO
724
825 from diff_match_patch import diff_match_patch
@@ -12,12 +29,12 @@
1330
1431 def tokenize(content):
1532 return re.findall(
16 - r"[\w]+" + #Word
17 - r"|\[\[" + #Opening internal link
18 - r"|\]\]" + #Closing internal link
19 - r"|\{\{" + #Opening template
 33+ r"[\w]+" + #Word
 34+ r"|\[\[" + #Opening internal link
 35+ r"|\]\]" + #Closing internal link
 36+ r"|\{\{" + #Opening template
2037 r"|\}\}" + #Closing template
21 - r"|\{\{\{" + #Opening template var
 38+ r"|\{\{\{" + #Opening template var
2239 r"|\}\}\}" + #Closing template var
2340 r"|\n+" + #Line breaks
2441 r"| +" + #Spaces
@@ -29,8 +46,8 @@
3047 r"|\|\}" + #Closing table
3148 r"|\|\-" + #Table row
3249 r"|.", #Misc character
33 - content
34 - )
 50+ content
 51+ )
3552
3653 def hashTokens(tokens, hash2Token=[], token2Hash={}):
3754 hashBuffer = StringIO()
@@ -56,15 +73,15 @@
5774
5875 dmp = diff_match_patch()
5976
60 - diffs = dmp.diff_main(hashes1, hashes2, checklines=False)
61 -
62 - position = 0
63 - for (ar,hashes) in diffs:
64 - content = unhash(hashes,h2t,sep=sep)
65 - if ar in report:
66 - yield position, ar, content
67 -
68 - if ar != -1: position += len(content)
 77+ diffs = dmp.diff_main(hashes1, hashes2, checklines=False)
 78+
 79+ position = 0
 80+ for (ar,hashes) in diffs:
 81+ content = unhash(hashes,h2t,sep=sep)
 82+ if ar in report:
 83+ yield position, ar, content
 84+
 85+ if ar != -1: position += len(content)
6986
7087
7188 metaXML = """
@@ -100,6 +117,8 @@
101118 </namespaces>
102119 </siteinfo>
103120 """
 121+
 122+
104123 xmlSim = RecordingFileWrapper(sys.stdin, pre=metaXML, post='</mediawiki>')
105124
106125 try:
@@ -113,7 +132,9 @@
114133 sys.stderr.write('Processing: %s - %s\n' % (page.getId(), page.getTitle().encode('UTF-8')))
115134 try:
116135 lastRev = None
 136+ currRevId = None
117137 for revision in page.readRevisions():
 138+ currRevId = revision.getId()
118139 if lastRev == None:
119140 lastRev = revision
120141 else:
@@ -135,16 +156,8 @@
136157 row.append(":".join(repr(v) for v in d))
137158
138159 print("\t".join(row))
 160+ sys.stderr.write('reporter:counter:SkippingTaskCounters,MapProcessedRecords,1\n')
139161
140162 except Exception as e:
141 - sys.stderr.write('%s' % e)
142 - #fh.write('%s' % e)
143 - #logging.error(
144 - # "Failed to process page %s:%s - %s" % (
145 - # page.getId(),
146 - # page.getTitle(),
147 - # e
148 - # ))
149 - #logging.error(traceback.print_exc())
150 -#fh.close()
151 -#sys.exit(0)
 163+ sys.stderr.write('%s - while processing revId=%s\n' % (e, currRevId))
 164+ traceback.print_exc(file=sys.stderr)
Index: trunk/tools/wsor/diffs/diff_match_patch.py
@@ -1,5 +1,3 @@
2 -#!/usr/bin/env python
3 -
42 """Diff Match and Patch
53
64 Copyright 2006 Google Inc.
Index: trunk/tools/wsor/diffs/README.txt
@@ -0,0 +1,18 @@
 2+Revision Differ
 3+
 4+This script was written to be a streaming mapper for wikihadoop
 5+(see https://github.com/whym/wikihadoop). By default, this script runs under
 6+pypy (much faster), but it can also be run under CPython 2.7+.
 7+
 8+
 9+Required to run this script are
 10+ - revision_differ.py (provided)
 11+ - diff_match_patch.py (provided)
 12+ - xml_simulator.py (provided)
 13+ - wikimedia-utilities (https://bitbucket.org/halfak/wikimedia-utilities)
 14+
 15+Author: Aaron Halfaker (aaron.halfaker@gmail.com)
 16+
 17+This software is licensed under GPLv2
 18+(http://www.gnu.org/licenses/gpl-2.0.html) and is provided WITHOUT
 19+WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Status & tagging log