r108182 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r108181 | r108182 | r108183 >
Date:23:18, 5 January 2012
Author:halfak
Status:deferred
Tags:
Comment:
Bug fixes and a workaround for the Wikipedia API. Also added a little bit of R.
Modified paths:
  • /trunk/tools/wsor/message_templates/R (added) (history)
  • /trunk/tools/wsor/message_templates/R/twinkle.R (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/__init__.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/talk.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/warnings.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/metrics.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/postings.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/util/mw_api.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/R/twinkle.R
@@ -0,0 +1 @@
 2+
Index: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py
@@ -50,23 +50,22 @@
5151 def getProcessedRevs(self, username):
5252 return self.processRevs(self.getUserPageRevisions(username))
5353
54 - def getUserPageRevisions(self, username, rvcontinue=None):
 54+ def getUserPageRevisions(self, username, rvcontinue={}):
5555 js = self.api.request(
5656 action="query",
5757 prop="revisions",
58 - titles="User_talk:%s" % username,
 58+ titles="User_talk:%s" % username.encode('utf-8'),
5959 rvprop="ids|timestamp|content",
6060 rvdir="newer",
6161 rvlimit=50,
62 - rvcontinue=rvcontinue
 62+ **rvcontinue
6363 )
6464
65 - for rev in js['query']['pages'].values()[0]['revisions']:
66 - rev['timestamp']
 65+ for rev in js['query']['pages'].values()[0].get('revisions', []):
6766 yield rev
6867
6968 if 'query-continue' in js:
70 - for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']['rvstartid']):
 69+ for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']):
7170 yield rev
7271
7372
@@ -76,8 +75,10 @@
7776 previousLines = []
7877 for rev in revs:
7978 lines = rev.get('*', "").split("\n")
80 - del rev['*']
8179
 80+ try: del rev['*']
 81+ except KeyError: pass
 82+
8283 added = []
8384 sm = difflib.SequenceMatcher(None, previousLines, lines)
8485 for tag, i1, i2, j1, j2 in sm.get_opcodes():
@@ -94,4 +95,11 @@
9596
9697
9798
98 -
 99+
 100+def test():
 101+ from umetrics.generators import Warnings
 102+ from umetrics.util import MWAPI
 103+ w = Warnings(None, MWAPI('http://en.wikipedia.org/w/api.php'))
 104+ for rev in w.getProcessedRevs('EpochFai'):
 105+ print(rev[id])
 106+
Index: trunk/tools/wsor/message_templates/umetrics/generators/talk.py
@@ -22,7 +22,7 @@
2323 cursor = self.conn.cursor()
2424 cursor.execute("""
2525 SELECT
26 - IF(rev_timestamp > %(timestamp)s, "after", "before") as whense,
 26+ IF(rev_timestamp > %(timestamp)s, "after", "before") as whence,
2727 COUNT(*) as count,
2828 MAX(rev_timestamp) as last,
2929 MIN(rev_timestamp) as first
@@ -36,14 +36,14 @@
3737 """,
3838 {
3939 'timestamp': timestamp,
40 - 'page_title': username.encode('utf-8').replace(" ", "_"),
 40+ 'page_title': username.replace(" ", "_").encode('utf-8'),
4141 'username': username.encode('utf-8')
4242 }
4343 )
4444 for row in cursor:
45 - rowValues['other_talk_%(whence)s'] = row['count']
46 - rowValues['first_other_talk_%(whence)s'] = row['first']
47 - rowValues['last_other_talk_%(whence)s'] = row['last']
 45+ rowValues['other_talk_%(whence)s' % row] = row['count']
 46+ rowValues['first_other_talk_%(whence)s' % row] = row['first']
 47+ rowValues['last_other_talk_%(whence)s' % row] = row['last']
4848
4949 rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0)
5050 rowValues['other_talk_after'] = rowValues.get('other_talk_after', 0)
Index: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py
@@ -17,7 +17,7 @@
1818 self.generators = list(generators)
1919
2020 def headers(self):
21 - row = ['username', 'timestamp']
 21+ row = ['recipient_name', 'timestamp']
2222 for generator in self.generators:
2323 row.extend(generator.headers())
2424
Index: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py
@@ -9,10 +9,10 @@
1010 def headers(self):
1111 return itertools.chain(*[
1212 [
 13+ 'ns_%s_revisions_before' % ns,
 14+ 'ns_%s_revisions_after' % ns,
1315 'ns_%s_revisions_deleted_before' % ns,
14 - 'ns_%s_revisions_deleted_after' % ns,
15 - 'ns_%s_revisions_not_deleted_before' % ns,
16 - 'ns_%s_revisions_not_deleted_after' % ns
 16+ 'ns_%s_revisions_deleted_after' % ns
1717 ]
1818 for ns in itertools.chain(range(0,16), [100, 101, 108, 109])
1919 ])
@@ -24,24 +24,24 @@
2525 cursor.execute("""
2626 (
2727 SELECT
28 - False as deleted,
2928 page_namespace as ns,
 29+ IF(rev_timestamp < %(timestamp)s, "before", "after") as whence,
 30+ "" as deleted,
3031 count(*) as revisions
3132 FROM enwiki.revision
3233 INNER JOIN enwiki.page ON rev_page = page_id
33 - WHERE rev_timestamp <= %(timestamp)s
34 - AND rev_user_text = %(username)s
35 - GROUP BY page_namespace
 34+ WHERE rev_user_text = %(username)s
 35+ GROUP BY 1, 2
3636 )
3737 UNION (
3838 SELECT
39 - True as deleted,
4039 ar_namespace as ns,
 40+ IF(ar_timestamp < %(timestamp)s, "before", "after") as whence,
 41+ "_deleted" as deleted,
4142 count(*) as revisions
4243 FROM enwiki.archive
43 - WHERE ar_timestamp <= %(timestamp)s
44 - AND ar_user_text = %(username)s
45 - GROUP BY ar_namespace
 44+ WHERE ar_user_text = %(username)s
 45+ GROUP BY 1, 2
4646 )""",
4747 {
4848 'timestamp': timestamp,
@@ -49,11 +49,6 @@
5050 }
5151 )
5252 for row in cursor:
53 - if(row['deleted']):
54 - deleted = "deleted"
55 - else:
56 - deleted = "not_deleted"
 53+ rowData['ns_%(ns)s_revisions%(deleted)s_%(whence)s' % row] = row['revisions']
5754
58 - rowData['ns_%s_before_revisions_%s' % (row['ns'], deleted)] = row['revisions']
59 -
6055 return [rowData.get(c, 0) for c in self.headers()]
Index: trunk/tools/wsor/message_templates/umetrics/metrics.py
@@ -1,6 +1,7 @@
22 import sys, argparse, os
33 import logging, types
44 import MySQLdb, MySQLdb.cursors
 5+import traceback
56
67 from .generators import GENERATORS, Metrics
78 from .util import MWAPI, MWAPIError
@@ -48,6 +49,21 @@
4950 default="http://en.wikipedia.org/w/api.php"
5051 )
5152 parser.add_argument(
 53+ '-o', '--old',
 54+ type=lambda fn: open(fn, 'r'),
 55+ help='a previous output file to read from. When provided, this script will skip all of the complete username/timestamp pairs found in the file.',
 56+ )
 57+ parser.add_argument(
 58+ '--debug',
 59+ action="store_true",
 60+ default=False
 61+ )
 62+ parser.add_argument(
 63+ '--headers',
 64+ action="store_true",
 65+ default=False
 66+ )
 67+ parser.add_argument(
5268 'generator',
5369 type=lambda g: GENERATORS[g],
5470 nargs="+",
@@ -56,8 +72,10 @@
5773 args = parser.parse_args()
5874
5975 LOGGING_STREAM = sys.stderr
 76+ if args.debug: logLevel = logging.DEBUG
 77+ else: logLevel = logging.INFO
6078 logging.basicConfig(
61 - level=logging.DEBUG,
 79+ level=logLevel,
6280 stream=LOGGING_STREAM,
6381 format='%(asctime)s %(levelname)-8s %(message)s',
6482 datefmt='%b-%d %H:%M:%S'
@@ -65,7 +83,7 @@
6684
6785 if sys.stdin.isatty():
6886 logging.error("No data piped to standard in!")
69 - return
 87+ return 1
7088
7189
7290 logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
@@ -78,23 +96,49 @@
7997
8098 logging.info("Loading generators...")
8199 metrics = Metrics(g(conn, args.api) for g in args.generator)
82 - print("\t".join(encode(h) for h in metrics.headers()))
83100
84101
 102+ oldPairs = set()
 103+ if args.old != None:
 104+ logging.info("Loading in old data file...")
 105+ for line in args.old:
 106+ username, timestamp = line.strip().split("\t")[0:2]
 107+ username = unicode(username.decode('string-escape'), 'utf-8')
 108+
 109+ oldPairs.add((username, timestamp))
 110+ LOGGING_STREAM.write(".")
 111+
 112+ LOGGING_STREAM.write("\n")
 113+
 114+ else:
 115+ if args.headers:
 116+ print("\t".join(encode(h) for h in metrics.headers()))
 117+
 118+
85119 logging.info("Processing users...")
86120 for line in sys.stdin:
87 - username, timestamp = line.strip().split("\t")[0:2]
88 - username = unicode(username, 'utf-8')
 121+ try:
 122+ username, timestamp = line.strip().split("\t")[0:2]
 123+ username = unicode(username.decode('string-escape'), 'utf-8')
 124+
 125+ if (username, timestamp) in oldPairs:
 126+ LOGGING_STREAM.write("s")
 127+ else:
 128+ logging.debug("\t%s at %s:" % (username, timestamp))
 129+ print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
 130+ sys.stdout.flush()
 131+ LOGGING_STREAM.write(".")
 132+ except Exception as e:
 133+ logging.error("An error occurred while processing %s at %s." % (username, timestamp))
 134+ LOGGING_STREAM.write(traceback.format_exc())
 135+ return 1
89136
90 - logging.debug("\t%s at %s:" % (username, timestamp))
91 - print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
92 - LOGGING_STREAM.write("o")
93 -
94137 LOGGING_STREAM.write("\n")
 138+ return 0
95139
96140
97141
98142
99143
100144 if __name__ == "__main__":
101 - main()
 145+ sys.exit(main())
Index: trunk/tools/wsor/message_templates/umetrics/postings.py
@@ -13,7 +13,7 @@
1414 - Timestamp - The time at which the posting was made
1515 - Revision ID - The identifier of the revision matching the posting
1616 - Poster ID - The identifier of the user who made the posting
17 - - Poster name - The name of the user who make the posting
 17+ - Poster name - The name of the user who made the posting
1818 - Message match - The portion of the message posting that was matched by the regular expression.
1919
2020 :Example:
@@ -36,19 +36,18 @@
3737
3838 return str(v).encode("string-escape")
3939
 40+HEADERS = [
 41+ 'recipient_name',
 42+ 'timestamp',
 43+ 'rev_id',
 44+ 'poster_id',
 45+ 'poster_name',
 46+ 'message_match'
 47+]
 48+
4049 def emit(rev):
41 -
4250 print(
43 - "\t".join(
44 - encode(rev[c]) for c in [
45 - 'recipient_name',
46 - 'rev_timestamp',
47 - 'rev_id',
48 - 'poster_id',
49 - 'poster_name',
50 - 'message_match'
51 - ]
52 - )
 51+ "\t".join(encode(rev[h]) for h in HEADERS)
5352 )
5453
5554
@@ -130,11 +129,26 @@
131130 help='regular expression to match against message content (required)',
132131 required=True
133132 )
 133+ parser.add_argument(
 134+ '--header',
 135+ action="store_true",
 136+ default=False
 137+ )
 138+ parser.add_argument(
 139+ '--debug',
 140+ action="store_true",
 141+ default=False
 142+ )
134143 args = parser.parse_args()
135144
136145 LOGGING_STREAM = sys.stderr
 146+ if args.debug:
 147+ logLevel = logging.DEBUG
 148+ else:
 149+ logLevel = logging.INFO
 150+
137151 logging.basicConfig(
138 - level=logging.DEBUG,
 152+ level=logLevel,
139153 stream=LOGGING_STREAM,
140154 format='%(asctime)s %(levelname)-8s %(message)s',
141155 datefmt='%b-%d %H:%M:%S'
@@ -152,10 +166,25 @@
153167 logging.info("Connecting to API @ %s." % args.api_uri)
154168 api = WPAPI(args.api_uri)
155169
 170+ if args.header:
 171+ print("\t".join(HEADERS))
 172+
156173 logging.info("Querying for matching revisions:")
 174+ revs = []
 175+ count = 0
 176+ for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
 177+ count += 1
 178+ revs.append(rev)
 179+ if count % 100 == 0: LOGGING_STREAM.write("|")
 180+
 181+ LOGGING_STREAM.write("\n")
 182+
 183+ logging.info("Checking for message templates")
157184 count = {"matched": 0, "missed": 0}
158 - for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
 185+ for rev in revs:
 186+ logging.debug("Matching revision %(rev_id)s peformed by %(poster_name)s @ %(rev_timestamp)s: %(rev_comment)s" % rev)
159187 message = api.getAdded(rev['rev_id'])
 188+
160189 match = args.message.search(message)
161190 if match != None:
162191 rev['message_match'] = match.group(0)
@@ -183,7 +212,7 @@
184213 if (userName, commentRE) == (None, None):
185214 raise TypeError("Must specify at at least one of userName or commentRE.")
186215
187 - cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 216+ cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
188217 query = """
189218 SELECT
190219 r.rev_id,
@@ -212,8 +241,7 @@
213242 }
214243 )
215244
216 - for row in cursor:
217 - yield row
 245+ return cursor
218246
219247
220248
@@ -223,7 +251,7 @@
224252 def __init__(self, uri):
225253 self.uri = uri
226254
227 - def getDiff(self, revId, retries=10):
 255+ def getDiff(self, revId, retries=20):
228256 attempt = 0
229257 while attempt < retries:
230258 try:
@@ -239,11 +267,17 @@
240268 })
241269 )
242270 result = json.load(response)
243 - return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
 271+
 272+ diff = result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
 273+ if type(diff) not in types.StringTypes: diff = ''
 274+
 275+ return diff
244276 except urllib2.HTTPError as e:
245 - time.sleep(attempt*2)
 277+ time.sleep(2**attempt)
246278 attempt += 1
 279+ logging.error("HTTP Error: %s. Retry #%s in %s seconds..." % (e, attempt, 2**attempt))
247280
 281+
248282
249283
250284 def getAdded(self, revId):
Index: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py
@@ -36,17 +36,18 @@
3737
3838 try:
3939 response = urllib2.urlopen(request)
 40+
 41+ self.cookies.extract_cookies(response, request)
 42+
 43+ js = json.load(response)
 44+
 45+ if 'error' in js:
 46+ raise MWAPIError(js['error']['code'], js['error']['info'])
 47+ else:
 48+ return js
 49+
4050 except urllib2.HTTPError:
4151 #wait and try again
4252 time.sleep(2**retry)
4353 self.request(retry=retry+1, **kwargs)
44 -
45 - self.cookies.extract_cookies(response, request)
46 -
47 - js = json.load(response)
48 -
49 - if 'error' in js:
50 - raise MWAPIError(js['error']['code'], js['error']['info'])
51 - else:
52 - return js
5354

Status & tagging log