r107076 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107075‎ | r107076 | r107077 >
Date:17:13, 22 December 2011
Author:halfak
Status:deferred
Tags:tools 
Comment:
message_postings.py script working and documented
Modified paths:
  • /trunk/tools/wsor/message_templates/message_postings.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/message_postings.py
@@ -1,7 +1,12 @@
2 -import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time
3 -import urllib, urllib2
 2+import sys, argparse, os
 3+import logging, types, re
 4+import time, datetime
 5+import MySQLdb, MySQLdb.cursors
 6+import urllib, urllib2, json, htmlentitydefs
47 import wmf
58
 9+class MissingRevError(Exception):pass
 10+
611 def encode(v):
712 if v == None: return "\N"
813
@@ -10,18 +15,46 @@
1116
1217 return str(v).encode("string-escape")
1318
14 -# | year | month | day | hour | minute | second |
15 -MW_DATE = re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]")
 19+def emit(rev):
 20+
 21+ print(
 22+ "\t".join(
 23+ encode(rev[c]) for c in [
 24+ 'rev_id',
 25+ 'rev_timestamp',
 26+ 'poster_id',
 27+ 'poster_name',
 28+ 'recipient_name',
 29+ 'message_match'
 30+ ]
 31+ )
 32+ )
1633
 34+
 35+# MediaWiki Date format
 36+#
 37+# | year | month | day | hour | minute | second |
 38+MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
1739 def mwDate(string):
1840 if MW_DATE.match(string) == None:
19 - raise ValueError("%s is not a valid date. Expected YYMMDDHHmmSS" % string)
 41+ raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string)
2042 else:
2143 return string
2244
2345 def main():
2446 parser = argparse.ArgumentParser(
25 - description='Gathers template message postings based on comment and diff matching regular expressions.'
 47+ description="""
 48+ Gathers experimental message postings from user_talk messages.
 49+ """,
 50+ epilog="""
 51+ python message_postings.py
 52+ -h db42
 53+ --start=20111222000000
 54+ --end=20111223000000
 55+ --comment="\(\[\[WP:HG\|HG\]\]\)"
 56+ --message="Template:uw-vandalism1"
 57+ """,
 58+ conflict_handler="resolve"
2659 )
2760 parser.add_argument(
2861 '-c', '--cnf',
@@ -31,7 +64,7 @@
3265 default=os.path.expanduser("~/.my.cnf")
3366 )
3467 parser.add_argument(
35 - '-s', '--host',
 68+ '-h', '--host',
3669 type=str,
3770 help='the database host to connect to (defaults to localhost)',
3871 default="localhost"
@@ -45,29 +78,36 @@
4679 parser.add_argument(
4780 '-a', '--api_uri',
4881 type=str,
49 - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
 82+ help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
5083 default="http://en.wikipedia.org/w/api.php"
5184 )
5285 parser.add_argument(
53 - '--before',
54 - type=str,
55 - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
56 - default="http://en.wikipedia.org/w/api.php"
 86+ '--start',
 87+ type=mwDate,
 88+ help='the start of the experimental period. (Required)',
 89+ required=True
5790 )
5891 parser.add_argument(
59 - 'after',
60 - type=mwDate,
61 - help='regular expression to match against message content'
 92+ '--end',
 93+ type=mwDate,
 94+ help='the end of the experimental period. (defaults to NOW())',
 95+ default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
6296 )
6397 parser.add_argument(
64 - 'comment',
 98+ '--user_name',
 99+ type=str,
 100+ help='the user_name to further filter postings by (useful for tracking bots)'
 101+ )
 102+ parser.add_argument(
 103+ '--comment',
65104 type=re.compile,
66105 help='regular expression to match against message posting comment'
67106 )
68107 parser.add_argument(
69 - 'message',
 108+ '--message',
70109 type=re.compile,
71 - help='regular expression to match against message content'
 110+ help='regular expression to match against message content (required)',
 111+ required=True
72112 )
73113 args = parser.parse_args()
74114
@@ -78,6 +118,8 @@
79119 format='%(asctime)s %(levelname)-8s %(message)s',
80120 datefmt='%b-%d %H:%M:%S'
81121 )
 122+ logging.debug("Comment pattern is %r." % args.comment.pattern)
 123+ logging.debug("Message pattern is %r." % args.message.pattern)
82124
83125 logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
84126 db = Database(
@@ -85,6 +127,27 @@
86128 db=args.db,
87129 read_default_file=args.cnf
88130 )
 131+
 132+ logging.info("Connecting to API @ %s." % args.api_uri)
 133+ api = WPAPI(args.api_uri)
 134+
 135+ logging.info("Querying for matching revisions:")
 136+ count = {"matched": 0, "missed": 0}
 137+ for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
 138+ message = api.getAdded(rev['rev_id'])
 139+ match = args.message.search(message)
 140+ if match != None:
 141+ rev['message_match'] = match.group(0)
 142+
 143+ emit(rev)
 144+ LOGGING_STREAM.write("|")
 145+ count['matched'] += 1
 146+ else:
 147+ LOGGING_STREAM.write("o")
 148+ count['missed'] += 1
 149+
 150+ LOGGING_STREAM.write("\n")
 151+ logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count)
89152
90153
91154
@@ -95,18 +158,36 @@
96159 self.kwargs = kwargs
97160 self.conn = MySQLdb.connect(*args, **kwargs)
98161
99 - def getPostings(self, afterDate, commentPattern):
 162+ def getPostings(self, start, end, userName=None, commentRE=None):
 163+ if (userName, commentRE) == (None, None):
 164+ raise TypeError("Must specify at at least one of userName or commentRE.")
 165+
100166 cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 167+ query = """
 168+ SELECT
 169+ r.rev_id,
 170+ r.rev_timestamp,
 171+ r.rev_comment,
 172+ r.rev_user AS poster_id,
 173+ r.rev_user_text AS poster_name,
 174+ REPLACE(p.page_title, "_", " ") AS recipient_name
 175+ FROM revision r
 176+ INNER JOIN page p ON r.rev_page = p.page_id
 177+ WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
 178+ AND page_namespace = 3
 179+ """
 180+ if userName != None:
 181+ query += "AND rev_user_text = %(user_name)s\n"
 182+ if commentRE != None:
 183+ query += "AND rev_comment REGEXP %(comment_pattern)s\n"
 184+
101185 cursor.execute(
102 - """
103 - SELECT * FROM
104 - FROM revision
105 - WHERE rev_timestamp > %(afterDate)s
106 - AND rev_comment REGEXP %(commentPattern)s
107 - """,
 186+ query,
108187 {
109 - 'afterDate': afterDate,
110 - 'commentPattern': commentPattern
 188+ 'start': start,
 189+ 'end': end,
 190+ 'user_name': userName,
 191+ 'comment_pattern': commentRE.pattern
111192 }
112193 )
113194
@@ -116,33 +197,65 @@
117198
118199
119200 class WPAPI:
 201+ DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
120202
121203 def __init__(self, uri):
122204 self.uri = uri
123205
124 - def getDiff(self, revId):
 206+ def getDiff(self, revId, retries=10):
 207+ attempt = 0
 208+ while attempt < retries:
 209+ try:
 210+ response = urllib2.urlopen(
 211+ self.uri,
 212+ urllib.urlencode({
 213+ 'action': 'query',
 214+ 'prop': 'revisions',
 215+ 'revids': revId,
 216+ 'rvprop': 'ids',
 217+ 'rvdiffto': 'prev',
 218+ 'format': 'json'
 219+ })
 220+ )
 221+ result = json.load(response)
 222+ return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
 223+ except urllib2.HTTPError as e:
 224+ time.sleep(attempt*2)
 225+ attempt += 1
 226+
 227+
 228+
 229+ def getAdded(self, revId):
 230+ diff = self.getDiff(revId)
125231
126 - response = urllib2.urlopen(
127 - self.uri,
128 - data=urllib.urlencode({
129 - 'action': "query",
130 - 'prop': "revisions",
131 - 'revids': revId,
132 - 'rvprop': "diff",
133 - 'format': "json"
134 - })
 232+ return self.unescape(
 233+ "\n".join(
 234+ match.group(1)
 235+ for match in WPAPI.DIFF_ADD_RE.finditer(diff)
 236+ )
135237 )
136238
137 - js = json.load(response)
138 -
139 -
140 -
141 - try:
142 - if 'badrevids' in js['query']:
143 - raise KeyError(revId)
 239+ def unescape(self, text):
 240+ def fixup(m):
 241+ text = m.group(0)
 242+ if text[:2] == "&#":
 243+ # character reference
 244+ try:
 245+ if text[:3] == "&#x":
 246+ return unichr(int(text[3:-1], 16))
 247+ else:
 248+ return unichr(int(text[2:-1]))
 249+ except ValueError:
 250+ pass
144251 else:
145 - return js['query']['pages'].values()[0]['revisions'][0]['diff']['*']
146 - except KeyError:
147 -
 252+ # named entity
 253+ try:
 254+ text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 255+ except KeyError:
 256+ pass
 257+ return text # leave as is
 258+ return re.sub("&#?\w+;", fixup, text)
148259
149260
 261+if __name__ == "__main__":
 262+ main()

Status & tagging log