Index: trunk/tools/wsor/message_templates/message_postings.py |
— | — | @@ -1,7 +1,12 @@ |
2 | | -import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
3 | | -import urllib, urllib2 |
| 2 | +import sys, argparse, os |
| 3 | +import logging, types, re |
| 4 | +import time, datetime |
| 5 | +import MySQLdb, MySQLdb.cursors |
| 6 | +import urllib, urllib2, json, htmlentitydefs |
4 | 7 | import wmf |
5 | 8 | |
| 9 | +class MissingRevError(Exception):pass |
| 10 | + |
6 | 11 | def encode(v): |
7 | 12 | if v == None: return "\N" |
8 | 13 | |
— | — | @@ -10,18 +15,46 @@ |
11 | 16 | |
12 | 17 | return str(v).encode("string-escape") |
13 | 18 | |
14 | | -# | year | month | day | hour | minute | second | |
15 | | -MW_DATE = re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]") |
| 19 | +def emit(rev): |
| 20 | + |
| 21 | + print( |
| 22 | + "\t".join( |
| 23 | + encode(rev[c]) for c in [ |
| 24 | + 'rev_id', |
| 25 | + 'rev_timestamp', |
| 26 | + 'poster_id', |
| 27 | + 'poster_name', |
| 28 | + 'recipient_name', |
| 29 | + 'message_match' |
| 30 | + ] |
| 31 | + ) |
| 32 | + ) |
16 | 33 | |
| 34 | + |
| 35 | +# MediaWiki Date format |
| 36 | +# |
| 37 | +# | year | month | day | hour | minute | second | |
| 38 | +MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") |
17 | 39 | def mwDate(string): |
18 | 40 | if MW_DATE.match(string) == None: |
19 | | - raise ValueError("%s is not a valid date. Expected YYMMDDHHmmSS" % string) |
| 41 | + raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) |
20 | 42 | else: |
21 | 43 | return string |
22 | 44 | |
23 | 45 | def main(): |
24 | 46 | parser = argparse.ArgumentParser( |
25 | | - description='Gathers template message postings based on comment and diff matching regular expressions.' |
| 47 | + description=""" |
| 48 | + Gathers experimental message postings from user_talk messages. |
| 49 | + """, |
| 50 | + epilog=""" |
| 51 | + python message_postings.py |
| 52 | + -h db42 |
| 53 | + --start=20111222000000 |
| 54 | + --end=20111223000000 |
| 55 | + --comment="\(\[\[WP:HG\|HG\]\]\)" |
| 56 | + --message="Template:uw-vandalism1" |
| 57 | + """, |
| 58 | + conflict_handler="resolve" |
26 | 59 | ) |
27 | 60 | parser.add_argument( |
28 | 61 | '-c', '--cnf', |
— | — | @@ -31,7 +64,7 @@ |
32 | 65 | default=os.path.expanduser("~/.my.cnf") |
33 | 66 | ) |
34 | 67 | parser.add_argument( |
35 | | - '-s', '--host', |
| 68 | + '-h', '--host', |
36 | 69 | type=str, |
37 | 70 | help='the database host to connect to (defaults to localhost)', |
38 | 71 | default="localhost" |
— | — | @@ -45,29 +78,36 @@ |
46 | 79 | parser.add_argument( |
47 | 80 | '-a', '--api_uri', |
48 | 81 | type=str, |
49 | | - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
| 82 | + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
50 | 83 | default="http://en.wikipedia.org/w/api.php" |
51 | 84 | ) |
52 | 85 | parser.add_argument( |
53 | | - '--before', |
54 | | - type=str, |
55 | | - help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
56 | | - default="http://en.wikipedia.org/w/api.php" |
| 86 | + '--start', |
| 87 | + type=mwDate, |
| 88 | + help='the start of the experimental period. (Required)', |
| 89 | + required=True |
57 | 90 | ) |
58 | 91 | parser.add_argument( |
59 | | - 'after', |
60 | | - type=mwDate, |
61 | | - help='regular expression to match against message content' |
| 92 | + '--end', |
| 93 | + type=mwDate, |
| 94 | + help='the end of the experimental period. (defaults to NOW())', |
| 95 | + default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") |
62 | 96 | ) |
63 | 97 | parser.add_argument( |
64 | | - 'comment', |
| 98 | + '--user_name', |
| 99 | + type=str, |
| 100 | + help='the user_name to further filter postings by (useful for tracking bots)' |
| 101 | + ) |
| 102 | + parser.add_argument( |
| 103 | + '--comment', |
65 | 104 | type=re.compile, |
66 | 105 | help='regular expression to match against message posting comment' |
67 | 106 | ) |
68 | 107 | parser.add_argument( |
69 | | - 'message', |
| 108 | + '--message', |
70 | 109 | type=re.compile, |
71 | | - help='regular expression to match against message content' |
| 110 | + help='regular expression to match against message content (required)', |
| 111 | + required=True |
72 | 112 | ) |
73 | 113 | args = parser.parse_args() |
74 | 114 | |
— | — | @@ -78,6 +118,8 @@ |
79 | 119 | format='%(asctime)s %(levelname)-8s %(message)s', |
80 | 120 | datefmt='%b-%d %H:%M:%S' |
81 | 121 | ) |
| 122 | + logging.debug("Comment pattern is %r." % args.comment.pattern) |
| 123 | + logging.debug("Message pattern is %r." % args.message.pattern) |
82 | 124 | |
83 | 125 | logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
84 | 126 | db = Database( |
— | — | @@ -85,6 +127,27 @@ |
86 | 128 | db=args.db, |
87 | 129 | read_default_file=args.cnf |
88 | 130 | ) |
| 131 | + |
| 132 | + logging.info("Connecting to API @ %s." % args.api_uri) |
| 133 | + api = WPAPI(args.api_uri) |
| 134 | + |
| 135 | + logging.info("Querying for matching revisions:") |
| 136 | + count = {"matched": 0, "missed": 0} |
| 137 | + for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): |
| 138 | + message = api.getAdded(rev['rev_id']) |
| 139 | + match = args.message.search(message) |
| 140 | + if match != None: |
| 141 | + rev['message_match'] = match.group(0) |
| 142 | + |
| 143 | + emit(rev) |
| 144 | + LOGGING_STREAM.write("|") |
| 145 | + count['matched'] += 1 |
| 146 | + else: |
| 147 | + LOGGING_STREAM.write("o") |
| 148 | + count['missed'] += 1 |
| 149 | + |
| 150 | + LOGGING_STREAM.write("\n") |
| 151 | + logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count) |
89 | 152 | |
90 | 153 | |
91 | 154 | |
— | — | @@ -95,18 +158,36 @@ |
96 | 159 | self.kwargs = kwargs |
97 | 160 | self.conn = MySQLdb.connect(*args, **kwargs) |
98 | 161 | |
99 | | - def getPostings(self, afterDate, commentPattern): |
| 162 | + def getPostings(self, start, end, userName=None, commentRE=None): |
| 163 | + if (userName, commentRE) == (None, None): |
| 164 | + raise TypeError("Must specify at at least one of userName or commentRE.") |
| 165 | + |
100 | 166 | cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
| 167 | + query = """ |
| 168 | + SELECT |
| 169 | + r.rev_id, |
| 170 | + r.rev_timestamp, |
| 171 | + r.rev_comment, |
| 172 | + r.rev_user AS poster_id, |
| 173 | + r.rev_user_text AS poster_name, |
| 174 | + REPLACE(p.page_title, "_", " ") AS recipient_name |
| 175 | + FROM revision r |
| 176 | + INNER JOIN page p ON r.rev_page = p.page_id |
| 177 | + WHERE rev_timestamp BETWEEN %(start)s AND %(end)s |
| 178 | + AND page_namespace = 3 |
| 179 | + """ |
| 180 | + if userName != None: |
| 181 | + query += "AND rev_user_text = %(user_name)s\n" |
| 182 | + if commentRE != None: |
| 183 | + query += "AND rev_comment REGEXP %(comment_pattern)s\n" |
| 184 | + |
101 | 185 | cursor.execute( |
102 | | - """ |
103 | | - SELECT * FROM |
104 | | - FROM revision |
105 | | - WHERE rev_timestamp > %(afterDate)s |
106 | | - AND rev_comment REGEXP %(commentPattern)s |
107 | | - """, |
| 186 | + query, |
108 | 187 | { |
109 | | - 'afterDate': afterDate, |
110 | | - 'commentPattern': commentPattern |
| 188 | + 'start': start, |
| 189 | + 'end': end, |
| 190 | + 'user_name': userName, |
| 191 | + 'comment_pattern': commentRE.pattern |
111 | 192 | } |
112 | 193 | ) |
113 | 194 | |
— | — | @@ -116,33 +197,65 @@ |
117 | 198 | |
118 | 199 | |
119 | 200 | class WPAPI: |
| 201 | + DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>') |
120 | 202 | |
121 | 203 | def __init__(self, uri): |
122 | 204 | self.uri = uri |
123 | 205 | |
124 | | - def getDiff(self, revId): |
| 206 | + def getDiff(self, revId, retries=10): |
| 207 | + attempt = 0 |
| 208 | + while attempt < retries: |
| 209 | + try: |
| 210 | + response = urllib2.urlopen( |
| 211 | + self.uri, |
| 212 | + urllib.urlencode({ |
| 213 | + 'action': 'query', |
| 214 | + 'prop': 'revisions', |
| 215 | + 'revids': revId, |
| 216 | + 'rvprop': 'ids', |
| 217 | + 'rvdiffto': 'prev', |
| 218 | + 'format': 'json' |
| 219 | + }) |
| 220 | + ) |
| 221 | + result = json.load(response) |
| 222 | + return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
| 223 | + except urllib2.HTTPError as e: |
| 224 | + time.sleep(attempt*2) |
| 225 | + attempt += 1 |
| 226 | + |
| 227 | + |
| 228 | + |
| 229 | + def getAdded(self, revId): |
| 230 | + diff = self.getDiff(revId) |
125 | 231 | |
126 | | - response = urllib2.urlopen( |
127 | | - self.uri, |
128 | | - data=urllib.urlencode({ |
129 | | - 'action': "query", |
130 | | - 'prop': "revisions", |
131 | | - 'revids': revId, |
132 | | - 'rvprop': "diff", |
133 | | - 'format': "json" |
134 | | - }) |
| 232 | + return self.unescape( |
| 233 | + "\n".join( |
| 234 | + match.group(1) |
| 235 | + for match in WPAPI.DIFF_ADD_RE.finditer(diff) |
| 236 | + ) |
135 | 237 | ) |
136 | 238 | |
137 | | - js = json.load(response) |
138 | | - |
139 | | - |
140 | | - |
141 | | - try: |
142 | | - if 'badrevids' in js['query']: |
143 | | - raise KeyError(revId) |
| 239 | + def unescape(self, text): |
| 240 | + def fixup(m): |
| 241 | + text = m.group(0) |
| 242 | + if text[:2] == "&#": |
| 243 | + # character reference |
| 244 | + try: |
| 245 | + if text[:3] == "&#x": |
| 246 | + return unichr(int(text[3:-1], 16)) |
| 247 | + else: |
| 248 | + return unichr(int(text[2:-1])) |
| 249 | + except ValueError: |
| 250 | + pass |
144 | 251 | else: |
145 | | - return js['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
146 | | - except KeyError: |
147 | | - |
| 252 | + # named entity |
| 253 | + try: |
| 254 | + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
| 255 | + except KeyError: |
| 256 | + pass |
| 257 | + return text # leave as is |
| 258 | + return re.sub("&#?\w+;", fixup, text) |
148 | 259 | |
149 | 260 | |
| 261 | +if __name__ == "__main__": |
| 262 | + main() |