r107959 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107958‎ | r107959 | r107960 >
Date:23:12, 3 January 2012
Author:halfak
Status:deferred
Tags:
Comment:
Postings and metrics script working. Metrics for editcounts, warnings, talk and blocks working, but not spot checked.
Modified paths:
  • /trunk/tools/wsor/message_templates/generators (deleted) (history)
  • /trunk/tools/wsor/message_templates/message_postings.py (deleted) (history)
  • /trunk/tools/wsor/message_templates/metrics (added) (history)
  • /trunk/tools/wsor/message_templates/postings (added) (history)
  • /trunk/tools/wsor/message_templates/sample_postings.tsv (added) (history)
  • /trunk/tools/wsor/message_templates/sql/test.sql (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/__init__.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/__init__.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/blocks.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py (modified) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/talk.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/generators/warnings.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/metrics.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/postings.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/util (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/util/__init__.py (added) (history)
  • /trunk/tools/wsor/message_templates/umetrics/util/mw_api.py (added) (history)
  • /trunk/tools/wsor/message_templates/user_metrics.py (deleted) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/message_postings.py
@@ -1,282 +0,0 @@
2 -'''
3 -This script connects to a mediawiki database and API to collect User_talk revisions
4 -that match a set of patterns (and optionally, username).
5 -
6 -:Parameters:
7 - Access the script's documentation for a parameter listing.
8 -
9 - % python message_postings.py --help
10 -
11 -:Output:
12 - This script writes a set of escaped, tab separated columns to standard out.
13 - - Recipient name - The name of the user who received the posting
14 - - Timestamp - The time at which the posting was made
15 - - Revision ID - The identifier of the revision matching the posting
16 - - Poster ID - The identifier of the user who made the posting
17 - - Poster name - The name of the user who make the posting
18 - - Message match - The portion of the message posting that was matched by the regular expression.
19 -
20 -:Example:
21 - python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1"
22 -'''
23 -import sys, argparse, os
24 -import logging, types, re
25 -import time, datetime
26 -import MySQLdb, MySQLdb.cursors
27 -import urllib, urllib2, json, htmlentitydefs
28 -import wmf
29 -
30 -class MissingRevError(Exception):pass
31 -
32 -def encode(v):
33 - if v == None: return "\N"
34 -
35 - if type(v) == types.LongType: v = int(v)
36 - elif type(v) == types.UnicodeType: v = v.encode('utf-8')
37 -
38 - return str(v).encode("string-escape")
39 -
40 -def emit(rev):
41 -
42 - print(
43 - "\t".join(
44 - encode(rev[c]) for c in [
45 - 'recipient_name',
46 - 'rev_timestamp',
47 - 'rev_id',
48 - 'poster_id',
49 - 'poster_name',
50 - 'message_match'
51 - ]
52 - )
53 - )
54 -
55 -
56 -# MediaWiki Date format
57 -#
58 -# | year | month | day | hour | minute | second |
59 -MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
60 -def mwDate(string):
61 - if MW_DATE.match(string) == None:
62 - raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string)
63 - else:
64 - return string
65 -
66 -def main():
67 - parser = argparse.ArgumentParser(
68 - description="""
69 - Gathers experimental message postings from user_talk messages.
70 - """,
71 - epilog="""
72 - python message_postings.py
73 - -h db42
74 - --start=20111222000000
75 - --end=20111223000000
76 - --comment="\(\[\[WP:HG\|HG\]\]\)"
77 - --message="Template:uw-vandalism1"
78 - """,
79 - conflict_handler="resolve"
80 - )
81 - parser.add_argument(
82 - '-c', '--cnf',
83 - metavar="<path>",
84 - type=str,
85 - help='the path to MySQL config info (defaults to ~/.my.cnf)',
86 - default=os.path.expanduser("~/.my.cnf")
87 - )
88 - parser.add_argument(
89 - '-h', '--host',
90 - type=str,
91 - help='the database host to connect to (defaults to localhost)',
92 - default="localhost"
93 - )
94 - parser.add_argument(
95 - '-d', '--db',
96 - type=str,
97 - help='the language db to run the query in (defaults to enwiki)',
98 - default="enwiki"
99 - )
100 - parser.add_argument(
101 - '-a', '--api_uri',
102 - type=str,
103 - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
104 - default="http://en.wikipedia.org/w/api.php"
105 - )
106 - parser.add_argument(
107 - '--start',
108 - type=mwDate,
109 - help='the start of the experimental period. (Required)',
110 - required=True
111 - )
112 - parser.add_argument(
113 - '--end',
114 - type=mwDate,
115 - help='the end of the experimental period. (defaults to NOW())',
116 - default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
117 - )
118 - parser.add_argument(
119 - '--user_name',
120 - type=str,
121 - help='the user_name to further filter postings by (useful for tracking bots)'
122 - )
123 - parser.add_argument(
124 - '--comment',
125 - type=re.compile,
126 - help='regular expression to match against message posting comment'
127 - )
128 - parser.add_argument(
129 - '--message',
130 - type=re.compile,
131 - help='regular expression to match against message content (required)',
132 - required=True
133 - )
134 - args = parser.parse_args()
135 -
136 - LOGGING_STREAM = sys.stderr
137 - logging.basicConfig(
138 - level=logging.DEBUG,
139 - stream=LOGGING_STREAM,
140 - format='%(asctime)s %(levelname)-8s %(message)s',
141 - datefmt='%b-%d %H:%M:%S'
142 - )
143 - logging.debug("Comment pattern is %r." % args.comment.pattern)
144 - logging.debug("Message pattern is %r." % args.message.pattern)
145 -
146 - logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
147 - db = Database(
148 - host=args.host,
149 - db=args.db,
150 - read_default_file=args.cnf
151 - )
152 -
153 - logging.info("Connecting to API @ %s." % args.api_uri)
154 - api = WPAPI(args.api_uri)
155 -
156 - logging.info("Querying for matching revisions:")
157 - count = {"matched": 0, "missed": 0}
158 - for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
159 - message = api.getAdded(rev['rev_id'])
160 - match = args.message.search(message)
161 - if match != None:
162 - rev['message_match'] = match.group(0)
163 -
164 - emit(rev)
165 - LOGGING_STREAM.write("|")
166 - count['matched'] += 1
167 - else:
168 - LOGGING_STREAM.write("o")
169 - count['missed'] += 1
170 -
171 - LOGGING_STREAM.write("\n")
172 - logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count)
173 -
174 -
175 -
176 -class Database:
177 -
178 - def __init__(self, *args, **kwargs):
179 - self.args = args
180 - self.kwargs = kwargs
181 - self.conn = MySQLdb.connect(*args, **kwargs)
182 -
183 - def getPostings(self, start, end, userName=None, commentRE=None):
184 - if (userName, commentRE) == (None, None):
185 - raise TypeError("Must specify at at least one of userName or commentRE.")
186 -
187 - cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
188 - query = """
189 - SELECT
190 - r.rev_id,
191 - r.rev_timestamp,
192 - r.rev_comment,
193 - r.rev_user AS poster_id,
194 - r.rev_user_text AS poster_name,
195 - REPLACE(p.page_title, "_", " ") AS recipient_name
196 - FROM revision r
197 - INNER JOIN page p ON r.rev_page = p.page_id
198 - WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
199 - AND page_namespace = 3
200 - """
201 - if userName != None:
202 - query += "AND rev_user_text = %(user_name)s\n"
203 - if commentRE != None:
204 - query += "AND rev_comment REGEXP %(comment_pattern)s\n"
205 -
206 - cursor.execute(
207 - query,
208 - {
209 - 'start': start,
210 - 'end': end,
211 - 'user_name': userName,
212 - 'comment_pattern': commentRE.pattern
213 - }
214 - )
215 -
216 - for row in cursor:
217 - yield row
218 -
219 -
220 -
221 -class WPAPI:
222 - DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
223 -
224 - def __init__(self, uri):
225 - self.uri = uri
226 -
227 - def getDiff(self, revId, retries=10):
228 - attempt = 0
229 - while attempt < retries:
230 - try:
231 - response = urllib2.urlopen(
232 - self.uri,
233 - urllib.urlencode({
234 - 'action': 'query',
235 - 'prop': 'revisions',
236 - 'revids': revId,
237 - 'rvprop': 'ids',
238 - 'rvdiffto': 'prev',
239 - 'format': 'json'
240 - })
241 - )
242 - result = json.load(response)
243 - return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
244 - except urllib2.HTTPError as e:
245 - time.sleep(attempt*2)
246 - attempt += 1
247 -
248 -
249 -
250 - def getAdded(self, revId):
251 - diff = self.getDiff(revId)
252 -
253 - return self.unescape(
254 - "\n".join(
255 - match.group(1)
256 - for match in WPAPI.DIFF_ADD_RE.finditer(diff)
257 - )
258 - )
259 -
260 - def unescape(self, text):
261 - def fixup(m):
262 - text = m.group(0)
263 - if text[:2] == "&#":
264 - # character reference
265 - try:
266 - if text[:3] == "&#x":
267 - return unichr(int(text[3:-1], 16))
268 - else:
269 - return unichr(int(text[2:-1]))
270 - except ValueError:
271 - pass
272 - else:
273 - # named entity
274 - try:
275 - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
276 - except KeyError:
277 - pass
278 - return text # leave as is
279 - return re.sub("&#?\w+;", fixup, text)
280 -
281 -
282 -if __name__ == "__main__":
283 - main()
Index: trunk/tools/wsor/message_templates/user_metrics.py
@@ -1,102 +0,0 @@
2 -import sys, argparse, os
3 -import logging, types
4 -import MySQLdb, MySQLdb.cursors
5 -
6 -from generators import GENERATORS, Metrics, EditCounts
7 -
8 -
9 -class MissingRevError(Exception):pass
10 -
11 -def encode(v):
12 - if v == None: return "\N"
13 -
14 - if type(v) == types.LongType: v = int(v)
15 - elif type(v) == types.UnicodeType: v = v.encode('utf-8')
16 -
17 - return str(v).encode("string-escape")
18 -
19 -
20 -def main():
21 -
22 - parser = argparse.ArgumentParser(
23 - description="""
24 - Gathers metrics for users around a timestamp.
25 - """,
26 - conflict_handler="resolve"
27 - )
28 - parser.add_argument(
29 - '-c', '--cnf',
30 - metavar="<path>",
31 - type=str,
32 - help='the path to MySQL config info (defaults to ~/.my.cnf)',
33 - default=os.path.expanduser("~/.my.cnf")
34 - )
35 - parser.add_argument(
36 - '-h', '--host',
37 - type=str,
38 - help='the database host to connect to (defaults to localhost)',
39 - default="localhost"
40 - )
41 - parser.add_argument(
42 - '-d', '--db',
43 - type=str,
44 - help='the language db to run the query in (defaults to enwiki)',
45 - default="enwiki"
46 - )
47 - parser.add_argument(
48 - '-a', '--api_uri',
49 - type=str,
50 - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
51 - default="http://en.wikipedia.org/w/api.php"
52 - )
53 - parser.add_argument(
54 - 'generator',
55 - type=lambda g: GENERATORS[g],
56 - nargs="+",
57 - help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys())
58 - )
59 - args = parser.parse_args()
60 -
61 - LOGGING_STREAM = sys.stderr
62 - logging.basicConfig(
63 - level=logging.DEBUG,
64 - stream=LOGGING_STREAM,
65 - format='%(asctime)s %(levelname)-8s %(message)s',
66 - datefmt='%b-%d %H:%M:%S'
67 - )
68 -
69 - if sys.stdin.isatty():
70 - logging.error("No data piped to standard in!")
71 - return
72 -
73 -
74 - logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
75 - conn = MySQLdb.connect(
76 - host=args.host,
77 - db=args.db,
78 - read_default_file=args.cnf,
79 - cursorclass=MySQLdb.cursors.DictCursor
80 - )
81 -
82 - logging.info("Loading generators...")
83 - metrics = Metrics(g(conn, args.api_uri) for g in args.generator)
84 - print("\t".join(encode(h) for h in metrics.headers()))
85 -
86 -
87 - logging.info("Processing users...")
88 - for line in sys.stdin:
89 - username, timestamp = line.strip().split("\t")[0:2]
90 - username = unicode(username, 'utf-8')
91 -
92 - logging.debug("\t%s at %s:" % (username, timestamp))
93 - print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
94 - LOGGING_STREAM.write("o")
95 -
96 - LOGGING_STREAM.write("\n")
97 -
98 -
99 -
100 -
101 -
102 -if __name__ == "__main__":
103 - main()
Index: trunk/tools/wsor/message_templates/sql/test.sql
@@ -1 +1,47 @@
 2+(
 3+ SELECT
 4+ False as deleted,
 5+ page_namespace as ns,
 6+ count(*) as revisions
 7+ FROM enwiki.revision
 8+ INNER JOIN enwiki.page ON rev_page = page_id
 9+ WHERE rev_timestamp <= "20110101000000"
 10+ AND rev_user_text = "EpochFail"
 11+ GROUP BY page_namespace
 12+)
 13+UNION
 14+(
 15+ SELECT
 16+ True as deleted,
 17+ ar_namespace as ns,
 18+ count(*) as revisions
 19+ FROM enwiki.archive
 20+ WHERE ar_timestamp <= "20110101000000"
 21+ AND ar_user_text = "EpochFail"
 22+ GROUP BY ar_namespace
 23+)
224
 25+SELECT
 26+ r.rev_id,
 27+ r.rev_timestamp,
 28+ r.rev_comment,
 29+ r.rev_user AS poster_id,
 30+ r.rev_user_text AS poster_name,
 31+ REPLACE(p.page_title, "_", " ") AS recipient_name
 32+FROM revision r
 33+INNER JOIN page p ON r.rev_page = p.page_id
 34+WHERE rev_timestamp BETWEEN "20111222000000" AND "20111223000000"
 35+AND page_namespace = 3;
 36+
 37+
 38+SELECT
 39+ IF(log_params LIKE "%indefinite%", "ban", "block") as type,
 40+ IF(log_timestamp > "20110101000000", "after", "before") as tense,
 41+ count(*) as count,
 42+ min(log_timestamp) as first,
 43+ max(log_timestamp) as last
 44+FROM logging
 45+WHERE log_type = "block"
 46+AND log_action = "block"
 47+AND log_title = "EpochFail"
 48+GROUP BY 1, 2;
Index: trunk/tools/wsor/message_templates/sample_postings.tsv
@@ -0,0 +1,122 @@
 2+99.92.179.246 20111222000623 467110760 205121 Koavf Template:uw-vandalism1
 3+206.51.176.121 20111222002717 467113472 7695475 Jim1138 Template:uw-vandalism1
 4+96.255.37.252 20111222002944 467113720 7695475 Jim1138 Template:uw-vandalism1
 5+88.207.182.230 20111222003427 467114243 7695475 Jim1138 Template:uw-vandalism1
 6+75.28.52.111 20111222004219 467115072 8371165 AbigailAbernathy Template:uw-vandalism1
 7+69.142.218.3 20111222004409 467115300 7695475 Jim1138 Template:uw-vandalism1
 8+65.96.250.23 20111222004727 467115720 7695475 Jim1138 Template:uw-vandalism1
 9+69.158.17.157 20111222004835 467115846 7695475 Jim1138 Template:uw-vandalism1
 10+99.226.152.51 20111222005322 467116380 8371165 AbigailAbernathy Template:uw-vandalism1
 11+50.90.114.25 20111222005336 467116401 8371165 AbigailAbernathy Template:uw-vandalism1
 12+24.218.185.51 20111222010055 467117261 8371165 AbigailAbernathy Template:uw-vandalism1
 13+99.60.53.154 20111222012001 467119343 8371165 AbigailAbernathy Template:uw-vandalism1
 14+174.57.17.84 20111222012338 467119703 8371165 AbigailAbernathy Template:uw-vandalism1
 15+PMPC 20111222012759 467120150 7695475 Jim1138 Template:uw-vandalism1
 16+109.77.113.235 20111222012830 467120200 7695475 Jim1138 Template:uw-vandalism1
 17+108.130.64.2 20111222012855 467120243 7695475 Jim1138 Template:uw-vandalism1
 18+173.74.248.50 20111222013909 467121294 7695475 Jim1138 Template:uw-vandalism1
 19+218.24.165.201 20111222014206 467121631 7695475 Jim1138 Template:uw-vandalism1
 20+108.48.90.203 20111222014414 467121875 7695475 Jim1138 Template:uw-vandalism1
 21+183.177.191.220 20111222015126 467122559 7695475 Jim1138 Template:uw-vandalism1
 22+66.168.26.250 20111222023031 467125938 15020596 Mark Arsten Template:uw-vandalism1
 23+68.98.168.116 20111222023202 467126066 7821268 RandomAct Template:uw-vandalism1
 24+72.184.167.111 20111222023634 467126522 15020596 Mark Arsten Template:uw-vandalism1
 25+82.11.48.151 20111222023654 467126555 15020596 Mark Arsten Template:uw-vandalism1
 26+76.69.9.18 20111222023843 467126740 15020596 Mark Arsten Template:uw-vandalism1
 27+76.179.88.134 20111222024014 467126906 7821268 RandomAct Template:uw-vandalism1
 28+223.228.169.111 20111222024511 467127446 15020596 Mark Arsten Template:uw-vandalism1
 29+70.73.15.114 20111222024531 467127488 15020596 Mark Arsten Template:uw-vandalism1
 30+115.240.209.58 20111222024600 467127532 15020596 Mark Arsten Template:uw-vandalism1
 31+198.151.130.54 20111222024729 467127691 7821268 RandomAct Template:uw-vandalism1
 32+184.37.78.85 20111222024756 467127733 15020596 Mark Arsten Template:uw-vandalism1
 33+112.210.239.1 20111222025209 467128144 15020596 Mark Arsten Template:uw-vandalism1
 34+71.227.110.218 20111222025853 467128908 7821268 RandomAct Template:uw-vandalism1
 35+70.127.78.61 20111222030015 467129038 15020596 Mark Arsten Template:uw-vandalism1
 36+50.9.34.247 20111222030027 467129069 15020596 Mark Arsten Template:uw-vandalism1
 37+68.1.183.212 20111222030559 467129619 15020596 Mark Arsten Template:uw-vandalism1
 38+67.128.239.58 20111222030620 467129655 15020596 Mark Arsten Template:uw-vandalism1
 39+Carenblake 20111222030640 467129689 15020596 Mark Arsten Template:uw-vandalism1
 40+2.101.130.254 20111222030856 467129927 15020596 Mark Arsten Template:uw-vandalism1
 41+68.68.187.69 20111222031254 467130311 7695475 Jim1138 Template:uw-vandalism1
 42+75.86.201.236 20111222031313 467130337 15020596 Mark Arsten Template:uw-vandalism1
 43+74.132.43.218 20111222031420 467130443 15020596 Mark Arsten Template:uw-vandalism1
 44+142.68.160.131 20111222031625 467130652 15020596 Mark Arsten Template:uw-vandalism1
 45+Blueturtle2 20111222031752 467130815 7695475 Jim1138 Template:uw-vandalism1
 46+69.153.186.30 20111222032159 467131245 15020596 Mark Arsten Template:uw-vandalism1
 47+24.191.10.180 20111222032237 467131326 15020596 Mark Arsten Template:uw-vandalism1
 48+89.124.240.80 20111222032924 467132037 15020596 Mark Arsten Template:uw-vandalism1
 49+76.202.230.190 20111222033021 467132144 7695475 Jim1138 Template:uw-vandalism1
 50+98.154.111.227 20111222033157 467132313 15020596 Mark Arsten Template:uw-vandalism1
 51+99.119.25.29 20111222033215 467132346 15020596 Mark Arsten Template:uw-vandalism1
 52+72.67.11.201 20111222033240 467132395 15020596 Mark Arsten Template:uw-vandalism1
 53+108.84.217.178 20111222033339 467132499 15020596 Mark Arsten Template:uw-vandalism1
 54+98.254.245.97 20111222033347 467132515 15020596 Mark Arsten Template:uw-vandalism1
 55+116.68.248.117 20111222033402 467132545 7695475 Jim1138 Template:uw-vandalism1
 56+68.48.81.29 20111222033407 467132557 15020596 Mark Arsten Template:uw-vandalism1
 57+69.14.32.169 20111222033431 467132594 15020596 Mark Arsten Template:uw-vandalism1
 58+76.172.11.143 20111222033513 467132652 15020596 Mark Arsten Template:uw-vandalism1
 59+207.255.163.58 20111222033517 467132663 7695475 Jim1138 Template:uw-vandalism1
 60+82.37.109.26 20111222033535 467132688 15020596 Mark Arsten Template:uw-vandalism1
 61+24.1.86.54 20111222033637 467132794 15020596 Mark Arsten Template:uw-vandalism1
 62+68.229.166.67 20111222033749 467132911 15020596 Mark Arsten Template:uw-vandalism1
 63+68.197.139.18 20111222034204 467133304 7695475 Jim1138 Template:uw-vandalism1
 64+67.241.26.211 20111222034557 467133679 15020596 Mark Arsten Template:uw-vandalism1
 65+71.72.129.150 20111222034630 467133724 7695475 Jim1138 Template:uw-vandalism1
 66+99.112.124.88 20111222034726 467133802 7695475 Jim1138 Template:uw-vandalism1
 67+4.254.81.152 20111222034915 467133967 7695475 Jim1138 Template:uw-vandalism1
 68+71.191.34.185 20111222035000 467134028 15020596 Mark Arsten Template:uw-vandalism1
 69+108.132.160.105 20111222035215 467134272 15020596 Mark Arsten Template:uw-vandalism1
 70+174.113.229.186 20111222035227 467134295 15020596 Mark Arsten Template:uw-vandalism1
 71+86.19.242.120 20111222035300 467134348 15020596 Mark Arsten Template:uw-vandalism1
 72+173.176.118.87 20111222035508 467134517 15020596 Mark Arsten Template:uw-vandalism1
 73+114.142.166.229 20111222035601 467134589 15020596 Mark Arsten Template:uw-vandalism1
 74+76.25.211.198 20111222035646 467134656 7695475 Jim1138 Template:uw-vandalism1
 75+68.3.112.168 20111222035745 467134737 7695475 Jim1138 Template:uw-vandalism1
 76+67.20.133.136 20111222042426 467136925 7695475 Jim1138 Template:uw-vandalism1
 77+68.5.93.197 20111222042635 467137125 7695475 Jim1138 Template:uw-vandalism1
 78+50.135.29.111 20111222042958 467137392 7695475 Jim1138 Template:uw-vandalism1
 79+99.91.215.6 20111222043424 467137772 7695475 Jim1138 Template:uw-vandalism1
 80+Eastderp1 20111222044200 467138536 7695475 Jim1138 Template:uw-vandalism1
 81+173.26.101.78 20111222044530 467138890 7695475 Jim1138 Template:uw-vandalism1
 82+80.194.26.184 20111222050300 467140546 7695475 Jim1138 Template:uw-vandalism1
 83+69.230.195.135 20111222050355 467140632 7695475 Jim1138 Template:uw-vandalism1
 84+Violinmagician 20111222050359 467140640 7695475 Jim1138 Template:uw-vandalism1
 85+76.166.147.85 20111222050726 467140963 7695475 Jim1138 Template:uw-vandalism1
 86+72.229.150.7 20111222050921 467141171 7695475 Jim1138 Template:uw-vandalism1
 87+96.250.109.41 20111222051253 467141520 7695475 Jim1138 Template:uw-vandalism1
 88+70.249.216.200 20111222051603 467141788 7695475 Jim1138 Template:uw-vandalism1
 89+76.114.237.52 20111222051804 467142013 7695475 Jim1138 Template:uw-vandalism1
 90+99.231.38.15 20111222052155 467142435 7695475 Jim1138 Template:uw-vandalism1
 91+70.232.36.209 20111222052236 467142502 58193 OverlordQ Template:uw-vandalism1
 92+70.171.84.236 20111222055646 467145424 7695475 Jim1138 Template:uw-vandalism1
 93+117.229.124.35 20111222060552 467146249 7695475 Jim1138 Template:uw-vandalism1
 94+92.8.86.211 20111222061442 467147022 7695475 Jim1138 Template:uw-vandalism1
 95+Jonnygharris 20111222063512 467148666 2359527 Tgeairn Template:uw-vandalism1
 96+68.32.26.30 20111222064442 467149456 7695475 Jim1138 Template:uw-vandalism1
 97+122.172.47.49 20111222064608 467149587 7695475 Jim1138 Template:uw-vandalism1
 98+68.67.110.139 20111222064632 467149618 2359527 Tgeairn Template:uw-vandalism1
 99+58.68.46.210 20111222070300 467150889 2359527 Tgeairn Template:uw-vandalism1
 100+120.56.171.37 20111222070459 467151018 7695475 Jim1138 Template:uw-vandalism1
 101+24.185.100.176 20111222070521 467151047 2359527 Tgeairn Template:uw-vandalism1
 102+94.76.32.252 20111222070626 467151122 7695475 Jim1138 Template:uw-vandalism1
 103+71.2.35.65 20111222070954 467151442 2359527 Tgeairn Template:uw-vandalism1
 104+203.52.228.196 20111222071602 467152062 7695475 Jim1138 Template:uw-vandalism1
 105+124.84.101.186 20111222072924 467153017 2359527 Tgeairn Template:uw-vandalism1
 106+74.120.224.200 20111222073655 467153546 7695475 Jim1138 Template:uw-vandalism1
 107+70.230.154.187 20111222074036 467153814 7695475 Jim1138 Template:uw-vandalism1
 108+69.226.149.86 20111222074434 467154103 7695475 Jim1138 Template:uw-vandalism1
 109+75.131.131.219 20111222075957 467155209 7695475 Jim1138 Template:uw-vandalism1
 110+49.245.133.171 20111222080029 467155275 7695475 Jim1138 Template:uw-vandalism1
 111+152.118.24.10 20111222080106 467155326 7695475 Jim1138 Template:uw-vandalism1
 112+99.224.122.205 20111222081721 467156446 7695475 Jim1138 Template:uw-vandalism1
 113+198.240.133.75 20111222082450 467156945 7695475 Jim1138 Template:uw-vandalism1
 114+110.77.227.221 20111222083046 467157456 7695475 Jim1138 Template:uw-vandalism1
 115+2.188.4.3 20111222083310 467157657 7695475 Jim1138 Template:uw-vandalism1
 116+145.103.249.34 20111222084352 467158487 7695475 Jim1138 Template:uw-vandalism1
 117+50.64.10.116 20111222084516 467158585 7695475 Jim1138 Template:uw-vandalism1
 118+76.67.17.205 20111222085448 467159343 7695475 Jim1138 Template:uw-vandalism1
 119+120.144.129.107 20111222092827 467162126 7695475 Jim1138 Template:uw-vandalism1
 120+14.139.243.229 20111222093150 467162401 7695475 Jim1138 Template:uw-vandalism1
 121+180.149.52.45 20111222093649 467162851 7695475 Jim1138 Template:uw-vandalism1
 122+24.191.25.201 20111222094616 467163617 7695475 Jim1138 Template:uw-vandalism1
 123+71.227.10.224 20111222095309 467164127 7695475 Jim1138 Template:uw-vandalism1
Index: trunk/tools/wsor/message_templates/metrics
@@ -0,0 +1,2 @@
 2+#!/usr/bin/env python
 3+from umetrics.metrics import main;main()
Property changes on: trunk/tools/wsor/message_templates/metrics
___________________________________________________________________
Added: svn:executable
14 + *
Index: trunk/tools/wsor/message_templates/postings
@@ -0,0 +1,2 @@
 2+#!/usr/bin/env python
 3+from umetrics.postings import main;main()
Property changes on: trunk/tools/wsor/message_templates/postings
___________________________________________________________________
Added: svn:executable
14 + *
Index: trunk/tools/wsor/message_templates/umetrics/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py
@@ -0,0 +1,97 @@
 2+import itertools, wmf, difflib, re
 3+from .metric_generator import MetricGenerator
 4+
 5+
 6+class Warnings(MetricGenerator):
 7+
 8+ WARN_RE = re.compile(r'<!--\s*Template:uw-')
 9+
 10+ def __init__(self, conn, api):
 11+ self.conn = conn
 12+ self.api = api
 13+
 14+ def headers(self):
 15+ return [
 16+ 'warns_before',
 17+ 'warns_after',
 18+ 'first_warn_before',
 19+ 'last_warn_before',
 20+ 'first_warn_after',
 21+ 'last_warn_after'
 22+ ]
 23+
 24+ def values(self, username, timestamp):
 25+ rowValues = {
 26+ 'warns_before': 0,
 27+ 'warns_after': 0
 28+ }
 29+
 30+ timestamp = wmf.wp2Timestamp(timestamp)
 31+
 32+ for rev in self.getProcessedRevs(username):
 33+ #determine if we have a warning
 34+ if self.WARN_RE.search(rev['added']) != None:
 35+ if rev['timestamp'] < timestamp:
 36+ whence = "before"
 37+ elif rev['timestamp'] > timestamp:
 38+ whence = "after"
 39+ else:
 40+ continue
 41+
 42+ rowValues['warns_%s' % whence] += 1
 43+
 44+ if 'first_warn_%s' % whence not in rowValues:
 45+ rowValues['first_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp'])
 46+
 47+ rowValues['last_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp'])
 48+
 49+ return [rowValues.get(c) for c in self.headers()]
 50+
 51+ def getProcessedRevs(self, username):
 52+ return self.processRevs(self.getUserPageRevisions(username))
 53+
 54+ def getUserPageRevisions(self, username, rvcontinue=None):
 55+ js = self.api.request(
 56+ action="query",
 57+ prop="revisions",
 58+ titles="User_talk:%s" % username,
 59+ rvprop="ids|timestamp|content",
 60+ rvdir="newer",
 61+ rvlimit=50,
 62+ rvcontinue=rvcontinue
 63+ )
 64+
 65+ for rev in js['query']['pages'].values()[0]['revisions']:
 66+ rev['timestamp']
 67+ yield rev
 68+
 69+ if 'query-continue' in js:
 70+ for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']['rvstartid']):
 71+ yield rev
 72+
 73+
 74+
 75+ def processRevs(self, revs):
 76+
 77+ previousLines = []
 78+ for rev in revs:
 79+ lines = rev.get('*', "").split("\n")
 80+ del rev['*']
 81+
 82+ added = []
 83+ sm = difflib.SequenceMatcher(None, previousLines, lines)
 84+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
 85+ if tag == "insert":
 86+ added.extend(lines[j1:j2])
 87+ elif tag == "replace":
 88+ added.extend(lines[j1:j2])
 89+
 90+
 91+ rev['added'] = "\n".join(added)
 92+ rev['timestamp'] = wmf.wp2Timestamp(rev['timestamp'])
 93+ yield rev
 94+ previousLines = lines
 95+
 96+
 97+
 98+
Index: trunk/tools/wsor/message_templates/umetrics/generators/metric_generator.py
@@ -0,0 +1,4 @@
 2+class MetricGenerator:
 3+ def __init__(self): pass
 4+ def headers(self): raise NotImplementedError()
 5+ def values(self, username, timestamp): raise NotImplementedError()
Index: trunk/tools/wsor/message_templates/umetrics/generators/talk.py
@@ -0,0 +1,51 @@
 2+import itertools
 3+from .metric_generator import MetricGenerator
 4+
 5+class Talk(MetricGenerator):
 6+
 7+ def __init__(self, conn, api):
 8+ self.conn = conn
 9+
 10+ def headers(self):
 11+ return [
 12+ 'other_talk_before',
 13+ 'first_other_talk_before',
 14+ 'last_other_talk_before',
 15+ 'other_talk_after',
 16+ 'first_other_talk_after',
 17+ 'last_other_talk_after',
 18+ ]
 19+
 20+ def values(self, username, timestamp):
 21+ rowValues = {}
 22+
 23+ cursor = self.conn.cursor()
 24+ cursor.execute("""
 25+ SELECT
 26+ IF(rev_timestamp > %(timestamp)s, "after", "before") as whense,
 27+ COUNT(*) as count,
 28+ MAX(rev_timestamp) as last,
 29+ MIN(rev_timestamp) as first
 30+ FROM revision
 31+ INNER JOIN page ON rev_page = page_id
 32+ WHERE page_namespace = 3
 33+ AND rev_timestamp != %(timestamp)s
 34+ AND page_title = %(page_title)s
 35+ AND rev_user_text != %(username)s
 36+ GROUP BY 1
 37+ """,
 38+ {
 39+ 'timestamp': timestamp,
 40+ 'page_title': username.encode('utf-8').replace(" ", "_"),
 41+ 'username': username.encode('utf-8')
 42+ }
 43+ )
 44+ for row in cursor:
 45+ rowValues['other_talk_%(whence)s'] = row['count']
 46+ rowValues['first_other_talk_%(whence)s'] = row['first']
 47+ rowValues['last_other_talk_%(whence)s'] = row['last']
 48+
 49+ rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0)
 50+ rowValues['other_talk_after'] = rowValues.get('other_talk_after', 0)
 51+
 52+ return [rowValues.get(c) for c in self.headers()]
Index: trunk/tools/wsor/message_templates/umetrics/generators/blocks.py
@@ -0,0 +1,57 @@
 2+import itertools
 3+from .metric_generator import MetricGenerator
 4+
 5+class Blocks(MetricGenerator):
 6+
 7+ def __init__(self, conn, api):
 8+ self.conn = conn
 9+
 10+ def headers(self):
 11+ return [
 12+ 'blocks_before',
 13+ 'blocks_after',
 14+ 'first_block_before',
 15+ 'last_block_before',
 16+ 'first_block_after',
 17+ 'last_block_after',
 18+ 'bans_before',
 19+ 'bans_after',
 20+ 'first_ban_before',
 21+ 'last_ban_before',
 22+ 'first_ban_after',
 23+ 'last_ban_after'
 24+ ]
 25+
 26+ def values(self, username, timestamp):
 27+ rowValues = {}
 28+
 29+ cursor = self.conn.cursor()
 30+ cursor.execute("""
 31+ SELECT
 32+ IF(log_params LIKE "%%indefinite%%", "ban", "block") as type,
 33+ IF(log_timestamp > %(timestamp)s, "after", "before") as whense,
 34+ count(*) as count,
 35+ min(log_timestamp) as first,
 36+ max(log_timestamp) as last
 37+ FROM logging
 38+ WHERE log_type = "block"
 39+ AND log_action = "block"
 40+ AND log_title = %(username)s
 41+ GROUP BY 1, 2
 42+ """,
 43+ {
 44+ 'timestamp': timestamp,
 45+ 'username': username.encode('utf-8').replace(" ", "_")
 46+ }
 47+ )
 48+ for row in cursor:
 49+ rowValues['%(type)ss_%(whense)s' % row] = row['count']
 50+ rowValues['first_%(type)s_%(whense)s' % row] = row['first']
 51+ rowValues['last_%(type)s_%(whense)s' % row] = row['last']
 52+
 53+ rowValues['blocks_before'] = rowValues.get('blocks_before', 0)
 54+ rowValues['blocks_after'] = rowValues.get('blocks_after', 0)
 55+ rowValues['bans_before'] = rowValues.get('bans_before', 0)
 56+ rowValues['bans_after'] = rowValues.get('bans_after', 0)
 57+
 58+ return [rowValues.get(c) for c in self.headers()]
Index: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py
@@ -0,0 +1,31 @@
 2+from .edit_counts import EditCounts
 3+from .talk import Talk
 4+from .blocks import Blocks
 5+from .warnings import Warnings
 6+from .metric_generator import MetricGenerator
 7+
 8+GENERATORS = {
 9+ 'editcounts': EditCounts,
 10+ 'talk': Talk,
 11+ 'blocks': Blocks,
 12+ 'warnings': Warnings
 13+}
 14+
 15+class Metrics(MetricGenerator):
 16+
 17+ def __init__(self, generators):
 18+ self.generators = list(generators)
 19+
 20+ def headers(self):
 21+ row = ['username', 'timestamp']
 22+ for generator in self.generators:
 23+ row.extend(generator.headers())
 24+
 25+ return row
 26+
 27+ def values(self, username, timestamp):
 28+ row = [username, timestamp]
 29+ for generator in self.generators:
 30+ row.extend(generator.values(username, timestamp))
 31+
 32+ return row
Index: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py
@@ -0,0 +1,59 @@
 2+import itertools
 3+from .metric_generator import MetricGenerator
 4+
 5+class EditCounts(MetricGenerator):
 6+
 7+ def __init__(self, conn, api):
 8+ self.conn = conn
 9+
 10+ def headers(self):
 11+ return itertools.chain(*[
 12+ [
 13+ 'ns_%s_revisions_deleted_before' % ns,
 14+ 'ns_%s_revisions_deleted_after' % ns,
 15+ 'ns_%s_revisions_not_deleted_before' % ns,
 16+ 'ns_%s_revisions_not_deleted_after' % ns
 17+ ]
 18+ for ns in itertools.chain(range(0,16), [100, 101, 108, 109])
 19+ ])
 20+
 21+ def values(self, username, timestamp):
 22+ rowData = {}
 23+
 24+ cursor = self.conn.cursor()
 25+ cursor.execute("""
 26+ (
 27+ SELECT
 28+ False as deleted,
 29+ page_namespace as ns,
 30+ count(*) as revisions
 31+ FROM enwiki.revision
 32+ INNER JOIN enwiki.page ON rev_page = page_id
 33+ WHERE rev_timestamp <= %(timestamp)s
 34+ AND rev_user_text = %(username)s
 35+ GROUP BY page_namespace
 36+ )
 37+ UNION (
 38+ SELECT
 39+ True as deleted,
 40+ ar_namespace as ns,
 41+ count(*) as revisions
 42+ FROM enwiki.archive
 43+ WHERE ar_timestamp <= %(timestamp)s
 44+ AND ar_user_text = %(username)s
 45+ GROUP BY ar_namespace
 46+ )""",
 47+ {
 48+ 'timestamp': timestamp,
 49+ 'username': username.encode('utf-8')
 50+ }
 51+ )
 52+ for row in cursor:
 53+ if(row['deleted']):
 54+ deleted = "deleted"
 55+ else:
 56+ deleted = "not_deleted"
 57+
 58+ rowData['ns_%s_before_revisions_%s' % (row['ns'], deleted)] = row['revisions']
 59+
 60+ return [rowData.get(c, 0) for c in self.headers()]
Index: trunk/tools/wsor/message_templates/umetrics/metrics.py
@@ -0,0 +1,100 @@
 2+import sys, argparse, os
 3+import logging, types
 4+import MySQLdb, MySQLdb.cursors
 5+
 6+from .generators import GENERATORS, Metrics
 7+from .util import MWAPI, MWAPIError
 8+
 9+def encode(v):
 10+ if v == None: return "\N"
 11+
 12+ if type(v) == types.LongType: v = int(v)
 13+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 14+
 15+ return str(v).encode("string-escape")
 16+
 17+
 18+def main():
 19+
 20+ parser = argparse.ArgumentParser(
 21+ description="""
 22+ Gathers metrics for users around a timestamp.
 23+ """,
 24+ conflict_handler="resolve"
 25+ )
 26+ parser.add_argument(
 27+ '-c', '--cnf',
 28+ metavar="<path>",
 29+ type=str,
 30+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 31+ default=os.path.expanduser("~/.my.cnf")
 32+ )
 33+ parser.add_argument(
 34+ '-h', '--host',
 35+ type=str,
 36+ help='the database host to connect to (defaults to localhost)',
 37+ default="localhost"
 38+ )
 39+ parser.add_argument(
 40+ '-d', '--db',
 41+ type=str,
 42+ help='the language db to run the query in (defaults to enwiki)',
 43+ default="enwiki"
 44+ )
 45+ parser.add_argument(
 46+ '-a', '--api',
 47+ type=MWAPI,
 48+ help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
 49+ default="http://en.wikipedia.org/w/api.php"
 50+ )
 51+ parser.add_argument(
 52+ 'generator',
 53+ type=lambda g: GENERATORS[g],
 54+ nargs="+",
 55+ help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys())
 56+ )
 57+ args = parser.parse_args()
 58+
 59+ LOGGING_STREAM = sys.stderr
 60+ logging.basicConfig(
 61+ level=logging.DEBUG,
 62+ stream=LOGGING_STREAM,
 63+ format='%(asctime)s %(levelname)-8s %(message)s',
 64+ datefmt='%b-%d %H:%M:%S'
 65+ )
 66+
 67+ if sys.stdin.isatty():
 68+ logging.error("No data piped to standard in!")
 69+ return
 70+
 71+
 72+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 73+ conn = MySQLdb.connect(
 74+ host=args.host,
 75+ db=args.db,
 76+ read_default_file=args.cnf,
 77+ cursorclass=MySQLdb.cursors.DictCursor
 78+ )
 79+
 80+ logging.info("Loading generators...")
 81+ metrics = Metrics(g(conn, args.api) for g in args.generator)
 82+ print("\t".join(encode(h) for h in metrics.headers()))
 83+
 84+
 85+ logging.info("Processing users...")
 86+ for line in sys.stdin:
 87+ username, timestamp = line.strip().split("\t")[0:2]
 88+ username = unicode(username, 'utf-8')
 89+
 90+ logging.debug("\t%s at %s:" % (username, timestamp))
 91+ print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
 92+ LOGGING_STREAM.write("o")
 93+
 94+ LOGGING_STREAM.write("\n")
 95+
 96+
 97+
 98+
 99+
 100+if __name__ == "__main__":
 101+ main()
Index: trunk/tools/wsor/message_templates/umetrics/postings.py
@@ -0,0 +1,282 @@
 2+'''
 3+This script connects to a mediawiki database and API to collect User_talk revisions
 4+that match a set of patterns (and optionally, username).
 5+
 6+:Parameters:
 7+ Access the script's documentation for a parameter listing.
 8+
 9+ % python message_postings.py --help
 10+
 11+:Output:
 12+ This script writes a set of escaped, tab separated columns to standard out.
 13+ - Recipient name - The name of the user who received the posting
 14+ - Timestamp - The time at which the posting was made
 15+ - Revision ID - The identifier of the revision matching the posting
 16+ - Poster ID - The identifier of the user who made the posting
 17+ - Poster name - The name of the user who make the posting
 18+ - Message match - The portion of the message posting that was matched by the regular expression.
 19+
 20+:Example:
 21+ python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1"
 22+'''
 23+import sys, argparse, os
 24+import logging, types, re
 25+import time, datetime
 26+import MySQLdb, MySQLdb.cursors
 27+import urllib, urllib2, json, htmlentitydefs
 28+import wmf
 29+
 30+class MissingRevError(Exception):pass
 31+
 32+def encode(v):
 33+ if v == None: return "\N"
 34+
 35+ if type(v) == types.LongType: v = int(v)
 36+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 37+
 38+ return str(v).encode("string-escape")
 39+
 40+def emit(rev):
 41+
 42+ print(
 43+ "\t".join(
 44+ encode(rev[c]) for c in [
 45+ 'recipient_name',
 46+ 'rev_timestamp',
 47+ 'rev_id',
 48+ 'poster_id',
 49+ 'poster_name',
 50+ 'message_match'
 51+ ]
 52+ )
 53+ )
 54+
 55+
 56+# MediaWiki Date format
 57+#
 58+# | year | month | day | hour | minute | second |
 59+MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
 60+def mwDate(string):
 61+ if MW_DATE.match(string) == None:
 62+ raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string)
 63+ else:
 64+ return string
 65+
 66+def main():
 67+ parser = argparse.ArgumentParser(
 68+ description="""
 69+ Gathers experimental message postings from user_talk messages.
 70+ """,
 71+ epilog="""
 72+ python message_postings.py
 73+ -h db42
 74+ --start=20111222000000
 75+ --end=20111223000000
 76+ --comment="\(\[\[WP:HG\|HG\]\]\)"
 77+ --message="Template:uw-vandalism1"
 78+ """,
 79+ conflict_handler="resolve"
 80+ )
 81+ parser.add_argument(
 82+ '-c', '--cnf',
 83+ metavar="<path>",
 84+ type=str,
 85+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 86+ default=os.path.expanduser("~/.my.cnf")
 87+ )
 88+ parser.add_argument(
 89+ '-h', '--host',
 90+ type=str,
 91+ help='the database host to connect to (defaults to localhost)',
 92+ default="localhost"
 93+ )
 94+ parser.add_argument(
 95+ '-d', '--db',
 96+ type=str,
 97+ help='the language db to run the query in (defaults to enwiki)',
 98+ default="enwiki"
 99+ )
 100+ parser.add_argument(
 101+ '-a', '--api_uri',
 102+ type=str,
 103+ help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
 104+ default="http://en.wikipedia.org/w/api.php"
 105+ )
 106+ parser.add_argument(
 107+ '--start',
 108+ type=mwDate,
 109+ help='the start of the experimental period. (Required)',
 110+ required=True
 111+ )
 112+ parser.add_argument(
 113+ '--end',
 114+ type=mwDate,
 115+ help='the end of the experimental period. (defaults to NOW())',
 116+ default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
 117+ )
 118+ parser.add_argument(
 119+ '--user_name',
 120+ type=str,
 121+ help='the user_name to further filter postings by (useful for tracking bots)'
 122+ )
 123+ parser.add_argument(
 124+ '--comment',
 125+ type=re.compile,
 126+ help='regular expression to match against message posting comment'
 127+ )
 128+ parser.add_argument(
 129+ '--message',
 130+ type=re.compile,
 131+ help='regular expression to match against message content (required)',
 132+ required=True
 133+ )
 134+ args = parser.parse_args()
 135+
 136+ LOGGING_STREAM = sys.stderr
 137+ logging.basicConfig(
 138+ level=logging.DEBUG,
 139+ stream=LOGGING_STREAM,
 140+ format='%(asctime)s %(levelname)-8s %(message)s',
 141+ datefmt='%b-%d %H:%M:%S'
 142+ )
 143+ logging.debug("Comment pattern is %r." % args.comment.pattern)
 144+ logging.debug("Message pattern is %r." % args.message.pattern)
 145+
 146+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 147+ db = Database(
 148+ host=args.host,
 149+ db=args.db,
 150+ read_default_file=args.cnf
 151+ )
 152+
 153+ logging.info("Connecting to API @ %s." % args.api_uri)
 154+ api = WPAPI(args.api_uri)
 155+
 156+ logging.info("Querying for matching revisions:")
 157+ count = {"matched": 0, "missed": 0}
 158+ for rev in db.getPostings(args.start, args.end, args.user_name, args.comment):
 159+ message = api.getAdded(rev['rev_id'])
 160+ match = args.message.search(message)
 161+ if match != None:
 162+ rev['message_match'] = match.group(0)
 163+
 164+ emit(rev)
 165+ LOGGING_STREAM.write("|")
 166+ count['matched'] += 1
 167+ else:
 168+ LOGGING_STREAM.write("o")
 169+ count['missed'] += 1
 170+
 171+ LOGGING_STREAM.write("\n")
 172+ logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count)
 173+
 174+
 175+
 176+class Database:
 177+
 178+ def __init__(self, *args, **kwargs):
 179+ self.args = args
 180+ self.kwargs = kwargs
 181+ self.conn = MySQLdb.connect(*args, **kwargs)
 182+
 183+ def getPostings(self, start, end, userName=None, commentRE=None):
 184+ if (userName, commentRE) == (None, None):
 185+ raise TypeError("Must specify at at least one of userName or commentRE.")
 186+
 187+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 188+ query = """
 189+ SELECT
 190+ r.rev_id,
 191+ r.rev_timestamp,
 192+ r.rev_comment,
 193+ r.rev_user AS poster_id,
 194+ r.rev_user_text AS poster_name,
 195+ REPLACE(p.page_title, "_", " ") AS recipient_name
 196+ FROM revision r
 197+ INNER JOIN page p ON r.rev_page = p.page_id
 198+ WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
 199+ AND page_namespace = 3
 200+ """
 201+ if userName != None:
 202+ query += "AND rev_user_text = %(user_name)s\n"
 203+ if commentRE != None:
 204+ query += "AND rev_comment REGEXP %(comment_pattern)s\n"
 205+
 206+ cursor.execute(
 207+ query,
 208+ {
 209+ 'start': start,
 210+ 'end': end,
 211+ 'user_name': userName,
 212+ 'comment_pattern': commentRE.pattern
 213+ }
 214+ )
 215+
 216+ for row in cursor:
 217+ yield row
 218+
 219+
 220+
 221+class WPAPI:
 222+ DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>')
 223+
 224+ def __init__(self, uri):
 225+ self.uri = uri
 226+
 227+ def getDiff(self, revId, retries=10):
 228+ attempt = 0
 229+ while attempt < retries:
 230+ try:
 231+ response = urllib2.urlopen(
 232+ self.uri,
 233+ urllib.urlencode({
 234+ 'action': 'query',
 235+ 'prop': 'revisions',
 236+ 'revids': revId,
 237+ 'rvprop': 'ids',
 238+ 'rvdiffto': 'prev',
 239+ 'format': 'json'
 240+ })
 241+ )
 242+ result = json.load(response)
 243+ return result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
 244+ except urllib2.HTTPError as e:
 245+ time.sleep(attempt*2)
 246+ attempt += 1
 247+
 248+
 249+
 250+ def getAdded(self, revId):
 251+ diff = self.getDiff(revId)
 252+
 253+ return self.unescape(
 254+ "\n".join(
 255+ match.group(1)
 256+ for match in WPAPI.DIFF_ADD_RE.finditer(diff)
 257+ )
 258+ )
 259+
 260+ def unescape(self, text):
 261+ def fixup(m):
 262+ text = m.group(0)
 263+ if text[:2] == "&#":
 264+ # character reference
 265+ try:
 266+ if text[:3] == "&#x":
 267+ return unichr(int(text[3:-1], 16))
 268+ else:
 269+ return unichr(int(text[2:-1]))
 270+ except ValueError:
 271+ pass
 272+ else:
 273+ # named entity
 274+ try:
 275+ text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 276+ except KeyError:
 277+ pass
 278+ return text # leave as is
 279+ return re.sub("&#?\w+;", fixup, text)
 280+
 281+
 282+if __name__ == "__main__":
 283+ main()
Index: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py
@@ -0,0 +1,52 @@
 2+import urllib2, urllib, json
 3+import time
 4+from cookielib import CookieJar
 5+
 6+class MWAPIError(Exception):
 7+ def __init__(self, code, message):
 8+ self.code = code
 9+ self.info = info
 10+
 11+ def __repr__(self):
 12+ return "%s(%s)" % (
 13+ self.__class__.__name__,
 14+ ", ".join(
 15+ repr(self.code),
 16+ repr(self.info)
 17+ )
 18+ )
 19+
 20+ def __str__(self):
 21+ return "%s: %s" % (self.code, self.info)
 22+
 23+class MWAPI:
 24+
 25+ def __init__(self, uri):
 26+ self.uri = uri
 27+ self.cookies = CookieJar()
 28+
 29+ def request(self, retry=0, **kwargs):
 30+ kwargs['format'] = "json"
 31+
 32+ request = urllib2.Request(
 33+ self.uri,
 34+ urllib.urlencode(kwargs)
 35+ )
 36+ self.cookies.add_cookie_header(request)
 37+
 38+ try:
 39+ response = urllib2.urlopen(request)
 40+ except urllib2.HTTPError:
 41+ #wait and try again
 42+ time.sleep(2**retry)
 43+ self.request(retry=retry+1, **kwargs)
 44+
 45+ self.cookies.extract_cookies(response, request)
 46+
 47+ js = json.load(response)
 48+
 49+ if 'error' in js:
 50+ raise MWAPIError(js['error']['code'], js['error']['info'])
 51+ else:
 52+ return js
 53+
Index: trunk/tools/wsor/message_templates/umetrics/util/__init__.py
@@ -0,0 +1 @@
 2+from .mw_api import MWAPI, MWAPIError

Status & tagging log