Index: trunk/tools/wsor/message_templates/message_postings.py |
— | — | @@ -1,282 +0,0 @@ |
2 | | -''' |
3 | | -This script connects to a mediawiki database and API to collect User_talk revisions |
4 | | -that match a set of patterns (and optionally, username). |
5 | | - |
6 | | -:Parameters: |
7 | | - Access the script's documentation for a parameter listing. |
8 | | - |
9 | | - % python message_postings.py --help |
10 | | - |
11 | | -:Output: |
12 | | - This script writes a set of escaped, tab separated columns to standard out. |
13 | | - - Recipient name - The name of the user who received the posting |
14 | | - - Timestamp - The time at which the posting was made |
15 | | - - Revision ID - The identifier of the revision matching the posting |
16 | | - - Poster ID - The identifier of the user who made the posting |
17 | | - - Poster name - The name of the user who make the posting |
18 | | - - Message match - The portion of the message posting that was matched by the regular expression. |
19 | | - |
20 | | -:Example: |
21 | | - python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1" |
22 | | -''' |
23 | | -import sys, argparse, os |
24 | | -import logging, types, re |
25 | | -import time, datetime |
26 | | -import MySQLdb, MySQLdb.cursors |
27 | | -import urllib, urllib2, json, htmlentitydefs |
28 | | -import wmf |
29 | | - |
30 | | -class MissingRevError(Exception):pass |
31 | | - |
32 | | -def encode(v): |
33 | | - if v == None: return "\N" |
34 | | - |
35 | | - if type(v) == types.LongType: v = int(v) |
36 | | - elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
37 | | - |
38 | | - return str(v).encode("string-escape") |
39 | | - |
40 | | -def emit(rev): |
41 | | - |
42 | | - print( |
43 | | - "\t".join( |
44 | | - encode(rev[c]) for c in [ |
45 | | - 'recipient_name', |
46 | | - 'rev_timestamp', |
47 | | - 'rev_id', |
48 | | - 'poster_id', |
49 | | - 'poster_name', |
50 | | - 'message_match' |
51 | | - ] |
52 | | - ) |
53 | | - ) |
54 | | - |
55 | | - |
56 | | -# MediaWiki Date format |
57 | | -# |
58 | | -# | year | month | day | hour | minute | second | |
59 | | -MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") |
60 | | -def mwDate(string): |
61 | | - if MW_DATE.match(string) == None: |
62 | | - raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) |
63 | | - else: |
64 | | - return string |
65 | | - |
66 | | -def main(): |
67 | | - parser = argparse.ArgumentParser( |
68 | | - description=""" |
69 | | - Gathers experimental message postings from user_talk messages. |
70 | | - """, |
71 | | - epilog=""" |
72 | | - python message_postings.py |
73 | | - -h db42 |
74 | | - --start=20111222000000 |
75 | | - --end=20111223000000 |
76 | | - --comment="\(\[\[WP:HG\|HG\]\]\)" |
77 | | - --message="Template:uw-vandalism1" |
78 | | - """, |
79 | | - conflict_handler="resolve" |
80 | | - ) |
81 | | - parser.add_argument( |
82 | | - '-c', '--cnf', |
83 | | - metavar="<path>", |
84 | | - type=str, |
85 | | - help='the path to MySQL config info (defaults to ~/.my.cnf)', |
86 | | - default=os.path.expanduser("~/.my.cnf") |
87 | | - ) |
88 | | - parser.add_argument( |
89 | | - '-h', '--host', |
90 | | - type=str, |
91 | | - help='the database host to connect to (defaults to localhost)', |
92 | | - default="localhost" |
93 | | - ) |
94 | | - parser.add_argument( |
95 | | - '-d', '--db', |
96 | | - type=str, |
97 | | - help='the language db to run the query in (defaults to enwiki)', |
98 | | - default="enwiki" |
99 | | - ) |
100 | | - parser.add_argument( |
101 | | - '-a', '--api_uri', |
102 | | - type=str, |
103 | | - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
104 | | - default="http://en.wikipedia.org/w/api.php" |
105 | | - ) |
106 | | - parser.add_argument( |
107 | | - '--start', |
108 | | - type=mwDate, |
109 | | - help='the start of the experimental period. (Required)', |
110 | | - required=True |
111 | | - ) |
112 | | - parser.add_argument( |
113 | | - '--end', |
114 | | - type=mwDate, |
115 | | - help='the end of the experimental period. (defaults to NOW())', |
116 | | - default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") |
117 | | - ) |
118 | | - parser.add_argument( |
119 | | - '--user_name', |
120 | | - type=str, |
121 | | - help='the user_name to further filter postings by (useful for tracking bots)' |
122 | | - ) |
123 | | - parser.add_argument( |
124 | | - '--comment', |
125 | | - type=re.compile, |
126 | | - help='regular expression to match against message posting comment' |
127 | | - ) |
128 | | - parser.add_argument( |
129 | | - '--message', |
130 | | - type=re.compile, |
131 | | - help='regular expression to match against message content (required)', |
132 | | - required=True |
133 | | - ) |
134 | | - args = parser.parse_args() |
135 | | - |
136 | | - LOGGING_STREAM = sys.stderr |
137 | | - logging.basicConfig( |
138 | | - level=logging.DEBUG, |
139 | | - stream=LOGGING_STREAM, |
140 | | - format='%(asctime)s %(levelname)-8s %(message)s', |
141 | | - datefmt='%b-%d %H:%M:%S' |
142 | | - ) |
143 | | - logging.debug("Comment pattern is %r." % args.comment.pattern) |
144 | | - logging.debug("Message pattern is %r." % args.message.pattern) |
145 | | - |
146 | | - logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
147 | | - db = Database( |
148 | | - host=args.host, |
149 | | - db=args.db, |
150 | | - read_default_file=args.cnf |
151 | | - ) |
152 | | - |
153 | | - logging.info("Connecting to API @ %s." % args.api_uri) |
154 | | - api = WPAPI(args.api_uri) |
155 | | - |
156 | | - logging.info("Querying for matching revisions:") |
157 | | - count = {"matched": 0, "missed": 0} |
158 | | - for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): |
159 | | - message = api.getAdded(rev['rev_id']) |
160 | | - match = args.message.search(message) |
161 | | - if match != None: |
162 | | - rev['message_match'] = match.group(0) |
163 | | - |
164 | | - emit(rev) |
165 | | - LOGGING_STREAM.write("|") |
166 | | - count['matched'] += 1 |
167 | | - else: |
168 | | - LOGGING_STREAM.write("o") |
169 | | - count['missed'] += 1 |
170 | | - |
171 | | - LOGGING_STREAM.write("\n") |
172 | | - logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count) |
173 | | - |
174 | | - |
175 | | - |
176 | | -class Database: |
177 | | - |
178 | | - def __init__(self, *args, **kwargs): |
179 | | - self.args = args |
180 | | - self.kwargs = kwargs |
181 | | - self.conn = MySQLdb.connect(*args, **kwargs) |
182 | | - |
183 | | - def getPostings(self, start, end, userName=None, commentRE=None): |
184 | | - if (userName, commentRE) == (None, None): |
185 | | - raise TypeError("Must specify at at least one of userName or commentRE.") |
186 | | - |
187 | | - cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
188 | | - query = """ |
189 | | - SELECT |
190 | | - r.rev_id, |
191 | | - r.rev_timestamp, |
192 | | - r.rev_comment, |
193 | | - r.rev_user AS poster_id, |
194 | | - r.rev_user_text AS poster_name, |
195 | | - REPLACE(p.page_title, "_", " ") AS recipient_name |
196 | | - FROM revision r |
197 | | - INNER JOIN page p ON r.rev_page = p.page_id |
198 | | - WHERE rev_timestamp BETWEEN %(start)s AND %(end)s |
199 | | - AND page_namespace = 3 |
200 | | - """ |
201 | | - if userName != None: |
202 | | - query += "AND rev_user_text = %(user_name)s\n" |
203 | | - if commentRE != None: |
204 | | - query += "AND rev_comment REGEXP %(comment_pattern)s\n" |
205 | | - |
206 | | - cursor.execute( |
207 | | - query, |
208 | | - { |
209 | | - 'start': start, |
210 | | - 'end': end, |
211 | | - 'user_name': userName, |
212 | | - 'comment_pattern': commentRE.pattern |
213 | | - } |
214 | | - ) |
215 | | - |
216 | | - for row in cursor: |
217 | | - yield row |
218 | | - |
219 | | - |
220 | | - |
221 | | -class WPAPI: |
222 | | - DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>') |
223 | | - |
224 | | - def __init__(self, uri): |
225 | | - self.uri = uri |
226 | | - |
227 | | - def getDiff(self, revId, retries=10): |
228 | | - attempt = 0 |
229 | | - while attempt < retries: |
230 | | - try: |
231 | | - response = urllib2.urlopen( |
232 | | - self.uri, |
233 | | - urllib.urlencode({ |
234 | | - 'action': 'query', |
235 | | - 'prop': 'revisions', |
236 | | - 'revids': revId, |
237 | | - 'rvprop': 'ids', |
238 | | - 'rvdiffto': 'prev', |
239 | | - 'format': 'json' |
240 | | - }) |
241 | | - ) |
242 | | - result = json.load(response) |
243 | | - return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
244 | | - except urllib2.HTTPError as e: |
245 | | - time.sleep(attempt*2) |
246 | | - attempt += 1 |
247 | | - |
248 | | - |
249 | | - |
250 | | - def getAdded(self, revId): |
251 | | - diff = self.getDiff(revId) |
252 | | - |
253 | | - return self.unescape( |
254 | | - "\n".join( |
255 | | - match.group(1) |
256 | | - for match in WPAPI.DIFF_ADD_RE.finditer(diff) |
257 | | - ) |
258 | | - ) |
259 | | - |
260 | | - def unescape(self, text): |
261 | | - def fixup(m): |
262 | | - text = m.group(0) |
263 | | - if text[:2] == "&#": |
264 | | - # character reference |
265 | | - try: |
266 | | - if text[:3] == "&#x": |
267 | | - return unichr(int(text[3:-1], 16)) |
268 | | - else: |
269 | | - return unichr(int(text[2:-1])) |
270 | | - except ValueError: |
271 | | - pass |
272 | | - else: |
273 | | - # named entity |
274 | | - try: |
275 | | - text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
276 | | - except KeyError: |
277 | | - pass |
278 | | - return text # leave as is |
279 | | - return re.sub("&#?\w+;", fixup, text) |
280 | | - |
281 | | - |
282 | | -if __name__ == "__main__": |
283 | | - main() |
Index: trunk/tools/wsor/message_templates/user_metrics.py |
— | — | @@ -1,102 +0,0 @@ |
2 | | -import sys, argparse, os |
3 | | -import logging, types |
4 | | -import MySQLdb, MySQLdb.cursors |
5 | | - |
6 | | -from generators import GENERATORS, Metrics, EditCounts |
7 | | - |
8 | | - |
9 | | -class MissingRevError(Exception):pass |
10 | | - |
11 | | -def encode(v): |
12 | | - if v == None: return "\N" |
13 | | - |
14 | | - if type(v) == types.LongType: v = int(v) |
15 | | - elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
16 | | - |
17 | | - return str(v).encode("string-escape") |
18 | | - |
19 | | - |
20 | | -def main(): |
21 | | - |
22 | | - parser = argparse.ArgumentParser( |
23 | | - description=""" |
24 | | - Gathers metrics for users around a timestamp. |
25 | | - """, |
26 | | - conflict_handler="resolve" |
27 | | - ) |
28 | | - parser.add_argument( |
29 | | - '-c', '--cnf', |
30 | | - metavar="<path>", |
31 | | - type=str, |
32 | | - help='the path to MySQL config info (defaults to ~/.my.cnf)', |
33 | | - default=os.path.expanduser("~/.my.cnf") |
34 | | - ) |
35 | | - parser.add_argument( |
36 | | - '-h', '--host', |
37 | | - type=str, |
38 | | - help='the database host to connect to (defaults to localhost)', |
39 | | - default="localhost" |
40 | | - ) |
41 | | - parser.add_argument( |
42 | | - '-d', '--db', |
43 | | - type=str, |
44 | | - help='the language db to run the query in (defaults to enwiki)', |
45 | | - default="enwiki" |
46 | | - ) |
47 | | - parser.add_argument( |
48 | | - '-a', '--api_uri', |
49 | | - type=str, |
50 | | - help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
51 | | - default="http://en.wikipedia.org/w/api.php" |
52 | | - ) |
53 | | - parser.add_argument( |
54 | | - 'generator', |
55 | | - type=lambda g: GENERATORS[g], |
56 | | - nargs="+", |
57 | | - help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys()) |
58 | | - ) |
59 | | - args = parser.parse_args() |
60 | | - |
61 | | - LOGGING_STREAM = sys.stderr |
62 | | - logging.basicConfig( |
63 | | - level=logging.DEBUG, |
64 | | - stream=LOGGING_STREAM, |
65 | | - format='%(asctime)s %(levelname)-8s %(message)s', |
66 | | - datefmt='%b-%d %H:%M:%S' |
67 | | - ) |
68 | | - |
69 | | - if sys.stdin.isatty(): |
70 | | - logging.error("No data piped to standard in!") |
71 | | - return |
72 | | - |
73 | | - |
74 | | - logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
75 | | - conn = MySQLdb.connect( |
76 | | - host=args.host, |
77 | | - db=args.db, |
78 | | - read_default_file=args.cnf, |
79 | | - cursorclass=MySQLdb.cursors.DictCursor |
80 | | - ) |
81 | | - |
82 | | - logging.info("Loading generators...") |
83 | | - metrics = Metrics(g(conn, args.api_uri) for g in args.generator) |
84 | | - print("\t".join(encode(h) for h in metrics.headers())) |
85 | | - |
86 | | - |
87 | | - logging.info("Processing users...") |
88 | | - for line in sys.stdin: |
89 | | - username, timestamp = line.strip().split("\t")[0:2] |
90 | | - username = unicode(username, 'utf-8') |
91 | | - |
92 | | - logging.debug("\t%s at %s:" % (username, timestamp)) |
93 | | - print("\t".join(encode(v) for v in metrics.values(username, timestamp))) |
94 | | - LOGGING_STREAM.write("o") |
95 | | - |
96 | | - LOGGING_STREAM.write("\n") |
97 | | - |
98 | | - |
99 | | - |
100 | | - |
101 | | - |
102 | | -if __name__ == "__main__": |
103 | | - main() |
Index: trunk/tools/wsor/message_templates/sql/test.sql |
— | — | @@ -1 +1,47 @@ |
| 2 | +( |
| 3 | + SELECT |
| 4 | + False as deleted, |
| 5 | + page_namespace as ns, |
| 6 | + count(*) as revisions |
| 7 | + FROM enwiki.revision |
| 8 | + INNER JOIN enwiki.page ON rev_page = page_id |
| 9 | + WHERE rev_timestamp <= "20110101000000" |
| 10 | + AND rev_user_text = "EpochFail" |
| 11 | + GROUP BY page_namespace |
| 12 | +) |
| 13 | +UNION |
| 14 | +( |
| 15 | + SELECT |
| 16 | + True as deleted, |
| 17 | + ar_namespace as ns, |
| 18 | + count(*) as revisions |
| 19 | + FROM enwiki.archive |
| 20 | + WHERE ar_timestamp <= "20110101000000" |
| 21 | + AND ar_user_text = "EpochFail" |
| 22 | + GROUP BY ar_namespace |
| 23 | +) |
2 | 24 | |
| 25 | +SELECT |
| 26 | + r.rev_id, |
| 27 | + r.rev_timestamp, |
| 28 | + r.rev_comment, |
| 29 | + r.rev_user AS poster_id, |
| 30 | + r.rev_user_text AS poster_name, |
| 31 | + REPLACE(p.page_title, "_", " ") AS recipient_name |
| 32 | +FROM revision r |
| 33 | +INNER JOIN page p ON r.rev_page = p.page_id |
| 34 | +WHERE rev_timestamp BETWEEN "20111222000000" AND "20111223000000" |
| 35 | +AND page_namespace = 3; |
| 36 | + |
| 37 | + |
| 38 | +SELECT |
| 39 | + IF(log_params LIKE "%indefinite%", "ban", "block") as type, |
| 40 | + IF(log_timestamp > "20110101000000", "after", "before") as tense, |
| 41 | + count(*) as count, |
| 42 | + min(log_timestamp) as first, |
| 43 | + max(log_timestamp) as last |
| 44 | +FROM logging |
| 45 | +WHERE log_type = "block" |
| 46 | +AND log_action = "block" |
| 47 | +AND log_title = "EpochFail" |
| 48 | +GROUP BY 1, 2; |
Index: trunk/tools/wsor/message_templates/sample_postings.tsv |
— | — | @@ -0,0 +1,122 @@ |
| 2 | +99.92.179.246 20111222000623 467110760 205121 Koavf Template:uw-vandalism1 |
| 3 | +206.51.176.121 20111222002717 467113472 7695475 Jim1138 Template:uw-vandalism1 |
| 4 | +96.255.37.252 20111222002944 467113720 7695475 Jim1138 Template:uw-vandalism1 |
| 5 | +88.207.182.230 20111222003427 467114243 7695475 Jim1138 Template:uw-vandalism1 |
| 6 | +75.28.52.111 20111222004219 467115072 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 7 | +69.142.218.3 20111222004409 467115300 7695475 Jim1138 Template:uw-vandalism1 |
| 8 | +65.96.250.23 20111222004727 467115720 7695475 Jim1138 Template:uw-vandalism1 |
| 9 | +69.158.17.157 20111222004835 467115846 7695475 Jim1138 Template:uw-vandalism1 |
| 10 | +99.226.152.51 20111222005322 467116380 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 11 | +50.90.114.25 20111222005336 467116401 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 12 | +24.218.185.51 20111222010055 467117261 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 13 | +99.60.53.154 20111222012001 467119343 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 14 | +174.57.17.84 20111222012338 467119703 8371165 AbigailAbernathy Template:uw-vandalism1 |
| 15 | +PMPC 20111222012759 467120150 7695475 Jim1138 Template:uw-vandalism1 |
| 16 | +109.77.113.235 20111222012830 467120200 7695475 Jim1138 Template:uw-vandalism1 |
| 17 | +108.130.64.2 20111222012855 467120243 7695475 Jim1138 Template:uw-vandalism1 |
| 18 | +173.74.248.50 20111222013909 467121294 7695475 Jim1138 Template:uw-vandalism1 |
| 19 | +218.24.165.201 20111222014206 467121631 7695475 Jim1138 Template:uw-vandalism1 |
| 20 | +108.48.90.203 20111222014414 467121875 7695475 Jim1138 Template:uw-vandalism1 |
| 21 | +183.177.191.220 20111222015126 467122559 7695475 Jim1138 Template:uw-vandalism1 |
| 22 | +66.168.26.250 20111222023031 467125938 15020596 Mark Arsten Template:uw-vandalism1 |
| 23 | +68.98.168.116 20111222023202 467126066 7821268 RandomAct Template:uw-vandalism1 |
| 24 | +72.184.167.111 20111222023634 467126522 15020596 Mark Arsten Template:uw-vandalism1 |
| 25 | +82.11.48.151 20111222023654 467126555 15020596 Mark Arsten Template:uw-vandalism1 |
| 26 | +76.69.9.18 20111222023843 467126740 15020596 Mark Arsten Template:uw-vandalism1 |
| 27 | +76.179.88.134 20111222024014 467126906 7821268 RandomAct Template:uw-vandalism1 |
| 28 | +223.228.169.111 20111222024511 467127446 15020596 Mark Arsten Template:uw-vandalism1 |
| 29 | +70.73.15.114 20111222024531 467127488 15020596 Mark Arsten Template:uw-vandalism1 |
| 30 | +115.240.209.58 20111222024600 467127532 15020596 Mark Arsten Template:uw-vandalism1 |
| 31 | +198.151.130.54 20111222024729 467127691 7821268 RandomAct Template:uw-vandalism1 |
| 32 | +184.37.78.85 20111222024756 467127733 15020596 Mark Arsten Template:uw-vandalism1 |
| 33 | +112.210.239.1 20111222025209 467128144 15020596 Mark Arsten Template:uw-vandalism1 |
| 34 | +71.227.110.218 20111222025853 467128908 7821268 RandomAct Template:uw-vandalism1 |
| 35 | +70.127.78.61 20111222030015 467129038 15020596 Mark Arsten Template:uw-vandalism1 |
| 36 | +50.9.34.247 20111222030027 467129069 15020596 Mark Arsten Template:uw-vandalism1 |
| 37 | +68.1.183.212 20111222030559 467129619 15020596 Mark Arsten Template:uw-vandalism1 |
| 38 | +67.128.239.58 20111222030620 467129655 15020596 Mark Arsten Template:uw-vandalism1 |
| 39 | +Carenblake 20111222030640 467129689 15020596 Mark Arsten Template:uw-vandalism1 |
| 40 | +2.101.130.254 20111222030856 467129927 15020596 Mark Arsten Template:uw-vandalism1 |
| 41 | +68.68.187.69 20111222031254 467130311 7695475 Jim1138 Template:uw-vandalism1 |
| 42 | +75.86.201.236 20111222031313 467130337 15020596 Mark Arsten Template:uw-vandalism1 |
| 43 | +74.132.43.218 20111222031420 467130443 15020596 Mark Arsten Template:uw-vandalism1 |
| 44 | +142.68.160.131 20111222031625 467130652 15020596 Mark Arsten Template:uw-vandalism1 |
| 45 | +Blueturtle2 20111222031752 467130815 7695475 Jim1138 Template:uw-vandalism1 |
| 46 | +69.153.186.30 20111222032159 467131245 15020596 Mark Arsten Template:uw-vandalism1 |
| 47 | +24.191.10.180 20111222032237 467131326 15020596 Mark Arsten Template:uw-vandalism1 |
| 48 | +89.124.240.80 20111222032924 467132037 15020596 Mark Arsten Template:uw-vandalism1 |
| 49 | +76.202.230.190 20111222033021 467132144 7695475 Jim1138 Template:uw-vandalism1 |
| 50 | +98.154.111.227 20111222033157 467132313 15020596 Mark Arsten Template:uw-vandalism1 |
| 51 | +99.119.25.29 20111222033215 467132346 15020596 Mark Arsten Template:uw-vandalism1 |
| 52 | +72.67.11.201 20111222033240 467132395 15020596 Mark Arsten Template:uw-vandalism1 |
| 53 | +108.84.217.178 20111222033339 467132499 15020596 Mark Arsten Template:uw-vandalism1 |
| 54 | +98.254.245.97 20111222033347 467132515 15020596 Mark Arsten Template:uw-vandalism1 |
| 55 | +116.68.248.117 20111222033402 467132545 7695475 Jim1138 Template:uw-vandalism1 |
| 56 | +68.48.81.29 20111222033407 467132557 15020596 Mark Arsten Template:uw-vandalism1 |
| 57 | +69.14.32.169 20111222033431 467132594 15020596 Mark Arsten Template:uw-vandalism1 |
| 58 | +76.172.11.143 20111222033513 467132652 15020596 Mark Arsten Template:uw-vandalism1 |
| 59 | +207.255.163.58 20111222033517 467132663 7695475 Jim1138 Template:uw-vandalism1 |
| 60 | +82.37.109.26 20111222033535 467132688 15020596 Mark Arsten Template:uw-vandalism1 |
| 61 | +24.1.86.54 20111222033637 467132794 15020596 Mark Arsten Template:uw-vandalism1 |
| 62 | +68.229.166.67 20111222033749 467132911 15020596 Mark Arsten Template:uw-vandalism1 |
| 63 | +68.197.139.18 20111222034204 467133304 7695475 Jim1138 Template:uw-vandalism1 |
| 64 | +67.241.26.211 20111222034557 467133679 15020596 Mark Arsten Template:uw-vandalism1 |
| 65 | +71.72.129.150 20111222034630 467133724 7695475 Jim1138 Template:uw-vandalism1 |
| 66 | +99.112.124.88 20111222034726 467133802 7695475 Jim1138 Template:uw-vandalism1 |
| 67 | +4.254.81.152 20111222034915 467133967 7695475 Jim1138 Template:uw-vandalism1 |
| 68 | +71.191.34.185 20111222035000 467134028 15020596 Mark Arsten Template:uw-vandalism1 |
| 69 | +108.132.160.105 20111222035215 467134272 15020596 Mark Arsten Template:uw-vandalism1 |
| 70 | +174.113.229.186 20111222035227 467134295 15020596 Mark Arsten Template:uw-vandalism1 |
| 71 | +86.19.242.120 20111222035300 467134348 15020596 Mark Arsten Template:uw-vandalism1 |
| 72 | +173.176.118.87 20111222035508 467134517 15020596 Mark Arsten Template:uw-vandalism1 |
| 73 | +114.142.166.229 20111222035601 467134589 15020596 Mark Arsten Template:uw-vandalism1 |
| 74 | +76.25.211.198 20111222035646 467134656 7695475 Jim1138 Template:uw-vandalism1 |
| 75 | +68.3.112.168 20111222035745 467134737 7695475 Jim1138 Template:uw-vandalism1 |
| 76 | +67.20.133.136 20111222042426 467136925 7695475 Jim1138 Template:uw-vandalism1 |
| 77 | +68.5.93.197 20111222042635 467137125 7695475 Jim1138 Template:uw-vandalism1 |
| 78 | +50.135.29.111 20111222042958 467137392 7695475 Jim1138 Template:uw-vandalism1 |
| 79 | +99.91.215.6 20111222043424 467137772 7695475 Jim1138 Template:uw-vandalism1 |
| 80 | +Eastderp1 20111222044200 467138536 7695475 Jim1138 Template:uw-vandalism1 |
| 81 | +173.26.101.78 20111222044530 467138890 7695475 Jim1138 Template:uw-vandalism1 |
| 82 | +80.194.26.184 20111222050300 467140546 7695475 Jim1138 Template:uw-vandalism1 |
| 83 | +69.230.195.135 20111222050355 467140632 7695475 Jim1138 Template:uw-vandalism1 |
| 84 | +Violinmagician 20111222050359 467140640 7695475 Jim1138 Template:uw-vandalism1 |
| 85 | +76.166.147.85 20111222050726 467140963 7695475 Jim1138 Template:uw-vandalism1 |
| 86 | +72.229.150.7 20111222050921 467141171 7695475 Jim1138 Template:uw-vandalism1 |
| 87 | +96.250.109.41 20111222051253 467141520 7695475 Jim1138 Template:uw-vandalism1 |
| 88 | +70.249.216.200 20111222051603 467141788 7695475 Jim1138 Template:uw-vandalism1 |
| 89 | +76.114.237.52 20111222051804 467142013 7695475 Jim1138 Template:uw-vandalism1 |
| 90 | +99.231.38.15 20111222052155 467142435 7695475 Jim1138 Template:uw-vandalism1 |
| 91 | +70.232.36.209 20111222052236 467142502 58193 OverlordQ Template:uw-vandalism1 |
| 92 | +70.171.84.236 20111222055646 467145424 7695475 Jim1138 Template:uw-vandalism1 |
| 93 | +117.229.124.35 20111222060552 467146249 7695475 Jim1138 Template:uw-vandalism1 |
| 94 | +92.8.86.211 20111222061442 467147022 7695475 Jim1138 Template:uw-vandalism1 |
| 95 | +Jonnygharris 20111222063512 467148666 2359527 Tgeairn Template:uw-vandalism1 |
| 96 | +68.32.26.30 20111222064442 467149456 7695475 Jim1138 Template:uw-vandalism1 |
| 97 | +122.172.47.49 20111222064608 467149587 7695475 Jim1138 Template:uw-vandalism1 |
| 98 | +68.67.110.139 20111222064632 467149618 2359527 Tgeairn Template:uw-vandalism1 |
| 99 | +58.68.46.210 20111222070300 467150889 2359527 Tgeairn Template:uw-vandalism1 |
| 100 | +120.56.171.37 20111222070459 467151018 7695475 Jim1138 Template:uw-vandalism1 |
| 101 | +24.185.100.176 20111222070521 467151047 2359527 Tgeairn Template:uw-vandalism1 |
| 102 | +94.76.32.252 20111222070626 467151122 7695475 Jim1138 Template:uw-vandalism1 |
| 103 | +71.2.35.65 20111222070954 467151442 2359527 Tgeairn Template:uw-vandalism1 |
| 104 | +203.52.228.196 20111222071602 467152062 7695475 Jim1138 Template:uw-vandalism1 |
| 105 | +124.84.101.186 20111222072924 467153017 2359527 Tgeairn Template:uw-vandalism1 |
| 106 | +74.120.224.200 20111222073655 467153546 7695475 Jim1138 Template:uw-vandalism1 |
| 107 | +70.230.154.187 20111222074036 467153814 7695475 Jim1138 Template:uw-vandalism1 |
| 108 | +69.226.149.86 20111222074434 467154103 7695475 Jim1138 Template:uw-vandalism1 |
| 109 | +75.131.131.219 20111222075957 467155209 7695475 Jim1138 Template:uw-vandalism1 |
| 110 | +49.245.133.171 20111222080029 467155275 7695475 Jim1138 Template:uw-vandalism1 |
| 111 | +152.118.24.10 20111222080106 467155326 7695475 Jim1138 Template:uw-vandalism1 |
| 112 | +99.224.122.205 20111222081721 467156446 7695475 Jim1138 Template:uw-vandalism1 |
| 113 | +198.240.133.75 20111222082450 467156945 7695475 Jim1138 Template:uw-vandalism1 |
| 114 | +110.77.227.221 20111222083046 467157456 7695475 Jim1138 Template:uw-vandalism1 |
| 115 | +2.188.4.3 20111222083310 467157657 7695475 Jim1138 Template:uw-vandalism1 |
| 116 | +145.103.249.34 20111222084352 467158487 7695475 Jim1138 Template:uw-vandalism1 |
| 117 | +50.64.10.116 20111222084516 467158585 7695475 Jim1138 Template:uw-vandalism1 |
| 118 | +76.67.17.205 20111222085448 467159343 7695475 Jim1138 Template:uw-vandalism1 |
| 119 | +120.144.129.107 20111222092827 467162126 7695475 Jim1138 Template:uw-vandalism1 |
| 120 | +14.139.243.229 20111222093150 467162401 7695475 Jim1138 Template:uw-vandalism1 |
| 121 | +180.149.52.45 20111222093649 467162851 7695475 Jim1138 Template:uw-vandalism1 |
| 122 | +24.191.25.201 20111222094616 467163617 7695475 Jim1138 Template:uw-vandalism1 |
| 123 | +71.227.10.224 20111222095309 467164127 7695475 Jim1138 Template:uw-vandalism1 |
Index: trunk/tools/wsor/message_templates/metrics |
— | — | @@ -0,0 +1,2 @@ |
| 2 | +#!/usr/bin/env python |
| 3 | +from umetrics.metrics import main;main() |
Property changes on: trunk/tools/wsor/message_templates/metrics |
___________________________________________________________________ |
Added: svn:executable |
1 | 4 | + * |
Index: trunk/tools/wsor/message_templates/postings |
— | — | @@ -0,0 +1,2 @@ |
| 2 | +#!/usr/bin/env python |
| 3 | +from umetrics.postings import main;main() |
Property changes on: trunk/tools/wsor/message_templates/postings |
___________________________________________________________________ |
Added: svn:executable |
1 | 4 | + * |
Index: trunk/tools/wsor/message_templates/umetrics/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py |
— | — | @@ -0,0 +1,97 @@ |
| 2 | +import itertools, wmf, difflib, re |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
| 5 | + |
| 6 | +class Warnings(MetricGenerator): |
| 7 | + |
| 8 | + WARN_RE = re.compile(r'<!--\s*Template:uw-') |
| 9 | + |
| 10 | + def __init__(self, conn, api): |
| 11 | + self.conn = conn |
| 12 | + self.api = api |
| 13 | + |
| 14 | + def headers(self): |
| 15 | + return [ |
| 16 | + 'warns_before', |
| 17 | + 'warns_after', |
| 18 | + 'first_warn_before', |
| 19 | + 'last_warn_before', |
| 20 | + 'first_warn_after', |
| 21 | + 'last_warn_after' |
| 22 | + ] |
| 23 | + |
| 24 | + def values(self, username, timestamp): |
| 25 | + rowValues = { |
| 26 | + 'warns_before': 0, |
| 27 | + 'warns_after': 0 |
| 28 | + } |
| 29 | + |
| 30 | + timestamp = wmf.wp2Timestamp(timestamp) |
| 31 | + |
| 32 | + for rev in self.getProcessedRevs(username): |
| 33 | + #determine if we have a warning |
| 34 | + if self.WARN_RE.search(rev['added']) != None: |
| 35 | + if rev['timestamp'] < timestamp: |
| 36 | + whence = "before" |
| 37 | + elif rev['timestamp'] > timestamp: |
| 38 | + whence = "after" |
| 39 | + else: |
| 40 | + continue |
| 41 | + |
| 42 | + rowValues['warns_%s' % whence] += 1 |
| 43 | + |
| 44 | + if 'first_warn_%s' % whence not in rowValues: |
| 45 | + rowValues['first_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp']) |
| 46 | + |
| 47 | + rowValues['last_warn_%s' % whence] = wmf.timestamp2WP(rev['timestamp']) |
| 48 | + |
| 49 | + return [rowValues.get(c) for c in self.headers()] |
| 50 | + |
| 51 | + def getProcessedRevs(self, username): |
| 52 | + return self.processRevs(self.getUserPageRevisions(username)) |
| 53 | + |
| 54 | + def getUserPageRevisions(self, username, rvcontinue=None): |
| 55 | + js = self.api.request( |
| 56 | + action="query", |
| 57 | + prop="revisions", |
| 58 | + titles="User_talk:%s" % username, |
| 59 | + rvprop="ids|timestamp|content", |
| 60 | + rvdir="newer", |
| 61 | + rvlimit=50, |
| 62 | + rvcontinue=rvcontinue |
| 63 | + ) |
| 64 | + |
| 65 | + for rev in js['query']['pages'].values()[0]['revisions']: |
| 66 | + rev['timestamp'] |
| 67 | + yield rev |
| 68 | + |
| 69 | + if 'query-continue' in js: |
| 70 | + for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']['rvstartid']): |
| 71 | + yield rev |
| 72 | + |
| 73 | + |
| 74 | + |
| 75 | + def processRevs(self, revs): |
| 76 | + |
| 77 | + previousLines = [] |
| 78 | + for rev in revs: |
| 79 | + lines = rev.get('*', "").split("\n") |
| 80 | + del rev['*'] |
| 81 | + |
| 82 | + added = [] |
| 83 | + sm = difflib.SequenceMatcher(None, previousLines, lines) |
| 84 | + for tag, i1, i2, j1, j2 in sm.get_opcodes(): |
| 85 | + if tag == "insert": |
| 86 | + added.extend(lines[j1:j2]) |
| 87 | + elif tag == "replace": |
| 88 | + added.extend(lines[j1:j2]) |
| 89 | + |
| 90 | + |
| 91 | + rev['added'] = "\n".join(added) |
| 92 | + rev['timestamp'] = wmf.wp2Timestamp(rev['timestamp']) |
| 93 | + yield rev |
| 94 | + previousLines = lines |
| 95 | + |
| 96 | + |
| 97 | + |
| 98 | + |
Index: trunk/tools/wsor/message_templates/umetrics/generators/metric_generator.py |
— | — | @@ -0,0 +1,4 @@ |
| 2 | +class MetricGenerator: |
| 3 | + def __init__(self): pass |
| 4 | + def headers(self): raise NotImplementedError() |
| 5 | + def values(self, username, timestamp): raise NotImplementedError() |
Index: trunk/tools/wsor/message_templates/umetrics/generators/talk.py |
— | — | @@ -0,0 +1,51 @@ |
| 2 | +import itertools |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
| 5 | +class Talk(MetricGenerator): |
| 6 | + |
| 7 | + def __init__(self, conn, api): |
| 8 | + self.conn = conn |
| 9 | + |
| 10 | + def headers(self): |
| 11 | + return [ |
| 12 | + 'other_talk_before', |
| 13 | + 'first_other_talk_before', |
| 14 | + 'last_other_talk_before', |
| 15 | + 'other_talk_after', |
| 16 | + 'first_other_talk_after', |
| 17 | + 'last_other_talk_after', |
| 18 | + ] |
| 19 | + |
| 20 | + def values(self, username, timestamp): |
| 21 | + rowValues = {} |
| 22 | + |
| 23 | + cursor = self.conn.cursor() |
| 24 | + cursor.execute(""" |
| 25 | + SELECT |
| 26 | + IF(rev_timestamp > %(timestamp)s, "after", "before") as whense, |
| 27 | + COUNT(*) as count, |
| 28 | + MAX(rev_timestamp) as last, |
| 29 | + MIN(rev_timestamp) as first |
| 30 | + FROM revision |
| 31 | + INNER JOIN page ON rev_page = page_id |
| 32 | + WHERE page_namespace = 3 |
| 33 | + AND rev_timestamp != %(timestamp)s |
| 34 | + AND page_title = %(page_title)s |
| 35 | + AND rev_user_text != %(username)s |
| 36 | + GROUP BY 1 |
| 37 | + """, |
| 38 | + { |
| 39 | + 'timestamp': timestamp, |
| 40 | + 'page_title': username.encode('utf-8').replace(" ", "_"), |
| 41 | + 'username': username.encode('utf-8') |
| 42 | + } |
| 43 | + ) |
| 44 | + for row in cursor: |
| 45 | + rowValues['other_talk_%(whence)s'] = row['count'] |
| 46 | + rowValues['first_other_talk_%(whence)s'] = row['first'] |
| 47 | + rowValues['last_other_talk_%(whence)s'] = row['last'] |
| 48 | + |
| 49 | + rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0) |
| 50 | + rowValues['other_talk_after'] = rowValues.get('other_talk_after', 0) |
| 51 | + |
| 52 | + return [rowValues.get(c) for c in self.headers()] |
Index: trunk/tools/wsor/message_templates/umetrics/generators/blocks.py |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +import itertools |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
| 5 | +class Blocks(MetricGenerator): |
| 6 | + |
| 7 | + def __init__(self, conn, api): |
| 8 | + self.conn = conn |
| 9 | + |
| 10 | + def headers(self): |
| 11 | + return [ |
| 12 | + 'blocks_before', |
| 13 | + 'blocks_after', |
| 14 | + 'first_block_before', |
| 15 | + 'last_block_before', |
| 16 | + 'first_block_after', |
| 17 | + 'last_block_after', |
| 18 | + 'bans_before', |
| 19 | + 'bans_after', |
| 20 | + 'first_ban_before', |
| 21 | + 'last_ban_before', |
| 22 | + 'first_ban_after', |
| 23 | + 'last_ban_after' |
| 24 | + ] |
| 25 | + |
| 26 | + def values(self, username, timestamp): |
| 27 | + rowValues = {} |
| 28 | + |
| 29 | + cursor = self.conn.cursor() |
| 30 | + cursor.execute(""" |
| 31 | + SELECT |
| 32 | + IF(log_params LIKE "%%indefinite%%", "ban", "block") as type, |
| 33 | + IF(log_timestamp > %(timestamp)s, "after", "before") as whense, |
| 34 | + count(*) as count, |
| 35 | + min(log_timestamp) as first, |
| 36 | + max(log_timestamp) as last |
| 37 | + FROM logging |
| 38 | + WHERE log_type = "block" |
| 39 | + AND log_action = "block" |
| 40 | + AND log_title = %(username)s |
| 41 | + GROUP BY 1, 2 |
| 42 | + """, |
| 43 | + { |
| 44 | + 'timestamp': timestamp, |
| 45 | + 'username': username.encode('utf-8').replace(" ", "_") |
| 46 | + } |
| 47 | + ) |
| 48 | + for row in cursor: |
| 49 | + rowValues['%(type)ss_%(whense)s' % row] = row['count'] |
| 50 | + rowValues['first_%(type)s_%(whense)s' % row] = row['first'] |
| 51 | + rowValues['last_%(type)s_%(whense)s' % row] = row['last'] |
| 52 | + |
| 53 | + rowValues['blocks_before'] = rowValues.get('blocks_before', 0) |
| 54 | + rowValues['blocks_after'] = rowValues.get('blocks_after', 0) |
| 55 | + rowValues['bans_before'] = rowValues.get('bans_before', 0) |
| 56 | + rowValues['bans_after'] = rowValues.get('bans_after', 0) |
| 57 | + |
| 58 | + return [rowValues.get(c) for c in self.headers()] |
Index: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py |
— | — | @@ -0,0 +1,31 @@ |
| 2 | +from .edit_counts import EditCounts |
| 3 | +from .talk import Talk |
| 4 | +from .blocks import Blocks |
| 5 | +from .warnings import Warnings |
| 6 | +from .metric_generator import MetricGenerator |
| 7 | + |
| 8 | +GENERATORS = { |
| 9 | + 'editcounts': EditCounts, |
| 10 | + 'talk': Talk, |
| 11 | + 'blocks': Blocks, |
| 12 | + 'warnings': Warnings |
| 13 | +} |
| 14 | + |
| 15 | +class Metrics(MetricGenerator): |
| 16 | + |
| 17 | + def __init__(self, generators): |
| 18 | + self.generators = list(generators) |
| 19 | + |
| 20 | + def headers(self): |
| 21 | + row = ['username', 'timestamp'] |
| 22 | + for generator in self.generators: |
| 23 | + row.extend(generator.headers()) |
| 24 | + |
| 25 | + return row |
| 26 | + |
| 27 | + def values(self, username, timestamp): |
| 28 | + row = [username, timestamp] |
| 29 | + for generator in self.generators: |
| 30 | + row.extend(generator.values(username, timestamp)) |
| 31 | + |
| 32 | + return row |
Index: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py |
— | — | @@ -0,0 +1,59 @@ |
| 2 | +import itertools |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
| 5 | +class EditCounts(MetricGenerator): |
| 6 | + |
| 7 | + def __init__(self, conn, api): |
| 8 | + self.conn = conn |
| 9 | + |
| 10 | + def headers(self): |
| 11 | + return itertools.chain(*[ |
| 12 | + [ |
| 13 | + 'ns_%s_revisions_deleted_before' % ns, |
| 14 | + 'ns_%s_revisions_deleted_after' % ns, |
| 15 | + 'ns_%s_revisions_not_deleted_before' % ns, |
| 16 | + 'ns_%s_revisions_not_deleted_after' % ns |
| 17 | + ] |
| 18 | + for ns in itertools.chain(range(0,16), [100, 101, 108, 109]) |
| 19 | + ]) |
| 20 | + |
| 21 | + def values(self, username, timestamp): |
| 22 | + rowData = {} |
| 23 | + |
| 24 | + cursor = self.conn.cursor() |
| 25 | + cursor.execute(""" |
| 26 | + ( |
| 27 | + SELECT |
| 28 | + False as deleted, |
| 29 | + page_namespace as ns, |
| 30 | + count(*) as revisions |
| 31 | + FROM enwiki.revision |
| 32 | + INNER JOIN enwiki.page ON rev_page = page_id |
| 33 | + WHERE rev_timestamp <= %(timestamp)s |
| 34 | + AND rev_user_text = %(username)s |
| 35 | + GROUP BY page_namespace |
| 36 | + ) |
| 37 | + UNION ( |
| 38 | + SELECT |
| 39 | + True as deleted, |
| 40 | + ar_namespace as ns, |
| 41 | + count(*) as revisions |
| 42 | + FROM enwiki.archive |
| 43 | + WHERE ar_timestamp <= %(timestamp)s |
| 44 | + AND ar_user_text = %(username)s |
| 45 | + GROUP BY ar_namespace |
| 46 | + )""", |
| 47 | + { |
| 48 | + 'timestamp': timestamp, |
| 49 | + 'username': username.encode('utf-8') |
| 50 | + } |
| 51 | + ) |
| 52 | + for row in cursor: |
| 53 | + if(row['deleted']): |
| 54 | + deleted = "deleted" |
| 55 | + else: |
| 56 | + deleted = "not_deleted" |
| 57 | + |
| 58 | + rowData['ns_%s_before_revisions_%s' % (row['ns'], deleted)] = row['revisions'] |
| 59 | + |
| 60 | + return [rowData.get(c, 0) for c in self.headers()] |
Index: trunk/tools/wsor/message_templates/umetrics/metrics.py |
— | — | @@ -0,0 +1,100 @@ |
| 2 | +import sys, argparse, os |
| 3 | +import logging, types |
| 4 | +import MySQLdb, MySQLdb.cursors |
| 5 | + |
| 6 | +from .generators import GENERATORS, Metrics |
| 7 | +from .util import MWAPI, MWAPIError |
| 8 | + |
| 9 | +def encode(v): |
| 10 | + if v == None: return "\N" |
| 11 | + |
| 12 | + if type(v) == types.LongType: v = int(v) |
| 13 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 14 | + |
| 15 | + return str(v).encode("string-escape") |
| 16 | + |
| 17 | + |
| 18 | +def main(): |
| 19 | + |
| 20 | + parser = argparse.ArgumentParser( |
| 21 | + description=""" |
| 22 | + Gathers metrics for users around a timestamp. |
| 23 | + """, |
| 24 | + conflict_handler="resolve" |
| 25 | + ) |
| 26 | + parser.add_argument( |
| 27 | + '-c', '--cnf', |
| 28 | + metavar="<path>", |
| 29 | + type=str, |
| 30 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 31 | + default=os.path.expanduser("~/.my.cnf") |
| 32 | + ) |
| 33 | + parser.add_argument( |
| 34 | + '-h', '--host', |
| 35 | + type=str, |
| 36 | + help='the database host to connect to (defaults to localhost)', |
| 37 | + default="localhost" |
| 38 | + ) |
| 39 | + parser.add_argument( |
| 40 | + '-d', '--db', |
| 41 | + type=str, |
| 42 | + help='the language db to run the query in (defaults to enwiki)', |
| 43 | + default="enwiki" |
| 44 | + ) |
| 45 | + parser.add_argument( |
| 46 | + '-a', '--api', |
| 47 | + type=MWAPI, |
| 48 | + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
| 49 | + default="http://en.wikipedia.org/w/api.php" |
| 50 | + ) |
| 51 | + parser.add_argument( |
| 52 | + 'generator', |
| 53 | + type=lambda g: GENERATORS[g], |
| 54 | + nargs="+", |
| 55 | + help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys()) |
| 56 | + ) |
| 57 | + args = parser.parse_args() |
| 58 | + |
| 59 | + LOGGING_STREAM = sys.stderr |
| 60 | + logging.basicConfig( |
| 61 | + level=logging.DEBUG, |
| 62 | + stream=LOGGING_STREAM, |
| 63 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 64 | + datefmt='%b-%d %H:%M:%S' |
| 65 | + ) |
| 66 | + |
| 67 | + if sys.stdin.isatty(): |
| 68 | + logging.error("No data piped to standard in!") |
| 69 | + return |
| 70 | + |
| 71 | + |
| 72 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 73 | + conn = MySQLdb.connect( |
| 74 | + host=args.host, |
| 75 | + db=args.db, |
| 76 | + read_default_file=args.cnf, |
| 77 | + cursorclass=MySQLdb.cursors.DictCursor |
| 78 | + ) |
| 79 | + |
| 80 | + logging.info("Loading generators...") |
| 81 | + metrics = Metrics(g(conn, args.api) for g in args.generator) |
| 82 | + print("\t".join(encode(h) for h in metrics.headers())) |
| 83 | + |
| 84 | + |
| 85 | + logging.info("Processing users...") |
| 86 | + for line in sys.stdin: |
| 87 | + username, timestamp = line.strip().split("\t")[0:2] |
| 88 | + username = unicode(username, 'utf-8') |
| 89 | + |
| 90 | + logging.debug("\t%s at %s:" % (username, timestamp)) |
| 91 | + print("\t".join(encode(v) for v in metrics.values(username, timestamp))) |
| 92 | + LOGGING_STREAM.write("o") |
| 93 | + |
| 94 | + LOGGING_STREAM.write("\n") |
| 95 | + |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | +if __name__ == "__main__": |
| 101 | + main() |
Index: trunk/tools/wsor/message_templates/umetrics/postings.py |
— | — | @@ -0,0 +1,282 @@ |
| 2 | +''' |
| 3 | +This script connects to a mediawiki database and API to collect User_talk revisions |
| 4 | +that match a set of patterns (and optionally, username). |
| 5 | + |
| 6 | +:Parameters: |
| 7 | + Access the script's documentation for a parameter listing. |
| 8 | + |
| 9 | + % python message_postings.py --help |
| 10 | + |
| 11 | +:Output: |
| 12 | + This script writes a set of escaped, tab separated columns to standard out. |
| 13 | + - Recipient name - The name of the user who received the posting |
| 14 | + - Timestamp - The time at which the posting was made |
| 15 | + - Revision ID - The identifier of the revision matching the posting |
| 16 | + - Poster ID - The identifier of the user who made the posting |
| 17 | + - Poster name - The name of the user who make the posting |
| 18 | + - Message match - The portion of the message posting that was matched by the regular expression. |
| 19 | + |
| 20 | +:Example: |
| 21 | + python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1" |
| 22 | +''' |
| 23 | +import sys, argparse, os |
| 24 | +import logging, types, re |
| 25 | +import time, datetime |
| 26 | +import MySQLdb, MySQLdb.cursors |
| 27 | +import urllib, urllib2, json, htmlentitydefs |
| 28 | +import wmf |
| 29 | + |
| 30 | +class MissingRevError(Exception):pass |
| 31 | + |
| 32 | +def encode(v): |
| 33 | + if v == None: return "\N" |
| 34 | + |
| 35 | + if type(v) == types.LongType: v = int(v) |
| 36 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 37 | + |
| 38 | + return str(v).encode("string-escape") |
| 39 | + |
| 40 | +def emit(rev): |
| 41 | + |
| 42 | + print( |
| 43 | + "\t".join( |
| 44 | + encode(rev[c]) for c in [ |
| 45 | + 'recipient_name', |
| 46 | + 'rev_timestamp', |
| 47 | + 'rev_id', |
| 48 | + 'poster_id', |
| 49 | + 'poster_name', |
| 50 | + 'message_match' |
| 51 | + ] |
| 52 | + ) |
| 53 | + ) |
| 54 | + |
| 55 | + |
| 56 | +# MediaWiki Date format |
| 57 | +# |
| 58 | +# | year | month | day | hour | minute | second | |
| 59 | +MW_DATE = re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$") |
| 60 | +def mwDate(string): |
| 61 | + if MW_DATE.match(string) == None: |
| 62 | + raise ValueError("%r is not a valid date. Expected YYMMDDHHmmSS" % string) |
| 63 | + else: |
| 64 | + return string |
| 65 | + |
| 66 | +def main(): |
| 67 | + parser = argparse.ArgumentParser( |
| 68 | + description=""" |
| 69 | + Gathers experimental message postings from user_talk messages. |
| 70 | + """, |
| 71 | + epilog=""" |
| 72 | + python message_postings.py |
| 73 | + -h db42 |
| 74 | + --start=20111222000000 |
| 75 | + --end=20111223000000 |
| 76 | + --comment="\(\[\[WP:HG\|HG\]\]\)" |
| 77 | + --message="Template:uw-vandalism1" |
| 78 | + """, |
| 79 | + conflict_handler="resolve" |
| 80 | + ) |
| 81 | + parser.add_argument( |
| 82 | + '-c', '--cnf', |
| 83 | + metavar="<path>", |
| 84 | + type=str, |
| 85 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 86 | + default=os.path.expanduser("~/.my.cnf") |
| 87 | + ) |
| 88 | + parser.add_argument( |
| 89 | + '-h', '--host', |
| 90 | + type=str, |
| 91 | + help='the database host to connect to (defaults to localhost)', |
| 92 | + default="localhost" |
| 93 | + ) |
| 94 | + parser.add_argument( |
| 95 | + '-d', '--db', |
| 96 | + type=str, |
| 97 | + help='the language db to run the query in (defaults to enwiki)', |
| 98 | + default="enwiki" |
| 99 | + ) |
| 100 | + parser.add_argument( |
| 101 | + '-a', '--api_uri', |
| 102 | + type=str, |
| 103 | + help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
| 104 | + default="http://en.wikipedia.org/w/api.php" |
| 105 | + ) |
| 106 | + parser.add_argument( |
| 107 | + '--start', |
| 108 | + type=mwDate, |
| 109 | + help='the start of the experimental period. (Required)', |
| 110 | + required=True |
| 111 | + ) |
| 112 | + parser.add_argument( |
| 113 | + '--end', |
| 114 | + type=mwDate, |
| 115 | + help='the end of the experimental period. (defaults to NOW())', |
| 116 | + default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") |
| 117 | + ) |
| 118 | + parser.add_argument( |
| 119 | + '--user_name', |
| 120 | + type=str, |
| 121 | + help='the user_name to further filter postings by (useful for tracking bots)' |
| 122 | + ) |
| 123 | + parser.add_argument( |
| 124 | + '--comment', |
| 125 | + type=re.compile, |
| 126 | + help='regular expression to match against message posting comment' |
| 127 | + ) |
| 128 | + parser.add_argument( |
| 129 | + '--message', |
| 130 | + type=re.compile, |
| 131 | + help='regular expression to match against message content (required)', |
| 132 | + required=True |
| 133 | + ) |
| 134 | + args = parser.parse_args() |
| 135 | + |
| 136 | + LOGGING_STREAM = sys.stderr |
| 137 | + logging.basicConfig( |
| 138 | + level=logging.DEBUG, |
| 139 | + stream=LOGGING_STREAM, |
| 140 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 141 | + datefmt='%b-%d %H:%M:%S' |
| 142 | + ) |
| 143 | + logging.debug("Comment pattern is %r." % args.comment.pattern) |
| 144 | + logging.debug("Message pattern is %r." % args.message.pattern) |
| 145 | + |
| 146 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 147 | + db = Database( |
| 148 | + host=args.host, |
| 149 | + db=args.db, |
| 150 | + read_default_file=args.cnf |
| 151 | + ) |
| 152 | + |
| 153 | + logging.info("Connecting to API @ %s." % args.api_uri) |
| 154 | + api = WPAPI(args.api_uri) |
| 155 | + |
| 156 | + logging.info("Querying for matching revisions:") |
| 157 | + count = {"matched": 0, "missed": 0} |
| 158 | + for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): |
| 159 | + message = api.getAdded(rev['rev_id']) |
| 160 | + match = args.message.search(message) |
| 161 | + if match != None: |
| 162 | + rev['message_match'] = match.group(0) |
| 163 | + |
| 164 | + emit(rev) |
| 165 | + LOGGING_STREAM.write("|") |
| 166 | + count['matched'] += 1 |
| 167 | + else: |
| 168 | + LOGGING_STREAM.write("o") |
| 169 | + count['missed'] += 1 |
| 170 | + |
| 171 | + LOGGING_STREAM.write("\n") |
| 172 | + logging.info("Process completed. %(matched)s messages matched, %(missed)s messages missed." % count) |
| 173 | + |
| 174 | + |
| 175 | + |
| 176 | +class Database: |
| 177 | + |
| 178 | + def __init__(self, *args, **kwargs): |
| 179 | + self.args = args |
| 180 | + self.kwargs = kwargs |
| 181 | + self.conn = MySQLdb.connect(*args, **kwargs) |
| 182 | + |
| 183 | + def getPostings(self, start, end, userName=None, commentRE=None): |
| 184 | + if (userName, commentRE) == (None, None): |
| 185 | + raise TypeError("Must specify at at least one of userName or commentRE.") |
| 186 | + |
| 187 | + cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
| 188 | + query = """ |
| 189 | + SELECT |
| 190 | + r.rev_id, |
| 191 | + r.rev_timestamp, |
| 192 | + r.rev_comment, |
| 193 | + r.rev_user AS poster_id, |
| 194 | + r.rev_user_text AS poster_name, |
| 195 | + REPLACE(p.page_title, "_", " ") AS recipient_name |
| 196 | + FROM revision r |
| 197 | + INNER JOIN page p ON r.rev_page = p.page_id |
| 198 | + WHERE rev_timestamp BETWEEN %(start)s AND %(end)s |
| 199 | + AND page_namespace = 3 |
| 200 | + """ |
| 201 | + if userName != None: |
| 202 | + query += "AND rev_user_text = %(user_name)s\n" |
| 203 | + if commentRE != None: |
| 204 | + query += "AND rev_comment REGEXP %(comment_pattern)s\n" |
| 205 | + |
| 206 | + cursor.execute( |
| 207 | + query, |
| 208 | + { |
| 209 | + 'start': start, |
| 210 | + 'end': end, |
| 211 | + 'user_name': userName, |
| 212 | + 'comment_pattern': commentRE.pattern |
| 213 | + } |
| 214 | + ) |
| 215 | + |
| 216 | + for row in cursor: |
| 217 | + yield row |
| 218 | + |
| 219 | + |
| 220 | + |
| 221 | +class WPAPI: |
| 222 | + DIFF_ADD_RE = re.compile(r'<td class="diff-addedline"><div>(.+)</div></td>') |
| 223 | + |
| 224 | + def __init__(self, uri): |
| 225 | + self.uri = uri |
| 226 | + |
| 227 | + def getDiff(self, revId, retries=10): |
| 228 | + attempt = 0 |
| 229 | + while attempt < retries: |
| 230 | + try: |
| 231 | + response = urllib2.urlopen( |
| 232 | + self.uri, |
| 233 | + urllib.urlencode({ |
| 234 | + 'action': 'query', |
| 235 | + 'prop': 'revisions', |
| 236 | + 'revids': revId, |
| 237 | + 'rvprop': 'ids', |
| 238 | + 'rvdiffto': 'prev', |
| 239 | + 'format': 'json' |
| 240 | + }) |
| 241 | + ) |
| 242 | + result = json.load(response) |
| 243 | + return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
| 244 | + except urllib2.HTTPError as e: |
| 245 | + time.sleep(attempt*2) |
| 246 | + attempt += 1 |
| 247 | + |
| 248 | + |
| 249 | + |
| 250 | + def getAdded(self, revId): |
| 251 | + diff = self.getDiff(revId) |
| 252 | + |
| 253 | + return self.unescape( |
| 254 | + "\n".join( |
| 255 | + match.group(1) |
| 256 | + for match in WPAPI.DIFF_ADD_RE.finditer(diff) |
| 257 | + ) |
| 258 | + ) |
| 259 | + |
| 260 | + def unescape(self, text): |
| 261 | + def fixup(m): |
| 262 | + text = m.group(0) |
| 263 | + if text[:2] == "&#": |
| 264 | + # character reference |
| 265 | + try: |
| 266 | + if text[:3] == "&#x": |
| 267 | + return unichr(int(text[3:-1], 16)) |
| 268 | + else: |
| 269 | + return unichr(int(text[2:-1])) |
| 270 | + except ValueError: |
| 271 | + pass |
| 272 | + else: |
| 273 | + # named entity |
| 274 | + try: |
| 275 | + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
| 276 | + except KeyError: |
| 277 | + pass |
| 278 | + return text # leave as is |
| 279 | + return re.sub("&#?\w+;", fixup, text) |
| 280 | + |
| 281 | + |
| 282 | +if __name__ == "__main__": |
| 283 | + main() |
Index: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py |
— | — | @@ -0,0 +1,52 @@ |
| 2 | +import urllib2, urllib, json |
| 3 | +import time |
| 4 | +from cookielib import CookieJar |
| 5 | + |
| 6 | +class MWAPIError(Exception): |
| 7 | + def __init__(self, code, message): |
| 8 | + self.code = code |
| 9 | + self.info = info |
| 10 | + |
| 11 | + def __repr__(self): |
| 12 | + return "%s(%s)" % ( |
| 13 | + self.__class__.__name__, |
| 14 | + ", ".join( |
| 15 | + repr(self.code), |
| 16 | + repr(self.info) |
| 17 | + ) |
| 18 | + ) |
| 19 | + |
| 20 | + def __str__(self): |
| 21 | + return "%s: %s" % (self.code, self.info) |
| 22 | + |
| 23 | +class MWAPI: |
| 24 | + |
| 25 | + def __init__(self, uri): |
| 26 | + self.uri = uri |
| 27 | + self.cookies = CookieJar() |
| 28 | + |
| 29 | + def request(self, retry=0, **kwargs): |
| 30 | + kwargs['format'] = "json" |
| 31 | + |
| 32 | + request = urllib2.Request( |
| 33 | + self.uri, |
| 34 | + urllib.urlencode(kwargs) |
| 35 | + ) |
| 36 | + self.cookies.add_cookie_header(request) |
| 37 | + |
| 38 | + try: |
| 39 | + response = urllib2.urlopen(request) |
| 40 | + except urllib2.HTTPError: |
| 41 | + #wait and try again |
| 42 | + time.sleep(2**retry) |
| 43 | + self.request(retry=retry+1, **kwargs) |
| 44 | + |
| 45 | + self.cookies.extract_cookies(response, request) |
| 46 | + |
| 47 | + js = json.load(response) |
| 48 | + |
| 49 | + if 'error' in js: |
| 50 | + raise MWAPIError(js['error']['code'], js['error']['info']) |
| 51 | + else: |
| 52 | + return js |
| 53 | + |
Index: trunk/tools/wsor/message_templates/umetrics/util/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | +from .mw_api import MWAPI, MWAPIError |