Index: trunk/tools/wsor/message_templates/R/twinkle.R |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/wsor/message_templates/umetrics/generators/warnings.py |
— | — | @@ -50,23 +50,22 @@ |
def getProcessedRevs(self, username):
    """Fetch *username*'s user-talk revisions and run them through processRevs.

    Returns whatever processRevs yields (processed revision records).
    """
    raw_revisions = self.getUserPageRevisions(username)
    return self.processRevs(raw_revisions)
53 | 53 | |
def getUserPageRevisions(self, username, rvcontinue=None):
    """Yield revisions of User_talk:<username>, oldest first.

    Follows MediaWiki API continuation recursively: when the response
    carries a 'query-continue' section, its 'revisions' dict is splatted
    into the next request as extra query parameters.

    :param username: user whose talk page to read (unicode; encoded to
        UTF-8 for the API request).
    :param rvcontinue: continuation parameters from a previous response,
        or None for the first request. Using None instead of a mutable
        ``{}`` default avoids the shared-mutable-default-argument pitfall.
    """
    continuation = rvcontinue or {}
    js = self.api.request(
        action="query",
        prop="revisions",
        titles="User_talk:%s" % username.encode('utf-8'),
        rvprop="ids|timestamp|content",
        rvdir="newer",
        rvlimit=50,
        **continuation
    )

    # Pages with no visible revisions omit the 'revisions' key entirely.
    for rev in js['query']['pages'].values()[0].get('revisions', []):
        yield rev

    if 'query-continue' in js:
        # Recurse with the server-provided continuation parameters.
        for rev in self.getUserPageRevisions(username, js['query-continue']['revisions']):
            yield rev
72 | 71 | |
73 | 72 | |
— | — | @@ -76,8 +75,10 @@ |
77 | 76 | previousLines = [] |
78 | 77 | for rev in revs: |
79 | 78 | lines = rev.get('*', "").split("\n") |
80 | | - del rev['*'] |
81 | 79 | |
| 80 | + try: del rev['*'] |
| 81 | + except KeyError: pass |
| 82 | + |
82 | 83 | added = [] |
83 | 84 | sm = difflib.SequenceMatcher(None, previousLines, lines) |
84 | 85 | for tag, i1, i2, j1, j2 in sm.get_opcodes(): |
— | — | @@ -94,4 +95,11 @@ |
95 | 96 | |
96 | 97 | |
97 | 98 | |
98 | | - |
| 99 | + |
def test():
    """Ad-hoc smoke test against the live enwiki API (network I/O).

    Not an automated test: prints the id of each processed revision on
    the user talk page of 'EpochFail'.
    """
    from umetrics.generators import Warnings
    from umetrics.util import MWAPI
    w = Warnings(None, MWAPI('http://en.wikipedia.org/w/api.php'))
    # 'EpochFai' was a typo for the username 'EpochFail'; rev[id] indexed
    # the dict with the builtin id() function instead of a string key.
    for rev in w.getProcessedRevs('EpochFail'):
        # NOTE(review): key assumed to be 'id' as originally intended;
        # the raw API uses 'revid' — confirm against processRevs output.
        print(rev['id'])
Index: trunk/tools/wsor/message_templates/umetrics/generators/talk.py |
— | — | @@ -22,7 +22,7 @@ |
23 | 23 | cursor = self.conn.cursor() |
24 | 24 | cursor.execute(""" |
25 | 25 | SELECT |
26 | | - IF(rev_timestamp > %(timestamp)s, "after", "before") as whense, |
| 26 | + IF(rev_timestamp > %(timestamp)s, "after", "before") as whence, |
27 | 27 | COUNT(*) as count, |
28 | 28 | MAX(rev_timestamp) as last, |
29 | 29 | MIN(rev_timestamp) as first |
— | — | @@ -36,14 +36,14 @@ |
37 | 37 | """, |
38 | 38 | { |
39 | 39 | 'timestamp': timestamp, |
40 | | - 'page_title': username.encode('utf-8').replace(" ", "_"), |
| 40 | + 'page_title': username.replace(" ", "_").encode('utf-8'), |
41 | 41 | 'username': username.encode('utf-8') |
42 | 42 | } |
43 | 43 | ) |
44 | 44 | for row in cursor: |
45 | | - rowValues['other_talk_%(whence)s'] = row['count'] |
46 | | - rowValues['first_other_talk_%(whence)s'] = row['first'] |
47 | | - rowValues['last_other_talk_%(whence)s'] = row['last'] |
| 45 | + rowValues['other_talk_%(whence)s' % row] = row['count'] |
| 46 | + rowValues['first_other_talk_%(whence)s' % row] = row['first'] |
| 47 | + rowValues['last_other_talk_%(whence)s' % row] = row['last'] |
48 | 48 | |
49 | 49 | rowValues['other_talk_before'] = rowValues.get('other_talk_before', 0) |
50 | 50 | rowValues['other_talk_after'] = rowValues.get('other_talk_after', 0) |
Index: trunk/tools/wsor/message_templates/umetrics/generators/__init__.py |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | self.generators = list(generators) |
19 | 19 | |
20 | 20 | def headers(self): |
21 | | - row = ['username', 'timestamp'] |
| 21 | + row = ['recipient_name', 'timestamp'] |
22 | 22 | for generator in self.generators: |
23 | 23 | row.extend(generator.headers()) |
24 | 24 | |
Index: trunk/tools/wsor/message_templates/umetrics/generators/edit_counts.py |
— | — | @@ -9,10 +9,10 @@ |
def headers(self):
    """Return an iterator of output column names for every tracked namespace.

    For each namespace (0-15 plus 100, 101, 108 and 109) four columns are
    produced: live revisions before/after and deleted revisions before/after.
    """
    suffixes = [
        'revisions_before',
        'revisions_after',
        'revisions_deleted_before',
        'revisions_deleted_after',
    ]
    namespaces = itertools.chain(range(0, 16), [100, 101, 108, 109])
    return (
        'ns_%s_%s' % (ns, suffix)
        for ns in namespaces
        for suffix in suffixes
    )
— | — | @@ -24,24 +24,24 @@ |
25 | 25 | cursor.execute(""" |
26 | 26 | ( |
27 | 27 | SELECT |
28 | | - False as deleted, |
29 | 28 | page_namespace as ns, |
| 29 | + IF(rev_timestamp < %(timestamp)s, "before", "after") as whence, |
| 30 | + "" as deleted, |
30 | 31 | count(*) as revisions |
31 | 32 | FROM enwiki.revision |
32 | 33 | INNER JOIN enwiki.page ON rev_page = page_id |
33 | | - WHERE rev_timestamp <= %(timestamp)s |
34 | | - AND rev_user_text = %(username)s |
35 | | - GROUP BY page_namespace |
| 34 | + WHERE rev_user_text = %(username)s |
| 35 | + GROUP BY 1, 2 |
36 | 36 | ) |
37 | 37 | UNION ( |
38 | 38 | SELECT |
39 | | - True as deleted, |
40 | 39 | ar_namespace as ns, |
| 40 | + IF(ar_timestamp < %(timestamp)s, "before", "after") as whence, |
| 41 | + "_deleted" as deleted, |
41 | 42 | count(*) as revisions |
42 | 43 | FROM enwiki.archive |
43 | | - WHERE ar_timestamp <= %(timestamp)s |
44 | | - AND ar_user_text = %(username)s |
45 | | - GROUP BY ar_namespace |
| 44 | + WHERE ar_user_text = %(username)s |
| 45 | + GROUP BY 1, 2 |
46 | 46 | )""", |
47 | 47 | { |
48 | 48 | 'timestamp': timestamp, |
— | — | @@ -49,11 +49,6 @@ |
50 | 50 | } |
51 | 51 | ) |
52 | 52 | for row in cursor: |
53 | | - if(row['deleted']): |
54 | | - deleted = "deleted" |
55 | | - else: |
56 | | - deleted = "not_deleted" |
| 53 | + rowData['ns_%(ns)s_revisions%(deleted)s_%(whence)s' % row] = row['revisions'] |
57 | 54 | |
58 | | - rowData['ns_%s_before_revisions_%s' % (row['ns'], deleted)] = row['revisions'] |
59 | | - |
60 | 55 | return [rowData.get(c, 0) for c in self.headers()] |
Index: trunk/tools/wsor/message_templates/umetrics/metrics.py |
— | — | @@ -1,6 +1,7 @@ |
2 | 2 | import sys, argparse, os |
3 | 3 | import logging, types |
4 | 4 | import MySQLdb, MySQLdb.cursors |
| 5 | +import traceback |
5 | 6 | |
6 | 7 | from .generators import GENERATORS, Metrics |
7 | 8 | from .util import MWAPI, MWAPIError |
— | — | @@ -48,6 +49,21 @@ |
49 | 50 | default="http://en.wikipedia.org/w/api.php" |
50 | 51 | ) |
51 | 52 | parser.add_argument( |
| 53 | + '-o', '--old', |
| 54 | + type=lambda fn: open(fn, 'r'), |
| 55 | + help='a previous output file to read from. When provided, this script will skip all of the complete username/timestamp pairs found in the file.', |
| 56 | + ) |
| 57 | + parser.add_argument( |
| 58 | + '--debug', |
| 59 | + action="store_true", |
| 60 | + default=False |
| 61 | + ) |
| 62 | + parser.add_argument( |
| 63 | + '--headers', |
| 64 | + action="store_true", |
| 65 | + default=False |
| 66 | + ) |
| 67 | + parser.add_argument( |
52 | 68 | 'generator', |
53 | 69 | type=lambda g: GENERATORS[g], |
54 | 70 | nargs="+", |
— | — | @@ -56,8 +72,10 @@ |
57 | 73 | args = parser.parse_args() |
58 | 74 | |
59 | 75 | LOGGING_STREAM = sys.stderr |
| 76 | + if args.debug: logLevel = logging.DEBUG |
| 77 | + else: logLevel = logging.INFO |
60 | 78 | logging.basicConfig( |
61 | | - level=logging.DEBUG, |
| 79 | + level=logLevel, |
62 | 80 | stream=LOGGING_STREAM, |
63 | 81 | format='%(asctime)s %(levelname)-8s %(message)s', |
64 | 82 | datefmt='%b-%d %H:%M:%S' |
— | — | @@ -65,7 +83,7 @@ |
66 | 84 | |
67 | 85 | if sys.stdin.isatty(): |
68 | 86 | logging.error("No data piped to standard in!") |
69 | | - return |
| 87 | + return 1 |
70 | 88 | |
71 | 89 | |
72 | 90 | logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
— | — | @@ -78,23 +96,49 @@ |
79 | 97 | |
80 | 98 | logging.info("Loading generators...") |
81 | 99 | metrics = Metrics(g(conn, args.api) for g in args.generator) |
82 | | - print("\t".join(encode(h) for h in metrics.headers())) |
83 | 100 | |
84 | 101 | |
| 102 | + oldPairs = set() |
| 103 | + if args.old != None: |
| 104 | + logging.info("Loading in old data file...") |
| 105 | + for line in args.old: |
| 106 | + username, timestamp = line.strip().split("\t")[0:2] |
| 107 | + username = unicode(username.decode('string-escape'), 'utf-8') |
| 108 | + |
| 109 | + oldPairs.add((username, timestamp)) |
| 110 | + LOGGING_STREAM.write(".") |
| 111 | + |
| 112 | + LOGGING_STREAM.write("\n") |
| 113 | + |
| 114 | + else: |
| 115 | + if args.headers: |
| 116 | + print("\t".join(encode(h) for h in metrics.headers())) |
| 117 | + |
| 118 | + |
85 | 119 | logging.info("Processing users...") |
86 | 120 | for line in sys.stdin: |
87 | | - username, timestamp = line.strip().split("\t")[0:2] |
88 | | - username = unicode(username, 'utf-8') |
| 121 | + try: |
| 122 | + username, timestamp = line.strip().split("\t")[0:2] |
| 123 | + username = unicode(username.decode('string-escape'), 'utf-8') |
| 124 | + |
| 125 | + if (username, timestamp) in oldPairs: |
| 126 | + LOGGING_STREAM.write("s") |
| 127 | + else: |
| 128 | + logging.debug("\t%s at %s:" % (username, timestamp)) |
| 129 | + print("\t".join(encode(v) for v in metrics.values(username, timestamp))) |
| 130 | + sys.stdout.flush() |
| 131 | + LOGGING_STREAM.write(".") |
| 132 | + except Exception as e: |
| 133 | + logging.error("An error occurred while processing %s at %s." % (username, timestamp)) |
| 134 | + LOGGING_STREAM.write(traceback.format_exc()) |
| 135 | + return 1 |
89 | 136 | |
90 | | - logging.debug("\t%s at %s:" % (username, timestamp)) |
91 | | - print("\t".join(encode(v) for v in metrics.values(username, timestamp))) |
92 | | - LOGGING_STREAM.write("o") |
93 | | - |
94 | 137 | LOGGING_STREAM.write("\n") |
| 138 | + return 0 |
95 | 139 | |
96 | 140 | |
97 | 141 | |
98 | 142 | |
99 | 143 | |
100 | 144 | if __name__ == "__main__": |
101 | | - main() |
| 145 | + sys.exit(main()) |
Index: trunk/tools/wsor/message_templates/umetrics/postings.py |
— | — | @@ -13,7 +13,7 @@ |
14 | 14 | - Timestamp - The time at which the posting was made |
15 | 15 | - Revision ID - The identifier of the revision matching the posting |
16 | 16 | - Poster ID - The identifier of the user who made the posting |
17 | | - - Poster name - The name of the user who make the posting |
| 17 | + - Poster name - The name of the user who made the posting |
18 | 18 | - Message match - The portion of the message posting that was matched by the regular expression. |
19 | 19 | |
20 | 20 | :Example: |
— | — | @@ -36,19 +36,18 @@ |
37 | 37 | |
38 | 38 | return str(v).encode("string-escape") |
39 | 39 | |
# Output column order for matched postings. NOTE(review): 'timestamp'
# assumes the row dict carries that key (this diff renamed it from
# 'rev_timestamp') — verify against getPostings/the caller.
HEADERS = [
    'recipient_name',
    'timestamp',
    'rev_id',
    'poster_id',
    'poster_name',
    'message_match',
]

def emit(rev):
    """Print one matched posting as a tab-separated row in HEADERS order."""
    fields = [encode(rev[column]) for column in HEADERS]
    print("\t".join(fields))
54 | 53 | |
55 | 54 | |
— | — | @@ -130,11 +129,26 @@ |
131 | 130 | help='regular expression to match against message content (required)', |
132 | 131 | required=True |
133 | 132 | ) |
| 133 | + parser.add_argument( |
| 134 | + '--header', |
| 135 | + action="store_true", |
| 136 | + default=False |
| 137 | + ) |
| 138 | + parser.add_argument( |
| 139 | + '--debug', |
| 140 | + action="store_true", |
| 141 | + default=False |
| 142 | + ) |
134 | 143 | args = parser.parse_args() |
135 | 144 | |
136 | 145 | LOGGING_STREAM = sys.stderr |
| 146 | + if args.debug: |
| 147 | + logLevel = logging.DEBUG |
| 148 | + else: |
| 149 | + logLevel = logging.INFO |
| 150 | + |
137 | 151 | logging.basicConfig( |
138 | | - level=logging.DEBUG, |
| 152 | + level=logLevel, |
139 | 153 | stream=LOGGING_STREAM, |
140 | 154 | format='%(asctime)s %(levelname)-8s %(message)s', |
141 | 155 | datefmt='%b-%d %H:%M:%S' |
— | — | @@ -152,10 +166,25 @@ |
153 | 167 | logging.info("Connecting to API @ %s." % args.api_uri) |
154 | 168 | api = WPAPI(args.api_uri) |
155 | 169 | |
| 170 | + if args.header: |
| 171 | + print("\t".join(HEADERS)) |
| 172 | + |
156 | 173 | logging.info("Querying for matching revisions:") |
| 174 | + revs = [] |
| 175 | + count = 0 |
| 176 | + for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): |
| 177 | + count += 1 |
| 178 | + revs.append(rev) |
| 179 | + if count % 100 == 0: LOGGING_STREAM.write("|") |
| 180 | + |
| 181 | + LOGGING_STREAM.write("\n") |
| 182 | + |
| 183 | + logging.info("Checking for message templates") |
157 | 184 | count = {"matched": 0, "missed": 0} |
158 | | - for rev in db.getPostings(args.start, args.end, args.user_name, args.comment): |
| 185 | + for rev in revs: |
| 186 | + logging.debug("Matching revision %(rev_id)s peformed by %(poster_name)s @ %(rev_timestamp)s: %(rev_comment)s" % rev) |
159 | 187 | message = api.getAdded(rev['rev_id']) |
| 188 | + |
160 | 189 | match = args.message.search(message) |
161 | 190 | if match != None: |
162 | 191 | rev['message_match'] = match.group(0) |
— | — | @@ -183,7 +212,7 @@ |
184 | 213 | if (userName, commentRE) == (None, None): |
185 | 214 | raise TypeError("Must specify at at least one of userName or commentRE.") |
186 | 215 | |
187 | | - cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
| 216 | + cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor) |
188 | 217 | query = """ |
189 | 218 | SELECT |
190 | 219 | r.rev_id, |
— | — | @@ -212,8 +241,7 @@ |
213 | 242 | } |
214 | 243 | ) |
215 | 244 | |
216 | | - for row in cursor: |
217 | | - yield row |
| 245 | + return cursor |
218 | 246 | |
219 | 247 | |
220 | 248 | |
— | — | @@ -223,7 +251,7 @@ |
224 | 252 | def __init__(self, uri): |
225 | 253 | self.uri = uri |
226 | 254 | |
227 | | - def getDiff(self, revId, retries=10): |
| 255 | + def getDiff(self, revId, retries=20): |
228 | 256 | attempt = 0 |
229 | 257 | while attempt < retries: |
230 | 258 | try: |
— | — | @@ -239,11 +267,17 @@ |
240 | 268 | }) |
241 | 269 | ) |
242 | 270 | result = json.load(response) |
243 | | - return result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
| 271 | + |
| 272 | + diff = result['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
| 273 | + if type(diff) not in types.StringTypes: diff = '' |
| 274 | + |
| 275 | + return diff |
244 | 276 | except urllib2.HTTPError as e: |
245 | | - time.sleep(attempt*2) |
| 277 | + time.sleep(2**attempt) |
246 | 278 | attempt += 1 |
| 279 | + logging.error("HTTP Error: %s. Retry #%s in %s seconds..." % (e, attempt, 2**attempt)) |
247 | 280 | |
| 281 | + |
248 | 282 | |
249 | 283 | |
250 | 284 | def getAdded(self, revId): |
Index: trunk/tools/wsor/message_templates/umetrics/util/mw_api.py |
— | — | @@ -36,17 +36,18 @@ |
37 | 37 | |
38 | 38 | try: |
39 | 39 | response = urllib2.urlopen(request) |
| 40 | + |
| 41 | + self.cookies.extract_cookies(response, request) |
| 42 | + |
| 43 | + js = json.load(response) |
| 44 | + |
| 45 | + if 'error' in js: |
| 46 | + raise MWAPIError(js['error']['code'], js['error']['info']) |
| 47 | + else: |
| 48 | + return js |
| 49 | + |
40 | 50 | except urllib2.HTTPError: |
41 | 51 | #wait and try again |
42 | 52 | time.sleep(2**retry) |
43 | 53 | self.request(retry=retry+1, **kwargs) |
44 | | - |
45 | | - self.cookies.extract_cookies(response, request) |
46 | | - |
47 | | - js = json.load(response) |
48 | | - |
49 | | - if 'error' in js: |
50 | | - raise MWAPIError(js['error']['code'], js['error']['info']) |
51 | | - else: |
52 | | - return js |
53 | 54 | |