Index: trunk/tools/wsor/message_templates/sql/test.sql |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/wsor/message_templates/message_postings.py |
— | — | @@ -1,3 +1,24 @@ |
| 2 | +''' |
| 3 | +This script connects to a mediawiki database and API to collect User_talk revisions |
| 4 | +that match a set of patterns (and optionally, username). |
| 5 | + |
| 6 | +:Parameters: |
| 7 | + Access the script's documentation for a parameter listing. |
| 8 | + |
| 9 | + % python message_postings.py --help |
| 10 | + |
| 11 | +:Output: |
| 12 | + This script writes a set of escaped, tab separated columns to standard out. |
| 13 | + - Recipient name - The name of the user who received the posting |
| 14 | + - Timestamp - The time at which the posting was made |
| 15 | + - Revision ID - The identifier of the revision matching the posting |
| 16 | + - Poster ID - The identifier of the user who made the posting |
| 17 | + - Poster name - The name of the user who made the posting |
| 18 | + - Message match - The portion of the message posting that was matched by the regular expression. |
| 19 | + |
| 20 | +:Example: |
| 21 | + python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1" |
| 22 | +''' |
2 | 23 | import sys, argparse, os |
3 | 24 | import logging, types, re |
4 | 25 | import time, datetime |
— | — | @@ -20,11 +41,11 @@ |
21 | 42 | print( |
22 | 43 | "\t".join( |
23 | 44 | encode(rev[c]) for c in [ |
| 45 | + 'recipient_name', |
| 46 | + 'rev_timestamp', |
24 | 47 | 'rev_id', |
25 | | - 'rev_timestamp', |
26 | 48 | 'poster_id', |
27 | 49 | 'poster_name', |
28 | | - 'recipient_name', |
29 | 50 | 'message_match' |
30 | 51 | ] |
31 | 52 | ) |
Index: trunk/tools/wsor/message_templates/generators/metric_generator.py |
— | — | @@ -0,0 +1,4 @@ |
class MetricGenerator(object):
    """
    Base interface for metric generators.

    Subclasses provide a set of column names via `headers()` and the
    matching row of values for one user at one timestamp via `values()`.
    Both methods raise NotImplementedError here and must be overridden.

    Declared as a new-style class (explicit `object` base) so subclasses
    behave the same under Python 2 as under Python 3.
    """

    def __init__(self):
        pass

    def headers(self):
        """Return the column names this generator produces."""
        raise NotImplementedError()

    def values(self, username, timestamp):
        """Return the values for `username` at `timestamp`, one per header."""
        raise NotImplementedError()
Index: trunk/tools/wsor/message_templates/generators/__init__.py |
— | — | @@ -0,0 +1,25 @@ |
| 2 | +from .edit_counts import EditCounts |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
# Registry of available metric generators, keyed by the name used to
# select them.
GENERATORS = dict(
    editcounts=EditCounts,
)
| 8 | + |
class Metrics(MetricGenerator):
    """
    Combines several metric generators into a single wide row.

    Every row starts with the 'username' and 'timestamp' columns, followed
    by the columns of each wrapped generator in the order they were given.
    """

    def __init__(self, generators):
        # Copy into a list so that a one-shot iterable (e.g. a generator
        # expression) can be walked again on every headers()/values() call.
        self.generators = list(generators)

    def headers(self):
        """Return the combined column names of all wrapped generators."""
        return ['username', 'timestamp'] + [
            column
            for source in self.generators
            for column in source.headers()
        ]

    def values(self, username, timestamp):
        """Return the combined values of all wrapped generators for one user."""
        return [username, timestamp] + [
            cell
            for source in self.generators
            for cell in source.values(username, timestamp)
        ]
Index: trunk/tools/wsor/message_templates/generators/edit_counts.py |
— | — | @@ -0,0 +1,59 @@ |
| 2 | +import itertools |
| 3 | +from .metric_generator import MetricGenerator |
| 4 | + |
class EditCounts(MetricGenerator):
    """
    Counts a user's revisions per namespace, split at a timestamp.

    For every reported namespace four columns are produced: revisions made
    at or before the timestamp ("before") and after it ("after"), each for
    deleted revisions (read from the `archive` table) and not-deleted
    revisions (read from the `revision` table).
    """

    # Namespaces that get columns; rows for any other namespace are ignored.
    NAMESPACES = tuple(range(0, 16)) + (100, 101, 108, 109)

    # NOTE: table names are qualified with `enwiki.` and therefore do not
    # follow the default database of the connection.
    # The `is_before` flag splits each namespace's count at the timestamp so
    # that both the "before" and the "after" columns can be filled.
    QUERY = """
        (
            SELECT
                False AS deleted,
                page_namespace AS ns,
                rev_timestamp <= %(timestamp)s AS is_before,
                count(*) AS revisions
            FROM enwiki.revision
            INNER JOIN enwiki.page ON rev_page = page_id
            WHERE rev_user_text = %(username)s
            GROUP BY page_namespace, is_before
        )
        UNION (
            SELECT
                True AS deleted,
                ar_namespace AS ns,
                ar_timestamp <= %(timestamp)s AS is_before,
                count(*) AS revisions
            FROM enwiki.archive
            WHERE ar_user_text = %(username)s
            GROUP BY ar_namespace, is_before
        )"""

    def __init__(self, conn, api_uri):
        """
        :Parameters:
            conn
                Database connection; its cursors must return rows that can
                be indexed by column name (e.g. MySQLdb's DictCursor).
            api_uri
                Unused by this generator; accepted so that all generators
                can be constructed with the same arguments.
        """
        self.conn = conn

    def headers(self):
        """
        Return the column names as a list: for each namespace, the deleted
        before/after pair followed by the not_deleted before/after pair.
        """
        return [
            'ns_%s_%s_revisions_%s' % (ns, period, state)
            for ns in self.NAMESPACES
            for state in ('deleted', 'not_deleted')
            for period in ('before', 'after')
        ]

    def values(self, username, timestamp):
        """
        Return the revision counts for `username` around `timestamp`, in the
        same order as `headers()`.  Columns with no matching rows are 0.

        :Parameters:
            username
                Unicode user name; encoded to UTF-8 for the query.
            timestamp
                Timestamp to split on, in the format stored in
                rev_timestamp/ar_timestamp.
        """
        counts = {}

        cursor = self.conn.cursor()
        try:
            cursor.execute(
                self.QUERY,
                {
                    'timestamp': timestamp,
                    'username': username.encode('utf-8')
                }
            )
            for row in cursor:
                state = "deleted" if row['deleted'] else "not_deleted"
                period = "before" if row['is_before'] else "after"
                column = 'ns_%s_%s_revisions_%s' % (row['ns'], period, state)
                counts[column] = row['revisions']
        finally:
            # Release the cursor even when the query fails.
            cursor.close()

        return [counts.get(column, 0) for column in self.headers()]
Index: trunk/tools/wsor/message_templates/user_metrics.py |
— | — | @@ -0,0 +1,102 @@ |
| 2 | +import sys, argparse, os |
| 3 | +import logging, types |
| 4 | +import MySQLdb, MySQLdb.cursors |
| 5 | + |
| 6 | +from generators import GENERATORS, Metrics, EditCounts |
| 7 | + |
| 8 | + |
class MissingRevError(Exception):
    """Exception type for a missing revision (not raised in the code shown here)."""
| 10 | + |
def encode(v):
    """
    Render a value as one escaped cell of a tab separated row.

    None becomes the two characters backslash-N.  Long integers are passed
    through int(), unicode strings are encoded as UTF-8, and the string
    form of the result is escaped with the "string-escape" codec so that
    tabs and newlines inside a value cannot break the row.
    """
    # Identity check: a value with a permissive __eq__ must not be mistaken
    # for None.  "\\N" is the same two characters the unescaped "\N"
    # produced, without relying on an unrecognised escape sequence.
    if v is None:
        return "\\N"

    if isinstance(v, types.LongType):
        v = int(v)
    elif isinstance(v, types.UnicodeType):
        v = v.encode('utf-8')

    return str(v).encode("string-escape")
| 18 | + |
| 19 | + |
def _generator_class(name):
    """
    Resolve a generator name from the command line to its class.

    Raises argparse.ArgumentTypeError for an unknown name so that argparse
    prints a usage error instead of letting a KeyError traceback escape.
    """
    try:
        return GENERATORS[name]
    except KeyError:
        raise argparse.ArgumentTypeError(
            "unknown generator %r (available: %s)"
            % (name, ', '.join(GENERATORS.keys()))
        )


def main():
    """
    Read "username<TAB>timestamp" rows from standard in and write one row
    of metrics per input row to standard out, preceded by a header row.

    Logging and a progress marker ("o" per processed row) go to standard
    error.  Returns without doing anything when standard in is a terminal.
    """
    parser = argparse.ArgumentParser(
        description="""
        Gathers metrics for users around a timestamp.
        """,
        # "resolve" lets -h be reused for --host instead of --help.
        conflict_handler="resolve"
    )
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        help='the path to MySQL config info (defaults to ~/.my.cnf)',
        default=os.path.expanduser("~/.my.cnf")
    )
    parser.add_argument(
        '-h', '--host',
        type=str,
        help='the database host to connect to (defaults to localhost)',
        default="localhost"
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        help='the language db to run the query in (defaults to enwiki)',
        default="enwiki"
    )
    parser.add_argument(
        '-a', '--api_uri',
        type=str,
        help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        'generator',
        type=_generator_class,
        nargs="+",
        help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys())
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    if sys.stdin.isatty():
        logging.error("No data piped to standard in!")
        return

    logging.info("Connecting to %s:%s using %s.", args.host, args.db, args.cnf)
    conn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf,
        # The generators index result rows by column name.
        cursorclass=MySQLdb.cursors.DictCursor
    )
    try:
        logging.info("Loading generators...")
        metrics = Metrics(g(conn, args.api_uri) for g in args.generator)
        print("\t".join(encode(h) for h in metrics.headers()))

        logging.info("Processing users...")
        for line in sys.stdin:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no user.
                continue

            fields = stripped.split("\t")
            if len(fields) < 2:
                logging.warning("Skipping row without a timestamp column: %r", stripped)
                continue

            # Only the first two columns are used; extra columns are ignored.
            username, timestamp = fields[0:2]
            username = unicode(username, 'utf-8')

            logging.debug("\t%s at %s:", username, timestamp)
            print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
            LOGGING_STREAM.write("o")

        LOGGING_STREAM.write("\n")
    finally:
        # Release the database connection even if a row fails part-way.
        conn.close()
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | + |
| 101 | + |
| 102 | +if __name__ == "__main__": |
| 103 | + main() |