r107849 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107848‎ | r107849 | r107850 >
Date:20:26, 2 January 2012
Author:halfak
Status:deferred
Tags:tools 
Comment:
Working user_metrics with simple example
Modified paths:
  • /trunk/tools/wsor/message_templates/generators (added) (history)
  • /trunk/tools/wsor/message_templates/generators/__init__.py (added) (history)
  • /trunk/tools/wsor/message_templates/generators/edit_counts.py (added) (history)
  • /trunk/tools/wsor/message_templates/generators/metric_generator.py (added) (history)
  • /trunk/tools/wsor/message_templates/message_postings.py (modified) (history)
  • /trunk/tools/wsor/message_templates/sql (added) (history)
  • /trunk/tools/wsor/message_templates/sql/test.sql (added) (history)
  • /trunk/tools/wsor/message_templates/user_metrics.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/message_templates/sql/test.sql
@@ -0,0 +1 @@
 2+
Index: trunk/tools/wsor/message_templates/message_postings.py
@@ -1,3 +1,24 @@
 2+'''
 3+This script connects to a mediawiki database and API to collect User_talk revisions
 4+that match a set of patterns (and optionally, username).
 5+
 6+:Parameters:
 7+ Access the script's documentation for a parameter listing.
 8+
 9+ % python message_postings.py --help
 10+
 11+:Output:
 12+ This script writes a set of escaped, tab separated columns to standard out.
 13+ - Recipient name - The name of the user who received the posting
 14+ - Timestamp - The time at which the posting was made
 15+ - Revision ID - The identifier of the revision matching the posting
 16+ - Poster ID - The identifier of the user who made the posting
 17+ - Poster name - The name of the user who made the posting
 18+ - Message match - The portion of the message posting that was matched by the regular expression.
 19+
 20+:Example:
 21+ python message_postings.py -h db42 --start=20111222000000 --end=20111223000000 --comment="\(\[\[WP:HG\|HG\]\]\)" --message="Template:uw-vandalism1"
 22+'''
223 import sys, argparse, os
324 import logging, types, re
425 import time, datetime
@@ -20,11 +41,11 @@
2142 print(
2243 "\t".join(
2344 encode(rev[c]) for c in [
 45+ 'recipient_name',
 46+ 'rev_timestamp',
2447 'rev_id',
25 - 'rev_timestamp',
2648 'poster_id',
2749 'poster_name',
28 - 'recipient_name',
2950 'message_match'
3051 ]
3152 )
Index: trunk/tools/wsor/message_templates/generators/metric_generator.py
@@ -0,0 +1,4 @@
class MetricGenerator:
    """
    Interface shared by the metric generators in this package.

    Subclasses override `headers()` and `values()`; both stubs here raise
    NotImplementedError.
    """

    def __init__(self):
        pass

    def headers(self):
        """Return the column names this generator produces."""
        raise NotImplementedError()

    def values(self, username, timestamp):
        """Return the metric values for `username` at `timestamp`."""
        raise NotImplementedError()
Index: trunk/tools/wsor/message_templates/generators/__init__.py
@@ -0,0 +1,25 @@
 2+from .edit_counts import EditCounts
 3+from .metric_generator import MetricGenerator
 4+
# Maps a generator name to the class that implements it.
GENERATORS = {
    'editcounts': EditCounts
}


class Metrics(MetricGenerator):
    """
    Composite generator that concatenates the output of several generators
    into a single row, prefixed with the username and timestamp columns.
    """

    def __init__(self, generators):
        # Materialise the iterable so it can be walked once per row.
        self.generators = list(generators)

    def headers(self):
        """Return the header row: 'username', 'timestamp', then each
        generator's headers in order."""
        return ['username', 'timestamp'] + [
            column
            for source in self.generators
            for column in source.headers()
        ]

    def values(self, username, timestamp):
        """Return the data row for `username` at `timestamp`, laid out in the
        same order as `headers()`."""
        return [username, timestamp] + [
            value
            for source in self.generators
            for value in source.values(username, timestamp)
        ]
Index: trunk/tools/wsor/message_templates/generators/edit_counts.py
@@ -0,0 +1,59 @@
 2+import itertools
 3+from .metric_generator import MetricGenerator
 4+
class EditCounts(MetricGenerator):
    """
    Counts a user's revisions per namespace, split into live (not deleted)
    and archived (deleted) revisions, up to and including a timestamp.
    """

    def __init__(self, conn, api_uri):
        """
        :Parameters:
            conn
                database connection; `values()` reads rows by column name,
                so its cursors must return dict-like rows
            api_uri
                accepted but not used by this generator
        """
        self.conn = conn

    def headers(self):
        """Return an iterator over the column names, four per namespace."""
        return itertools.chain.from_iterable(
            (
                'ns_%s_before_revisions_deleted' % ns,
                'ns_%s_after_revisions_deleted' % ns,
                'ns_%s_before_revisions_not_deleted' % ns,
                'ns_%s_after_revisions_not_deleted' % ns
            )
            for ns in itertools.chain(range(0, 16), [100, 101, 108, 109])
        )

    def values(self, username, timestamp):
        """
        Return the revision counts for `username`, ordered like `headers()`.

        Columns with no matching rows are reported as 0.

        NOTE(review): the query only counts revisions with a timestamp at or
        before `timestamp`, and only 'before' keys are written below, so
        every '*_after_*' column is currently always 0.
        """
        counts = {}

        cursor = self.conn.cursor()
        try:
            cursor.execute("""
                (
                    SELECT
                        False as deleted,
                        page_namespace as ns,
                        count(*) as revisions
                    FROM enwiki.revision
                    INNER JOIN enwiki.page ON rev_page = page_id
                    WHERE rev_timestamp <= %(timestamp)s
                    AND rev_user_text = %(username)s
                    GROUP BY page_namespace
                )
                UNION (
                    SELECT
                        True as deleted,
                        ar_namespace as ns,
                        count(*) as revisions
                    FROM enwiki.archive
                    WHERE ar_timestamp <= %(timestamp)s
                    AND ar_user_text = %(username)s
                    GROUP BY ar_namespace
                )""",
                {
                    'timestamp': timestamp,
                    'username': username.encode('utf-8')
                }
            )
            for row in cursor:
                state = "deleted" if row['deleted'] else "not_deleted"
                key = 'ns_%s_before_revisions_%s' % (row['ns'], state)
                counts[key] = row['revisions']
        finally:
            # Release the cursor even if the query or row processing fails.
            cursor.close()

        return [counts.get(column, 0) for column in self.headers()]
Index: trunk/tools/wsor/message_templates/user_metrics.py
@@ -0,0 +1,102 @@
 2+import sys, argparse, os
 3+import logging, types
 4+import MySQLdb, MySQLdb.cursors
 5+
 6+from generators import GENERATORS, Metrics, EditCounts
 7+
 8+
class MissingRevError(Exception):
    """Presumably signals a revision that could not be found; nothing in
    this script raises it (TODO confirm whether it is still needed)."""
 10+
def encode(v):
    """
    Render a value as one escaped cell of tab-separated output.

    None becomes the two characters backslash-N. Unicode text is encoded to
    UTF-8 first. Any other value goes through str(). The resulting bytes are
    then backslash-escaped (tabs, newlines, backslashes, quotes and
    non-ASCII bytes), so a cell can never contain a raw tab or newline.

    :Parameters:
        v
            the value to render

    :Returns:
        the escaped cell as a native `str`
    """
    import codecs

    if v is None:
        # Written with an explicit double backslash: identical to "\N" in a
        # Python 2 byte string, and not a malformed escape under Python 3.
        return "\\N"

    text_type = type(u"")
    if isinstance(v, text_type):
        raw = v.encode('utf-8')
    elif isinstance(v, bytes):
        raw = v
    else:
        # str() of a Python 2 long carries no "L" suffix, so no special
        # casing of long integers is needed.
        raw = str(v)
        if isinstance(raw, text_type):
            raw = raw.encode('utf-8')

    # codecs.escape_encode is the implementation behind Python 2's
    # "string-escape" codec, and it is also available under Python 3.
    escaped = codecs.escape_encode(raw)[0]
    if isinstance(escaped, str):
        return escaped
    return escaped.decode('ascii')
 18+
 19+
def main():
    """
    Command line entry point.

    Reads tab-separated `username<TAB>timestamp` lines from standard in and
    writes one tab-separated row of metrics per user to standard out,
    preceded by a header row. Logging and a one-character-per-user progress
    indicator go to standard error.
    """

    def generator_class(name):
        """Resolve a generator name given on the command line to its class."""
        try:
            return GENERATORS[name]
        except KeyError:
            # argparse only turns ArgumentTypeError/TypeError/ValueError into
            # a usage error; a bare KeyError would surface as a traceback.
            raise argparse.ArgumentTypeError(
                "unknown generator %r (choose from %s)" % (
                    name, ', '.join(GENERATORS.keys())
                )
            )

    parser = argparse.ArgumentParser(
        description="""
        Gathers metrics for users around a timestamp.
        """,
        # "resolve" lets -h be reused for --host below; --help still works.
        conflict_handler="resolve"
    )
    parser.add_argument(
        '-c', '--cnf',
        metavar="<path>",
        type=str,
        help='the path to MySQL config info (defaults to ~/.my.cnf)',
        default=os.path.expanduser("~/.my.cnf")
    )
    parser.add_argument(
        '-h', '--host',
        type=str,
        help='the database host to connect to (defaults to localhost)',
        default="localhost"
    )
    parser.add_argument(
        '-d', '--db',
        type=str,
        help='the language db to run the query in (defaults to enwiki)',
        default="enwiki"
    )
    parser.add_argument(
        '-a', '--api_uri',
        type=str,
        help='the mediawiki API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
        default="http://en.wikipedia.org/w/api.php"
    )
    parser.add_argument(
        'generator',
        type=generator_class,
        nargs="+",
        help='the metric generators to run (%s)' % ', '.join(GENERATORS.keys())
    )
    args = parser.parse_args()

    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    if sys.stdin.isatty():
        logging.error("No data piped to standard in!")
        return

    logging.info("Connecting to %s:%s using %s.", args.host, args.db, args.cnf)
    conn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf,
        cursorclass=MySQLdb.cursors.DictCursor
    )
    try:
        logging.info("Loading generators...")
        metrics = Metrics(g(conn, args.api_uri) for g in args.generator)
        print("\t".join(encode(h) for h in metrics.headers()))

        logging.info("Processing users...")
        for line in sys.stdin:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                # A blank or short line would otherwise abort the whole run
                # with a ValueError on unpacking.
                logging.warning("Skipping malformed input line: %r", line)
                continue

            username, timestamp = fields[0:2]
            username = unicode(username, 'utf-8')

            logging.debug("\t%s at %s:", username, timestamp)
            print("\t".join(encode(v) for v in metrics.values(username, timestamp)))
            LOGGING_STREAM.write("o")

        LOGGING_STREAM.write("\n")
    finally:
        # Always release the database connection, even on failure.
        conn.close()


if __name__ == "__main__":
    main()

Status & tagging log