r91613 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91612‎ | r91613 | r91614 >
Date:22:15, 6 July 2011
Author:halfak
Status:deferred
Tags:
Comment:
Adding directory and work for vandal_conversion
Modified paths:
  • /trunk/tools/wsor/scripts/classes/file_wrapper.py (modified) (history)
  • /trunk/tools/wsor/vandal_conversion (added) (history)
  • /trunk/tools/wsor/vandal_conversion/get_editor_conversions.py (added) (history)
  • /trunk/tools/wsor/vandal_conversion/queries.sql (added) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/file_wrapper.py
@@ -15,7 +15,6 @@
1616 if self.closed: raise ValueError("I/O operation on closed file")
1717
1818 preBytes = self.pre.read(bytes)
19 - print(str(preBytes))
2019 if len(preBytes) < bytes:
2120 fpBytes = self.fp.read(bytes-len(preBytes))
2221 else:
Index: trunk/tools/wsor/vandal_conversion/get_editor_conversions.py
@@ -0,0 +1,230 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse
 3+import wmf
 4+
 5+
 6+def main():
 7+ parser = argparse.ArgumentParser(
 8+ description='Gathers editor data for first and last session'
 9+ )
 10+ parser.add_argument(
 11+ 'min_edits',
 12+ type=int,
 13+ help='the minimum number of edits that editors must have perfomed to be included'
 14+ )
 15+ parser.add_argument(
 16+ 'session',
 17+ type=int,
 18+ help='maximum time between session edits (in seconds)'
 19+ )
 20+ parser.add_argument(
 21+ '-c', '--cnf',
 22+ metavar="<path>",
 23+ type=str,
 24+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 25+ default=os.path.expanduser("~/.my.cnf")
 26+ )
 27+ parser.add_argument(
 28+ '-s', '--host',
 29+ type=str,
 30+ help='the database host to connect to (defaults to localhost)',
 31+ default="localhost"
 32+ )
 33+ parser.add_argument(
 34+ '-d', '--db',
 35+ type=str,
 36+ help='the language db to run the query in (defaults to enwiki)',
 37+ default="enwiki"
 38+ )
 39+ args = parser.parse_args()
 40+
 41+ LOGGING_STREAM = sys.stderr
 42+ logging.basicConfig(
 43+ level=logging.DEBUG,
 44+ stream=LOGGING_STREAM,
 45+ format='%(asctime)s %(levelname)-8s %(message)s',
 46+ datefmt='%b-%d %H:%M:%S'
 47+ )
 48+
 49+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 50+ db = Database(
 51+ host=args.host,
 52+ db=args.db,
 53+ read_default_file=args.cnf
 54+ )
 55+
 56+ print(
 57+ "\t".join([
 58+ 'user_id',
 59+ 'user_name',
 60+ 'first_edit',
 61+ 'fes_edits',
 62+ 'fes_reverted',
 63+ 'fes_vandalism',
 64+ 'fes_deleted',
 65+ 'last10_edits',
 66+ 'last10_reverted',
 67+ 'last10_vandalism',
 68+ 'last10_deleted'
 69+ ])
 70+ )
 71+
 72+ logging.info("Processing users:")
 73+ for user in db.getUsers(minimumEdits=args.min_edits):
 74+ firstSession = []
 75+ last = None
 76+ for rev in db.getFirstEdits(user['user_id']):
 77+ if last != None:
 78+ diff = wmf.wp2Timestamp(rev['rev_timestamp']) - wmf.wp2Timestamp(last['rev_timestamp'])
 79+ assert diff >= 0
 80+ if diff < args.session:
 81+ firstSession.append(rev)
 82+ else:
 83+ break
 84+
 85+ else:
 86+ firstSession.append(rev)
 87+
 88+ last = rev
 89+
 90+ last10 = list(db.getLastEdits(user['user_id'], maximum=10))
 91+ logging.debug("%s(%s): %s %s" % (user['user_name'], user['user_id'], len(firstSession)*">", len(last10)*"<"))
 92+ print(
 93+ "\t".join(
 94+ str(v).encode("string-escape") for v in [
 95+ user['user_id'],
 96+ user['user_name'],
 97+ user['editcount'],
 98+ firstSession[0]['rev_timestamp'],
 99+ len(firstSession),
 100+ len([r for r in firstSession if r['is_reverted']]),
 101+ len([r for r in firstSession if r['is_vandalism']]),
 102+ len([r for r in firstSession if r['deleted']]),
 103+ len(last10),
 104+ len([r for r in last10 if r['is_reverted']]),
 105+ len([r for r in last10 if r['is_vandalism']]),
 106+ len([r for r in last10 if r['deleted']])
 107+ ]
 108+ )
 109+ )
 110+
 111+
 112+
 113+
 114+
 115+class Database:
 116+
 117+ def __init__(self, *args, **kwargs):
 118+ self.args = args
 119+ self.kwargs = kwargs
 120+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 121+ self.editsConn = MySQLdb.connect(*args, **kwargs)
 122+
 123+ def getUsers(self, minimumEdits=0):
 124+ minimumEdits = int(minimumEdits)
 125+ cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor)
 126+ cursor.execute(
 127+ """
 128+ SELECT
 129+ u.user_id,
 130+ u.user_name,
 131+ um.first_edit,
 132+ u.user_editcount as editcount
 133+ FROM user u
 134+ INNER JOIN halfak.user_meta um USING (user_id)
 135+ WHERE u.user_editcount >= %(minimum_edits)s
 136+ """,
 137+ {
 138+ 'minimum_edits': minimumEdits
 139+ }
 140+ )
 141+ for row in cursor:
 142+ yield row
 143+
 144+
 145+
 146+ def getEdits(self, userId, maximum=10000, chronologically=True):
 147+ userId = int(userId)
 148+ revisionCursor = self.editsConn.cursor(MySQLdb.cursors.SSDictCursor)
 149+ archiveCursor = self.editsConn.cursor(MySQLdb.cursors.SSDictCursor)
 150+
 151+ if chronologically: direction = "ASC"
 152+ else: direction = "DESC"
 153+
 154+ revisionCursor.execute(
 155+ """
 156+ SELECT
 157+ r.rev_id,
 158+ r.rev_timestamp,
 159+ rvtd.revision_id IS NOT NULL AS is_reverted,
 160+ rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE AS is_vandalism,
 161+ False AS deleted
 162+ FROM revision r
 163+ LEFT JOIN halfak.reverted_20110115 rvtd
 164+ ON r.rev_id = rvtd.revision_id
 165+ WHERE rev_user = %(user_id)s
 166+ ORDER BY r.timestamp """ + direction + """
 167+ LIMIT %(maximum)s;
 168+ """,
 169+ {
 170+ 'user_id': userId,
 171+ 'maximum': maximum
 172+ }
 173+ )
 174+ archiveCursor.execute(
 175+ """
 176+ SELECT
 177+ ar.ar_rev_id AS rev_id,
 178+ ar.ar_timestamp AS rev_timestamp,
 179+ NULL AS is_reverted,
 180+ NULL AS is_vandalism,
 181+ True AS deleted
 182+ FROM archive ar
 183+ WHERE ar_user = %(user_id)s
 184+ ORDER BY ar.timestamp """ + direction + """
 185+ LIMIT %(maximum)s;
 186+ """,
 187+ {
 188+ 'user_id': userId,
 189+ 'maximum': maximum
 190+ }
 191+ )
 192+ if chronologically:
 193+ order = lambda t1, t2:t1 < t2
 194+ else:
 195+ order = lambda t1, t2:t1 > t2
 196+
 197+ revPointer = revisionCursor.fetchrow()
 198+ archPointer = archiveCursor.fetchrow()
 199+ count = 0
 200+ while revPointer != None or archPointer != None: #still something to output
 201+ if revPointer != None and archPointer != None: #both cursors still have something
 202+ if order(revPointer['rev_timestamp'], archPointer['rev_timestamp']):
 203+ yield revPointer
 204+ revPointer = revisionCursor.fetchrow()
 205+ else:
 206+ yield archPointer
 207+ archPointer = archiveCursor.fetchrow()
 208+ elif revPointer != None: #only revisions left
 209+ yield revPointer
 210+ revPointer = revisionCursor.fetchrow()
 211+ elif archPointer != None:
 212+ yield archPointer
 213+ archPointer = archiveCursor.fetchrow()
 214+
 215+ count += 1
 216+ if count >= maximum: break
 217+
 218+
 219+
 220+
 221+
 222+
 223+ def getFirstEdits(self, userId, maximum=10000):
 224+ return self.getEdits(userId, maximum, chronologically=True)
 225+
 226+ def getLastEdits(self, userId, maximum=10000):
 227+ return self.getEdits(userId, maximum, chronologically=False)
 228+
 229+
 230+if __name__ == "__main__":
 231+ main()
Index: trunk/tools/wsor/vandal_conversion/queries.sql
@@ -0,0 +1,25 @@
 2+SELECT
 3+ r.rev_id,
 4+ r.timestamp,
 5+ rvtd.revision_id IS NOT NULL,
 6+ rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE
 7+FROM revision r
 8+LEFT JOIN halfak.reverted_20110115 rvtd
 9+ ON r.rev_id = rvtd.revision_id
 10+WHERE rev_user = 2345678
 11+ORDER BY r.rev_id ASC
 12+LIMIT 10000;
 13+
 14+SELECT
 15+ r.rev_id,
 16+ r.timestamp,
 17+ rvtd.revision_id IS NOT NULL,
 18+ rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE
 19+FROM revision r
 20+LEFT JOIN halfak.reverted_20110115 rvtd
 21+ ON r.rev_id = rvtd.revision_id
 22+WHERE r.rev_user = 2345678
 23+ORDER BY r.rev_id DESC
 24+LIMIT 10000;
 25+
 26+

Status & tagging log