r94535 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r94534‎ | r94535 | r94536 >
Date:18:06, 15 August 2011
Author:halfak
Status:deferred
Tags:
Comment:
another commit, but there's more
Modified paths:
  • /trunk/tools/wsor/article_feedback (added) (history)
  • /trunk/tools/wsor/article_feedback/R (added) (history)
  • /trunk/tools/wsor/scripts/fix_reg_date.dumb.py (added) (history)
  • /trunk/tools/wsor/scripts/rev_len_changed.py (added) (history)
  • /trunk/tools/wsor/welcomers (added) (history)
  • /trunk/tools/wsor/welcomers/welcomers.sql (added) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/rev_len_changed.py
@@ -0,0 +1,202 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+def main():
 14+ parser = argparse.ArgumentParser(
 15+ description='Gathers editor data for first and last session'
 16+ )
 17+ parser.add_argument(
 18+ '-c', '--cnf',
 19+ metavar="<path>",
 20+ type=str,
 21+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 22+ default=os.path.expanduser("~/.my.cnf")
 23+ )
 24+ parser.add_argument(
 25+ '-s', '--host',
 26+ type=str,
 27+ help='the database host to connect to (defaults to localhost)',
 28+ default="localhost"
 29+ )
 30+ parser.add_argument(
 31+ '-d', '--db',
 32+ type=str,
 33+ help='the language db to run the query in (defaults to enwiki)',
 34+ default="enwiki"
 35+ )
 36+ parser.add_argument(
 37+ '-o', '--out',
 38+ type=lambda fn:open(fn, 'w'),
 39+ help='an output file to write to (defaults to stdout)',
 40+ default=sys.stdout
 41+ )
 42+ parser.add_argument(
 43+ '-a', '--after',
 44+ type=int,
 45+ help="The page_id to start after. (Defaults to zero)",
 46+ default=0
 47+ )
 48+ args = parser.parse_args()
 49+
 50+ LOGGING_STREAM = sys.stderr
 51+ logging.basicConfig(
 52+ level=logging.DEBUG,
 53+ stream=LOGGING_STREAM,
 54+ format='%(asctime)s %(levelname)-8s %(message)s',
 55+ datefmt='%b-%d %H:%M:%S'
 56+ )
 57+
 58+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 59+ db = Database(
 60+ host=args.host,
 61+ db=args.db,
 62+ read_default_file=args.cnf
 63+ )
 64+ headers = [
 65+ 'rev_id',
 66+ 'rev_timestamp',
 67+ 'rev_year',
 68+ 'rev_month',
 69+ 'rev_len',
 70+ 'user_id',
 71+ 'user_text',
 72+ 'page_id',
 73+ 'namespace',
 74+ 'parent_id',
 75+ 'len_change'
 76+ ]
 77+
 78+ print("\t".join(headers))
 79+
 80+ logging.info("Processing revisions:")
 81+ count = 0
 82+ for page in db.getPages(afterId=args.after):
 83+ LOGGING_STREAM.write("|")
 84+ last = None
 85+ for revision in db.getPageRevisions(page['page_id']):
 86+
 87+ revision['namespace'] = page['page_namespace']
 88+
 89+ if last == None:
 90+ revision['parent_id'] = None
 91+ revision['len_change'] = revision['rev_len']
 92+ else:
 93+ revision['parent_id'] = last['rev_id']
 94+ revision['len_change'] = revision['rev_len'] - last['rev_len']
 95+
 96+ print("\t".join(encode(revision[h]) for h in headers))
 97+
 98+ if count % 10000 == 0 or count == 0 :
 99+ LOGGING_STREAM.write("\n%09d" % count)
 100+ if count % 1000 == 0:
 101+ LOGGING_STREAM.write(".")
 102+ last = revision
 103+ count += 1
 104+
 105+ LOGGING_STREAM.write("\n")
 106+
 107+
 108+
 109+
 110+
 111+
 112+
 113+class Database:
 114+
 115+ def __init__(self, *args, **kwargs):
 116+ self.args = args
 117+ self.kwargs = kwargs
 118+ self.conn = MySQLdb.connect(*args, **kwargs)
 119+ self.revConn = MySQLdb.connect(*args, **kwargs)
 120+
 121+ def getPages(self, afterId=0, bufferSize=10000):
 122+ counter = 1 #not zero
 123+ while counter > 0:
 124+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 125+ cursor.execute(
 126+ """
 127+ SELECT * FROM page
 128+ WHERE page_id > %(page_id)s
 129+ ORDER BY page_id
 130+ LIMIT %(limit)s
 131+ """,
 132+ {
 133+ 'page_id': afterId,
 134+ 'limit': bufferSize
 135+ }
 136+ )
 137+ counter = 0
 138+ for row in cursor:
 139+ yield row
 140+ afterId = row['page_id']
 141+ counter += 1
 142+
 143+
 144+ def getPageRevisions(self, pageId):
 145+ cursor = self.revConn.cursor()
 146+ cursor.execute(
 147+ """
 148+ SELECT
 149+ rev_id,
 150+ rev_timestamp,
 151+ YEAR(rev_timestamp) as rev_year,
 152+ MONTH(rev_timestamp) as rev_month,
 153+ rev_len,
 154+ rev_user as user_id,
 155+ rev_user_text as user_text,
 156+ rev_page as page_id,
 157+ IFNULL(rev_len, 0) as rev_len
 158+ FROM revision r
 159+ WHERE rev_page = %(page_id)s
 160+ ORDER BY rev_id
 161+ """,
 162+ {
 163+ 'page_id': pageId
 164+ }
 165+ )
 166+ for row in cursor:
 167+ yield dict(
 168+ zip(
 169+ [d[0] for d in cursor.description],
 170+ row
 171+ )
 172+ )
 173+
 174+ def getRevisions(self):
 175+ cursor = self.conn.cursor(MySQLdb.cursors.SSCursor)
 176+ cursor.execute(
 177+ """
 178+ SELECT
 179+ rev_id,
 180+ rev_timestamp,
 181+ YEAR(rev_timestamp) as rev_year,
 182+ MONTH(rev_timestamp) as rev_month,
 183+ rev_len,
 184+ rev_user as user_id,
 185+ rev_user_text as user_text,
 186+ rev_page as page_id,
 187+ p.page_namespace as namespace,
 188+ IFNULL(rev_len, 0) as rev_len
 189+ FROM revision r
 190+ INNER JOIN page p
 191+ ON r.rev_page = p.page_id
 192+ ORDER BY p.page_id, rev_id
 193+ """
 194+ )
 195+ for row in cursor:
 196+ yield dict(
 197+ zip(
 198+ [d[0] for d in cursor.description],
 199+ row
 200+ )
 201+ )
 202+
 203+if __name__ == "__main__": main()
Index: trunk/tools/wsor/scripts/fix_reg_date.dumb.py
@@ -0,0 +1,106 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def main():
 15+ parser = argparse.ArgumentParser(
 16+ description='Gathers editor data for first and last session'
 17+ )
 18+ parser.add_argument(
 19+ 'date',
 20+ type=str,
 21+ help='the date to start querying for users with dumb registration dates'
 22+ )
 23+ parser.add_argument(
 24+ '-c', '--cnf',
 25+ metavar="<path>",
 26+ type=str,
 27+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 28+ default=os.path.expanduser("~/.my.cnf")
 29+ )
 30+ parser.add_argument(
 31+ '-s', '--host',
 32+ type=str,
 33+ help='the database host to connect to (defaults to localhost)',
 34+ default="localhost"
 35+ )
 36+ parser.add_argument(
 37+ '-d', '--db',
 38+ type=str,
 39+ help='the language db to run the query in (defaults to enwiki)',
 40+ default="enwiki"
 41+ )
 42+ args = parser.parse_args()
 43+
 44+ LOGGING_STREAM = sys.stderr
 45+ logging.basicConfig(
 46+ level=logging.DEBUG,
 47+ stream=LOGGING_STREAM,
 48+ format='%(asctime)s %(levelname)-8s %(message)s',
 49+ datefmt='%b-%d %H:%M:%S'
 50+ )
 51+
 52+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 53+ db = Database(
 54+ host=args.host,
 55+ db=args.db,
 56+ read_default_file=args.cnf
 57+ )
 58+ headers = [
 59+ 'user_id',
 60+ 'user_registration'
 61+ ]
 62+
 63+ lowestDate = args.date
 64+ logging.info("foo")
 65+ for user in db.getUsersBefore(args.date):
 66+ if user['user_registration'] == None:
 67+ LOGGING_STREAM.write("!")
 68+ user['user_registration'] = lowestDate
 69+ print("\t".join(str(user[h]) for h in headers))
 70+ else:
 71+ LOGGING_STREAM.write(".")
 72+
 73+ lowestDate = min(user['user_registration'], lowestDate)
 74+
 75+ LOGGING_STREAM.write("\n")
 76+
 77+
 78+
 79+
 80+
 81+class Database:
 82+
 83+ def __init__(self, *args, **kwargs):
 84+ self.args = args
 85+ self.kwargs = kwargs
 86+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 87+
 88+ def getUsersBefore(self, date):
 89+ cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor)
 90+ cursor.execute(
 91+ """
 92+ SELECT
 93+ user_id,
 94+ user_registration
 95+ FROM user
 96+ WHERE user_registration <= %(date)s
 97+ OR user_registration IS NULL
 98+ ORDER BY user_id DESC
 99+ """,
 100+ {
 101+ 'date': date
 102+ }
 103+ )
 104+ for row in cursor:
 105+ yield row
 106+
 107+if __name__ == "__main__": main()
Index: trunk/tools/wsor/welcomers/welcomers.sql
@@ -0,0 +1,196 @@
 2+CREATE TABLE halfak.user_first_msg (
 3+ user_id INT UNSIGNED,
 4+ user_name VARBINARY(255),
 5+ msg_timestamp DATETIME
 6+);
 7+CREATE UNIQUE INDEX user_name ON halfak.user_first_msg (user_name);
 8+
 9+
 10+SELECT
 11+ user_name,
 12+ user_id,
 13+ is_anon,
 14+ user_registration,
 15+ sender_user,
 16+ sender_user_text,
 17+ msg_timestamp,
 18+ bot,
 19+ self,
 20+ tool,
 21+ warning,
 22+ edit_count,
 23+ ban_date,
 24+ edits_after_msg+deleted_edits_after_msg AS edits_after,
 25+ edits_before_msg+deleted_edits_before_msg AS edits_before
 26+FROM
 27+
 28+
 29+CREATE TABLE halfak.user_activity_first_msg
 30+SELECT
 31+uf.user_id,
 32+uf.user_name,
 33+rlc.namespace,
 34+
 35+
 36+SUM(
 37+ rlc.rev_timestamp < uf.msg_timestamp AND
 38+ rvt.revision_id IS NOT NULL
 39+) AS reverting_edits_before,
 40+SUM(
 41+ rlc.rev_timestamp < uf.msg_timestamp AND
 42+ rvt.revision_id IS NOT NULL AND
 43+ rvtd.revision_id IS NOT NULL
 44+) AS reverted_reverting_edits_before,
 45+
 46+
 47+SUM(
 48+ rlc.rev_timestamp < uf.msg_timestamp AND
 49+ rvt.revision_id IS NULL AND
 50+ len_change > 0
 51+) AS add_edits_before,
 52+SUM(IF(
 53+ rlc.rev_timestamp < uf.msg_timestamp AND
 54+ rvt.revision_id IS NULL AND
 55+ len_change > 0,
 56+ len_change, 0
 57+)) AS len_added_before,
 58+SUM(
 59+ rlc.rev_timestamp < uf.msg_timestamp AND
 60+ rvt.revision_id IS NULL AND
 61+ rvtd.revision_id IS NOT NULL AND
 62+ len_change > 0
 63+) AS reverted_add_edits_before,
 64+SUM(IF(
 65+ rlc.rev_timestamp < uf.msg_timestamp AND
 66+ rvt.revision_id IS NULL AND
 67+ rvtd.revision_id IS NOT NULL AND
 68+ len_change > 0,
 69+ len_change, 0
 70+)) AS reverted_len_added_before,
 71+
 72+
 73+SUM(
 74+ rlc.rev_timestamp < uf.msg_timestamp AND
 75+ rvt.revision_id IS NULL AND
 76+ len_change < 0
 77+) AS remove_edits_before,
 78+SUM(IF(
 79+ rlc.rev_timestamp < uf.msg_timestamp AND
 80+ rvt.revision_id IS NULL AND
 81+ len_change < 0,
 82+ len_change, 0
 83+)) AS len_removed_before,
 84+SUM(
 85+ rlc.rev_timestamp < uf.msg_timestamp AND
 86+ rvt.revision_id IS NULL AND
 87+ rvtd.revision_id IS NOT NULL AND
 88+ len_change < 0
 89+) AS reverted_remove_edits_before,
 90+SUM(IF(
 91+ rlc.rev_timestamp < uf.msg_timestamp AND
 92+ rvt.revision_id IS NULL AND
 93+ rvtd.revision_id IS NOT NULL AND
 94+ len_change < 0,
 95+ len_change, 0
 96+)) AS reverted_len_remove_before,
 97+
 98+
 99+SUM(
 100+ rlc.rev_timestamp < uf.msg_timestamp AND
 101+ rvt.revision_id IS NULL AND
 102+ len_change = 0
 103+) AS noop_edits_before,
 104+SUM(
 105+ rlc.rev_timestamp < uf.msg_timestamp AND
 106+ rvt.revision_id IS NULL AND
 107+ rvtd.revision_id IS NOT NULL AND
 108+ len_change = 0
 109+) AS reverted_noop_edits_before,
 110+
 111+
 112+
 113+
 114+SUM(
 115+ rlc.rev_timestamp > uf.msg_timestamp AND
 116+ rvt.revision_id IS NOT NULL
 117+) AS reverting_edits_after,
 118+SUM(
 119+ rlc.rev_timestamp > uf.msg_timestamp AND
 120+ rvt.revision_id IS NOT NULL AND
 121+ rvtd.revision_id IS NOT NULL
 122+) AS reverted_reverting_edits_after,
 123+
 124+
 125+SUM(
 126+ rlc.rev_timestamp > uf.msg_timestamp AND
 127+ rvt.revision_id IS NULL AND
 128+ len_change > 0
 129+) AS add_edits_after,
 130+SUM(IF(
 131+ rlc.rev_timestamp > uf.msg_timestamp AND
 132+ rvt.revision_id IS NULL AND
 133+ len_change > 0,
 134+ len_change, 0
 135+)) AS len_added_after,
 136+SUM(
 137+ rlc.rev_timestamp > uf.msg_timestamp AND
 138+ rvt.revision_id IS NULL AND
 139+ rvtd.revision_id IS NOT NULL AND
 140+ len_change > 0
 141+) AS reverted_add_edits_after,
 142+SUM(IF(
 143+ rlc.rev_timestamp > uf.msg_timestamp AND
 144+ rvt.revision_id IS NULL AND
 145+ rvtd.revision_id IS NOT NULL AND
 146+ len_change > 0,
 147+ len_change, 0
 148+)) AS reverted_len_added_after,
 149+
 150+
 151+SUM(
 152+ rlc.rev_timestamp > uf.msg_timestamp AND
 153+ rvt.revision_id IS NULL AND
 154+ len_change < 0
 155+) AS remove_edits_after,
 156+SUM(IF(
 157+ rlc.rev_timestamp > uf.msg_timestamp AND
 158+ rvt.revision_id IS NULL AND
 159+ len_change < 0,
 160+ len_change, 0
 161+)) AS len_removed_after,
 162+SUM(
 163+ rlc.rev_timestamp < uf.msg_timestamp AND
 164+ rvt.revision_id IS NULL AND
 165+ rvtd.revision_id IS NOT NULL AND
 166+ len_change < 0
 167+) AS reverted_remove_edits_after,
 168+SUM(IF(
 169+ rlc.rev_timestamp > uf.msg_timestamp AND
 170+ rvt.revision_id IS NULL AND
 171+ rvtd.revision_id IS NOT NULL AND
 172+ len_change < 0,
 173+ len_change, 0
 174+)) AS reverted_len_remove_after,
 175+
 176+
 177+SUM(
 178+ rlc.rev_timestamp > uf.msg_timestamp AND
 179+ rvt.revision_id IS NULL AND
 180+ len_change = 0
 181+) AS noop_edits_after,
 182+SUM(
 183+ rlc.rev_timestamp > uf.msg_timestamp AND
 184+ rvt.revision_id IS NULL AND
 185+ rvtd.revision_id IS NOT NULL AND
 186+ len_change = 0
 187+) AS reverted_noop_edits_after
 188+
 189+FROM halfak.user_first_msg uf
 190+INNER JOIN halfak.rev_len_changed_namespace rlc
 191+ ON uf.user_name = rlc.user_text
 192+LEFT JOIN halfak.revert_20110115 rvt
 193+ ON rvt.revision_id = rlc.rev_id
 194+LEFT JOIN halfak.reverted_20110115 rvtd
 195+ ON rvtd.revision_id = rlc.rev_id
 196+GROUP BY uf.user_name, rlc.namespace;
 197+

Status & tagging log