Index: trunk/tools/wsor/scripts/rev_len_changed.py |
— | — | @@ -0,0 +1,202 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | + if v == None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | +def main(): |
| 14 | + parser = argparse.ArgumentParser( |
| 15 | + description='Gathers editor data for first and last session' |
| 16 | + ) |
| 17 | + parser.add_argument( |
| 18 | + '-c', '--cnf', |
| 19 | + metavar="<path>", |
| 20 | + type=str, |
| 21 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 22 | + default=os.path.expanduser("~/.my.cnf") |
| 23 | + ) |
| 24 | + parser.add_argument( |
| 25 | + '-s', '--host', |
| 26 | + type=str, |
| 27 | + help='the database host to connect to (defaults to localhost)', |
| 28 | + default="localhost" |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-d', '--db', |
| 32 | + type=str, |
| 33 | + help='the language db to run the query in (defaults to enwiki)', |
| 34 | + default="enwiki" |
| 35 | + ) |
| 36 | + parser.add_argument( |
| 37 | + '-o', '--out', |
| 38 | + type=lambda fn:open(fn, 'w'), |
| 39 | + help='an output file to write to (defaults to stdout)', |
| 40 | + default=sys.stdout |
| 41 | + ) |
| 42 | + parser.add_argument( |
| 43 | + '-a', '--after', |
| 44 | + type=int, |
| 45 | + help="The page_id to start after. (Defaults to zero)", |
| 46 | + default=0 |
| 47 | + ) |
| 48 | + args = parser.parse_args() |
| 49 | + |
| 50 | + LOGGING_STREAM = sys.stderr |
| 51 | + logging.basicConfig( |
| 52 | + level=logging.DEBUG, |
| 53 | + stream=LOGGING_STREAM, |
| 54 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 55 | + datefmt='%b-%d %H:%M:%S' |
| 56 | + ) |
| 57 | + |
| 58 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 59 | + db = Database( |
| 60 | + host=args.host, |
| 61 | + db=args.db, |
| 62 | + read_default_file=args.cnf |
| 63 | + ) |
| 64 | + headers = [ |
| 65 | + 'rev_id', |
| 66 | + 'rev_timestamp', |
| 67 | + 'rev_year', |
| 68 | + 'rev_month', |
| 69 | + 'rev_len', |
| 70 | + 'user_id', |
| 71 | + 'user_text', |
| 72 | + 'page_id', |
| 73 | + 'namespace', |
| 74 | + 'parent_id', |
| 75 | + 'len_change' |
| 76 | + ] |
| 77 | + |
| 78 | + print("\t".join(headers)) |
| 79 | + |
| 80 | + logging.info("Processing revisions:") |
| 81 | + count = 0 |
| 82 | + for page in db.getPages(afterId=args.after): |
| 83 | + LOGGING_STREAM.write("|") |
| 84 | + last = None |
| 85 | + for revision in db.getPageRevisions(page['page_id']): |
| 86 | + |
| 87 | + revision['namespace'] = page['page_namespace'] |
| 88 | + |
| 89 | + if last == None: |
| 90 | + revision['parent_id'] = None |
| 91 | + revision['len_change'] = revision['rev_len'] |
| 92 | + else: |
| 93 | + revision['parent_id'] = last['rev_id'] |
| 94 | + revision['len_change'] = revision['rev_len'] - last['rev_len'] |
| 95 | + |
| 96 | + print("\t".join(encode(revision[h]) for h in headers)) |
| 97 | + |
| 98 | + if count % 10000 == 0 or count == 0 : |
| 99 | + LOGGING_STREAM.write("\n%09d" % count) |
| 100 | + if count % 1000 == 0: |
| 101 | + LOGGING_STREAM.write(".") |
| 102 | + last = revision |
| 103 | + count += 1 |
| 104 | + |
| 105 | + LOGGING_STREAM.write("\n") |
| 106 | + |
| 107 | + |
| 108 | + |
| 109 | + |
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | +class Database: |
| 114 | + |
| 115 | + def __init__(self, *args, **kwargs): |
| 116 | + self.args = args |
| 117 | + self.kwargs = kwargs |
| 118 | + self.conn = MySQLdb.connect(*args, **kwargs) |
| 119 | + self.revConn = MySQLdb.connect(*args, **kwargs) |
| 120 | + |
| 121 | + def getPages(self, afterId=0, bufferSize=10000): |
| 122 | + counter = 1 #not zero |
| 123 | + while counter > 0: |
| 124 | + cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
| 125 | + cursor.execute( |
| 126 | + """ |
| 127 | + SELECT * FROM page |
| 128 | + WHERE page_id > %(page_id)s |
| 129 | + ORDER BY page_id |
| 130 | + LIMIT %(limit)s |
| 131 | + """, |
| 132 | + { |
| 133 | + 'page_id': afterId, |
| 134 | + 'limit': bufferSize |
| 135 | + } |
| 136 | + ) |
| 137 | + counter = 0 |
| 138 | + for row in cursor: |
| 139 | + yield row |
| 140 | + afterId = row['page_id'] |
| 141 | + counter += 1 |
| 142 | + |
| 143 | + |
| 144 | + def getPageRevisions(self, pageId): |
| 145 | + cursor = self.revConn.cursor() |
| 146 | + cursor.execute( |
| 147 | + """ |
| 148 | + SELECT |
| 149 | + rev_id, |
| 150 | + rev_timestamp, |
| 151 | + YEAR(rev_timestamp) as rev_year, |
| 152 | + MONTH(rev_timestamp) as rev_month, |
| 153 | + rev_len, |
| 154 | + rev_user as user_id, |
| 155 | + rev_user_text as user_text, |
| 156 | + rev_page as page_id, |
| 157 | + IFNULL(rev_len, 0) as rev_len |
| 158 | + FROM revision r |
| 159 | + WHERE rev_page = %(page_id)s |
| 160 | + ORDER BY rev_id |
| 161 | + """, |
| 162 | + { |
| 163 | + 'page_id': pageId |
| 164 | + } |
| 165 | + ) |
| 166 | + for row in cursor: |
| 167 | + yield dict( |
| 168 | + zip( |
| 169 | + [d[0] for d in cursor.description], |
| 170 | + row |
| 171 | + ) |
| 172 | + ) |
| 173 | + |
| 174 | + def getRevisions(self): |
| 175 | + cursor = self.conn.cursor(MySQLdb.cursors.SSCursor) |
| 176 | + cursor.execute( |
| 177 | + """ |
| 178 | + SELECT |
| 179 | + rev_id, |
| 180 | + rev_timestamp, |
| 181 | + YEAR(rev_timestamp) as rev_year, |
| 182 | + MONTH(rev_timestamp) as rev_month, |
| 183 | + rev_len, |
| 184 | + rev_user as user_id, |
| 185 | + rev_user_text as user_text, |
| 186 | + rev_page as page_id, |
| 187 | + p.page_namespace as namespace, |
| 188 | + IFNULL(rev_len, 0) as rev_len |
| 189 | + FROM revision r |
| 190 | + INNER JOIN page p |
| 191 | + ON r.rev_page = p.page_id |
| 192 | + ORDER BY p.page_id, rev_id |
| 193 | + """ |
| 194 | + ) |
| 195 | + for row in cursor: |
| 196 | + yield dict( |
| 197 | + zip( |
| 198 | + [d[0] for d in cursor.description], |
| 199 | + row |
| 200 | + ) |
| 201 | + ) |
| 202 | + |
| 203 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/scripts/fix_reg_date.dumb.py |
— | — | @@ -0,0 +1,106 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | + if v == None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def main(): |
| 15 | + parser = argparse.ArgumentParser( |
| 16 | + description='Gathers editor data for first and last session' |
| 17 | + ) |
| 18 | + parser.add_argument( |
| 19 | + 'date', |
| 20 | + type=str, |
| 21 | + help='the date to start querying for users with dumb registration dates' |
| 22 | + ) |
| 23 | + parser.add_argument( |
| 24 | + '-c', '--cnf', |
| 25 | + metavar="<path>", |
| 26 | + type=str, |
| 27 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 28 | + default=os.path.expanduser("~/.my.cnf") |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-s', '--host', |
| 32 | + type=str, |
| 33 | + help='the database host to connect to (defaults to localhost)', |
| 34 | + default="localhost" |
| 35 | + ) |
| 36 | + parser.add_argument( |
| 37 | + '-d', '--db', |
| 38 | + type=str, |
| 39 | + help='the language db to run the query in (defaults to enwiki)', |
| 40 | + default="enwiki" |
| 41 | + ) |
| 42 | + args = parser.parse_args() |
| 43 | + |
| 44 | + LOGGING_STREAM = sys.stderr |
| 45 | + logging.basicConfig( |
| 46 | + level=logging.DEBUG, |
| 47 | + stream=LOGGING_STREAM, |
| 48 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 49 | + datefmt='%b-%d %H:%M:%S' |
| 50 | + ) |
| 51 | + |
| 52 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 53 | + db = Database( |
| 54 | + host=args.host, |
| 55 | + db=args.db, |
| 56 | + read_default_file=args.cnf |
| 57 | + ) |
| 58 | + headers = [ |
| 59 | + 'user_id', |
| 60 | + 'user_registration' |
| 61 | + ] |
| 62 | + |
| 63 | + lowestDate = args.date |
| 64 | + logging.info("foo") |
| 65 | + for user in db.getUsersBefore(args.date): |
| 66 | + if user['user_registration'] == None: |
| 67 | + LOGGING_STREAM.write("!") |
| 68 | + user['user_registration'] = lowestDate |
| 69 | + print("\t".join(str(user[h]) for h in headers)) |
| 70 | + else: |
| 71 | + LOGGING_STREAM.write(".") |
| 72 | + |
| 73 | + lowestDate = min(user['user_registration'], lowestDate) |
| 74 | + |
| 75 | + LOGGING_STREAM.write("\n") |
| 76 | + |
| 77 | + |
| 78 | + |
| 79 | + |
| 80 | + |
| 81 | +class Database: |
| 82 | + |
| 83 | + def __init__(self, *args, **kwargs): |
| 84 | + self.args = args |
| 85 | + self.kwargs = kwargs |
| 86 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 87 | + |
| 88 | + def getUsersBefore(self, date): |
| 89 | + cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 90 | + cursor.execute( |
| 91 | + """ |
| 92 | + SELECT |
| 93 | + user_id, |
| 94 | + user_registration |
| 95 | + FROM user |
| 96 | + WHERE user_registration <= %(date)s |
| 97 | + OR user_registration IS NULL |
| 98 | + ORDER BY user_id DESC |
| 99 | + """, |
| 100 | + { |
| 101 | + 'date': date |
| 102 | + } |
| 103 | + ) |
| 104 | + for row in cursor: |
| 105 | + yield row |
| 106 | + |
| 107 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/welcomers/welcomers.sql |
— | — | @@ -0,0 +1,196 @@ |
| 2 | +CREATE TABLE halfak.user_first_msg ( |
| 3 | + user_id INT UNSIGNED, |
| 4 | + user_name VARBINARY(255), |
| 5 | + msg_timestamp DATETIME |
| 6 | +); |
| 7 | +CREATE UNIQUE INDEX user_name ON halfak.user_first_msg (user_name); |
| 8 | + |
| 9 | + |
| 10 | +SELECT |
| 11 | + user_name, |
| 12 | + user_id, |
| 13 | + is_anon, |
| 14 | + user_registration, |
| 15 | + sender_user, |
| 16 | + sender_user_text, |
| 17 | + msg_timestamp, |
| 18 | + bot, |
| 19 | + self, |
| 20 | + tool, |
| 21 | + warning, |
| 22 | + edit_count, |
| 23 | + ban_date, |
| 24 | + edits_after_msg+deleted_edits_after_msg AS edits_after, |
| 25 | + edits_before_msg+deleted_edits_before_msg AS edits_before |
| 26 | +FROM |
| 27 | + |
| 28 | + |
| 29 | +CREATE TABLE halfak.user_activity_first_msg |
| 30 | +SELECT |
| 31 | +uf.user_id, |
| 32 | +uf.user_name, |
| 33 | +rlc.namespace, |
| 34 | + |
| 35 | + |
| 36 | +SUM( |
| 37 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 38 | + rvt.revision_id IS NOT NULL |
| 39 | +) AS reverting_edits_before, |
| 40 | +SUM( |
| 41 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 42 | + rvt.revision_id IS NOT NULL AND |
| 43 | + rvtd.revision_id IS NOT NULL |
| 44 | +) AS reverted_reverting_edits_before, |
| 45 | + |
| 46 | + |
| 47 | +SUM( |
| 48 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 49 | + rvt.revision_id IS NULL AND |
| 50 | + len_change > 0 |
| 51 | +) AS add_edits_before, |
| 52 | +SUM(IF( |
| 53 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 54 | + rvt.revision_id IS NULL AND |
| 55 | + len_change > 0, |
| 56 | + len_change, 0 |
| 57 | +)) AS len_added_before, |
| 58 | +SUM( |
| 59 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 60 | + rvt.revision_id IS NULL AND |
| 61 | + rvtd.revision_id IS NOT NULL AND |
| 62 | + len_change > 0 |
| 63 | +) AS reverted_add_edits_before, |
| 64 | +SUM(IF( |
| 65 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 66 | + rvt.revision_id IS NULL AND |
| 67 | + rvtd.revision_id IS NOT NULL AND |
| 68 | + len_change > 0, |
| 69 | + len_change, 0 |
| 70 | +)) AS reverted_len_added_before, |
| 71 | + |
| 72 | + |
| 73 | +SUM( |
| 74 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 75 | + rvt.revision_id IS NULL AND |
| 76 | + len_change < 0 |
| 77 | +) AS remove_edits_before, |
| 78 | +SUM(IF( |
| 79 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 80 | + rvt.revision_id IS NULL AND |
| 81 | + len_change < 0, |
| 82 | + len_change, 0 |
| 83 | +)) AS len_removed_before, |
| 84 | +SUM( |
| 85 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 86 | + rvt.revision_id IS NULL AND |
| 87 | + rvtd.revision_id IS NOT NULL AND |
| 88 | + len_change < 0 |
| 89 | +) AS reverted_remove_edits_before, |
| 90 | +SUM(IF( |
| 91 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 92 | + rvt.revision_id IS NULL AND |
| 93 | + rvtd.revision_id IS NOT NULL AND |
| 94 | + len_change < 0, |
| 95 | + len_change, 0 |
| 96 | +)) AS reverted_len_remove_before, |
| 97 | + |
| 98 | + |
| 99 | +SUM( |
| 100 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 101 | + rvt.revision_id IS NULL AND |
| 102 | + len_change = 0 |
| 103 | +) AS noop_edits_before, |
| 104 | +SUM( |
| 105 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 106 | + rvt.revision_id IS NULL AND |
| 107 | + rvtd.revision_id IS NOT NULL AND |
| 108 | + len_change = 0 |
| 109 | +) AS reverted_noop_edits_before, |
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | + |
| 114 | +SUM( |
| 115 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 116 | + rvt.revision_id IS NOT NULL |
| 117 | +) AS reverting_edits_after, |
| 118 | +SUM( |
| 119 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 120 | + rvt.revision_id IS NOT NULL AND |
| 121 | + rvtd.revision_id IS NOT NULL |
| 122 | +) AS reverted_reverting_edits_after, |
| 123 | + |
| 124 | + |
| 125 | +SUM( |
| 126 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 127 | + rvt.revision_id IS NULL AND |
| 128 | + len_change > 0 |
| 129 | +) AS add_edits_after, |
| 130 | +SUM(IF( |
| 131 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 132 | + rvt.revision_id IS NULL AND |
| 133 | + len_change > 0, |
| 134 | + len_change, 0 |
| 135 | +)) AS len_added_after, |
| 136 | +SUM( |
| 137 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 138 | + rvt.revision_id IS NULL AND |
| 139 | + rvtd.revision_id IS NOT NULL AND |
| 140 | + len_change > 0 |
| 141 | +) AS reverted_add_edits_after, |
| 142 | +SUM(IF( |
| 143 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 144 | + rvt.revision_id IS NULL AND |
| 145 | + rvtd.revision_id IS NOT NULL AND |
| 146 | + len_change > 0, |
| 147 | + len_change, 0 |
| 148 | +)) AS reverted_len_added_after, |
| 149 | + |
| 150 | + |
| 151 | +SUM( |
| 152 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 153 | + rvt.revision_id IS NULL AND |
| 154 | + len_change < 0 |
| 155 | +) AS remove_edits_after, |
| 156 | +SUM(IF( |
| 157 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 158 | + rvt.revision_id IS NULL AND |
| 159 | + len_change < 0, |
| 160 | + len_change, 0 |
| 161 | +)) AS len_removed_after, |
| 162 | +SUM( |
| 163 | + rlc.rev_timestamp < uf.msg_timestamp AND |
| 164 | + rvt.revision_id IS NULL AND |
| 165 | + rvtd.revision_id IS NOT NULL AND |
| 166 | + len_change < 0 |
| 167 | +) AS reverted_remove_edits_after, |
| 168 | +SUM(IF( |
| 169 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 170 | + rvt.revision_id IS NULL AND |
| 171 | + rvtd.revision_id IS NOT NULL AND |
| 172 | + len_change < 0, |
| 173 | + len_change, 0 |
| 174 | +)) AS reverted_len_remove_after, |
| 175 | + |
| 176 | + |
| 177 | +SUM( |
| 178 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 179 | + rvt.revision_id IS NULL AND |
| 180 | + len_change = 0 |
| 181 | +) AS noop_edits_after, |
| 182 | +SUM( |
| 183 | + rlc.rev_timestamp > uf.msg_timestamp AND |
| 184 | + rvt.revision_id IS NULL AND |
| 185 | + rvtd.revision_id IS NOT NULL AND |
| 186 | + len_change = 0 |
| 187 | +) AS reverted_noop_edits_after |
| 188 | + |
| 189 | +FROM halfak.user_first_msg uf |
| 190 | +INNER JOIN halfak.rev_len_changed_namespace rlc |
| 191 | + ON uf.user_name = rlc.user_text |
| 192 | +LEFT JOIN halfak.revert_20110115 rvt |
| 193 | + ON rvt.revision_id = rlc.rev_id |
| 194 | +LEFT JOIN halfak.reverted_20110115 rvtd |
| 195 | + ON rvtd.revision_id = rlc.rev_id |
| 196 | +GROUP BY uf.user_name, rlc.namespace; |
| 197 | + |