r91681 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91680‎ | r91681 | r91682 >
Date:20:48, 7 July 2011
Author:halfak
Status:deferred
Tags:
Comment:
vandal conversion work
Modified paths:
  • /trunk/tools/wsor/ts_samples/get_established_users.py (added) (history)
  • /trunk/tools/wsor/ts_samples/testing.sql (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/get_editor_conversions.py (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/queries.sql (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/ts_samples/testing.sql
@@ -67,3 +67,16 @@
6868 SUBSTRING(first_edit, 5,2)
6969 ) as foo
7070 GROUP BY year, biannual;
 71+
 72+
 73+
 74+SELECT max(rev_timestamp) as timestamp, count(*) as edits
 75+FROM (
 76+ SELECT rev_user, rev_timestamp, rev_page
 77+ FROM enwiki.revision r2
 78+ WHERE r2.rev_user = 40
 79+ ORDER BY rev_timestamp LIMIT 100
 80+) AS foo
 81+ORDER BY rev_timestamp DESC
 82+LIMIT 1
 83+
Index: trunk/tools/wsor/ts_samples/get_established_users.py
@@ -0,0 +1,160 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def main():
 15+ parser = argparse.ArgumentParser(
 16+ description='Gathers editor established period'
 17+ )
 18+ parser.add_argument(
 19+ 'min_edits',
 20+ type=int,
 21+ help='the minimum number of edits that editors must have perfomed before being considered established'
 22+ )
 23+ parser.add_argument(
 24+ '-c', '--cnf',
 25+ metavar="<path>",
 26+ type=str,
 27+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 28+ default=os.path.expanduser("~/.my.cnf")
 29+ )
 30+ parser.add_argument(
 31+ '-s', '--host',
 32+ type=str,
 33+ help='the database host to connect to (defaults to localhost)',
 34+ default="localhost"
 35+ )
 36+ parser.add_argument(
 37+ '-d', '--db',
 38+ type=str,
 39+ help='the language db to run the query in (defaults to enwiki)',
 40+ default="enwiki"
 41+ )
 42+ parser.add_argument(
 43+ '-o', '--old',
 44+ type=lambda fn:open(fn, 'r'),
 45+ help='an old output file to process (defaults to stdin)',
 46+ default=sys.stdin
 47+ )
 48+ args = parser.parse_args()
 49+
 50+ LOGGING_STREAM = sys.stderr
 51+ logging.basicConfig(
 52+ level=logging.DEBUG,
 53+ stream=LOGGING_STREAM,
 54+ format='%(asctime)s %(levelname)-8s %(message)s',
 55+ datefmt='%b-%d %H:%M:%S'
 56+ )
 57+
 58+ processedUsers = set()
 59+ args.old.readline() # dump headers
 60+ for line in args.old:
 61+ userId = int(line.strip().split("\t")[0])
 62+ processedUsers.add(userId)
 63+
 64+
 65+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 66+ db = Database(
 67+ host=args.host,
 68+ db=args.db,
 69+ read_default_file=args.cnf
 70+ )
 71+ headers = [
 72+ 'user_id',
 73+ 'user_name',
 74+ '100th_edit',
 75+ 'last_edit'
 76+ ]
 77+ print(
 78+ "\t".join(headers)
 79+ )
 80+
 81+ logging.info("Processing users:")
 82+ for user in db.getUsers(minimumEdits=args.min_edits):
 83+ if user['user_id'] in processedUsers: continue
 84+
 85+ user['100th_edit'] = db.getHundredthEdit(user['user_id'])
 86+ user['last_edit'] = db.getLastEdit(user['user_id'])
 87+ LOGGING_STREAM.write(".")
 88+ print("\t".join(encode(user[h]) for h in headers))
 89+
 90+ LOGGING_STREAM.write("\n")
 91+
 92+class Database:
 93+
 94+ def __init__(self, *args, **kwargs):
 95+ self.args = args
 96+ self.kwargs = kwargs
 97+ self.conn = MySQLdb.connect(*args, **kwargs)
 98+
 99+ def getUsers(self, minEdits=0):
 100+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 101+ cursor.execute(
 102+ """
 103+ SELECT * FROM user
 104+ WHERE user_editcount >= %(min_edits)s
 105+ """,
 106+ {
 107+ 'min_edits': minEdits
 108+ }
 109+ )
 110+ for user in cursor: yield user
 111+
 112+ def getHundredthEdit(self, userId):
 113+ userId = int(userId)
 114+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 115+ cursor.execute(
 116+ """
 117+ SELECT max(rev_timestamp), count(*) as revisions
 118+ FROM (
 119+ SELECT rev_timestamp
 120+ FROM revision r2
 121+ WHERE r2.rev_user = %(user_id)s
 122+ ORDER BY rev_timestamp LIMIT 100
 123+ ) AS foo
 124+ ORDER BY rev_timestamp DESC
 125+ LIMIT 1
 126+ """,
 127+ {
 128+ 'user_id': userId
 129+ }
 130+ )
 131+ row = cursor.fetchone()
 132+ if row = None:
 133+ return None
 134+ else:
 135+ return row['rev_timestamp']
 136+
 137+
 138+ def getLastEdit(self, userId):
 139+ userId = int(userId)
 140+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 141+ cursor.execute(
 142+ """
 143+ SELECT rev_timestamp
 144+ FROM revision r2
 145+ WHERE r2.rev_user = %(user_id)s
 146+ ORDER BY rev_timestamp DESC
 147+ LIMIT 1
 148+ """,
 149+ {
 150+ 'user_id': userId
 151+ }
 152+ )
 153+ row = cursor.fetchone()
 154+
 155+ if row = None: return None
 156+ else: return row['rev_timestamp']
 157+
 158+
 159+
 160+
 161+if __name__ == "__main__": main()
Index: trunk/tools/wsor/vandal_conversion/get_editor_conversions.py
@@ -1,7 +1,15 @@
2 -import sys, MySQLdb, MySQLdb.cursors, argparse
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
33 import wmf
44
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
512
 13+
614 def main():
715 parser = argparse.ArgumentParser(
816 description='Gathers editor data for first and last session'
@@ -35,6 +43,12 @@
3644 help='the language db to run the query in (defaults to enwiki)',
3745 default="enwiki"
3846 )
 47+ parser.add_argument(
 48+ '-o', '--old',
 49+ type=lambda fn:open(fn, 'r'),
 50+ help='an old output file to process (defaults to stdin)',
 51+ default=sys.stdin
 52+ )
3953 args = parser.parse_args()
4054
4155 LOGGING_STREAM = sys.stderr
@@ -45,34 +59,55 @@
4660 datefmt='%b-%d %H:%M:%S'
4761 )
4862
 63+ processedUsers = set()
 64+ if not args.old.isatty():
 65+ logging.info("Loading previously processed users.")
 66+ args.old.readline() # dump headers
 67+ for line in args.old:
 68+ try:
 69+ userId = int(line.strip().split("\t")[0])
 70+ processedUsers.add(userId)
 71+ except ValueError:
 72+ logging.warning("Could not convert %s into a user id." % line.strip().split("\t")[0])
 73+
4974 logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
5075 db = Database(
5176 host=args.host,
5277 db=args.db,
5378 read_default_file=args.cnf
5479 )
55 -
 80+ headers = [
 81+ 'user_id',
 82+ 'user_name',
 83+ 'first_edit',
 84+ 'last_edit',
 85+ 'fes_edits',
 86+ 'fes_reverted',
 87+ 'fes_vandalism',
 88+ 'fes_deleted',
 89+ 'last10_edits',
 90+ 'last10_reverted',
 91+ 'last10_vandalism',
 92+ 'last10_deleted'
 93+ ]
5694 print(
57 - "\t".join([
58 - 'user_id',
59 - 'user_name',
60 - 'first_edit',
61 - 'fes_edits',
62 - 'fes_reverted',
63 - 'fes_vandalism',
64 - 'fes_deleted',
65 - 'last10_edits',
66 - 'last10_reverted',
67 - 'last10_vandalism',
68 - 'last10_deleted'
69 - ])
 95+ "\t".join(headers)
7096 )
7197
72 - logging.info("Processing users:")
 98+ logging.info("Processing users:")
 99+ users = []
73100 for user in db.getUsers(minimumEdits=args.min_edits):
 101+ if user['user_id'] in processedUsers:
 102+ logging.info("Skipping %s(%s)" % (user['user_name'], user['user_id']))
 103+ continue
 104+ else:
 105+ users.append(user)
 106+
 107+ for user in users:
74108 firstSession = []
75109 last = None
76 - for rev in db.getFirstEdits(user['user_id']):
 110+ #logging.debug("Getting first edits for %s" % user['user_name'])
 111+ for rev in db.getFirstEdits(user['user_id'], maximum=100):
77112 if last != None:
78113 diff = wmf.wp2Timestamp(rev['rev_timestamp']) - wmf.wp2Timestamp(last['rev_timestamp'])
79114 assert diff >= 0
@@ -86,38 +121,40 @@
87122
88123 last = rev
89124
 125+ #logging.debug("Getting last edits for %s" % user['user_name'])
90126 last10 = list(db.getLastEdits(user['user_id'], maximum=10))
91127 logging.debug("%s(%s): %s %s" % (user['user_name'], user['user_id'], len(firstSession)*">", len(last10)*"<"))
92 - print(
93 - "\t".join(
94 - str(v).encode("string-escape") for v in [
95 - user['user_id'],
96 - user['user_name'],
97 - user['editcount'],
98 - firstSession[0]['rev_timestamp'],
99 - len(firstSession),
100 - len([r for r in firstSession if r['is_reverted']]),
101 - len([r for r in firstSession if r['is_vandalism']]),
102 - len([r for r in firstSession if r['deleted']]),
103 - len(last10),
104 - len([r for r in last10 if r['is_reverted']]),
105 - len([r for r in last10 if r['is_vandalism']]),
106 - len([r for r in last10 if r['deleted']])
107 - ]
108 - )
109 - )
 128+ user['first_edit'] = firstSession[0]['rev_timestamp']
 129+ user['last_edit'] = last10[0]['rev_timestamp']
 130+ user['fes_edits'] = len(firstSession)
 131+ user['fes_reverted'] = 0
 132+ user['fes_vandalism'] = 0
 133+ user['fes_deleted'] = 0
 134+ for rev in firstSession:
 135+ if rev['is_reverted']: user['fes_reverted'] += 1
 136+ if rev['is_vandalism']: user['fes_vandalism'] += 1
 137+ if rev['deleted']: user['fes_deleted'] += 1
 138+
 139+ user['last10_edits'] = len(last10)
 140+ user['last10_reverted'] = 0
 141+ user['last10_vandalism'] = 0
 142+ user['last10_deleted'] = 0
 143+ for rev in last10:
 144+ if rev['is_reverted']: user['last10_reverted'] += 1
 145+ if rev['is_vandalism']: user['last10_vandalism'] += 1
 146+ if rev['deleted']: user['last10_deleted'] += 1
110147
111 -
 148+ print("\t".join(encode(user[h]) for h in headers))
112149
113150
114 -
115151 class Database:
116152
117153 def __init__(self, *args, **kwargs):
118154 self.args = args
119155 self.kwargs = kwargs
120156 self.usersConn = MySQLdb.connect(*args, **kwargs)
121 - self.editsConn = MySQLdb.connect(*args, **kwargs)
 157+ self.revsConn = MySQLdb.connect(*args, **kwargs)
 158+ self.archConn = MySQLdb.connect(*args, **kwargs)
122159
123160 def getUsers(self, minimumEdits=0):
124161 minimumEdits = int(minimumEdits)
@@ -127,10 +164,8 @@
128165 SELECT
129166 u.user_id,
130167 u.user_name,
131 - um.first_edit,
132168 u.user_editcount as editcount
133169 FROM user u
134 - INNER JOIN halfak.user_meta um USING (user_id)
135170 WHERE u.user_editcount >= %(minimum_edits)s
136171 """,
137172 {
@@ -144,8 +179,8 @@
145180
146181 def getEdits(self, userId, maximum=10000, chronologically=True):
147182 userId = int(userId)
148 - revisionCursor = self.editsConn.cursor(MySQLdb.cursors.SSDictCursor)
149 - archiveCursor = self.editsConn.cursor(MySQLdb.cursors.SSDictCursor)
 183+ revisionCursor = self.revsConn.cursor(MySQLdb.cursors.SSDictCursor)
 184+ archiveCursor = self.archConn.cursor(MySQLdb.cursors.SSDictCursor)
150185
151186 if chronologically: direction = "ASC"
152187 else: direction = "DESC"
@@ -162,7 +197,7 @@
163198 LEFT JOIN halfak.reverted_20110115 rvtd
164199 ON r.rev_id = rvtd.revision_id
165200 WHERE rev_user = %(user_id)s
166 - ORDER BY r.timestamp """ + direction + """
 201+ ORDER BY r.rev_timestamp """ + direction + """
167202 LIMIT %(maximum)s;
168203 """,
169204 {
@@ -173,14 +208,14 @@
174209 archiveCursor.execute(
175210 """
176211 SELECT
177 - ar.ar_rev_id AS rev_id,
178 - ar.ar_timestamp AS rev_timestamp,
179 - NULL AS is_reverted,
180 - NULL AS is_vandalism,
181 - True AS deleted
182 - FROM archive ar
 212+ ar_rev_id AS rev_id,
 213+ ar_timestamp AS rev_timestamp,
 214+ NULL AS is_reverted,
 215+ NULL AS is_vandalism,
 216+ True AS deleted
 217+ FROM archive
183218 WHERE ar_user = %(user_id)s
184 - ORDER BY ar.timestamp """ + direction + """
 219+ ORDER BY ar_timestamp """ + direction + """
185220 LIMIT %(maximum)s;
186221 """,
187222 {
@@ -193,29 +228,29 @@
194229 else:
195230 order = lambda t1, t2:t1 > t2
196231
197 - revPointer = revisionCursor.fetchrow()
198 - archPointer = archiveCursor.fetchrow()
 232+ revPointer = revisionCursor.fetchone()
 233+ archPointer = archiveCursor.fetchone()
199234 count = 0
200235 while revPointer != None or archPointer != None: #still something to output
201236 if revPointer != None and archPointer != None: #both cursors still have something
202237 if order(revPointer['rev_timestamp'], archPointer['rev_timestamp']):
203238 yield revPointer
204 - revPointer = revisionCursor.fetchrow()
 239+ revPointer = revisionCursor.fetchone()
205240 else:
206241 yield archPointer
207 - archPointer = archiveCursor.fetchrow()
 242+ archPointer = archiveCursor.fetchone()
208243 elif revPointer != None: #only revisions left
209244 yield revPointer
210 - revPointer = revisionCursor.fetchrow()
211 - elif archPointer != None:
 245+ revPointer = revisionCursor.fetchone()
 246+ elif archPointer != None: #only archives left
212247 yield archPointer
213 - archPointer = archiveCursor.fetchrow()
 248+ archPointer = archiveCursor.fetchone()
214249
215250 count += 1
216251 if count >= maximum: break
217 -
218252
219 -
 253+ revisionCursor.close()
 254+ archiveCursor.close()
220255
221256
222257
@@ -226,5 +261,4 @@
227262 return self.getEdits(userId, maximum, chronologically=False)
228263
229264
230 -if __name__ == "__main__":
231 - main()
 265+if __name__ == "__main__": main()
Index: trunk/tools/wsor/vandal_conversion/queries.sql
@@ -1,15 +1,27 @@
22 SELECT
33 r.rev_id,
4 - r.timestamp,
5 - rvtd.revision_id IS NOT NULL,
6 - rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE
 4+ r.rev_timestamp,
 5+ rvtd.revision_id IS NOT NULL AS is_reverted,
 6+ rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE AS is_vandalism,
 7+ False AS deleted
78 FROM revision r
89 LEFT JOIN halfak.reverted_20110115 rvtd
910 ON r.rev_id = rvtd.revision_id
10 -WHERE rev_user = 2345678
11 -ORDER BY r.rev_id ASC
12 -LIMIT 10000;
 11+WHERE rev_user = 40
 12+ORDER BY r.rev_timestamp ASC
 13+LIMIT 10;
1314
 15+SELECT
 16+ ar_rev_id AS rev_id,
 17+ ar_timestamp AS rev_timestamp,
 18+ NULL AS is_reverted,
 19+ NULL AS is_vandalism,
 20+ True AS deleted
 21+FROM archive
 22+WHERE ar_user = 4
 23+ORDER BY ar_timestamp ASC
 24+LIMIT 10;
 25+
1426 SELECT
1527 r.rev_id,
1628 r.timestamp,
@@ -23,3 +35,42 @@
2436 LIMIT 10000;
2537
2638
 39+
 40+----Just testsing here--------------------------
 41+use enwiki;
 42+create table zexley.quarterly_cumulative_counts_from_revision
 43+select r.rev_user,
 44+ sum(if(datediff(r.rev_timestamp,m.first_edit)<92,1,0)) as 1q,
 45+ sum(if(datediff(r.rev_timestamp,m.first_edit)<183,1,0)) as 2q,
 46+sum(if(datediff(r.rev_timestamp,m.first_edit)<274,1,0)) as 3q,
 47+sum(if(datediff(r.rev_timestamp,m.first_edit)<366,1,0)) as 4q
 48+from revision r
 49+inner join halfak.user_meta m on r.rev_user=m.user_id
 50+group by r.rev_user;
 51+
 52+SELECT
 53+ u.user_id,
 54+ sum(r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL .25 YEAR)) as 1q,
 55+ sum(r.rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL .25 YEAR)) AND DATE_ADD(u.first_edit, INTERVAL .5 YEAR)) as 2q,
 56+ sum(r.rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL .5 YEAR)) AND DATE_ADD(u.first_edit, INTERVAL .75 YEAR)) as 3q,
 57+ sum(r.rev_timestamp > DATE_ADD(u.first_edit, INTERVAL .75 YEAR)) as 4q,
 58+FROM halfak.user_meta u
 59+INNER JOIN revision r
 60+ ON u.user_id = r.rev_user AND
 61+ r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR)
 62+
 63+GROUP BY u.user_id;
 64+
 65+SELECT
 66+ u.user_id,
 67+ sum(r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL .25 YEAR)) as 1q,
 68+ sum(r.rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL .25 YEAR) AND DATE_ADD(u.first_edit, INTERVAL .5 YEAR)) as 2q,
 69+ sum(r.rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL .5 YEAR) AND DATE_ADD(u.first_edit, INTERVAL .75 YEAR)) as 3q,
 70+ sum(r.rev_timestamp > DATE_ADD(u.first_edit, INTERVAL .75 YEAR)) as 4q
 71+FROM halfak.user_meta u
 72+INNER JOIN revision r
 73+ ON u.user_id = r.rev_user AND
 74+ r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR)
 75+GROUP BY u.user_id;
 76+-----------------------------------------
 77+

Status & tagging log