r92067 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92066‎ | r92067 | r92068 >
Date:17:21, 13 July 2011
Author:halfak
Status:deferred
Tags:
Comment:
added newbie warnings
Modified paths:
  • /trunk/tools/wsor/newbie_warnings (added) (history)
  • /trunk/tools/wsor/newbie_warnings/queries.sql (added) (history)
  • /trunk/tools/wsor/newbie_warnings/track_messages.py (added) (history)
  • /trunk/tools/wsor/overworked/R/loader/load_reverter_months.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/revert_fighters.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/reverter_work.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/vandal_fighters.R (modified) (history)
  • /trunk/tools/wsor/overworked/remove_bots.py (modified) (history)
  • /trunk/tools/wsor/overworked/testing.sql (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/R/conversions.R (modified) (history)
  • /trunk/tools/wsor/vandal_conversion/R/loader/load_editor_first_and_last.R (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/newbie_warnings/queries.sql
@@ -0,0 +1,50 @@
 2+SELECT reciever.user_id, reciever.user_name
 3+FROM revision r
 4+INNER JOIN page p
 5+ ON r.rev_page = p.page_id
 6+ AND p.page_namespace = 3
 7+ AND r.rev_user = 14934614
 8+INNER JOIN user reciever
 9+ ON reciever.user_name = REPLACE(p.page_title, "_", " ")
 10+INNER JOIN user_newtalk nt
 11+ ON reciever.user_id = nt.user_id
 12+GROUP BY reciever.user_id, reciever.user_name
 13+
 14+SELECT reciever.user_id, reciever.user_name, count(*)
 15+FROM revision r
 16+INNER JOIN
 17+ ON r.rev_page = p.page_id
 18+ AND p.page_namespace = 3
 19+ AND r.rev_user = 6396742
 20+INNER JOIN user reciever
 21+ ON reciever.user_name = REPLACE(p.page_title, "_", " ")
 22+INNER JOIN user_newtalk nt
 23+ ON reciever.user_id = nt.user_id
 24+WHERE
 25+GROUP BY reciever.user_id, reciever.user_name;
 26+
 27+
 28+StuGeiger: 14934614
 29+EpochFail: 6396742
 30+
 31+
 32+SELECT rc_timestamp FROM recentchanges ORDER BY rc_timestamp DESC LIMIT 1
 33+
 34+SELECT
 35+ reciever.user_id,
 36+ reciever.user_name,
 37+ count(*) AS messages_waiting
 38+FROM (
 39+SELECT DISTINCT p.page_title
 40+FROM revision r
 41+INNER JOIN page p
 42+ ON r.rev_page = p.page_id
 43+WHERE r.rev_user = 14934614
 44+AND p.page_namespace = 3
 45+) AS tp
 46+INNER JOIN user reciever
 47+ ON reciever.user_name = REPLACE(tp.page_title, "_", " ")
 48+INNER JOIN user_newtalk nt
 49+ ON reciever.user_id = nt.user_id
 50+GROUP BY reciever.user_id, reciever.user_name;
 51+
Index: trunk/tools/wsor/newbie_warnings/track_messages.py
@@ -0,0 +1,171 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types
 3+import wmf
 4+
 5+def encode(v):
 6+ if v == None: return "\N"
 7+
 8+ if type(v) == types.LongType: v = int(v)
 9+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 10+
 11+ return str(v).encode("string-escape")
 12+
 13+
 14+def main():
 15+ parser = argparse.ArgumentParser(
 16+ description='Gathers editor data for first and last session'
 17+ )
 18+ parser.add_argument(
 19+ 'user_id',
 20+ type=int,
 21+ help='the user_id of the editor whose talk postings should be tracked'
 22+ )
 23+ parser.add_argument(
 24+ '-c', '--cnf',
 25+ metavar="<path>",
 26+ type=str,
 27+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 28+ default=os.path.expanduser("~/.my.cnf")
 29+ )
 30+ parser.add_argument(
 31+ '-s', '--host',
 32+ type=str,
 33+ help='the database host to connect to (defaults to localhost)',
 34+ default="localhost"
 35+ )
 36+ parser.add_argument(
 37+ '-d', '--db',
 38+ type=str,
 39+ help='the language db to run the query in (defaults to enwiki)',
 40+ default="enwiki"
 41+ )
 42+ parser.add_argument(
 43+ '-o', '--out',
 44+ type=lambda fn:open(fn, 'a'),
 45+ help='Where should output be appended',
 46+ default=sys.stdout
 47+ )
 48+ args = parser.parse_args()
 49+
 50+ LOGGING_STREAM = sys.stderr
 51+ logging.basicConfig(
 52+ level=logging.DEBUG,
 53+ stream=LOGGING_STREAM,
 54+ format='%(asctime)s %(levelname)-8s %(message)s',
 55+ datefmt='%b-%d %H:%M:%S'
 56+ )
 57+
 58+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 59+ db = Database(
 60+ host=args.host,
 61+ db=args.db,
 62+ read_default_file=args.cnf
 63+ )
 64+
 65+ print(
 66+ "\t".join(
 67+ [db.getTime()]+
 68+ [
 69+ ":".join(
 70+ [
 71+ e['user_id'],
 72+ e['user_name'],
 73+ e['messages_waiting']
 74+ ]
 75+ ) for e in db.getEditorsWithTalk()
 76+ ]
 77+ )
 78+ )
 79+
 80+
 81+
 82+
 83+
 84+class Database:
 85+
 86+ def __init__(self, *args, **kwargs):
 87+ self.args = args
 88+ self.kwargs = kwargs
 89+ self.usersConn = MySQLdb.connect(*args, **kwargs)
 90+
 91+ def getTime(self):
 92+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 93+ cursor.execute(
 94+ """
 95+ SELECT rc_timestamp AS time
 96+ FROM recentchanges
 97+ ORDER BY rc_timestamp DESC
 98+ LIMIT 1
 99+ """
 100+ )
 101+ yield cursor.fetchone()['time']
 102+
 103+
 104+ def getEditorsWithTalk(self, userId):
 105+ userId = int(userId)
 106+ cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor)
 107+ cursor.execute(
 108+ """
 109+ SELECT
 110+ reciever.user_id,
 111+ reciever.user_name,
 112+ count(*) AS messages_waiting
 113+ FROM (
 114+ SELECT DISTINCT p.page_title
 115+ FROM revision r
 116+ INNER JOIN page p
 117+ ON r.rev_page = p.page_id
 118+ WHERE r.rev_user = %(user_id)s
 119+ AND p.page_namespace = 3
 120+ ) AS tp
 121+ LEFT JOIN user reciever
 122+ ON reciever.user_name = REPLACE(tp.page_title, "_", " ")
 123+ INNER JOIN user_newtalk nt
 124+ ON reciever.user_id = nt.user_id
 125+ OR nt.user_ip = tp.page_title
 126+ GROUP BY reciever.user_id, reciever.user_name
 127+ """,
 128+ {
 129+ 'user_id': userId
 130+ }
 131+ )
 132+
 133+ for user in cursor:
 134+ yield user
 135+
 136+ cursor.execute(
 137+ """
 138+ SELECT
 139+ NULL AS user_id,
 140+ tp.page_title AS user_name,
 141+ count(*) AS messages_waiting
 142+ FROM (
 143+ SELECT DISTINCT p.page_title
 144+ FROM revision r
 145+ INNER JOIN page p
 146+ ON r.rev_page = p.page_id
 147+ WHERE r.rev_user = %(user_id)s
 148+ AND p.page_namespace = 3
 149+ ) AS tp
 150+ INNER JOIN user_newtalk nt
 151+ ON tp.page_title = nt.user_ip
 152+ GROUP BY tp.page_title
 153+ """,
 154+ {
 155+ 'user_id': userId
 156+ }
 157+ )
 158+
 159+ for user in cursor:
 160+ yield user
 161+
 162+
 163+
 164+
 165+ def getFirstEdits(self, userId, maximum=10000):
 166+ return self.getEdits(userId, maximum, chronologically=True)
 167+
 168+ def getLastEdits(self, userId, maximum=10000):
 169+ return self.getEdits(userId, maximum, chronologically=False)
 170+
 171+
 172+if __name__ == "__main__": main()
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R
@@ -3,7 +3,7 @@
44
55
66 load_reverter_months = function(verbose=T, reload=F){
7 - filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.tsv", sep="/")
 7+ filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.final.tsv", sep="/")
88 if(!exists("REVERTER_MONTHS")){
99 REVERTER_MONTHS <<- NULL
1010 }
Index: trunk/tools/wsor/overworked/R/reverter_work.R
@@ -2,6 +2,7 @@
33
44 library(lattice)
55 library(grid)
 6+library(doBy)
67
78 reverter_months = load_reverter_months()
89 reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),]
@@ -122,22 +123,43 @@
123124 cat("Adding", year.month, "...")
124125 top_vfers = rbind(
125126 top_vfers,
126 - month_vfers[order(month_vfers$reverts),][1:50,]
 127+ month_vfers[order(month_vfers$reverts, decreasing=T),][1:50,]
127128 )
128129 cat("DONE!\n")
129130 }
130131
131132 top_activity_months = summaryBy(
132133 vandal_reverts + reverts + revisions ~ year.month,
133 - data=reverter_months,
 134+ data=top_vfers,
134135 FUN=c(mean, sd, length)
135136 )
136137
 138+png("plots/reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 139+with(
 140+ top_activity_months,
 141+ plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions")
 142+)
 143+dev.off()
137144
 145+png("plots/vandal_reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 146+with(
 147+ top_activity_months,
 148+ plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions")
 149+)
 150+dev.off()
138151
 152+png("plots/revisions.per_user_month.top_50.png", height=768, width=1024)
 153+with(
 154+ top_activity_months,
 155+ plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions")
 156+)
 157+dev.off()
139158
140159
141160
142161
143162
144163
 164+
 165+
 166+
Index: trunk/tools/wsor/overworked/R/vandal_fighters.R
@@ -25,8 +25,6 @@
2626 FUN=sum
2727 ),
2828 data.frame(
29 - year = year,
30 - month = month,
3129 active.users = active.sum,
3230 vf5.users = vf5.sum,
3331 vf50.users = vf50.sum,
@@ -34,10 +32,10 @@
3533 r5.users = r5.sum,
3634 r50.users = r50.sum,
3735 r500.users = r500.sum,
38 - year.month = as.factor(paste(year, month, sep="/"))
 36+ year.month = year.month
3937 )
4038 )
41 -activity_counts$log.users.active = log(activity_counts$users.active, base=10)
 39+activity_counts$log.active.users = log(activity_counts$active.users, base=10)
4240 activity_counts$log.v5.users = log(activity_counts$vf5.users, base=10)
4341 activity_counts$log.v50.users = log(activity_counts$vf50.users, base=10)
4442
@@ -82,9 +80,9 @@
8381 png("plots/vandal_fighters.by_month.logged.png", width=1024, height=768)
8482 plot(
8583 activity_counts$year.month,
86 - (activity_counts$log.users.active*0)-10000,
 84+ (activity_counts$log.active.users*0)-10000,
8785 col="#FFFFFF",
88 - ylim=c(0, max(activity_counts$log.users.active)+.5),
 86+ ylim=c(0, max(activity_counts$log.active.users)+.5),
8987 main="Vandal fighters and active editors over time",
9088 xlab="Time (in months)",
9189 ylab="Number of users (log10 scaled)"
@@ -177,7 +175,7 @@
178176 plot_prop_with_regression(
179177 activity_counts$year.month,
180178 activity_counts$vf500.users/activity_counts$active.users,
181 - "vandal_50",
 179+ "vandal_500",
182180 "revert >=500 vandals per month"
183181 )
184182 plot_prop_with_regression(
Index: trunk/tools/wsor/overworked/R/revert_fighters.R
@@ -37,6 +37,19 @@
3838 users = user_id.length
3939 )
4040 )
 41+sdvfighter_counts = with(
 42+ summaryBy(
 43+ user_id ~ year + month + svfighter,
 44+ data=reverter_months[reverter_months$active,],
 45+ FUN=length
 46+ ),
 47+ data.frame(
 48+ year = year,
 49+ month = month,
 50+ svfighter = svfighter,
 51+ users = user_id.length
 52+ )
 53+)
4154
4255 activity_counts = merge(
4356 merge(
Index: trunk/tools/wsor/overworked/remove_bots.py
@@ -4,6 +4,7 @@
55 def main(args):
66
77 bots = set()
 8+ args.bots.readline() #strip off header
89 for line in args.bots:
910 bots.add(int(line.strip()))
1011
Index: trunk/tools/wsor/overworked/testing.sql
@@ -185,8 +185,25 @@
186186 SUBSTR(rev_timestamp, 1,2);
187187
188188
 189+
 190+--Reformed vandal users:
 191+----Julianmh -> Demize (name change process)
 192+----(something) -> MisterWiki -> Diego Grez(spanish wikipedia)
189193
 194+
190195
191 -"(Reverted ([0-9]+ )?edits by \[\[Special:Contributions/[^\|]+\|[^\]]+]] \(\[\[User talk:[^\|]+\|talk\]\]\) to last version by .+)|" +
192 -"(Message re. \[\[[^\]]+\]\])|" +
193 -"(Level [0-9]+ warning re. \[\[[^\]]+\]\]"
 196+INSERT INTO halfak.huggle_revision_20110701
 197+SELECT
 198+ rev_id,
 199+ "rev_comment RLIKE \"(Reverted ([0-9]+ )?edits by \\[\\[Special:Contributions/[^\\|]+\\|[^\]]+\\]\\] \\(\\[\\[User talk:[^\\|]+\\|talk\\]\\]\\) to last version by .+)|(Message re\\. \\[\\[[^\]]+\\]\\])|(Level [0-9]+ warning re\\. \\[\\[[^\]]+\\]\\])\""
 200+FROM revision
 201+WHERE rev_comment RLIKE
 202+"(Reverted ([0-9]+ )?edits by \\[\\[Special:Contributions/[^\\|]+\\|[^\]]+\\]\\] \\(\\[\\[User talk:[^\\|]+\\|talk\\]\\]\\) to last version by .+)|(Message re\\. \\[\\[[^\]]+\\]\\])|(Level [0-9]+ warning re\\. \\[\\[[^\]]+\\]\\])";
 203+
 204+
 205+CREATE TABLE halfak.bot_20110711
 206+SELECT DISTINCT ug_user as user_id FROM user_groups WHERE ug_group = "bot"
 207+UNION (
 208+ SELECT user_id FROM zexley.bots
 209+);
 210+
Index: trunk/tools/wsor/vandal_conversion/R/loader/load_editor_first_and_last.R
@@ -16,7 +16,7 @@
1717 filename,
1818 header=T, sep="\t",
1919 quote="", comment.char="",
20 - na.strings="\\N",
 20+ na.strings="\\N"
2121 )
2222 if(verbose){cat("DONE!\n")}
2323 }
Index: trunk/tools/wsor/vandal_conversion/R/conversions.R
@@ -1,3 +1,4 @@
22 source("loader/load_editor_first_and_last.R")
33
44 editor_first_and_last = load_editor_first_and_last()
 5+

Status & tagging log