r106747 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r106746‎ | r106747 | r106748 >
Date:01:41, 20 December 2011
Author:halfak
Status:deferred
Tags:tools 
Comment:
Added message_templates and a few other non committed changes
Modified paths:
  • /trunk/tools/wsor/message_templates (added) (history)
  • /trunk/tools/wsor/message_templates/message_postings.py (added) (history)
  • /trunk/tools/wsor/overworked/R/loader/load_reverter_months.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/loader/load_reverting_months.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/reverter_work.R (modified) (history)
  • /trunk/tools/wsor/overworked/testing.sql (modified) (history)
  • /trunk/tools/wsor/scripts/reverts.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/overworked/R/reverter_work.R
@@ -1,16 +1,33 @@
22 source("loader/load_reverter_months.R")
3 -
 3+source("loader/load_bots.R")
44 library(lattice)
55 library(grid)
66 library(doBy)
77
88 reverter_months = load_reverter_months()
9 -reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),]
10 -reverter_months = reverter_months[reverter_months$username != "DASHBotAV",]
 9+bots = load_bots()
 10+
 11+padMonth = function(x){
 12+ sapply(
 13+ x,
 14+ function(val){
 15+ if(is.na(val)){
 16+ NA
 17+ }
 18+ else if(val<10){
 19+ paste("0", val, sep="")
 20+ }else{
 21+ paste(val)
 22+ }
 23+ }
 24+ )
 25+}
 26+
 27+reverter_months$bot = reverter_months$user_id %in% bots$user_id
1128 reverter_months$active = reverter_months$revisions >= 5
1229 reverter_months$year.month = with(
1330 reverter_months,
14 - as.factor(paste(year, month, sep="/"))
 31+ as.factor(paste(year, padMonth(month), sep="/"))
1532 )
1633
1734 vfer_years = with(
@@ -54,18 +71,17 @@
5572
5673 activity_months = summaryBy(
5774 vandal_reverts + reverts + revisions ~ year.month,
58 - data=reverter_months,
 75+ data=reverter_months[!reverter_months$bot,],
5976 FUN=c(mean, sd, length)
6077 )
6178
62 -plot_activity_mean = function(year.month, m, s, n, name){
63 - model = lm(
64 - m ~ as.numeric(year.month),
65 - data=activity_months
 79+plot_activity_mean = function(year.month, m, s, n, name, model=T){
 80+ lmodel = lm(
 81+ m ~ as.numeric(year.month)
6682 )
67 - summary(model)
 83+ modelSummary = summary(lmodel)
6884 monthLine = function(x){
69 - model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
 85+ lmodel$coefficients[['(Intercept)']] + lmodel$coefficients[['as.numeric(year.month)']]*x
7086 }
7187
7288 print(xyplot(
@@ -75,21 +91,23 @@
7692 se = s[subscripts]/sqrt(n[subscripts])
7793 panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
7894 panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
79 - panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
80 - grid.text(
81 - paste(
82 - "R^2=", round(modelSummary$r.squared, 3),
83 - " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
84 - " p=", round(modelSummary$coefficients[2,4], 8)
85 - ),
86 - .5,
87 - .95
88 - )
 95+ if(model){
 96+ panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
 97+ grid.text(
 98+ paste(
 99+ "R^2=", round(modelSummary$r.squared, 3),
 100+ " coef=", round(lmodel$coefficients[['as.numeric(year.month)']], 5),
 101+ " p=", round(modelSummary$coefficients[2,4], 8)
 102+ ),
 103+ .5,
 104+ .95
 105+ )
 106+ }
89107 },
90108 #main="Average Patroller workload by month",
91109 ylab=paste("Mean", name, "per user-month"),
92 - xlab="Month",
93 - scales=list(x=list(rot=45)),
 110+ xlab="Year",
 111+ scales=list(x=list(rot=45, at=0:9*12, labels=2001:2010)),
94112 ylim=c(0, max(m)*1.1)
95113 ))
96114 }
@@ -119,7 +137,7 @@
120138
121139 top_vfers = data.frame()
122140 for(year.month in unique(reverter_months$year.month)){
123 - month_vfers = reverter_months[reverter_months$year.month == year.month,]
 141+ month_vfers = reverter_months[!reverter_months$bot & reverter_months$year.month == year.month,]
124142 cat("Adding", year.month, "...")
125143 top_vfers = rbind(
126144 top_vfers,
@@ -127,39 +145,107 @@
128146 )
129147 cat("DONE!\n")
130148 }
 149+top_vfers$year.month = with(
 150+ top_vfers,
 151+ as.factor(paste(year, padMonth(as.numeric(as.character(top_vfers$month))), sep="/"))
 152+)
131153
132154 top_activity_months = summaryBy(
133155 vandal_reverts + reverts + revisions ~ year.month,
134 - data=top_vfers,
 156+ data=top_vfers[
 157+ top_vfers$year.month != "NULL/NA" &
 158+ top_vfers$year.month != "NA/NA" &
 159+ top_vfers$year != "2011",
 160+ ],
135161 FUN=c(mean, sd, length)
136162 )
137163
138 -png("plots/reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 164+#png("plots/reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 165+pdf("plots/reverting_revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special")
139166 with(
140167 top_activity_months,
141 - plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions")
 168+ plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions", model=F)
142169 )
143170 dev.off()
144171
145 -png("plots/vandal_reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 172+
 173+#png("plots/vandal_reverting_revisions.per_user_month.top_50.png", height=768, width=1024)
 174+pdf("plots/vandal_reverting_revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special")
146175 with(
147176 top_activity_months,
148 - plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions")
 177+ plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions", model=F)
149178 )
150179 dev.off()
151180
152 -png("plots/revisions.per_user_month.top_50.png", height=768, width=1024)
 181+#png("plots/revisions.per_user_month.top_50.png", height=768, width=1024)
 182+pdf("plots/revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special")
153183 with(
154184 top_activity_months,
155 - plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions")
 185+ plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions", model=F)
156186 )
157187 dev.off()
158188
159189
160190
161191
 192+plot_activity_mean = function(year.month, m, s, n, name, model=T){
 193+ lmodel = lm(
 194+ m ~ as.numeric(year.month)
 195+ )
 196+ modelSummary = summary(lmodel)
 197+ monthLine = function(x){
 198+ lmodel$coefficients[['(Intercept)']] + lmodel$coefficients[['as.numeric(year.month)']]*x
 199+ }
 200+
 201+ print(xyplot(
 202+ m ~ as.factor(year.month),
 203+ panel = function(x, y, subscripts, ...){
 204+ panel.xyplot(x, y, ...)
 205+ se = s[subscripts]/sqrt(n[subscripts])
 206+ panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
 207+ panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
 208+ if(model){
 209+ panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
 210+ grid.text(
 211+ paste(
 212+ "R^2=", round(modelSummary$r.squared, 3),
 213+ " coef=", round(lmodel$coefficients[['as.numeric(year.month)']], 5),
 214+ " p=", round(modelSummary$coefficients[2,4], 8)
 215+ ),
 216+ .5,
 217+ .95
 218+ )
 219+ }
 220+ },
 221+ #main="Average Patroller workload by month",
 222+ ylab=paste("Mean", name, "per user-month"),
 223+ xlab="Year",
 224+ scales=list(x=list(rot=45, at=0:3*12, labels=2007:2010)),
 225+ ylim=c(0, max(m)*1.1)
 226+ ))
 227+}
162228
 229+limited_activity_months = summaryBy(
 230+ vandal_reverts + reverts + revisions ~ year.month,
 231+ data=top_vfers[
 232+ top_vfers$year.month != "NULL/NA" &
 233+ top_vfers$year.month != "NA/NA" &
 234+ as.numeric(as.character(top_vfers$year)) >= 2007 &
 235+ top_vfers$year != "2011",
 236+ ],
 237+ FUN=c(mean, sd, length)
 238+)
163239
 240+pdf("plots/reverting_revisions.per_user_month.top_50.after_2007.pdf", height=6, width=8, paper="special")
 241+with(
 242+ limited_activity_months,
 243+ plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions")
 244+)
 245+dev.off()
164246
165247
166248
 249+
 250+
 251+
 252+
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R
@@ -3,7 +3,7 @@
44
55
66 load_reverter_months = function(verbose=T, reload=F){
7 - filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.final.tsv", sep="/")
 7+ filename = paste(DATA_DIR, "reverter_months_20110115.tsv", sep="/")
88 if(!exists("REVERTER_MONTHS")){
99 REVERTER_MONTHS <<- NULL
1010 }
Index: trunk/tools/wsor/overworked/R/loader/load_reverting_months.R
@@ -3,7 +3,7 @@
44
55
66 load_reverting_months = function(verbose=T, reload=F){
7 - filename = paste(DATA_DIR, "en.reverting_years.20110115.tsv", sep="/")
 7+ filename = paste(DATA_DIR, "reverting_months.20110115.tsv", sep="/")
88 if(!exists("REVERTING_MONTHS")){
99 REVERTING_MONTHS <<- NULL
1010 }
Index: trunk/tools/wsor/overworked/testing.sql
@@ -173,8 +173,8 @@
174174
175175
176176 SELECT
177 - SUBSTR(rev_timestamp, 1,4),
178 - SUBSTR(rev_timestamp, 1,2),
 177+ YEAR(rev_timestamp),
 178+ MONTH(rev_timestamp),
179179 count(*),
180180 sum(revision_id IS NOT NULL),
181181 sum(revision_id IS NOT NULL AND is_vandalism)
@@ -207,3 +207,68 @@
208208 SELECT user_id FROM zexley.bots
209209 );
210210
 211+
 212+CREATE TABLE halfak.reverter_months_20110115
 213+SELECT
 214+ rev_user,
 215+ rev_user_text,
 216+ YEAR(rev_timestamp) AS year,
 217+ MONTH(rev_timestamp) AS month,
 218+ COUNT(*) as revisions,
 219+ SUM(revert.rev_id IS NOT NULL) as reverts,
 220+ SUM(revert.rev_id IS NOT NULL AND revert.is_vandalism) as vandal_reverts
 221+FROM revision
 222+LEFT JOIN halfak.revert_20110115 revert USING (rev_id)
 223+WHERE rev_timestamp < "20110115000000"
 224+GROUP BY rev_user, YEAR(rev_timestamp), MONTH(rev_timestamp);
 225+
 226+
 227+CREATE TABLE halfak.admins_20110911
 228+SELECT
 229+ user_id,
 230+ user_name,
 231+ user_email_authenticated,
 232+ user_registration,
 233+ user_editcount,
 234+ count(*) as actions,
 235+ min(log_timestamp) as became_admin
 236+FROM logging
 237+INNER JOIN user ON REPLACE(log_title, "_", " ") = user_name
 238+WHERE log_action = "rights"
 239+AND log_type = "rights"
 240+AND (
 241+ log_comment LIKE "%+sysop%" OR
 242+ log_params LIKE "%sysop%"
 243+)
 244+GROUP BY user.user_id;
 245+CREATE UNIQUE INDEX user_idx ON halfak.admins_20110911 (user_id);
 246+
 247+
 248+--
 249+--Limit bot reverts to namespace zero
 250+--revision_copy
 251+--
 252+CREATE TABLE halfak.actor_revert_months
 253+SELECT
 254+ IF(
 255+ bot.user_id IS NOT NULL,
 256+ "bot",
 257+ IF(
 258+ tool IS NOT NULL,
 259+ "tool",
 260+ "human"
 261+ )
 262+ ) AS actor,
 263+ YEAR(rc.rev_timestamp) AS year,
 264+ MONTH(rc.rev_timestamp) as month,
 265+ COUNT(*) AS revisions,
 266+ SUM(revert.rev_id IS NOT NULL) as reverts,
 267+ SUM(revert.rev_id IS NOT NULL AND is_vandalism) as vandal_reverts
 268+FROM staeiou.revision_copy rc
 269+INNER JOIN enwiki.revision r USING (rev_id)
 270+INNER JOIN enwiki.page p ON page_id = r.rev_page
 271+LEFT JOIN halfak.revert_20110115 revert USING (rev_id)
 272+LEFT JOIN halfak.bot_20110711 bot ON rc.rev_user = user_id
 273+WHERE rc.rev_timestamp < "20110115000000"
 274+AND page_namespace = 0
 275+GROUP BY 1, 2, 3;
Index: trunk/tools/wsor/scripts/reverts.py
@@ -53,7 +53,6 @@
5454 revertedFile = open(args.output_prefix + "reverted.tsv", "w")
5555 logging.info("Creating output file: %s" % (args.output_prefix + "reverted.tsv"))
5656
57 - print(args.dump)
5857 logging.info("Prcoessing...")
5958 for out in dump.map(args.dump, reverts.process, threads=args.threads):
6059 if out[0] == 'revert':
@@ -63,6 +62,7 @@
6463 revertedFile.write("\t".join(encode(v) for v in out[1:]) + "\n")
6564 LOGGING_STREAM.write(".")
6665
 66+ LOGGING_STREAM.write("\n")
6767
6868 revertFile.close()
6969 revertedFile.close()
Index: trunk/tools/wsor/message_templates/message_postings.py
@@ -0,0 +1,148 @@
 2+import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time
 3+import urllib, urllib2
 4+import wmf
 5+
 6+def encode(v):
 7+ if v == None: return "\N"
 8+
 9+ if type(v) == types.LongType: v = int(v)
 10+ elif type(v) == types.UnicodeType: v = v.encode('utf-8')
 11+
 12+ return str(v).encode("string-escape")
 13+
 14+# | year | month | day | hour | minute | second |
 15+MW_DATE = re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]")
 16+
 17+def mwDate(string):
 18+ if MW_DATE.match(string) == None:
 19+ raise ValueError("%s is not a valid date. Expected YYMMDDHHmmSS" % string)
 20+ else:
 21+ return string
 22+
 23+def main():
 24+ parser = argparse.ArgumentParser(
 25+ description='Gathers template message postings based on comment and diff matching regular expressions.'
 26+ )
 27+ parser.add_argument(
 28+ '-c', '--cnf',
 29+ metavar="<path>",
 30+ type=str,
 31+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
 32+ default=os.path.expanduser("~/.my.cnf")
 33+ )
 34+ parser.add_argument(
 35+ '-s', '--host',
 36+ type=str,
 37+ help='the database host to connect to (defaults to localhost)',
 38+ default="localhost"
 39+ )
 40+ parser.add_argument(
 41+ '-d', '--db',
 42+ type=str,
 43+ help='the language db to run the query in (defaults to enwiki)',
 44+ default="enwiki"
 45+ )
 46+ parser.add_argument(
 47+ '-a', '--api_uri',
 48+ type=str,
 49+ help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
 50+ default="http://en.wikipedia.org/w/api.php"
 51+ )
 52+ parser.add_argument(
 53+ '--before',
 54+ type=str,
 55+ help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)',
 56+ default="http://en.wikipedia.org/w/api.php"
 57+ )
 58+ parser.add_argument(
 59+ 'after',
 60+ type=mwDate,
 61+ help='regular expression to match against message content'
 62+ )
 63+ parser.add_argument(
 64+ 'comment',
 65+ type=re.compile,
 66+ help='regular expression to match against message posting comment'
 67+ )
 68+ parser.add_argument(
 69+ 'message',
 70+ type=re.compile,
 71+ help='regular expression to match against message content'
 72+ )
 73+ args = parser.parse_args()
 74+
 75+ LOGGING_STREAM = sys.stderr
 76+ logging.basicConfig(
 77+ level=logging.DEBUG,
 78+ stream=LOGGING_STREAM,
 79+ format='%(asctime)s %(levelname)-8s %(message)s',
 80+ datefmt='%b-%d %H:%M:%S'
 81+ )
 82+
 83+ logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
 84+ db = Database(
 85+ host=args.host,
 86+ db=args.db,
 87+ read_default_file=args.cnf
 88+ )
 89+
 90+
 91+
 92+class Database:
 93+
 94+ def __init__(self, *args, **kwargs):
 95+ self.args = args
 96+ self.kwargs = kwargs
 97+ self.conn = MySQLdb.connect(*args, **kwargs)
 98+
 99+ def getPostings(self, afterDate, commentPattern):
 100+ cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
 101+ cursor.execute(
 102+ """
 103+ SELECT * FROM
 104+ FROM revision
 105+ WHERE rev_timestamp > %(afterDate)s
 106+ AND rev_comment REGEXP %(commentPattern)s
 107+ """,
 108+ {
 109+ 'afterDate': afterDate,
 110+ 'commentPattern': commentPattern
 111+ }
 112+ )
 113+
 114+ for row in cursor:
 115+ yield row
 116+
 117+
 118+
 119+class WPAPI:
 120+
 121+ def __init__(self, uri):
 122+ self.uri = uri
 123+
 124+ def getDiff(self, revId):
 125+
 126+ response = urllib2.urlopen(
 127+ self.uri,
 128+ data=urllib.urlencode({
 129+ 'action': "query",
 130+ 'prop': "revisions",
 131+ 'revids': revId,
 132+ 'rvprop': "diff",
 133+ 'format': "json"
 134+ })
 135+ )
 136+
 137+ js = json.load(response)
 138+
 139+
 140+
 141+ try:
 142+ if 'badrevids' in js['query']:
 143+ raise KeyError(revId)
 144+ else:
 145+ return js['query']['pages'].values()[0]['revisions'][0]['diff']['*']
 146+ except KeyError:
 147+
 148+
 149+

Status & tagging log