Index: trunk/tools/wsor/overworked/R/reverter_work.R |
— | — | @@ -1,16 +1,33 @@ |
2 | 2 | source("loader/load_reverter_months.R") |
3 | | - |
| 3 | +source("loader/load_bots.R") |
4 | 4 | library(lattice) |
5 | 5 | library(grid) |
6 | 6 | library(doBy) |
7 | 7 | |
8 | 8 | reverter_months = load_reverter_months() |
9 | | -reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),] |
10 | | -reverter_months = reverter_months[reverter_months$username != "DASHBotAV",] |
| 9 | +bots = load_bots() |
| 10 | + |
| 11 | +padMonth = function(x){ |
| 12 | + sapply( |
| 13 | + x, |
| 14 | + function(val){ |
| 15 | + if(is.na(val)){ |
| 16 | + NA |
| 17 | + } |
| 18 | + else if(val<10){ |
| 19 | + paste("0", val, sep="") |
| 20 | + }else{ |
| 21 | + paste(val) |
| 22 | + } |
| 23 | + } |
| 24 | + ) |
| 25 | +} |
| 26 | + |
| 27 | +reverter_months$bot = reverter_months$user_id %in% bots$user_id |
11 | 28 | reverter_months$active = reverter_months$revisions >= 5 |
12 | 29 | reverter_months$year.month = with( |
13 | 30 | reverter_months, |
14 | | - as.factor(paste(year, month, sep="/")) |
| 31 | + as.factor(paste(year, padMonth(month), sep="/")) |
15 | 32 | ) |
16 | 33 | |
17 | 34 | vfer_years = with( |
— | — | @@ -54,18 +71,17 @@ |
55 | 72 | |
56 | 73 | activity_months = summaryBy( |
57 | 74 | vandal_reverts + reverts + revisions ~ year.month, |
58 | | - data=reverter_months, |
| 75 | + data=reverter_months[!reverter_months$bot,], |
59 | 76 | FUN=c(mean, sd, length) |
60 | 77 | ) |
61 | 78 | |
62 | | -plot_activity_mean = function(year.month, m, s, n, name){ |
63 | | - model = lm( |
64 | | - m ~ as.numeric(year.month), |
65 | | - data=activity_months |
| 79 | +plot_activity_mean = function(year.month, m, s, n, name, model=T){ |
| 80 | + lmodel = lm( |
| 81 | + m ~ as.numeric(year.month) |
66 | 82 | ) |
67 | | - summary(model) |
| 83 | + modelSummary = summary(lmodel) |
68 | 84 | monthLine = function(x){ |
69 | | - model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 85 | + lmodel$coefficients[['(Intercept)']] + lmodel$coefficients[['as.numeric(year.month)']]*x |
70 | 86 | } |
71 | 87 | |
72 | 88 | print(xyplot( |
— | — | @@ -75,21 +91,23 @@ |
76 | 92 | se = s[subscripts]/sqrt(n[subscripts]) |
77 | 93 | panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...) |
78 | 94 | panel.lines(x[order(x)], y[order(x)], lwd=2, ...) |
79 | | - panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000") |
80 | | - grid.text( |
81 | | - paste( |
82 | | - "R^2=", round(modelSummary$r.squared, 3), |
83 | | - " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
84 | | - " p=", round(modelSummary$coefficients[2,4], 8) |
85 | | - ), |
86 | | - .5, |
87 | | - .95 |
88 | | - ) |
| 95 | + if(model){ |
| 96 | + panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000") |
| 97 | + grid.text( |
| 98 | + paste( |
| 99 | + "R^2=", round(modelSummary$r.squared, 3), |
| 100 | + " coef=", round(lmodel$coefficients[['as.numeric(year.month)']], 5), |
| 101 | + " p=", round(modelSummary$coefficients[2,4], 8) |
| 102 | + ), |
| 103 | + .5, |
| 104 | + .95 |
| 105 | + ) |
| 106 | + } |
89 | 107 | }, |
90 | 108 | #main="Average Patroller workload by month", |
91 | 109 | ylab=paste("Mean", name, "per user-month"), |
92 | | - xlab="Month", |
93 | | - scales=list(x=list(rot=45)), |
| 110 | + xlab="Year", |
| 111 | + scales=list(x=list(rot=45, at=0:9*12, labels=2001:2010)), |
94 | 112 | ylim=c(0, max(m)*1.1) |
95 | 113 | )) |
96 | 114 | } |
— | — | @@ -119,7 +137,7 @@ |
120 | 138 | |
121 | 139 | top_vfers = data.frame() |
122 | 140 | for(year.month in unique(reverter_months$year.month)){ |
123 | | - month_vfers = reverter_months[reverter_months$year.month == year.month,] |
| 141 | + month_vfers = reverter_months[!reverter_months$bot & reverter_months$year.month == year.month,] |
124 | 142 | cat("Adding", year.month, "...") |
125 | 143 | top_vfers = rbind( |
126 | 144 | top_vfers, |
— | — | @@ -127,39 +145,107 @@ |
128 | 146 | ) |
129 | 147 | cat("DONE!\n") |
130 | 148 | } |
| 149 | +top_vfers$year.month = with( |
| 150 | + top_vfers, |
| 151 | + as.factor(paste(year, padMonth(as.numeric(as.character(top_vfers$month))), sep="/")) |
| 152 | +) |
131 | 153 | |
132 | 154 | top_activity_months = summaryBy( |
133 | 155 | vandal_reverts + reverts + revisions ~ year.month, |
134 | | - data=top_vfers, |
| 156 | + data=top_vfers[ |
| 157 | + top_vfers$year.month != "NULL/NA" & |
| 158 | + top_vfers$year.month != "NA/NA" & |
| 159 | + top_vfers$year != "2011", |
| 160 | + ], |
135 | 161 | FUN=c(mean, sd, length) |
136 | 162 | ) |
137 | 163 | |
138 | | -png("plots/reverting_revisions.per_user_month.top_50.png", height=768, width=1024) |
| 164 | +#png("plots/reverting_revisions.per_user_month.top_50.png", height=768, width=1024) |
| 165 | +pdf("plots/reverting_revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special") |
139 | 166 | with( |
140 | 167 | top_activity_months, |
141 | | - plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions") |
| 168 | + plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions", model=F) |
142 | 169 | ) |
143 | 170 | dev.off() |
144 | 171 | |
145 | | -png("plots/vandal_reverting_revisions.per_user_month.top_50.png", height=768, width=1024) |
| 172 | + |
| 173 | +#png("plots/vandal_reverting_revisions.per_user_month.top_50.png", height=768, width=1024) |
| 174 | +pdf("plots/vandal_reverting_revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special") |
146 | 175 | with( |
147 | 176 | top_activity_months, |
148 | | - plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions") |
| 177 | + plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions", model=F) |
149 | 178 | ) |
150 | 179 | dev.off() |
151 | 180 | |
152 | | -png("plots/revisions.per_user_month.top_50.png", height=768, width=1024) |
| 181 | +#png("plots/revisions.per_user_month.top_50.png", height=768, width=1024) |
| 182 | +pdf("plots/revisions.per_user_month.top_50.pdf", height=6, width=8, paper="special") |
153 | 183 | with( |
154 | 184 | top_activity_months, |
155 | | - plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions") |
| 185 | + plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions", model=F) |
156 | 186 | ) |
157 | 187 | dev.off() |
158 | 188 | |
159 | 189 | |
160 | 190 | |
161 | 191 | |
| 192 | +plot_activity_mean = function(year.month, m, s, n, name, model=T){ |
| 193 | + lmodel = lm( |
| 194 | + m ~ as.numeric(year.month) |
| 195 | + ) |
| 196 | + modelSummary = summary(lmodel) |
| 197 | + monthLine = function(x){ |
| 198 | + lmodel$coefficients[['(Intercept)']] + lmodel$coefficients[['as.numeric(year.month)']]*x |
| 199 | + } |
| 200 | + |
| 201 | + print(xyplot( |
| 202 | + m ~ as.factor(year.month), |
| 203 | + panel = function(x, y, subscripts, ...){ |
| 204 | + panel.xyplot(x, y, ...) |
| 205 | + se = s[subscripts]/sqrt(n[subscripts]) |
| 206 | + panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...) |
| 207 | + panel.lines(x[order(x)], y[order(x)], lwd=2, ...) |
| 208 | + if(model){ |
| 209 | + panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000") |
| 210 | + grid.text( |
| 211 | + paste( |
| 212 | + "R^2=", round(modelSummary$r.squared, 3), |
| 213 | + " coef=", round(lmodel$coefficients[['as.numeric(year.month)']], 5), |
| 214 | + " p=", round(modelSummary$coefficients[2,4], 8) |
| 215 | + ), |
| 216 | + .5, |
| 217 | + .95 |
| 218 | + ) |
| 219 | + } |
| 220 | + }, |
| 221 | + #main="Average Patroller workload by month", |
| 222 | + ylab=paste("Mean", name, "per user-month"), |
| 223 | + xlab="Year", |
| 224 | + scales=list(x=list(rot=45, at=0:3*12, labels=2007:2010)), |
| 225 | + ylim=c(0, max(m)*1.1) |
| 226 | + )) |
| 227 | +} |
162 | 228 | |
| 229 | +limited_activity_months = summaryBy( |
| 230 | + vandal_reverts + reverts + revisions ~ year.month, |
| 231 | + data=top_vfers[ |
| 232 | + top_vfers$year.month != "NULL/NA" & |
| 233 | + top_vfers$year.month != "NA/NA" & |
| 234 | + as.numeric(as.character(top_vfers$year)) >= 2007 & |
| 235 | + top_vfers$year != "2011", |
| 236 | + ], |
| 237 | + FUN=c(mean, sd, length) |
| 238 | +) |
163 | 239 | |
| 240 | +pdf("plots/reverting_revisions.per_user_month.top_50.after_2007.pdf", height=6, width=8, paper="special") |
| 241 | +with( |
| 242 | + limited_activity_months, |
| 243 | + plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions") |
| 244 | +) |
| 245 | +dev.off() |
164 | 246 | |
165 | 247 | |
166 | 248 | |
| 249 | + |
| 250 | + |
| 251 | + |
| 252 | + |
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | |
5 | 5 | |
6 | 6 | load_reverter_months = function(verbose=T, reload=F){ |
7 | | - filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.final.tsv", sep="/") |
| 7 | + filename = paste(DATA_DIR, "reverter_months_20110115.tsv", sep="/") |
8 | 8 | if(!exists("REVERTER_MONTHS")){ |
9 | 9 | REVERTER_MONTHS <<- NULL |
10 | 10 | } |
Index: trunk/tools/wsor/overworked/R/loader/load_reverting_months.R |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | |
5 | 5 | |
6 | 6 | load_reverting_months = function(verbose=T, reload=F){ |
7 | | - filename = paste(DATA_DIR, "en.reverting_years.20110115.tsv", sep="/") |
| 7 | + filename = paste(DATA_DIR, "reverting_months.20110115.tsv", sep="/") |
8 | 8 | if(!exists("REVERTING_MONTHS")){ |
9 | 9 | REVERTING_MONTHS <<- NULL |
10 | 10 | } |
Index: trunk/tools/wsor/overworked/testing.sql |
— | — | @@ -173,8 +173,8 @@ |
174 | 174 | |
175 | 175 | |
176 | 176 | SELECT |
177 | | - SUBSTR(rev_timestamp, 1,4), |
178 | | - SUBSTR(rev_timestamp, 1,2), |
| 177 | + YEAR(rev_timestamp), |
| 178 | + MONTH(rev_timestamp), |
179 | 179 | count(*), |
180 | 180 | sum(revision_id IS NOT NULL), |
181 | 181 | sum(revision_id IS NOT NULL AND is_vandalism) |
— | — | @@ -207,3 +207,68 @@ |
208 | 208 | SELECT user_id FROM zexley.bots |
209 | 209 | ); |
210 | 210 | |
| 211 | + |
| 212 | +CREATE TABLE halfak.reverter_months_20110115 |
| 213 | +SELECT |
| 214 | + rev_user, |
| 215 | + rev_user_text, |
| 216 | + YEAR(rev_timestamp) AS year, |
| 217 | + MONTH(rev_timestamp) AS month, |
| 218 | + COUNT(*) as revisions, |
| 219 | + SUM(revert.rev_id IS NOT NULL) as reverts, |
| 220 | + SUM(revert.rev_id IS NOT NULL AND revert.is_vandalism) as vandal_reverts |
| 221 | +FROM revision |
| 222 | +LEFT JOIN halfak.revert_20110115 revert USING (rev_id) |
| 223 | +WHERE rev_timestamp < "20110115000000" |
| 224 | +GROUP BY rev_user, YEAR(rev_timestamp), MONTH(rev_timestamp); |
| 225 | + |
| 226 | + |
| 227 | +CREATE TABLE halfak.admins_20110911 |
| 228 | +SELECT |
| 229 | + user_id, |
| 230 | + user_name, |
| 231 | + user_email_authenticated, |
| 232 | + user_registration, |
| 233 | + user_editcount, |
| 234 | + count(*) as actions, |
| 235 | + min(log_timestamp) as became_admin |
| 236 | +FROM logging |
| 237 | +INNER JOIN user ON REPLACE(log_title, "_", " ") = user_name |
| 238 | +WHERE log_action = "rights" |
| 239 | +AND log_type = "rights" |
| 240 | +AND ( |
| 241 | + log_comment LIKE "%+sysop%" OR |
| 242 | + log_params LIKE "%sysop%" |
| 243 | +) |
| 244 | +GROUP BY user.user_id; |
| 245 | +CREATE UNIQUE INDEX user_idx ON halfak.admins_20110911 (user_id); |
| 246 | + |
| 247 | + |
| 248 | +-- |
| 249 | +--Limit bot reverts to namespace zero |
| 250 | +--revision_copy |
| 251 | +-- |
| 252 | +CREATE TABLE halfak.actor_revert_months |
| 253 | +SELECT |
| 254 | + IF( |
| 255 | + bot.user_id IS NOT NULL, |
| 256 | + "bot", |
| 257 | + IF( |
| 258 | + tool IS NOT NULL, |
| 259 | + "tool", |
| 260 | + "human" |
| 261 | + ) |
| 262 | + ) AS actor, |
| 263 | + YEAR(rc.rev_timestamp) AS year, |
| 264 | + MONTH(rc.rev_timestamp) as month, |
| 265 | + COUNT(*) AS revisions, |
| 266 | + SUM(revert.rev_id IS NOT NULL) as reverts, |
| 267 | + SUM(revert.rev_id IS NOT NULL AND is_vandalism) as vandal_reverts |
| 268 | +FROM staeiou.revision_copy rc |
| 269 | +INNER JOIN enwiki.revision r USING (rev_id) |
| 270 | +INNER JOIN enwiki.page p ON page_id = r.rev_page |
| 271 | +LEFT JOIN halfak.revert_20110115 revert USING (rev_id) |
| 272 | +LEFT JOIN halfak.bot_20110711 bot ON rc.rev_user = user_id |
| 273 | +WHERE rc.rev_timestamp < "20110115000000" |
| 274 | +AND page_namespace = 0 |
| 275 | +GROUP BY 1, 2, 3; |
Index: trunk/tools/wsor/scripts/reverts.py |
— | — | @@ -53,7 +53,6 @@ |
54 | 54 | revertedFile = open(args.output_prefix + "reverted.tsv", "w") |
55 | 55 | logging.info("Creating output file: %s" % (args.output_prefix + "reverted.tsv")) |
56 | 56 | |
57 | | - print(args.dump) |
58 | 57 | logging.info("Prcoessing...") |
59 | 58 | for out in dump.map(args.dump, reverts.process, threads=args.threads): |
60 | 59 | if out[0] == 'revert': |
— | — | @@ -63,6 +62,7 @@ |
64 | 63 | revertedFile.write("\t".join(encode(v) for v in out[1:]) + "\n") |
65 | 64 | LOGGING_STREAM.write(".") |
66 | 65 | |
| 66 | + LOGGING_STREAM.write("\n") |
67 | 67 | |
68 | 68 | revertFile.close() |
69 | 69 | revertedFile.close() |
Index: trunk/tools/wsor/message_templates/message_postings.py |
— | — | @@ -0,0 +1,148 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
| 3 | +import urllib, urllib2 |
| 4 | +import wmf |
| 5 | + |
| 6 | +def encode(v): |
| 7 | + if v == None: return "\N" |
| 8 | + |
| 9 | + if type(v) == types.LongType: v = int(v) |
| 10 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 11 | + |
| 12 | + return str(v).encode("string-escape") |
| 13 | + |
| 14 | +# | year | month | day | hour | minute | second | |
| 15 | +MW_DATE = re.compile(r"[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]") |
| 16 | + |
| 17 | +def mwDate(string): |
| 18 | + if MW_DATE.match(string) == None: |
| 19 | + raise ValueError("%s is not a valid date. Expected YYMMDDHHmmSS" % string) |
| 20 | + else: |
| 21 | + return string |
| 22 | + |
| 23 | +def main(): |
| 24 | + parser = argparse.ArgumentParser( |
| 25 | + description='Gathers template message postings based on comment and diff matching regular expressions.' |
| 26 | + ) |
| 27 | + parser.add_argument( |
| 28 | + '-c', '--cnf', |
| 29 | + metavar="<path>", |
| 30 | + type=str, |
| 31 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 32 | + default=os.path.expanduser("~/.my.cnf") |
| 33 | + ) |
| 34 | + parser.add_argument( |
| 35 | + '-s', '--host', |
| 36 | + type=str, |
| 37 | + help='the database host to connect to (defaults to localhost)', |
| 38 | + default="localhost" |
| 39 | + ) |
| 40 | + parser.add_argument( |
| 41 | + '-d', '--db', |
| 42 | + type=str, |
| 43 | + help='the language db to run the query in (defaults to enwiki)', |
| 44 | + default="enwiki" |
| 45 | + ) |
| 46 | + parser.add_argument( |
| 47 | + '-a', '--api_uri', |
| 48 | + type=str, |
| 49 | + help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
| 50 | + default="http://en.wikipedia.org/w/api.php" |
| 51 | + ) |
| 52 | + parser.add_argument( |
| 53 | + '--before', |
| 54 | + type=str, |
| 55 | + help='the default Wikimedia API to connect to in order to retrieve message content (defaults to http://en.wikipedia.org/w/api.php)', |
| 56 | + default="http://en.wikipedia.org/w/api.php" |
| 57 | + ) |
| 58 | + parser.add_argument( |
| 59 | + 'after', |
| 60 | + type=mwDate, |
| 61 | + help='regular expression to match against message content' |
| 62 | + ) |
| 63 | + parser.add_argument( |
| 64 | + 'comment', |
| 65 | + type=re.compile, |
| 66 | + help='regular expression to match against message posting comment' |
| 67 | + ) |
| 68 | + parser.add_argument( |
| 69 | + 'message', |
| 70 | + type=re.compile, |
| 71 | + help='regular expression to match against message content' |
| 72 | + ) |
| 73 | + args = parser.parse_args() |
| 74 | + |
| 75 | + LOGGING_STREAM = sys.stderr |
| 76 | + logging.basicConfig( |
| 77 | + level=logging.DEBUG, |
| 78 | + stream=LOGGING_STREAM, |
| 79 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 80 | + datefmt='%b-%d %H:%M:%S' |
| 81 | + ) |
| 82 | + |
| 83 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 84 | + db = Database( |
| 85 | + host=args.host, |
| 86 | + db=args.db, |
| 87 | + read_default_file=args.cnf |
| 88 | + ) |
| 89 | + |
| 90 | + |
| 91 | + |
| 92 | +class Database: |
| 93 | + |
| 94 | + def __init__(self, *args, **kwargs): |
| 95 | + self.args = args |
| 96 | + self.kwargs = kwargs |
| 97 | + self.conn = MySQLdb.connect(*args, **kwargs) |
| 98 | + |
| 99 | + def getPostings(self, afterDate, commentPattern): |
| 100 | + cursor = self.conn.cursor(MySQLdb.cursors.DictCursor) |
| 101 | + cursor.execute( |
| 102 | + """ |
| 103 | + SELECT * FROM |
| 104 | + FROM revision |
| 105 | + WHERE rev_timestamp > %(afterDate)s |
| 106 | + AND rev_comment REGEXP %(commentPattern)s |
| 107 | + """, |
| 108 | + { |
| 109 | + 'afterDate': afterDate, |
| 110 | + 'commentPattern': commentPattern |
| 111 | + } |
| 112 | + ) |
| 113 | + |
| 114 | + for row in cursor: |
| 115 | + yield row |
| 116 | + |
| 117 | + |
| 118 | + |
| 119 | +class WPAPI: |
| 120 | + |
| 121 | + def __init__(self, uri): |
| 122 | + self.uri = uri |
| 123 | + |
| 124 | + def getDiff(self, revId): |
| 125 | + |
| 126 | + response = urllib2.urlopen( |
| 127 | + self.uri, |
| 128 | + data=urllib.urlencode({ |
| 129 | + 'action': "query", |
| 130 | + 'prop': "revisions", |
| 131 | + 'revids': revId, |
| 132 | + 'rvprop': "diff", |
| 133 | + 'format': "json" |
| 134 | + }) |
| 135 | + ) |
| 136 | + |
| 137 | + js = json.load(response) |
| 138 | + |
| 139 | + |
| 140 | + |
| 141 | + try: |
| 142 | + if 'badrevids' in js['query']: |
| 143 | + raise KeyError(revId) |
| 144 | + else: |
| 145 | + return js['query']['pages'].values()[0]['revisions'][0]['diff']['*'] |
| 146 | + except KeyError: |
| 147 | + |
| 148 | + |
| 149 | + |