Index: trunk/tools/wsor/overworked/R/reverter_work.R |
— | — | @@ -0,0 +1,143 @@ |
| 2 | +source("loader/load_reverter_months.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(grid) |
| 6 | +library(doBy) |
| 7 | +reverter_months = load_reverter_months() |
| 8 | +reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),] |
| 9 | +reverter_months = reverter_months[reverter_months$username != "DASHBotAV",] |
| 10 | +reverter_months$active = reverter_months$revisions >= 5 |
| 11 | +reverter_months$year.month = with( |
| 12 | + reverter_months, |
| 13 | + as.factor(paste(year, month, sep="/")) |
| 14 | +) |
| 15 | + |
| 16 | +vfer_years = with( |
| 17 | + summaryBy( |
| 18 | + reverts ~ year + user_id + username, |
| 19 | + data=reverter_months, |
| 20 | + FUN=sum |
| 21 | + ), |
| 22 | + data.frame( |
| 23 | + year = year, |
| 24 | + user_id = user_id, |
| 25 | + username = username, |
| 26 | + reverts = reverts.sum |
| 27 | + ) |
| 28 | +) |
| 29 | + |
| 30 | +top_vfers = data.frame() |
| 31 | + |
| 32 | +for(year in unique(vfer_years$year)){ |
| 33 | + vfer_year = vfer_years[vfer_years$year==year,] |
| 34 | + vfer_year = vfer_year[order(vfer_year$reverts, decreasing=T),] |
| 35 | + png(paste('plots/vandal_fighter_activity', 'no_bots', year, 'png', sep="."), height=768, width=1024) |
| 36 | + print(barchart( |
| 37 | + reorder(username, reverts) ~ reverts, |
| 38 | + data=vfer_year[1:50,], |
| 39 | + horizontal=T, |
| 40 | + xlab="Vandal reverts", |
| 41 | + xlim=c(0,50000) |
| 42 | + )) |
| 43 | + dev.off() |
| 44 | +} |
| 45 | +# Format a p-value for display: "< .001" for very small values, otherwise "= <pval>". |
| 46 | +format_p = function(pval){ |
| 47 | + if(pval < .001){ # numeric threshold; the quoted ".001" turned this into a string comparison |
| 48 | + "< .001" |
| 49 | + }else{ |
| 50 | + paste("=", pval) |
| 51 | + } |
| 52 | +} |
| 53 | + |
| 54 | + |
| 55 | +activity_months = summaryBy( |
| 56 | + vandal_reverts + reverts + revisions ~ year.month, |
| 57 | + data=reverter_months, |
| 58 | + FUN=c(mean, sd, length) |
| 59 | +) |
| 60 | +# Plot a per-month mean with standard-error bars, a connecting line and a fitted linear trend. |
| 61 | +plot_activity_mean = function(year.month, m, s, n, name){ # m, s, n: per-month mean, sd and count; name: text for the y-axis label |
| 62 | + model = lm( |
| 63 | + m ~ as.numeric(year.month), |
| 64 | + data=data.frame(m=m, year.month=year.month) # fit on the arguments rather than the global activity_months |
| 65 | + ) |
| 66 | + modelSummary = summary(model) # the annotation below reads modelSummary; the result used to be discarded |
| 67 | + monthLine = function(x){ # fitted value at month index x |
| 68 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 69 | + } |
| 70 | + |
| 71 | + print(xyplot( |
| 72 | + m ~ as.factor(year.month), |
| 73 | + panel = function(x, y, subscripts, ...){ |
| 74 | + panel.xyplot(x, y, ...) |
| 75 | + se = s[subscripts]/sqrt(n[subscripts]) # standard error of each monthly mean |
| 76 | + panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...) |
| 77 | + panel.lines(x[order(x)], y[order(x)], lwd=2, ...) |
| 78 | + panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000") # regression line |
| 79 | + grid.text( |
| 80 | + paste( |
| 81 | + "R^2=", round(modelSummary$r.squared, 3), |
| 82 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 83 | + " p=", round(modelSummary$coefficients[2,4], 8) |
| 84 | + ), |
| 85 | + .5, |
| 86 | + .95 |
| 87 | + ) |
| 88 | + }, |
| 89 | + #main="Average Patroller workload by month", |
| 90 | + ylab=paste("Mean", name, "per user-month"), |
| 91 | + xlab="Month", |
| 92 | + scales=list(x=list(rot=45)), |
| 93 | + ylim=c(0, max(m)*1.1) |
| 94 | + )) |
| 95 | +} |
| 96 | + |
| 97 | + |
| 98 | +png("plots/reverting_revisions.per_user_month.png", height=768, width=1024) |
| 99 | +with( |
| 100 | + activity_months, |
| 101 | + plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions") |
| 102 | +) |
| 103 | +dev.off() |
| 104 | + |
| 105 | +png("plots/vandal_reverting_revisions.per_user_month.png", height=768, width=1024) |
| 106 | +with( |
| 107 | + activity_months, |
| 108 | + plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions") |
| 109 | +) |
| 110 | +dev.off() |
| 111 | + |
| 112 | +png("plots/revisions.per_user_month.png", height=768, width=1024) |
| 113 | +with( |
| 114 | + activity_months, |
| 115 | + plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions") |
| 116 | +) |
| 117 | +dev.off() |
| 118 | + |
| 119 | +# Collect the 50 users with the most reverts in each month, then summarise activity for that group. |
| 120 | +top_vfers = data.frame() |
| 121 | +for(year.month in unique(reverter_months$year.month)){ |
| 122 | + month_vfers = reverter_months[reverter_months$year.month == year.month,] |
| 123 | + cat("Adding", year.month, "...") |
| 124 | + top_vfers = rbind( |
| 125 | + top_vfers, |
| 126 | + head(month_vfers[order(month_vfers$reverts, decreasing=T),], 50) # descending = top reverters; head() adds no NA rows for short months |
| 127 | + ) |
| 128 | + cat("DONE!\n") |
| 129 | +} |
| 130 | + |
| 131 | +top_activity_months = summaryBy( |
| 132 | + vandal_reverts + reverts + revisions ~ year.month, |
| 133 | + data=top_vfers, # was reverter_months, which only reproduced activity_months |
| 134 | + FUN=c(mean, sd, length) |
| 135 | +) |
| 136 | + |
| 137 | + |
| 138 | + |
| 139 | + |
| 140 | + |
| 141 | + |
| 142 | + |
| 143 | + |
| 144 | + |
Index: trunk/tools/wsor/overworked/R/vandal_fighters.R |
— | — | @@ -0,0 +1,302 @@ |
| 2 | +source("loader/load_reverter_months.R") |
| 3 | + |
| 4 | +reverter_months = load_reverter_months() |
| 5 | +reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),] |
| 6 | +reverter_months$vf5 = reverter_months$vandal_reverts >= 5 |
| 7 | +reverter_months$vf50 = reverter_months$vandal_reverts >= 50 |
| 8 | +reverter_months$vf500 = reverter_months$vandal_reverts >= 500 |
| 9 | +reverter_months$r5 = reverter_months$reverts >= 5 |
| 10 | +reverter_months$r50 = reverter_months$reverts >= 50 |
| 11 | +reverter_months$r500 = reverter_months$reverts >= 500 |
| 12 | + |
| 13 | +reverter_months$active = reverter_months$revisions >= 5 |
| 14 | +reverter_months$year.month = with( |
| 15 | + reverter_months, |
| 16 | + as.factor(paste(year, month, sep="/")) |
| 17 | +) |
| 18 | + |
| 19 | +library(doBy) |
| 20 | +library(lattice) |
| 21 | +# Per-month counts of active editors at each revert threshold, then raw and log10 plots of them. |
| 22 | +activity_counts = with( |
| 23 | + summaryBy( |
| 24 | + active + vf5 + vf50 + vf500 + r5 + r50 + r500 ~ year + month, # group by year and month: data.frame() below reads both |
| 25 | + data=reverter_months[reverter_months$active,], |
| 26 | + FUN=sum |
| 27 | + ), |
| 28 | + data.frame( |
| 29 | + year = year, |
| 30 | + month = month, |
| 31 | + active.users = active.sum, |
| 32 | + vf5.users = vf5.sum, |
| 33 | + vf50.users = vf50.sum, |
| 34 | + vf500.users = vf500.sum, |
| 35 | + r5.users = r5.sum, |
| 36 | + r50.users = r50.sum, |
| 37 | + r500.users = r500.sum, |
| 38 | + year.month = as.factor(paste(year, month, sep="/")) # NOTE(review): levels sort as text, so "2007/10" precedes "2007/2" unless month is zero-padded -- confirm |
| 39 | + ) |
| 40 | +) |
| 41 | +activity_counts$log.active.users = log(activity_counts$active.users, base=10) # column names now match those built above and read below |
| 42 | +activity_counts$log.vf5.users = log(activity_counts$vf5.users, base=10) |
| 43 | +activity_counts$log.vf50.users = log(activity_counts$vf50.users, base=10) |
| 44 | + |
| 45 | + |
| 46 | +png("plots/vandal_fighters.by_month.png", width=1024, height=768) |
| 47 | +plot( |
| 48 | + activity_counts$year.month, |
| 49 | + (activity_counts$active.users*0)-10000, # off-scale white values: draws only the empty frame and axes |
| 50 | + col="#FFFFFF", |
| 51 | + ylim=c(0, max(activity_counts$active.users)+.5), |
| 52 | + main="Vandal fighters and active editors over time", |
| 53 | + xlab="Time (in months)", |
| 54 | + ylab="Number of users" |
| 55 | +) |
| 56 | +lines( |
| 57 | + activity_counts$year.month, |
| 58 | + activity_counts$active.users, |
| 59 | + type="o", |
| 60 | + pch=20, |
| 61 | + lty=1, |
| 62 | + col="blue" |
| 63 | +) |
| 64 | +lines( |
| 65 | + activity_counts$year.month, |
| 66 | + activity_counts$vf5.users, |
| 67 | + type="o", |
| 68 | + pch=22, |
| 69 | + lty=2, |
| 70 | + col="red" |
| 71 | +) |
| 72 | +legend( |
| 73 | + max(as.numeric(activity_counts$year.month))-10, |
| 74 | + max(activity_counts$active.users)+.5, |
| 75 | + c("active editors","vandal fighters"), |
| 76 | + cex=1.2, |
| 77 | + col=c("blue","red"), |
| 78 | + pch=c(20,22), # same symbols as the two lines() calls above |
| 79 | + lty=1:2 |
| 80 | +) |
| 81 | +dev.off() |
| 82 | + |
| 83 | +png("plots/vandal_fighters.by_month.logged.png", width=1024, height=768) |
| 84 | +plot( |
| 85 | + activity_counts$year.month, |
| 86 | + (activity_counts$log.active.users*0)-10000, # off-scale white values: draws only the empty frame and axes |
| 87 | + col="#FFFFFF", |
| 88 | + ylim=c(0, max(activity_counts$log.active.users)+.5), |
| 89 | + main="Vandal fighters and active editors over time", |
| 90 | + xlab="Time (in months)", |
| 91 | + ylab="Number of users (log10 scaled)" |
| 92 | +) |
| 93 | +lines( |
| 94 | + activity_counts$year.month, |
| 95 | + activity_counts$log.active.users, |
| 96 | + type="o", |
| 97 | + pch=20, |
| 98 | + lty=1, |
| 99 | + col="blue" |
| 100 | +) |
| 101 | +lines( |
| 102 | + activity_counts$year.month, |
| 103 | + activity_counts$log.vf5.users, |
| 104 | + type="o", |
| 105 | + pch=22, |
| 106 | + lty=2, |
| 107 | + col="red" |
| 108 | +) |
| 109 | +legend( |
| 110 | + max(as.numeric(activity_counts$year.month))-10, |
| 111 | + max(activity_counts$log.active.users)+.5, |
| 112 | + c("active editors","vandal fighters"), |
| 113 | + cex=1.2, |
| 114 | + col=c("blue","red"), |
| 115 | + pch=c(20,22), # same symbols as the two lines() calls above |
| 116 | + lty=1:2 |
| 117 | +) |
| 118 | +dev.off() |
| 119 | +# Plot a monthly proportion and its fitted linear trend to plots/<name>_prop.by_month.png. |
| 120 | +plot_prop_with_regression = function(year.month, prop, name, desc){ # name: file-name stem; desc: completes the title "Proportion of active editors who ..." |
| 121 | + model = lm( |
| 122 | + prop ~ as.numeric(year.month) |
| 123 | + ) |
| 124 | + modelSummary = summary(model) |
| 125 | + monthLine = function(x){ # fitted value at month index x |
| 126 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 127 | + } |
| 128 | + png(paste("plots/", name, "_prop.by_month.png", sep=""), width=1024, height=768) |
| 129 | + plot( |
| 130 | + year.month, |
| 131 | + (prop*0)-10000, # off-scale white values: draws only the empty frame and axes |
| 132 | + col="#FFFFFF", |
| 133 | + ylim=c(0, max(prop)*1.1), |
| 134 | + main=paste("Proportion of active editors who", desc, "(no bots)"), |
| 135 | + xlab="Month", |
| 136 | + ylab="Prop of vandal fighters" |
| 137 | + ) |
| 138 | + lines( |
| 139 | + year.month, |
| 140 | + prop, |
| 141 | + type="o", |
| 142 | + pch=22, |
| 143 | + lty=2, |
| 144 | + col="red" |
| 145 | + ) |
| 146 | + lines( |
| 147 | + year.month[order(year.month)], |
| 148 | + monthLine(as.numeric(year.month[order(year.month)])), # regression line |
| 149 | + lty=1, |
| 150 | + col="black" |
| 151 | + ) |
| 152 | + legend( |
| 153 | + mean(as.numeric(year.month)), |
| 154 | + max(prop)*1.1, |
| 155 | + paste( |
| 156 | + "R^2=", round(modelSummary$r.squared, 3), |
| 157 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 158 | + " p=", round(modelSummary$coefficients[2,4], 5) # p-value of the slope term |
| 159 | + ), |
| 160 | + cex=1.2, |
| 161 | + col=c("black"), |
| 162 | + lty=1 |
| 163 | + ) |
| 164 | + dev.off() |
| 165 | +} |
| 166 | +plot_prop_with_regression( |
| 167 | + activity_counts$year.month, |
| 168 | + activity_counts$vf5.users/activity_counts$active.users, |
| 169 | + "vandal_5", |
| 170 | + "revert >=5 vandals per month" |
| 171 | +) |
| 172 | +plot_prop_with_regression( |
| 173 | + activity_counts$year.month, |
| 174 | + activity_counts$vf50.users/activity_counts$active.users, |
| 175 | + "vandal_50", |
| 176 | + "revert >=50 vandals per month" |
| 177 | +) |
| 178 | +plot_prop_with_regression( |
| 179 | + activity_counts$year.month, |
| 180 | + activity_counts$vf500.users/activity_counts$active.users, |
| 181 | + "vandal_500", # was "vandal_50", which overwrote the >=50 plot written by the previous call |
| 182 | + "revert >=500 vandals per month" |
| 183 | +) |
| 184 | +plot_prop_with_regression( |
| 185 | + activity_counts$year.month, |
| 186 | + activity_counts$r5.users/activity_counts$active.users, |
| 187 | + "revert_5", |
| 188 | + "revert >=5 times per month" |
| 189 | +) |
| 190 | +plot_prop_with_regression( |
| 191 | + activity_counts$year.month, |
| 192 | + activity_counts$r50.users/activity_counts$active.users, |
| 193 | + "revert_50", |
| 194 | + "revert >=50 times per month" |
| 195 | +) |
| 196 | +plot_prop_with_regression( |
| 197 | + activity_counts$year.month, |
| 198 | + activity_counts$r500.users/activity_counts$active.users, |
| 199 | + "revert_500", |
| 200 | + "revert >=500 times per month" |
| 201 | +) |
| 202 | + |
| 203 | +# Share of active editors with >=5 vandal reverts, computed from this script's own columns. |
| 204 | +activity_counts$vandal_fighter_prop = activity_counts$vf5.users/activity_counts$active.users # users.vf and users.active are only built in revert_fighters.R |
| 205 | +activity_counts$year.month = as.factor(paste(activity_counts$year, activity_counts$month, sep="/")) |
| 206 | +model = lm( |
| 207 | + vandal_fighter_prop ~ as.numeric(year.month), |
| 208 | + data=activity_counts |
| 209 | +) |
| 210 | +modelSummary = summary(model) |
| 211 | +monthLine = function(x){ |
| 212 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 213 | +} |
| 214 | +png("plots/vandal_fighter_prop.by_month.png", width=1024, height=768) |
| 215 | +x = activity_counts$year.month |
| 216 | +y = activity_counts$vandal_fighter_prop |
| 217 | +plot( |
| 218 | + x, |
| 219 | + (y*0)-10000, |
| 220 | + col="#FFFFFF", |
| 221 | + ylim=c(0, max(y)+.01), |
| 222 | + main="Proportion of active editors who are vandal fighters (no bots)", |
| 223 | + xlab="Month", |
| 224 | + ylab="Prop of of vandal fighters" |
| 225 | +) |
| 226 | +lines( |
| 227 | + x, |
| 228 | + y, |
| 229 | + type="o", |
| 230 | + pch=22, |
| 231 | + lty=2, |
| 232 | + col="red" |
| 233 | +) |
| 234 | +lines( |
| 235 | + x[order(x)], |
| 236 | + monthLine(as.numeric(x[order(x)])), |
| 237 | + lty=1, |
| 238 | + col="black" |
| 239 | +) |
| 240 | +legend( |
| 241 | + mean(as.numeric(x)), |
| 242 | + max(y)+.01, |
| 243 | + paste( |
| 244 | + "R^2=", round(modelSummary$r.squared, 3), |
| 245 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 246 | + " p=", round(modelSummary$coefficients[2,4], 5) |
| 247 | + ), |
| 248 | + cex=1.2, |
| 249 | + col=c("black"), |
| 250 | + lty=1 |
| 251 | +) |
| 252 | +dev.off() |
| 253 | + |
| 254 | +# Share of active editors with >=50 vandal reverts, the threshold named in this plot's subtitle. |
| 255 | +activity_counts$super_vandal_fighter_prop = activity_counts$vf50.users/activity_counts$active.users # was users.vf/users.active: columns not built in this script, and not the >=50 group |
| 256 | +model = lm( |
| 257 | + super_vandal_fighter_prop ~ as.numeric(year.month), |
| 258 | + data=activity_counts |
| 259 | +) |
| 260 | +modelSummary = summary(model) |
| 261 | +monthLine = function(x){ |
| 262 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 263 | +} |
| 264 | +png("plots/super_vandal_fighter_prop.by_month.png", width=1024, height=768) |
| 265 | +x = activity_counts$year.month |
| 266 | +y = activity_counts$super_vandal_fighter_prop |
| 267 | +plot( |
| 268 | + x, |
| 269 | + (y*0)-10000, |
| 270 | + col="#FFFFFF", |
| 271 | + ylim=c(0, max(y)+.01), |
| 272 | + main="Proportion of active editors who are super vandal fighters (no bots)", |
| 273 | + sub="super vandal fighter >= 50 vandal reverts per month", |
| 274 | + xlab="Month", |
| 275 | + ylab="Prop of of vandal fighters" |
| 276 | +) |
| 277 | +lines( |
| 278 | + x, |
| 279 | + y, |
| 280 | + type="o", |
| 281 | + pch=22, |
| 282 | + lty=2, |
| 283 | + col="red" |
| 284 | +) |
| 285 | +lines( |
| 286 | + x[order(x)], |
| 287 | + monthLine(as.numeric(x[order(x)])), |
| 288 | + lty=1, |
| 289 | + col="black" |
| 290 | +) |
| 291 | +legend( |
| 292 | + mean(as.numeric(x)), |
| 293 | + max(y)+.01, |
| 294 | + paste( |
| 295 | + "R^2=", round(modelSummary$r.squared, 3), |
| 296 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 297 | + " p=", round(modelSummary$coefficients[2,4], 5) |
| 298 | + ), |
| 299 | + cex=1.2, |
| 300 | + col=c("black"), |
| 301 | + lty=1 |
| 302 | +) |
| 303 | +dev.off() |
Index: trunk/tools/wsor/overworked/R/revert_fighters.R |
— | — | @@ -0,0 +1,236 @@ |
| 2 | +source("loader/load_reverter_months.R") |
| 3 | + |
| 4 | +reverter_months = load_reverter_months() |
| 5 | +reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),] |
| 6 | +reverter_months$vfighter = reverter_months$vandal_reverts >= 5 |
| 7 | +reverter_months$svfighter = reverter_months$vandal_reverts >= 50 |
| 8 | +reverter_months$sdvfighter = reverter_months$vandal_reverts >= 500 |
| 9 | +reverter_months$reverter = reverter_months$reverts >= 5 |
| 10 | +reverter_months$active = reverter_months$revisions >= 5 |
| 11 | + |
| 12 | +library(doBy) |
| 13 | +library(lattice) |
| 14 | + |
| 15 | +vfighter_counts = with( |
| 16 | + summaryBy( |
| 17 | + user_id ~ year + month + vfighter, |
| 18 | + data=reverter_months[reverter_months$active,], |
| 19 | + FUN=length |
| 20 | + ), |
| 21 | + data.frame( |
| 22 | + year = year, |
| 23 | + month = month, |
| 24 | + vfighter = vfighter, |
| 25 | + users = user_id.length |
| 26 | + ) |
| 27 | +) |
| 28 | +svfighter_counts = with( |
| 29 | + summaryBy( |
| 30 | + user_id ~ year + month + svfighter, |
| 31 | + data=reverter_months[reverter_months$active,], |
| 32 | + FUN=length |
| 33 | + ), |
| 34 | + data.frame( |
| 35 | + year = year, |
| 36 | + month = month, |
| 37 | + svfighter = svfighter, |
| 38 | + users = user_id.length |
| 39 | + ) |
| 40 | +) |
| 41 | + |
| 42 | +activity_counts = merge( |
| 43 | + merge( |
| 44 | + vfighter_counts[vfighter_counts$vfighter,], |
| 45 | + vfighter_counts[!vfighter_counts$vfighter,], |
| 46 | + by=c("year", "month"), |
| 47 | + suffixes=c(".vf", ".nonvf") |
| 48 | + ), |
| 49 | + merge( |
| 50 | + svfighter_counts[svfighter_counts$svfighter,], |
| 51 | + svfighter_counts[!svfighter_counts$svfighter,], |
| 52 | + by=c("year", "month"), |
| 53 | + suffixes=c(".svf", ".nonsvf") |
| 54 | + |
| 55 | + ), |
| 56 | + by=c("year", "month") |
| 57 | +) |
| 58 | +activity_counts$users.active = activity_counts$users.vf + activity_counts$users.nonvf |
| 59 | +activity_counts$log.users.active = log(activity_counts$users.active, base=10) |
| 60 | +activity_counts$log.users.vf = log(activity_counts$users.vf, base=10) |
| 61 | +activity_counts$log.users.svf = log(activity_counts$users.svf, base=10) |
| 62 | + |
| 63 | +# Raw monthly counts of active editors and vandal fighters. |
| 64 | +png("plots/vandal_fighters.by_month.png", width=1024, height=768) |
| 65 | +plot( |
| 66 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 67 | + (activity_counts$users.active*0)-10000, # off-scale white values: draws only the empty frame and axes |
| 68 | + col="#FFFFFF", |
| 69 | + ylim=c(0, max(activity_counts$users.active)+.5), |
| 70 | + main="Vandal fighters and active editors over time", |
| 71 | + xlab="Time (in months)", |
| 72 | + ylab="Number of users" |
| 73 | +) |
| 74 | +lines( |
| 75 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 76 | + activity_counts$users.active, |
| 77 | + type="o", |
| 78 | + pch=20, |
| 79 | + lty=1, |
| 80 | + col="blue" |
| 81 | +) |
| 82 | +lines( |
| 83 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 84 | + activity_counts$users.vf, |
| 85 | + type="o", |
| 86 | + pch=22, |
| 87 | + lty=2, |
| 88 | + col="red" |
| 89 | +) |
| 90 | +legend( |
| 91 | + max(as.numeric(as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))))-10, |
| 92 | + max(activity_counts$users.active)+.5, |
| 93 | + c("active editors","vandal fighters"), |
| 94 | + cex=1.2, |
| 95 | + col=c("blue","red"), |
| 96 | + pch=c(20,22), # one symbol per entry, matching the lines() calls above |
| 97 | + lty=1:2 |
| 98 | +) |
| 99 | +dev.off() |
| 100 | +# log10 monthly counts of active editors and vandal fighters. |
| 101 | +png("plots/vandal_fighters.by_month.logged.png", width=1024, height=768) |
| 102 | +plot( |
| 103 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 104 | + (activity_counts$log.users.active*0)-10000, # off-scale white values: draws only the empty frame and axes |
| 105 | + col="#FFFFFF", |
| 106 | + ylim=c(0, max(activity_counts$log.users.active)+.5), |
| 107 | + main="Vandal fighters and active editors over time", |
| 108 | + xlab="Time (in months)", |
| 109 | + ylab="Number of users (log10 scaled)" |
| 110 | +) |
| 111 | +lines( |
| 112 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 113 | + activity_counts$log.users.active, |
| 114 | + type="o", |
| 115 | + pch=20, |
| 116 | + lty=1, |
| 117 | + col="blue" |
| 118 | +) |
| 119 | +lines( |
| 120 | + as.factor(paste(activity_counts$year, activity_counts$month, sep="/")), |
| 121 | + activity_counts$log.users.vf, |
| 122 | + type="o", |
| 123 | + pch=22, |
| 124 | + lty=2, |
| 125 | + col="red" |
| 126 | +) |
| 127 | +legend( |
| 128 | + max(as.numeric(as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))))-10, |
| 129 | + max(activity_counts$log.users.active)+.5, |
| 130 | + c("active editors","vandal fighters"), |
| 131 | + cex=1.2, |
| 132 | + col=c("blue","red"), |
| 133 | + pch=c(20,22), # one symbol per entry, matching the lines() calls above |
| 134 | + lty=1:2 |
| 135 | +) |
| 136 | +dev.off() |
| 137 | + |
| 138 | +activity_counts$vandal_fighter_prop = activity_counts$users.vf/activity_counts$users.active |
| 139 | +activity_counts$year.month = as.factor(paste(activity_counts$year, activity_counts$month, sep="/")) |
| 140 | +model = lm( |
| 141 | + vandal_fighter_prop ~ as.numeric(year.month), |
| 142 | + data=activity_counts |
| 143 | +) |
| 144 | +modelSummary = summary(model) |
| 145 | +monthLine = function(x){ |
| 146 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 147 | +} |
| 148 | +png("plots/vandal_fighter_prop.by_month.png", width=1024, height=768) |
| 149 | +x = activity_counts$year.month |
| 150 | +y = activity_counts$vandal_fighter_prop |
| 151 | +plot( |
| 152 | + x, |
| 153 | + (y*0)-10000, |
| 154 | + col="#FFFFFF", |
| 155 | + ylim=c(0, max(y)+.01), |
| 156 | + main="Proportion of active editors who are vandal fighters (no bots)", |
| 157 | + xlab="Month", |
| 158 | + ylab="Prop of of vandal fighters" |
| 159 | +) |
| 160 | +lines( |
| 161 | + x, |
| 162 | + y, |
| 163 | + type="o", |
| 164 | + pch=22, |
| 165 | + lty=2, |
| 166 | + col="red" |
| 167 | +) |
| 168 | +lines( |
| 169 | + x[order(x)], |
| 170 | + monthLine(as.numeric(x[order(x)])), |
| 171 | + lty=1, |
| 172 | + col="black" |
| 173 | +) |
| 174 | +legend( |
| 175 | + mean(as.numeric(x)), |
| 176 | + max(y)+.01, |
| 177 | + paste( |
| 178 | + "R^2=", round(modelSummary$r.squared, 3), |
| 179 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 180 | + " p=", round(modelSummary$coefficients[2,4], 6) |
| 181 | + ), |
| 182 | + cex=1.2, |
| 183 | + col=c("black"), |
| 184 | + lty=1 |
| 185 | +) |
| 186 | +dev.off() |
| 187 | + |
| 188 | + |
| 189 | +activity_counts$super_vandal_fighter_prop = activity_counts$users.svf/activity_counts$users.active |
| 190 | +model = lm( |
| 191 | + super_vandal_fighter_prop ~ as.numeric(year.month), |
| 192 | + data=activity_counts |
| 193 | +) |
| 194 | +modelSummary = summary(model) |
| 195 | +monthLine = function(x){ |
| 196 | + model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x |
| 197 | +} |
| 198 | +png("plots/super_vandal_fighter_prop.by_month.png", width=1024, height=768) |
| 199 | +x = activity_counts$year.month |
| 200 | +y = activity_counts$super_vandal_fighter_prop |
| 201 | +plot( |
| 202 | + x, |
| 203 | + (y*0)-10000, |
| 204 | + col="#FFFFFF", |
| 205 | + ylim=c(0, max(y)+.005), |
| 206 | + main="Proportion of active editors who are super vandal fighters (no bots)", |
| 207 | + sub="super vandal fighter >= 50 vandal reverts per month", |
| 208 | + xlab="Month", |
| 209 | + ylab="Prop of of vandal fighters" |
| 210 | +) |
| 211 | +lines( |
| 212 | + x, |
| 213 | + y, |
| 214 | + type="o", |
| 215 | + pch=22, |
| 216 | + lty=2, |
| 217 | + col="red" |
| 218 | +) |
| 219 | +lines( |
| 220 | + x[order(x)], |
| 221 | + monthLine(as.numeric(x[order(x)])), |
| 222 | + lty=1, |
| 223 | + col="black" |
| 224 | +) |
| 225 | +legend( |
| 226 | + mean(as.numeric(x)), |
| 227 | + max(y)+.005, |
| 228 | + paste( |
| 229 | + "R^2=", round(modelSummary$r.squared, 3), |
| 230 | + " coef=", round(model$coefficients[['as.numeric(year.month)']], 5), |
| 231 | + " p=", round(modelSummary$coefficients[2,4], 6) |
| 232 | + ), |
| 233 | + cex=1.2, |
| 234 | + col=c("black"), |
| 235 | + lty=1 |
| 236 | +) |
| 237 | +dev.off() |
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R |
— | — | @@ -3,7 +3,10 @@ |
4 | 4 | |
5 | 5 | |
6 | 6 | load_reverter_months = function(verbose=T, reload=F){ |
7 | | - filename = paste(DATA_DIR, "en.reverter_months.20110115.tsv", sep="/") |
| 7 | + filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.tsv", sep="/") |
| 8 | + if(!exists("REVERTER_MONTHS")){ |
| 9 | + REVERTER_MONTHS <<- NULL |
| 10 | + } |
8 | 11 | if(is.null(REVERTER_MONTHS) | reload){ |
9 | 12 | REVERTER_MONTHS <<- NULL |
10 | 13 | } |
— | — | @@ -12,8 +15,8 @@ |
13 | 16 | REVERTER_MONTHS <<- read.table( |
14 | 17 | filename, |
15 | 18 | header=T, sep="\t", |
16 | | - quote="'\"", comment.char="", |
17 | | - na.strings="\\N", |
| 19 | + quote="", comment.char="", |
| 20 | + na.strings="\\N" |
18 | 21 | ) |
19 | 22 | if(verbose){cat("DONE!\n")} |
20 | 23 | } |
Index: trunk/tools/wsor/overworked/R/loader/load_reverting_months.R |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +source("util/env.R") |
| 3 | + |
| 4 | + |
| 5 | +# Read the reverting-months TSV once and cache it in the global REVERTING_MONTHS; returns the cached table. |
| 6 | +load_reverting_months = function(verbose=T, reload=F){ # verbose: print progress; reload: drop the cache and re-read the file |
| 7 | + filename = paste(DATA_DIR, "en.reverting_years.20110115.tsv", sep="/") # NOTE(review): file is named *_years* but the loader says months -- confirm; DATA_DIR presumably comes from util/env.R |
| 8 | + if(!exists("REVERTING_MONTHS")){ |
| 9 | + REVERTING_MONTHS <<- NULL |
| 10 | + } |
| 11 | + if(is.null(REVERTING_MONTHS) || reload){ |
| 12 | + REVERTING_MONTHS <<- NULL |
| 13 | + } |
| 14 | + if(is.null(REVERTING_MONTHS)){ |
| 15 | + if(verbose){cat("Loading reverting months from", filename, "...")} |
| 16 | + REVERTING_MONTHS <<- read.table( |
| 17 | + filename, |
| 18 | + header=T, sep="\t", |
| 19 | + quote="'\"", comment.char="", |
| 20 | + na.strings="\\N" |
| 21 | + ) |
| 22 | + if(verbose){cat("DONE!\n")} |
| 23 | + } |
| 24 | + REVERTING_MONTHS |
| 25 | +} |
| 26 | + |
| 27 | + |
Index: trunk/tools/wsor/overworked/R/revert_distributions.R |
— | — | @@ -1,199 +1,43 @@ |
2 | | -source("loader/load_patroller_days.R") |
| 2 | +source("loader/load_reverting_months.R") |
3 | 3 | |
4 | | -patroller_days = load_patroller_days() |
5 | | -patroller_days = patroller_days[!grepl("bot( |$)", patroller_days$username, ignore.case=T),] |
6 | | -patroller_days = patroller_days[!grepl("DASHBot", patroller_days$username, ignore.case=T),] |
| 4 | +reverting_months = load_reverting_months() |
7 | 5 | |
8 | | -library(lattice) |
9 | 6 | library(doBy) |
10 | 7 | |
11 | | - |
12 | | -patroller_years = with( |
| 8 | +reverting_years = with( |
13 | 9 | summaryBy( |
14 | | - count ~ year + user_id + username, |
15 | | - data=patroller_days, |
| 10 | + revisions + reverts + vandalism ~ year, |
| 11 | + data=reverting_months[reverting_months$year <= 2010,], |
16 | 12 | FUN=sum |
17 | 13 | ), |
18 | 14 | data.frame( |
19 | | - year = year, |
20 | | - user_id = user_id, |
21 | | - username = username, |
22 | | - count = count.sum |
| 15 | + year = year, |
| 16 | + revisions = revisions.sum, |
| 17 | + reverts = reverts.sum, |
| 18 | + vandalism = vandalism.sum |
23 | 19 | ) |
24 | 20 | ) |
25 | 21 | |
26 | | -patroller_years = patroller_years[order(patroller_years$count),] |
27 | | -patroller_years$count_bucket = 2^round(log(patroller_years$count, base=2)) |
28 | | - |
29 | | -patroller_years.count_dist = with( |
30 | | - summaryBy( |
31 | | - user_id ~ year + count, |
32 | | - data = patroller_years, |
33 | | - FUN=length |
34 | | - ), |
35 | | - data.frame( |
36 | | - year = year, |
37 | | - count = count, |
38 | | - freq = user_id.length |
39 | | - ) |
| 22 | +png("plots/vandal_revert_trend.by_year.png", width=1024, height=768) |
| 23 | +plot( |
| 24 | + reverting_years$year, reverting_years$reverts, |
| 25 | + type="o", |
| 26 | + pch=20, |
| 27 | + col="blue", |
| 28 | + lty=1, |
| 29 | + main="Reverts and vandalism by year through 2010", |
| 30 | + xlab="Year", |
| 31 | + ylab="Number of reverts" |
40 | 32 | ) |
41 | | - |
42 | | -png('plots/dist.patroller_years_activity.png', height=768, width=1024) |
43 | | -xyplot( |
44 | | - freq ~ count | as.character(year), |
45 | | - data = patroller_years.count_dist, |
46 | | - panel = function(x, y, subscripts, group, ...){ |
47 | | - panel.xyplot(x, y) |
48 | | - panel.lines(x, y) |
49 | | - }, |
50 | | - main="Distribution of activity level among editors", |
51 | | - ylab="Frequency", |
52 | | - xlab="Activity level", |
53 | | - #scales=list( |
54 | | - # x=list( |
55 | | - # log=2, |
56 | | - # at=2^(1:max(patroller_years.count_dist$count)), |
57 | | - # labels=2^(1:max(patroller_years.count_dist$count)) |
58 | | - # ) |
59 | | - #), |
60 | | - layout=c(length(unique(patroller_years.count_dist$year)), 1) |
| 33 | +lines( |
| 34 | + reverting_years$year, reverting_years$vandalism, |
| 35 | + type="o", |
| 36 | + pch=22, |
| 37 | + lty=2, |
| 38 | + col="red" |
61 | 39 | ) |
| 40 | +legend(2001, max(reverting_years$reverts), c("total reverts","vandalism"), cex=1.2, |
| 41 | + col=c("blue","red"), pch=c(20,22), lty=1:2) # pch matches the symbols used by plot() and lines() above |
62 | 42 | dev.off() |
63 | 43 | |
64 | 44 | |
65 | | - |
66 | | -for(year in sort(unique(patroller_years$year))){ |
67 | | - p_year = patroller_years[patroller_years$year==year,] |
68 | | - p_year = p_year[order(p_year$count, decreasing=T),] |
69 | | - png(paste('plots/bars.patroller_years_activity', year, 'png', sep="."), height=768, width=1024) |
70 | | - print(barchart( |
71 | | - reorder(substring(as.character(username),1,30), count) ~ count, |
72 | | - data=p_year[1:50,], |
73 | | - horizontal=T, |
74 | | - xlim=c(0, 110000), |
75 | | - xlab="Patrolled pages" |
76 | | - )) |
77 | | - dev.off() |
78 | | - cat(year, "\n") |
79 | | - print(summary(p_year$count)) |
80 | | -} |
81 | | - |
82 | | - |
83 | | -patroller_months = with( |
84 | | - summaryBy( |
85 | | - count ~ year + month + user_id + username, |
86 | | - data=patroller_days, |
87 | | - FUN=sum |
88 | | - ), |
89 | | - data.frame( |
90 | | - year = year, |
91 | | - month = month, |
92 | | - user_id = user_id, |
93 | | - username = username, |
94 | | - count = count.sum |
95 | | - ) |
96 | | -) |
97 | | - |
98 | | -nNoNA = function(x){ |
99 | | - length(subset(x, !is.na(x))) |
100 | | -} |
101 | | -sdNoNA = function(x){ |
102 | | - sd(x, na.rm=T)/sqrt(nNoNA(x)) |
103 | | -} |
104 | | -meanNoNA = function(x){ |
105 | | - mean(x, na.rm=T) |
106 | | -} |
107 | | - |
# ---------------------------------------------------------------------------
# Patroller workload per month
# ---------------------------------------------------------------------------

# One row per (year, month) with the mean, standard deviation and number of
# observations of `count` in patroller_months.  meanNoNA / sdNoNA / nNoNA are
# helpers defined outside this section (presumably NA-ignoring variants of
# mean / sd / length -- confirm against their definitions).
patrol_months.per_user = with(
    summaryBy(
        count ~ year + month,
        data=patroller_months,
        FUN=c(meanNoNA, sdNoNA, nNoNA)
    ),
    data.frame(
        year = year,
        month = month,
        # Sortable numeric key, e.g. May 2011 -> 2011.05.
        year.month = year + month/100,
        count.mean = count.meanNoNA,
        count.sd = count.sdNoNA,
        count.n = count.nNoNA
    )
)

# Linear trend of the monthly mean against the ordinal position of the month,
# restricted to months up to and including 2011.05.
model = lm(
    count.mean ~ as.numeric(factor(year.month)),
    data=patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,]
)
summary(model)

# Fitted value of the monthly trend at ordinal month position x.
# Reads the global `model`, so it must be used before `model` is reassigned
# by the yearly fits below.
monthLine = function(x){
    model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(factor(year.month))']]*x
}

png("plots/patrol_months.per_user.png", height=768, width=1024)
print(xyplot(
    count.mean ~ as.factor(year.month),
    data = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,],
    panel = function(x, y, subscripts, ...){
        # Rows backing the plotted points; needed for the standard errors.
        f = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,][subscripts,]
        panel.xyplot(x, y, col="#000000", ...)
        se = f$count.sd/sqrt(f$count.n)
        # Error bars of +/- one standard error around each monthly mean.
        panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
        panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
        # Fitted linear trend.
        panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
    },
    #main="Average Patroller workload by month",
    ylab="Mean patrolled pages per user",
    xlab="Month",
    scales=list(x=list(rot=45))
))
dev.off()

# ---------------------------------------------------------------------------
# Patroller workload per year
# ---------------------------------------------------------------------------

# One row per year with the mean, standard deviation and number of
# observations of `count` in patroller_years.
patrol_years.per_user = with(
    summaryBy(
        count ~ year,
        data=patroller_years,
        FUN=c(meanNoNA, sdNoNA, nNoNA)
    ),
    data.frame(
        year = year,
        count.mean = count.meanNoNA,
        count.sd = count.sdNoNA,
        count.n = count.nNoNA
    )
)

# Linear fit, reported for comparison only; `model` is overwritten below.
model = lm(
    count.mean ~ year,
    data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
)
summary(model)

# Logarithmic fit on years elapsed since 2006; this is the curve that is drawn.
model = lm(
    count.mean ~ log(year-2006, base=2),
    data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
)
summary(model)

# Fitted value of the logarithmic model for calendar year x (x > 2006).
yearCurve=function(x){
    model$coefficients[['(Intercept)']] + log(x-2006, base=2)*model$coefficients[['log(year - 2006, base = 2)']]
}
png("plots/patrol_years.per_user.png", height=768, width=1024)
print(xyplot(
    count.mean ~ year-2006,
    data = patrol_years.per_user[patrol_years.per_user$year <= 2010,],
    panel = function(x, y, subscripts, ...){
        # Must be the same subset that is passed as `data` above.  The yearly
        # frame has no `year.month` column, so filtering on it produced an
        # empty frame and the error bars were silently lost.
        f = patrol_years.per_user[patrol_years.per_user$year <= 2010,][subscripts,]
        panel.xyplot(x, y, col="#000000", ...)
        se = f$count.sd/sqrt(f$count.n)
        # Error bars of +/- one standard error around each yearly mean.
        panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
        #panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
        #panel.curve(myCurve, 2006, 2011, col="#000000")
        # x positions are years since 2006, hence seq(0, 5) against 2006..2011.
        panel.lines(seq(0, 5, .1), yearCurve(seq(2006, 2011, .1)), lwd=2, col="#000000")
    },
    #main="Average Patroller workload by year",
    ylab="Mean patrolled pages per user",
    xlab="Year (log scaled)",
    pch=20,
    # Tick positions 1..5 are years since 2006, i.e. 2007..2011; `at` and
    # `labels` must have the same length.
    scales=list(x=list(at=1:5, labels=2007:2011))
))
dev.off()
200 | | - |
Index: trunk/tools/wsor/overworked/remove_username.py |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +import sys, argparse |
| 3 | + |
def encode(val):
    """Return ``str(val)`` with backslashes and control characters escaped.

    Tabs and newlines inside a value become the two-character sequences
    ``\\t`` and ``\\n``, so a value can never break the tab-separated,
    line-oriented output format.

    :param val: any value; it is converted with ``str()`` first.
    :returns: the escaped text.
    """
    text = str(val)
    try:
        # Python 2 byte-string codec; output is unchanged from the original.
        return text.encode("string_escape")
    except LookupError:
        # Python 3 has no "string_escape" codec.  "unicode_escape" is the
        # closest equivalent; unlike string_escape it leaves single quotes
        # unescaped.
        return text.encode("unicode_escape").decode("ascii")
| 6 | + |
def main(args):
    """Rewrite a tab-separated dataset of Python literals as escaped text.

    Every line of ``args.input`` (the header line included) is split on tabs,
    each field is parsed as a Python literal (e.g. ``'some name'`` or ``42``),
    and the parsed values are printed tab-separated after being passed
    through ``encode``.

    :param args: parsed command-line arguments; ``args.input`` is an open,
        readable text file.
    :raises ValueError, SyntaxError: when a field is not a valid Python
        literal.
    """
    # Local import keeps this function self-contained.
    import ast

    # The header needs no special treatment, so one loop covers all lines;
    # an empty input now produces no output instead of failing on the header.
    for line in args.input:
        # NOTE: this used to be eval(), which executes arbitrary expressions
        # found in the input file.  ast.literal_eval accepts only literals
        # (strings, numbers, None, ...), which is all a dataset should hold.
        vals = [ast.literal_eval(v) for v in line.strip().split("\t")]
        print("\t".join(encode(v) for v in vals))
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | + |
if __name__ == "__main__":
    # Command-line entry point: build the option parser, then hand the
    # parsed arguments to main().
    open_for_reading = lambda fn: open(fn, "r")

    cli = argparse.ArgumentParser(description='Removes quotes from dataset')
    cli.add_argument(
        '-i',
        '--input',
        default=sys.stdin,
        type=open_for_reading,
        metavar="<path>",
        help='the path of the file to filter (defaults to stdin)'
    )
    main(cli.parse_args())
Index: trunk/tools/wsor/overworked/remove_quotes.py |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +import sys, argparse |
| 3 | + |
def encode(val):
    """Return ``str(val)`` with backslashes and control characters escaped.

    Tabs and newlines inside a value become the two-character sequences
    ``\\t`` and ``\\n``, so a value can never break the tab-separated,
    line-oriented output format.

    :param val: any value; it is converted with ``str()`` first.
    :returns: the escaped text.
    """
    text = str(val)
    try:
        # Python 2 byte-string codec; output is unchanged from the original.
        return text.encode("string_escape")
    except LookupError:
        # Python 3 has no "string_escape" codec.  "unicode_escape" is the
        # closest equivalent; unlike string_escape it leaves single quotes
        # unescaped.
        return text.encode("unicode_escape").decode("ascii")
| 6 | + |
def main(args):
    """Rewrite a tab-separated dataset of Python literals as escaped text.

    Every line of ``args.input`` (the header line included) is split on tabs,
    each field is parsed as a Python literal (e.g. ``'some name'`` or ``42``),
    and the parsed values are printed tab-separated after being passed
    through ``encode``.

    :param args: parsed command-line arguments; ``args.input`` is an open,
        readable text file.
    :raises ValueError, SyntaxError: when a field is not a valid Python
        literal.
    """
    # Local import keeps this function self-contained.
    import ast

    # The header needs no special treatment, so one loop covers all lines;
    # an empty input now produces no output instead of failing on the header.
    for line in args.input:
        # NOTE: this used to be eval(), which executes arbitrary expressions
        # found in the input file.  ast.literal_eval accepts only literals
        # (strings, numbers, None, ...), which is all a dataset should hold.
        vals = [ast.literal_eval(v) for v in line.strip().split("\t")]
        print("\t".join(encode(v) for v in vals))
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | + |
if __name__ == "__main__":
    # Command-line entry point: build the option parser, then hand the
    # parsed arguments to main().
    open_for_reading = lambda fn: open(fn, "r")

    cli = argparse.ArgumentParser(description='Removes quotes from dataset')
    cli.add_argument(
        '-i',
        '--input',
        default=sys.stdin,
        type=open_for_reading,
        metavar="<path>",
        help='the path of the file to filter (defaults to stdin)'
    )
    main(cli.parse_args())
Index: trunk/tools/wsor/overworked/remove_bots.py |
— | — | @@ -8,12 +8,12 @@ |
9 | 9 | bots.add(int(line.strip())) |
10 | 10 | |
11 | 11 | headerLine = args.input.readline().strip() |
12 | | - headers = [eval(h) for h in headerLine.split("\t")] |
| 12 | + headers = headerLine.split("\t") |
13 | 13 | print(headerLine) |
14 | 14 | |
15 | 15 | for line in args.input: |
16 | | - row = dict(zip(headers, [eval(v) for v in line.strip().split("\t")])) |
17 | | - if row['user_id'] not in bots: |
| 16 | + row = dict(zip(headers, line.strip().split("\t"))) |
| 17 | + if int(row['user_id']) not in bots: |
18 | 18 | print(line.strip()) |
19 | 19 | |
20 | 20 | |
— | — | @@ -21,8 +21,7 @@ |
22 | 22 | |
23 | 23 | if __name__ == "__main__": |
24 | 24 | parser = argparse.ArgumentParser( |
25 | | - description= |
26 | | - 'Removes bot editors from patrollers file' |
| 25 | + description='Removes bot editors from patrollers file' |
27 | 26 | ) |
28 | 27 | parser.add_argument( |
29 | 28 | 'bots', |
Index: trunk/tools/wsor/overworked/testing.sql |
— | — | @@ -90,7 +90,7 @@ |
91 | 91 | ) |
92 | 92 | |
93 | 93 | |
94 | | -CREATE TABLE halfak.revert_pre_20110115( |
| 94 | +CREATE TABLE halfak.revert_20110115( |
95 | 95 | revision_id INT, |
96 | 96 | rvtd_to_id INT, |
97 | 97 | revs_reverted INT |
— | — | @@ -143,32 +143,13 @@ |
144 | 144 | INNER JOIN revision reverted |
145 | 145 | ON r.revision_id = reverted.rev_id |
146 | 146 | INNER JOIN revision reverting |
147 | | - ON r.revision_id = reverting.rev_id |
| 147 | + ON r.rvtg_id = reverting.rev_id |
148 | 148 | INNER JOIN revision reverted_to |
149 | | - ON r.revision_id = reverted_to.rev_id; |
150 | | -CREATE INDEX rev_id_idx ON halfak.reverted_20110115 (revision_id); |
151 | | -CREATE INDEX rvtg_id_idx ON halfak.reverted_20110115 (rvtg_id); |
| 149 | + ON r.rvtd_to_id = reverted_to.rev_id; |
152 | 150 | |
153 | 151 | |
154 | | -CREATE TABLE halfak.revert_20110115( |
155 | | - revision_id INT, |
156 | | - rvtto_id INT, |
157 | | - is_vandalism BOOL, |
158 | | - revs_reverted INT |
159 | | -); |
160 | | -INSERT INTO halfak.revert_20110115 |
161 | | -SELECT |
162 | | - rvt.revision_id, |
163 | | - rvt.rvtd_to_id, |
164 | | - bit_or(rvtd.is_vandalism), |
165 | | - rvt.revs_reverted |
166 | | -FROM halfak.revert_pre_20110115 rvt |
167 | | -INNER JOIN halfak.reverted_20110115 rvtd |
168 | | - ON rvt.revision_id = rvtd.rvtg_id |
169 | | -GROUP BY rvt.revision_id, rvt.rvtd_to_id, rvt.revs_reverted; |
170 | | -CREATE INDEX rev_id_idx ON halfak.revert_20110115 (revision_id); |
171 | | -CREATE INDEX is_vandalism ON halfak.revert_20110115 (is_vandalism); |
172 | 152 | |
| 153 | + |
173 | 154 | |
174 | 155 | |
175 | 156 | |
— | — | @@ -181,7 +162,7 @@ |
182 | 163 | u.user_name as username, |
183 | 164 | COUNT(*) as revisions, |
184 | 165 | SUM(rvt.revision_id IS NOT NULL) as reverts, |
185 | | - SUM(rvt.revision_id IS NOT NULL AND rvt.is_vandalism) as vandal_reverts |
| 166 | + SUM(rvt.is_vandalism) as vandal_reverts |
186 | 167 | FROM revision r |
187 | 168 | LEFT JOIN halfak.revert_20100130 rvt |
188 | 169 | ON r.rev_id = rvt.revision_id |
— | — | @@ -190,3 +171,22 @@ |
191 | 172 | WHERE rev_timestamp < "20110000000000" |
192 | 173 | GROUP BY SUBSTRING(rev_timestamp, 1,4), rev_user, u.user_name |
193 | 174 | |
| 175 | + |
-- Revert counts per calendar month.
-- Assumes rev_timestamp is a 14-character YYYYMMDDHHMMSS string, as the
-- "20110000000000" comparison earlier in this file suggests: characters 1-4
-- are the year and characters 5-6 the month.  The previous second key,
-- SUBSTR(rev_timestamp, 1,2), is only the century prefix and is constant
-- within each year group, so it added nothing to the grouping.
SELECT
    SUBSTR(rev_timestamp, 1, 4),
    SUBSTR(rev_timestamp, 5, 2),
    count(*),
    sum(revision_id IS NOT NULL),
    sum(revision_id IS NOT NULL AND is_vandalism)
FROM halfak.revert_20110115 rvt
INNER JOIN revision r ON rvt.revision_id = r.rev_id
GROUP BY
    SUBSTR(rev_timestamp, 1, 4),
    SUBSTR(rev_timestamp, 5, 2);
| 187 | + |
| 188 | + |
| 189 | + |
| 190 | + |
| 191 | +"(Reverted ([0-9]+ )?edits by \[\[Special:Contributions/[^\|]+\|[^\]]+]] \(\[\[User talk:[^\|]+\|talk\]\]\) to last version by .+)|" + |
| 192 | +"(Message re. \[\[[^\]]+\]\])|" + |
| 193 | +"(Level [0-9]+ warning re. \[\[[^\]]+\]\]" |
Index: trunk/tools/wsor/scripts/classes/tests/test_file_wrapper.py |
— | — | @@ -0,0 +1,24 @@ |
| 2 | +from StringIO import StringIO |
| 3 | +from nose.tools import eq_ |
| 4 | +from ..file_wrapper import FileWrapper |
| 5 | + |
def test_file_wrapper():
    """FileWrapper reads as prefix + wrapped file + suffix."""
    body = "herp\nderp\n"
    pre = "foo\nbar\nbaz\n"
    post = "foobar\n"
    concat = pre + body + post

    # One unbounded read returns the whole concatenation.
    whole = FileWrapper(StringIO(body), pre, post)
    eq_(whole.read(), concat)

    # Fixed-size reads walk through the concatenation in order, cross the
    # prefix/file/suffix boundaries and return '' once it is exhausted.
    chunked = FileWrapper(StringIO(body), pre, post)
    for start in range(0, 100, 5):
        eq_(chunked.read(5), concat[start:start + 5])
| 25 | + |
Index: trunk/tools/wsor/scripts/classes/tests/__init__.py |
Index: trunk/tools/wsor/scripts/classes/__init__.py |
Index: trunk/tools/wsor/scripts/classes/file_wrapper.py |
— | — | @@ -0,0 +1,61 @@ |
| 2 | +import sys |
| 3 | +from StringIO import StringIO |
| 4 | + |
class FileWrapper(object):
    """Read-only file-like object that surrounds a file with extra text.

    Reading from the wrapper yields ``pre``, then the content of ``fp``,
    then ``post``, as if the three were one continuous stream.
    """

    def __init__(self, fp, pre='', post=''):
        """
        :param fp: readable file-like object to wrap (needs ``read``,
            ``readline``, ``tell`` and ``close``).
        :param pre: text served before the content of ``fp``.
        :param post: text served after the content of ``fp``.
        """
        self.fp = fp
        self.pre = StringIO(pre)
        self.post = StringIO(post)
        self.closed = False
        self.mode = "r"

    def _sources(self):
        """Return the three underlying streams in reading order."""
        return (self.pre, self.fp, self.post)

    def _check_open(self):
        """Raise ValueError, like a real file does, once the wrapper is closed."""
        if self.closed:
            raise ValueError("I/O operation on closed file")

    # The parameter keeps its original name for callers that pass it by
    # keyword, even though it shadows the ``bytes`` builtin.
    def read(self, bytes=sys.maxsize):
        """Read up to ``bytes`` characters from the combined stream.

        :param bytes: maximum number of characters to return; ``None`` or a
            negative value means "everything that is left".
        :returns: the text read, ``''`` at end of stream.
        :raises ValueError: if the wrapper has been closed.
        """
        self._check_open()

        if bytes is None or int(bytes) < 0:
            return "".join(source.read() for source in self._sources())

        remaining = int(bytes)
        chunks = []
        for source in self._sources():
            if remaining <= 0:
                break
            chunk = source.read(remaining)
            chunks.append(chunk)
            remaining -= len(chunk)
        return "".join(chunks)

    def readline(self):
        """Read one line, joining text across stream boundaries.

        If a stream ends without a trailing newline the line continues with
        the start of the next stream.

        :returns: the line including its newline, ``''`` at end of stream.
        :raises ValueError: if the wrapper has been closed.
        """
        self._check_open()

        output = ''
        for source in self._sources():
            output += source.readline()
            if output.endswith("\n"):
                break
        return output

    def readlines(self, *args, **kwargs):
        raise NotImplementedError()

    def __iter__(self):
        """Yield the remaining lines one at a time."""
        line = self.readline()
        while line != '':
            yield line
            line = self.readline()

    # The wrapper is sequential and read-only.  The stubs accept any
    # arguments so that a call such as seek(0) reports NotImplementedError
    # instead of a confusing TypeError about the argument count.
    def seek(self, *args, **kwargs):
        raise NotImplementedError()

    def write(self, *args, **kwargs):
        raise NotImplementedError()

    def writelines(self, *args, **kwargs):
        raise NotImplementedError()

    def tell(self):
        """Return the sum of the positions of the three underlying streams."""
        return self.pre.tell() + self.fp.tell() + self.post.tell()

    def close(self):
        """Mark the wrapper as closed and close the wrapped file."""
        self.closed = True
        self.fp.close()