r91507 MediaWiki - Code Review archive

Revision:r91506‎ | r91507 | r91508 >
Date:00:20, 6 July 2011
Added file wrapper class (tested) and R work for overworked
Modified paths:
  • /trunk/tools/wsor/overworked/R/loader/load_reverter_months.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/loader/load_reverting_months.R (added) (history)
  • /trunk/tools/wsor/overworked/R/revert_distributions.R (modified) (history)
  • /trunk/tools/wsor/overworked/R/revert_fighters.R (added) (history)
  • /trunk/tools/wsor/overworked/R/reverter_work.R (added) (history)
  • /trunk/tools/wsor/overworked/R/vandal_fighters.R (added) (history)
  • /trunk/tools/wsor/overworked/remove_bots.py (modified) (history)
  • /trunk/tools/wsor/overworked/remove_quotes.py (added) (history)
  • /trunk/tools/wsor/overworked/remove_username.py (added) (history)
  • /trunk/tools/wsor/overworked/testing.sql (modified) (history)
  • /trunk/tools/wsor/scripts/classes/__init__.py (added) (history)
  • /trunk/tools/wsor/scripts/classes/file_wrapper.py (added) (history)
  • /trunk/tools/wsor/scripts/classes/tests (added) (history)
  • /trunk/tools/wsor/scripts/classes/tests/__init__.py (added) (history)
  • /trunk/tools/wsor/scripts/classes/tests/test_file_wrapper.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/overworked/R/reverter_work.R
@@ -0,0 +1,143 @@
 7+reverter_months = load_reverter_months()
 8+reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),]
 9+reverter_months = reverter_months[reverter_months$username != "DASHBotAV",]
 10+reverter_months$active = reverter_months$revisions >= 5
 11+reverter_months$year.month = with(
 12+ reverter_months,
 13+ as.factor(paste(year, month, sep="/"))
 16+vfer_years = with(
 17+ summaryBy(
 18+ reverts ~ year + user_id + username,
 19+ data=reverter_months,
 20+ FUN=sum
 21+ ),
 22+ data.frame(
 23+ year = year,
 24+ user_id = user_id,
 25+ username = username,
 26+ reverts = reverts.sum
 27+ )
 30+top_vfers = data.frame()
 32+for(year in unique(vfer_years$year)){
 33+ vfer_year = vfer_years[vfer_years$year==year,]
 34+ vfer_year = vfer_year[order(vfer_year$reverts, decreasing=T),]
 35+ png(paste('plots/vandal_fighter_activity', 'no_bots', year, 'png', sep="."), height=768, width=1024)
 36+ print(barchart(
 37+ reorder(username, reverts) ~ reverts,
 38+ data=vfer_year[1:50,],
 39+ horizontal=T,
 40+ xlab="Vandal reverts",
 41+ xlim=c(0,50000)
 42+ ))
 43+ dev.off()
# Format a p-value for display in a plot annotation.
#
# pval: a single numeric p-value.
#
# Returns the string "< .001" when pval is below .001, otherwise the
# string "= <pval>".
format_p = function(pval){
    # Compare against a number.  Comparing against the string ".001" makes R
    # coerce pval to character and compare the two strings lexically, which
    # does not order them by magnitude.
    if(pval < .001){
        "< .001"
    }else{
        paste("=", pval)
    }
}
 55+activity_months = summaryBy(
 56+ vandal_reverts + reverts + revisions ~ year.month,
 57+ data=reverter_months,
 58+ FUN=c(mean, sd, length)
# Plot a per-month mean with standard-error bars and a fitted linear trend.
#
# year.month: month factor used on the x axis.
# m:          per-month means (plotted on the y axis).
# s:          per-month standard deviations.
# n:          per-month observation counts.
# name:       label inserted into the y-axis title.
#
# The fit is annotated with its R^2, slope coefficient and p-value.
# NOTE(review): lm() is given data=activity_months, so `year.month` in the
# formula resolves to that data frame's column first -- presumably the same
# values the caller passes in; confirm against the call sites.
plot_activity_mean = function(year.month, m, s, n, name){
    model = lm(
        m ~ as.numeric(year.month),
        data=activity_months
    )
    # Keep the summary: the panel function below reads R^2 and the p-value
    # from it.  Previously the result of summary(model) was discarded and
    # `modelSummary` was undefined in this scope.
    modelSummary = summary(model)
    monthLine = function(x){
        model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
    }

    print(xyplot(
        m ~ as.factor(year.month),
        panel = function(x, y, subscripts, ...){
            panel.xyplot(x, y, ...)
            # Standard error of the mean for each plotted point.
            se = s[subscripts]/sqrt(n[subscripts])
            panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
            panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
            # Fitted regression line.
            panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
            grid.text(
                paste(
                    "R^2=", round(modelSummary$r.squared, 3),
                    " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
                    " p=", round(modelSummary$coefficients[2,4], 8)
                ),
                .5,
                .95
            )
        },
        #main="Average Patroller workload by month",
        ylab=paste("Mean", name, "per user-month"),
        xlab="Month",
        scales=list(x=list(rot=45)),
        ylim=c(0, max(m)*1.1)
    ))
}
 98+png("plots/reverting_revisions.per_user_month.png", height=768, width=1024)
 100+ activity_months,
 101+ plot_activity_mean(year.month, reverts.mean, reverts.sd, reverts.length, "reverting revisions")
 105+png("plots/vandal_reverting_revisions.per_user_month.png", height=768, width=1024)
 107+ activity_months,
 108+ plot_activity_mean(year.month, vandal_reverts.mean, vandal_reverts.sd, vandal_reverts.length, "vandal reverting revisions")
 112+png("plots/revisions.per_user_month.png", height=768, width=1024)
 114+ activity_months,
 115+ plot_activity_mean(year.month, revisions.mean, revisions.sd, revisions.length, "revisions")
 120+top_vfers = data.frame()
 121+for(year.month in unique(reverter_months$year.month)){
 122+ month_vfers = reverter_months[reverter_months$year.month == year.month,]
 123+ cat("Adding", year.month, "...")
 124+ top_vfers = rbind(
 125+ top_vfers,
 126+ month_vfers[order(month_vfers$reverts),][1:50,]
 127+ )
 128+ cat("DONE!\n")
 131+top_activity_months = summaryBy(
 132+ vandal_reverts + reverts + revisions ~ year.month,
 133+ data=reverter_months,
 134+ FUN=c(mean, sd, length)
Index: trunk/tools/wsor/overworked/R/vandal_fighters.R
@@ -0,0 +1,302 @@
 4+reverter_months = load_reverter_months()
 5+reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),]
 6+reverter_months$vf5 = reverter_months$vandal_reverts >= 5
 7+reverter_months$vf50 = reverter_months$vandal_reverts >= 50
 8+reverter_months$vf500 = reverter_months$vandal_reverts >= 500
 9+reverter_months$r5 = reverter_months$reverts >= 5
 10+reverter_months$r50 = reverter_months$reverts >= 50
 11+reverter_months$r500 = reverter_months$reverts >= 500
 13+reverter_months$active = reverter_months$revisions >= 5
 14+reverter_months$year.month = with(
 15+ reverter_months,
 16+ as.factor(paste(year, month, sep="/"))
 22+activity_counts = with(
 23+ summaryBy(
 24+ active + vf5 + vf50 + vf500 + r5 + r50 + r500 ~ year.month,
 25+ data=reverter_months[reverter_months$active,],
 26+ FUN=sum
 27+ ),
 28+ data.frame(
 29+ year = year,
 30+ month = month,
 31+ active.users = active.sum,
 32+ vf5.users = vf5.sum,
 33+ vf50.users = vf50.sum,
 34+ vf500.users = vf500.sum,
 35+ r5.users = r5.sum,
 36+ r50.users = r50.sum,
 37+ r500.users = r500.sum,
 38+ year.month = as.factor(paste(year, month, sep="/"))
 39+ )
 41+activity_counts$log.users.active = log(activity_counts$users.active, base=10)
 42+activity_counts$log.v5.users = log(activity_counts$vf5.users, base=10)
 43+activity_counts$log.v50.users = log(activity_counts$vf50.users, base=10)
 46+png("plots/vandal_fighters.by_month.png", width=1024, height=768)
 48+ activity_counts$year.month,
 49+ (activity_counts$active.users*0)-10000,
 50+ col="#FFFFFF",
 51+ ylim=c(0, max(activity_counts$users.active)+.5),
 52+ main="Vandal fighters and active editors over time",
 53+ xlab="Time (in months)",
 54+ ylab="Number of users (log10 scaled)"
 57+ activity_counts$year.month,
 58+ activity_counts$active.users,
 59+ type="o",
 60+ pch=20,
 61+ lty=1,
 62+ col="blue"
 65+ activity_counts$year.month,
 66+ activity_counts$vf5.users,
 67+ type="o",
 68+ pch=22,
 69+ lty=2,
 70+ col="red"
 73+ max(as.numeric(activity_counts$year.month))-10,
 74+ max(activity_counts$active.users)+.5,
 75+ c("active editors","vandal fighters"),
 76+ cex=1.2,
 77+ col=c("blue","red"),
 78+ pch=20:21,
 79+ lty=1:2
 83+png("plots/vandal_fighters.by_month.logged.png", width=1024, height=768)
 85+ activity_counts$year.month,
 86+ (activity_counts$log.users.active*0)-10000,
 87+ col="#FFFFFF",
 88+ ylim=c(0, max(activity_counts$log.users.active)+.5),
 89+ main="Vandal fighters and active editors over time",
 90+ xlab="Time (in months)",
 91+ ylab="Number of users (log10 scaled)"
 94+ activity_counts$year.month,
 95+ activity_counts$log.active.users,
 96+ type="o",
 97+ pch=20,
 98+ lty=1,
 99+ col="blue"
 102+ activity_counts$year.month,
 103+ activity_counts$log.vf5.users,
 104+ type="o",
 105+ pch=22,
 106+ lty=2,
 107+ col="red"
 110+ max(as.numeric(activity_counts$year.month))-10,
 111+ max(activity_counts$log.active.users)+.5,
 112+ c("active editors","vandal fighters"),
 113+ cex=1.2,
 114+ col=c("blue","red"),
 115+ pch=20:21,
 116+ lty=1:2
# Plot a monthly proportion with a fitted linear trend and write it to a PNG.
#
# year.month: month factor used on the x axis.
# prop:       proportion for each month.
# name:       file name stem; the plot is written to
#             plots/<name>_prop.by_month.png
# desc:       phrase inserted into the plot title.
#
# The legend reports the fit's R^2, slope coefficient and p-value.
plot_prop_with_regression = function(year.month, prop, name, desc){
    model = lm(
        prop ~ as.numeric(year.month)
    )
    modelSummary = summary(model)
    monthLine = function(x){
        model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
    }
    png(paste("plots/", name, "_prop.by_month.png", sep=""), width=1024, height=768)
    # Draw an empty frame first (points are placed far below the visible
    # range and coloured white); the data is added with lines() below.
    plot(
        year.month,
        (prop*0)-10000,
        col="#FFFFFF",
        ylim=c(0, max(prop)*1.1),
        main=paste("Proportion of active editors who", desc, "(no bots)"),
        xlab="Month",
        ylab="Prop of vandal fighters"
    )
    # Observed proportions.
    lines(
        year.month,
        prop,
        type="o",
        pch=22,
        lty=2,
        col="red"
    )
    # Fitted regression line.
    lines(
        year.month[order(year.month)],
        monthLine(as.numeric(year.month[order(year.month)])),
        lty=1,
        col="black"
    )
    legend(
        mean(as.numeric(year.month)),
        max(prop)*1.1,
        paste(
            "R^2=", round(modelSummary$r.squared, 3),
            " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
            " p=", round(modelSummary$coefficients[2,4], 5)
        ),
        cex=1.2,
        col=c("black"),
        lty=1
    )
    dev.off()
}
 167+ activity_counts$year.month,
 168+ activity_counts$vf5.users/activity_counts$active.users,
 169+ "vandal_5",
 170+ "revert >=5 vandals per month"
 173+ activity_counts$year.month,
 174+ activity_counts$vf50.users/activity_counts$active.users,
 175+ "vandal_50",
 176+ "revert >=50 vandals per month"
 179+ activity_counts$year.month,
 180+ activity_counts$vf500.users/activity_counts$active.users,
 181+ "vandal_50",
 182+ "revert >=500 vandals per month"
 185+ activity_counts$year.month,
 186+ activity_counts$r5.users/activity_counts$active.users,
 187+ "revert_5",
 188+ "revert >=5 times per month"
 191+ activity_counts$year.month,
 192+ activity_counts$r50.users/activity_counts$active.users,
 193+ "revert_50",
 194+ "revert >=50 times per month"
 197+ activity_counts$year.month,
 198+ activity_counts$r500.users/activity_counts$active.users,
 199+ "revert_500",
 200+ "revert >=500 times per month"
 204+activity_counts$vandal_fighter_prop = activity_counts$users.vf/activity_counts$users.active
 205+activity_counts$year.month = as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))
 206+model = lm(
 207+ vandal_fighter_prop ~ as.numeric(year.month),
 208+ data=activity_counts
 210+modelSummary = summary(model)
 211+monthLine = function(x){
 212+ model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
 214+png("plots/vandal_fighter_prop.by_month.png", width=1024, height=768)
 215+x = activity_counts$year.month
 216+y = activity_counts$vandal_fighter_prop
 218+ x,
 219+ (y*0)-10000,
 220+ col="#FFFFFF",
 221+ ylim=c(0, max(y)+.01),
 222+ main="Proportion of active editors who are vandal fighters (no bots)",
 223+ xlab="Month",
 224+ ylab="Prop of of vandal fighters"
 227+ x,
 228+ y,
 229+ type="o",
 230+ pch=22,
 231+ lty=2,
 232+ col="red"
 235+ x[order(x)],
 236+ monthLine(as.numeric(x[order(x)])),
 237+ lty=1,
 238+ col="black"
 241+ mean(as.numeric(x)),
 242+ max(y)+.01,
 243+ paste(
 244+ "R^2=", round(modelSummary$r.squared, 3),
 245+ " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
 246+ " p=", round(modelSummary$coefficients[2,4], 5)
 247+ ),
 248+ cex=1.2,
 249+ col=c("black"),
 250+ lty=1
 255+activity_counts$super_vandal_fighter_prop = activity_counts$users.vf/activity_counts$users.active
 256+model = lm(
 257+ super_vandal_fighter_prop ~ as.numeric(year.month),
 258+ data=activity_counts
 260+modelSummary = summary(model)
 261+monthLine = function(x){
 262+ model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
 264+png("plots/super_vandal_fighter_prop.by_month.png", width=1024, height=768)
 265+x = activity_counts$year.month
 266+y = activity_counts$super_vandal_fighter_prop
 268+ x,
 269+ (y*0)-10000,
 270+ col="#FFFFFF",
 271+ ylim=c(0, max(y)+.01),
 272+ main="Proportion of active editors who are super vandal fighters (no bots)",
 273+ sub="super vandal fighter >= 50 vandal reverts per month",
 274+ xlab="Month",
 275+ ylab="Prop of of vandal fighters"
 278+ x,
 279+ y,
 280+ type="o",
 281+ pch=22,
 282+ lty=2,
 283+ col="red"
 286+ x[order(x)],
 287+ monthLine(as.numeric(x[order(x)])),
 288+ lty=1,
 289+ col="black"
 292+ mean(as.numeric(x)),
 293+ max(y)+.01,
 294+ paste(
 295+ "R^2=", round(modelSummary$r.squared, 3),
 296+ " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
 297+ " p=", round(modelSummary$coefficients[2,4], 5)
 298+ ),
 299+ cex=1.2,
 300+ col=c("black"),
 301+ lty=1
Index: trunk/tools/wsor/overworked/R/revert_fighters.R
@@ -0,0 +1,236 @@
 4+reverter_months = load_reverter_months()
 5+reverter_months = reverter_months[!grepl("bot( |$|[^a-z])", reverter_months$username, ignore.case=T),]
 6+reverter_months$vfighter = reverter_months$vandal_reverts >= 5
 7+reverter_months$svfighter = reverter_months$vandal_reverts >= 50
 8+reverter_months$sdvfighter = reverter_months$vandal_reverts >= 500
 9+reverter_months$reverter = reverter_months$reverts >= 5
 10+reverter_months$active = reverter_months$revisions >= 5
 15+vfighter_counts = with(
 16+ summaryBy(
 17+ user_id ~ year + month + vfighter,
 18+ data=reverter_months[reverter_months$active,],
 19+ FUN=length
 20+ ),
 21+ data.frame(
 22+ year = year,
 23+ month = month,
 24+ vfighter = vfighter,
 25+ users = user_id.length
 26+ )
 28+svfighter_counts = with(
 29+ summaryBy(
 30+ user_id ~ year + month + svfighter,
 31+ data=reverter_months[reverter_months$active,],
 32+ FUN=length
 33+ ),
 34+ data.frame(
 35+ year = year,
 36+ month = month,
 37+ svfighter = svfighter,
 38+ users = user_id.length
 39+ )
 42+activity_counts = merge(
 43+ merge(
 44+ vfighter_counts[vfighter_counts$vfighter,],
 45+ vfighter_counts[!vfighter_counts$vfighter,],
 46+ by=c("year", "month"),
 47+ suffixes=c(".vf", ".nonvf")
 48+ ),
 49+ merge(
 50+ svfighter_counts[svfighter_counts$svfighter,],
 51+ svfighter_counts[!svfighter_counts$svfighter,],
 52+ by=c("year", "month"),
 53+ suffixes=c(".svf", ".nonsvf")
 55+ ),
 56+ by=c("year", "month")
 58+activity_counts$users.active = activity_counts$users.vf + activity_counts$users.nonvf
 59+activity_counts$log.users.active = log(activity_counts$users.active, base=10)
 60+activity_counts$log.users.vf = log(activity_counts$users.vf, base=10)
 61+activity_counts$log.users.svf = log(activity_counts$users.svf, base=10)
 64+png("plots/vandal_fighters.by_month.png", width=1024, height=768)
 66+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 67+ (activity_counts$users.active*0)-10000,
 68+ col="#FFFFFF",
 69+ ylim=c(0, max(activity_counts$users.active)+.5),
 70+ main="Vandal fighters and active editors over time",
 71+ xlab="Time (in months)",
 72+ ylab="Number of users (log10 scaled)"
 75+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 76+ activity_counts$users.active,
 77+ type="o",
 78+ pch=20,
 79+ lty=1,
 80+ col="blue"
 83+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 84+ activity_counts$users.vf,
 85+ type="o",
 86+ pch=22,
 87+ lty=2,
 88+ col="red"
 91+ max(as.numeric(as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))))-10,
 92+ max(activity_counts$users.active)+.5,
 93+ c("active editors","vandal fighters"),
 94+ cex=1.2,
 95+ col=c("blue","red"),
 96+ pch=20:22,
 97+ lty=1:2
 101+png("plots/vandal_fighters.by_month.logged.png", width=1024, height=768)
 103+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 104+ (activity_counts$log.users.active*0)-10000,
 105+ col="#FFFFFF",
 106+ ylim=c(0, max(activity_counts$log.users.active)+.5),
 107+ main="Vandal fighters and active editors over time",
 108+ xlab="Time (in months)",
 109+ ylab="Number of users (log10 scaled)"
 112+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 113+ activity_counts$log.users.active,
 114+ type="o",
 115+ pch=20,
 116+ lty=1,
 117+ col="blue"
 120+ as.factor(paste(activity_counts$year, activity_counts$month, sep="/")),
 121+ activity_counts$log.users.vf,
 122+ type="o",
 123+ pch=22,
 124+ lty=2,
 125+ col="red"
 128+ max(as.numeric(as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))))-10,
 129+ max(activity_counts$log.users.active)+.5,
 130+ c("active editors","vandal fighters"),
 131+ cex=1.2,
 132+ col=c("blue","red"),
 133+ pch=20:22,
 134+ lty=1:2
 138+activity_counts$vandal_fighter_prop = activity_counts$users.vf/activity_counts$users.active
 139+activity_counts$year.month = as.factor(paste(activity_counts$year, activity_counts$month, sep="/"))
 140+model = lm(
 141+ vandal_fighter_prop ~ as.numeric(year.month),
 142+ data=activity_counts
 144+modelSummary = summary(model)
 145+monthLine = function(x){
 146+ model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
 148+png("plots/vandal_fighter_prop.by_month.png", width=1024, height=768)
 149+x = activity_counts$year.month
 150+y = activity_counts$vandal_fighter_prop
 152+ x,
 153+ (y*0)-10000,
 154+ col="#FFFFFF",
 155+ ylim=c(0, max(y)+.01),
 156+ main="Proportion of active editors who are vandal fighters (no bots)",
 157+ xlab="Month",
 158+ ylab="Prop of of vandal fighters"
 161+ x,
 162+ y,
 163+ type="o",
 164+ pch=22,
 165+ lty=2,
 166+ col="red"
 169+ x[order(x)],
 170+ monthLine(as.numeric(x[order(x)])),
 171+ lty=1,
 172+ col="black"
 175+ mean(as.numeric(x)),
 176+ max(y)+.01,
 177+ paste(
 178+ "R^2=", round(modelSummary$r.squared, 3),
 179+ " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
 180+ " p=", round(modelSummary$coefficients[2,4], 6)
 181+ ),
 182+ cex=1.2,
 183+ col=c("black"),
 184+ lty=1
 189+activity_counts$super_vandal_fighter_prop = activity_counts$users.svf/activity_counts$users.active
 190+model = lm(
 191+ super_vandal_fighter_prop ~ as.numeric(year.month),
 192+ data=activity_counts
 194+modelSummary = summary(model)
 195+monthLine = function(x){
 196+ model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(year.month)']]*x
 198+png("plots/super_vandal_fighter_prop.by_month.png", width=1024, height=768)
 199+x = activity_counts$year.month
 200+y = activity_counts$super_vandal_fighter_prop
 202+ x,
 203+ (y*0)-10000,
 204+ col="#FFFFFF",
 205+ ylim=c(0, max(y)+.005),
 206+ main="Proportion of active editors who are super vandal fighters (no bots)",
 207+ sub="super vandal fighter >= 50 vandal reverts per month",
 208+ xlab="Month",
 209+ ylab="Prop of of vandal fighters"
 212+ x,
 213+ y,
 214+ type="o",
 215+ pch=22,
 216+ lty=2,
 217+ col="red"
 220+ x[order(x)],
 221+ monthLine(as.numeric(x[order(x)])),
 222+ lty=1,
 223+ col="black"
 226+ mean(as.numeric(x)),
 227+ max(y)+.005,
 228+ paste(
 229+ "R^2=", round(modelSummary$r.squared, 3),
 230+ " coef=", round(model$coefficients[['as.numeric(year.month)']], 5),
 231+ " p=", round(modelSummary$coefficients[2,4], 6)
 232+ ),
 233+ cex=1.2,
 234+ col=c("black"),
 235+ lty=1
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R
@@ -3,7 +3,10 @@
66 load_reverter_months = function(verbose=T, reload=F){
7 - filename = paste(DATA_DIR, "en.reverter_months.20110115.tsv", sep="/")
 7+ filename = paste(DATA_DIR, "en.reverter_months.20110115.no_quotes_or_bots.tsv", sep="/")
 8+ if(!exists("REVERTER_MONTHS")){
 10+ }
811 if(is.null(REVERTER_MONTHS) | reload){
1013 }
@@ -12,8 +15,8 @@
1316 REVERTER_MONTHS <<- read.table(
1417 filename,
1518 header=T, sep="\t",
16 - quote="'\"", comment.char="",
17 - na.strings="\\N",
 19+ quote="", comment.char="",
 20+ na.strings="\\N"
1821 )
1922 if(verbose){cat("DONE!\n")}
2023 }
Index: trunk/tools/wsor/overworked/R/loader/load_reverting_months.R
@@ -0,0 +1,26 @@
 6+load_reverting_months = function(verbose=T, reload=F){
 7+ filename = paste(DATA_DIR, "en.reverting_years.20110115.tsv", sep="/")
 8+ if(!exists("REVERTING_MONTHS")){
 10+ }
 11+ if(is.null(REVERTING_MONTHS) | reload){
 13+ }
 14+ if(is.null(REVERTING_MONTHS)){
 15+ if(verbose){cat("Loading reverter months from", filename, "...")}
 16+ REVERTING_MONTHS <<- read.table(
 17+ filename,
 18+ header=T, sep="\t",
 19+ quote="'\"", comment.char="",
 20+ na.strings="\\N",
 21+ )
 22+ if(verbose){cat("DONE!\n")}
 23+ }
Index: trunk/tools/wsor/overworked/R/revert_distributions.R
@@ -1,199 +1,43 @@
2 -source("loader/load_patroller_days.R")
4 -patroller_days = load_patroller_days()
5 -patroller_days = patroller_days[!grepl("bot( |$)", patroller_days$username, ignore.case=T),]
6 -patroller_days = patroller_days[!grepl("DASHBot", patroller_days$username, ignore.case=T),]
 4+reverting_months = load_reverting_months()
8 -library(lattice)
96 library(doBy)
11 -
12 -patroller_years = with(
 8+reverting_years = with(
139 summaryBy(
14 - count ~ year + user_id + username,
15 - data=patroller_days,
 10+ revisions + reverts + vandalism ~ year,
 11+ data=reverting_months[reverting_months$year <= 2010,],
1612 FUN=sum
1713 ),
1814 data.frame(
19 - year = year,
20 - user_id = user_id,
21 - username = username,
22 - count = count.sum
 15+ year = year,
 16+ revisions = revisions.sum,
 17+ reverts = reverts.sum,
 18+ vandalism = vandalism.sum
2319 )
2420 )
26 -patroller_years = patroller_years[order(patroller_years$count),]
27 -patroller_years$count_bucket = 2^round(log(patroller_years$count, base=2))
28 -
29 -patroller_years.count_dist = with(
30 - summaryBy(
31 - user_id ~ year + count,
32 - data = patroller_years,
33 - FUN=length
34 - ),
35 - data.frame(
36 - year = year,
37 - count = count,
38 - freq = user_id.length
39 - )
 22+png("plots/vandal_revert_trend.by_year.png", width=1024, height=768)
 24+ reverting_years$year, reverting_years$reverts,
 25+ type="o",
 26+ pch=20,
 27+ col="blue",
 28+ lty=1,
 29+ main="Reverts and vandalism by year through 2010",
 30+ xlab="Year",
 31+ ylab="Number of reverts"
4032 )
41 -
42 -png('plots/dist.patroller_years_activity.png', height=768, width=1024)
43 -xyplot(
44 - freq ~ count | as.character(year),
45 - data = patroller_years.count_dist,
46 - panel = function(x, y, subscripts, group, ...){
47 - panel.xyplot(x, y)
48 - panel.lines(x, y)
49 - },
50 - main="Distribution of activity level among editors",
51 - ylab="Frequency",
52 - xlab="Activity level",
53 - #scales=list(
54 - # x=list(
55 - # log=2,
56 - # at=2^(1:max(patroller_years.count_dist$count)),
57 - # labels=2^(1:max(patroller_years.count_dist$count))
58 - # )
59 - #),
60 - layout=c(length(unique(patroller_years.count_dist$year)), 1)
 34+ reverting_years$year, reverting_years$vandalism,
 35+ type="o",
 36+ pch=22,
 37+ lty=2,
 38+ col="red"
6139 )
 40+legend(2001, max(reverting_years$reverts), c("total reverts","vandalism"), cex=1.2,
 41+ col=c("blue","red"), pch=20:22, lty=1:2);
6242 dev.off()
65 -
66 -for(year in sort(unique(patroller_years$year))){
67 - p_year = patroller_years[patroller_years$year==year,]
68 - p_year = p_year[order(p_year$count, decreasing=T),]
69 - png(paste('plots/bars.patroller_years_activity', year, 'png', sep="."), height=768, width=1024)
70 - print(barchart(
71 - reorder(substring(as.character(username),1,30), count) ~ count,
72 - data=p_year[1:50,],
73 - horizontal=T,
74 - xlim=c(0, 110000),
75 - xlab="Patrolled pages"
76 - ))
77 - dev.off()
78 - cat(year, "\n")
79 - print(summary(p_year$count))
80 -}
81 -
82 -
83 -patroller_months = with(
84 - summaryBy(
85 - count ~ year + month + user_id + username,
86 - data=patroller_days,
87 - FUN=sum
88 - ),
89 - data.frame(
90 - year = year,
91 - month = month,
92 - user_id = user_id,
93 - username = username,
94 - count = count.sum
95 - )
96 -)
97 -
98 -nNoNA = function(x){
99 - length(subset(x, !is.na(x)))
100 -}
101 -sdNoNA = function(x){
102 - sd(x, na.rm=T)/sqrt(nNoNA(x))
103 -}
104 -meanNoNA = function(x){
105 - mean(x, na.rm=T)
106 -}
107 -
108 -patrol_months.per_user = with(
109 - summaryBy(
110 - count ~ year + month,
111 - data=patroller_months,
112 - FUN=c(meanNoNA, sdNoNA, nNoNA)
113 - ),
114 - data.frame(
115 - year = year,
116 - month = month,
117 - year.month = year + month/100,
118 - count.mean = count.meanNoNA,
119 - count.sd = count.sdNoNA,
120 - count.n = count.nNoNA
121 - )
122 -)
123 -
124 -model = lm(
125 - count.mean ~ as.numeric(factor(year.month)),
126 - data=patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,]
127 -)
128 -summary(model)
129 -monthLine = function(x){
130 - model$coefficients[['(Intercept)']] + model$coefficients[['as.numeric(factor(year.month))']]*x
131 -}
132 -
133 -png("plots/patrol_months.per_user.png", height=768, width=1024)
134 -print(xyplot(
135 - count.mean ~ as.factor(year.month),
136 - data = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,],
137 - panel = function(x, y, subscripts, ...){
138 - f = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,][subscripts,]
139 - panel.xyplot(x, y, col="#000000", ...)
140 - se = f$count.sd/sqrt(f$count.n)
141 - panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
142 - panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
143 - panel.lines(x[order(x)], monthLine(as.numeric(x[order(x)])), lwd=2, col="#000000")
144 - },
145 - #main="Average Patroller workload by month",
146 - ylab="Mean patrolled pages per user",
147 - xlab="Month",
148 - scales=list(x=list(rot=45))
149 -))
150 -dev.off()
151 -
152 -patrol_years.per_user = with(
153 - summaryBy(
154 - count ~ year,
155 - data=patroller_years,
156 - FUN=c(meanNoNA, sdNoNA, nNoNA)
157 - ),
158 - data.frame(
159 - year = year,
160 - count.mean = count.meanNoNA,
161 - count.sd = count.sdNoNA,
162 - count.n = count.nNoNA
163 - )
164 -)
165 -
166 -model = lm(
167 - count.mean ~ year,
168 - data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
169 -)
170 -summary(model)
171 -
172 -model = lm(
173 - count.mean ~ log(year-2006, base=2),
174 - data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
175 -)
176 -summary(model)
177 -yearCurve=function(x){
178 - model$coefficients[['(Intercept)']] + log(x-2006, base=2)*model$coefficients[['log(year - 2006, base = 2)']]
179 -}
180 -png("plots/patrol_years.per_user.png", height=768, width=1024)
181 -print(xyplot(
182 - count.mean ~ year-2006,
183 - data = patrol_years.per_user[patrol_years.per_user$year <= 2010,],
184 - panel = function(x, y, subscripts, ...){
185 - f = patrol_years.per_user[patrol_years.per_user$year.month <= 2011.05,][subscripts,]
186 - panel.xyplot(x, y, col="#000000", ...)
187 - se = f$count.sd/sqrt(f$count.n)
188 - panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
189 - #panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
190 - #panel.curve(myCurve, 2006, 2011, col="#000000")
191 - panel.lines(seq(0, 5, .1), yearCurve(seq(2006, 2011, .1)), lwd=2, col="#000000")
192 - },
193 - #main="Average Patroller workload by year",
194 - ylab="Mean patrolled pages per user",
195 - xlab="Year (log scaled)",
196 - pch=20,
197 - scales=list(x=list(at=1:5, labels=2007:2010))
198 -))
199 -dev.off()
200 -
Index: trunk/tools/wsor/overworked/remove_username.py
@@ -0,0 +1,30 @@
 2+import sys, argparse
def encode(val):
    """Return ``val`` as text with special characters backslash-escaped.

    The value is converted with ``str()`` and passed through the
    ``string_escape`` codec (a Python 2 codec).
    """
    text = str(val)
    return text.encode("string_escape")
def main(args):
    """Rewrite a tab-separated dataset with every field unquoted and escaped.

    Each line of ``args.input`` (the header line included) is split on tabs,
    every field is parsed as a Python literal, and the parsed values are
    printed tab-separated after passing through ``encode()``.

    :Parameters:
        args
            parsed command-line arguments; only ``args.input`` (an open,
            readable file object) is used

    Raises ``ValueError`` or ``SyntaxError`` when a field is not a valid
    Python literal.
    """
    # Local import so this function does not depend on the file-level imports.
    from ast import literal_eval

    # The fields come from an external file, so parse them with literal_eval
    # rather than eval(): it accepts literals only and cannot run code.
    # The header needs no special handling -- it is converted exactly like
    # any other row -- and an empty input simply produces no output.
    for line in args.input:
        fields = [literal_eval(field) for field in line.strip().split("\t")]
        print("\t".join(encode(field) for field in fields))
if __name__ == "__main__":
    # Script entry point: build the command-line parser, parse the arguments
    # and hand them to main().
    # NOTE(review): the description text is the same as in remove_quotes.py
    # -- confirm it is what this script should advertise.
    input_option = dict(
        metavar="<path>",
        # Open the named file for reading; without -i, stdin is used.
        type=lambda fn: open(fn, "r"),
        default=sys.stdin,
        help='the path of the file to filter (defaults to stdin)',
    )
    parser = argparse.ArgumentParser(description='Removes quotes from dataset')
    parser.add_argument('-i', '--input', **input_option)
    args = parser.parse_args()
    main(args)
Index: trunk/tools/wsor/overworked/remove_quotes.py
@@ -0,0 +1,29 @@
 2+import sys, argparse
def encode(val):
    """Return ``val`` as text with special characters backslash-escaped.

    The value is converted with ``str()`` and passed through the
    ``string_escape`` codec (a Python 2 codec).
    """
    text = str(val)
    return text.encode("string_escape")
def main(args):
    """Rewrite a tab-separated dataset with every field unquoted and escaped.

    Each line of ``args.input`` (the header line included) is split on tabs,
    every field is parsed as a Python literal, and the parsed values are
    printed tab-separated after passing through ``encode()``.

    :Parameters:
        args
            parsed command-line arguments; only ``args.input`` (an open,
            readable file object) is used

    Raises ``ValueError`` or ``SyntaxError`` when a field is not a valid
    Python literal.
    """
    # Local import so this function does not depend on the file-level imports.
    from ast import literal_eval

    # The fields come from an external file, so parse them with literal_eval
    # rather than eval(): it accepts literals only and cannot run code.
    # The header needs no special handling -- it is converted exactly like
    # any other row -- and an empty input simply produces no output.
    for line in args.input:
        fields = [literal_eval(field) for field in line.strip().split("\t")]
        print("\t".join(encode(field) for field in fields))
if __name__ == "__main__":
    # Script entry point: build the command-line parser, parse the arguments
    # and hand them to main().
    input_option = dict(
        metavar="<path>",
        # Open the named file for reading; without -i, stdin is used.
        type=lambda fn: open(fn, "r"),
        default=sys.stdin,
        help='the path of the file to filter (defaults to stdin)',
    )
    parser = argparse.ArgumentParser(description='Removes quotes from dataset')
    parser.add_argument('-i', '--input', **input_option)
    args = parser.parse_args()
    main(args)
Index: trunk/tools/wsor/overworked/remove_bots.py
@@ -8,12 +8,12 @@
99 bots.add(int(line.strip()))
1111 headerLine = args.input.readline().strip()
12 - headers = [eval(h) for h in headerLine.split("\t")]
 12+ headers = headerLine.split("\t")
1313 print(headerLine)
1515 for line in args.input:
16 - row = dict(zip(headers, [eval(v) for v in line.strip().split("\t")]))
17 - if row['user_id'] not in bots:
 16+ row = dict(zip(headers, line.strip().split("\t")))
 17+ if int(row['user_id']) not in bots:
1818 print(line.strip())
@@ -21,8 +21,7 @@
2323 if __name__ == "__main__":
2424 parser = argparse.ArgumentParser(
25 - description=
26 - 'Removes bot editors from patrollers file'
 25+ description='Removes bot editors from patrollers file'
2726 )
2827 parser.add_argument(
2928 'bots',
Index: trunk/tools/wsor/overworked/testing.sql
@@ -90,7 +90,7 @@
9191 )
94 -CREATE TABLE halfak.revert_pre_20110115(
 94+CREATE TABLE halfak.revert_20110115(
9595 revision_id INT,
9696 rvtd_to_id INT,
9797 revs_reverted INT
@@ -143,32 +143,13 @@
144144 INNER JOIN revision reverted
145145 ON r.revision_id = reverted.rev_id
146146 INNER JOIN revision reverting
147 - ON r.revision_id = reverting.rev_id
 147+ ON r.rvtg_id = reverting.rev_id
148148 INNER JOIN revision reverted_to
149 - ON r.revision_id = reverted_to.rev_id;
150 -CREATE INDEX rev_id_idx ON halfak.reverted_20110115 (revision_id);
151 -CREATE INDEX rvtg_id_idx ON halfak.reverted_20110115 (rvtg_id);
 149+ ON r.rvtd_to_id = reverted_to.rev_id;
154 -CREATE TABLE halfak.revert_20110115(
155 - revision_id INT,
156 - rvtto_id INT,
157 - is_vandalism BOOL,
158 - revs_reverted INT
159 -);
160 -INSERT INTO halfak.revert_20110115
162 - rvt.revision_id,
163 - rvt.rvtd_to_id,
164 - bit_or(rvtd.is_vandalism),
165 - rvt.revs_reverted
166 -FROM halfak.revert_pre_20110115 rvt
167 -INNER JOIN halfak.reverted_20110115 rvtd
168 - ON rvt.revision_id = rvtd.rvtg_id
169 -GROUP BY rvt.revision_id, rvt.rvtd_to_id, rvt.revs_reverted;
170 -CREATE INDEX rev_id_idx ON halfak.revert_20110115 (revision_id);
171 -CREATE INDEX is_vandalism ON halfak.revert_20110115 (is_vandalism);
@@ -181,7 +162,7 @@
182163 u.user_name as username,
183164 COUNT(*) as revisions,
184165 SUM(rvt.revision_id IS NOT NULL) as reverts,
185 - SUM(rvt.revision_id IS NOT NULL AND rvt.is_vandalism) as vandal_reverts
 166+ SUM(rvt.is_vandalism) as vandal_reverts
186167 FROM revision r
187168 LEFT JOIN halfak.revert_20100130 rvt
188169 ON r.rev_id = rvt.revision_id
@@ -190,3 +171,22 @@
191172 WHERE rev_timestamp < "20110000000000"
192173 GROUP BY SUBSTRING(rev_timestamp, 1,4), rev_user, u.user_name
 177+ SUBSTR(rev_timestamp, 1,4),
 178+ SUBSTR(rev_timestamp, 1,2),
 179+ count(*),
 180+ sum(revision_id IS NOT NULL),
 181+ sum(revision_id IS NOT NULL AND is_vandalism)
 182+FROM halfak.revert_20110115 rvt
 183+INNER JOIN revision r ON rvt.revision_id = r.rev_id
 185+ SUBSTR(rev_timestamp, 1,4),
 186+ SUBSTR(rev_timestamp, 1,2);
 191+"(Reverted ([0-9]+ )?edits by \[\[Special:Contributions/[^\|]+\|[^\]]+]] \(\[\[User talk:[^\|]+\|talk\]\]\) to last version by .+)|" +
 192+"(Message re. \[\[[^\]]+\]\])|" +
 193+"(Level [0-9]+ warning re. \[\[[^\]]+\]\]"
Index: trunk/tools/wsor/scripts/classes/tests/test_file_wrapper.py
@@ -0,0 +1,24 @@
 2+from StringIO import StringIO
 3+from nose.tools import eq_
 4+from ..file_wrapper import FileWrapper
def test_file_wrapper():
    """Check FileWrapper.read() for an unbounded read and for 5-character reads."""
    prefix = "foo\nbar\nbaz\n"
    payload = "herp\nderp\n"
    suffix = "foobar\n"
    expected = prefix + payload + suffix

    # One unbounded read returns prefix + wrapped content + suffix.
    whole = FileWrapper(StringIO(payload), prefix, suffix)
    eq_(whole.read(), expected)

    # A fresh wrapper read 5 characters at a time must yield consecutive
    # slices of the same text (and empty strings once it is exhausted).
    chunked = FileWrapper(StringIO(payload), prefix, suffix)
    for start in range(0, 100, 5):
        eq_(chunked.read(5), expected[start:start + 5])
Index: trunk/tools/wsor/scripts/classes/tests/__init__.py
Index: trunk/tools/wsor/scripts/classes/__init__.py
Index: trunk/tools/wsor/scripts/classes/file_wrapper.py
@@ -0,0 +1,61 @@
 2+import sys
 3+from StringIO import StringIO
class FileWrapper:
    """Read-only file-like object that surrounds a file with two strings.

    Reads return the ``pre`` text first, then the content of the wrapped
    file object ``fp``, then the ``post`` text, as if the three were a
    single stream.  Writing and seeking are not supported.
    """

    # Size passed to the underlying read() calls to mean "everything".
    # sys.maxint only exists on Python 2; sys.maxsize is the fallback.
    _READ_ALL = getattr(sys, "maxint", sys.maxsize)

    def __init__(self, fp, pre='', post=''):
        """Wrap ``fp`` so that ``pre`` is read before it and ``post`` after it.

        :Parameters:
            fp
                file-like object to wrap; must provide ``read``,
                ``readline``, ``tell`` and ``close``
            pre
                text returned before the content of ``fp``
            post
                text returned after the content of ``fp``
        """
        self.fp = fp
        self.pre = StringIO(pre)
        self.post = StringIO(post)
        self.closed = False
        self.mode = "r"

    def read(self, bytes=_READ_ALL):
        """Read up to ``bytes`` characters from the combined stream.

        ``None`` or a negative size reads everything that is left, as the
        standard file API does.  Raises ``ValueError`` once the wrapper is
        closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        limit = self._READ_ALL if bytes is None else int(bytes)
        if limit < 0:
            limit = self._READ_ALL

        # Drain the three sources in order until the request is satisfied.
        pieces = []
        remaining = limit
        for source in (self.pre, self.fp, self.post):
            if remaining <= 0:
                break
            piece = source.read(remaining)
            pieces.append(piece)
            remaining -= len(piece)

        return ''.join(pieces)

    def readline(self):
        """Return the next line of the combined stream ('' at end of stream).

        A line may span the boundary between two sources: when one source
        ends without a newline, the line continues into the next source.
        Raises ``ValueError`` once the wrapper is closed.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        line = self.pre.readline()
        for source in (self.fp, self.post):
            if line.endswith("\n"):
                break
            line += source.readline()

        return line

    def readlines(self, *args, **kwargs):
        """Not supported; iterate over the wrapper instead."""
        raise NotImplementedError()

    def __iter__(self):
        """Yield the lines of the combined stream until it is exhausted."""
        line = self.readline()
        while line != '':
            yield line
            line = self.readline()

    # The unsupported operations accept any arguments so that a normal call
    # such as write(data) raises NotImplementedError, not TypeError.
    def seek(self, *args, **kwargs):
        """Not supported."""
        raise NotImplementedError()

    def write(self, *args, **kwargs):
        """Not supported; the wrapper is read-only."""
        raise NotImplementedError()

    def writelines(self, *args, **kwargs):
        """Not supported; the wrapper is read-only."""
        raise NotImplementedError()

    def tell(self):
        """Return the sum of the positions of the three underlying sources."""
        return self.pre.tell() + self.fp.tell() + self.post.tell()

    def close(self):
        """Mark the wrapper as closed and close the wrapped file object."""
        self.closed = True
        self.fp.close()

Status & tagging log