r90916 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r90915‎ | r90916 | r90917 >
Date:22:41, 27 June 2011
Author:halfak
Status:deferred
Tags:
Comment:
adding utils and some scripts I forgot
Modified paths:
  • /trunk/tools/wsor/overworked/R/Rplots.pdf (modified) (history)
  • /trunk/tools/wsor/overworked/R/loader/load_reverter_months.R (added) (history)
  • /trunk/tools/wsor/overworked/R/revert_distributions.R (added) (history)
  • /trunk/tools/wsor/overworked/convert_reverts.py (added) (history)
  • /trunk/tools/wsor/overworked/testing.sql (modified) (history)
  • /trunk/tools/wsor/scripts/revision_meta.py (added) (history)
  • /trunk/tools/wsor/ts_samples/total_talk_edits_staeiou.py (added) (history)
  • /trunk/tools/wsor/utilities (added) (history)
  • /trunk/tools/wsor/utilities/limited_dict_lists.py (added) (history)
  • /trunk/tools/wsor/utilities/limited_queue.py (added) (history)
  • /trunk/tools/wsor/wikimedia/setup.py (modified) (history)
  • /trunk/tools/wsor/wikimedia/wmf/util.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/wikimedia/setup.py
@@ -2,7 +2,7 @@
33 from setuptools import setup, find_packages
44
55 setup(
6 - name='util',
 6+ name='wmf',
77 version='1.0',
88 description="WMF utilities",
99 long_description="""
Index: trunk/tools/wsor/wikimedia/wmf/util.py
@@ -234,3 +234,5 @@
235235 return True;
236236
237237 return False;
 238+
 239+
Index: trunk/tools/wsor/overworked/R/loader/load_reverter_months.R
@@ -0,0 +1,23 @@
 2+source("util/env.R")
 3+
 4+
 5+
# Load (and cache) the reverter-months table.
#
# The data is read from "<DATA_DIR>/en.reverter_months.20110115.tsv" and
# cached in the global REVERTER_MONTHS so repeated calls are cheap.
#
# verbose: when TRUE, progress messages are written with cat().
# reload:  when TRUE, the cache is discarded and the file is read again.
#
# Returns the cached data frame.
load_reverter_months = function(verbose=TRUE, reload=FALSE){
  filename = paste(DATA_DIR, "en.reverter_months.20110115.tsv", sep="/")

  # Treat a cache variable that was never defined the same as an empty one;
  # `||` short-circuits, so is.null() is only reached when the name exists.
  stale = !exists("REVERTER_MONTHS") || is.null(REVERTER_MONTHS) || reload

  if(stale){
    if(verbose){cat("Loading reverter months from", filename, "...")}
    REVERTER_MONTHS <<- read.table(
      filename,
      header=TRUE, sep="\t",
      quote="'\"", comment.char="",
      na.strings="\\N"   # \N marks missing values in the file
    )
    if(verbose){cat("DONE!\n")}
  }
  REVERTER_MONTHS
}
 23+
 24+
Index: trunk/tools/wsor/overworked/R/revert_distributions.R
@@ -0,0 +1,199 @@
# Load the per-day patrol counts through the project loader.
source("loader/load_patroller_days.R")

patroller_days = load_patroller_days()
# Drop rows whose username matches either pattern (case-insensitive):
# "bot" followed by a space or the end of the name, or "DASHBot".
patroller_days = patroller_days[
  !(
    grepl("bot( |$)", patroller_days$username, ignore.case=T) |
    grepl("DASHBot", patroller_days$username, ignore.case=T)
  ),
]

library(lattice)
library(doBy)


# Sum the daily counts into one row per (year, user).
patroller_years = local({
  yearly = summaryBy(
    count ~ year + user_id + username,
    data=patroller_days,
    FUN=sum
  )
  data.frame(
    year     = yearly$year,
    user_id  = yearly$user_id,
    username = yearly$username,
    count    = yearly$count.sum
  )
})

# Sort ascending by yearly total and attach the nearest power-of-two bucket.
patroller_years = patroller_years[order(patroller_years$count),]
patroller_years$count_bucket = 2^round(log(patroller_years$count, base=2))
 28+
# Number of users (freq) observed at each yearly activity level (count), per year.
patroller_years.count_dist = with(
  summaryBy(
    user_id ~ year + count,
    data = patroller_years,
    FUN=length
  ),
  data.frame(
    year  = year,
    count = count,
    freq  = user_id.length
  )
)

# One panel per year, points joined by a line.
# The lattice object is wrapped in print() like the other plots in this
# script: a trellis object is only drawn when printed, which does not happen
# automatically when the file is run through source().
png('plots/dist.patroller_years_activity.png', height=768, width=1024)
print(xyplot(
  freq ~ count | as.character(year),
  data = patroller_years.count_dist,
  panel = function(x, y, subscripts, group, ...){
    panel.xyplot(x, y)
    panel.lines(x, y)
  },
  main="Distribution of activity level among editors",
  ylab="Frequency",
  xlab="Activity level",
  #scales=list(
  #  x=list(
  #    log=2,
  #    at=2^(1:max(patroller_years.count_dist$count)),
  #    labels=2^(1:max(patroller_years.count_dist$count))
  #  )
  #),
  layout=c(length(unique(patroller_years.count_dist$year)), 1)
))
dev.off()
 63+
 64+
 65+
# For every year: draw a horizontal bar chart of the most active patrollers
# (at most 50) and print a summary of that year's per-user counts.
for(year in sort(unique(patroller_years$year))){
  p_year = patroller_years[patroller_years$year==year,]
  p_year = p_year[order(p_year$count, decreasing=T),]
  png(paste('plots/bars.patroller_years_activity', year, 'png', sep="."), height=768, width=1024)
  print(barchart(
    # Usernames are truncated to 30 characters and ordered by count.
    reorder(substring(as.character(username),1,30), count) ~ count,
    # head() stops at the last row; indexing with 1:50 would pad the data
    # with all-NA rows whenever a year has fewer than 50 patrollers.
    data=head(p_year, 50),
    horizontal=T,
    xlim=c(0, 110000),
    xlab="Patrolled pages"
  ))
  dev.off()
  cat(year, "\n")
  print(summary(p_year$count))
}
 81+
 82+
# Sum the daily counts into one row per (year, month, user).
patroller_months = local({
  monthly = summaryBy(
    count ~ year + month + user_id + username,
    data=patroller_days,
    FUN=sum
  )
  data.frame(
    year     = monthly$year,
    month    = monthly$month,
    user_id  = monthly$user_id,
    username = monthly$username,
    count    = monthly$count.sum
  )
})
 97+
# NA-tolerant summary helpers passed to summaryBy() below.

# Number of non-NA values in x.
nNoNA = function(x){
  sum(!is.na(x))
}

# Standard deviation of the non-NA values in x.
# This must be the plain standard deviation: the result is stored as
# count.sd and the plotting code derives the standard error itself with
# count.sd/sqrt(count.n). Dividing by sqrt(n) here as well shrank the error
# bars by an extra factor of sqrt(n).
sdNoNA = function(x){
  sd(x, na.rm=TRUE)
}

# Mean of the non-NA values in x.
meanNoNA = function(x){
  mean(x, na.rm=TRUE)
}
 107+
# Per-month summary over users: mean, spread and number of users.
# year.month encodes the month as year + month/100 (e.g. 2010.07).
patrol_months.per_user = local({
  by_month = summaryBy(
    count ~ year + month,
    data=patroller_months,
    FUN=c(meanNoNA, sdNoNA, nNoNA)
  )
  data.frame(
    year       = by_month$year,
    month      = by_month$month,
    year.month = by_month$year + by_month$month/100,
    count.mean = by_month$count.meanNoNA,
    count.sd   = by_month$count.sdNoNA,
    count.n    = by_month$count.nNoNA
  )
})

# Linear trend of the monthly mean against the month's ordinal position,
# restricted to months up to 2011.05.
model = lm(
  count.mean ~ as.numeric(factor(year.month)),
  data=patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,]
)
summary(model)

# Fitted value of the current global `model` at ordinal month position x.
# `model` is looked up when the function is called, not when it is defined.
monthLine = function(x){
  fit = model$coefficients
  fit[['(Intercept)']] + fit[['as.numeric(factor(year.month))']]*x
}
 132+
# Monthly mean workload per patroller: points, error bars of
# count.sd/sqrt(count.n), a line through the points and the fitted trend.
png("plots/patrol_months.per_user.png", height=768, width=1024)
print(xyplot(
  count.mean ~ as.factor(year.month),
  data = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,],
  panel = function(x, y, subscripts, ...){
    # Re-select the plotted rows so the error bars line up with x and y.
    shown = patrol_months.per_user[patrol_months.per_user$year.month <= 2011.05,]
    f = shown[subscripts,]
    se = f$count.sd/sqrt(f$count.n)
    ord = order(x)

    panel.xyplot(x, y, col="#000000", ...)
    panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
    panel.lines(x[ord], y[ord], lwd=2, ...)
    panel.lines(x[ord], monthLine(as.numeric(x[ord])), lwd=2, col="#000000")
  },
  #main="Average Patroller workload by month",
  ylab="Mean patrolled pages per user",
  xlab="Month",
  scales=list(x=list(rot=45))
))
dev.off()
 151+
# Per-year summary over users: mean, spread and number of users.
patrol_years.per_user = local({
  by_year = summaryBy(
    count ~ year,
    data=patroller_years,
    FUN=c(meanNoNA, sdNoNA, nNoNA)
  )
  data.frame(
    year       = by_year$year,
    count.mean = by_year$count.meanNoNA,
    count.sd   = by_year$count.sdNoNA,
    count.n    = by_year$count.nNoNA
  )
})

# Linear fit of the yearly mean (years up to 2010).
model = lm(
  count.mean ~ year,
  data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
)
summary(model)

# Log2 fit over years since 2006; this replaces the linear fit in `model`.
model = lm(
  count.mean ~ log(year-2006, base=2),
  data=patrol_years.per_user[patrol_years.per_user$year <= 2010,]
)
summary(model)

# Fitted value of the current global `model` (the log2 fit) for calendar year x.
yearCurve=function(x){
  fit = model$coefficients
  fit[['(Intercept)']] + log(x-2006, base=2)*fit[['log(year - 2006, base = 2)']]
}
# Yearly mean workload per patroller (years up to 2010) with error bars of
# count.sd/sqrt(count.n) and the fitted log2 curve from yearCurve().
png("plots/patrol_years.per_user.png", height=768, width=1024)
print(xyplot(
  count.mean ~ year-2006,
  data = patrol_years.per_user[patrol_years.per_user$year <= 2010,],
  panel = function(x, y, subscripts, ...){
    # Select the same rows that are plotted. patrol_years.per_user has no
    # year.month column, so filtering on it produced only NA rows and the
    # error bars were silently dropped.
    f = patrol_years.per_user[patrol_years.per_user$year <= 2010,][subscripts,]
    panel.xyplot(x, y, col="#000000", ...)
    se = f$count.sd/sqrt(f$count.n)
    panel.arrows(x, y+se, x, y-se, ends="both", angle=90, col="#000000", length=0.05, ...)
    #panel.lines(x[order(x)], y[order(x)], lwd=2, ...)
    #panel.curve(myCurve, 2006, 2011, col="#000000")
    # x is year-2006, so 0..5 on the axis corresponds to 2006..2011.
    panel.lines(seq(0, 5, .1), yearCurve(seq(2006, 2011, .1)), lwd=2, col="#000000")
  },
  #main="Average Patroller workload by year",
  ylab="Mean patrolled pages per user",
  xlab="Year (log scaled)",
  pch=20,
  # One tick position per label: positions 1..4 are the years 2007..2010.
  scales=list(x=list(at=1:4, labels=2007:2010))
))
dev.off()
 200+
Index: trunk/tools/wsor/overworked/R/Rplots.pdf
@@ -2,8 +2,8 @@
33 %���ρ�\r
44 1 0 obj
55 <<
6 -/CreationDate (D:20110610225803)
7 -/ModDate (D:20110610225803)
 6+/CreationDate (D:20110627164459)
 7+/ModDate (D:20110627164459)
88 /Title (R Graphics Output)
99 /Producer (R 2.13.0)
1010 /Creator (R)
@@ -29,292 +29,85 @@
3030 >>
3131 stream
3232 1 J 1 j q
33 -Q q
34 -Q q
 33+Q q 59.04 73.44 414.72 371.52 re W n
 34+/sRGB CS 0.000 0.000 0.000 SCN
 35+0.75 w
 36+[] 0 d
 37+1 J
 38+1 j
 39+10.00 M
3540 BT
36 -/sRGB cs 0.000 0.000 0.000 scn
37 -/F2 1 Tf 12.00 0.00 -0.00 12.00 279.25 12.00 Tm [(P) 40 (atrolled pages)] TJ
 41+/F1 1 Tf 1 Tr 7.48 0 0 7.48 263.44 256.60 Tm (l) Tj 0 Tr
3842 ET
3943 Q q
40 -Q q
41 -Q q
42 -Q q 160.73 50.80 319.93 418.19 re W n
43 -Q q
44 -Q q
4544 /sRGB CS 0.000 0.000 0.000 SCN
4645 0.75 w
4746 [] 0 d
4847 1 J
4948 1 j
5049 10.00 M
51 -172.58 468.99 m 172.58 474.66 l S
52 -231.82 468.99 m 231.82 474.66 l S
53 -291.07 468.99 m 291.07 474.66 l S
54 -350.32 468.99 m 350.32 474.66 l S
55 -409.57 468.99 m 409.57 474.66 l S
56 -468.81 468.99 m 468.81 474.66 l S
57 -Q q
58 -Q q
 50+74.40 73.44 m 458.40 73.44 l S
 51+74.40 73.44 m 74.40 66.24 l S
 52+170.40 73.44 m 170.40 66.24 l S
 53+266.40 73.44 m 266.40 66.24 l S
 54+362.40 73.44 m 362.40 66.24 l S
 55+458.40 73.44 m 458.40 66.24 l S
5956 BT
6057 /sRGB cs 0.000 0.000 0.000 scn
61 -/F2 1 Tf 10.00 0.00 -0.00 10.00 109.08 52.21 Tm [(Gonz) 15 (onoir)] TJ
 58+/F2 1 Tf 12.00 0.00 -0.00 12.00 66.06 47.52 Tm (0.6) Tj
6259 ET
6360 BT
64 -/F2 1 Tf 10.00 0.00 -0.00 10.00 94.00 60.54 Tm [(T) 120 (eapotgeorge)] TJ
 61+/F2 1 Tf 12.00 0.00 -0.00 12.00 162.06 47.52 Tm (0.8) Tj
6562 ET
6663 BT
67 -/F2 1 Tf 10.00 0.00 -0.00 10.00 117.27 68.87 Tm (Pichpich) Tj
 64+/F2 1 Tf 12.00 0.00 -0.00 12.00 258.06 47.52 Tm (1.0) Tj
6865 ET
6966 BT
70 -/F2 1 Tf 10.00 0.00 -0.00 10.00 121.71 77.20 Tm (Pdcook) Tj
 67+/F2 1 Tf 12.00 0.00 -0.00 12.00 354.06 47.52 Tm (1.2) Tj
7168 ET
7269 BT
73 -/F2 1 Tf 10.00 0.00 -0.00 10.00 109.34 85.53 Tm [(Acroter) -15 (ion)] TJ
 70+/F2 1 Tf 12.00 0.00 -0.00 12.00 450.06 47.52 Tm (1.4) Tj
7471 ET
 72+59.04 87.20 m 59.04 431.20 l S
 73+59.04 87.20 m 51.84 87.20 l S
 74+59.04 173.20 m 51.84 173.20 l S
 75+59.04 259.20 m 51.84 259.20 l S
 76+59.04 345.20 m 51.84 345.20 l S
 77+59.04 431.20 m 51.84 431.20 l S
7578 BT
76 -/F2 1 Tf 10.00 0.00 -0.00 10.00 65.89 93.86 Tm [(W) 30 (ereSpielChequers)] TJ
 79+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 83.86 Tm (6) Tj
7780 ET
7881 BT
79 -/F2 1 Tf 10.00 0.00 -0.00 10.00 115.03 102.19 Tm (Bonadea) Tj
 82+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 169.86 Tm (8) Tj
8083 ET
8184 BT
82 -/F2 1 Tf 10.00 0.00 -0.00 10.00 122.27 110.53 Tm (Melaen) Tj
 85+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 252.53 Tm (10) Tj
8386 ET
8487 BT
85 -/F2 1 Tf 10.00 0.00 -0.00 10.00 113.37 118.86 Tm (PhGustaf) Tj
 88+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 338.53 Tm (12) Tj
8689 ET
8790 BT
88 -/F2 1 Tf 10.00 0.00 -0.00 10.00 99.08 127.19 Tm [(W) 40 (a) 30 (yne Slam)] TJ
 91+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 424.53 Tm (14) Tj
8992 ET
90 -BT
91 -/F2 1 Tf 10.00 0.00 -0.00 10.00 101.15 135.52 Tm (Prestonmag) Tj
92 -ET
93 -BT
94 -/F2 1 Tf 10.00 0.00 -0.00 10.00 105.61 143.85 Tm (Catfish Jim) Tj
95 -ET
96 -BT
97 -/F2 1 Tf 10.00 0.00 -0.00 10.00 104.59 152.18 Tm [(Cindam) 10 (use)] TJ
98 -ET
99 -BT
100 -/F2 1 Tf 10.00 0.00 -0.00 10.00 130.05 160.51 Tm (Mono) Tj
101 -ET
102 -BT
103 -/F2 1 Tf 10.00 0.00 -0.00 10.00 121.16 168.84 Tm (Scapler) Tj
104 -ET
105 -BT
106 -/F2 1 Tf 10.00 0.00 -0.00 10.00 109.10 177.17 Tm [(Jimm) 15 (y Pitt)] TJ
107 -ET
108 -BT
109 -/F2 1 Tf 10.00 0.00 -0.00 10.00 121.73 185.50 Tm (WWGB) Tj
110 -ET
111 -BT
112 -/F2 1 Tf 10.00 0.00 -0.00 10.00 91.00 193.83 Tm [(Elektr) -15 (ik Shoos)] TJ
113 -ET
114 -BT
115 -/F2 1 Tf 10.00 0.00 -0.00 10.00 50.67 202.16 Tm [(Phar) 10 (aoh of the Wizards)] TJ
116 -ET
117 -BT
118 -/F2 1 Tf 10.00 0.00 -0.00 10.00 96.13 210.49 Tm (Seb az86556) Tj
119 -ET
120 -BT
121 -/F2 1 Tf 10.00 0.00 -0.00 10.00 103.31 218.82 Tm [(F) 50 (alcon8765)] TJ
122 -ET
123 -BT
124 -/F2 1 Tf 10.00 0.00 -0.00 10.00 132.83 227.15 Tm (Stifle) Tj
125 -ET
126 -BT
127 -/F2 1 Tf 10.00 0.00 -0.00 10.00 106.16 235.48 Tm (Andyjsmith) Tj
128 -ET
129 -BT
130 -/F2 1 Tf 10.00 0.00 -0.00 10.00 76.97 243.81 Tm [(Marcus Qw) 10 (er) -40 (tyus)] TJ
131 -ET
132 -BT
133 -/F2 1 Tf 10.00 0.00 -0.00 10.00 112.28 252.14 Tm (Clubmarx) Tj
134 -ET
135 -BT
136 -/F2 1 Tf 10.00 0.00 -0.00 10.00 121.16 260.47 Tm (TheTito) Tj
137 -ET
138 -BT
139 -/F2 1 Tf 10.00 0.00 -0.00 10.00 129.64 268.80 Tm [(Xtz) 15 (ou)] TJ
140 -ET
141 -BT
142 -/F2 1 Tf 10.00 0.00 -0.00 10.00 129.91 277.13 Tm [(Shir) -15 (ik)] TJ
143 -ET
144 -BT
145 -/F2 1 Tf 10.00 0.00 -0.00 10.00 99.48 285.46 Tm (MuffledThud) Tj
146 -ET
147 -BT
148 -/F2 1 Tf 10.00 0.00 -0.00 10.00 112.36 293.79 Tm [(T) 120 (r) 10 (a) 20 (v) 25 (elbird)] TJ
149 -ET
150 -BT
151 -/F2 1 Tf 10.00 0.00 -0.00 10.00 38.84 302.13 Tm [(V) 80 (ejv) 25 (an\\xc4\\x8dick\\xc3\\xbd)] TJ
152 -ET
153 -BT
154 -/F2 1 Tf 10.00 0.00 -0.00 10.00 121.16 310.46 Tm (VQuakr) Tj
155 -ET
156 -BT
157 -/F2 1 Tf 10.00 0.00 -0.00 10.00 98.63 318.79 Tm [(Air) -30 (planeman)] TJ
158 -ET
159 -BT
160 -/F2 1 Tf 10.00 0.00 -0.00 10.00 63.91 327.12 Tm (Daemonic Kangaroo) Tj
161 -ET
162 -BT
163 -/F2 1 Tf 10.00 0.00 -0.00 10.00 106.56 335.45 Tm [(Der) -15 (ild4921)] TJ
164 -ET
165 -BT
166 -/F2 1 Tf 10.00 0.00 -0.00 10.00 115.33 343.78 Tm [(K) 30 (udpung)] TJ
167 -ET
168 -BT
169 -/F2 1 Tf 10.00 0.00 -0.00 10.00 106.72 352.11 Tm (Realkyhick) Tj
170 -ET
171 -BT
172 -/F2 1 Tf 10.00 0.00 -0.00 10.00 115.04 360.44 Tm (Gilo1969) Tj
173 -ET
174 -BT
175 -/F2 1 Tf 10.00 0.00 -0.00 10.00 17.67 368.77 Tm [(The Blade of the Nor) -40 (ther) -25 (n Ligh)] TJ
176 -ET
177 -BT
178 -/F2 1 Tf 10.00 0.00 -0.00 10.00 112.21 377.10 Tm [(RadioF) 50 (an)] TJ
179 -ET
180 -BT
181 -/F2 1 Tf 10.00 0.00 -0.00 10.00 110.60 385.43 Tm (Timneu22) Tj
182 -ET
183 -BT
184 -/F2 1 Tf 10.00 0.00 -0.00 10.00 119.08 393.76 Tm [(Tton) 15 (yb1)] TJ
185 -ET
186 -BT
187 -/F2 1 Tf 10.00 0.00 -0.00 10.00 97.97 402.09 Tm [(Shado) 15 (wjams)] TJ
188 -ET
189 -BT
190 -/F2 1 Tf 10.00 0.00 -0.00 10.00 53.36 410.42 Tm (Ser Amantio di Nicolao) Tj
191 -ET
192 -BT
193 -/F2 1 Tf 10.00 0.00 -0.00 10.00 105.06 418.75 Tm (Malcolmxl5) Tj
194 -ET
195 -BT
196 -/F2 1 Tf 10.00 0.00 -0.00 10.00 64.57 427.08 Tm [(Dr) 10 (agonflySixtyse) 30 (v) 25 (en)] TJ
197 -ET
198 -BT
199 -/F2 1 Tf 10.00 0.00 -0.00 10.00 111.70 435.41 Tm (Sopher99) Tj
200 -ET
201 -BT
202 -/F2 1 Tf 10.00 0.00 -0.00 10.00 115.60 443.74 Tm (Eeekster) Tj
203 -ET
204 -BT
205 -/F2 1 Tf 10.00 0.00 -0.00 10.00 113.93 452.07 Tm (Ironholds) Tj
206 -ET
207 -BT
208 -/F2 1 Tf 10.00 0.00 -0.00 10.00 104.48 460.40 Tm (Blanchardb) Tj
209 -ET
 93+59.04 73.44 m
 94+473.76 73.44 l
 95+473.76 444.96 l
 96+59.04 444.96 l
 97+59.04 73.44 l
 98+S
21099 Q q
211 -Q q
212 -/sRGB CS 0.000 0.000 0.000 SCN
213 -0.75 w
214 -[] 0 d
215 -1 J
216 -1 j
217 -10.00 M
218 -172.58 50.80 m 172.58 45.13 l S
219 -231.82 50.80 m 231.82 45.13 l S
220 -291.07 50.80 m 291.07 45.13 l S
221 -350.32 50.80 m 350.32 45.13 l S
222 -409.57 50.80 m 409.57 45.13 l S
223 -468.81 50.80 m 468.81 45.13 l S
224100 BT
225101 /sRGB cs 0.000 0.000 0.000 scn
226 -/F2 1 Tf 10.00 0.00 -0.00 10.00 169.80 32.29 Tm (0) Tj
 102+/F2 1 Tf 12.00 0.00 -0.00 12.00 251.90 18.72 Tm [(Inde) 30 (x)] TJ
227103 ET
228104 BT
229 -/F2 1 Tf 10.00 0.00 -0.00 10.00 217.92 32.29 Tm (50000) Tj
 105+/F2 1 Tf 0.00 12.00 -12.00 0.00 12.96 244.19 Tm (10:10) Tj
230106 ET
231 -BT
232 -/F2 1 Tf 10.00 0.00 -0.00 10.00 274.39 32.29 Tm (100000) Tj
233 -ET
234 -BT
235 -/F2 1 Tf 10.00 0.00 -0.00 10.00 333.64 32.29 Tm (150000) Tj
236 -ET
237 -BT
238 -/F2 1 Tf 10.00 0.00 -0.00 10.00 392.89 32.29 Tm (200000) Tj
239 -ET
240 -BT
241 -/F2 1 Tf 10.00 0.00 -0.00 10.00 452.13 32.29 Tm (250000) Tj
242 -ET
243 -Q q
244 -Q q 160.73 50.80 319.93 418.19 re W n
245 -/sRGB cs 0.000 1.000 1.000 scn
246 -/sRGB CS 0.000 0.000 0.000 SCN
247 -0.75 w
248 -[] 0 d
249 -1 J
250 -1 j
251 -10.00 M
252 -160.73 452.89 13.26 5.55 re B
253 -160.73 461.22 13.26 5.55 re B
254 -160.73 444.56 13.26 5.55 re B
255 -160.73 436.23 13.28 5.55 re B
256 -160.73 427.90 13.28 5.55 re B
257 -160.73 419.56 13.34 5.55 re B
258 -160.73 411.23 13.36 5.55 re B
259 -160.73 402.90 13.38 5.55 re B
260 -160.73 394.57 13.38 5.55 re B
261 -160.73 386.24 13.38 5.55 re B
262 -160.73 377.91 13.39 5.55 re B
263 -160.73 369.58 13.39 5.55 re B
264 -160.73 361.25 13.41 5.55 re B
265 -160.73 352.92 13.48 5.55 re B
266 -160.73 344.59 13.49 5.55 re B
267 -160.73 336.26 13.52 5.55 re B
268 -160.73 327.93 13.56 5.55 re B
269 -160.73 319.60 13.62 5.55 re B
270 -160.73 311.27 13.71 5.55 re B
271 -160.73 302.94 13.77 5.55 re B
272 -160.73 294.61 13.78 5.55 re B
273 -160.73 286.28 13.85 5.55 re B
274 -160.73 277.95 13.99 5.55 re B
275 -160.73 269.62 14.01 5.55 re B
276 -160.73 261.29 14.06 5.55 re B
277 -160.73 252.96 14.14 5.55 re B
278 -160.73 244.63 14.14 5.55 re B
279 -160.73 236.30 14.23 5.55 re B
280 -160.73 227.96 14.29 5.55 re B
281 -160.73 219.63 14.35 5.55 re B
282 -160.73 211.30 14.39 5.55 re B
283 -160.73 202.97 14.49 5.55 re B
284 -160.73 194.64 14.62 5.55 re B
285 -160.73 186.31 14.66 5.55 re B
286 -160.73 177.98 14.69 5.55 re B
287 -160.73 169.65 14.94 5.55 re B
288 -160.73 161.32 15.38 5.55 re B
289 -160.73 152.99 16.45 5.55 re B
290 -160.73 144.66 17.24 5.55 re B
291 -160.73 136.33 17.31 5.55 re B
292 -160.73 128.00 17.66 5.55 re B
293 -160.73 119.67 18.54 5.55 re B
294 -160.73 111.34 18.57 5.55 re B
295 -160.73 103.01 20.23 5.55 re B
296 -160.73 94.68 21.28 5.55 re B
297 -160.73 86.35 22.11 5.55 re B
298 -160.73 78.02 22.20 5.55 re B
299 -160.73 69.69 22.37 5.55 re B
300 -160.73 61.36 23.52 5.55 re B
301 -160.73 53.03 23.55 5.55 re B
302 -Q q
303 -Q q
304 -/sRGB CS 0.000 0.000 0.000 SCN
305 -0.75 w
306 -[] 0 d
307 -1 J
308 -1 j
309 -10.00 M
310 -160.73 50.80 319.93 418.19 re S
311 -Q q
312 -Q q
313 -Q q
314107 Q
315108 endstream
316109 endobj
317110 9 0 obj
318 -6726
 111+1517
319112 endobj
320113 3 0 obj
321114 <<
@@ -329,7 +122,7 @@
330123 4 0 obj
331124 <<
332125 /ProcSet [/PDF /Text]
333 -/Font <</F2 11 0 R >>
 126+/Font << /F1 11 0 R /F2 12 0 R >>
334127 /ExtGState << >>
335128 /ColorSpace << /sRGB 5 0 R >>
336129 >>
@@ -548,33 +341,42 @@
549342 /dieresis /.notdef /ring /cedilla /.notdef /hungarumlaut /ogonek /caron /space]
550343 >>
551344 endobj
552 -11 0 obj <<
 345+11 0 obj
 346+<<
553347 /Type /Font
554348 /Subtype /Type1
 349+/Name /F1
 350+/BaseFont /ZapfDingbats
 351+>>
 352+endobj
 353+12 0 obj <<
 354+/Type /Font
 355+/Subtype /Type1
555356 /Name /F2
556357 /BaseFont /Helvetica
557358 /Encoding 10 0 R
558359 >> endobj
559360 xref
560 -0 12
 361+0 13
561362 0000000000 65535 f
562363 0000000021 00000 n
563364 0000000164 00000 n
564 -0000007092 00000 n
565 -0000007175 00000 n
566 -0000007287 00000 n
567 -0000007320 00000 n
 365+0000001883 00000 n
 366+0000001966 00000 n
 367+0000002090 00000 n
 368+0000002123 00000 n
568369 0000000213 00000 n
569370 0000000293 00000 n
570 -0000007072 00000 n
571 -0000016856 00000 n
572 -0000017114 00000 n
 371+0000001863 00000 n
 372+0000011659 00000 n
 373+0000011917 00000 n
 374+0000012001 00000 n
573375 trailer
574376 <<
575 -/Size 12
 377+/Size 13
576378 /Info 1 0 R
577379 /Root 2 0 R
578380 >>
579381 startxref
580 -17212
 382+12099
581383 %%EOF
Index: trunk/tools/wsor/overworked/convert_reverts.py
@@ -0,0 +1,48 @@
import argparse
import ast
import os
import sys
 3+
 4+
 5+
def main(args):
    """Split tagged revert rows into the 'revert' and 'reverted' outputs.

    Each input line is ``<tag>\\t<rest>``, where ``<tag>`` is a Python string
    literal (``'revert'`` or ``'reverted'``). ``<rest>`` is written unchanged
    to the file registered for that tag, and one progress character per row
    is written to stderr (``<`` for revert, ``|`` for reverted).

    :Parameters:
        args : namespace
            must provide ``input`` (iterable of lines) and the writable
            files ``revert`` and ``reverted``

    :Raises:
        ValueError / SyntaxError if a tag is not a plain Python literal,
        KeyError if the tag is not one of the two known row types.
    """
    files = {
        'revert': args.revert,
        'reverted': args.reverted
    }
    progress = {
        'revert': "<",
        'reverted': "|"
    }

    for line in args.input:
        if not line.strip():
            # Nothing to route on a blank line.
            continue

        tag, rest = line.split("\t", 1)
        # The tag comes from a data file, so it is parsed as a literal only;
        # eval() would execute arbitrary expressions found in the input.
        ty = ast.literal_eval(tag.strip())
        files[ty].write(rest)
        sys.stderr.write(progress[ty])

    sys.stderr.write("\n")
 22+
 23+
if __name__ == "__main__":
    # Command-line entry point: every path argument is opened by argparse
    # itself, so main() receives ready-to-use file objects.
    open_for_writing = lambda fn:open(os.path.expanduser(fn), "w")

    parser = argparse.ArgumentParser(
        description='Cleans revert data from a dump map process.'
    )
    parser.add_argument(
        '-i', '--input',
        metavar="<path>",
        type=lambda fn:open(fn, "r"),
        default=sys.stdin,
        help='the path of the file to filter (defaults to stdin)'
    )
    parser.add_argument(
        '--reverted',
        metavar="<path>",
        type=open_for_writing,
        help='the path to a file to produce representing the reverted revisions'
    )
    parser.add_argument(
        '--revert',
        metavar="<path>",
        type=open_for_writing,
        help='the path to a file to produce representing the reverting revisions'
    )
    main(parser.parse_args())
 49+
Index: trunk/tools/wsor/overworked/testing.sql
@@ -1,52 +1,52 @@
22 +------------+-------------+
3 -| log_action | log_type |
 3+| log_action log_type
44 +------------+-------------+
5 -| delete | delete |
6 -| upload | upload |
7 -| protect | protect |
8 -| block | block |
9 -| unblock | block |
10 -| restore | delete |
11 -| unprotect | protect |
12 -| rights | rights |
13 -| move | move |
14 -| move_redir | move |
15 -| | |
16 -| renameuser | renameuser |
17 -| newusers | newusers |
18 -| create | newusers |
19 -| create2 | newusers |
20 -| modify | protect |
21 -| overwrite | upload |
22 -| upload | import |
23 -| patrol | patrol |
24 -| delete | suppress |
25 -| autocreate | newusers |
26 -| delete | globalauth |
27 -| whitelist | gblblock |
28 -| dwhitelist | gblblock |
29 -| move_prot | protect |
30 -| reblock | block |
31 -| event | suppress |
32 -| event | delete |
33 -| revision | delete |
34 -| revision | suppress |
35 -| reblock | suppress |
36 -| modify | abusefilter |
37 -| block | suppress |
38 -| usergroups | gblrights |
39 -| interwiki | import |
40 -| groupprms2 | gblrights |
41 -| config | stable |
42 -| approve-ia | review |
43 -| approve-a | review |
44 -| unapprove | review |
45 -| approve | review |
46 -| reset | stable |
47 -| modify | stable |
48 -| approve-i | review |
49 -| hide-afl | suppress |
50 -| unhide-afl | suppress |
 5+| delete delete
 6+| upload upload
 7+| protect protect
 8+| block block
 9+| unblock block
 10+| restore delete
 11+| unprotect protect
 12+| rights rights
 13+| move move
 14+| move_redir move
 15+|
 16+| renameuser renameuser
 17+| newusers newusers
 18+| create newusers
 19+| create2 newusers
 20+| modify protect
 21+| overwrite upload
 22+| upload import
 23+| patrol patrol
 24+| delete suppress
 25+| autocreate newusers
 26+| delete globalauth
 27+| whitelist gblblock
 28+| dwhitelist gblblock
 29+| move_prot protect
 30+| reblock block
 31+| event suppress
 32+| event delete
 33+| revision delete
 34+| revision suppress
 35+| reblock suppress
 36+| modify abusefilter
 37+| block suppress
 38+| usergroups gblrights
 39+| interwiki import
 40+| groupprms2 gblrights
 41+| config stable
 42+| approve-ia review
 43+| approve-a review
 44+| unapprove review
 45+| approve review
 46+| reset stable
 47+| modify stable
 48+| approve-i review
 49+| hide-afl suppress
 50+| unhide-afl suppress
5151 +------------+-------------+
5252 46 rows in set (2 min 28.77 sec)
5353
@@ -90,6 +90,103 @@
9191 )
9292
9393
-- Staging table: one row per reverting revision.
CREATE TABLE halfak.revert_pre_20110115 (
    revision_id   INT,  -- joined below against reverted_20110115.rvtg_id
    rvtd_to_id    INT,  -- revision that was reverted to
    revs_reverted INT
);

-- Staging table: one row per reverted revision.
CREATE TABLE halfak.reverted_pre_20110115 (
    revision_id   INT,  -- the reverted revision
    rvtg_id       INT,  -- the reverting revision
    rvtd_to_id    INT,  -- revision that was reverted to
    revs_reverted INT
);
 106+
 107+
 108+
-- One row per reverted revision, denormalised with the user and comment of
-- the reverted revision, the reverting revision (rvtg_*) and the revision
-- that was reverted to (rvtto_*).
CREATE TABLE halfak.reverted_20110115(
    revision_id    INT,
    username       VARBINARY(255),
    user_id        INT,
    comment        VARBINARY(255),
    rvtg_id        INT,
    rvtg_username  VARBINARY(255),
    rvtg_user_id   INT,
    rvtg_comment   VARBINARY(255),
    rvtto_id       INT,
    rvtto_username VARBINARY(255),
    rvtto_user_id  INT,
    rvtto_comment  VARBINARY(255),
    is_vandalism   BOOL,
    revs_reverted  INT
);
INSERT INTO halfak.reverted_20110115
SELECT
    reverted.rev_id,
    reverted.rev_user_text,
    reverted.rev_user,
    reverted.rev_comment,
    reverting.rev_id,
    reverting.rev_user_text,
    reverting.rev_user,
    reverting.rev_comment,
    reverted_to.rev_id,
    reverted_to.rev_user_text,
    reverted_to.rev_user,
    reverted_to.rev_comment,
    -- is_vandalism: the reverting edit's comment matches either the
    -- tool-generated revert summaries or the vandalism keyword patterns.
    CONVERT(reverting.rev_comment USING utf8) REGEXP "(^revert\\ to.+using)|(^reverted\\ edits\\ by.+using)|(^reverted\\ edits\\ by.+to\\ last\\ version\\ by)|(^bot\\ -\\ rv.+to\\ last\\ version\\ by)|(-assisted\\ reversion)|(^(revert(ed)?|rv).+to\\ last)|(^undo\\ revision.+by)" OR
    CONVERT(reverting.rev_comment USING utf8) REGEXP "(\\brvv)|(\\brv[/ ]v)|(vandal(!proof|bot))|(\\b(rv|rev(ert)?|rm)\\b.*(blank|spam|nonsense|porn|mass\\sdelet|vand))",
    r.revs_reverted
FROM
    halfak.reverted_pre_20110115 r
-- Each alias is joined on its own id column. Joining all three on
-- r.revision_id made the rvtg_* and rvtto_* columns copies of the reverted
-- revision and tested the wrong comment for vandalism.
INNER JOIN revision reverted
    ON r.revision_id = reverted.rev_id
INNER JOIN revision reverting
    ON r.rvtg_id = reverting.rev_id
INNER JOIN revision reverted_to
    ON r.rvtd_to_id = reverted_to.rev_id;
CREATE INDEX rev_id_idx ON halfak.reverted_20110115 (revision_id);
CREATE INDEX rvtg_id_idx ON halfak.reverted_20110115 (rvtg_id);
 152+
 153+
-- One row per reverting revision. is_vandalism is the bitwise OR of the
-- flag over every reverted revision attributed to that revert.
CREATE TABLE halfak.revert_20110115 (
    revision_id   INT,
    rvtto_id      INT,
    is_vandalism  BOOL,
    revs_reverted INT
);

INSERT INTO halfak.revert_20110115
SELECT
    rvt.revision_id,
    rvt.rvtd_to_id,
    bit_or(rvtd.is_vandalism),
    rvt.revs_reverted
FROM halfak.revert_pre_20110115 rvt
INNER JOIN halfak.reverted_20110115 rvtd
    ON rvt.revision_id = rvtd.rvtg_id
GROUP BY
    rvt.revision_id,
    rvt.rvtd_to_id,
    rvt.revs_reverted;

CREATE INDEX rev_id_idx ON halfak.revert_20110115 (revision_id);
CREATE INDEX is_vandalism ON halfak.revert_20110115 (is_vandalism);
 172+
 173+
 174+
 175+
94176 --SELECT * FROM revision WHERE rev_comment LIKE "Requesting speedy deletion%"
95177
96178
-- Per user and year (revisions before 2011): total revisions, how many of
-- them are reverts, and how many are reverts flagged as vandalism.
-- NOTE(review): this reads halfak.revert_20100130, not the revert_20110115
-- table built above -- confirm which table is intended.
SELECT
    SUBSTRING(rev_timestamp, 1,4) as year,
    rev_user as user_id,
    u.user_name as username,
    COUNT(*) as revisions,
    SUM(rvt.revision_id IS NOT NULL) as reverts,
    SUM(rvt.revision_id IS NOT NULL AND rvt.is_vandalism) as vandal_reverts
FROM revision r
-- LEFT JOIN keeps revisions that are not reverts; their rvt columns are NULL.
LEFT JOIN halfak.revert_20100130 rvt
    ON r.rev_id = rvt.revision_id
INNER JOIN user u
    ON r.rev_user = u.user_id
WHERE rev_timestamp < "20110000000000"
GROUP BY
    SUBSTRING(rev_timestamp, 1,4),
    rev_user,
    u.user_name
 193+
Index: trunk/tools/wsor/scripts/revision_meta.py
@@ -0,0 +1,114 @@
 2+import sys, subprocess, os, errno, re, argparse, logging, hashlib, types
 3+from difflib import SequenceMatcher
 4+from gl.containers import LimitedDictLists
 5+import wmf
 6+
 7+from text import STOP_WORDS, MARKUP
 8+
 9+
# Alternatives are tried left to right: word runs, wiki markup delimiters,
# newline runs, space runs, entities, quote/heading/table markup, and
# finally any other single character.
_TOKEN_RE = re.compile(
    r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|."
)


def tokenize(text):
    """Split wiki text into a list of tokens.

    :Parameters:
        text : str
            the text to split

    :Return:
        list of token strings in the order they appear in `text`
    """
    return _TOKEN_RE.findall(text)
 15+
def simpleDiff(a, b):
    """Return the tokens added and removed when going from `a` to `b`.

    :Parameters:
        a : sequence
            the old token sequence
        b : sequence
            the new token sequence

    :Return:
        ``(added, removed)`` -- two lists: items of `b` that are not matched
        in `a`, and items of `a` that are not matched in `b`.
    """
    sm = SequenceMatcher(None, a, b)
    added = []
    removed = []
    for (tag, i1, i2, j1, j2) in sm.get_opcodes():
        if tag == 'replace':
            removed.extend(a[i1:i2])
            added.extend(b[j1:j2])
        elif tag == 'delete':
            removed.extend(a[i1:i2])
        elif tag == 'insert':
            # For an insert i1 == i2, so only the j-range describes the new
            # items; slicing b with the i-range always gave an empty list.
            added.extend(b[j1:j2])

    return (added, removed)
 30+
 31+
 32+
def process(dump, page):
    """Yield revert rows for the revisions of `page`.

    A revision counts as a revert when the MD5 checksum of its text equals
    the checksum of a revision still held in the recent-revision container.
    For each revert this generator yields one row

        ('revert', reverting id, reverted-to id, is vandalism, n reverted)

    followed by one row per reverted revision

        ('reverted', reverted id, reverting id, reverted-to id,
         is vandalism, n reverted)

    :Parameters:
        dump
            not used by this function
        page
            object providing ``readRevisions()``
    """
    recent = LimitedDictLists(maxsize=15)
    # lastTokens and metaHeaders are only referenced by the disabled
    # token-statistics section further down.
    lastTokens = []
    metaHeaders = [
        'rev_id',
        'checksum',
        'tokens',
        'cs_added',
        'cs_removed',
        'ts_added',
        'ts_removed',
        'ws_added',
        'ws_removed',
        'ms_added',
        'ms_removed'
    ]
    for revision in page.readRevisions():
        checksum = hashlib.md5(revision.getText().encode("utf-8")).hexdigest()
        if checksum in recent:
            # Found a revert: the text matches a recent revision.
            target = recent[checksum]

            # Revisions newer than the reverted-to revision, newest first.
            undone = [
                rev for (_, rev) in reversed(recent.getQueue())
                if rev.getId() > target.getId()
            ]
            undoneCount = len(undone)

            vandalism = wmf.isVandalismByComment(revision.getComment())

            yield (
                'revert',
                revision.getId(),
                target.getId(),
                vandalism,
                undoneCount
            )

            for rev in undone:
                yield (
                    'reverted',
                    rev.getId(),
                    revision.getId(),
                    target.getId(),
                    vandalism,
                    undoneCount
                )

        # --- disabled: per-revision token statistics ('meta' rows) ---
        # tokens = tokenize(revision.getText())
        #
        # tokensAdded, tokensRemoved = simpleDiff(lastTokens, tokens)
        #
        # row = {
        #     'rev_id': revision.getId(),
        #     'checksum': checksum,
        #     'tokens': len(revision.getText()),
        #     'cs_added': 0,
        #     'cs_removed': 0,
        #     'ts_added': 0,
        #     'ts_removed': 0,
        #     'ws_added': 0,
        #     'ws_removed': 0,
        #     'ms_added': 0,
        #     'ms_removed': 0
        # }
        # for token in tokensAdded:
        #     row['ts_added'] += 1
        #     row['cs_added'] += len(token)
        #     if token.strip() == '': pass
        #     if token in MARKUP: row['ms_added'] += 1
        #     elif token not in STOP_WORDS: row['ws_added'] += 1
        # for token in tokensRemoved:
        #     row['ts_removed'] += 1
        #     row['cs_removed'] += len(token)
        #     if token.strip() == '': pass
        #     if token in MARKUP: row['ms_removed'] += 1
        #     elif token not in STOP_WORDS: row['ws_removed'] += 1
        #
        # yield tuple(['meta']+[row[h] for h in metaHeaders])
        #
        # lastTokens = tokens

        # NOTE(review): indentation was lost in the archived diff; this
        # insert is assumed to run for every revision, reverts included.
        recent.insert(checksum, revision)
Index: trunk/tools/wsor/ts_samples/total_talk_edits_staeiou.py
@@ -0,0 +1,359 @@
 2+#
 3+# Sample talk page postings to newbie's talk pages in various languages.
 4+#
 5+# This script is intended to be run on one of the toolserver machines.
 6+#
 7+# Run `python total_talk_edits_staeiou.py --help` for command line parameters.
 8+#
 9+import os, sys, logging, argparse, MySQLdb, datetime
 10+
def clean(v):
    """
    Encodes a value as a single TSV field.

    None becomes the two-character sequence backslash + "N" (the NULL
    marker used by MySQL dumps).  Any other value is converted with str()
    and has backslashes, tabs and newlines escaped so that the field can
    never span columns or rows.

    :Parameters:
        v
            The value to encode

    :Return:
        The escaped string
    """
    if v is None:
        # Written as "\\N": a bare "\N" only worked by accident on Python 2
        # and is a malformed unicode-name escape (SyntaxError) on Python 3.
        return "\\N"

    # Backslashes must be escaped first so the escapes added for tabs and
    # newlines are not themselves doubled.
    return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
 16+
 17+
def _endOfNewbiePeriod(firstEdit, days=30):
    """
    Returns the timestamp that lies `days` days after `firstEdit`.

    :Parameters:
        firstEdit : str
            A timestamp whose first eight characters are YYYYMMDD.  Whatever
            follows (the time of day) is carried over unchanged.
        days : int
            Length of the period in days

    :Return:
        A timestamp string in the same layout as `firstEdit`
    """
    firstDay = datetime.date(
        int(firstEdit[0:4]),
        int(firstEdit[4:6]),
        int(firstEdit[6:8])
    )
    lastDay = firstDay + datetime.timedelta(days=days)
    return lastDay.strftime("%Y%m%d") + firstEdit[8:]


def main(args):
    """
    Samples new editors and prints one TSV row per sampled editor to stdout.

    For every requested year, the editors whose first edit falls in each
    half of that year are visited in random order.  Editors whose talk page
    received no revisions during their first 30 days are skipped; a
    half-year is finished once `args.n` rows have been written for it.

    Progress marks go to stderr: ":" before each per-user query, "." for a
    row written and "s" for a skipped editor.

    :Parameters:
        args : argparse.Namespace
            Parsed command line options (n, year, host, db, cnf)
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf))
    # Two connections: `fetchConn` streams the user sample while `conn`
    # serves the per-user lookups issued inside that loop.
    conn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )
    fetchConn = MySQLdb.connect(
        host=args.host,
        db=args.db,
        read_default_file=args.cnf
    )

    #Printing headers
    print(
        "\t".join([
            'user_id',
            'username',
            'registration',
            'first_edit',
            'end of newbie',
            'last user rev_id',
            'last utalk rev_id',
            'Main edits',
            'Talk edits',
            'User edits',
            'User_talk edits',
            'Wikipedia edits',
            'Wikipedia_talk edits',
            'Image edits',
            'Image_talk edits',
            'MediaWiki edits',
            'MediaWiki_talk edits',
            'Template edits',
            'Template_talk edits',
            'Help edits',
            'Help_talk edits',
            'Category edits',
            'Category_talk edits',
            'blocks'
        ])
    )
    for year in args.year:
        # The bounds are compared as strings, so the out-of-range day/time
        # digits simply act as "before everything" / "after everything".
        for semStart, semEnd in [('000000', '069999'), ('070000', '99999')]:
            logging.info("Processing %s:%s" % (year, semStart))
            start = str(year) + semStart + "000000"
            end = str(year) + semEnd + "999999"
            count = 0
            for user in getUsers(fetchConn, start, end):
                endOfNoob = _endOfNewbiePeriod(user['first_edit'])

                LOGGING_STREAM.write(":")
                talkRevs = list(getPostsToTalkPage(
                    conn,
                    user['user_id'],
                    user['user_name'],
                    user['first_edit'],
                    endOfNoob
                ))

                #Count the user's own revisions per namespace
                LOGGING_STREAM.write(":")
                newbieRevs = {}
                for rev in getUserRevs(conn, user['user_id'], user['first_edit'], endOfNoob):
                    namespace = rev['page_namespace']
                    newbieRevs[namespace] = newbieRevs.get(namespace, 0) + 1

                LOGGING_STREAM.write(":")
                blocks = '\n'.join(
                    "%(action)s: %(comment)s - %(params)s" % b
                    for b in getBlockEvents(
                        conn, user['user_name'], user['first_edit'], endOfNoob
                    )
                )

                LOGGING_STREAM.write(":")
                userPageRev = getLastPostToUserPage(
                    conn,
                    user['user_id'],
                    user['user_name'],
                    user['first_edit'],
                    endOfNoob
                )
                if userPageRev is None:
                    userPageRevId = None
                else:
                    userPageRevId = userPageRev['rev_id']

                if not talkRevs:
                    #Nobody edited the talk page: skip this editor
                    LOGGING_STREAM.write("s")
                    continue

                row = [
                    user['user_id'],
                    user['user_name'],
                    user['user_registration'],
                    user['first_edit'],
                    endOfNoob,
                    userPageRevId,
                    talkRevs[-1]['rev_id']
                ]
                #One edit count per namespace 0..15, matching the headers
                row.extend(newbieRevs.get(ns, 0) for ns in range(16))
                row.append(blocks)

                print("\t".join(clean(v) for v in row))
                LOGGING_STREAM.write(".")
                count += 1
                if count >= args.n:
                    break

            LOGGING_STREAM.write("\n")
 161+
 162+
 163+
 164+
def getUsers(conn, start, end):
    """
    Generates, in random order, the users whose first edit lies between
    `start` and `end` (inclusive).

    The rows are streamed through a server-side cursor, so `conn` cannot be
    used for other queries until this generator is exhausted or closed.

    :Parameters:
        conn
            An open MySQLdb connection
        start : str
            Lower timestamp bound
        end : str
            Upper timestamp bound

    :Return:
        A generator of dicts keyed by column name (user_id, user_name,
        user_registration, first_edit, last_edit)
    """
    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
    try:
        cursor.execute("""
            SELECT
                u.user_id,
                u.user_name,
                u.user_registration,
                um.first_edit,
                um.last_edit
            FROM user u
            INNER JOIN halfak.user_meta um
                ON u.user_id = um.user_id
            WHERE um.first_edit BETWEEN %(start)s AND %(end)s
            ORDER BY RAND()
            """,
            {
                'start': start,
                'end': end
            }
        )
        #The column names are the same for every row: read them once
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        #Runs when the consumer stops early too, releasing the result set
        #deterministically instead of waiting for garbage collection.
        cursor.close()
 192+
 193+
 194+
 195+
def getUserRevs(conn, userId, start, end):
    """
    Generates the revisions saved by a user between `start` and `end`
    (inclusive), oldest first.

    :Parameters:
        conn
            An open database connection
        userId : int
            Identifier of the user; anything int() accepts
        start : str
            Lower timestamp bound
        end : str
            Upper timestamp bound

    :Return:
        A generator of dicts keyed by column name: every column of the
        revision table plus page_namespace
    """
    #Coerce once and bind the coerced value (the original computed it but
    #then passed the raw argument to the query).
    user_id = int(userId)
    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT
                r.*,
                p.page_namespace
            FROM revision r
            INNER JOIN page p
                ON r.rev_page = p.page_id
            WHERE rev_user = %(user_id)s
            AND rev_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY rev_timestamp ASC
            """,
            {
                'user_id': user_id,
                'start': start,
                'end': end
            }
        )
        #The column names are the same for every row: read them once
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        cursor.close()
 223+
 224+
def getBlockEvents(conn, username, start, end):
    """
    Generates the block log events that target a user between `start` and
    `end` (inclusive), oldest first.

    :Parameters:
        conn
            An open database connection
        username : str
            Name of the user, with either spaces or underscores
        start : str
            Lower timestamp bound
        end : str
            Upper timestamp bound

    :Return:
        A generator of dicts with the keys action, comment and params
    """
    #log_title holds titles in database-key form (underscores), whereas
    #user names are stored with spaces.  Without this conversion a user
    #with a space in their name never matches.
    title = username
    if isinstance(title, bytes):
        title = title.replace(b" ", b"_")
    elif hasattr(title, "replace"):
        title = title.replace(u" ", u"_")

    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT
                log_action as action,
                log_comment as comment,
                log_params as params
            FROM logging
            WHERE log_title = %(username)s
            AND log_type = "block"
            AND log_timestamp BETWEEN %(start)s AND %(end)s
            ORDER BY log_timestamp ASC
            """,
            {
                'username': title,
                'start': start,
                'end': end
            }
        )
        #The column names are the same for every row: read them once
        columns = [d[0] for d in cursor.description]
        for row in cursor:
            yield dict(zip(columns, row))
    finally:
        cursor.close()
 251+
def getLastPostToUserPage(conn, userId, username, start, end):
    """
    Looks up the most recent revision of a user's user page (namespace 2)
    saved between `start` and `end` (inclusive).

    :Parameters:
        conn
            An open database connection
        userId : int
            Identifier of the user (bound to the query but not referenced
            by it)
        username : str
            Name of the user; used as the title of the user page
        start : str
            Lower timestamp bound
        end : str
            Upper timestamp bound

    :Return:
        A dict of the revision's columns, or None when the page does not
        exist or has no revision in the range
    """
    userPageId = getPageId(conn, username, 2)
    if userPageId is None:
        #No user page at all
        return None

    bindings = {
        'page_id': userPageId,
        'user_id': userId,
        'start': start,
        'end': end
    }
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM revision
        WHERE rev_page = %(page_id)s
        AND rev_timestamp BETWEEN %(start)s AND %(end)s
        ORDER BY rev_timestamp DESC
        LIMIT 1
        """,
        bindings
    )
    #At most one row comes back; hand it over as soon as it is seen
    for values in cursor:
        names = [column[0] for column in cursor.description]
        return dict(zip(names, values))

    return None
 279+
 280+
def getPageId(conn, title, namespace):
    """
    Looks up the identifier of a page by title and namespace.

    :Parameters:
        conn
            An open database connection
        title : str
            Title of the page without its namespace prefix, with either
            spaces or underscores
        namespace : int
            Number of the namespace the page lives in

    :Return:
        The page_id of the first matching row, or None when no page matches
    """
    #page_title holds titles in database-key form (underscores).  Callers
    #pass user names, which are stored with spaces, so convert before
    #matching; titles that already use underscores are unaffected.
    if isinstance(title, bytes):
        title = title.replace(b" ", b"_")
    elif hasattr(title, "replace"):
        title = title.replace(u" ", u"_")

    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT page_id FROM page
            WHERE page_title = %(title)s
            AND page_namespace = %(namespace)s
            """,
            {
                'title': title,
                'namespace': namespace
            }
        )
        for page in cursor:
            return page[0]

        return None
    finally:
        cursor.close()
 297+
def getPostsToTalkPage(conn, userId, username, start, end):
    """
    Generates the revisions of a user's talk page (namespace 3) saved
    between `start` and `end` (inclusive), ordered by revision id.

    Generates nothing when the talk page does not exist.

    :Parameters:
        conn
            An open database connection
        userId : int
            Identifier of the user (bound to the query but not referenced
            by it)
        username : str
            Name of the user; used as the title of the talk page
        start : str
            Lower timestamp bound
        end : str
            Upper timestamp bound

    :Return:
        A generator of dicts keyed by the columns of the revision table
    """
    talkPageId = getPageId(conn, username, 3)
    if talkPageId is None:
        #No talk page at all
        return

    bindings = {
        'page_id': talkPageId,
        'user_id': userId,
        'start': start,
        'end': end
    }
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM revision
        WHERE rev_page = %(page_id)s
        AND rev_timestamp BETWEEN %(start)s AND %(end)s
        ORDER BY rev_id
        """,
        bindings
    )
    for values in cursor:
        names = [column[0] for column in cursor.description]
        yield dict(zip(names, values))
 322+
 323+
if __name__ == "__main__":
    #Command line interface: a sample size, one or more years and the
    #optional MySQL connection settings.
    argParser = argparse.ArgumentParser(
        description='Samples editors by the year they made their first edit.'
    )

    #Positional arguments
    argParser.add_argument(
        'n', type=int,
        help='the number of editors to sample from each year'
    )
    argParser.add_argument(
        'year', type=int, nargs="+",
        help='year(s) to sample from'
    )

    #Connection options
    argParser.add_argument(
        '-c', '--cnf', metavar="<path>", type=str,
        default=os.path.expanduser("~/.my.cnf"),
        help='the path to MySQL config info (defaults to ~/.my.cnf)'
    )
    argParser.add_argument(
        '-s', '--host', type=str,
        default="localhost",
        help='the database host to connect to (defaults to localhost)'
    )
    argParser.add_argument(
        '-d', '--db', type=str,
        default="enwiki",
        help='the language db to run the query in (defaults to enwiki)'
    )

    main(argParser.parse_args())
Index: trunk/tools/wsor/utilities/limited_dict_lists.py
@@ -0,0 +1,144 @@
 2+from .limited_queue import LimitedQueue
 3+
 4+__docformat__ = "restructuredtext en"
 5+
class LimitedDictLists(dict):
    """
    A limited size dictionary that remembers only the most recently
    inserted (key, value) pairs.

    Each key maps to a list of the values inserted under it, newest first.
    A LimitedQueue records the insertion order; when it is full and another
    pair is inserted, the oldest pair is thrown away and its value is
    removed from the dictionary.

    Looking up a key returns the newest value stored under it.  Iterating
    yields the remembered (key, value) pairs, oldest first, and len() is
    the number of remembered pairs.
    """

    def __init__(self, pairs=(), maxsize=None):
        """
        Constructs a new LimitedDictLists.  If `maxsize` is not specified,
        it will be assumed to be len(pairs).

        :Parameters:
            pairs : dict | iterable pairs
                A dict of (key->value) pairs or an iterator over pairs.
            maxsize : int
                The number of values that will be remembered
        """
        #Create a clean list of pairs.  Mappings are detected explicitly:
        #iterating a dict yields its keys, which would be unpacked as if
        #they were pairs.
        if hasattr(pairs, "items"):
            pairs = list(pairs.items())
        else:
            pairs = [(k, v) for (k, v) in pairs]

        #Determine how large the maxsize should be
        if maxsize is not None:
            self.__maxsize = maxsize
        else:
            self.__maxsize = len(pairs)

        #Prime the queue
        self.__queue = LimitedQueue(maxlen=self.__maxsize)

        #Load in the pairs
        for k, v in pairs:
            self.__setitem__(k, v)

    def __getitem__(self, key):
        """
        Returns the value most recently inserted under `key`.  Raises
        KeyError when nothing is remembered for it.
        """
        if dict.__contains__(self, key) and len(dict.__getitem__(self, key)) > 0:
            return dict.__getitem__(self, key)[0]
        else:
            raise KeyError(key)

    def __delitem__(self, key):
        """
        Forgets every value stored under `key`.  The pairs stay in the
        insertion queue until they are pushed out of it.  Raises KeyError
        when nothing is remembered for the key.
        """
        if dict.__contains__(self, key) and len(dict.__getitem__(self, key)) > 0:
            dict.__delitem__(self, key)
        else:
            raise KeyError(key)

    def __iter__(self):
        """
        Iterates over the remembered (key, value) pairs, oldest first.
        """
        for pair in self.__queue:
            yield pair

    def __len__(self):
        return len(self.__queue)

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return "%s(%r,%r)" % (
            self.__class__.__name__,
            self.__maxsize,
            [pair for pair in self]
        )

    def getMaxSize(self): return self.__maxsize
    def getQueue(self): return self.__queue

    def insert(self, key, value):
        """
        Inserts a new key-value pair into the dictionary.

        :Parameters:
            key : hashable
                key to reference value
            value
                value to store

        :Return:
            The value that was thrown away to make room, or None if
            nothing was removed from the dictionary.
        """
        return self.__setitem__(key, value)

    def __setitem__(self, key, value):
        """
        Inserts a new key-value pair into the dictionary.  The new pair is
        always stored; if the queue was full, the oldest pair is dropped
        first.

        :Parameters:
            key : hashable
                key to reference value
            value
                value to store

        :Return:
            The value that was thrown away to make room, or None if
            nothing was removed from the dictionary.
        """
        #The queue will return something interesting if it is full
        expectorate = self.__queue.append((key, value))

        retVal = None
        if expectorate is not None and dict.__contains__(self, expectorate[0]):
            #Something got spit out of the queue
            #Take it out of the map
            oldKey, oldValue = expectorate
            values = dict.__getitem__(self, oldKey)

            #The oldest value of a key sits at the end of its list.  It can
            #differ from the evicted one when the key was deleted and then
            #inserted again; in that case nothing is removed.
            if len(values) > 0 and values[-1] == oldValue:
                retVal = values.pop()

            if len(values) == 0:
                dict.__delitem__(self, oldKey)

        #Store the new pair, newest value first
        if dict.__contains__(self, key):
            dict.__getitem__(self, key).insert(0, value)
        else:
            dict.__setitem__(self, key, [value])

        return retVal

    def getByIndex(self, index):
        """
        Gets a value based on the order it was added where 0 is the oldest
        and len(self)-1 was most recently added.
        """
        return self.__queue.get(index)[1]
 143+
 144+
 145+
Index: trunk/tools/wsor/utilities/limited_queue.py
@@ -0,0 +1,212 @@
 2+
 3+__docformat__ = "restructuredtext en"
 4+
class LimitedQueue:
    """
    A limited size queue.  Instances of this class have a limited number
    of spaces to store items.  When the queue is full and a subsequent
    item is added, the oldest item is pushed out and returned as an
    expectorate.

    This class's internal representation is a fixed size "circular"
    buffer: no matter what is added, removed or popped, the internal list
    keeps the size it was created with.  Slots that hold no item contain
    None.  For the most part this detail can be ignored, but it may be
    important when considering performance.

    None is used to signal "nothing fell out", so None items are skipped
    when iterating.

    This should eventually be deprecated in favor of collections.deque.
    """

    def __init__(self, iterable=(), maxlen=None):
        """
        Constructor

        :Parameters:
            iterable : iterable
                An iterator of elements to load into the queue.  When there
                are more than `maxlen` elements, only the first `maxlen`
                are kept.
            maxlen : int
                The size of the queue, i.e. the perimeter of the circle.
                Defaults to the number of elements in `iterable`.
        """
        items = list(iterable)
        if maxlen is not None:
            self.__maxlen = maxlen
            items = items[0:maxlen]
        else:
            self.__maxlen = len(items)

        #Position of the oldest item within the circle
        self.__start = 0
        #Number of items currently stored (the preloaded ones count)
        self.__length = len(items)

        #Pad the circle with empty slots up to its full size
        self.__circle = items + [None] * (self.__maxlen - len(items))

    def __iter__(self):
        for i in range(0, self.__length):
            item = self.get(i)
            if item is not None:
                yield item

    def __reversed__(self):
        for i in range(self.__length-1, -1, -1):
            item = self.get(i)
            if item is not None:
                yield item

    def __len__(self):
        return self.__length

    def index(self, item, j=0, k=None):
        """
        Finds the index of an item in the queue.

        :Parameters:
            item
                The item to look for
            j : int
                Index to start searching at
            k : int
                Index to stop searching before (defaults to len(self))

        :Return:
            The index of the first item that is equal to `item`.  Raises
            ValueError if there is none.
        """
        if k is None:
            k = len(self)

        for i in range(j, k):
            if item == self.__getitem__(i):
                return i

        raise ValueError("%s not in queue" % item)

    def pop(self, index=0):
        """
        Removes an item from the queue and returns it.

        :Parameters:
            index : int
                Index of the item to remove and return

        :Return:
            The item removed.  Raises IndexError if `index` is not in the
            queue.
        """
        if index >= 0 and index < self.__length:
            value = self.__circle[self.__getInternalIndex(index)]

            if index == 0:
                #Removing the oldest item: just move the start forward
                self.__circle[self.__start] = None
                self.__start = (self.__start + 1) % self.__maxlen
            else:
                #Close the gap by moving every newer item one step back
                for i in range(index, self.__length - 1):
                    self.__circle[self.__getInternalIndex(i)] = \
                        self.__circle[self.__getInternalIndex(i + 1)]

                #Empty the slot that the newest item used to occupy
                self.__circle[self.__getInternalIndex(self.__length - 1)] = None

            #Decrement length
            self.__length -= 1

            return value
        else:
            raise IndexError("Index %s does not exist in the queue and cannot be popped." % index)

    def __getitem__(self, index):
        return self.get(index)

    def __delitem__(self, index):
        self.pop(index)

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return "%s(%r, %r)" % (
            self.__class__.__name__,
            self.__maxlen,
            [item for item in self]
        )

    def append(self, item):
        """
        Adds a new object into the queue.  The return value depends on the
        state of the queue.  If the queue is full, the value returned will
        be the object that falls off of the end of the queue.  If the queue
        is not full, None is returned.

        :Parameters:
            item
                The item to append to the queue

        :Return:
            The expectorate.  None if there was room for item or the
            oldest item in the queue if the queue was full.
        """
        if self.__maxlen <= 0:
            #A queue without any space: the item falls right off again
            return item

        if self.__length < self.__maxlen:
            #There is still room: fill the next empty slot
            self.__circle[self.__getNextInternalIndex()] = item
            self.__length += 1
            return None

        #Full: overwrite the oldest item and make its successor the oldest
        oldItem = self.__circle[self.__start]
        self.__circle[self.__start] = item
        self.__start = (self.__start + 1) % self.__maxlen

        return oldItem

    def get(self, index):
        """
        Gets a value from the queue.  This method will throw an IndexError
        if the index is not in the queue.

        :Parameters:
            index : int
                The index of the item to retrieve.  0 is the oldest item;
                negative indexes count back from the newest one.

        :Return:
            The item
        """
        if index >= self.__length*-1 and index < self.__length:
            if index < 0:
                posIndex = self.__length+index
            else:
                posIndex = index

            return self.__circle[self.__getInternalIndex(posIndex)]
        else:
            raise IndexError("Index %s out of range(%s-%s)" % (index, self.__length*-1, self.__length))

    def __getNextInternalIndex(self):
        """
        Gets the next location in self.__circle where new values should be
        stored.  When the queue is full, this is the location of the
        oldest item.
        """
        return (self.__start + self.__length) % self.__maxlen

    def __getInternalIndex(self, index):
        """
        Generates the actual internal index based off of an abstract
        external index.  Essentially, this function turns what someone
        using this class thinks is an index to the right one for
        self.__circle.

        :Parameters:
            index : int
                the index to convert

        :Return:
            Internal index
        """
        if index >= 0 and index < self.__maxlen:
            return (self.__start + index) % self.__maxlen
        else:
            raise IndexError("Index %s out of range" % index)

    def clear(self):
        """
        Resets the queue to its empty state.
        """
        self.__start = 0
        self.__length = 0
        self.__circle = [None] * self.__maxlen
 213+

Status & tagging log