Index: trunk/tools/wsor/editor_lifecycle/models.py |
— | — | @@ -1,373 +0,0 @@ |
2 | | -# coding: utf8 |
3 | | - |
4 | | -import numpy as np |
5 | | -from scipy.stats import norm, chisqprob, normaltest |
6 | | -from scipy.optimize import curve_fit |
7 | | -from scipy.special import gamma |
8 | | -from cStringIO import StringIO |
9 | | -import datetime as dt |
10 | | -#from scikits.statsmodels.api import OLS |
11 | | - |
12 | | -__all__ = ['Expon', 'PowerLaw', 'StretchedExpon' ] |
13 | | - |
class Parameter(object):
    '''
    Data descriptor for one named model parameter. Works with ParameterMixin.

    The value is kept in the owning instance's dictionary under `_<name>` and
    its uncertainty under `<name>_err`.
    '''
    def __init__(self, name, attrlist):
        '''
        Register a parameter called `name` in `attrlist`. Raises
        AttributeError if `attrlist` already holds a parameter with that name.
        '''
        if any(other.name == name for other in attrlist):
            raise AttributeError('cannot add parameter {}'.format(name))
        self.name = name  # parameter name
        attrlist.append(self)

    def __get__(self, instance, owner):
        # class-level access returns the descriptor itself
        if instance is None:
            return self
        return instance.__dict__['_' + self.name]

    def __set__(self, instance, value):
        # accept either a (value, error) pair or a bare value
        try:
            val, err = value
        except TypeError:
            val, err = value, None
        store = instance.__dict__
        store['_' + self.name] = val
        store[self.name + '_err'] = err

    def __repr__(self):
        return '<Parameter {} at 0x{:x}>'.format(self.name, id(self))
37 | | - |
class ParameterMixin(object):
    '''
    Mixin giving access to all Parameter instances listed in __params__
    '''
    def itererrors(self):
        '''
        Returns an iterator over the `<name>_err` attribute of each parameter
        '''
        return (getattr(self, par.name + '_err') for par in self.__params__)

    def errors(self):
        '''
        Returns a list with the `<name>_err` attribute of each parameter
        '''
        return list(self.itererrors())

    def iterparams(self):
        '''
        Returns an iterator over the values of all parameters of this model
        '''
        return (getattr(self, par.name) for par in self.__params__)

    def params(self):
        '''
        Returns a list with the values of all parameters of this model
        '''
        return list(self.iterparams())

    def setparams(self, *args):
        '''
        Assigns *args, in order, to the parameters of this model whose value is
        None. Parameters that already have a value are *NOT* modified.
        '''
        unset = [par for par in self.__params__
                 if getattr(self, par.name) is None]
        for par, arg in zip(unset, args):
            setattr(self, par.name, arg)
66 | | - |
67 | | -def _orNA(val, fmt='%8.5g'): |
68 | | - if val is not None: |
69 | | - return fmt % val |
70 | | - else: |
71 | | - return 'N/A' |
72 | | - |
class ParametricModel(ParameterMixin):
    '''
    Callable class with Parameter descriptors. Subclasses of ParametricModel
    ought define, as class attributes, any number of Parameter descriptors at the
    class level, together with a list (conventional name: `__params__'). See
    Parameter.__init__ on how to instantiate a Parameter descriptor.

    Subclassess ought also define two static methods: `func' and `init', and a
    `name' attribute (used by `summary'). The first is the actual function that
    accepts an argument together with the same number of parameters as in
    __params__. The second is used to get initial estimates for the
    least-squares minimizer used to fit this model.

    From that point on, any instance of this class acts as the function `func'
    itself, with the only difference that it automatically performs partial
    application for those Parameter attributes that have been assigned a value.
    Example:

        # expon.func(x, A, B) is A * exp(B * x)
        expon(1, 2, -1)     # 0.73575888234288467
        expon.A = 2
        expon(1, -1)        # 0.73575888234288467
    '''
    def __init__(self, *args, **kwargs):
        '''
        Positional arguments are assigned to the leading parameters, in the
        order of __params__, and override keyword arguments with the same
        name. Parameters given neither way are set to None (i.e. free).
        '''
        keys = [p.name for p in self.__params__]
        for k in keys:
            kwargs.setdefault(k, None)
        kwargs.update(zip(keys, args))
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.goftest = (None,) * 3
        self.residtest = (None,) * 2
        self.Rsquared = None

    def __call__(self, x, *args):
        '''
        Evaluates `func' at x. *args fill, in order, the parameters that are
        currently None. Raises TypeError if more values are passed than there
        are free parameters.
        '''
        fargs = self.params()
        # A parameter is fixed when it is not None; testing truthiness instead
        # would treat a parameter fixed to 0 as free.
        free = [i for i, a in enumerate(fargs) if a is None]
        nfixed = len(fargs) - len(free)
        if nfixed + len(args) > len(fargs):
            raise TypeError('{} accepts only {} '
                    'parameters'.format(self.__class__.__name__, len(fargs)))
        for i, a in zip(free, args):
            fargs[i] = a
        return self.func(x, *fargs)

    def fit(self, x, y, ye, **kwargs):
        '''
        Fits this parametric model to observations (x_i, y_i). Uncertainty in
        the y-estimates can be specified with argument `ye'. Only parameters
        that are currently None are estimated. Additional keyword arguments
        are passed to scipy.optimize.curve_fit.

        Returns whatever curve_fit returns.
        '''
        # initial guesses, restricted to the free parameters
        p0 = tuple(guess for value, guess in zip(self.params(), self.init(x, y))
                   if value is None)
        return curve_fit(self, x, y, sigma=ye, p0=p0, **kwargs)

    def gof(self, x, y, ye):
        '''
        Computes GoF test statistics and other diagnostic tests

        Returns:
        --------
        - GoF test: Chi^2, p-value, and ddof
        - Normality of residuals: result of scipy.stats.normaltest
        - Coefficient of determination (R^2)
        '''
        resid = y - self(x)
        chisq = np.sum((resid / ye) ** 2)
        # number of estimated parameters = those with an associated error
        nest = sum(1 for e in self.errors() if e is not None)
        ddof = len(x) - nest
        chisq_pvalue = chisqprob(chisq, ddof)
        SStot = np.sum((y - y.mean()) ** 2)
        SSerr = np.sum(resid ** 2)
        Rsquared = 1.0 - SSerr / SStot
        # NOTE: a White-style homoscedasticity test based on OLS used to be
        # sketched here; it only applies to linear regressions and was never
        # enabled.
        return (chisq, chisq_pvalue, ddof), normaltest(resid), Rsquared

    def __str__(self):
        # free parameters are shown as '*'
        shown = ['*' if p is None else '%3.4g' % p for p in self.params()]
        return '{}({})'.format(self.__class__.__name__, ','.join(shown))

    def __repr__(self):
        return '<{} object at 0x{}>'.format(str(self), '%x' % id(self))

    def summary(self, **kwargs):
        '''
        Returns a summary of this model as a string. Keyword arguments are
        added to the "General information" section as `key: value' lines.
        '''
        out = [
            '',
            'General information',
            '-------------------',
            'model: %s' % self.name.capitalize(),
            'date: %s' % dt.datetime.now(),
        ]
        for item in kwargs.items():
            out.append('{}: {}'.format(*map(str, item)))
        out.extend(['', 'Model parameters', '----------------'])
        for par, val, err in zip(self.__params__, self.params(), self.errors()):
            out.append('{}: {} ± {}'.format(par.name, _orNA(val), _orNA(err)))
        chi, chi_p, ddof = self.goftest
        out.extend(['', 'Fit results', '-----------'])
        out.append('Goodness-of-fit: Chi-squared = {}, p = {}, ddof = {}'.format(
            _orNA(chi, '%5.2f'), _orNA(chi_p, '%8.4e'), _orNA(ddof, '%d')))
        D, D_p = self.residtest
        out.append('Normality of residuals: K-squared = {}, p = {}'.format(
            _orNA(D, '%5.2f'), _orNA(D_p, '%8.4e')))
        out.append('Coefficient of Determination: {}'.format(
            _orNA(self.Rsquared, '%5.2f')))
        return '\n'.join(out) + '\n'
206 | | - |
class Expon(ParametricModel):
    '''
    Exponential decay with offset:

        y = A * exp( -(x / B)) + C
    '''
    __params__ = []
    A = Parameter('A', __params__)
    B = Parameter('B', __params__)
    C = Parameter('C', __params__)
    name = 'exponential'

    @staticmethod
    def func(x, a, b, c):
        return c + a * np.exp(-(x / b))

    @staticmethod
    def init(x, y):
        '''
        Initial guesses (A, B, C): the y value at the x closest to 0, one tenth
        of the largest x, and the smallest y.
        '''
        nearest_zero = np.argmin(np.abs(x))
        return (y[nearest_zero], x.max() / 10.0, y.min())

    def fit(self, x, y, ye, **kwargs):
        '''
        Same as ParametricModel.fit. With keyword `constrained' set to a true
        value, A is first fixed to the y value at the x closest to 0.
        '''
        constrained = kwargs.pop('constrained', 0)
        if constrained:
            self.A = y[np.argmin(np.abs(x))]
        return super(Expon, self).fit(x, y, ye, **kwargs)
229 | | - |
class StretchedExpon(ParametricModel):
    '''
    Stretched exponential:

        y = A * exp (-(t / tau) ** beta)
    '''
    __params__ = []
    A = Parameter('A', __params__)
    tau = Parameter('tau', __params__)
    beta = Parameter('beta', __params__)
    name = 'stretched exponential'

    @staticmethod
    def func(x, a, tau, beta):
        return a * np.exp(- (x / tau) ** beta)

    @staticmethod
    def init(x, y):
        '''
        Initial guesses (A, tau, beta): the y value at the x closest to 0, one
        tenth of the largest x, and 0.5.
        '''
        a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
        tau0 = x.max() / 10.0
        return (a0, tau0, 0.5)

    def fit(self, x, y, ye, **kwargs):
        '''
        Same as ParametricModel.fit. With keyword `constrained' set to a true
        value, A is first fixed to the y value at the x closest to 0.
        '''
        if kwargs.pop('constrained', 0):
            self.A = y[np.argmin(np.abs(x))]
        return super(StretchedExpon, self).fit(x, y, ye, **kwargs)

    def summary(self, **kwargs):
        '''
        Summary of this model, with the mean relaxation time added to the
        general information. The time is reported as N/A while tau or beta
        are still unset, like the other quantities in the summary.
        '''
        if self.tau is None or self.beta is None:
            kwargs['Mean relaxation time <tau>'] = 'N/A'
        else:
            mrt = self.mrt(self.tau, self.beta)
            kwargs['Mean relaxation time <tau>'] = '%5.2f days' % mrt
        return super(StretchedExpon, self).summary(**kwargs)

    def mrt(self, tau, beta):
        '''
        Returns (tau / beta) * Gamma(1 / beta)
        '''
        return (tau / beta) * gamma(beta ** -1)
257 | | - |
class PowerLaw(ParametricModel):
    '''
    Power law:

        y = A * x ** B

    Fitted by weighted linear least squares on log-transformed data.
    '''
    __params__ = []
    A = Parameter('A', __params__)
    B = Parameter('B', __params__)
    name = 'power-law'

    @staticmethod
    def func(x, a, b):
        return a * x ** b

    @staticmethod
    def init(x, y):
        # np.ptp() works on every NumPy version; the ndarray.ptp() method was
        # removed in NumPy 2.0
        return (1, np.ptp(y) / np.ptp(x))

    # NOTE: an alternative formulation of the linear fit (via the auxiliary
    # variable t_i, said to be more robust against roundoff errors) was tried
    # here but did not work and was left disabled.

    def fit(self, x, y, ye, **kwargs):
        '''
        Fit by linear least squares of log-transformed data. Keyword arguments
        are accepted for signature compatibility and ignored.

        Returns (a, b), pcov where pcov is a 2x2 array.
        NOTE(review): pcov[0, 0] is Sxx / Delta, which looks like the variance
        of the intercept log(a) rather than of a itself -- confirm that
        callers expect this.
        '''
        x, y, ye = self._removezeros(x, y, ye)
        w = (ye / y) ** -2      # weights: inverse variance of log(y)
        lx = np.log(x)
        ly = np.log(y)
        S = np.sum(w)
        Sx = np.sum(lx * w)
        Sy = np.sum(ly * w)
        Sxx = np.sum(lx ** 2 * w)
        Sxy = np.sum(lx * ly * w)
        Delta = S * Sxx - Sx ** 2
        a = np.exp((Sxx * Sy - Sx * Sxy) / Delta)
        b = (S * Sxy - Sx * Sy) / Delta
        a_var = Sxx / Delta
        b_var = S / Delta
        ab_covar = - Sx / Delta
        pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
        return (a, b), pcov

    def gof(self, x, y, ye):
        '''
        GoF of linear least squares of log-transformed data

        Returns (Chi^2, p-value, ddof), the result of scipy.stats.normaltest
        on the residuals, and R^2, all computed in log-log space.
        '''
        x, y, ye = self._removezeros(x, y, ye)
        ye = (ye / y)
        x = np.log(x)
        y = np.log(y)
        yp = np.log(self.A) + self.B * x
        resid = y - yp
        chisq = np.sum((resid / ye) ** 2)
        # number of estimated parameters = those with an associated error;
        # count by identity so an error of exactly 0 is not dropped
        nest = sum(1 for e in self.errors() if e is not None)
        ddof = len(x) - nest
        # NOTE(review): the statistic is halved before computing the p-value;
        # kept as in the original, reason not visible here.
        chisq_pvalue = chisqprob(chisq / 2., ddof)
        SStot = np.sum((y - y.mean()) ** 2)
        SSerr = np.sum(resid ** 2)
        Rsquared = 1.0 - SSerr / SStot
        return (chisq, chisq_pvalue, ddof), normaltest(resid), Rsquared

    @staticmethod
    def _removezeros(x, y, ye):
        '''
        Keeps only the observations that can be log-transformed, i.e. those
        with x > 0 and y > 0 (y <= 0 would also break the ye / y weights).
        '''
        idx = (x > 0) & (y > 0)
        return x[idx], y[idx], ye[idx]
334 | | - |
if __name__ == '__main__':

    # Demo: fit a stretched exponential to synthetic noisy data and plot it.

    import matplotlib.pyplot as pp
    import scale  # registers the 'power' axis scale used below

    model = StretchedExpon()

    a = 2
    tau = 100
    beta = .5
    s = 0.1
    xmax = 1000
    x = np.linspace(0, xmax, 50)
    # StretchedExpon has exactly three parameters (A, tau, beta); passing a
    # fourth one makes the model raise TypeError
    y = model(x, a, tau, beta) + np.random.randn(len(x)) * s
    ye = np.ones(len(x)) * s  # one uncertainty per observation

    pest, pcov = model.fit(x, y, ye)

    model.setparams(*zip(pest, np.sqrt(np.diag(pcov))))

    xx = np.linspace(0, xmax, 1000)
    yy = model(xx)

    pp.errorbar(x, y, ye, fmt='. ', color='k', ecolor='none', label='data')
    pp.plot(xx, yy, 'r-', label='Stretch. Exp. fit')
    pp.xscale('power', exponent=beta)
    pp.yscale('log')

    pp.legend()
    gof, resid, Rsquared = model.gof(x, y, ye)
    model.goftest = gof
    model.residtest = resid
    model.Rsquared = Rsquared
    print(model.summary())
    chi, p, ddof = gof
    pp.text(200, 1, r'$\chi^2 = %.2f,\, p-{\rm value} = %5.2g,\,'
            r'{\rm ddof} = %d,\, R^2 = %.2f$'
            % (chi, p, ddof, Rsquared),
            fontsize=16)
    pp.show()
Index: trunk/tools/wsor/editor_lifecycle/MANIFEST.in |
— | — | @@ -1,3 +0,0 @@ |
2 | | -include *.py |
3 | | -include *.sh |
4 | | -include db.cfg |
Index: trunk/tools/wsor/editor_lifecycle/relax |
— | — | @@ -1,49 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -#:vim:ft=python |
4 | | - |
5 | | -''' batch model fitting ''' |
6 | | - |
7 | | -import re |
8 | | -import os |
9 | | -import sys |
10 | | -import numpy as np |
11 | | -from argparse import ArgumentParser |
12 | | -from models import StretchedExpon |
13 | | -from datetime import datetime |
14 | | - |
__prog__ = os.path.basename(os.path.abspath(__file__))

parser = ArgumentParser(description=__doc__)
parser.add_argument('data', nargs='+')

ns = parser.parse_args()

output = []

# XXX format should not be fixed to 2010-1 !!
# TODO should use the len file

# The cohort date is the part of the file name before the first underscore
# that is followed by at least one more character.
_key_pattern = re.compile(r'(.*?)_.+\.?.*')

for d in ns.data:
    # skip missing files before trying to make sense of their name
    if not os.path.exists(d):
        continue
    # match on the base name so that directories in the path do not end up
    # in the date key
    m = _key_pattern.match(os.path.basename(d))
    if m is None:
        parser.error('cannot determine cohort date from file name: %s' % d)
    t = datetime.strptime(m.group(1), '%Y-%m')
    x, y, ye = np.loadtxt(d, unpack=1)
    model = StretchedExpon()
    model.A = y[0]  # A is fixed to the first observation; only tau, beta are fit
    idx = ye > 0
    x = x[idx]
    y = y[idx]
    ye = ye[idx]
    if len(x) > 10: # st. dev. 0 means only 1 user
        pest, pcov = model.fit(x, y, ye, maxfev=100000, warning=False)
        perr = np.sqrt(np.diag(pcov)) / 2.
        model.setparams(*zip(pest, perr))
        mrt = model.mrt(model.tau, model.beta)
    else:
        # too few usable points: no estimate for this cohort
        mrt = np.nan
    output.append((t, mrt))

output = np.asarray(output, dtype=np.dtype([('date', object), ('mrt', np.double)]))
np.save('mrt.npy', output)
print('output saved to mrt.npy')
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh |
— | — | @@ -1,43 +0,0 @@ |
#!/bin/bash

# Applies the `fitting' script to a batch of files
#
# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
#
# USAGE: fitting_batch.sh file1 file2 file3 ...
#
# This will produce the normal console output that fitting produces; PDF plots
# will be stored in file fit.pdf. The script refuses to run if fit.pdf already
# exists. Per-file plots are named after the base name of each input file, so
# inputs from different directories sharing a base name overwrite each other.

if [[ -z $(type -p fitting) ]] ; then
    echo 'error: could not find fitting script. Check your PATH'
    exit 1
fi

if [[ -e fit.pdf ]] ; then
    echo 'error: cannot overwrite file fit.pdf'
    exit 1
fi

O=$(mktemp -d) || exit 1
models="expon powerlaw stretchedexp"

# iterate over "$@" directly so that file names with spaces survive
for file in "$@" ; do
    # the temporary directory is flat: drop any directory part of the input
    base=$(basename "$file")
    for model in $models ; do
        fitting "$model" -force -loglog -batch "$file" -o "$O/${base%.*}_$model.pdf"
        echo
        echo
    done
done

# glob expansion is already sorted
pdfs=( "$O"/*.pdf )

gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH "${pdfs[@]}" &>/dev/null

if [[ $? = 0 ]] ; then
    echo 'images saved in fit.pdf'
    # the individual plots are only kept when merging failed
    rm -rf "$O"
else
    echo "error: problem saving fit.pdf. Individual image files in $O"
fi
Index: trunk/tools/wsor/editor_lifecycle/scale.py |
— | — | @@ -1,68 +0,0 @@ |
2 | | -from matplotlib.scale import ScaleBase, register_scale |
3 | | -from matplotlib.transforms import Transform, nonsingular |
4 | | -from matplotlib.ticker import LinearLocator, Formatter |
5 | | -from math import ceil, floor |
6 | | -import numpy as np |
7 | | - |
class PowerScale(ScaleBase):
    '''
    Axis scale named 'power', built on PowerTransform. Requires a positive
    `exponent' keyword argument.
    '''
    name = 'power'

    def __init__(self, axis, **kwargs):
        ScaleBase.__init__(self)
        value = kwargs.pop('exponent')
        if value <= 0:
            raise ValueError('exponent must be positive')
        self.exponent = value

    def get_transform(self):
        return PowerTransform(self.exponent)

    def set_default_locators_and_formatters(self, axis):
        # major ticks use the power locator; major and minor labels share the
        # same kind of formatter
        power = self.exponent
        axis.set_major_locator(PowerLocator(power))
        axis.set_major_formatter(PowerFormatter(power))
        axis.set_minor_formatter(PowerFormatter(power))
22 | | - |
class PowerLocator(LinearLocator):
    '''
    Tick locator placing 5 ticks evenly spaced in x ** exponent
    '''
    def __init__(self, exponent, **kwargs):
        LinearLocator.__init__(self, **kwargs)
        self.exponent = exponent
        self.numticks = 5

    def __call__(self):
        lo, hi = nonsingular(*self.axis.get_view_interval(), expander=0.05)
        # work in the transformed space, where ticks are equally spaced
        lo, hi = lo ** self.exponent, hi ** self.exponent
        if hi < lo:
            lo, hi = hi, lo

        spaced = np.linspace(lo, hi, num=self.numticks, endpoint=True)
        # map the tick positions back with the inverse power
        return self.raise_if_exceeds(spaced ** (1.0 / self.exponent))
38 | | - |
class PowerFormatter(Formatter):
    '''
    Tick formatter that labels a tick with x ** (1 / exponent), using two
    significant digits.
    '''
    def __init__(self, exponent):
        self.exponent = exponent

    def __call__(self, x, pos=None):
        inverse = 1.0 / self.exponent
        return u'%.2g' % (x ** inverse)
44 | | - |
class PowerTransform(Transform):
    '''
    One-dimensional transform a -> a ** exponent. Its inverse is the same
    transform with exponent 1 / exponent.
    '''
    input_dims = 1
    output_dims = 1
    is_separable = True

    def __init__(self, exponent):
        Transform.__init__(self)
        self.exponent = exponent

    def transform(self, a):
        return a ** self.exponent

    def inverted(self):
        reciprocal = 1.0 / self.exponent
        return PowerTransform(reciprocal)
56 | | - |
# make the scale available as xscale('power', ...) / yscale('power', ...)
register_scale(PowerScale)

if __name__ == '__main__':
    # Demo: a stretched exponential plotted on power (x) and log (y) axes.
    from pylab import plot, show, xscale, yscale
    tau = 20
    beta = 0.5
    x = np.linspace(0, 100, num=10)
    y = np.exp(-(x / tau) ** beta)
    plot(x, y, 'o ', mfc='none', mew=2)
    xscale('power', exponent=beta)
    yscale('log', basey=10)
    show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle |
— | — | @@ -1,158 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -#:vim:ts=python: |
4 | | - |
5 | | -''' compute editor lifecycle ''' |
6 | | - |
7 | | -import re |
8 | | -import os |
9 | | -from argparse import ArgumentParser |
10 | | -import numpy as np |
11 | | -from collections import deque |
12 | | -import datetime as dt |
13 | | - |
14 | | -__prog__ = os.path.basename(os.path.abspath(__file__)) |
15 | | - |
def estimaterate(edits, step):
    '''
    This function takes the daily edit history of an individual editor, and a
    step parameter; it estimates the daily activity of the editor. It returns
    the daily rates every `step' days.

    `edits' is a 1-d array with one entry per day. If its length is not a
    multiple of `step', it is padded with zeros, so the last rate is still
    averaged over `step' days.
    '''
    N = len(edits)
    leftover = N % step
    if leftover:
        # Pad up to the next multiple of `step'. The length is computed with
        # integer arithmetic because array shapes must be integers.
        padded = np.zeros((N + step - leftover,), dtype=edits.dtype)
        padded[:N] = edits
        edits = padded
    return edits.reshape((-1, step)).sum(axis=-1) / float(step)
29 | | - |
def itercycles(npzarchive, every, users=None):
    '''
    Iterates over the archive, or over the given list of users, and yields for
    each one a two-column array: days since the first recorded day (sampled
    every `every' entries) and the estimated activity rate (see
    estimaterate()).
    '''
    uids = users or npzarchive.files
    for uid in uids:
        days, edits = npzarchive[uid].T
        elapsed = days - days.min()
        yield np.c_[elapsed[::every], estimaterate(edits, every)]
40 | | - |
def averagecycle(ratesbyday):
    '''
    Computes average cycle with standard errors. Takes in input a dictionary
    returned by groupbyday(). Returns an array with one row per day, in
    increasing order: day, mean rate, standard error of the mean.
    '''
    rows = []
    for day in sorted(ratesbyday):
        sample = ratesbyday[day]
        stderr = np.std(sample) / np.sqrt(len(sample))
        rows.append((day, np.mean(sample), stderr))
    return np.asarray(rows)
53 | | - |
def groupbyday(npzarchive, every, users=None):
    '''
    This function estimates editors' activity rates and groups rate estimates by
    number of days elapsed since the first day on record for each editor (which
    corresponds to time = 0). Returns a dictionary mapping days to a deque of
    rates.
    '''
    grouped = {}
    for cycle in itercycles(npzarchive, every, users):
        for day, rate in cycle:
            if day not in grouped:
                grouped[day] = deque()
            grouped[day].append(rate)
    return grouped
67 | | - |
def lifetimes(npzarchive, users=None):
    '''
    Returns the distribution of account lifetimes over an archive. Can take an
    optional list of user ids to restrict the sample to a specific group of
    editors. The lifetime of an account is the span (max - min) of its days
    column.
    '''
    lt = deque()
    for uid in (users or npzarchive.files):
        days, edits = npzarchive[uid].T
        # np.ptp() works on every NumPy version; the ndarray.ptp() method was
        # removed in NumPy 2.0
        lt.append(np.ptp(days))
    return np.asarray(lt)
79 | | - |
def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity):
    '''
    Returns a deque with the ids of the users in the archive that

    - have a history spanning more than `inactivity' days,
    - made their last recorded edit more than `inactivity' days ago, and
    - have a yearly activity rate strictly between `minimum_activity' and
      `maximum_activity'. The rate is total edits divided by the history span
      minus `inactivity', times 365.

    The days column is compared against today's date expressed in days since
    1970-01-01.
    '''
    now = dt.datetime.now().toordinal()
    epoch = dt.datetime(1970, 1, 1).toordinal()
    unix_now = now - epoch
    inactives = deque()
    for uid in npzarchive.files:
        days, edits = npzarchive[uid].T
        # np.ptp() works on every NumPy version; the ndarray.ptp() method was
        # removed in NumPy 2.0
        span = np.ptp(days)
        if span <= inactivity:
            continue
        unix_last = days[-1]
        if (unix_now - unix_last) > inactivity:
            tot_edits = float(edits.sum())
            tot_days = float(span - inactivity)
            activity = tot_edits / tot_days * 365.0
            if minimum_activity < activity < maximum_activity:
                inactives.append(uid)
    return inactives
97 | | - |
# Command line: data minact maxact [-key KEY] [-every NUM] [-inactivity NUM] [-all]
parser = ArgumentParser(description=__doc__)
parser.add_argument('data_file', metavar='data')
for _dest, _label in (('minimum_activity', 'minact'),
                      ('maximum_activity', 'maxact')):
    parser.add_argument(dest=_dest, metavar=_label, type=int)
parser.add_argument('-key')
for _flag, _days in (('-every', 30), ('-inactivity', 180)):
    parser.add_argument(_flag, type=int, default=_days, metavar='NUM',
                        help='default: %(default)d days')
parser.add_argument('-all', dest='dump_all', action='store_true')
108 | | - |
109 | | - |
def main(ns):
    '''
    Runs the tool on the parsed command line `ns'.

    With -all, the per-user cycles are saved to <key>_cycles.npz. Otherwise the
    lifetime distribution, the sample sizes and the average cycles (all users,
    inactive users) are saved to <key>_lt.tsv, <key>_len.tsv, <key>_all.tsv
    and <key>_inact.tsv. If no key is given it is taken from the data file
    name, which must then end in .npz.
    '''
    import sys  # not imported at module level

    if ns.key is None:
        m = re.match(r'(.*?)\.npz', ns.data_file, re.I)
        if m is None:
            sys.stderr.write('%s: cannot determine key from file name: %s\n'
                    % (__prog__, ns.data_file))
            sys.exit(1)
        ns.key = m.group(1)
    if ns.minimum_activity >= ns.maximum_activity:
        sys.stderr.write('%s: error: minact >= maxact\n' % __prog__)
        sys.exit(1)

    # load data
    npzarchive = np.load(ns.data_file)

    if ns.dump_all:
        # same naming scheme as the .tsv outputs below
        fn = '%s_%s.%s' % (ns.key, 'cycles', 'npz')
        keys = npzarchive.files
        tmp = dict(zip(keys, itercycles(npzarchive, ns.every)))
        np.savez(fn, **tmp)
        print('%s: output saved to %s' % (__prog__, fn))
    else:
        # compute lifetime distribution
        lt = lifetimes(npzarchive)

        # compute inactive subgroups
        inactive_users = find_inactives(npzarchive, ns.inactivity,
                ns.minimum_activity, ns.maximum_activity)

        ratesbyday = groupbyday(npzarchive, ns.every)
        ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)

        avg_all = averagecycle(ratesbyday)
        avg_inact = averagecycle(ratesbyday_inact)

        lens = [ len(npzarchive.files), len(inactive_users) ]

        names = ['lt', 'len', 'all', 'inact' ]
        arrs = [ lt, lens, avg_all, avg_inact ]

        for n, a in zip(names, arrs):
            fn = '%s_%s.%s' % (ns.key, n, 'tsv')
            np.savetxt(fn, a)
            print('%s: output saved to %s' % (__prog__, fn))

if __name__ == '__main__':
    ns = parser.parse_args()
    main(ns)
Index: trunk/tools/wsor/editor_lifecycle/graphlife |
— | — | @@ -1,85 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | - |
4 | | -''' plot editor life cycle ''' |
5 | | - |
6 | | -import sys |
7 | | -import numpy as np |
8 | | -from argparse import ArgumentParser |
9 | | -import os |
10 | | - |
__prog__ = os.path.basename(os.path.abspath(__file__))

parser = ArgumentParser(description=__doc__)
parser.add_argument('data_files', metavar='data', nargs='+')
parser.add_argument('-l', '--label', metavar='TEXT', action='append',
        dest='labels_list', default=[])
parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
parser.add_argument('-batch', action='store_true', help='uses PDF backend')
parser.add_argument('-title')
parser.add_argument('-fmt', default='pdf', help='default: %(default)s')

if __name__ == '__main__':
    ns = parser.parse_args()

    # checks
    if len(ns.data_files) != len(ns.labels_list):
        sys.stderr.write('%s: error: please provide as many labels '
                'as lines\n' % __prog__)
        sys.exit(1)

    # import pyplot, make lists of colors and markers
    if ns.batch:
        # the backend must be selected before pyplot is imported
        import matplotlib
        matplotlib.use('PDF')
    import matplotlib.pyplot as pp
    from matplotlib.lines import lineMarkers
    # keep the string-keyed markers that actually draw something; strings are
    # compared by value, not by identity
    markers = [k for k, v in lineMarkers.items()
            if isinstance(k, str) and v != '_draw_nothing']
    colors = 'krbgm'

    # create figure and axes
    fig = pp.figure()
    ax = pp.axes([.1, .1, .85, .8])

    # add lines; colors and markers are recycled when there are more files
    # than available styles
    for i, (data_file, label) in enumerate(zip(ns.data_files, ns.labels_list)):
        color = colors[i % len(colors)]
        marker = markers[i % len(markers)]
        x, y, ye = np.loadtxt(data_file, unpack=1)
        ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
                mec=color, ls=':', label=label)

    ax.legend(loc=2)
    ax.set_xlabel('days since registration')
    ax.set_ylabel('edits/day')
    if ns.title is not None:
        ax.set_title(ns.title)
    ax.axis('tight')

    # plot hist of lifetimes in inset axes (only when -inset is given)
    if ns.inset_data_file is not None:
        lt = np.loadtxt(ns.inset_data_file)
        inax = pp.axes([.55, .6, .35, .25], axisbg='none')
        inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
        for l in inax.xaxis.get_ticklabels():
            l.set_rotation(30)
            l.set_fontsize('x-small')
        for l in inax.yaxis.get_ticklabels():
            l.set_fontsize('x-small')
        inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
        inax.set_ylabel('no. of users older\n more than $x$ days', fontsize='small')
        inax.set_title('account lifetime')
        inax.axis('tight')

    pp.draw()
    if ns.title is not None:
        fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
    else:
        fn = 'output.' + ns.fmt

    # savefig's keyword for the file type is `format'
    pp.savefig(fn, format=ns.fmt)
    print('output saved to %s' % fn)
    pp.show()
Index: trunk/tools/wsor/editor_lifecycle/README.rst |
— | — | @@ -1,34 +0,0 @@ |
2 | | -============ |
3 | | -README |
4 | | -============ |
5 | | - |
6 | | -workflow |
7 | | - |
8 | | -This package is a collection of python and shell scripts that can assist |
9 | | -creating and analyzing data on user life cycle. |
10 | | - |
11 | | -Sample selection |
12 | | - |
13 | | -TBD |
14 | | - |
15 | | -Edit activity data collection |
16 | | - |
17 | | -First use `fetchrates` to download the rate data from the MySQL database. This |
18 | | -script takes a user_id as input and stores the rate data in a file called |
19 | | -<user_id>.npy. It can be run in parallel, one invocation per user_id. At the |
20 | | -end you will have one NPY file per user. |
21 | | - |
22 | | -Cohort selection |
23 | | - |
24 | | -See the docstring in `mkcohort`. |
25 | | - |
26 | | -Cohort analysis |
27 | | - |
28 | | -See `graphlife`, `fitting`, `fitting_batch.sh`, and `relax`. |
29 | | - |
Index: trunk/tools/wsor/editor_lifecycle/fitting |
— | — | @@ -1,125 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# coding: utf-8 |
4 | | -# :vim:ft=python |
5 | | - |
6 | | -''' editor lifecycle data fitting tool ''' |
7 | | - |
8 | | -import sys |
9 | | -import os |
10 | | -from functools import partial |
11 | | -import numpy as np |
12 | | -from argparse import ArgumentParser |
13 | | -from scipy.optimize import curve_fit |
14 | | - |
15 | | -from models import Expon, PowerLaw, StretchedExpon |
16 | | -import scale |
17 | | - |
# name of this script, used as a prefix in the messages it prints
__prog__ = os.path.basename(os.path.abspath(__file__))

# value passed as `maxfev` to the model's fit method
_maxfev = 10000

# options shared by all model sub-commands (no -h here: each sub-parser that
# inherits from this one adds its own)
parent = ArgumentParser(add_help=False)
parent.add_argument('data_file', metavar='DATA')
parent.add_argument('-output', dest='output_file', metavar='FILE')
parent.add_argument('-title')
# the two axis-scaling flags cannot be given together
group = parent.add_mutually_exclusive_group()
for _scale_flag in ('-loglog', '-loglin'):
    group.add_argument(_scale_flag, action='store_true')
parent.add_argument('-constrained', action='store_true')
parent.add_argument('-batch', action='store_true', help='do not show graphics')
parent.add_argument('-force', action='store_true', help='force overwrite')

parser = ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(help='Parametric models supported')


def _add_model_parser(command, modelclass):
    ''' registers a sub-command that selects `modelclass` for the fit '''
    sub = subparsers.add_parser(command, parents=[parent])
    sub.set_defaults(modelclass=modelclass)
    return sub


parser_expon = _add_model_parser('expon', Expon)
parser_stretch = _add_model_parser('stretchedexp', StretchedExpon)
parser_power = _add_model_parser('powerlaw', PowerLaw)
def plotfit(model, x, y, ye, data=None):
    '''
    Draws the data points together with the fitted curve and then, in a second
    figure, the fit residuals.

    The axis scaling and the title are read from the module-level namespace
    `ns`; all drawing goes through the module-level pyplot handle `pp`.

    model -- fitted model; called on x values, must expose `name` (and `beta`
             when -loglin is requested)
    x, y  -- data arrays
    ye    -- array from which the error bars are computed (half of it is
             passed to errorbar)
    data  -- legend label for the data points (defaults to 'data')
    '''
    # evaluate the model on a dense grid spanning the observed x range
    grid = np.linspace(x.min(), x.max(), endpoint=True, num=1000)
    curve = model(grid)
    pp.errorbar(x, y, ye / 2, fmt='. ', label=data or 'data', color='k',
                ecolor='none')

    # legend label: at most the first two words of the model name, with the
    # second one cut down to three letters
    words = model.name.split()
    if len(words) > 1:
        words[1] = words[1][:3] + '.'
    fit_label = '{} fit'.format(' '.join(words[:2]).capitalize())
    pp.plot(grid, curve, 'r--', label=fit_label, lw=2.5)

    if ns.loglog or ns.loglin:
        if ns.loglog:
            pp.xscale('log')
        else:
            pp.xscale('power', exponent=model.beta)
        pp.yscale('log')

    pp.legend(loc='best')
    if ns.title is not None:
        pp.title(ns.title)
    pp.xlabel('Days since registration')
    pp.ylabel('Edits/day')

    # residuals (absolute). To plot relative residuals instead, divide them by
    # the largest finite absolute residual, label the y axis accordingly and
    # clip it to [-1, 1].
    pp.figure()
    residuals = model(x) - y
    pp.axhline(y=0, c='k')
    pp.plot(x, residuals, '.:k')
    pp.title('Fit residuals')
    pp.xlabel('Days since registration')
    pp.draw()
78 | | - |
79 | | -def _testoverwrite(*files): |
80 | | - exit_flag = False |
81 | | - for fn in files: |
82 | | - if os.path.exists(fn): |
83 | | - exit_flag = True |
84 | | - print '%s: error: cannot overwrite %s' % (__prog__, fn) |
85 | | - if exit_flag: |
86 | | - sys.exit(1) |
87 | | - |
def main(ns):
    '''
    Fits the selected model to the data file, prints a summary of the fit,
    plots it and, if an output file was requested, saves both figures.

    ns -- parsed command line arguments (see the parsers defined above)
    '''
    x, y, ye = np.loadtxt(ns.data_file, unpack=True)
    model = ns.modelclass()

    fit_options = dict(ye=ye, maxfev=_maxfev)
    if ns.constrained:
        fit_options['constrained'] = 1
    pest, pcov = model.fit(x, y, **fit_options)

    # parameter errors: half the square root of the covariance diagonal
    perr = np.sqrt(np.diag(pcov)) / 2.
    model.setparams(*zip(pest, perr))

    # store the goodness-of-fit results on the model before summarizing it
    model.goftest, model.residtest, model.Rsquared = model.gof(x, y, ye)
    print(model.summary(dataset=ns.data_file, observations=len(x)))
    plotfit(model, x, y, ye, data=os.path.splitext(ns.data_file)[0])

    if ns.output_file is not None:
        fn, ext = os.path.splitext(ns.output_file)
        fmt = ext[1:]
        if ns.batch and fmt.lower() != 'pdf':
            print('%s: error: batch mode supports only PDF format' % __prog__)
            sys.exit(1)
        resid_output_file = fn + '_residuals' + ext
        if not ns.force:
            _testoverwrite(ns.output_file, resid_output_file)
        # figure 1 holds the fit, figure 2 the residuals
        for fignum, path in ((1, ns.output_file), (2, resid_output_file)):
            pp.figure(fignum)
            pp.savefig(path, format=fmt)
            print('%s: output saved to %s' % (__prog__, path))
    pp.show()
119 | | - |
if __name__ == '__main__':
    ns = parser.parse_args()
    # the backend has to be chosen before pyplot is imported, which is why
    # these imports are deferred until the arguments are known
    if ns.batch:
        import matplotlib
        matplotlib.use('PDF')
    from matplotlib import pyplot as pp
    main(ns)
Index: trunk/tools/wsor/editor_lifecycle/mkcohort |
— | — | @@ -1,118 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# coding: utf-8 |
4 | | -# :vim:ft=python |
5 | | - |
6 | | -''' creates cohort files, filtering out bots ''' |
7 | | - |
8 | | -''' |
9 | | -This script reads an index file, which is a tab-separated text file like the |
10 | | -following: |
11 | | - |
12 | | - 34 WojPob 20010129110725 2524 |
13 | | - 94 AstroNomer 20010207222248 1532 |
14 | | - 43 Lee Daniel Crocker 20010314020407 4388 |
15 | | - 86 Stephen Gilbert 20010326191355 3599 |
16 | | - 3 Tobias Hoevekamp 20010326202105 1903 |
17 | | - 1273 Wathiik 20010510171751 1772 |
18 | | - 3371 Arno 20010721180708 2700 |
19 | | - 122 Ap 20010722201619 2137 |
20 | | - 182 Rjstott 20010726102546 2602 |
21 | | - 64 Uriyan 20010727141651 1634 |
22 | | - |
23 | | -Where fields are: id, name, date, count. Dates are parsed using dateutil, so |
24 | | -other formats are allowed too (e.g. 2010-01-29 11:07:25). Currently the last |
25 | | -column (editcount) is not used, but the script still expects to find it, so you |
26 | | -can put whatever you want in it. |
27 | | - |
28 | | -The script will aggregate users based on the date field and will look for
29 | | -files of the form <id>.npy in the current directory. These files contain the |
30 | | -daily edits count for any individual user, stored using the NumPy binary array |
31 | | -format. If data files are not in the current directory, a different path can be |
32 | | -specified from the command line (-P/--datapath) |
33 | | - |
34 | | -Based on the level of aggregation (say: months), the script will create a |
35 | | -compressed ZIP archive with the user edit counts data files (e.g.: 2010-01.npz |
36 | | -for all users from January 2010). This compressed archive can be later processed |
37 | | -with the script `fitting' or with the load() function from NumPy. |
38 | | - |
39 | | -For each archive it creates, the script prints its file name, how many users
40 | | -were read for it, and how many suspected BOT users it filtered out from the
41 | | -index (use --bots to include them). The script filters out a user based on the
42 | | -name field: if the name contains the pattern 'bot' at the beginning or at the
43 | | -end of any word, it will be filtered out (e.g. "Botuser IV" will match, but
44 | | -"Francis Abbott" won't).
45 | | - |
46 | | -Please note that the index file must already be sorted by date in order for the
47 | | -group-by-date aggregation to work. You can use `sort' from the command line,
48 | | -e.g.:
49 | | - |
50 | | - $~ sort -t$'\t' -k3 -h unsorted.tsv |
51 | | - |
52 | | -should sort file unsorted.tsv. |
53 | | -''' |
54 | | - |
55 | | -import re |
56 | | -import os |
57 | | -import sys |
58 | | -import csv |
59 | | -from argparse import ArgumentParser, FileType |
60 | | -from contextlib import closing |
61 | | -from itertools import groupby |
62 | | -from dateutil.parser import parser as DateParser |
63 | | -from zipfile import ZipFile |
64 | | - |
# name of this script, used as a prefix in warnings
__prog__ = os.path.basename(os.path.abspath(__file__))
# matches 'bot' at the beginning or at the end of a word
_botpat = r'\bbot|bot\b'
# column names of the index file, in order
_fields = ['id', 'name', 'date', 'count']


def yearkey(date):
    ''' grouping key for yearly cohorts: (year,) '''
    return (date.year,)


def monthkey(date):
    ''' grouping key for monthly cohorts: (year, month) '''
    return (date.year, date.month)


def daykey(date):
    ''' grouping key for daily cohorts: (year, month, day) '''
    return (date.year, date.month, date.day)
77 | | - |
parser = ArgumentParser(description=__doc__)
parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')

# exactly one aggregation level must be given; each flag stores the matching
# key function in ns.keyfunc
group = parser.add_mutually_exclusive_group(required=True)
for _flag, _help, _func in (
        ('--year', 'group by year', yearkey),
        ('--month', 'group by month', monthkey),
        ('--day', 'group by day', daykey)):
    group.add_argument(_flag, help=_help, action='store_const', const=_func,
                       dest='keyfunc')

parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
parser.add_argument('-P', '--datapath', help='data files location',
                    default=os.path.curdir)

# parser for the date column of the index file
dateparser = DateParser()
92 | | - |
if __name__ == '__main__':
    ns = parser.parse_args()
    reader = csv.DictReader(ns.index, _fields, dialect='excel-tab')
    botre = re.compile(_botpat, re.I)

    def _keyfunc(row):
        ''' maps an index row to the cohort key of its registration date '''
        return ns.keyfunc(dateparser.parse(row['date']))

    # groupby only merges adjacent rows, hence the index must come pre-sorted
    for key, rows in groupby(reader, _keyfunc):
        tot_users = tot_bots = 0
        # (2010, 1) -> '2010-01.npz'
        zipfn = '{}.npz'.format('-'.join('%02d' % k for k in key))
        with closing(ZipFile(zipfn, 'w')) as zf:
            for row in rows:
                # every row of the group is counted, bots and missing included
                tot_users += 1
                if not ns.bots and botre.search(row['name']) is not None:
                    tot_bots += 1
                    continue
                user_id = row['id']
                fn = os.path.join(ns.datapath, '{}.npy'.format(user_id))
                if not os.path.exists(fn):
                    sys.stderr.write('%s: warning: missing %s\n' % (__prog__, fn))
                    continue
                # the archive member is named after the user id
                zf.write(fn, user_id)
        print('%s created (users: %5d, bots %5d)' % (zipfn, tot_users, tot_bots))
Index: trunk/tools/wsor/editor_lifecycle/setup.py |
— | — | @@ -1,10 +0,0 @@ |
2 | | -from distutils.core import setup |
3 | | - |
# distribution metadata; only the command line scripts listed here get installed
_metadata = dict(
    name='lifecycle',
    description='WMF summer of research project',
    version='0.0.0',
    author='Giovanni Luca Ciampaglia',
    author_email='gciampaglia@wikimedia.org',
    scripts=['fetchrates', 'graphlife', 'fetchcohort'],
)

setup(**_metadata)
Index: trunk/tools/wsor/editor_lifecycle/fetchcohort |
— | — | @@ -1,59 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# vim:ft=python: |
4 | | -# coding : utf-8 |
5 | | - |
6 | | -''' fetches a cohort based on year of registration and editing activity ''' |
7 | | - |
8 | | -from argparse import ArgumentParser |
9 | | -from oursql import connect |
10 | | -import os |
11 | | -import sys |
12 | | -import datetime as dt |
13 | | -import csv |
14 | | - |
# name of this script, used as a prefix in error messages
prog = os.path.basename(os.path.abspath(__file__))

# arguments may also be read from a file given as @FILE
parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
parser.add_argument('registration_year', metavar='year', type=int)
parser.add_argument('min_activity', metavar='minedits', type=int)
parser.add_argument('max_activity', metavar='maxedits', type=int)
parser.add_argument('-c', '--config', dest='config_file')
parser.add_argument('-l', '--limit', type=int)

# Bots are excluded with NOT EXISTS instead of an outer join on user_groups:
# the join returns one row per (user, group) pair, so a user holding several
# groups was listed several times, and a bot holding any additional group
# slipped through on the row of that other group.
# Parameters: registration year, exclusive lower and upper edit count bounds.
query = '''
select
    user_id,
    user_name,
    user_registration,
    user_editcount
from user u
where
    not exists (
        select 1
        from user_groups ug
        where ug.ug_user = u.user_id and ug.ug_group = 'bot'
    )
    and year(user_registration) = ?
    and user_editcount > ?
    and user_editcount < ?
'''

if __name__ == '__main__':
    ns = parser.parse_args()

    # validate the arguments before touching the database
    if ns.min_activity >= ns.max_activity:
        sys.stderr.write('%s: error: min_activity >= max_activity\n' % prog)
        sys.exit(1)
    if not 2001 <= ns.registration_year <= dt.datetime.now().year:
        sys.stderr.write('%s: error: illegal year: %d\n' % (
            prog, ns.registration_year))
        sys.exit(1)

    # build the statement locally so that the module-level template stays intact
    sql = query
    if ns.limit is not None:
        sql += 'limit %d' % ns.limit

    if ns.config_file is None:
        ns.config_file = os.path.expanduser('~/.my.cnf')

    # one tab-separated line per user on standard output
    writer = csv.writer(sys.stdout, dialect='excel-tab')
    conn = connect(read_default_file=ns.config_file)
    try:
        cursor = conn.cursor()
        cursor.execute(sql, (ns.registration_year, ns.min_activity,
                             ns.max_activity))
        for row in cursor:
            writer.writerow(row)
    finally:
        # release the connection even if the query or the output fails
        conn.close()
Index: trunk/tools/wsor/editor_lifecycle/fetchrates |
— | — | @@ -1,69 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | - |
4 | | -import sys |
5 | | -from oursql import connect |
6 | | -from argparse import ArgumentParser |
7 | | -import numpy as np |
8 | | -import os |
9 | | -from time import time |
10 | | - |
# name of this script, used as a prefix in the messages it prints
prog = os.path.basename(os.path.abspath(__file__))

# command line: one user id, optional MySQL client config and output directory
parser = ArgumentParser(description=__doc__)
parser.add_argument('user_id', type=int)
parser.add_argument('-config', dest='config_file')
parser.add_argument('-outdir', dest='output_dir', default=os.curdir)

# TODO get also deleted revisions!

# timestamps of all revisions of one user, expressed in days (seconds / 86400)
# and sorted chronologically
query = """
select unix_timestamp(rev_timestamp)/86400.0
from revision
where rev_user = ?
order by rev_timestamp
"""
26 | | - |
def main(ns):
    '''
    Downloads the revision timestamps of one user, bins them into daily edit
    counts and saves the result to <output_dir>/<user_id>.npy.

    The saved array has two columns: the left edge of each one-day bin (in
    days) and the number of revisions falling in that bin.

    ns -- parsed command line arguments (user_id, config_file, output_dir)

    Exits with status 1 if no MySQL client configuration can be found, if the
    output directory does not exist or is not a directory, or if the user has
    no revisions.
    '''
    # get mysql client configuration file
    if ns.config_file is None:
        mycnf = os.path.expanduser('~/.my.cnf')
        if not os.path.exists(mycnf):
            sys.stderr.write('%s: no config file specified and $HOME/.my.cnf'
                             ' not found\n' % prog)
            sys.exit(1)
        ns.config_file = mycnf

    # test output directory exists
    if not os.path.exists(ns.output_dir):
        sys.stderr.write('%s: output directory does not exist: %s\n' % (
            prog, ns.output_dir))
        sys.exit(1)
    if not os.path.isdir(ns.output_dir):
        sys.stderr.write('%s: not a directory: %s\n' % (prog, ns.output_dir))
        sys.exit(1)

    # start timer
    tstart = time()

    # connect, run query, and release the connection whatever happens
    conn = connect(read_default_file=ns.config_file)
    try:
        cursor = conn.cursor()
        cursor.execute(query, (ns.user_id,))
        rows = list(cursor)
    finally:
        conn.close()

    # min()/max() are undefined on an empty array: report and stop instead
    if not rows:
        sys.stderr.write('%s: no revisions found for user %d\n' % (
            prog, ns.user_id))
        sys.exit(1)

    # compute rates and save to file
    revs = np.asarray(rows, dtype=float)
    m, M = np.floor(revs.min()), np.ceil(revs.max())
    if M == m:
        # all revisions fall exactly on one day boundary: use a single bin
        M = m + 1
    # one bin per day; the number of bins has to be an integer
    rates, days = np.histogram(revs, range=(m, M), bins=int(M - m))
    data = np.c_[days[:-1], rates]
    out_path = os.path.join(ns.output_dir, '%d.npy' % ns.user_id)
    np.save(out_path, data)

    # stop timer
    tstop = time()
    print('%s: output saved to %s (execution time: %g sec, fetched: %d rows)' % (
        prog, out_path, tstop - tstart, len(revs)))


if __name__ == '__main__':
    # get arguments from command line
    ns = parser.parse_args()
    main(ns)
70 | | - |