r94722 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r94721 | r94722 | r94723 >
Date: 01:46, 17 August 2011
Author: giovanni
Status: deferred
Tags:
Comment: manually removed old revision
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/MANIFEST.in (deleted)
  • /trunk/tools/wsor/editor_lifecycle/README.rst (deleted)
  • /trunk/tools/wsor/editor_lifecycle/fetchcohort (deleted)
  • /trunk/tools/wsor/editor_lifecycle/fetchrates (deleted)
  • /trunk/tools/wsor/editor_lifecycle/fitting (deleted)
  • /trunk/tools/wsor/editor_lifecycle/fitting_batch.sh (deleted)
  • /trunk/tools/wsor/editor_lifecycle/graphlife (deleted)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle (deleted)
  • /trunk/tools/wsor/editor_lifecycle/mkcohort (deleted)
  • /trunk/tools/wsor/editor_lifecycle/models.py (deleted)
  • /trunk/tools/wsor/editor_lifecycle/relax (deleted)
  • /trunk/tools/wsor/editor_lifecycle/scale.py (deleted)
  • /trunk/tools/wsor/editor_lifecycle/setup.py (deleted)

Diff

Index: trunk/tools/wsor/editor_lifecycle/models.py
@@ -1,373 +0,0 @@
2 -# coding: utf8
3 -
4 -import numpy as np
5 -from scipy.stats import norm, chisqprob, normaltest
6 -from scipy.optimize import curve_fit
7 -from scipy.special import gamma
8 -from cStringIO import StringIO
9 -import datetime as dt
10 -#from scikits.statsmodels.api import OLS
11 -
12 -__all__ = ['Expon', 'PowerLaw', 'StretchedExpon' ]
13 -
14 -class Parameter(object):
15 - '''
16 - Class for parameter descriptors. Works with ParameterMixin
17 - '''
18 - def __init__(self, name, attrlist):
19 - self.name = name # parameter name
20 - for att in attrlist:
21 - if att.name == name:
22 - raise AttributeError('cannot add parameter {}'.format(name))
23 - attrlist.append(self)
24 - def __get__(self, instance, owner):
25 - if instance is not None:
26 - return instance.__dict__['_' + self.name]
27 - return self
28 - def __set__(self, instance, value):
29 - try:
30 - value, error = value
31 - except TypeError:
32 - value, error = value, None
33 - instance.__dict__['_' + self.name] = value
34 - instance.__dict__[self.name + '_err'] = error
35 - def __repr__(self):
36 - return '<Parameter {} at 0x{}>'.format(self.name, '%x' % id(self))
37 -
38 -class ParameterMixin(object):
39 - '''
40 - Class that lets you look up all Parameter instances in __params__
41 - '''
42 - def itererrors(self):
43 - for p in self.__params__:
44 - yield self.__getattribute__(p.name + '_err')
45 - def errors(self):
46 - return list(self.itererrors())
47 - def iterparams(self):
48 - '''
49 - Returns an iterator over all parameters of this model
50 - '''
51 - for p in self.__params__:
52 - yield self.__getattribute__(p.name)
53 - def params(self):
54 - '''
55 - Returns a tuple of all parameters of this model
56 - '''
57 - return list(self.iterparams())
58 - def setparams(self, *args):
59 - '''
60 - Sets the unset parameters of this model from *args. Parameters that are
61 - already associated with a value will *NOT* be modified by this method.
62 - '''
63 - keyf = lambda p : self.__getattribute__(p.name) is None
64 - for p, a in zip(filter(keyf, self.__params__), args):
65 - setattr(self, p.name, a)
66 -
67 -def _orNA(val, fmt='%8.5g'):
68 - if val is not None:
69 - return fmt % val
70 - else:
71 - return 'N/A'
72 -
73 -class ParametricModel(ParameterMixin):
74 - '''
75 - Callable class with Parameter descriptors. Subclasses of ParametricModel
76 - ought to define, as class attributes, any number of Parameter descriptors,
77 - together with a list (conventional name: `__params__'). See
78 - Parameter.__init__ on how to instantiate a Parameter descriptor.
79 -
80 - Subclasses ought also to define two static methods: `func' and `init'. The
81 - first is the actual function that accepts an argument together with the same
82 - number of parameters as in __params__. The second is used to get initial
83 - estimates for the Levenberg-Marquardt leastsq minimizer used to fit this
84 - model.
85 -
86 - From that point on, any instance of this class acts as the function `func'
87 - itself, with the only differences that it automatically performs partial
88 - application for those Parameter attributes that are being assigned a value.
89 - Example:
90 -
91 - # expon.func(x, A, B) is A * exp(B * x)
92 - >>> expon(1, 2, -1)   # -> 0.73575888234288467
93 - >>> expon.A = 2
94 - >>> expon(1, -1)      # -> 0.73575888234288467
95 - '''
96 - def __init__(self, *args, **kwargs):
97 - keys = [p.name for p in self.__params__]
98 - for k in keys:
99 - if k not in kwargs:
100 - kwargs[k] = None
101 - kwargs.update(zip(keys, args)) # update the rightmost parameters only
102 - for k, v in kwargs.items():
103 - setattr(self, k, v)
104 - self.goftest = tuple([None] * 3)
105 - self.residtest = tuple([None] * 2)
106 - self.Rsquared = None
107 - def __call__(self, x, *args):
108 - '''
109 - See class method `func'
110 - '''
111 - fargs = self.params()
112 - N = len(filter(None, fargs))
113 - if N + len(args) > len(fargs):
114 - raise TypeError('{} accepts only {} '
115 - 'parameters'.format(self.__class__.__name__, len(fargs)))
116 - for a in args:
117 - idx = fargs.index(None)
118 - fargs[idx] = a
119 - fargs = tuple(fargs)
120 - return self.func(x, *fargs)
121 - def fit(self, x, y, ye, **kwargs):
122 - '''
123 - Fits this parametric model to observations (x_i, y_i). Uncertainty in
124 - the y-estimates can be specified with argument `ye'. Additional keyword
125 - arguments are passed to scipy.optimize.curve_fit which in turn passes
126 - them to scipy.optimize.leastsq.
127 - '''
128 - fp0 = self.init(x, y)
129 - fargs = self.params()
130 - p0 = []
131 - for a, p in zip(fargs, fp0):
132 - if a is None:
133 - p0.append(p)
134 - p0 = tuple(p0)
135 - return curve_fit(self, x, y, sigma=ye, p0=p0, **kwargs)
136 - def gof(self, x, y, ye):
137 - '''
138 - Computes GoF test statistics and other diagnostic tests
139 -
140 - Returns:
141 - --------
142 - - GoF test: Chi^2, p-value, and ddof
143 - - Normality of residuals: K^2 and p-value, plus the R^2 coefficient
144 - '''
145 - res = {}
146 - resid = y - self(x)
147 - chisq = np.sum(((resid) / ye) ** 2)
148 - ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters
149 - chisq_pvalue = chisqprob(chisq, ddof)
150 - gof = (chisq, chisq_pvalue, ddof)
151 - resid = normaltest(resid)
152 - ym = y.mean()
153 - SStot = np.sum((y - ym) ** 2)
154 - SSerr = np.sum((y - self(x)) ** 2)
155 - Rsquared = 1.0 - SSerr / SStot
156 -# Besides being buggy, this test for homoscedasticity is supposed to work only
157 -# for linear regressions, hence is not suited for our case, but I'll keep it
158 -# here until I figure out an alternative. Remember to uncomment the import for
159 -# OLS at the top.
160 -# regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
161 -# LM =regresults.rsquared
162 -# LM_pvalue = chisqprob(LM, len(x) - ddof)
163 -# white = (LM, LM_pvalue)
164 -# return gof, resid, white
165 - return gof, resid, Rsquared
166 - def __str__(self):
167 - name = self.__class__.__name__
168 - prep = []
169 - for p in self.params():
170 - if p is not None:
171 - prep.append('%3.4g' % p)
172 - else:
173 - prep.append('*')
174 - return '{}({})'.format(name, ','.join(prep))
175 - def __repr__(self):
176 - return '<{} object at 0x{}>'.format(str(self), '%x' % id(self))
177 - def summary(self, **kwargs):
178 - '''
179 - Returns a summary of this model
180 - '''
181 - s = StringIO()
182 - print >> s, ''
183 - print >> s, 'General information'
184 - print >> s, '-------------------'
185 - print >> s, 'model: %s' % self.name.capitalize()
186 - print >> s, 'date: %s' % dt.datetime.now()
187 - for item in kwargs.items():
188 - print >> s, '{}: {}'.format(*map(str, item))
189 - print >> s, ''
190 - print >> s, 'Model parameters'
191 - print >> s, '----------------'
192 - for p, val, err in zip(self.__params__, self.params(), self.errors()):
193 - print >> s, '{}: {} ± {}'.format(p.name, _orNA(val), _orNA(err))
194 - chi, p, ddof = self.goftest
195 - print >> s, ''
196 - print >> s, 'Fit results'
197 - print >> s, '-----------'
198 - print >> s, 'Goodness-of-fit: Chi-squared = {}, p = {}, ddof = {}'.format(
199 - _orNA(chi, '%5.2f'), _orNA(p, '%8.4e'), _orNA(ddof, '%d'))
200 - D, p = self.residtest
201 - print >> s, 'Normality of residuals: K-squared = {}, p = {}'.format(
202 - _orNA(D, '%5.2f'), _orNA(p, '%8.4e'))
203 - print >> s, 'Coefficient of Determination: {}'.format(
204 - _orNA(self.Rsquared, '%5.2f'))
205 - return s.getvalue()
206 -
207 -class Expon(ParametricModel):
208 - '''
209 - y = A * exp( -(x / B)) + C
210 - '''
211 - __params__ = []
212 - A = Parameter('A', __params__)
213 - B = Parameter('B', __params__)
214 - C = Parameter('C', __params__)
215 - name = 'exponential'
216 - @staticmethod
217 - def func(x, a, b, c):
218 - return a * np.exp(-(x / b)) + c
219 - @staticmethod
220 - def init(x, y):
221 - a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
222 - b0 = x.max() / 10.0
223 - c0 = y.min()
224 - return (a0, b0, c0)
225 - def fit(self, x, y, ye, **kwargs):
226 - if kwargs.pop('constrained', 0):
227 - self.A = y[np.argmin(np.abs(x))]
228 - return super(Expon, self).fit(x, y, ye, **kwargs)
229 -
230 -class StretchedExpon(ParametricModel):
231 - '''
232 - y = A * exp (-(t / tau) ** beta)
233 - '''
234 - __params__ = []
235 - A = Parameter('A', __params__)
236 - tau = Parameter('tau', __params__)
237 - beta = Parameter('beta', __params__)
238 - name = 'stretched exponential'
239 - @staticmethod
240 - def func(x, a, tau, beta):
241 - return a * np.exp(- (x / tau) ** beta)
242 - @staticmethod
243 - def init(x, y):
244 - a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
245 - tau0 = x.max() / 10.0
246 - return (a0, tau0, 0.5)
247 - def fit(self, x, y, ye, **kwargs):
248 - if kwargs.pop('constrained', 0):
249 - self.A = y[np.argmin(np.abs(x))]
250 - return super(StretchedExpon, self).fit(x, y, ye, **kwargs)
251 - def summary(self, **kwargs):
252 - mrt = self.mrt(self.tau, self.beta)
253 - kwargs['Mean relaxation time <tau>'] = '%5.2f days' % mrt
254 - return super(StretchedExpon, self).summary(**kwargs)
255 - def mrt(self, tau, beta):
256 - return (tau / beta) * gamma(beta ** -1)
257 -
258 -class PowerLaw(ParametricModel):
259 - '''
260 - y = A * x ** B
261 - '''
262 - __params__ = []
263 - A = Parameter('A', __params__)
264 - B = Parameter('B', __params__)
265 - name = 'power-law'
266 - @staticmethod
267 - def func(x, a, b):
268 - return a * x ** b
269 - @staticmethod
270 - def init(x, y):
271 - return (1, y.ptp()/x.ptp())
272 -# NR says this code is more robust against roundoff errors, but presently it
273 -# does not work. Bummer.
274 -# def fit(self, x, y, ye, **kwargs):
275 -# x, y, ye = self._removezeros(x, y, ye)
276 -# ye = ye / y
277 -# x = np.log(x)
278 -# y = np.log(y)
279 -# S = np.sum(ye ** -1)
280 -# Sx = np.sum(x / ye)
281 -# Sy = np.sum(y / ye)
282 -# t = (ye ** -1) * (x - Sx / S)
283 -# Stt = np.sum(t ** 2)
284 -# b = Stt ** -1 * np.sum((y * t) / ye)
285 -# a = np.exp((Sy - Sx * b) / S)
286 -# a_var = S ** -1 * (1 + Sx ** 2 / (S * Stt))
287 -# b_var = Stt ** -1
288 -# ab_covar = - Sx / Stt
289 -# pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
290 -# return (a, b), pcov
291 - def fit(self, x, y, ye, **kwargs):
292 - '''
293 - Fit by linear least squares of log-transformed data
294 - '''
295 - x, y, ye = self._removezeros(x, y, ye)
296 - ye = (ye / y) ** 2
297 - x = np.log(x)
298 - y = np.log(y)
299 - S = np.sum(ye ** -1)
300 - Sx = np.sum(x / ye)
301 - Sy = np.sum(y / ye)
302 - Sxx = np.sum(x ** 2 / ye)
303 - Sxy = np.sum((x * y) / ye)
304 - Delta = S * Sxx - Sx ** 2
305 - a = np.exp((Sxx * Sy - Sx * Sxy) / Delta)
306 - b = (S * Sxy - Sx * Sy) / Delta
307 - a_var = Sxx / Delta
308 - b_var = S / Delta
309 - ab_covar = - Sx / Delta
310 - pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
311 - return (a, b), pcov
312 - def gof(self, x, y, ye):
313 - '''
314 - GoF of linear least squares of log-transformed data
315 - '''
316 - x, y, ye = self._removezeros(x, y, ye)
317 - ye = (ye / y)
318 - x = np.log(x)
319 - y = np.log(y)
320 - yp = np.log(self.A) + self.B * x
321 - chisq = np.sum(((yp - y) / ye) ** 2)
322 - ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters
323 - chisq_pvalue = chisqprob(chisq / 2., ddof)
324 - resid = normaltest(y - yp)
325 - ym = y.mean()
326 - SStot = np.sum((y - ym) ** 2)
327 - SSerr = np.sum((y - yp) ** 2)
328 - Rsquared = 1.0 - SSerr / SStot
329 - return (chisq, chisq_pvalue, ddof), resid, Rsquared
330 - @staticmethod
331 - def _removezeros(x, y, ye):
332 - idx = x > 0
333 - return x[idx], y[idx], ye[idx]
334 -
335 -if __name__ == '__main__':
336 -
337 - import matplotlib.pyplot as pp
338 - import scale
339 -
340 - model = StretchedExpon()
341 -
342 - a = 2
343 - tau = 100
344 - beta = .5
345 - c = 0.
346 - s = 0.1
347 - xmax = 1000
348 - x = np.linspace(0, xmax, 50)
349 - y = model(x, a, tau, beta, c) + np.random.randn(len(x)) * s
350 -
351 - pest, pcov = model.fit(x, y, s)
352 -
353 - model.setparams(*zip(pest, np.sqrt(np.diag(pcov))))
354 -
355 - xx = np.linspace(0, xmax, 1000)
356 - yy = model(xx)
357 -
358 - pp.errorbar(x, y, s, fmt='. ', color='k', ecolor='none', label='data')
359 - pp.plot(xx, yy, 'r-', label='Stretch. Exp. fit')
360 - pp.xscale('power', exponent=beta)
361 - pp.yscale('log')
362 -
363 - pp.legend()
364 - gof, resid, Rsquared = model.gof(x, y, s)
365 - model.goftest = gof
366 - model.residtest = resid
367 - model.Rsquared = Rsquared
368 - print model.summary()
369 - chi, p, ddof = gof
370 - pp.text(200, 1, r'$\chi^2 = %.2f,\, p-{\rm value} = %5.2g,\,'
371 - r'{\rm ddof} = %d,\, R^2 = %.2f$'
372 - % (chi,p,ddof, Rsquared),
373 - fontsize=16)
374 - pp.show()
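
A note on the pattern above: Parameter descriptors store a value and an error for each model parameter, and ParametricModel.__call__ does partial application, holding parameters that are already assigned fixed while filling the remaining ones from the call (this is what the constrained fits in Expon and StretchedExpon rely on). One caveat in the original: filter(None, ...) in __call__ and gof() also treats a parameter legitimately set to 0 as unset. A minimal, self-contained sketch of the same idea in modern Python (MiniExpon is illustrative, not part of the package):

    import numpy as np
    from scipy.optimize import curve_fit

    class MiniExpon:
        # y = A * exp(-(x / B)) + C; None marks a free parameter
        param_names = ('A', 'B', 'C')

        def __init__(self):
            for name in self.param_names:
                setattr(self, name, None)

        @staticmethod
        def func(x, a, b, c):
            return a * np.exp(-(x / b)) + c

        def __call__(self, x, *free):
            it = iter(free)
            # fixed parameters come from the instance, free ones from *free
            args = [getattr(self, n) if getattr(self, n) is not None else next(it)
                    for n in self.param_names]
            return self.func(x, *args)

    x = np.linspace(0.0, 10.0, 50)
    y = MiniExpon.func(x, 2.0, 3.0, 0.5) + 0.01 * np.random.randn(50)
    model = MiniExpon()
    model.A = 2.0   # constrain A = f(0), as the 'constrained' option does
    (b_est, c_est), pcov = curve_fit(lambda x, b, c: model(x, b, c),
                                     x, y, p0=(1.0, 0.0))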
Index: trunk/tools/wsor/editor_lifecycle/MANIFEST.in
@@ -1,3 +0,0 @@
2 -include *.py
3 -include *.sh
4 -include db.cfg
Index: trunk/tools/wsor/editor_lifecycle/relax
@@ -1,49 +0,0 @@
2 -#!/usr/bin/python
3 -#:vim:ft=python
4 -
5 -''' batch model fitting '''
6 -
7 -import re
8 -import os
9 -import sys
10 -import numpy as np
11 -from argparse import ArgumentParser
12 -from models import StretchedExpon
13 -from datetime import datetime
14 -
15 -__prog__ = os.path.basename(os.path.abspath(__file__))
16 -
17 -parser = ArgumentParser(description=__doc__)
18 -parser.add_argument('data', nargs='+')
19 -
20 -ns = parser.parse_args()
21 -
22 -output = []
23 -
24 -# XXX format should not be fixed to 2010-1 !!
25 -# TODO should use the len file
26 -
27 -for d in ns.data:
28 - k = re.match('(.*?)_.+\.?.*', d).groups()[0]
29 - t = datetime.strptime(k, '%Y-%m')
30 - if not os.path.exists(d):
31 - continue
32 - x, y, ye = np.loadtxt(d, unpack=1)
33 - model = StretchedExpon()
34 - model.A = y[0]
35 - idx = ye > 0
36 - x = x[idx]
37 - y = y[idx]
38 - ye = ye[idx]
39 - if len(x)>10: # st. dev. 0 means only 1 user
40 - pest, pcov = model.fit(x, y, ye, maxfev=100000, warning=False)
41 - perr = np.sqrt(np.diag(pcov)) / 2.
42 - model.setparams(*zip(pest,perr))
43 - mrt = model.mrt(model.tau, model.beta)
44 - else:
45 - mrt = np.nan
46 - output.append((t, mrt))
47 -
48 -output = np.asarray(output, dtype=np.dtype([('date', object), ('mrt', np.double)]))
49 -np.save('mrt.npy', output)
50 -print 'output saved to mrt.npy'
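
The quantity saved per cohort above is the mean relaxation time from StretchedExpon.mrt, i.e. <tau> = (tau / beta) * Gamma(1 / beta), which reduces to tau when beta = 1. A quick standard-library check of the formula:

    from math import gamma

    def mean_relaxation_time(tau, beta):
        # <tau> = (tau / beta) * Gamma(1 / beta); equals tau for beta == 1
        return (tau / beta) * gamma(1.0 / beta)

    assert abs(mean_relaxation_time(100.0, 1.0) - 100.0) < 1e-9
    print(mean_relaxation_time(100.0, 0.5))   # 200.0: stretching doubles <tau> here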
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
@@ -1,43 +0,0 @@
2 -#!/bin/bash
3 -
4 -# Applies the `fitting' script to a batch of files
5 -#
6 -# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
7 -#
8 -# USAGE: fitting_batch.sh file1 file2 file3 ...
9 -#
10 -# This will produce the normal console output that fitting produces; the PDF
11 -# plots are merged into fit.pdf (note: the script refuses to run if fit.pdf
12 -# already exists)
13 -
14 -if [[ -z `type -p fitting` ]] ; then
15 - echo 'error: could not find fitting script. Check your PATH'
16 - exit 1
17 -fi
18 -
19 -if [[ -e fit.pdf ]] ; then
20 - echo 'error: cannot overwrite file fit.pdf'
21 - exit 1
22 -fi
23 -
24 -O=`mktemp -d`
25 -models="expon powerlaw stretchedexp"
26 -files="$@"
27 -
28 -for file in $files ; do
29 - for model in $models ; do
30 - fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
31 - echo
32 - echo
33 - done
34 -done
35 -
36 -pdfs=`ls $O/*.pdf | sort`
37 -
38 -gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null
39 -
40 -if [[ $? = 0 ]] ; then
41 - echo 'images saved in fit.pdf'
42 -else
43 - echo "error: problem saving fit.pdf. Individual image files in $O"
44 -fi
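
The Ghostscript call at the end is what merges the per-model PDFs into fit.pdf. If this driver were ever ported to Python like the rest of the package, the merge could be run from the standard library; a sketch, with a hypothetical temporary directory standing in for $O and the gs flags taken from the script above:

    import glob
    import subprocess

    pdfs = sorted(glob.glob('/tmp/fit_pages/*.pdf'))   # hypothetical temp dir
    subprocess.check_call(['gs', '-dNOPAUSE', '-dBATCH', '-sDEVICE=pdfwrite',
                           '-sOUTPUTFILE=fit.pdf'] + pdfs)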
Index: trunk/tools/wsor/editor_lifecycle/scale.py
@@ -1,68 +0,0 @@
2 -from matplotlib.scale import ScaleBase, register_scale
3 -from matplotlib.transforms import Transform, nonsingular
4 -from matplotlib.ticker import LinearLocator, Formatter
5 -from math import ceil, floor
6 -import numpy as np
7 -
8 -class PowerScale(ScaleBase):
9 - name ='power'
10 - def __init__(self, axis, **kwargs):
11 - ScaleBase.__init__(self)
12 - exponent = kwargs.pop('exponent')
13 - if exponent <= 0:
14 - raise ValueError('exponent must be positive')
15 - self.exponent = exponent
16 - def get_transform(self):
17 - return PowerTransform(self.exponent)
18 - def set_default_locators_and_formatters(self, axis):
19 - axis.set_major_locator(PowerLocator(self.exponent))
20 - axis.set_major_formatter(PowerFormatter(self.exponent))
21 - axis.set_minor_formatter(PowerFormatter(self.exponent))
22 -
23 -class PowerLocator(LinearLocator):
24 - def __init__(self, exponent, **kwargs):
25 - LinearLocator.__init__(self, **kwargs)
26 - self.exponent = exponent
27 - self.numticks = 5
28 - def __call__(self):
29 - vmin, vmax = self.axis.get_view_interval()
30 - vmin, vmax = nonsingular(vmin, vmax, expander = 0.05)
31 - vmin = vmin ** self.exponent
32 - vmax = vmax ** self.exponent
33 - if vmax<vmin:
34 - vmin, vmax = vmax, vmin
35 -
36 - ticklocs = np.linspace(vmin, vmax, num=self.numticks, endpoint=True)
37 - return self.raise_if_exceeds(ticklocs ** (1.0 / self.exponent))
38 -
39 -class PowerFormatter(Formatter):
40 - def __init__(self, exponent):
41 - self.exponent = exponent
42 - def __call__(self, x, pos=None):
43 - return u'%.2g' % (x ** (1.0 / self.exponent))
44 -
45 -class PowerTransform(Transform):
46 - input_dims = 1
47 - output_dims = 1
48 - is_separable = True
49 - def __init__(self, exponent):
50 - Transform.__init__(self)
51 - self.exponent = exponent
52 - def transform(self, a):
53 - return a ** self.exponent
54 - def inverted(self):
55 - return PowerTransform(1.0 / self.exponent)
56 -
57 -register_scale(PowerScale)
58 -
59 -if __name__ == '__main__':
60 - from pylab import *
61 - import numpy as np
62 - tau = 20
63 - beta = 0.5
64 - x = np.linspace(0,100, num=10)
65 - y = np.exp(-(x / tau) ** beta)
66 - plot(x, y, 'o ', mfc='none', mew=2)
67 - xscale('power', exponent=beta)
68 - yscale('log', basey=10)
69 - show()
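
The rationale for this scale: a stretched exponential y = exp(-(x/tau)**beta) satisfies log y = -(1/tau)**beta * x**beta, so it plots as a straight line when the x axis is raised to the power beta and the y axis is logarithmic. A short check of that linearization, independent of matplotlib:

    import numpy as np

    tau, beta = 20.0, 0.5
    x = np.linspace(1.0, 100.0, 5)
    logy = -(x / tau) ** beta                  # log of the stretched exponential
    slope = np.diff(logy) / np.diff(x ** beta)
    print(slope)                               # constant: -(1/tau)**beta everywhere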
Index: trunk/tools/wsor/editor_lifecycle/lifecycle
@@ -1,158 +0,0 @@
2 -#!/usr/bin/python
3 -#:vim:ts=python:
4 -
5 -''' compute editor lifecycle '''
6 -
7 -import re
8 -import os
9 -from argparse import ArgumentParser
10 -import numpy as np
11 -from collections import deque
12 -import datetime as dt
13 -
14 -__prog__ = os.path.basename(os.path.abspath(__file__))
15 -
16 -def estimaterate(edits, step):
17 - '''
18 - This function takes the daily edit history of an individual editor, and a
19 - step parameter; it estimates the daily activity of the editor. It returns
20 - the average daily rate over consecutive windows of `step' days.
21 - '''
22 - N = len(edits)
23 - if N % step:
24 - NN = np.ceil(N / float(step)) * step
25 - tmp = np.zeros((NN,), dtype=edits.dtype)
26 - tmp[:N] = edits
27 - edits = tmp
28 - return edits.reshape((-1, step)).sum(axis=-1) / float(step)
29 -
30 -def itercycles(npzarchive, every, users=None):
31 - '''
32 - Iterates over the archive or over given list of users and returns estimated
33 - activity life cycle (see estimaterate())
34 - '''
35 - for uid in (users or npzarchive.files):
36 - days, edits = npzarchive[uid].T
37 - days = days - days.min()
38 - rates = estimaterate(edits, every)
39 - yield np.c_[days[::every], rates]
40 -
41 -def averagecycle(ratesbyday):
42 - '''
43 - Computes the average cycle with standard errors. Takes as input a dictionary
44 - returned by groupbyday()
45 - '''
46 - all_days = sorted(ratesbyday.keys())
47 - result = deque()
48 - for d in all_days:
49 - s = ratesbyday[d]
50 - sqN = np.sqrt(len(s))
51 - result.append((d, np.mean(s), np.std(s) / sqN))
52 - return np.asarray(result)
53 -
54 -def groupbyday(npzarchive, every, users=None):
55 - '''
56 - This function estimates editors' activity rates and groups rate estimates by
57 - number of days elapsed since editor registration (which corresponds to time = 0)
58 - '''
59 - tmp = {}
60 - for cyclearr in itercycles(npzarchive, every, users):
61 - for d, r in cyclearr:
62 - try:
63 - tmp[d].append(r)
64 - except KeyError:
65 - tmp[d] = deque([r])
66 - return tmp
67 -
68 -def lifetimes(npzarchive, users=None):
69 - '''
70 - Returns the distribution of account lifetimes over an archive. Can take an
71 - optional list users ids to restrict the sample to a specific group of
72 - editors
73 - '''
74 - lt = deque()
75 - for uid in (users or npzarchive.files):
76 - days, edits = npzarchive[uid].T
77 - lt.append(days.ptp())
78 - return np.asarray(lt)
79 -
80 -def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity):
81 - now = dt.datetime.now().toordinal()
82 - epoch = dt.datetime(1970,1,1).toordinal()
83 - unix_now = now - epoch
84 - inactives = deque()
85 - for uid in npzarchive.files:
86 - days, edits = npzarchive[uid].T
87 - if days.ptp() <= inactivity:
88 - continue
89 - unix_last = days[-1]
90 - if (unix_now - unix_last) > inactivity:
91 - tot_edits = float(edits.sum())
92 - tot_days = float(days.ptp() - inactivity)
93 - activity = tot_edits / tot_days * 365.0
94 - if minimum_activity < activity and maximum_activity > activity:
95 - inactives.append(uid)
96 - return inactives
97 -
98 -parser = ArgumentParser(description=__doc__)
99 -parser.add_argument('data_file', metavar='data')
100 -parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
101 -parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
102 -parser.add_argument('-key')
103 -parser.add_argument('-every', type=int, help='default: %(default)d days',
104 - default=30, metavar='NUM')
105 -parser.add_argument('-inactivity', type=int, default=180, help='default: '
106 - '%(default)d days', metavar='NUM')
107 -parser.add_argument('-all', dest='dump_all', action='store_true')
108 -
109 -
110 -def main(ns):
111 - if ns.key is None:
112 - m = re.match('(.*?)\.npz', ns.data_file, re.I)
113 - if m is not None:
114 - ns.key = m.groups()[0]
115 - else:
116 - print >> sys.stderr, '%s: cannot determine key from file name: %s'\
117 - % (__prog__, ns.data_file)
118 - sys.exit(1)
119 - if ns.minimum_activity >= ns.maximum_activity:
120 - print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
121 - sys.exit(1)
122 -
123 - # load data
124 - npzarchive = np.load(ns.data_file)
125 -
126 - if ns.dump_all:
127 - fn = mkfn('cycles', ns, 'npz')
128 - values_iter = itercycles(npzarchive, ns.every)
129 - keys = npzarchive.files
130 - tmp = dict(zip(keys, list(values_iter)))
131 - np.savez(fn, **tmp)
132 - print '%s: output saved to %s' % (__prog__, fn)
133 - else:
134 - # compute lifetime distribution
135 - lt = lifetimes(npzarchive)
136 -
137 - # compute inactive subgroups
138 - inactive_users = find_inactives(npzarchive, ns.inactivity, ns.minimum_activity,
139 - ns.maximum_activity)
140 -
141 - ratesbyday = groupbyday(npzarchive, ns.every)
142 - ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
143 -
144 - avg_all = averagecycle(ratesbyday)
145 - avg_inact = averagecycle(ratesbyday_inact)
146 -
147 - lens = [ len(npzarchive.files), len(inactive_users) ]
148 -
149 - names = ['lt', 'len', 'all', 'inact' ]
150 - arrs = [ lt, lens, avg_all, avg_inact ]
151 -
152 - for n, a in zip(names, arrs):
153 - fn = '%s_%s.%s' % (ns.key, n, 'tsv')
154 - np.savetxt(fn, a)
155 - print '%s: output saved to %s' % (__prog__, fn)
156 -
157 -if __name__ == '__main__':
158 - ns = parser.parse_args()
159 - main(ns)
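
Two review notes on this script: main() writes to sys.stderr and calls sys.exit without ever importing sys, and mkfn() in the --all branch is not defined anywhere in the file, so both code paths would fail at runtime as committed. The windowing in estimaterate() itself is sound; a minimal sketch of it (as in the original, the zero padding biases the final window downward):

    import numpy as np

    def estimate_rate(edits, step):
        # pad the daily series to a multiple of `step`, then average each window
        n = len(edits)
        padded_len = int(np.ceil(n / float(step))) * step
        padded = np.zeros(padded_len, dtype=float)
        padded[:n] = edits
        return padded.reshape(-1, step).mean(axis=1)

    daily = np.array([1, 0, 2, 3, 0, 0, 1])   # seven days of edit counts
    print(estimate_rate(daily, 3))            # [1.0, 1.0, 0.333...]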
Index: trunk/tools/wsor/editor_lifecycle/graphlife
@@ -1,85 +0,0 @@
2 -#!/usr/bin/python
3 -
4 -''' plot editor life cycle '''
5 -
6 -import sys
7 -import numpy as np
8 -from argparse import ArgumentParser
9 -import os
10 -
11 -__prog__ = os.path.basename(os.path.abspath(__file__))
12 -
13 -parser = ArgumentParser(description=__doc__)
14 -parser.add_argument('data_files', metavar='data', nargs='+')
15 -parser.add_argument('-l', '--label', metavar='TEXT', action='append',
16 - dest='labels_list', default=[])
17 -parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
18 -parser.add_argument('-batch', action='store_true', help='uses PDF backend')
19 -parser.add_argument('-title')
20 -parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
21 -
22 -if __name__ == '__main__':
23 - ns = parser.parse_args()
24 -
25 - # checks
26 - if len(ns.data_files) != len(ns.labels_list):
27 - print >> sys.stderr, '%s: error: please provide as many labels '\
28 - 'as lines' % __prog__
29 - sys.exit(1)
30 -
31 - # import pyplot, make lists of colors and markers
32 - if ns.batch:
33 - import matplotlib
34 - matplotlib.use('PDF')
35 - import matplotlib.pyplot as pp
36 - from matplotlib.lines import lineMarkers as markers
37 - markers = dict(filter(
38 - lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing',
39 - markers.items())).keys()
40 - colors = 'krbgm'
41 -
42 - # create figure and axes
43 - fig = pp.figure()
44 - ax = pp.axes([.1, .1, .85, .8])
45 -
46 - # add lines
47 - N = len(ns.data_files)
48 - for i in xrange(N):
49 - data_file = ns.data_files[i]
50 - label = ns.labels_list[i]
51 - color = colors[i % len(colors)]
52 - marker = markers[i % len(markers)]
53 - x, y, ye = np.loadtxt(data_file, unpack=1)
54 - ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
55 - mec=color, ls=':', label=label)
56 -
57 - ax.legend(loc=2)
58 - ax.set_xlabel('days since registration')
59 - ax.set_ylabel('edits/day')
60 - if ns.title is not None:
61 - ax.set_title(ns.title)
62 - ax.axis('tight')
63 -
64 - # plot hist of lifetimes in inset axes
65 - lt = np.loadtxt(ns.inset_data_file)
66 - inax = pp.axes([.55, .6, .35, .25], axisbg='none')
67 - inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
68 - for l in inax.xaxis.get_ticklabels():
69 - l.set_rotation(30)
70 - l.set_fontsize('x-small')
71 - for l in inax.yaxis.get_ticklabels():
72 - l.set_fontsize('x-small')
73 - inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
74 - inax.set_ylabel('no. of users living\nlonger than $x$ days', fontsize='small')
75 - inax.set_title('account lifetime')
76 - inax.axis('tight')
77 -
78 - pp.draw()
79 - if ns.title is not None:
80 - fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
81 - else:
82 - fn = 'output.' + ns.fmt
83 - print 'output saved to %s' % fn
84 -
85 - pp.savefig(fn, fmt=ns.fmt)
86 - pp.show()
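
The inset histogram uses cumulative=-1, a reverse-cumulative count: each bin shows how many accounts have a lifetime of at least that bin's left edge, which is the quantity named on the inset's y axis. Note also that the inset is drawn unconditionally, so omitting -inset makes np.loadtxt fail. The same reverse-cumulative numbers computed by hand (a sketch with toy lifetimes):

    import numpy as np

    lifetimes = np.array([5, 30, 30, 120, 400])      # account lifetimes in days
    counts, edges = np.histogram(lifetimes, bins=[0, 50, 100, 500])
    ccum = counts[::-1].cumsum()[::-1]               # matplotlib's cumulative=-1
    print(ccum)                                      # [5 2 2]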
Index: trunk/tools/wsor/editor_lifecycle/README.rst
@@ -1,34 +0,0 @@
2 -============
3 -README
4 -============
5 -
6 -Workflow
7 -
8 -This package is a collection of Python and shell scripts that assist in
9 -creating and analyzing data on the editor life cycle.
10 -
11 -Sample selection
12 -
13 -TBD
14 -
15 -Edit activity data collection
16 -
17 -First use `fetchrates` to download the rate data from the MySQL database. This
18 -script takes a user_id as input and stores the rate data in a file called
19 -<user_id>.npy. The script can be run in parallel; at the end you will have one
20 -NPY file per user.
21 -
22 -Cohort selection
23 -
24 -See the docstring in `mkcohort`.
25 -
26 -Cohort analysis
27 -
28 -See `graphlife`, `fitting`, `fitting_batch.sh`, and `relax`.
29 -
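
Every intermediate file in this workflow is a plain NumPy container, so any stage can be inspected directly. A sketch, assuming a cohort archive 2010-01.npz produced by mkcohort from the per-user NPY files written by fetchrates (the archive name is hypothetical):

    import numpy as np

    arch = np.load('2010-01.npz')          # one entry per user id in the cohort
    uid = arch.files[0]
    days, edits = arch[uid].T              # per-user rows of (day, edits that day)
    print(uid, np.ptp(days), edits.sum())  # account lifetime and total edits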
Index: trunk/tools/wsor/editor_lifecycle/fitting
@@ -1,125 +0,0 @@
2 -#!/usr/bin/python
3 -# coding: utf-8
4 -# :vim:ft=python
5 -
6 -''' editor lifecycle data fitting tool '''
7 -
8 -import sys
9 -import os
10 -from functools import partial
11 -import numpy as np
12 -from argparse import ArgumentParser
13 -from scipy.optimize import curve_fit
14 -
15 -from models import Expon, PowerLaw, StretchedExpon
16 -import scale
17 -
18 -__prog__ = os.path.basename(os.path.abspath(__file__))
19 -
20 -_maxfev = 10000
21 -
22 -parent = ArgumentParser(add_help=False)
23 -parent.add_argument('data_file', metavar='DATA')
24 -parent.add_argument('-output', dest='output_file', metavar='FILE')
25 -parent.add_argument('-title')
26 -group = parent.add_mutually_exclusive_group()
27 -group.add_argument('-loglog', action='store_true')
28 -group.add_argument('-loglin', action='store_true')
29 -parent.add_argument('-constrained', action='store_true')
30 -parent.add_argument('-batch', action='store_true', help='do not show graphics')
31 -parent.add_argument('-force', action='store_true', help='force overwrite')
32 -
33 -parser = ArgumentParser(description=__doc__)
34 -subparsers = parser.add_subparsers(help='Parametric models supported')
35 -
36 -parser_expon = subparsers.add_parser('expon', parents=[parent])
37 -parser_expon.set_defaults(modelclass=Expon)
38 -
39 -parser_stretch = subparsers.add_parser('stretchedexp', parents=[parent])
40 -parser_stretch.set_defaults(modelclass=StretchedExpon)
41 -
42 -parser_power = subparsers.add_parser('powerlaw', parents=[parent])
43 -parser_power.set_defaults(modelclass=PowerLaw)
44 -
45 -def plotfit(model, x, y, ye, data=None):
46 - xx = np.linspace(x.min(), x.max(), endpoint=True, num=1000)
47 - yy = model(xx)
48 - pp.errorbar(x, y, ye / 2, fmt='. ', label=data or 'data', color='k', ecolor='none')
49 - model_label = model.name.split()
50 - if len(model_label) > 1:
51 - model_label[1] = model_label[1][:3] + '.'
52 - model_label = ' '.join(model_label[:2]).capitalize()
53 - pp.plot(xx, yy, 'r--', label='{} fit'.format(model_label), lw=2.5)
54 - if ns.loglog:
55 - pp.xscale('log')
56 - pp.yscale('log')
57 - elif ns.loglin:
58 - pp.xscale('power', exponent=model.beta)
59 - pp.yscale('log')
60 - pp.legend(loc='best')
61 - if ns.title is not None:
62 - pp.title(ns.title)
63 - pp.xlabel('Days since registration')
64 - pp.ylabel('Edits/day')
65 -
66 - # residuals - uncomment lines to produce relative residuals plots
67 - pp.figure()
68 - r = model(x) - y
69 -# rm = r[True - np.isinf(r)].max()
70 -# r /= np.abs(rm)
71 - pp.axhline(y=0, c='k')
72 - pp.plot(x, r, '.:k')
73 - pp.title('Fit residuals')
74 - pp.xlabel('Days since registration')
75 -# pp.ylabel(r'Relative residual $\xi / \max{|\xi|}$')
76 -# pp.ylim(-1,1)
77 - pp.draw()
78 -
79 -def _testoverwrite(*files):
80 - exit_flag = False
81 - for fn in files:
82 - if os.path.exists(fn):
83 - exit_flag = True
84 - print '%s: error: cannot overwrite %s' % (__prog__, fn)
85 - if exit_flag:
86 - sys.exit(1)
87 -
88 -def main(ns):
89 - x, y, ye = np.loadtxt(ns.data_file, unpack=True)
90 - model = ns.modelclass()
91 - if ns.constrained:
92 - pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev, constrained=1)
93 - else:
94 - pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev)
95 - perr = np.sqrt(np.diag(pcov)) / 2.
96 - model.setparams(*zip(pest, perr))
97 - gof, resid, Rsquared = model.gof(x, y, ye)
98 - model.goftest = gof
99 - model.residtest = resid
100 - model.Rsquared = Rsquared
101 - print model.summary(dataset=ns.data_file, observations=len(x))
102 - plotfit(model, x, y, ye, data=os.path.splitext(ns.data_file)[0])
103 - if ns.output_file is not None:
104 - fn, ext = os.path.splitext(ns.output_file)
105 - fmt = ext[1:]
106 - if ns.batch and fmt.lower() != 'pdf':
107 - print '%s: error: batch mode supports only PDF format' % __prog__
108 - sys.exit(1)
109 - resid_output_file = fn + '_residuals' + ext
110 - if not ns.force:
111 - _testoverwrite(ns.output_file, resid_output_file)
112 - pp.figure(1)
113 - pp.savefig(ns.output_file, format=fmt)
114 - print '%s: output saved to %s' % (__prog__, ns.output_file)
115 - pp.figure(2)
116 - pp.savefig(resid_output_file, format=fmt)
117 - print '%s: output saved to %s' % (__prog__, resid_output_file)
118 - pp.show()
119 -
120 -if __name__ == '__main__':
121 - ns = parser.parse_args()
122 - if ns.batch:
123 - import matplotlib
124 - matplotlib.use('PDF')
125 - import matplotlib.pyplot as pp
126 - main(ns)
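
The diagnostics printed by summary() come from the models' gof() methods: a chi-squared of the residuals weighted by the reported measurement errors, its survival-function p-value, and the coefficient of determination. Note that scipy.stats.chisqprob, used throughout models.py, has since been removed from SciPy; chi2.sf is the modern equivalent. A sketch of the same statistics:

    import numpy as np
    from scipy.stats import chi2

    def gof(y, yfit, yerr, n_params):
        chisq = np.sum(((y - yfit) / yerr) ** 2)
        ddof = len(y) - n_params          # degrees of freedom
        pvalue = chi2.sf(chisq, ddof)     # replaces the removed chisqprob
        rsq = 1.0 - np.sum((y - yfit) ** 2) / np.sum((y - y.mean()) ** 2)
        return chisq, pvalue, ddof, rsq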
Index: trunk/tools/wsor/editor_lifecycle/mkcohort
@@ -1,118 +0,0 @@
2 -#!/usr/bin/python
3 -# coding: utf-8
4 -# :vim:ft=python
5 -
6 -''' creates cohort files, filtering out bots '''
7 -
8 -'''
9 -This script reads an index file, which is a tab-separated text file like the
10 -following:
11 -
12 - 34 WojPob 20010129110725 2524
13 - 94 AstroNomer 20010207222248 1532
14 - 43 Lee Daniel Crocker 20010314020407 4388
15 - 86 Stephen Gilbert 20010326191355 3599
16 - 3 Tobias Hoevekamp 20010326202105 1903
17 - 1273 Wathiik 20010510171751 1772
18 - 3371 Arno 20010721180708 2700
19 - 122 Ap 20010722201619 2137
20 - 182 Rjstott 20010726102546 2602
21 - 64 Uriyan 20010727141651 1634
22 -
23 -Where fields are: id, name, date, count. Dates are parsed using dateutil, so
24 -other formats are allowed too (e.g. 2010-01-29 11:07:25). Currently the last
25 -column (editcount) is not used, but the script still expects to find it, so you
26 -can put whatever you want in it.
27 -
28 -The script will aggregate users based on the date field and will look up
29 -files of the form <id>.npy in the current directory. These files contain the
30 -daily edit counts for each individual user, stored using the NumPy binary array
31 -format. If the data files are not in the current directory, a different path
32 -can be specified on the command line (-P/--datapath).
33 -
34 -Based on the level of aggregation (say: months), the script will create a
35 -compressed ZIP archive with the user edit counts data files (e.g.: 2010-01.npz
36 -for all users from January 2010). This compressed archive can be later processed
37 -with the script `fitting' or with the load() function from NumPy.
38 -
39 -The script prints the name of each file it produces, how many users the file
40 -contains, and how many suspected bot users it filtered out from the index
41 -(use --bots to include them). A user is filtered out based on the name
42 -field: if the name contains the pattern 'bot' at the beginning or at the end of
43 -any word, it will be filtered out (e.g. "Botuser IV" will match, but "Francis
44 -Abbott" won't).
45 -
46 -Please note that the index file must already be sorted by date for the
47 -group-by-date aggregation to work. You can use `sort' from the command line,
48 -e.g.:
49 -
50 - $~ sort -t$'\t' -k3 -h unsorted.tsv
51 -
52 -should sort file unsorted.tsv.
53 -'''
54 -
55 -import re
56 -import os
57 -import sys
58 -import csv
59 -from argparse import ArgumentParser, FileType
60 -from contextlib import closing
61 -from itertools import groupby
62 -from dateutil.parser import parser as DateParser
63 -from zipfile import ZipFile
64 -
65 -__prog__ = os.path.basename(os.path.abspath(__file__))
66 -_botpat = r'\bbot|bot\b'
67 -_fields = ['id', 'name', 'date', 'count']
68 -
69 -def yearkey(date):
70 - return date.year,
71 -
72 -def monthkey(date):
73 - return date.year, date.month
74 -
75 -def daykey(date):
76 - return date.year, date.month, date.day
77 -
78 -parser = ArgumentParser(description=__doc__)
79 -parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')
80 -group = parser.add_mutually_exclusive_group(required=1)
81 -group.add_argument('--year', help='group by year', action='store_const',
82 - const=yearkey, dest='keyfunc')
83 -group.add_argument('--month', help='group by month', action='store_const',
84 - const=monthkey, dest='keyfunc')
85 -group.add_argument('--day', help='group by day', action='store_const',
86 - const=daykey, dest='keyfunc')
87 -parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
88 -parser.add_argument('-P', '--datapath', help='data files location',
89 - default=os.path.curdir)
90 -
91 -dateparser = DateParser()
92 -
93 -if __name__ == '__main__':
94 - ns = parser.parse_args()
95 - reader = csv.DictReader(ns.index, _fields, dialect='excel-tab')
96 -
97 - def _keyfunc(row):
98 - date = dateparser.parse(row['date'])
99 - return ns.keyfunc(date)
100 -
101 - for key, subiter in groupby(reader, _keyfunc):
102 - tot_users = 0
103 - tot_bots = 0
104 - datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) -> '2010-01'
105 - zipfn = '{}.npz'.format(datestr)
106 - with closing(ZipFile(zipfn, 'w')) as zf:
107 - for row in subiter:
108 - user_id = row['id']
109 - if ns.bots or (re.search(_botpat, row['name'], re.I) is None):
110 - fn = os.path.join(ns.datapath, '{}.npy'.format(user_id ))
111 - if os.path.exists(fn):
112 - zf.write(fn, user_id)
113 - else:
114 - print >> sys.stderr, '%s: warning: missing %s' %\
115 - (__prog__, fn)
116 - else:
117 - tot_bots += 1
118 - tot_users += 1
119 - print '%s created (users: %5d, bots %5d)' % (zipfn, tot_users, tot_bots)
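
The bot filter is the _botpat regex above: 'bot' anchored at the start or the end of a word, case-insensitively. Note that it also catches names such as 'Abbot', where 'bot' happens to end a word, so some false positives are expected. A quick demonstration:

    import re

    botpat = r'\bbot|bot\b'
    for name in ('Botuser IV', 'SpellBot', 'Francis Abbott', 'Abbot Costello'):
        print(name, bool(re.search(botpat, name, re.I)))
    # Botuser IV True, SpellBot True, Francis Abbott False, Abbot Costello True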
Index: trunk/tools/wsor/editor_lifecycle/setup.py
@@ -1,10 +0,0 @@
2 -from distutils.core import setup
3 -
4 -setup(
5 - name='lifecycle',
6 - description='WMF summer of research project',
7 - version='0.0.0',
8 - author='Giovanni Luca Ciampaglia',
9 - author_email='gciampaglia@wikimedia.org',
10 - scripts=['fetchrates', 'graphlife', 'fetchcohort']
11 -)
Index: trunk/tools/wsor/editor_lifecycle/fetchcohort
@@ -1,59 +0,0 @@
2 -#!/usr/bin/python
3 -# vim:ft=python:
4 -# coding : utf-8
5 -
6 -''' fetches a cohort based on year of registration and editing activity '''
7 -
8 -from argparse import ArgumentParser
9 -from oursql import connect
10 -import os
11 -import sys
12 -import datetime as dt
13 -import csv
14 -
15 -prog = os.path.basename(os.path.abspath(__file__))
16 -
17 -parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
18 -parser.add_argument('registration_year', metavar='year', type=int)
19 -parser.add_argument('min_activity', metavar='minedits', type=int)
20 -parser.add_argument('max_activity', metavar='maxedits', type=int)
21 -parser.add_argument('-c', '--config', dest='config_file')
22 -parser.add_argument('-l', '--limit', type=int)
23 -
24 -query = '''
25 -select
26 - user_id,
27 - user_name,
28 - user_registration,
29 - user_editcount
30 -from user u left join user_groups ug
31 -on u.user_id = ug.ug_user
32 -where
33 - (ug_group <> 'bot' or ug_user is null)
34 - and year(user_registration) = ?
35 - and user_editcount > ?
36 - and user_editcount < ?
37 -'''
38 -
39 -if __name__ == '__main__':
40 - ns = parser.parse_args()
41 - if ns.min_activity >= ns.max_activity:
42 - print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog
43 - sys.exit(1)
44 - if ns.registration_year < 2001 or ns.registration_year > dt.datetime.now().year:
45 - print >> sys.stderr, '%s: error: illegal year: %d' % (prog,
46 - ns.registration_year)
47 - sys.exit(1)
48 -
49 - if ns.limit is not None:
50 - query += 'limit %d' % ns.limit
51 -
52 - if ns.config_file is None:
53 - ns.config_file = os.path.expanduser('~/.my.cnf')
54 -
55 - conn = connect(read_default_file=ns.config_file)
56 - writer = csv.writer(sys.stdout, dialect='excel-tab')
57 - cursor = conn.cursor()
58 - cursor.execute(query, (ns.registration_year, ns.min_activity, ns.max_activity))
59 - for row in cursor:
60 - writer.writerow(row)
Index: trunk/tools/wsor/editor_lifecycle/fetchrates
@@ -1,69 +0,0 @@
2 -#!/usr/bin/python
3 -
4 -import sys
5 -from oursql import connect
6 -from argparse import ArgumentParser
7 -import numpy as np
8 -import os
9 -from time import time
10 -
11 -parser = ArgumentParser(description=__doc__)
12 -parser.add_argument('user_id', type=int)
13 -parser.add_argument('-config', dest='config_file')
14 -parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
15 -
16 -# TODO get also deleted revisions!
17 -
18 -query = """
19 -select unix_timestamp(rev_timestamp)/86400.0
20 -from revision
21 -where rev_user = ?
22 -order by rev_timestamp
23 -"""
24 -
25 -prog = os.path.basename(os.path.abspath(__file__))
26 -
27 -def main(ns):
28 - # get mysql client configuration file
29 - mycnf = os.path.expanduser('~/.my.cnf')
30 - if ns.config_file is None and not os.path.exists(mycnf):
31 - print >> sys.stderr, ('%s: no config file specified and $HOME/.my.cnf'
32 - ' not found') % prog
33 - sys.exit(1)
34 - elif ns.config_file is None:
35 - ns.config_file = mycnf
36 -
37 - # test output directory exists
38 - if not os.path.exists(ns.output_dir):
39 - print >> sys.stderr, '%s: output directory does not exist: %s' % (
40 - prog, ns.output_dir)
41 - sys.exit(1)
42 - if not os.path.isdir(ns.output_dir):
43 - print >> sys.stderr, '%s: not a directory: %s' % (prog, ns.output_dir)
44 - sys.exit(1)
45 - # start timer
46 - tstart = time()
47 -
48 - # connect run query
49 - conn = connect(read_default_file=ns.config_file)
50 - cursor = conn.cursor()
51 - cursor.execute(query, (ns.user_id,))
52 -
53 - # compute rates and save to file
54 - revs = np.asfarray(list(cursor))
55 - m, M = np.floor(revs.min()), np.ceil(revs.max())
56 - rates, days = np.histogram(revs, range=(m,M), bins=(M-m))
57 - data = np.c_[days[:-1], rates]
58 - out_path = os.path.join(ns.output_dir, '%d.npy' % ns.user_id)
59 - np.save(out_path, data)
60 -
61 - # stop timer
62 - tstop = time()
63 - print '%s: output saved to %s (execution time: %g sec, fetched: %d rows)' % (
64 - prog, out_path, tstop - tstart, len(revs))
65 -
66 -if __name__ == '__main__':
67 - # get arguments from command line
68 - ns = parser.parse_args()
69 - main(ns)
70 -
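The rate computation above bins fractional-day timestamps into one-day bins and stores rows of (day, edit count). A sketch with toy data; note that modern NumPy requires bins to be an explicit int, which the original passes as a float:

    import numpy as np

    rev_days = np.array([0.2, 0.4, 0.5, 2.7, 2.9, 3.1])  # days since the epoch
    lo, hi = np.floor(rev_days.min()), np.ceil(rev_days.max())
    rates, edges = np.histogram(rev_days, range=(lo, hi), bins=int(hi - lo))
    data = np.c_[edges[:-1], rates]   # rows of (day, edit count)
    print(data)                       # [[0. 3.] [1. 0.] [2. 2.] [3. 1.]]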
