Index: trunk/tools/wsor/editor_lifecycle/plotpeak |
— | — | @@ -0,0 +1,101 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# :vim:ft=python |
| 4 | + |
| 5 | +import os |
| 6 | +import sys |
| 7 | + |
| 8 | +import numpy as np |
| 9 | +import matplotlib.pyplot as pp |
| 10 | + |
| 11 | +from argparse import ArgumentParser |
| 12 | +from collections import defaultdict |
| 13 | +from datetime import datetime |
| 14 | +from matplotlib.font_manager import FontProperties |
| 15 | +from dateutil.parser import parser as DateParser |
| 16 | + |
| 17 | +__prog__ = os.path.basename(__file__) |
| 18 | + |
| 19 | +parser = ArgumentParser(description=__doc__) |
| 20 | +parser.add_argument('data_paths', metavar='data', nargs='+') |
| 21 | +parser.add_argument('-o', '--output', dest='output_path', metavar='FILE') |
| 22 | +parser.add_argument('-T', '--title') |
| 23 | + |
| 24 | +date_parser = DateParser() |
| 25 | +default_date = datetime(2001,1,1) |
| 26 | + |
| 27 | +markers = 'ov^<>sp*+xD' |
| 28 | +colors = 'bgrcmykw' |
| 29 | +conv = defaultdict(lambda k : float) |
| 30 | +conv[0] = lambda k : date_parser.parse(k, default=default_date) |
| 31 | + |
| 32 | +if __name__ == '__main__': |
| 33 | + ns = parser.parse_args() |
| 34 | + |
| 35 | + # create figure |
| 36 | + fig = pp.figure(figsize=(10,4)) |
| 37 | + ax1 = fig.add_axes(pp.axes([.1,.15,.65,.75], axisbg='ghostwhite')) |
| 38 | + ax2 = ax1.twinx() |
| 39 | + |
| 40 | + M = len(markers) |
| 41 | + C = len(colors) |
| 42 | + |
| 43 | + lines = [] |
| 44 | + |
| 45 | + for i, path in enumerate(ns.data_paths): |
| 46 | + |
| 47 | + # load cohort data and filter out estimates based on samples with size |
| 48 | + # smaller than minimum requested |
| 49 | + arr = np.loadtxt(path, delimiter='\t', dtype=object, converters=conv, |
| 50 | + skiprows=1) |
| 51 | + |
| 52 | + if len(arr) == 0: |
| 53 | + print >> sys.stderr, '%s: warning: skipping empty dataset: %s' % \ |
| 54 | + (__prog__, path) |
| 55 | + continue |
| 56 | + |
| 57 | + date = arr[:, 0] |
| 58 | + idx = np.argsort(date) |
| 59 | + arr = arr[idx] |
| 60 | + date = arr[:, 0] |
| 61 | + act, peak_date, peak_date_err, peak, peak_err = map(np.asfarray, |
| 62 | + arr[:,1:].T) |
| 63 | + |
| 64 | + label = os.path.splitext(path)[0].replace('_',' ') |
| 65 | + |
| 66 | +# color = colors[i % C] |
| 67 | + marker = markers[i % M] |
| 68 | + l, (wu, wd), mc = ax1.errorbar(date, peak_date, peak_date_err / 2.0, |
| 69 | + marker=marker, mec='red', label=label, ecolor='lightgrey', |
| 70 | + ls='none', lw=2, mfc='none') |
| 71 | + lines.append(l) |
| 72 | + |
| 73 | + l, (wu, wd), mc = ax2.errorbar(date, peak, peak_err / 2.0, |
| 74 | + marker=marker, mec='blue', label=label, ecolor='lightgrey', |
| 75 | + ls='none', lw=2, mfc='none') |
| 76 | + |
| 77 | + lines.append(l) |
| 78 | + |
| 79 | + # decorate figure |
| 80 | + ax1.set_xlabel('cohorts') |
| 81 | + ax1.set_ylabel('peak day', color='red') |
| 82 | + ax2.set_ylabel('peak edits/day', color='blue') |
| 83 | + pp.figlegend(lines, [ l.get_label() for l in lines ], |
| 84 | + loc='center right', prop=FontProperties(size='small')) |
| 85 | + pp.minorticks_on() |
| 86 | + pp.grid("on") |
| 87 | + pp.axis('tight') |
| 88 | + |
| 89 | + pp.draw() |
| 90 | + if ns.title is not None: |
| 91 | + pp.title(ns.title) |
| 92 | + pp.draw() |
| 93 | + |
| 94 | + # save to file, is output path specified |
| 95 | + if ns.output_path is not None: |
| 96 | + _, ext = os.path.splitext(ns.output_path) |
| 97 | + fmt = ext.strip('.') or 'pdf' |
| 98 | + pp.savefig(ns.output_path, fmt=ext) |
| 99 | + print '%s: output saved to %s' % (__prog__, ns.output_path) |
| 100 | + |
| 101 | + pp.show() |
| 102 | + |
Property changes on: trunk/tools/wsor/editor_lifecycle/plotpeak |
___________________________________________________________________ |
Added: svn:executable |
1 | 103 | + * |
Index: trunk/tools/wsor/editor_lifecycle/timechart |
— | — | @@ -13,8 +13,9 @@ |
14 | 14 | |
15 | 15 | from argparse import ArgumentParser |
16 | 16 | from matplotlib.font_manager import FontProperties |
| 17 | +from scipy.interpolate import UnivariateSpline |
17 | 18 | |
18 | | -from lifecycle.cvsmooth import find_peak |
| 19 | +from lifecycle.cvsmooth import find_peak |
19 | 20 | |
20 | 21 | __prog__ = os.path.basename(__file__) |
21 | 22 | |
— | — | @@ -67,20 +68,19 @@ |
68 | 69 | marker=marker, mec=color, label=label, ecolor='lightgrey', |
69 | 70 | ls='none', lw=2, mfc='none') |
70 | 71 | pp.setp(wd, ls='none') |
| 72 | + lines.append(l) |
71 | 73 | |
72 | | - xp, spl = find_peak(days, rate, rate_err / 2.0) |
| 74 | + # compute maximum |
| 75 | + xp, xperr, s, ests = find_peak(days, rate, rate_err) |
| 76 | + ax.axvline(xp, color=color) |
73 | 77 | |
| 78 | + # plot spline |
| 79 | + spl = UnivariateSpline(days, rate, rate_err ** -1, s=s) |
74 | 80 | x = np.linspace(days.min(), days.max(), endpoint=True, num=100) |
75 | | - |
76 | 81 | y = spl(x) |
77 | | - |
78 | 82 | ax.plot(x, y, label='spline fit', color=color, ls='--', marker='none', |
79 | 83 | lw=2) |
80 | 84 | |
81 | | - ax.axvline(xp, color=color) |
82 | | - |
83 | | - lines.append(l) |
84 | | - |
85 | 85 | # decorate figure |
86 | 86 | pp.xlabel('days since first edit') |
87 | 87 | pp.ylabel('edits/day') |
Index: trunk/tools/wsor/editor_lifecycle/comppeak |
— | — | @@ -0,0 +1,66 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# :vim:ft=python |
| 4 | + |
| 5 | +''' compute activity peaks ''' |
| 6 | + |
| 7 | +__author__ = "Giovanni Luca Ciampaglia" |
| 8 | +__email__ = "gciampaglia@wikimedia.org" |
| 9 | + |
| 10 | +import os |
| 11 | +import sys |
| 12 | + |
| 13 | +import numpy as np |
| 14 | + |
| 15 | +from argparse import ArgumentParser |
| 16 | +from lifecycle.cvsmooth import find_peak |
| 17 | +from scipy.interpolate import UnivariateSpline |
| 18 | +from warnings import catch_warnings, simplefilter |
| 19 | + |
| 20 | +__prog__ = os.path.basename(__file__) |
| 21 | + |
| 22 | +parser = ArgumentParser(description=__doc__) |
| 23 | +parser.add_argument('data_paths', metavar='data', nargs='+') |
| 24 | +parser.add_argument('-m', '--min-size', type=int, default=0) |
| 25 | +parser.add_argument('-o', '--output', dest='output_path', metavar='FILE') |
| 26 | +parser.add_argument('-i', '--iter', type=int, metavar='NUM', |
| 27 | + dest='bootstrap_iter', default=10000, |
| 28 | + help='bootstrap iterations (default: %(default)d)') |
| 29 | + |
| 30 | +if __name__ == '__main__': |
| 31 | + ns = parser.parse_args() |
| 32 | + |
| 33 | + if ns.output_path is not None: |
| 34 | + output_file = open(ns.output_path, 'w') |
| 35 | + else: |
| 36 | + output_file = sys.stdout |
| 37 | + |
| 38 | + print >> output_file,\ |
| 39 | + 'date\tactivity\tpeak_date\tpeak_date_err\tpeak\tpeak_err' |
| 40 | + for path in ns.data_paths: |
| 41 | + |
| 42 | + # get date and activity rate for file name |
| 43 | + basename = os.path.basename(path) |
| 44 | + basename, _ = os.path.splitext(basename) |
| 45 | + cohort_date, act = basename.split('_') |
| 46 | + act = int(act) |
| 47 | + |
| 48 | + # load data and check if fittable |
| 49 | + x, y, ye, n = map(np.atleast_1d, np.loadtxt(path, unpack=1)) |
| 50 | + idx = (n > ns.min_size) * (ye > 0) |
| 51 | + x = x[idx] |
| 52 | + y = y[idx] |
| 53 | + ye = ye[idx] |
| 54 | + if len(x) - 1 <= 3: # takes into account the leave-one-out CV |
| 55 | + print >> sys.stderr, 'skipping %s' % path |
| 56 | + continue |
| 57 | + |
| 58 | + # fit data and report peak activity and uncertainty |
| 59 | + with catch_warnings(): |
| 60 | + simplefilter('ignore', category=Warning) |
| 61 | + (xp, yp), (xperr, yperr), s, ests = find_peak(x, y, ye, |
| 62 | + reps=ns.bootstrap_iter) |
| 63 | + spl = UnivariateSpline(x, y, ye ** -1, s=s) |
| 64 | + print >> output_file, '%s\t%d\t%12.5f\t%12.5f\t%12.5f\t%12.5f' % \ |
| 65 | + (cohort_date, act, xp, xperr, yp, yperr) |
| 66 | + |
| 67 | + |
Property changes on: trunk/tools/wsor/editor_lifecycle/comppeak |
___________________________________________________________________ |
Added: svn:executable |
1 | 68 | + * |
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py |
— | — | @@ -1,13 +1,17 @@ |
| 2 | +import sys |
| 3 | + |
2 | 4 | import numpy as np |
3 | | -from scipy.interpolate import splrep, splev, UnivariateSpline |
| 5 | +from scipy.interpolate import UnivariateSpline |
4 | 6 | from scipy.optimize import fmin_tnc |
5 | 7 | import scipy.optimize.tnc as tnc |
| 8 | +from multiprocessing import Pool |
| 9 | +from multiprocessing.pool import ApplyResult |
6 | 10 | |
7 | 11 | def spsmooth(x, y, ye, **kwargs): |
8 | 12 | ''' |
9 | 13 | Finds the best spline smoothing factor using leave-one-out cross validation |
10 | 14 | |
11 | | - Additional keyword arguments are passed to splrep (e.g. k for the degree) |
| 15 | + Additional keyword arguments are passed to UnivariateSpline (e.g. k for the degree) |
12 | 16 | ''' |
13 | 17 | |
14 | 18 | best = [] |
— | — | @@ -33,8 +37,8 @@ |
34 | 38 | err_best = np.inf |
35 | 39 | |
36 | 40 | for s in slist: |
37 | | - tck = splrep(train_x, train_y, w=train_w, s=s, **kwargs) |
38 | | - err = np.sqrt((splev(test_x, tck) - test_y) ** 2) |
| 41 | + spl = UnivariateSpline(train_x, train_y, w=train_w, s=s) |
| 42 | + err = np.sqrt((spl(test_x) - test_y) ** 2) |
39 | 43 | |
40 | 44 | if err < err_best: |
41 | 45 | s_best = s |
— | — | @@ -43,18 +47,8 @@ |
44 | 48 | best.append(s_best) |
45 | 49 | |
46 | 50 | return np.mean(best) |
47 | | - |
48 | | -def find_peak(x,y,ye,k=3): |
49 | | - ''' |
50 | | - Finds maximum in time series (x_i, y_i) using smoothing splines |
51 | 51 | |
52 | | - Parameters |
53 | | - ---------- |
54 | | - x,y - data |
55 | | - ye - standard errors estimates |
56 | | - k - spline degree (must be <= 5) |
57 | | - ''' |
58 | | - s = spsmooth(x, y, ye, k=k) |
| 52 | +def _find_peak(x, y, ye, k=3, s=None): |
59 | 53 | spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s) |
60 | 54 | f = lambda k : -spl(k) |
61 | 55 | fprime = np.vectorize(lambda k : - spl.derivatives(k)[1]) |
— | — | @@ -65,8 +59,59 @@ |
66 | 60 | x0 = (x.ptp() * np.random.rand() + x.min(),) |
67 | 61 | xp, nfeval, rc = fmin_tnc(f, x0, fprime=fprime, bounds=bounds, |
68 | 62 | messages=tnc.MSG_NONE) |
| 63 | + xp = xp.item() |
69 | 64 | yp = spl(xp) |
70 | 65 | if yp >= yp_best: |
71 | 66 | xp_best = xp |
72 | 67 | yp_best = yp |
73 | | - return xp_best, spl |
| 68 | + return xp_best, yp_best.item() |
| 69 | + |
| 70 | +def _init(): |
| 71 | + import signal |
| 72 | + signal.signal(signal.SIGINT, signal.SIG_IGN) |
| 73 | + import warnings |
| 74 | + warnings.simplefilter("ignore", category=UserWarning) |
| 75 | + |
| 76 | +def find_peak(x, y, ye, k=3,reps=10000): |
| 77 | + ''' |
| 78 | + Finds maximum in time series (x_i, y_i) using smoothing splines |
| 79 | + |
| 80 | + Parameters |
| 81 | + ---------- |
| 82 | + x,y - data |
| 83 | + ye - standard errors estimates |
| 84 | + k - spline degree (must be <= 5) |
| 85 | + |
| 86 | + Returns |
| 87 | + ------- |
| 88 | + xp - maximum of the smoothing spline |
| 89 | + xperr - uncertainty in the maximum |
| 90 | + s - the smoothing factor (either the one passed, or the one estimated) |
| 91 | + ''' |
| 92 | + # compute the peak |
| 93 | + s = spsmooth(x, y, ye, k=k) |
| 94 | + xp, yp = _find_peak(x, y, ye, k=k, s=s) |
| 95 | + |
| 96 | + # compute uncertainty by bootstrap |
| 97 | + results = [] |
| 98 | + N = len(x) |
| 99 | + pool = Pool() |
| 100 | + spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s) |
| 101 | + yf = spl(x) |
| 102 | + err = y - yf |
| 103 | + kwds = {'k' : k, 's' : s} |
| 104 | + try: |
| 105 | + for i in xrange(reps): |
| 106 | + idx = np.random.randint(0, len(x), len(x)) |
| 107 | + ys = yf + err[idx] |
| 108 | + res = pool.apply_async(_find_peak, args=(x, ys, ye), |
| 109 | + kwds=kwds) |
| 110 | + results.append(res) |
| 111 | + results = np.asarray(map(ApplyResult.get, results)) |
| 112 | + xerr = np.std(results[:,0], ddof=1, axis=0) |
| 113 | + # geometric standard deviation |
| 114 | + yerr = np.exp(np.std(np.log(results[:,1]), ddof=1)) |
| 115 | + finally: |
| 116 | + pool.terminate() |
| 117 | + pool.join() |
| 118 | + return (xp, yp), (xerr, yerr), s, results |