r95303 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r95302‎ | r95303 | r95304 >
Date:08:07, 23 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
reworked bootstrapping, added scripts for computing and plotting activity peaks
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/comppeak (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py (modified) (history)
  • /trunk/tools/wsor/editor_lifecycle/plotpeak (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/timechart (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/plotpeak
@@ -0,0 +1,101 @@
 2+#!/usr/bin/python
 3+# :vim:ft=python
 4+
 5+import os
 6+import sys
 7+
 8+import numpy as np
 9+import matplotlib.pyplot as pp
 10+
 11+from argparse import ArgumentParser
 12+from collections import defaultdict
 13+from datetime import datetime
 14+from matplotlib.font_manager import FontProperties
 15+from dateutil.parser import parser as DateParser
 16+
 17+__prog__ = os.path.basename(__file__)
 18+
 19+parser = ArgumentParser(description=__doc__)
 20+parser.add_argument('data_paths', metavar='data', nargs='+')
 21+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
 22+parser.add_argument('-T', '--title')
 23+
 24+date_parser = DateParser()
 25+default_date = datetime(2001,1,1)
 26+
 27+markers = 'ov^<>sp*+xD'
 28+colors = 'bgrcmykw'
 29+conv = defaultdict(lambda k : float)
 30+conv[0] = lambda k : date_parser.parse(k, default=default_date)
 31+
 32+if __name__ == '__main__':
 33+ ns = parser.parse_args()
 34+
 35+ # create figure
 36+ fig = pp.figure(figsize=(10,4))
 37+ ax1 = fig.add_axes(pp.axes([.1,.15,.65,.75], axisbg='ghostwhite'))
 38+ ax2 = ax1.twinx()
 39+
 40+ M = len(markers)
 41+ C = len(colors)
 42+
 43+ lines = []
 44+
 45+ for i, path in enumerate(ns.data_paths):
 46+
 47+ # load cohort data and filter out estimates based on samples with size
 48+ # smaller than minimum requested
 49+ arr = np.loadtxt(path, delimiter='\t', dtype=object, converters=conv,
 50+ skiprows=1)
 51+
 52+ if len(arr) == 0:
 53+ print >> sys.stderr, '%s: warning: skipping empty dataset: %s' % \
 54+ (__prog__, path)
 55+ continue
 56+
 57+ date = arr[:, 0]
 58+ idx = np.argsort(date)
 59+ arr = arr[idx]
 60+ date = arr[:, 0]
 61+ act, peak_date, peak_date_err, peak, peak_err = map(np.asfarray,
 62+ arr[:,1:].T)
 63+
 64+ label = os.path.splitext(path)[0].replace('_',' ')
 65+
 66+# color = colors[i % C]
 67+ marker = markers[i % M]
 68+ l, (wu, wd), mc = ax1.errorbar(date, peak_date, peak_date_err / 2.0,
 69+ marker=marker, mec='red', label=label, ecolor='lightgrey',
 70+ ls='none', lw=2, mfc='none')
 71+ lines.append(l)
 72+
 73+ l, (wu, wd), mc = ax2.errorbar(date, peak, peak_err / 2.0,
 74+ marker=marker, mec='blue', label=label, ecolor='lightgrey',
 75+ ls='none', lw=2, mfc='none')
 76+
 77+ lines.append(l)
 78+
 79+ # decorate figure
 80+ ax1.set_xlabel('cohorts')
 81+ ax1.set_ylabel('peak day', color='red')
 82+ ax2.set_ylabel('peak edits/day', color='blue')
 83+ pp.figlegend(lines, [ l.get_label() for l in lines ],
 84+ loc='center right', prop=FontProperties(size='small'))
 85+ pp.minorticks_on()
 86+ pp.grid("on")
 87+ pp.axis('tight')
 88+
 89+ pp.draw()
 90+ if ns.title is not None:
 91+ pp.title(ns.title)
 92+ pp.draw()
 93+
 94+ # save to file, is output path specified
 95+ if ns.output_path is not None:
 96+ _, ext = os.path.splitext(ns.output_path)
 97+ fmt = ext.strip('.') or 'pdf'
 98+ pp.savefig(ns.output_path, fmt=ext)
 99+ print '%s: output saved to %s' % (__prog__, ns.output_path)
 100+
 101+ pp.show()
 102+
Property changes on: trunk/tools/wsor/editor_lifecycle/plotpeak
___________________________________________________________________
Added: svn:executable
1103 + *
Index: trunk/tools/wsor/editor_lifecycle/timechart
@@ -13,8 +13,9 @@
1414
1515 from argparse import ArgumentParser
1616 from matplotlib.font_manager import FontProperties
 17+from scipy.interpolate import UnivariateSpline
1718
18 -from lifecycle.cvsmooth import find_peak
 19+from lifecycle.cvsmooth import find_peak
1920
2021 __prog__ = os.path.basename(__file__)
2122
@@ -67,20 +68,19 @@
6869 marker=marker, mec=color, label=label, ecolor='lightgrey',
6970 ls='none', lw=2, mfc='none')
7071 pp.setp(wd, ls='none')
 72+ lines.append(l)
7173
72 - xp, spl = find_peak(days, rate, rate_err / 2.0)
 74+ # compute maximum
 75+ xp, xperr, s, ests = find_peak(days, rate, rate_err)
 76+ ax.axvline(xp, color=color)
7377
 78+ # plot spline
 79+ spl = UnivariateSpline(days, rate, rate_err ** -1, s=s)
7480 x = np.linspace(days.min(), days.max(), endpoint=True, num=100)
75 -
7681 y = spl(x)
77 -
7882 ax.plot(x, y, label='spline fit', color=color, ls='--', marker='none',
7983 lw=2)
8084
81 - ax.axvline(xp, color=color)
82 -
83 - lines.append(l)
84 -
8585 # decorate figure
8686 pp.xlabel('days since first edit')
8787 pp.ylabel('edits/day')
Index: trunk/tools/wsor/editor_lifecycle/comppeak
@@ -0,0 +1,66 @@
 2+#!/usr/bin/python
 3+# :vim:ft=python
 4+
 5+''' compute activity peaks '''
 6+
 7+__author__ = "Giovanni Luca Ciampaglia"
 8+__email__ = "gciampaglia@wikimedia.org"
 9+
 10+import os
 11+import sys
 12+
 13+import numpy as np
 14+
 15+from argparse import ArgumentParser
 16+from lifecycle.cvsmooth import find_peak
 17+from scipy.interpolate import UnivariateSpline
 18+from warnings import catch_warnings, simplefilter
 19+
 20+__prog__ = os.path.basename(__file__)
 21+
 22+parser = ArgumentParser(description=__doc__)
 23+parser.add_argument('data_paths', metavar='data', nargs='+')
 24+parser.add_argument('-m', '--min-size', type=int, default=0)
 25+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
 26+parser.add_argument('-i', '--iter', type=int, metavar='NUM',
 27+ dest='bootstrap_iter', default=10000,
 28+ help='bootstrap iterations (default: %(default)d)')
 29+
 30+if __name__ == '__main__':
 31+ ns = parser.parse_args()
 32+
 33+ if ns.output_path is not None:
 34+ output_file = open(ns.output_path, 'w')
 35+ else:
 36+ output_file = sys.stdout
 37+
 38+ print >> output_file,\
 39+ 'date\tactivity\tpeak_date\tpeak_date_err\tpeak\tpeak_err'
 40+ for path in ns.data_paths:
 41+
 42+ # get date and activity rate for file name
 43+ basename = os.path.basename(path)
 44+ basename, _ = os.path.splitext(basename)
 45+ cohort_date, act = basename.split('_')
 46+ act = int(act)
 47+
 48+ # load data and check if fittable
 49+ x, y, ye, n = map(np.atleast_1d, np.loadtxt(path, unpack=1))
 50+ idx = (n > ns.min_size) * (ye > 0)
 51+ x = x[idx]
 52+ y = y[idx]
 53+ ye = ye[idx]
 54+ if len(x) - 1 <= 3: # takes into account the leave-one-out CV
 55+ print >> sys.stderr, 'skipping %s' % path
 56+ continue
 57+
 58+ # fit data and report peak activity and uncertainty
 59+ with catch_warnings():
 60+ simplefilter('ignore', category=Warning)
 61+ (xp, yp), (xperr, yperr), s, ests = find_peak(x, y, ye,
 62+ reps=ns.bootstrap_iter)
 63+ spl = UnivariateSpline(x, y, ye ** -1, s=s)
 64+ print >> output_file, '%s\t%d\t%12.5f\t%12.5f\t%12.5f\t%12.5f' % \
 65+ (cohort_date, act, xp, xperr, yp, yperr)
 66+
 67+
Property changes on: trunk/tools/wsor/editor_lifecycle/comppeak
___________________________________________________________________
Added: svn:executable
168 + *
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py
@@ -1,13 +1,17 @@
 2+import sys
 3+
24 import numpy as np
3 -from scipy.interpolate import splrep, splev, UnivariateSpline
 5+from scipy.interpolate import UnivariateSpline
46 from scipy.optimize import fmin_tnc
57 import scipy.optimize.tnc as tnc
 8+from multiprocessing import Pool
 9+from multiprocessing.pool import ApplyResult
610
711 def spsmooth(x, y, ye, **kwargs):
812 '''
913 Finds the best spline smoothing factor using leave-one-out cross validation
1014
11 - Additional keyword arguments are passed to splrep (e.g. k for the degree)
 15+ Additional keyword arguments are passed to UnivariateSpline (e.g. k for the degree)
1216 '''
1317
1418 best = []
@@ -33,8 +37,8 @@
3438 err_best = np.inf
3539
3640 for s in slist:
37 - tck = splrep(train_x, train_y, w=train_w, s=s, **kwargs)
38 - err = np.sqrt((splev(test_x, tck) - test_y) ** 2)
 41+ spl = UnivariateSpline(train_x, train_y, w=train_w, s=s)
 42+ err = np.sqrt((spl(test_x) - test_y) ** 2)
3943
4044 if err < err_best:
4145 s_best = s
@@ -43,18 +47,8 @@
4448 best.append(s_best)
4549
4650 return np.mean(best)
47 -
48 -def find_peak(x,y,ye,k=3):
49 - '''
50 - Finds maximum in time series (x_i, y_i) using smoothing splines
5151
52 - Parameters
53 - ----------
54 - x,y - data
55 - ye - standard errors estimates
56 - k - spline degree (must be <= 5)
57 - '''
58 - s = spsmooth(x, y, ye, k=k)
 52+def _find_peak(x, y, ye, k=3, s=None):
5953 spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s)
6054 f = lambda k : -spl(k)
6155 fprime = np.vectorize(lambda k : - spl.derivatives(k)[1])
@@ -65,8 +59,59 @@
6660 x0 = (x.ptp() * np.random.rand() + x.min(),)
6761 xp, nfeval, rc = fmin_tnc(f, x0, fprime=fprime, bounds=bounds,
6862 messages=tnc.MSG_NONE)
 63+ xp = xp.item()
6964 yp = spl(xp)
7065 if yp >= yp_best:
7166 xp_best = xp
7267 yp_best = yp
73 - return xp_best, spl
 68+ return xp_best, yp_best.item()
 69+
 70+def _init():
 71+ import signal
 72+ signal.signal(signal.SIGINT, signal.SIG_IGN)
 73+ import warnings
 74+ warnings.simplefilter("ignore", category=UserWarning)
 75+
 76+def find_peak(x, y, ye, k=3,reps=10000):
 77+ '''
 78+ Finds maximum in time series (x_i, y_i) using smoothing splines
 79+
 80+ Parameters
 81+ ----------
 82+ x,y - data
 83+ ye - standard errors estimates
 84+ k - spline degree (must be <= 5)
 85+
 86+ Returns
 87+ -------
 88+ xp - maximum of the smoothing spline
 89+ xperr - uncertainty in the maximum
 90+ s - the smoothing factor (either the one passed, or the one estimated)
 91+ '''
 92+ # compute the peak
 93+ s = spsmooth(x, y, ye, k=k)
 94+ xp, yp = _find_peak(x, y, ye, k=k, s=s)
 95+
 96+ # compute uncertainty by bootstrap
 97+ results = []
 98+ N = len(x)
 99+ pool = Pool()
 100+ spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s)
 101+ yf = spl(x)
 102+ err = y - yf
 103+ kwds = {'k' : k, 's' : s}
 104+ try:
 105+ for i in xrange(reps):
 106+ idx = np.random.randint(0, len(x), len(x))
 107+ ys = yf + err[idx]
 108+ res = pool.apply_async(_find_peak, args=(x, ys, ye),
 109+ kwds=kwds)
 110+ results.append(res)
 111+ results = np.asarray(map(ApplyResult.get, results))
 112+ xerr = np.std(results[:,0], ddof=1, axis=0)
 113+ # geometric standard deviation
 114+ yerr = np.exp(np.std(np.log(results[:,1]), ddof=1))
 115+ finally:
 116+ pool.terminate()
 117+ pool.join()
 118+ return (xp, yp), (xerr, yerr), s, results

Status & tagging log