r95303 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r95302‎ \| r95303 \| r95304 >
Date:	08:07, 23 August 2011
Author:	giovanni
Status:	deferred
Tags:
Comment:	reworked bootstrapping, added scripts for computing and plotting activity peaks
Modified paths:	/trunk/tools/wsor/editor_lifecycle/comppeak (added) (history) /trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py (modified) (history) /trunk/tools/wsor/editor_lifecycle/plotpeak (added) (history) /trunk/tools/wsor/editor_lifecycle/timechart (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/plotpeak
—	—	@@ -0,0 +1,101 @@
	2	+#!/usr/bin/python
	3	+# :vim:ft=python
	4	+
	5	+import os
	6	+import sys
	7	+
	8	+import numpy as np
	9	+import matplotlib.pyplot as pp
	10	+
	11	+from argparse import ArgumentParser
	12	+from collections import defaultdict
	13	+from datetime import datetime
	14	+from matplotlib.font_manager import FontProperties
	15	+from dateutil.parser import parser as DateParser
	16	+
	17	+__prog__ = os.path.basename(__file__)
	18	+
	19	+parser = ArgumentParser(description=__doc__)
	20	+parser.add_argument('data_paths', metavar='data', nargs='+')
	21	+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
	22	+parser.add_argument('-T', '--title')
	23	+
	24	+date_parser = DateParser()
	25	+default_date = datetime(2001,1,1)
	26	+
	27	+markers = 'ov^<>sp*+xD'
	28	+colors = 'bgrcmykw'
	29	+conv = defaultdict(lambda k : float)
	30	+conv[0] = lambda k : date_parser.parse(k, default=default_date)
	31	+
	32	+if __name__ == '__main__':
	33	+ ns = parser.parse_args()
	34	+
	35	+ # create figure
	36	+ fig = pp.figure(figsize=(10,4))
	37	+ ax1 = fig.add_axes(pp.axes([.1,.15,.65,.75], axisbg='ghostwhite'))
	38	+ ax2 = ax1.twinx()
	39	+
	40	+ M = len(markers)
	41	+ C = len(colors)
	42	+
	43	+ lines = []
	44	+
	45	+ for i, path in enumerate(ns.data_paths):
	46	+
	47	+ # load cohort data and filter out estimates based on samples with size
	48	+ # smaller than minimum requested
	49	+ arr = np.loadtxt(path, delimiter='\t', dtype=object, converters=conv,
	50	+ skiprows=1)
	51	+
	52	+ if len(arr) == 0:
	53	+ print >> sys.stderr, '%s: warning: skipping empty dataset: %s' % \
	54	+ (__prog__, path)
	55	+ continue
	56	+
	57	+ date = arr[:, 0]
	58	+ idx = np.argsort(date)
	59	+ arr = arr[idx]
	60	+ date = arr[:, 0]
	61	+ act, peak_date, peak_date_err, peak, peak_err = map(np.asfarray,
	62	+ arr[:,1:].T)
	63	+
	64	+ label = os.path.splitext(path)[0].replace('_',' ')
	65	+
	66	+# color = colors[i % C]
	67	+ marker = markers[i % M]
	68	+ l, (wu, wd), mc = ax1.errorbar(date, peak_date, peak_date_err / 2.0,
	69	+ marker=marker, mec='red', label=label, ecolor='lightgrey',
	70	+ ls='none', lw=2, mfc='none')
	71	+ lines.append(l)
	72	+
	73	+ l, (wu, wd), mc = ax2.errorbar(date, peak, peak_err / 2.0,
	74	+ marker=marker, mec='blue', label=label, ecolor='lightgrey',
	75	+ ls='none', lw=2, mfc='none')
	76	+
	77	+ lines.append(l)
	78	+
	79	+ # decorate figure
	80	+ ax1.set_xlabel('cohorts')
	81	+ ax1.set_ylabel('peak day', color='red')
	82	+ ax2.set_ylabel('peak edits/day', color='blue')
	83	+ pp.figlegend(lines, [ l.get_label() for l in lines ],
	84	+ loc='center right', prop=FontProperties(size='small'))
	85	+ pp.minorticks_on()
	86	+ pp.grid("on")
	87	+ pp.axis('tight')
	88	+
	89	+ pp.draw()
	90	+ if ns.title is not None:
	91	+ pp.title(ns.title)
	92	+ pp.draw()
	93	+
	94	+ # save to file, is output path specified
	95	+ if ns.output_path is not None:
	96	+ _, ext = os.path.splitext(ns.output_path)
	97	+ fmt = ext.strip('.') or 'pdf'
	98	+ pp.savefig(ns.output_path, fmt=ext)
	99	+ print '%s: output saved to %s' % (__prog__, ns.output_path)
	100	+
	101	+ pp.show()
	102	+
Property changes on: trunk/tools/wsor/editor_lifecycle/plotpeak
___________________________________________________________________
Added: svn:executable
1	103	+ *
Index: trunk/tools/wsor/editor_lifecycle/timechart
—	—	@@ -13,8 +13,9 @@
14	14
15	15	from argparse import ArgumentParser
16	16	from matplotlib.font_manager import FontProperties
	17	+from scipy.interpolate import UnivariateSpline
17	18
18		~~-from lifecycle.cvsmooth import find_peak~~
	19	+from lifecycle.cvsmooth import find_peak
19	20
20	21	__prog__ = os.path.basename(__file__)
21	22
—	—	@@ -67,20 +68,19 @@
68	69	marker=marker, mec=color, label=label, ecolor='lightgrey',
69	70	ls='none', lw=2, mfc='none')
70	71	pp.setp(wd, ls='none')
	72	+ lines.append(l)
71	73
72		~~- xp, spl = find_peak(days, rate, rate_err / 2.0)~~
	74	+ # compute maximum
	75	+ xp, xperr, s, ests = find_peak(days, rate, rate_err)
	76	+ ax.axvline(xp, color=color)
73	77
	78	+ # plot spline
	79	+ spl = UnivariateSpline(days, rate, rate_err ** -1, s=s)
74	80	x = np.linspace(days.min(), days.max(), endpoint=True, num=100)
75		-
76	81	y = spl(x)
77		-
78	82	ax.plot(x, y, label='spline fit', color=color, ls='--', marker='none',
79	83	lw=2)
80	84
81		~~- ax.axvline(xp, color=color)~~
82		-
83		~~- lines.append(l)~~
84		-
85	85	# decorate figure
86	86	pp.xlabel('days since first edit')
87	87	pp.ylabel('edits/day')
Index: trunk/tools/wsor/editor_lifecycle/comppeak
—	—	@@ -0,0 +1,66 @@
	2	+#!/usr/bin/python
	3	+# :vim:ft=python
	4	+
	5	+''' compute activity peaks '''
	6	+
	7	+__author__ = "Giovanni Luca Ciampaglia"
	8	+__email__ = "gciampaglia@wikimedia.org"
	9	+
	10	+import os
	11	+import sys
	12	+
	13	+import numpy as np
	14	+
	15	+from argparse import ArgumentParser
	16	+from lifecycle.cvsmooth import find_peak
	17	+from scipy.interpolate import UnivariateSpline
	18	+from warnings import catch_warnings, simplefilter
	19	+
	20	+__prog__ = os.path.basename(__file__)
	21	+
	22	+parser = ArgumentParser(description=__doc__)
	23	+parser.add_argument('data_paths', metavar='data', nargs='+')
	24	+parser.add_argument('-m', '--min-size', type=int, default=0)
	25	+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
	26	+parser.add_argument('-i', '--iter', type=int, metavar='NUM',
	27	+ dest='bootstrap_iter', default=10000,
	28	+ help='bootstrap iterations (default: %(default)d)')
	29	+
	30	+if __name__ == '__main__':
	31	+ ns = parser.parse_args()
	32	+
	33	+ if ns.output_path is not None:
	34	+ output_file = open(ns.output_path, 'w')
	35	+ else:
	36	+ output_file = sys.stdout
	37	+
	38	+ print >> output_file,\
	39	+ 'date\tactivity\tpeak_date\tpeak_date_err\tpeak\tpeak_err'
	40	+ for path in ns.data_paths:
	41	+
	42	+ # get date and activity rate for file name
	43	+ basename = os.path.basename(path)
	44	+ basename, _ = os.path.splitext(basename)
	45	+ cohort_date, act = basename.split('_')
	46	+ act = int(act)
	47	+
	48	+ # load data and check if fittable
	49	+ x, y, ye, n = map(np.atleast_1d, np.loadtxt(path, unpack=1))
	50	+ idx = (n > ns.min_size) * (ye > 0)
	51	+ x = x[idx]
	52	+ y = y[idx]
	53	+ ye = ye[idx]
	54	+ if len(x) - 1 <= 3: # takes into account the leave-one-out CV
	55	+ print >> sys.stderr, 'skipping %s' % path
	56	+ continue
	57	+
	58	+ # fit data and report peak activity and uncertainty
	59	+ with catch_warnings():
	60	+ simplefilter('ignore', category=Warning)
	61	+ (xp, yp), (xperr, yperr), s, ests = find_peak(x, y, ye,
	62	+ reps=ns.bootstrap_iter)
	63	+ spl = UnivariateSpline(x, y, ye ** -1, s=s)
	64	+ print >> output_file, '%s\t%d\t%12.5f\t%12.5f\t%12.5f\t%12.5f' % \
	65	+ (cohort_date, act, xp, xperr, yp, yperr)
	66	+
	67	+
Property changes on: trunk/tools/wsor/editor_lifecycle/comppeak
___________________________________________________________________
Added: svn:executable
1	68	+ *
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py
—	—	@@ -1,13 +1,17 @@
	2	+import sys
	3	+
2	4	import numpy as np
3		~~-from scipy.interpolate import splrep, splev, UnivariateSpline~~
	5	+from scipy.interpolate import UnivariateSpline
4	6	from scipy.optimize import fmin_tnc
5	7	import scipy.optimize.tnc as tnc
	8	+from multiprocessing import Pool
	9	+from multiprocessing.pool import ApplyResult
6	10
7	11	def spsmooth(x, y, ye, **kwargs):
8	12	'''
9	13	Finds the best spline smoothing factor using leave-one-out cross validation
10	14
11		~~- Additional keyword arguments are passed to splrep (e.g. k for the degree)~~
	15	+ Additional keyword arguments are passed to UnivariateSpline (e.g. k for the degree)
12	16	'''
13	17
14	18	best = []
—	—	@@ -33,8 +37,8 @@
34	38	err_best = np.inf
35	39
36	40	for s in slist:
37		~~- tck = splrep(train_x, train_y, w=train_w, s=s, **kwargs)~~
38		~~- err = np.sqrt((splev(test_x, tck) - test_y) ** 2)~~
	41	+ spl = UnivariateSpline(train_x, train_y, w=train_w, s=s)
	42	+ err = np.sqrt((spl(test_x) - test_y) ** 2)
39	43
40	44	if err < err_best:
41	45	s_best = s
—	—	@@ -43,18 +47,8 @@
44	48	best.append(s_best)
45	49
46	50	return np.mean(best)
47		-
48		~~-def find_peak(x,y,ye,k=3):~~
49		~~- '''~~
50		~~- Finds maximum in time series (x_i, y_i) using smoothing splines~~
51	51
52		~~- Parameters~~
53		~~- ----------~~
54		~~- x,y - data~~
55		~~- ye - standard errors estimates~~
56		~~- k - spline degree (must be <= 5)~~
57		~~- '''~~
58		~~- s = spsmooth(x, y, ye, k=k)~~
	52	+def _find_peak(x, y, ye, k=3, s=None):
59	53	spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s)
60	54	f = lambda k : -spl(k)
61	55	fprime = np.vectorize(lambda k : - spl.derivatives(k)[1])
—	—	@@ -65,8 +59,59 @@
66	60	x0 = (x.ptp() * np.random.rand() + x.min(),)
67	61	xp, nfeval, rc = fmin_tnc(f, x0, fprime=fprime, bounds=bounds,
68	62	messages=tnc.MSG_NONE)
	63	+ xp = xp.item()
69	64	yp = spl(xp)
70	65	if yp >= yp_best:
71	66	xp_best = xp
72	67	yp_best = yp
73		~~- return xp_best, spl~~
	68	+ return xp_best, yp_best.item()
	69	+
	70	+def _init():
	71	+ import signal
	72	+ signal.signal(signal.SIGINT, signal.SIG_IGN)
	73	+ import warnings
	74	+ warnings.simplefilter("ignore", category=UserWarning)
	75	+
	76	+def find_peak(x, y, ye, k=3,reps=10000):
	77	+ '''
	78	+ Finds maximum in time series (x_i, y_i) using smoothing splines
	79	+
	80	+ Parameters
	81	+ ----------
	82	+ x,y - data
	83	+ ye - standard errors estimates
	84	+ k - spline degree (must be <= 5)
	85	+
	86	+ Returns
	87	+ -------
	88	+ xp - maximum of the smoothing spline
	89	+ xperr - uncertainty in the maximum
	90	+ s - the smoothing factor (either the one passed, or the one estimated)
	91	+ '''
	92	+ # compute the peak
	93	+ s = spsmooth(x, y, ye, k=k)
	94	+ xp, yp = _find_peak(x, y, ye, k=k, s=s)
	95	+
	96	+ # compute uncertainty by bootstrap
	97	+ results = []
	98	+ N = len(x)
	99	+ pool = Pool()
	100	+ spl = UnivariateSpline(x, y, ye ** -1, k=k, s=s)
	101	+ yf = spl(x)
	102	+ err = y - yf
	103	+ kwds = {'k' : k, 's' : s}
	104	+ try:
	105	+ for i in xrange(reps):
	106	+ idx = np.random.randint(0, len(x), len(x))
	107	+ ys = yf + err[idx]
	108	+ res = pool.apply_async(_find_peak, args=(x, ys, ye),
	109	+ kwds=kwds)
	110	+ results.append(res)
	111	+ results = np.asarray(map(ApplyResult.get, results))
	112	+ xerr = np.std(results[:,0], ddof=1, axis=0)
	113	+ # geometric standard deviation
	114	+ yerr = np.exp(np.std(np.log(results[:,1]), ddof=1))
	115	+ finally:
	116	+ pool.terminate()
	117	+ pool.join()
	118	+ return (xp, yp), (xerr, yerr), s, results

Status & tagging log

12:12, 23 August 2011 Reedy (talk | contribs) changed the status of r95303 [removed: new added: deferred]