r95019 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r95018‎ \| r95019 \| r95020 >
Date:	17:53, 19 August 2011
Author:	giovanni
Status:	deferred
Tags:
Comment:	added registration_lags scripts
Modified paths:	/trunk/tools/wsor/registration_lags (added) (history) /trunk/tools/wsor/registration_lags/evolplot (added) (history) /trunk/tools/wsor/registration_lags/fitmixture (added) (history) /trunk/tools/wsor/registration_lags/graphics.py (added) (history) /trunk/tools/wsor/registration_lags/mixhist (added) (history)

Diff [purge]

Index: trunk/tools/wsor/registration_lags/fitmixture
—	—	@@ -0,0 +1,39 @@
	2	+#!/usr/bin/python
	3	+# vim:ft=python
	4	+
	5	+''' fits lags data to a gaussian mixture model '''
	6	+
	7	+import os
	8	+from argparse import ArgumentParser
	9	+import numpy as np
	10	+from scikits.learn.mixture import GMM
	11	+
	12	+parser = ArgumentParser(description=__doc__)
	13	+parser.add_argument('data_file', metavar='data', help='NumPy array file')
	14	+parser.add_argument('components', type=int)
	15	+
	16	+ns = parser.parse_args()
	17	+
	18	+data = np.load(ns.data_file)
	19	+w = - np.diff(data, axis=1)
	20	+w = np.log(w[w > 0] / 86400.)
	21	+
	22	+key = os.path.splitext(os.path.basename(ns.data_file))[0]
	23	+out = [key]
	24	+
	25	+if len(w) > ns.components:
	26	+ gmm = GMM(ns.components)
	27	+ gmm.fit(w[:, None])
	28	+
	29	+ means = np.ravel(gmm.means)
	30	+ covars = np.ravel(gmm.covars)
	31	+ weights = np.ravel(gmm.weights)
	32	+
	33	+ idx = means.argsort()
	34	+ out.extend(means[idx])
	35	+ out.extend(covars[idx])
	36	+ out.extend(weights[idx])
	37	+else:
	38	+ out.extend([np.nan] * 3 * ns.components)
	39	+
	40	+print '\t'.join(map(str, out))
Property changes on: trunk/tools/wsor/registration_lags/fitmixture
___________________________________________________________________
Added: svn:executable
1	41	+ *
Index: trunk/tools/wsor/registration_lags/evolplot
—	—	@@ -0,0 +1,31 @@
	2	+#!/usr/bin/python
	3	+# :vim:ft=python
	4	+
	5	+import os
	6	+from pylab import *
	7	+from datetime import datetime
	8	+from argparse import ArgumentParser
	9	+
	10	+parser = ArgumentParser()
	11	+parser.add_argument('data_file', metavar='data')
	12	+
	13	+ns = parser.parse_args()
	14	+
	15	+ts = loadtxt(ns.data_file, dtype=dtype('S7,f,f,f,f,f,f'))
	16	+
	17	+cf = lambda k : datetime.strptime(k, '%Y-%m')
	18	+x = map(cf, ts['f0'])
	19	+
	20	+plot(x, ts['f1'], 'o w')
	21	+plot(x, ts['f2'], 'd k')
	22	+axis('auto')
	23	+xlabel('time')
	24	+ylabel('log-days')
	25	+title('GMM means, all NS (revision + archive)')
	26	+draw()
	27	+
	28	+pdf_file = os.path.splitext(ns.data_file)[0] + '.pdf'
	29	+savefig(pdf_file)
	30	+print 'output saved to %s.' % pdf_file
	31	+
	32	+show()
Property changes on: trunk/tools/wsor/registration_lags/evolplot
___________________________________________________________________
Added: svn:executable
1	33	+ *
Index: trunk/tools/wsor/registration_lags/mixhist
—	—	@@ -0,0 +1,72 @@
	2	+#!/usr/bin/python
	3	+# vim:ft=python
	4	+
	5	+''' plot histogram of waiting time data in log-space '''
	6	+
	7	+from argparse import ArgumentParser
	8	+from pylab import *
	9	+import os
	10	+from scikits.learn.mixture import GMM
	11	+from scipy.stats import norm
	12	+
	13	+from graphics import mixturehist
	14	+
	15	+parser = ArgumentParser(description=__doc__)
	16	+parser.add_argument('data_file', metavar='data', help='NumPy array file')
	17	+parser.add_argument('components', type=int)
	18	+parser.add_argument('-subtitle', metavar='TEXT')
	19	+
	20	+times = array([1/86400., 60/86400., 3600/86400., 1, 7, 30, 365])
	21	+logtimes = log(times)
	22	+timestexts = ['1 sec', '1 min', '1 hr', '1 d', '1 week', '1 month', '1 year']
	23	+
	24	+if __name__ == '__main__':
	25	+ ns = parser.parse_args()
	26	+
	27	+ data = np.load(ns.data_file)
	28	+ w = - diff(data, axis=1)
	29	+ w = np.log(w[w > 0] / 86400.)
	30	+
	31	+ print 'fitting GMM with {} components'.format(ns.components)
	32	+ gmm = GMM(ns.components)
	33	+ gmm.fit(w[:, None])
	34	+ means = np.ravel(gmm.means)
	35	+ covars = np.ravel(gmm.covars)
	36	+ weights = np.ravel(gmm.weights)
	37	+
	38	+ print
	39	+ for i, (m, v, p) in enumerate(zip(means, covars, weights)):
	40	+ print 'component {}'.format(i + 1)
	41	+ print '-------------'
	42	+ mu = np.exp(m + v / 2)
	43	+ med = np.exp(m)
	44	+ var = (np.exp(v) - 1) * np.exp(2 * m + v)
	45	+ print 'mean: %8.4g' % mu
	46	+ print 'median: %8.4g' % med
	47	+ print 'std. dev.: %8.4g' % np.sqrt(var)
	48	+ print 'weight: %8.4g' % p
	49	+ print
	50	+
	51	+ comps = [ norm(m, np.sqrt(v)) for m, v in zip(means, covars) ]
	52	+ mixturehist(w, comps, weights, 50, 1000)
	53	+
	54	+ ym, yM = ylim()
	55	+
	56	+ for t, l in zip(logtimes, timestexts):
	57	+ axvline(t, color='k', ls=':')
	58	+ text(t+.1, yM - 0.05 * (yM-ym), l, fontsize='small', color='k', rotation=-30)
	59	+
	60	+ xlabel('log(days)')
	61	+ ylabel('density')
	62	+
	63	+ _title = 'Time between registration and first edit (N = {})'.format(len(w))
	64	+ if ns.subtitle:
	65	+ _title += '\n' + ns.subtitle
	66	+ title(_title)
	67	+
	68	+ draw()
	69	+
	70	+ fn = os.path.splitext(ns.data_file)[0] + '.pdf'
	71	+ savefig(fn, format='pdf')
	72	+ print 'output saved into {}'.format(fn)
	73	+ show()
Property changes on: trunk/tools/wsor/registration_lags/mixhist
___________________________________________________________________
Added: svn:executable
1	74	+ *
Index: trunk/tools/wsor/registration_lags/graphics.py
—	—	@@ -0,0 +1,106 @@
	2	+import numpy as np
	3	+import matplotlib.pyplot as pp
	4	+from matplotlib import cm
	5	+from scipy.stats import gaussian_kde
	6	+from matplotlib.collections import LineCollection
	7	+
	8	+def stackedarea(x, components, weights, cmap=cm.YlGnBu, **kwargs):
	9	+ '''
	10	+ Produces a stacked area plot from given components and weights.
	11	+
	12	+ Parameters
	13	+ ----------
	14	+ x - ordinates
	15	+ components - a sequence of objects with a `pdf' method
	16	+ weights - a sequence of components' weights
	17	+
	18	+ Default color map is Yellow-Green-Blue. Additional keyword arguments are
	19	+ passed matplotlib.pyplot.fill_between. Returns a list of PolyCollections
	20	+ (one for each component).
	21	+ '''
	22	+ assert np.allclose(np.sum(weights), 1) and np.all(weights), 'illegal weights'
	23	+ p = [ w * comp.pdf(x) for comp, w in zip(components, weights) ]
	24	+ p = [ np.zeros(len(x)) ] + p
	25	+ p = np.cumsum(p, axis=0)
	26	+ N = len(p)
	27	+ colors = cmap(np.linspace(0, 1, N) * (1 - 1.0 / N))
	28	+ ret = []
	29	+ for i in xrange(1, N):
	30	+ kwargs['color'] = colors[i-1]
	31	+ r = pp.fill_between(x, p[i-1], p[i], **kwargs)
	32	+ ret.append(r)
	33	+ pp.draw()
	34	+ return ret
	35	+
	36	+def mixturehist(data, components, weights, bins=10, num=1000, cmap=cm.YlGnBu, **kwargs):
	37	+ '''
	38	+ Plots a histogram of given data with a stacked densities of given
	39	+ components
	40	+
	41	+ Parameters
	42	+ ----------
	43	+ data - data array
	44	+ components - a sequence of random variable objects (see scipy.stats)
	45	+ weights - a sequence of components' weights
	46	+ bins - number of histogram bins
	47	+ num - number of points at which stacked densities are evaluated
	48	+ cmap - stacked area plot color map
	49	+
	50	+ Additional keyword arguments are passed to both matplotlib.pyplot.hist and
	51	+ stackedarea.
	52	+ '''
	53	+ histkw = dict(kwargs)
	54	+ # settings for producing transparent histograms
	55	+ histkw.update(normed=True, fc=(0,0,0,0), ec='k')
	56	+ pp.hist(data, bins=bins, **histkw)
	57	+ xmin, xmax = pp.xlim()
	58	+ xi = np.linspace(xmin, xmax, num)
	59	+ stackedarea(xi, components, weights, cmap, **kwargs)
	60	+
	61	+def kdeplot(data, xmin=None, xmax=None, num=50, vmin=None, vmax=None, lc='k', **kwargs):
	62	+ '''
	63	+ Plots density of data, estimated via Gaussian Kernels, together with
	64	+ vertical lines for each data point.
	65	+
	66	+ Parameters
	67	+ ----------
	68	+ data - data sample
	69	+ xmin, xmax - range of density line plot
	70	+ num - number of points at which kde will be evaluated
	71	+ vmin, vmax - vertical lines will span from vmin to vmax
	72	+ lc - vertical lines color
	73	+
	74	+ Returns
	75	+ -------
	76	+ l - density line object
	77	+ linecoll - collection of vertical lines
	78	+ kde - scipy.stats.kde.gaussian_kde
	79	+
	80	+ Additional keyword arguments are passed to plot the density line
	81	+ '''
	82	+ data = np.ravel(data)
	83	+ x0, x1 = data.min(), data.max()
	84	+ xmin = xmin or x0
	85	+ xmax = xmax or x1
	86	+ x = np.linspace(xmin, xmax, num)
	87	+ kde = gaussian_kde(data)
	88	+ d = kde.evaluate(x)
	89	+ if 'axes' in kwargs:
	90	+ ax = kwargs['axes']
	91	+ elif 'figure' in kwargs:
	92	+ ax = kwargs['figure'].axes[-1]
	93	+ elif kwargs.pop('hold', False):
	94	+ ax = pp.gca()
	95	+ else:
	96	+ fig = pp.figure()
	97	+ ax = fig.add_subplot(111)
	98	+ l = ax.plot(x,d, **kwargs)
	99	+ y0, y1 = ax.get_ylim()
	100	+ vmax = vmax or -(y1 - y0) / 30.
	101	+ vmin = vmin or -(y1 - y0) / 10.
	102	+ linesiter = ( [(d, vmax), (d, vmin)] for d in data )
	103	+ linecoll = LineCollection(linesiter, color=lc, alpha=.1)
	104	+ ax.add_collection(linecoll)
	105	+ ax.set_ylim(y0 - (y1 - vmin)/9., y1)
	106	+ pp.draw()
	107	+ return l, linecoll, kde

Status & tagging log

17:56, 19 August 2011 😂 (talk | contribs) changed the status of r95019 [removed: new added: deferred]