Index: trunk/tools/wsor/registration_lags/fitmixture |
— | — | @@ -0,0 +1,39 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# vim:ft=python |
| 4 | + |
| 5 | +''' fits lags data to a gaussian mixture model ''' |
| 6 | + |
| 7 | +import os |
| 8 | +from argparse import ArgumentParser |
| 9 | +import numpy as np |
| 10 | +from scikits.learn.mixture import GMM |
| 11 | + |
| 12 | +parser = ArgumentParser(description=__doc__) |
| 13 | +parser.add_argument('data_file', metavar='data', help='NumPy array file') |
| 14 | +parser.add_argument('components', type=int) |
| 15 | + |
| 16 | +ns = parser.parse_args() |
| 17 | + |
| 18 | +data = np.load(ns.data_file) |
| 19 | +w = - np.diff(data, axis=1) |
| 20 | +w = np.log(w[w > 0] / 86400.) |
| 21 | + |
| 22 | +key = os.path.splitext(os.path.basename(ns.data_file))[0] |
| 23 | +out = [key] |
| 24 | + |
| 25 | +if len(w) > ns.components: |
| 26 | + gmm = GMM(ns.components) |
| 27 | + gmm.fit(w[:, None]) |
| 28 | + |
| 29 | + means = np.ravel(gmm.means) |
| 30 | + covars = np.ravel(gmm.covars) |
| 31 | + weights = np.ravel(gmm.weights) |
| 32 | + |
| 33 | + idx = means.argsort() |
| 34 | + out.extend(means[idx]) |
| 35 | + out.extend(covars[idx]) |
| 36 | + out.extend(weights[idx]) |
| 37 | +else: |
| 38 | + out.extend([np.nan] * 3 * ns.components) |
| 39 | + |
| 40 | +print '\t'.join(map(str, out)) |
Property changes on: trunk/tools/wsor/registration_lags/fitmixture |
___________________________________________________________________ |
Added: svn:executable |
1 | 41 | + * |
Index: trunk/tools/wsor/registration_lags/evolplot |
— | — | @@ -0,0 +1,31 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# :vim:ft=python |
| 4 | + |
| 5 | +import os |
| 6 | +from pylab import * |
| 7 | +from datetime import datetime |
| 8 | +from argparse import ArgumentParser |
| 9 | + |
| 10 | +parser = ArgumentParser() |
| 11 | +parser.add_argument('data_file', metavar='data') |
| 12 | + |
| 13 | +ns = parser.parse_args() |
| 14 | + |
| 15 | +ts = loadtxt(ns.data_file, dtype=dtype('S7,f,f,f,f,f,f')) |
| 16 | + |
| 17 | +cf = lambda k : datetime.strptime(k, '%Y-%m') |
| 18 | +x = map(cf, ts['f0']) |
| 19 | + |
| 20 | +plot(x, ts['f1'], 'o w') |
| 21 | +plot(x, ts['f2'], 'd k') |
| 22 | +axis('auto') |
| 23 | +xlabel('time') |
| 24 | +ylabel('log-days') |
| 25 | +title('GMM means, all NS (revision + archive)') |
| 26 | +draw() |
| 27 | + |
| 28 | +pdf_file = os.path.splitext(ns.data_file)[0] + '.pdf' |
| 29 | +savefig(pdf_file) |
| 30 | +print 'output saved to %s.' % pdf_file |
| 31 | + |
| 32 | +show() |
Property changes on: trunk/tools/wsor/registration_lags/evolplot |
___________________________________________________________________ |
Added: svn:executable |
1 | 33 | + * |
Index: trunk/tools/wsor/registration_lags/mixhist |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# vim:ft=python |
| 4 | + |
| 5 | +''' plot histogram of waiting time data in log-space ''' |
| 6 | + |
| 7 | +from argparse import ArgumentParser |
| 8 | +from pylab import * |
| 9 | +import os |
| 10 | +from scikits.learn.mixture import GMM |
| 11 | +from scipy.stats import norm |
| 12 | + |
| 13 | +from graphics import mixturehist |
| 14 | + |
| 15 | +parser = ArgumentParser(description=__doc__) |
| 16 | +parser.add_argument('data_file', metavar='data', help='NumPy array file') |
| 17 | +parser.add_argument('components', type=int) |
| 18 | +parser.add_argument('-subtitle', metavar='TEXT') |
| 19 | + |
| 20 | +times = array([1/86400., 60/86400., 3600/86400., 1, 7, 30, 365]) |
| 21 | +logtimes = log(times) |
| 22 | +timestexts = ['1 sec', '1 min', '1 hr', '1 d', '1 week', '1 month', '1 year'] |
| 23 | + |
| 24 | +if __name__ == '__main__': |
| 25 | + ns = parser.parse_args() |
| 26 | + |
| 27 | + data = np.load(ns.data_file) |
| 28 | + w = - diff(data, axis=1) |
| 29 | + w = np.log(w[w > 0] / 86400.) |
| 30 | + |
| 31 | + print 'fitting GMM with {} components'.format(ns.components) |
| 32 | + gmm = GMM(ns.components) |
| 33 | + gmm.fit(w[:, None]) |
| 34 | + means = np.ravel(gmm.means) |
| 35 | + covars = np.ravel(gmm.covars) |
| 36 | + weights = np.ravel(gmm.weights) |
| 37 | + |
| 38 | + print |
| 39 | + for i, (m, v, p) in enumerate(zip(means, covars, weights)): |
| 40 | + print 'component {}'.format(i + 1) |
| 41 | + print '-------------' |
| 42 | + mu = np.exp(m + v / 2) |
| 43 | + med = np.exp(m) |
| 44 | + var = (np.exp(v) - 1) * np.exp(2 * m + v) |
| 45 | + print 'mean: %8.4g' % mu |
| 46 | + print 'median: %8.4g' % med |
| 47 | + print 'std. dev.: %8.4g' % np.sqrt(var) |
| 48 | + print 'weight: %8.4g' % p |
| 49 | + print |
| 50 | + |
| 51 | + comps = [ norm(m, np.sqrt(v)) for m, v in zip(means, covars) ] |
| 52 | + mixturehist(w, comps, weights, 50, 1000) |
| 53 | + |
| 54 | + ym, yM = ylim() |
| 55 | + |
| 56 | + for t, l in zip(logtimes, timestexts): |
| 57 | + axvline(t, color='k', ls=':') |
| 58 | + text(t+.1, yM - 0.05 * (yM-ym), l, fontsize='small', color='k', rotation=-30) |
| 59 | + |
| 60 | + xlabel('log(days)') |
| 61 | + ylabel('density') |
| 62 | + |
| 63 | + _title = 'Time between registration and first edit (N = {})'.format(len(w)) |
| 64 | + if ns.subtitle: |
| 65 | + _title += '\n' + ns.subtitle |
| 66 | + title(_title) |
| 67 | + |
| 68 | + draw() |
| 69 | + |
| 70 | + fn = os.path.splitext(ns.data_file)[0] + '.pdf' |
| 71 | + savefig(fn, format='pdf') |
| 72 | + print 'output saved into {}'.format(fn) |
| 73 | + show() |
Property changes on: trunk/tools/wsor/registration_lags/mixhist |
___________________________________________________________________ |
Added: svn:executable |
1 | 74 | + * |
Index: trunk/tools/wsor/registration_lags/graphics.py |
— | — | @@ -0,0 +1,106 @@ |
| 2 | +import numpy as np |
| 3 | +import matplotlib.pyplot as pp |
| 4 | +from matplotlib import cm |
| 5 | +from scipy.stats import gaussian_kde |
| 6 | +from matplotlib.collections import LineCollection |
| 7 | + |
| 8 | +def stackedarea(x, components, weights, cmap=cm.YlGnBu, **kwargs): |
| 9 | + ''' |
| 10 | + Produces a stacked area plot from given components and weights. |
| 11 | + |
| 12 | + Parameters |
| 13 | + ---------- |
| 14 | + x - ordinates |
| 15 | + components - a sequence of objects with a `pdf' method |
| 16 | + weights - a sequence of components' weights |
| 17 | + |
| 18 | + Default color map is Yellow-Green-Blue. Additional keyword arguments are |
| 19 | + passed matplotlib.pyplot.fill_between. Returns a list of PolyCollections |
| 20 | + (one for each component). |
| 21 | + ''' |
| 22 | + assert np.allclose(np.sum(weights), 1) and np.all(weights), 'illegal weights' |
| 23 | + p = [ w * comp.pdf(x) for comp, w in zip(components, weights) ] |
| 24 | + p = [ np.zeros(len(x)) ] + p |
| 25 | + p = np.cumsum(p, axis=0) |
| 26 | + N = len(p) |
| 27 | + colors = cmap(np.linspace(0, 1, N) * (1 - 1.0 / N)) |
| 28 | + ret = [] |
| 29 | + for i in xrange(1, N): |
| 30 | + kwargs['color'] = colors[i-1] |
| 31 | + r = pp.fill_between(x, p[i-1], p[i], **kwargs) |
| 32 | + ret.append(r) |
| 33 | + pp.draw() |
| 34 | + return ret |
| 35 | + |
| 36 | +def mixturehist(data, components, weights, bins=10, num=1000, cmap=cm.YlGnBu, **kwargs): |
| 37 | + ''' |
| 38 | + Plots a histogram of given data with a stacked densities of given |
| 39 | + components |
| 40 | + |
| 41 | + Parameters |
| 42 | + ---------- |
| 43 | + data - data array |
| 44 | + components - a sequence of random variable objects (see scipy.stats) |
| 45 | + weights - a sequence of components' weights |
| 46 | + bins - number of histogram bins |
| 47 | + num - number of points at which stacked densities are evaluated |
| 48 | + cmap - stacked area plot color map |
| 49 | + |
| 50 | + Additional keyword arguments are passed to both matplotlib.pyplot.hist and |
| 51 | + stackedarea. |
| 52 | + ''' |
| 53 | + histkw = dict(kwargs) |
| 54 | + # settings for producing transparent histograms |
| 55 | + histkw.update(normed=True, fc=(0,0,0,0), ec='k') |
| 56 | + pp.hist(data, bins=bins, **histkw) |
| 57 | + xmin, xmax = pp.xlim() |
| 58 | + xi = np.linspace(xmin, xmax, num) |
| 59 | + stackedarea(xi, components, weights, cmap, **kwargs) |
| 60 | + |
| 61 | +def kdeplot(data, xmin=None, xmax=None, num=50, vmin=None, vmax=None, lc='k', **kwargs): |
| 62 | + ''' |
| 63 | + Plots density of data, estimated via Gaussian Kernels, together with |
| 64 | + vertical lines for each data point. |
| 65 | + |
| 66 | + Parameters |
| 67 | + ---------- |
| 68 | + data - data sample |
| 69 | + xmin, xmax - range of density line plot |
| 70 | + num - number of points at which kde will be evaluated |
| 71 | + vmin, vmax - vertical lines will span from vmin to vmax |
| 72 | + lc - vertical lines color |
| 73 | + |
| 74 | + Returns |
| 75 | + ------- |
| 76 | + l - density line object |
| 77 | + linecoll - collection of vertical lines |
| 78 | + kde - scipy.stats.kde.gaussian_kde |
| 79 | + |
| 80 | + Additional keyword arguments are passed to plot the density line |
| 81 | + ''' |
| 82 | + data = np.ravel(data) |
| 83 | + x0, x1 = data.min(), data.max() |
| 84 | + xmin = xmin or x0 |
| 85 | + xmax = xmax or x1 |
| 86 | + x = np.linspace(xmin, xmax, num) |
| 87 | + kde = gaussian_kde(data) |
| 88 | + d = kde.evaluate(x) |
| 89 | + if 'axes' in kwargs: |
| 90 | + ax = kwargs['axes'] |
| 91 | + elif 'figure' in kwargs: |
| 92 | + ax = kwargs['figure'].axes[-1] |
| 93 | + elif kwargs.pop('hold', False): |
| 94 | + ax = pp.gca() |
| 95 | + else: |
| 96 | + fig = pp.figure() |
| 97 | + ax = fig.add_subplot(111) |
| 98 | + l = ax.plot(x,d, **kwargs) |
| 99 | + y0, y1 = ax.get_ylim() |
| 100 | + vmax = vmax or -(y1 - y0) / 30. |
| 101 | + vmin = vmin or -(y1 - y0) / 10. |
| 102 | + linesiter = ( [(d, vmax), (d, vmin)] for d in data ) |
| 103 | + linecoll = LineCollection(linesiter, color=lc, alpha=.1) |
| 104 | + ax.add_collection(linecoll) |
| 105 | + ax.set_ylim(y0 - (y1 - vmin)/9., y1) |
| 106 | + pp.draw() |
| 107 | + return l, linecoll, kde |