r95019 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r95018‎ | r95019 | r95020 >
Date:17:53, 19 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
added registration_lags scripts
Modified paths:
  • /trunk/tools/wsor/registration_lags (added) (history)
  • /trunk/tools/wsor/registration_lags/evolplot (added) (history)
  • /trunk/tools/wsor/registration_lags/fitmixture (added) (history)
  • /trunk/tools/wsor/registration_lags/graphics.py (added) (history)
  • /trunk/tools/wsor/registration_lags/mixhist (added) (history)

Diff [purge]

Index: trunk/tools/wsor/registration_lags/fitmixture
@@ -0,0 +1,39 @@
 2+#!/usr/bin/python
 3+# vim:ft=python
 4+
 5+''' fits lags data to a gaussian mixture model '''
 6+
 7+import os
 8+from argparse import ArgumentParser
 9+import numpy as np
 10+from scikits.learn.mixture import GMM
 11+
 12+parser = ArgumentParser(description=__doc__)
 13+parser.add_argument('data_file', metavar='data', help='NumPy array file')
 14+parser.add_argument('components', type=int)
 15+
 16+ns = parser.parse_args()
 17+
 18+data = np.load(ns.data_file)
 19+w = - np.diff(data, axis=1)
 20+w = np.log(w[w > 0] / 86400.)
 21+
 22+key = os.path.splitext(os.path.basename(ns.data_file))[0]
 23+out = [key]
 24+
 25+if len(w) > ns.components:
 26+ gmm = GMM(ns.components)
 27+ gmm.fit(w[:, None])
 28+
 29+ means = np.ravel(gmm.means)
 30+ covars = np.ravel(gmm.covars)
 31+ weights = np.ravel(gmm.weights)
 32+
 33+ idx = means.argsort()
 34+ out.extend(means[idx])
 35+ out.extend(covars[idx])
 36+ out.extend(weights[idx])
 37+else:
 38+ out.extend([np.nan] * 3 * ns.components)
 39+
 40+print '\t'.join(map(str, out))
Property changes on: trunk/tools/wsor/registration_lags/fitmixture
___________________________________________________________________
Added: svn:executable
141 + *
Index: trunk/tools/wsor/registration_lags/evolplot
@@ -0,0 +1,31 @@
 2+#!/usr/bin/python
 3+# :vim:ft=python
 4+
 5+import os
 6+from pylab import *
 7+from datetime import datetime
 8+from argparse import ArgumentParser
 9+
 10+parser = ArgumentParser()
 11+parser.add_argument('data_file', metavar='data')
 12+
 13+ns = parser.parse_args()
 14+
 15+ts = loadtxt(ns.data_file, dtype=dtype('S7,f,f,f,f,f,f'))
 16+
 17+cf = lambda k : datetime.strptime(k, '%Y-%m')
 18+x = map(cf, ts['f0'])
 19+
 20+plot(x, ts['f1'], 'o w')
 21+plot(x, ts['f2'], 'd k')
 22+axis('auto')
 23+xlabel('time')
 24+ylabel('log-days')
 25+title('GMM means, all NS (revision + archive)')
 26+draw()
 27+
 28+pdf_file = os.path.splitext(ns.data_file)[0] + '.pdf'
 29+savefig(pdf_file)
 30+print 'output saved to %s.' % pdf_file
 31+
 32+show()
Property changes on: trunk/tools/wsor/registration_lags/evolplot
___________________________________________________________________
Added: svn:executable
133 + *
Index: trunk/tools/wsor/registration_lags/mixhist
@@ -0,0 +1,72 @@
 2+#!/usr/bin/python
 3+# vim:ft=python
 4+
 5+''' plot histogram of waiting time data in log-space '''
 6+
 7+from argparse import ArgumentParser
 8+from pylab import *
 9+import os
 10+from scikits.learn.mixture import GMM
 11+from scipy.stats import norm
 12+
 13+from graphics import mixturehist
 14+
 15+parser = ArgumentParser(description=__doc__)
 16+parser.add_argument('data_file', metavar='data', help='NumPy array file')
 17+parser.add_argument('components', type=int)
 18+parser.add_argument('-subtitle', metavar='TEXT')
 19+
 20+times = array([1/86400., 60/86400., 3600/86400., 1, 7, 30, 365])
 21+logtimes = log(times)
 22+timestexts = ['1 sec', '1 min', '1 hr', '1 d', '1 week', '1 month', '1 year']
 23+
 24+if __name__ == '__main__':
 25+ ns = parser.parse_args()
 26+
 27+ data = np.load(ns.data_file)
 28+ w = - diff(data, axis=1)
 29+ w = np.log(w[w > 0] / 86400.)
 30+
 31+ print 'fitting GMM with {} components'.format(ns.components)
 32+ gmm = GMM(ns.components)
 33+ gmm.fit(w[:, None])
 34+ means = np.ravel(gmm.means)
 35+ covars = np.ravel(gmm.covars)
 36+ weights = np.ravel(gmm.weights)
 37+
 38+ print
 39+ for i, (m, v, p) in enumerate(zip(means, covars, weights)):
 40+ print 'component {}'.format(i + 1)
 41+ print '-------------'
 42+ mu = np.exp(m + v / 2)
 43+ med = np.exp(m)
 44+ var = (np.exp(v) - 1) * np.exp(2 * m + v)
 45+ print 'mean: %8.4g' % mu
 46+ print 'median: %8.4g' % med
 47+ print 'std. dev.: %8.4g' % np.sqrt(var)
 48+ print 'weight: %8.4g' % p
 49+ print
 50+
 51+ comps = [ norm(m, np.sqrt(v)) for m, v in zip(means, covars) ]
 52+ mixturehist(w, comps, weights, 50, 1000)
 53+
 54+ ym, yM = ylim()
 55+
 56+ for t, l in zip(logtimes, timestexts):
 57+ axvline(t, color='k', ls=':')
 58+ text(t+.1, yM - 0.05 * (yM-ym), l, fontsize='small', color='k', rotation=-30)
 59+
 60+ xlabel('log(days)')
 61+ ylabel('density')
 62+
 63+ _title = 'Time between registration and first edit (N = {})'.format(len(w))
 64+ if ns.subtitle:
 65+ _title += '\n' + ns.subtitle
 66+ title(_title)
 67+
 68+ draw()
 69+
 70+ fn = os.path.splitext(ns.data_file)[0] + '.pdf'
 71+ savefig(fn, format='pdf')
 72+ print 'output saved into {}'.format(fn)
 73+ show()
Property changes on: trunk/tools/wsor/registration_lags/mixhist
___________________________________________________________________
Added: svn:executable
174 + *
Index: trunk/tools/wsor/registration_lags/graphics.py
@@ -0,0 +1,106 @@
 2+import numpy as np
 3+import matplotlib.pyplot as pp
 4+from matplotlib import cm
 5+from scipy.stats import gaussian_kde
 6+from matplotlib.collections import LineCollection
 7+
 8+def stackedarea(x, components, weights, cmap=cm.YlGnBu, **kwargs):
 9+ '''
 10+ Produces a stacked area plot from given components and weights.
 11+
 12+ Parameters
 13+ ----------
 14+ x - ordinates
 15+ components - a sequence of objects with a `pdf' method
 16+ weights - a sequence of components' weights
 17+
 18+ Default color map is Yellow-Green-Blue. Additional keyword arguments are
 19+ passed matplotlib.pyplot.fill_between. Returns a list of PolyCollections
 20+ (one for each component).
 21+ '''
 22+ assert np.allclose(np.sum(weights), 1) and np.all(weights), 'illegal weights'
 23+ p = [ w * comp.pdf(x) for comp, w in zip(components, weights) ]
 24+ p = [ np.zeros(len(x)) ] + p
 25+ p = np.cumsum(p, axis=0)
 26+ N = len(p)
 27+ colors = cmap(np.linspace(0, 1, N) * (1 - 1.0 / N))
 28+ ret = []
 29+ for i in xrange(1, N):
 30+ kwargs['color'] = colors[i-1]
 31+ r = pp.fill_between(x, p[i-1], p[i], **kwargs)
 32+ ret.append(r)
 33+ pp.draw()
 34+ return ret
 35+
 36+def mixturehist(data, components, weights, bins=10, num=1000, cmap=cm.YlGnBu, **kwargs):
 37+ '''
 38+ Plots a histogram of given data with a stacked densities of given
 39+ components
 40+
 41+ Parameters
 42+ ----------
 43+ data - data array
 44+ components - a sequence of random variable objects (see scipy.stats)
 45+ weights - a sequence of components' weights
 46+ bins - number of histogram bins
 47+ num - number of points at which stacked densities are evaluated
 48+ cmap - stacked area plot color map
 49+
 50+ Additional keyword arguments are passed to both matplotlib.pyplot.hist and
 51+ stackedarea.
 52+ '''
 53+ histkw = dict(kwargs)
 54+ # settings for producing transparent histograms
 55+ histkw.update(normed=True, fc=(0,0,0,0), ec='k')
 56+ pp.hist(data, bins=bins, **histkw)
 57+ xmin, xmax = pp.xlim()
 58+ xi = np.linspace(xmin, xmax, num)
 59+ stackedarea(xi, components, weights, cmap, **kwargs)
 60+
 61+def kdeplot(data, xmin=None, xmax=None, num=50, vmin=None, vmax=None, lc='k', **kwargs):
 62+ '''
 63+ Plots density of data, estimated via Gaussian Kernels, together with
 64+ vertical lines for each data point.
 65+
 66+ Parameters
 67+ ----------
 68+ data - data sample
 69+ xmin, xmax - range of density line plot
 70+ num - number of points at which kde will be evaluated
 71+ vmin, vmax - vertical lines will span from vmin to vmax
 72+ lc - vertical lines color
 73+
 74+ Returns
 75+ -------
 76+ l - density line object
 77+ linecoll - collection of vertical lines
 78+ kde - scipy.stats.kde.gaussian_kde
 79+
 80+ Additional keyword arguments are passed to plot the density line
 81+ '''
 82+ data = np.ravel(data)
 83+ x0, x1 = data.min(), data.max()
 84+ xmin = xmin or x0
 85+ xmax = xmax or x1
 86+ x = np.linspace(xmin, xmax, num)
 87+ kde = gaussian_kde(data)
 88+ d = kde.evaluate(x)
 89+ if 'axes' in kwargs:
 90+ ax = kwargs['axes']
 91+ elif 'figure' in kwargs:
 92+ ax = kwargs['figure'].axes[-1]
 93+ elif kwargs.pop('hold', False):
 94+ ax = pp.gca()
 95+ else:
 96+ fig = pp.figure()
 97+ ax = fig.add_subplot(111)
 98+ l = ax.plot(x,d, **kwargs)
 99+ y0, y1 = ax.get_ylim()
 100+ vmax = vmax or -(y1 - y0) / 30.
 101+ vmin = vmin or -(y1 - y0) / 10.
 102+ linesiter = ( [(d, vmax), (d, vmin)] for d in data )
 103+ linecoll = LineCollection(linesiter, color=lc, alpha=.1)
 104+ ax.add_collection(linecoll)
 105+ ax.set_ylim(y0 - (y1 - vmin)/9., y1)
 106+ pp.draw()
 107+ return l, linecoll, kde

Status & tagging log