r94855 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r94854‎ | r94855 | r94856 >
Date:01:43, 18 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
renamed relax -> sefit, added timechart, completed mrtchart, renamed fitting_batch.sh -> fitbooklet.sh
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/fitbooklet.sh (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/fitting_batch.sh (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/mrtchart (modified) (history)
  • /trunk/tools/wsor/editor_lifecycle/relax (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/sefit (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/timechart (added) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/relax
@@ -1,81 +0,0 @@
2 -#!/usr/bin/python
3 -#:vim:ft=python
4 -
5 -''' batch model fitting (usable with xargs)'''
6 -
7 -import re
8 -import os
9 -import sys
10 -import numpy as np
11 -from argparse import ArgumentParser
12 -
13 -from lifecycle.models import StretchedExpon
14 -
15 -__prog__ = os.path.basename(__file__)
16 -
17 -parser = ArgumentParser(description=__doc__)
18 -parser.add_argument('data_file', metavar='data')
19 -parser.add_argument('-m', '--min-size', type=int, default=0)
20 -parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
21 -parser.add_argument('--maxfev', type=int, default=10000)
22 -parser.add_argument('--debug', action='store_true')
23 -parser.add_argument('--split-name', action='store_true', help='split input file'
24 - ' name into date and a rate part.')
25 -
26 -if __name__ == '__main__':
27 - # parse command line
28 - ns = parser.parse_args()
29 -
30 - # read data, filter data
31 - x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
32 - idx = (ye > 0) * (n > ns.min_size)
33 - if idx.sum() == 0:
34 - print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
35 - % (__prog__, ns.data_file)
36 - sys.exit(1)
37 - if idx.sum() < 4:
38 - print >> sys.stderr, '%s: error: non identifiable data in %s'\
39 - % (__prog__, ns.data_file)
40 - sys.exit(1)
41 -
42 - # create model, set constraints
43 - model = StretchedExpon()
44 - if ns.constrain in ['head', 'both']:
45 - model.A = y[np.argmin(np.abs(x))]
46 - if ns.constrain in ['tail', 'both']:
47 - model.C = y.min()
48 -
49 - # fit model
50 - try:
51 - pest, pcov = model.fit(x[idx], y[idx], ye[idx], maxfev=ns.maxfev, warning=False)
52 - except ValueError, e:
53 - print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
54 - e.message, ns.data_file)
55 - if ns.debug:
56 - raise
57 - else:
58 - sys.exit(1)
59 - if np.isscalar(pcov) or np.isinf(pcov).any():
60 - print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (\
61 - __prog__, ns.data_file)
62 - sys.exit(1)
63 -
64 - # compute errors, MRT
65 - perr = np.sqrt(np.diag(pcov)) / 2.
66 - model.setparams(*zip(pest,perr))
67 - mrt = model.mrt(model.tau, model.beta)
68 - N = len(model.__params__)
69 - params = np.empty((N * 2 + 1,), dtype=float)
70 - params[:N] = [ model.A, model.tau, model.beta, model.C ]
71 - params[N:2*N] = map(lambda k : k or np.nan, [ model.A_err, model.tau_err,
72 - model.beta_err, model.C_err ])
73 - params[-1] = mrt
74 -
75 - # print output line
76 - key, _ = os.path.splitext(ns.data_file)
77 - if ns.split_name:
78 - key = key.split('_')
79 - else:
80 - key = [ key ]
81 - params = map(lambda k : '%12.5g' % k, params)
82 - print '\t'.join(key + params)
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
@@ -1,43 +0,0 @@
2 -#!/bin/bash
3 -
4 -# Applies the `fitting' script to a batch of files
5 -#
6 -# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
7 -#
8 -# USAGE: fitting_batch.sh file1 file2 file3 ...
9 -#
10 -# This will produce the normal console output that fitting produces; PDF plots
11 -# will be stored in file fit.pdf (please note: no check against overwriting
12 -# existing versions is performed!)
13 -
14 -if [[ -z `type -p fitting` ]] ; then
15 - echo 'error: could not find fitting script. Check your PATH'
16 - exit 1
17 -fi
18 -
19 -if [[ -e fit.pdf ]] ; then
20 - echo 'error: cannot overwrite file fit.pdf'
21 - exit 1
22 -fi
23 -
24 -O=`mktemp -d`
25 -models="expon powerlaw stretchedexp"
26 -files="$@"
27 -
28 -for file in $files ; do
29 - for model in $models ; do
30 - fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
31 - echo
32 - echo
33 - done
34 -done
35 -
36 -pdfs=`ls $O/*.pdf | sort`
37 -
38 -gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null
39 -
40 -if [[ $? = 0 ]] ; then
41 - echo 'images saved in fit.pdf'
42 -else
43 - echo "error: problem saving fit.pdf. Individual image files in $O"
44 -fi
Index: trunk/tools/wsor/editor_lifecycle/timechart
@@ -0,0 +1,79 @@
 2+#!/usr/bin/python
 3+
 4+''' plots cohort rate date '''
 5+
 6+import os
 7+import sys
 8+
 9+import numpy as np
 10+import matplotlib.pyplot as pp
 11+
 12+from argparse import ArgumentParser
 13+from matplotlib.font_manager import FontProperties
 14+
 15+__prog__ = os.path.basename(__file__)
 16+
 17+parser = ArgumentParser(description=__doc__)
 18+parser.add_argument('data_paths', metavar='data', nargs='+')
 19+parser.add_argument('-m', '--minsize', type=int, default=0)
 20+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
 21+parser.add_argument('-T', '--title')
 22+
 23+markers = 'ov^<>sp*+xD'
 24+colors = 'bgrcmykw'
 25+
 26+if __name__ == '__main__':
 27+ ns = parser.parse_args()
 28+
 29+ # create figure
 30+ fig = pp.figure(figsize=(10,4))
 31+ ax = fig.add_axes(pp.axes([.1,.1,.65,.8], axisbg='ghostwhite'))
 32+ M = len(markers)
 33+ C = len(colors)
 34+
 35+ lines = []
 36+
 37+ for i, path in enumerate(ns.data_paths):
 38+
 39+ # load cohort data and filter out estimates based on samples with size
 40+ # smaller than minimum requested
 41+ days, rate, rate_err, size = np.loadtxt(path, delimiter='\t', unpack=1)
 42+ idx = size >= ns.minsize
 43+ days = days[idx]
 44+ rate = rate[idx]
 45+ rate_err = rate_err[idx]
 46+ if len(days) == 0:
 47+ print >> sys.stderr, '%s: warning: skipping empty dataset: %s' % \
 48+ (__prog__, path)
 49+ continue
 50+
 51+ # plot errorbars
 52+ l, (wu, wd), mc = ax.errorbar(days, rate, rate_err, marker=markers[i % M], color=colors[i
 53+ % C], label=os.path.splitext(path)[0].replace('_',' '),
 54+ ecolor='none', ls=':', lw=2)
 55+ lines.append(l)
 56+ pp.setp(wd, ls='none')
 57+
 58+ # decorate figure
 59+ pp.xlabel('days since registration')
 60+ pp.ylabel('edits/day')
 61+ pp.figlegend(lines, [ l.get_label() for l in lines ],
 62+ loc='center right', prop=FontProperties(size='small'))
 63+ pp.minorticks_on()
 64+ pp.grid("on")
 65+ pp.axis('tight')
 66+
 67+ pp.draw()
 68+ if ns.title is not None:
 69+ pp.title(ns.title)
 70+ pp.draw()
 71+
 72+ # save to file, if output path specified
 73+ if ns.output_path is not None:
 74+ _, ext = os.path.splitext(ns.output_path)
 75+ fmt = ext.strip('.') or 'pdf'
 76+ pp.savefig(ns.output_path, fmt=ext)
 77+ print '%s: output saved to %s' % (__prog__, ns.output_path)
 78+
 79+ pp.show()
 80+
Property changes on: trunk/tools/wsor/editor_lifecycle/timechart
___________________________________________________________________
Added: svn:executable
181 + *
Index: trunk/tools/wsor/editor_lifecycle/mrtchart
@@ -1,4 +1,5 @@
22 #!/usr/bin/python
 3+# coding: utf8
34
45 import re
56 import os
@@ -16,7 +17,10 @@
1718
1819 parser = ArgumentParser(description=__doc__)
1920 parser.add_argument('data_paths', metavar='data', nargs='+')
20 -parser.add_argument('-title',)
 21+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
 22+parser.add_argument('-T', '--title')
 23+parser.add_argument('-s', '--significance', default=0, help='plot MRT only '
 24+ 'for fits with significance >= %(metavar)s', metavar='LEVEL', type=float)
2125
2226 markers = 'ov^<>sp*+xD'
2327 colors = 'bgrcmykw'
@@ -30,49 +34,69 @@
3135 ''' detect extension and loads data using numpy.io functions '''
3236 _, ext = os.path.splitext(path)
3337 if re.match('^\.npy$', ext, re.I):
34 - return np.load(path)[:,[0,-1]]
 38+ return np.load(path)
3539 elif re.match('^\.tsv$', ext, re.I) or re.match('^\.txt$', ext, re.I):
3640 default = datetime(2001,1,1)
3741 def parse(timestamp):
3842 return dateparser.parse(timestamp, default)
39 - conv = {0: parse, -1: float}
 43+ conv = {0: parse, 1: int, 2: float, 3: float, 4: float, 5: float}
4044 data = np.loadtxt(path, delimiter='\t', converters=conv, dtype=object)
41 - return data[:, [0,-1]]
 45+ return data
4246 raise UnsupportedFileFormatError(path)
4347
44 -def clean(data):
45 - data = filter(lambda k : k[1] < 3000 and k[1] > 0, data)
 48+def clean(data, level=0):
 49+ data = filter(lambda k : k[2] < 3000 and k[2] > 0 and k[-1] >= level, data)
4650 data = sorted(data, key=lambda k : k[0])
47 - return zip(*data)
 51+ return np.asarray(data)
4852
4953 def main(ns):
5054
5155 # create figure and axis
52 - fig = pp.figure(figsize=(8,4))
53 - ax = fig.add_axes(pp.axes([.15,.1,.8,.8], axisbg='antiquewhite'))
 56+ fig = pp.figure(figsize=(10,4))
 57+ ax = fig.add_axes(pp.axes([.1,.1,.65,.8], axisbg='ghostwhite'))
5458 M = len(markers)
5559 C = len(colors)
5660
 61+ print 'date: %s' % datetime.now()
 62+
5763 # plot lines
5864 for i, path in enumerate(ns.data_paths):
5965 try:
6066 name, ext = os.path.splitext(path)
6167 name = name.replace('_',' ')
62 - data = load(path)
63 - dates, mrt = clean(data)
64 - ax.plot(dates, mrt, ':'+markers[i % M]+colors[i % C], label=name)
 68+ data = clean(load(path), ns.significance)
 69+ if len(data):
 70+ dates, a, mrt, R2, Chi2, pval = data.T
 71+ ax.plot(dates, mrt, ':'+markers[i % M]+colors[i % C], label=name)
 72+ print 'dataset: %s, average R^2: %.5g, points at α > %d%%: %d'\
 73+ % (path, np.mean(R2), ns.significance * 100, len(data))
 74+ else:
 75+ print 'dataset: %s, average R^2: N/A, points at %d%%: 0'\
 76+ % (path, ns.significance * 100)
6577 except UnsupportedFileFormatError,e:
6678 print >> sys.stderr, '%s: error: unsupported file type %s (.npy,'\
6779 '.tsv, .txt accepted)' % (__prog__, e.args[0])
6880 sys.exit(1)
6981
70 - pp.title('100-500 edits/year')
 82+
 83+ # decorate
7184 pp.ylabel('average time to inactivity')
7285 pp.xlabel('')
73 - pp.legend(loc='best', prop=FontProperties(size='small'))
 86+ pp.figlegend(ax.lines, [ l.get_label() for l in ax.lines ],
 87+ loc='center right', prop=FontProperties(size='small'))
7488 pp.minorticks_on()
7589 pp.grid("on")
76 - pp.savefig('test.pdf')
 90+ if ns.title is not None:
 91+ pp.title(ns.title)
 92+ pp.draw()
 93+
 94+ # save to file, if output path specified
 95+ if ns.output_path is not None:
 96+ _, ext = os.path.splitext(ns.output_path)
 97+ fmt = ext.strip('.') or 'pdf'
 98+ pp.savefig(ns.output_path, fmt=ext)
 99+ print '%s: output saved to %s' % (__prog__, ns.output_path)
 100+
77101 pp.show()
78102
79103 if __name__ == '__main__':
Index: trunk/tools/wsor/editor_lifecycle/sefit
@@ -0,0 +1,78 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+
 5+''' batch model fitting (usable with xargs)'''
 6+
 7+import re
 8+import os
 9+import sys
 10+import numpy as np
 11+from argparse import ArgumentParser
 12+
 13+from lifecycle.models import StretchedExpon
 14+
 15+__prog__ = os.path.basename(__file__)
 16+
 17+parser = ArgumentParser(description=__doc__)
 18+parser.add_argument('data_file', metavar='data')
 19+parser.add_argument('-m', '--min-size', type=int, default=0)
 20+parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
 21+parser.add_argument('--maxfev', type=int, default=10000)
 22+parser.add_argument('--debug', action='store_true')
 23+
 24+if __name__ == '__main__':
 25+ # parse command line
 26+ ns = parser.parse_args()
 27+
 28+ # read data, filter data
 29+ x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
 30+ idx = (ye > 0) * (n > ns.min_size)
 31+ if idx.sum() == 0:
 32+ print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
 33+ % (__prog__, ns.data_file)
 34+ sys.exit(1)
 35+ if idx.sum() < 4:
 36+ print >> sys.stderr, '%s: error: non identifiable data in %s'\
 37+ % (__prog__, ns.data_file)
 38+ sys.exit(1)
 39+ x = x[idx]
 40+ y = y[idx]
 41+ ye = ye[idx]
 42+
 43+ # create model, set fit constraints if any
 44+ model = StretchedExpon()
 45+ if ns.constrain in ['head', 'both']:
 46+ model.A = y[np.argmin(np.abs(x))]
 47+ if ns.constrain in ['tail', 'both']:
 48+ model.C = y.min()
 49+
 50+ # fit model to data
 51+ try:
 52+ pest, pcov = model.fit(x, y, ye, maxfev=ns.maxfev, warning=False)
 53+ except ValueError, e:
 54+ print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
 55+ e.message, ns.data_file)
 56+ if ns.debug:
 57+ raise
 58+ else:
 59+ sys.exit(1)
 60+ if np.isscalar(pcov) or np.isinf(pcov).any():
 61+ print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (\
 62+ __prog__, ns.data_file)
 63+ sys.exit(1)
 64+
 65+ # compute errors, MRT, GoF, coefficient of determination
 66+ perr = np.sqrt(np.diag(pcov)) / 2.
 67+ model.setparams(*zip(pest,perr))
 68+ mrt = model.mrt(model.tau, model.beta)
 69+ gof, resid, Rsquared = model.gof(x, y, ye)
 70+ model.goftest = gof
 71+ model.residtest = resid
 72+ model.Rsquared = Rsquared
 73+
 74+ # print output
 75+ key, _ = os.path.splitext(ns.data_file)
 76+ key = key.split('_')
 77+ output = [ mrt, model.Rsquared, model.goftest[0], model.goftest[1] ]
 78+ output = map(lambda k : '%12.5g' % k, output)
 79+ print '\t'.join(key + output)
Property changes on: trunk/tools/wsor/editor_lifecycle/sefit
___________________________________________________________________
Added: svn:executable
180 + *
Index: trunk/tools/wsor/editor_lifecycle/fitbooklet.sh
@@ -0,0 +1,43 @@
 2+#!/bin/bash
 3+
 4+# Applies the `fitting' script to a batch of files
 5+#
 6+# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
 7+#
 8+# USAGE: fitbooklet.sh file1 file2 file3 ...
 9+#
 10+# This will produce the normal console output that fitting produces; PDF plots
 11+# will be stored in file fit.pdf (please note: the script exits with an error
 12+# if fit.pdf already exists, so an existing file is never overwritten)
 13+
 14+if [[ -z `type -p fitting` ]] ; then
 15+ echo 'error: could not find fitting script. Check your PATH'
 16+ exit 1
 17+fi
 18+
 19+if [[ -e fit.pdf ]] ; then
 20+ echo 'error: cannot overwrite file fit.pdf'
 21+ exit 1
 22+fi
 23+
 24+O=`mktemp -d`
 25+models="expon powerlaw stretchedexp"
 26+files="$@"
 27+
 28+for file in $files ; do
 29+ for model in $models ; do
 30+ fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
 31+ echo
 32+ echo
 33+ done
 34+done
 35+
 36+pdfs=`ls $O/*.pdf | sort`
 37+
 38+gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null
 39+
 40+if [[ $? = 0 ]] ; then
 41+ echo 'images saved in fit.pdf'
 42+else
 43+ echo "error: problem saving fit.pdf. Individual image files in $O"
 44+fi
Property changes on: trunk/tools/wsor/editor_lifecycle/fitbooklet.sh
___________________________________________________________________
Added: svn:executable
145 + *

Status & tagging log