r94855 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r94854 | r94855 | r94856 >
Date:	01:43, 18 August 2011
Author:	giovanni
Status:	deferred
Tags:
Comment:	renamed relax -> sefit, added timechart, completed mrtchart, renamed fitting_batch.sh fitbooklet.sh
Modified paths:	/trunk/tools/wsor/editor_lifecycle/fitbooklet.sh (added) (history) /trunk/tools/wsor/editor_lifecycle/fitting_batch.sh (deleted) (history) /trunk/tools/wsor/editor_lifecycle/mrtchart (modified) (history) /trunk/tools/wsor/editor_lifecycle/relax (deleted) (history) /trunk/tools/wsor/editor_lifecycle/sefit (added) (history) /trunk/tools/wsor/editor_lifecycle/timechart (added) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/relax
—	—	@@ -1,81 +0,0 @@
2		~~-#!/usr/bin/python~~
3		~~-#:vim:ft=python~~
4		-
5		~~-''' batch model fitting (usable with xargs)'''~~
6		-
7		~~-import re~~
8		~~-import os~~
9		~~-import sys~~
10		~~-import numpy as np~~
11		~~-from argparse import ArgumentParser~~
12		-
13		~~-from lifecycle.models import StretchedExpon~~
14		-
15		~~-__prog__ = os.path.basename(__file__)~~
16		-
17		~~-parser = ArgumentParser(description=__doc__)~~
18		~~-parser.add_argument('data_file', metavar='data')~~
19		~~-parser.add_argument('-m', '--min-size', type=int, default=0)~~
20		~~-parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])~~
21		~~-parser.add_argument('--maxfev', type=int, default=10000)~~
22		~~-parser.add_argument('--debug', action='store_true')~~
23		~~-parser.add_argument('--split-name', action='store_true', help='split input file'~~
24		~~- ' name into date and a rate part.')~~
25		-
26		~~-if __name__ == '__main__':~~
27		~~- # parse command line~~
28		~~- ns = parser.parse_args()~~
29		-
30		~~- # read data, filter data~~
31		~~- x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)~~
32		~~- idx = (ye > 0) * (n > ns.min_size)~~
33		~~- if idx.sum() == 0:~~
34		~~- print >> sys.stderr, '%s: error: no data meeting requirements in %s'\~~
35		~~- % (__prog__, ns.data_file)~~
36		~~- sys.exit(1)~~
37		~~- if idx.sum() < 4:~~
38		~~- print >> sys.stderr, '%s: error: non identifiable data in %s'\~~
39		~~- % (__prog__, ns.data_file)~~
40		~~- sys.exit(1)~~
41		-
42		~~- # create model, set constraints~~
43		~~- model = StretchedExpon()~~
44		~~- if ns.constrain in ['head', 'both']:~~
45		~~- model.A = y[np.argmin(np.abs(x))]~~
46		~~- if ns.constrain in ['tail', 'both']:~~
47		~~- model.C = y.min()~~
48		-
49		~~- # fit model~~
50		~~- try:~~
51		~~- pest, pcov = model.fit(x[idx], y[idx], ye[idx], maxfev=ns.maxfev, warning=False)~~
52		~~- except ValueError, e:~~
53		~~- print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,~~
54		~~- e.message, ns.data_file)~~
55		~~- if ns.debug:~~
56		~~- raise~~
57		~~- else:~~
58		~~- sys.exit(1)~~
59		~~- if np.isscalar(pcov) or np.isinf(pcov).any():~~
60		~~- print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (\~~
61		~~- __prog__, ns.data_file)~~
62		~~- sys.exit(1)~~
63		-
64		~~- # compute errors, MRT~~
65		~~- perr = np.sqrt(np.diag(pcov)) / 2.~~
66		~~- model.setparams(*zip(pest,perr))~~
67		~~- mrt = model.mrt(model.tau, model.beta)~~
68		~~- N = len(model.__params__)~~
69		~~- params = np.empty((N * 2 + 1,), dtype=float)~~
70		~~- params[:N] = [ model.A, model.tau, model.beta, model.C ]~~
71		~~- params[N:2*N] = map(lambda k : k or np.nan, [ model.A_err, model.tau_err,~~
72		~~- model.beta_err, model.C_err ])~~
73		~~- params[-1] = mrt~~
74		-
75		~~- # print output line~~
76		~~- key, _ = os.path.splitext(ns.data_file)~~
77		~~- if ns.split_name:~~
78		~~- key = key.split('_')~~
79		~~- else:~~
80		~~- key = [ key ]~~
81		~~- params = map(lambda k : '%12.5g' % k, params)~~
82		~~- print '\t'.join(key + params)~~
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
—	—	@@ -1,43 +0,0 @@
2		~~-#!/bin/bash~~
3		-
4		~~-# Applies the `fitting' script to a batch of files~~
5		-#
6		~~-# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>~~
7		-#
8		~~-# USAGE: fitting_batch.sh file1 file2 file3 ...~~
9		-#
10		~~-# This will produce the normal console output that fitting produces; PDF plots~~
11		~~-# will be stored in file fit.pdf (please note: no check against overwriting~~
12		~~-# existing versions is performed!)~~
13		-
14		~~-if [[ -z `type -p fitting` ]] ; then~~
15		~~- echo 'error: could not find fitting script. Check your PATH'~~
16		~~- exit 1~~
17		~~-fi~~
18		-
19		~~-if [[ -e fit.pdf ]] ; then~~
20		~~- echo 'error: cannot overwrite file fit.pdf'~~
21		~~- exit 1~~
22		~~-fi~~
23		-
24		~~-O=`mktemp -d`~~
25		~~-models="expon powerlaw stretchedexp"~~
26		~~-files="$@"~~
27		-
28		~~-for file in $files ; do~~
29		~~- for model in $models ; do~~
30		~~- fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf~~
31		~~- echo~~
32		~~- echo~~
33		~~- done~~
34		~~-done~~
35		-
36		~~-pdfs=`ls $O/*.pdf | sort`~~
37		-
38		~~-gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null~~
39		-
40		~~-if [[ $? = 0 ]] ; then~~
41		~~- echo 'images saved in fit.pdf'~~
42		~~-else~~
43		~~- echo "error: problem saving fit.pdf. Individual image files in $O"~~
44		~~-fi~~
Index: trunk/tools/wsor/editor_lifecycle/timechart
—	—	@@ -0,0 +1,79 @@
	2	+#!/usr/bin/python
	3	+
	4	+''' plots cohort rate date '''
	5	+
	6	+import os
	7	+import sys
	8	+
	9	+import numpy as np
	10	+import matplotlib.pyplot as pp
	11	+
	12	+from argparse import ArgumentParser
	13	+from matplotlib.font_manager import FontProperties
	14	+
	15	+__prog__ = os.path.basename(__file__)
	16	+
	17	+parser = ArgumentParser(description=__doc__)
	18	+parser.add_argument('data_paths', metavar='data', nargs='+')
	19	+parser.add_argument('-m', '--minsize', type=int, default=0)
	20	+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
	21	+parser.add_argument('-T', '--title')
	22	+
	23	+markers = 'ov^<>sp*+xD'
	24	+colors = 'bgrcmykw'
	25	+
	26	+if __name__ == '__main__':
	27	+ ns = parser.parse_args()
	28	+
	29	+ # create figure
	30	+ fig = pp.figure(figsize=(10,4))
	31	+ ax = fig.add_axes(pp.axes([.1,.1,.65,.8], axisbg='ghostwhite'))
	32	+ M = len(markers)
	33	+ C = len(colors)
	34	+
	35	+ lines = []
	36	+
	37	+ for i, path in enumerate(ns.data_paths):
	38	+
	39	+ # load cohort data and filter out estimates based on samples with size
	40	+ # smaller than minimum requested
	41	+ days, rate, rate_err, size = np.loadtxt(path, delimiter='\t', unpack=1)
	42	+ idx = size >= ns.minsize
	43	+ days = days[idx]
	44	+ rate = rate[idx]
	45	+ rate_err = rate_err[idx]
	46	+ if len(days) == 0:
	47	+ print >> sys.stderr, '%s: warning: skipping empty dataset: %s' % \
	48	+ (__prog__, path)
	49	+ continue
	50	+
	51	+ # plot errorbars
	52	+ l, (wu, wd), mc = ax.errorbar(days, rate, rate_err, marker=markers[i % M], color=colors[i
	53	+ % C], label=os.path.splitext(path)[0].replace('_',' '),
	54	+ ecolor='none', ls=':', lw=2)
	55	+ lines.append(l)
	56	+ pp.setp(wd, ls='none')
	57	+
	58	+ # decorate figure
	59	+ pp.xlabel('days since registration')
	60	+ pp.ylabel('edits/day')
	61	+ pp.figlegend(lines, [ l.get_label() for l in lines ],
	62	+ loc='center right', prop=FontProperties(size='small'))
	63	+ pp.minorticks_on()
	64	+ pp.grid("on")
	65	+ pp.axis('tight')
	66	+
	67	+ pp.draw()
	68	+ if ns.title is not None:
	69	+ pp.title(ns.title)
	70	+ pp.draw()
	71	+
	72	+ # save to file, if output path specified
	73	+ if ns.output_path is not None:
	74	+ _, ext = os.path.splitext(ns.output_path)
	75	+ fmt = ext.strip('.') or 'pdf'
	76	+ pp.savefig(ns.output_path, fmt=ext)
	77	+ print '%s: output saved to %s' % (__prog__, ns.output_path)
	78	+
	79	+ pp.show()
	80	+
Property changes on: trunk/tools/wsor/editor_lifecycle/timechart
___________________________________________________________________
Added: svn:executable
1	81	+ *
Index: trunk/tools/wsor/editor_lifecycle/mrtchart
—	—	@@ -1,4 +1,5 @@
2	2	#!/usr/bin/python
	3	+# coding: utf8
3	4
4	5	import re
5	6	import os
—	—	@@ -16,7 +17,10 @@
17	18
18	19	parser = ArgumentParser(description=__doc__)
19	20	parser.add_argument('data_paths', metavar='data', nargs='+')
20		~~-parser.add_argument('-title',)~~
	21	+parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
	22	+parser.add_argument('-T', '--title')
	23	+parser.add_argument('-s', '--significance', default=0, help='plot MRT only '
	24	+ 'for fits with significance >= %(metavar)s', metavar='LEVEL', type=float)
21	25
22	26	markers = 'ov^<>sp*+xD'
23	27	colors = 'bgrcmykw'
—	—	@@ -30,49 +34,69 @@
31	35	''' detect extension and loads data using numpy.io functions '''
32	36	_, ext = os.path.splitext(path)
33	37	if re.match('^\.npy$', ext, re.I):
34		~~- return np.load(path)[:,[0,-1]]~~
	38	+ return np.load(path)
35	39	elif re.match('^\.tsv$', ext, re.I) or re.match('^\.txt$', ext, re.I):
36	40	default = datetime(2001,1,1)
37	41	def parse(timestamp):
38	42	return dateparser.parse(timestamp, default)
39		~~- conv = {0: parse, -1: float}~~
	43	+ conv = {0: parse, 1: int, 2: float, 3: float, 4: float, 5: float}
40	44	data = np.loadtxt(path, delimiter='\t', converters=conv, dtype=object)
41		~~- return data[:, [0,-1]]~~
	45	+ return data
42	46	raise UnsupportedFileFormatError(path)
43	47
44		~~-def clean(data):~~
45		~~- data = filter(lambda k : k[1] < 3000 and k[1] > 0, data)~~
	48	+def clean(data, level=0):
	49	+ data = filter(lambda k : k[2] < 3000 and k[2] > 0 and k[-1] >= level, data)
46	50	data = sorted(data, key=lambda k : k[0])
47		~~- return zip(*data)~~
	51	+ return np.asarray(data)
48	52
49	53	def main(ns):
50	54
51	55	# create figure and axis
52		~~- fig = pp.figure(figsize=(8,4))~~
53		~~- ax = fig.add_axes(pp.axes([.15,.1,.8,.8], axisbg='antiquewhite'))~~
	56	+ fig = pp.figure(figsize=(10,4))
	57	+ ax = fig.add_axes(pp.axes([.1,.1,.65,.8], axisbg='ghostwhite'))
54	58	M = len(markers)
55	59	C = len(colors)
56	60
	61	+ print 'date: %s' % datetime.now()
	62	+
57	63	# plot lines
58	64	for i, path in enumerate(ns.data_paths):
59	65	try:
60	66	name, ext = os.path.splitext(path)
61	67	name = name.replace('_',' ')
62		~~- data = load(path)~~
63		~~- dates, mrt = clean(data)~~
64		~~- ax.plot(dates, mrt, ':'+markers[i % M]+colors[i % C], label=name)~~
	68	+ data = clean(load(path), ns.significance)
	69	+ if len(data):
	70	+ dates, a, mrt, R2, Chi2, pval = data.T
	71	+ ax.plot(dates, mrt, ':'+markers[i % M]+colors[i % C], label=name)
	72	+ print 'dataset: %s, average R^2: %.5g, points at α > %d%%: %d'\
	73	+ % (path, np.mean(R2), ns.significance * 100, len(data))
	74	+ else:
	75	+ print 'dataset: %s, average R^2: N/A, points at %d%%: 0'\
	76	+ % (path, ns.significance * 100)
65	77	except UnsupportedFileFormatError,e:
66	78	print >> sys.stderr, '%s: error: unsupported file type %s (.npy,'\
67	79	'.tsv, .txt accepted)' % (__prog__, e.args[0])
68	80	sys.exit(1)
69	81
70		~~- pp.title('100-500 edits/year')~~
	82	+
	83	+ # decorate
71	84	pp.ylabel('average time to inactivity')
72	85	pp.xlabel('')
73		~~- pp.legend(loc='best', prop=FontProperties(size='small'))~~
	86	+ pp.figlegend(ax.lines, [ l.get_label() for l in ax.lines ],
	87	+ loc='center right', prop=FontProperties(size='small'))
74	88	pp.minorticks_on()
75	89	pp.grid("on")
76		~~- pp.savefig('test.pdf')~~
	90	+ if ns.title is not None:
	91	+ pp.title(ns.title)
	92	+ pp.draw()
	93	+
	94	+ # save to file, if output path specified
	95	+ if ns.output_path is not None:
	96	+ _, ext = os.path.splitext(ns.output_path)
	97	+ fmt = ext.strip('.') or 'pdf'
	98	+ pp.savefig(ns.output_path, fmt=ext)
	99	+ print '%s: output saved to %s' % (__prog__, ns.output_path)
	100	+
77	101	pp.show()
78	102
79	103	if __name__ == '__main__':
Index: trunk/tools/wsor/editor_lifecycle/sefit
—	—	@@ -0,0 +1,78 @@
	2	+#!/usr/bin/python
	3	+#:vim:ft=python
	4	+
	5	+''' batch model fitting (usable with xargs)'''
	6	+
	7	+import re
	8	+import os
	9	+import sys
	10	+import numpy as np
	11	+from argparse import ArgumentParser
	12	+
	13	+from lifecycle.models import StretchedExpon
	14	+
	15	+__prog__ = os.path.basename(__file__)
	16	+
	17	+parser = ArgumentParser(description=__doc__)
	18	+parser.add_argument('data_file', metavar='data')
	19	+parser.add_argument('-m', '--min-size', type=int, default=0)
	20	+parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
	21	+parser.add_argument('--maxfev', type=int, default=10000)
	22	+parser.add_argument('--debug', action='store_true')
	23	+
	24	+if __name__ == '__main__':
	25	+ # parse command line
	26	+ ns = parser.parse_args()
	27	+
	28	+ # read data, filter data
	29	+ x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
	30	+ idx = (ye > 0) * (n > ns.min_size)
	31	+ if idx.sum() == 0:
	32	+ print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
	33	+ % (__prog__, ns.data_file)
	34	+ sys.exit(1)
	35	+ if idx.sum() < 4:
	36	+ print >> sys.stderr, '%s: error: non identifiable data in %s'\
	37	+ % (__prog__, ns.data_file)
	38	+ sys.exit(1)
	39	+ x = x[idx]
	40	+ y = y[idx]
	41	+ ye = ye[idx]
	42	+
	43	+ # create model, set fit constraints if any
	44	+ model = StretchedExpon()
	45	+ if ns.constrain in ['head', 'both']:
	46	+ model.A = y[np.argmin(np.abs(x))]
	47	+ if ns.constrain in ['tail', 'both']:
	48	+ model.C = y.min()
	49	+
	50	+ # fit model to data
	51	+ try:
	52	+ pest, pcov = model.fit(x, y, ye, maxfev=ns.maxfev, warning=False)
	53	+ except ValueError, e:
	54	+ print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
	55	+ e.message, ns.data_file)
	56	+ if ns.debug:
	57	+ raise
	58	+ else:
	59	+ sys.exit(1)
	60	+ if np.isscalar(pcov) or np.isinf(pcov).any():
	61	+ print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (\
	62	+ __prog__, ns.data_file)
	63	+ sys.exit(1)
	64	+
	65	+ # compute errors, MRT, GoF, coefficient of determination
	66	+ perr = np.sqrt(np.diag(pcov)) / 2.
	67	+ model.setparams(*zip(pest,perr))
	68	+ mrt = model.mrt(model.tau, model.beta)
	69	+ gof, resid, Rsquared = model.gof(x, y, ye)
	70	+ model.goftest = gof
	71	+ model.residtest = resid
	72	+ model.Rsquared = Rsquared
	73	+
	74	+ # print output
	75	+ key, _ = os.path.splitext(ns.data_file)
	76	+ key = key.split('_')
	77	+ output = [ mrt, model.Rsquared, model.goftest[0], model.goftest[1] ]
	78	+ output = map(lambda k : '%12.5g' % k, output)
	79	+ print '\t'.join(key + output)
Property changes on: trunk/tools/wsor/editor_lifecycle/sefit
___________________________________________________________________
Added: svn:executable
1	80	+ *
Index: trunk/tools/wsor/editor_lifecycle/fitbooklet.sh
—	—	@@ -0,0 +1,43 @@
	2	+#!/bin/bash
	3	+
	4	+# Applies the `fitting' script to a batch of files
	5	+#
	6	+# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
	7	+#
	8	+# USAGE: fitbooklet.sh file1 file2 file3 ...
	9	+#
	10	+# This will produce the normal console output that fitting produces; PDF plots
	11	+# will be stored in file fit.pdf (please note: the script refuses to run if
	12	+# fit.pdf already exists, so an existing version is never overwritten)
	13	+
	14	+if [[ -z `type -p fitting` ]] ; then
	15	+ echo 'error: could not find fitting script. Check your PATH'
	16	+ exit 1
	17	+fi
	18	+
	19	+if [[ -e fit.pdf ]] ; then
	20	+ echo 'error: cannot overwrite file fit.pdf'
	21	+ exit 1
	22	+fi
	23	+
	24	+O=`mktemp -d`
	25	+models="expon powerlaw stretchedexp"
	26	+files="$@"
	27	+
	28	+for file in $files ; do
	29	+ for model in $models ; do
	30	+ fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
	31	+ echo
	32	+ echo
	33	+ done
	34	+done
	35	+
	36	+pdfs=`ls $O/*.pdf | sort`
	37	+
	38	+gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null
	39	+
	40	+if [[ $? = 0 ]] ; then
	41	+ echo 'images saved in fit.pdf'
	42	+else
	43	+ echo "error: problem saving fit.pdf. Individual image files in $O"
	44	+fi
Property changes on: trunk/tools/wsor/editor_lifecycle/fitbooklet.sh
___________________________________________________________________
Added: svn:executable
1	45	+ *

Status & tagging log

13:34, 18 August 2011 Reedy (talk | contribs) changed the status of r94855 [removed: new added: deferred]