Index: trunk/tools/wsor/editor_lifecycle/scripts/mksamples |
— | — | @@ -0,0 +1,47 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +#:vim:ft=python |
| 4 | +# encoding:utf-8 |
| 5 | + |
| 6 | +''' groups user counts by day since registration ''' |
| 7 | + |
| 8 | +import os |
| 9 | +from argparse import ArgumentParser |
| 10 | +import numpy as np |
| 11 | +from scipy.sparse import coo_matrix |
| 12 | +from collections import deque |
| 13 | +from contextlib import closing |
| 14 | + |
# Command-line interface: one or more input archives, plus an optional
# prefix that is prepended to the name of every output file.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', nargs='+', metavar='file')
parser.add_argument(
    '-p', '--prefix',
    metavar='PREFIX',
    default='',
    dest='output_prefix',
)
| 19 | + |
def main(args):
    '''
    Convert each input archive into a text file of per-day edit counts.

    Every array stored in an archive is taken as the records of one user,
    with fields `day`, `ns` and `edits`. Records with a negative `ns` are
    dropped, days are renumbered from the user's first remaining day, and
    the edits of each day are summed over `ns`. Line k of the output file
    holds the space-separated day-k totals of every user whose records
    span at least k+1 days.

    args -- namespace with `input_paths` (list of archive paths) and
            `output_prefix` (string prepended to each output name). The
            output name is the prefix plus the input base name, with its
            extension replaced by '.tsv'.
    '''
    for path in args.input_paths:
        output_path = args.output_prefix + os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'
        day_counts = {}
        archive = np.load(path)
        print('%d users in %s' % (len(archive.files), path))
        with open(output_path, 'w') as out_file:
            for uid in archive.files:
                data = archive[uid].view(np.recarray)
                data = data[data.ns >= 0]
                if len(data) == 0:
                    # Nothing left to count for this user; min() of an
                    # empty array would raise ValueError.
                    continue
                rel_day = data.day - data.day.min()
                totals = coo_matrix(
                    (data.edits, (rel_day, data.ns))).tocsr().sum(axis=1)
                # The row sums come back as an (n, 1) matrix; flatten them
                # so that each entry converts to a plain int.
                for day, n in enumerate(np.asarray(totals).ravel()):
                    day_counts.setdefault(day, deque()).append(int(n))
            # -1 makes the range below empty when no user had any records.
            last_day = max(day_counts) if day_counts else -1
            # last_day itself holds data, so the range must include it.
            for day in range(last_day + 1):
                row = ' '.join(map(str, day_counts.get(day, [])))
                out_file.write(row + '\n')
        print('%s saved.' % output_path)
| 45 | + |
if __name__ == '__main__':
    # Script entry point: read the command line, then convert every input.
    args = parser.parse_args()
    main(args)
Index: trunk/tools/wsor/editor_lifecycle/scripts/fitcounts |
— | — | @@ -0,0 +1,55 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +#:vim:ft=python |
| 4 | +# encoding:utf-8 |
| 5 | + |
| 6 | +''' fits daily count samples ''' |
| 7 | + |
| 8 | +import os |
| 9 | +from contextlib import closing |
| 10 | +from argparse import ArgumentParser |
| 11 | +import numpy as np |
| 12 | +from scipy.stats import nbinom, geom, poisson, chisquare |
| 13 | +from scipy.optimize import fmin |
| 14 | +import matplotlib.pyplot as pp |
| 15 | + |
# Command-line interface: a single file of per-day count samples.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_path', metavar='file')

# Distributions fitted by main(); poisson and geom are currently disabled.
models = [nbinom]

# Starting point handed to the optimiser for each distribution, keyed by
# the distribution's name.
initial_args = {
    'nbinom': (5, .5),
    'poisson': (10,),
    'geom': (.5,),
}
| 21 | + |
def main(args):
    '''
    Fit every model to each daily sample and test the goodness of fit.

    Each line of the input file is one sample of whitespace-separated
    integer counts. Reading stops at the first line holding fewer than
    five values. For each remaining line and each distribution in
    `models`, the parameters are estimated by minimising the negative
    log-likelihood with `fmin`, started from `initial_args`, and the fit
    is checked with a chi-square test over the observed range of values.

    args -- namespace with `input_path`, the path of the samples file.

    Returns a pair of dictionaries keyed by model name: the fitted
    parameters (one row per processed line) and the chi-square p-values.
    Both hold empty arrays when no line was processed.
    '''
    model_params = {}
    model_pvalue = {}
    with open(args.input_path) as infile:
        for i, line in enumerate(infile):
            sample = np.asarray([int(tok) for tok in line.split()])
            if len(sample) < 5:
                print('day %d: skipping rest of file' % i)
                break
            # One cell per integer value between the sample minimum and
            # maximum, so observed and expected counts refer to the same
            # values.
            lowest = sample.min()
            support = np.arange(lowest, sample.max() + 1)
            f_obs = np.bincount(sample - lowest)
            for rv in models:
                def nll(k, rv=rv, sample=sample):
                    # negative log-likelihood of the sample under rv(*k)
                    return -rv(*k).logpmf(sample).sum()
                beta = fmin(nll, initial_args[rv.name], disp=False)
                probs = rv(*beta).pmf(support)
                # Expected counts scale with the number of observations
                # (not the sum of their values), renormalised over the
                # observed range so both totals agree, which chisquare
                # requires.
                f_exp = probs / probs.sum() * len(sample)
                _chisq, pval = chisquare(f_obs, f_exp, ddof=rv.numargs)
                model_params.setdefault(rv.name, []).append(beta)
                model_pvalue.setdefault(rv.name, []).append(pval)
            print('day %d: done' % i)
    for rv in models:
        # .get() keeps this from raising when no line was processed
        model_params[rv.name] = np.asarray(model_params.get(rv.name, []))
        model_pvalue[rv.name] = np.asarray(model_pvalue.get(rv.name, []))
    return model_params, model_pvalue
| 48 | + |
| 49 | + |
| 50 | +# pp.scatter(params.T[0], params.T[1], c='k', marker='.') |
| 51 | +# pp.show() |
| 52 | + |
| 53 | + |
if __name__ == '__main__':
    # Script entry point: fit the samples named on the command line and
    # keep the results in module-level names.
    args = parser.parse_args()
    params, pvalues = main(args)