r113492 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r113491‎ | r113492 | r113493 >
Date:18:29, 9 March 2012
Author:giovanni
Status:new
Tags:
Comment:
added scripts for analyzing daily cohort edit count data
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/scripts/fitcounts (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/scripts/mksamples (added) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
@@ -0,0 +1,47 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+# encoding:utf-8
 5+
 6+''' groups user counts by day since registration '''
 7+
 8+import os
 9+from argparse import ArgumentParser
 10+import numpy as np
 11+from scipy.sparse import coo_matrix
 12+from collections import deque
 13+from contextlib import closing
 14+
# Command-line interface. The description is taken from the module docstring.
parser = ArgumentParser(description=__doc__)
parser.add_argument(
    'input_paths',
    metavar='file',
    nargs='+',
    help='NumPy .npz archive(s) to read; one output file is written per '
         'archive',
)
parser.add_argument(
    '-p', '--prefix',
    dest='output_prefix',
    default='',
    metavar='PREFIX',
    help='string prepended to the base name of every output file '
         '(default: empty, i.e. write into the current directory)',
)
 19+
def main(args):
    ''' Write one file of per-day edit-count samples for each input archive.

    Every path in ``args.input_paths`` is loaded with ``np.load`` and each
    array in the archive is treated as one user's records, read through the
    ``ns``, ``day`` and ``edits`` fields. Rows with a negative ``ns`` are
    dropped, days are re-based so that the user's first remaining day is 0,
    and ``edits`` is summed over ``ns`` for each day.

    The output file is named ``args.output_prefix`` + the input base name
    with its extension replaced by ``.tsv``. Line *d* holds the
    space-separated totals of every user that has a day *d*.

    Parameters:
        args: object with ``input_paths`` (iterable of paths) and
            ``output_prefix`` (string), e.g. the parsed command line.
    '''
    for path in args.input_paths:
        output_path = args.output_prefix + os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'

        day_counts = {}  # day offset -> list of per-user edit totals
        # closing() releases the archive's file handle once all users are read
        with closing(np.load(path)) as archive:
            user_ids = archive.files
            print('%d users in %s' % (len(user_ids), path))
            for uid in user_ids:
                data = archive[uid].view(np.recarray)
                data = data[data.ns >= 0]
                if data.size == 0:
                    # nothing left for this user; min() below would raise
                    continue
                day_offset = data.day - data.day.min()
                # sparse (day, ns) matrix of edits, collapsed over ns
                per_day = coo_matrix(
                    (data.edits, (day_offset, data.ns))).tocsr().sum(axis=1)
                # flatten the (n_days, 1) result before converting to int
                for day, total in enumerate(np.asarray(per_day).ravel()):
                    day_counts.setdefault(day, []).append(int(total))

        # max key is the last day itself, so one more line than max is needed
        n_days = max(day_counts) + 1 if day_counts else 0
        with open(output_path, 'w') as out_file:
            for day in range(n_days):
                samples = day_counts.get(day, [])
                out_file.write(' '.join(map(str, samples)) + '\n')
        print('%s saved.' % output_path)
 45+
if __name__ == '__main__':
    # Script entry point: parse the command line, then process every
    # input file named on it.
    args = parser.parse_args()
    main(args)
Index: trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
@@ -0,0 +1,55 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+# encoding:utf-8
 5+
 6+''' fits daily count samples '''
 7+
 8+import os
 9+from contextlib import closing
 10+from argparse import ArgumentParser
 11+import numpy as np
 12+from scipy.stats import nbinom, geom, poisson, chisquare
 13+from scipy.optimize import fmin
 14+import matplotlib.pyplot as pp
 15+
# Command-line interface. The description is taken from the module docstring.
parser = ArgumentParser(description=__doc__)
parser.add_argument(
    'input_path',
    metavar='file',
    help='text file with one line per day, each line holding '
         'whitespace-separated integer counts',
)
 18+
# Distributions fitted to each day's sample. poisson and geom are switched
# off here but keep their starting points in initial_args.
models = [nbinom, ]  # poisson, geom ]
# Starting parameter vector handed to the optimizer, keyed by rv.name.
initial_args = {'nbinom': (5, .5), 'poisson': (10,), 'geom': (.5,)}


def main(args):
    ''' Fit every model to each day's sample and test the goodness of fit.

    ``args.input_path`` is read line by line; line *i* is parsed as the
    whitespace-separated integer sample for day *i*. Reading stops at the
    first line with fewer than 5 values. For each remaining line and each
    distribution in ``models`` the parameters are estimated by minimising
    the negative log-likelihood with ``fmin`` (started from
    ``initial_args``), and a chi-square test compares the observed counts
    of each integer value with the counts expected under the fitted model.

    Parameters:
        args: object with an ``input_path`` attribute, e.g. the parsed
            command line.

    Returns:
        ``(model_params, model_pvalue)``: two dicts keyed by distribution
        name. Each value is an array with one entry (row) per fitted day;
        the arrays are empty when no day was fitted.
    '''
    # Pre-seed every model so the conversion below works even when no line
    # was fitted (empty file, or a short first line).
    model_params = dict((rv.name, []) for rv in models)
    model_pvalue = dict((rv.name, []) for rv in models)

    with open(args.input_path) as infile:
        for i, line in enumerate(infile):
            sample = np.asarray([int(tok) for tok in line.split()], dtype=int)
            if len(sample) < 5:
                print('day %d: skipping rest of file' % i)
                break

            # Observed frequency of every integer value in [min, max]: one
            # bin per value, so the two largest values are not merged.
            lowest = sample.min()
            f_obs = np.bincount(sample - lowest)
            values = np.arange(lowest, lowest + len(f_obs))

            for rv in models:
                def nll(k, rv=rv, sample=sample):
                    # negative log-likelihood of the sample under rv(*k)
                    return -rv(*k).logpmf(sample).sum()

                beta = fmin(nll, initial_args[rv.name], disp=False)

                # Expected count of a value is (number of observations) *
                # pmf, not (sum of the observed values) * pmf.
                f_exp = rv(*beta).pmf(values) * len(sample)
                # chisquare needs matching totals; the fitted pmf also puts
                # mass outside the observed range, so rescale to the
                # observed total (i.e. condition on that range).
                f_exp *= f_obs.sum() / f_exp.sum()

                # one degree of freedom is removed per fitted parameter
                chisq, pval = chisquare(f_obs, f_exp, ddof=rv.numargs)
                model_params[rv.name].append(beta)
                model_pvalue[rv.name].append(pval)
            print('day %d: done' % i)

    for rv in models:
        model_params[rv.name] = np.asarray(model_params[rv.name])
        model_pvalue[rv.name] = np.asarray(model_pvalue[rv.name])
    return model_params, model_pvalue
 48+
 49+
 50+# pp.scatter(params.T[0], params.T[1], c='k', marker='.')
 51+# pp.show()
 52+
 53+
if __name__ == '__main__':
    # Script entry point. The fitted parameters and p-values stay bound at
    # module level once the run has finished.
    args = parser.parse_args()
    params, pvalues = main(args)

Status & tagging log