r113493 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r113492‎ | r113493 | r113494 >
Date:18:29, 9 March 2012
Author:giovanni
Status:new
Tags:
Comment:
renamed mksamples -> groupbyday; groupbyday now writes into numpy binary array files
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/scripts/groupbyday (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/scripts/mksamples (deleted) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
@@ -1,47 +0,0 @@
2 -#!/usr/bin/python
3 -#:vim:ft=python
4 -# encoding:utf-8
5 -
6 -''' groups user counts by day since registration '''
7 -
8 -import os
9 -from argparse import ArgumentParser
10 -import numpy as np
11 -from scipy.sparse import coo_matrix
12 -from collections import deque
13 -from contextlib import closing
14 -
# Command-line interface: one or more input archives, plus an optional
# prefix that is prepended to the name of every output file.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', nargs='+', metavar='file')
parser.add_argument(
    '-p', '--prefix',
    default='',
    dest='output_prefix',
    metavar='PREFIX'
)
19 -
def main(args):
    '''
    For each input archive, write a text file with one line per day since
    the user's first recorded edit; each line holds the space-separated
    daily edit counts of the users in the archive.

    args must provide ``input_paths`` (list of .npz archive paths) and
    ``output_prefix`` (string prepended to the output base name). The
    output file is named <prefix><input base name>.tsv.

    Each archive member is expected to be a record array with ``day``,
    ``ns`` and ``edits`` fields.
    '''
    for path in args.input_paths:
        output_path = args.output_prefix + os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'
        day_counts = {}
        archive = np.load(path)
        try:
            print('%d users in %s' % (len(archive.files), path))
            for uid in archive.files:
                data = archive[uid].view(np.recarray)
                # keep only edits in non-negative namespaces
                data = data[data.ns >= 0]
                if len(data) == 0:
                    # nothing left for this user; min() below would raise
                    continue
                # sparse (days x namespaces) matrix, summed along rows to
                # get the total number of edits per day
                counts = coo_matrix(
                    (data.edits, (data.day - data.day.min(), data.ns))
                ).tocsr().sum(axis=1)
                for day in range(counts.shape[0]):
                    n = int(counts[day, 0])
                    day_counts.setdefault(day, deque()).append(n)
        finally:
            # release the file handle held by the archive
            archive.close()
        # days are numbered from 0, so the last day is max(day_counts)
        # and must be included in the output
        num_days = max(day_counts) + 1 if day_counts else 0
        with open(output_path, 'w') as out_file:
            for day in range(num_days):
                row = ' '.join(map(str, day_counts.get(day, [])))
                out_file.write(row + '\n')
        print('%s saved.' % output_path)
45 -
if __name__ == '__main__':
    # Parse the command line and hand the resulting namespace to main().
    main(parser.parse_args())
Index: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
@@ -0,0 +1,64 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+# encoding:utf-8
 5+
 6+''' groups user counts by day since registration '''
 7+
 8+import os
 9+from argparse import ArgumentParser
 10+import numpy as np
 11+from scipy.sparse import coo_matrix
 12+from collections import deque
 13+
# Command-line interface: one or more input archives, plus an optional
# prefix that is prepended to the name of every output file.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', metavar='file', nargs='+')
# %(default)s (not %(metavar)s) so that --help shows the actual default
# prefix instead of the placeholder name
parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
                    metavar='PREFIX', help="(default: %(default)s)")
 18+
def group_by_day(counts):
    '''
    Group per-user daily edit counts by day since the user's first edit.

    counts is a mapping between user IDs and edits-by-namespace count data:
    each value is a record array with ``day``, ``ns`` and ``edits`` fields.

    Returns a dict whose keys are day offsets as strings ('0', '1', ...)
    and whose values are deques with one total edit count per user that
    was still within its activity span on that day. Users with no edits in
    a non-negative namespace are skipped.
    '''
    # hold cohort daily counts in a mapping in memory
    day_counts = {}

    for uid in counts:
        data = counts[uid].view(np.recarray)

        # NS < 0 are virtual. Filter out those edits because they are junk
        data = data[data.ns >= 0]
        if len(data) == 0:
            # nothing left for this user; data.day.min() would raise on an
            # empty array
            continue

        # Sparse matrix (num_days x namespaces) where num_days is the
        # activity span in days. Summing along rows returns a dense
        # (num_days x 1) matrix
        counts_matrix = coo_matrix(
            (data.edits, (data.day - data.day.min(), data.ns))
        ).tocsc().sum(axis=1)

        # Add counts to cohort daily counts. Keys are strings because the
        # mapping is later passed as keyword arguments to np.savez
        for day in range(counts_matrix.shape[0]):
            n = int(counts_matrix[day, 0])
            day_counts.setdefault(str(day), deque()).append(n)

    return day_counts
 47+
def main(args):
    '''
    Group every input archive by day and save the result as a new archive.

    args must provide ``input_paths`` (list of .npz archive paths) and
    ``out_prefix`` (string prepended to the output base name). For each
    input, the grouped counts returned by group_by_day are written with
    np.savez to <out_prefix><input base name>.npz.
    '''
    for path in args.input_paths:
        # if path is /a/b/c/whatever.npz, by default output will be in
        # $WD/daily_whatever.npz where $WD is the working dir
        out_path = args.out_prefix + os.path.basename(path)
        out_path = os.path.splitext(out_path)[0] + '.npz'

        # load input, group, save to file, tell user
        user_counts = np.load(path)
        try:
            print('%d users in %s' % (len(user_counts.files), path))
            day_counts = group_by_day(user_counts)
        finally:
            # release the file handle held by the archive before moving on
            # to the next input
            user_counts.close()
        np.savez(out_path, **day_counts)
        print('%s saved (%d days).' % (out_path, len(day_counts)))
 62+
if __name__ == '__main__':
    # Parse the command line and hand the resulting namespace to main().
    main(parser.parse_args())
Property changes on: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
___________________________________________________________________
Added: svn:executable
166 + *

Status & tagging log