Index: trunk/tools/wsor/editor_lifecycle/scripts/mksamples |
— | — | @@ -1,47 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -#:vim:ft=python |
4 | | -# encoding:utf-8 |
5 | | - |
6 | | -''' groups user counts by day since registration ''' |
7 | | - |
8 | | -import os |
9 | | -from argparse import ArgumentParser |
10 | | -import numpy as np |
11 | | -from scipy.sparse import coo_matrix |
12 | | -from collections import deque |
13 | | -from contextlib import closing |
14 | | - |
# Command-line interface: one or more input archives plus an optional prefix
# that is prepended to the base name of every output file.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', metavar='file', nargs='+',
                    help='input archive(s) readable by numpy.load')
parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
                    metavar='PREFIX',
                    help='prefix prepended to each output file name')
19 | | - |
def main(args):
    '''
    Write, for every input archive, one text line per day holding the daily
    edit totals of all users active on that day.

    args must provide `input_paths` (archive paths readable by np.load) and
    `output_prefix` (string prepended to the output base name). The output
    file is written in the working directory unless the prefix contains a
    directory part, and gets a '.tsv' extension.
    '''
    for path in args.input_paths:
        output_path = args.output_prefix + os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'

        # day offset -> deque with one total per user covering that day
        day_counts = {}
        archive = np.load(path)
        try:
            print('%d users in %s' % (len(archive.files), path))
            for uid in archive.files:
                data = archive[uid].view(np.recarray)

                # keep only records in real (non-negative) namespaces
                data = data[data.ns >= 0]
                if len(data) == 0:
                    # min() below would raise on an empty array
                    continue

                # rows: days since the user's first retained record,
                # columns: namespaces; row sums give the daily totals
                offsets = data.day - data.day.min()
                totals = coo_matrix(
                    (data.edits, (offsets, data.ns))).tocsr().sum(axis=1)
                for day, n in enumerate(np.asarray(totals).ravel()):
                    day_counts.setdefault(day, deque()).append(int(n))
        finally:
            # np.load keeps the archive's file handle open until close()
            archive.close()

        # keys are day offsets starting at 0, so the last day is max(keys);
        # an archive with no usable data produces an empty file
        num_days = max(day_counts) + 1 if day_counts else 0

        # NOTE(review): the extension is .tsv but values are joined with the
        # separator as it appears in the source -- confirm the intended one.
        with open(output_path, 'w') as out_file:
            for day in range(num_days):
                line = ' '.join(map(str, day_counts.get(day, [])))
                out_file.write(line + '\n')
        print('%s saved.' % output_path)
45 | | - |
if __name__ == '__main__':
    # Script entry point: parse the command line and process every input.
    main(parser.parse_args())
Index: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday |
— | — | @@ -0,0 +1,64 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +#:vim:ft=python |
| 4 | +# encoding:utf-8 |
| 5 | + |
| 6 | +''' groups user counts by day since registration ''' |
| 7 | + |
| 8 | +import os |
| 9 | +from argparse import ArgumentParser |
| 10 | +import numpy as np |
| 11 | +from scipy.sparse import coo_matrix |
| 12 | +from collections import deque |
| 13 | + |
# Command-line interface: one or more input archives plus an optional prefix
# that is prepended to the base name of every output file.
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_paths', metavar='file', nargs='+')
# %(default)s (not %(metavar)s) makes argparse print the actual default value
parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
                    metavar='PREFIX', help="(default: %(default)s)")
| 18 | + |
def group_by_day(counts):
    '''
    Pool per-user daily edit totals into per-day cohort samples.

    counts is a mapping between user IDs and edits-by-namespace count data;
    each value is viewed as a record array with `day`, `ns` and `edits`
    fields.

    Returns a dict keyed by str(d), where d is the number of days since the
    user's earliest retained record. Each value is a deque holding one total
    (edits summed over namespaces) for every user whose activity span covers
    day d. Keys are strings so the result can be expanded as keyword
    arguments. Users left with no records after filtering are skipped.
    '''
    # hold cohort daily counts in a mapping in memory
    day_counts = {}

    for uid in counts:
        data = counts[uid].view(np.recarray)

        # NS < 0 are virtual. Filter out those edits because they are junk
        data = data[data.ns >= 0]
        if len(data) == 0:
            # nothing left for this user; min() below would raise ValueError
            continue

        # Sparse matrix (num_days x namespaces) where num_days is the activity
        # span in days. Summing along rows returns a dense (num_days x 1)
        # result, flattened here so each entry is a plain scalar.
        offsets = data.day - data.day.min()
        per_day = coo_matrix(
            (data.edits, (offsets, data.ns))).tocsc().sum(axis=1)

        # Add counts to cohort daily counts
        for day, total in enumerate(np.asarray(per_day).ravel()):
            day_counts.setdefault(str(day), deque()).append(int(total))

    return day_counts
| 47 | + |
def main(args):
    '''
    Group every input archive by day and save the result as an .npz file.

    args must provide `input_paths` (archive paths readable by np.load) and
    `out_prefix` (string prepended to the output base name).
    '''
    for path in args.input_paths:
        # if path is /a/b/c/whatever.npz, by default output will be in
        # $WD/daily_whatever.npz where $WD is the working dir
        out_path = args.out_prefix + os.path.basename(path)
        out_path = os.path.splitext(out_path)[0] + '.npz'

        # load input, group, save to file, tell user
        user_counts = np.load(path)
        try:
            N = len(user_counts.files)
            print('%d users in %s' % (N, path))
            day_counts = group_by_day(user_counts)
        finally:
            # np.load keeps the archive's file handle open until close();
            # the grouped counts no longer reference the archive
            user_counts.close()

        np.savez(out_path, **day_counts)
        print('%s saved (%d days).' % (out_path, len(day_counts)))
| 62 | + |
if __name__ == '__main__':
    # Script entry point: parse the command line and process every input.
    main(parser.parse_args())
Property changes on: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday |
___________________________________________________________________ |
Added: svn:executable |
1 | 66 | + * |