r94957 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r94956 | r94957 | r94958 >
Date:22:59, 18 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
now topcontributors.py plots the set similarity of top users over the years
Modified paths:
  • /trunk/tools/wsor/contribution_inequality/topcontributors.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/contribution_inequality/topcontributors.py
@@ -8,25 +8,77 @@
99
1010 from itertools import groupby
1111 from contextlib import closing
12 -from argparse import ArgumentParser
 12+from argparse import ArgumentParser, FileType
1313 from collections import deque
 14+from matplotlib.font_manager import FontProperties
 15+from datetime import date
1416
 17+import numpy as np
 18+import matplotlib.pyplot as pp
 19+
1520 parser = ArgumentParser(description=__doc__)
1621 parser.add_argument('data_path', metavar='data')
17 -parser.add_argument('maxlen', metavar='number', type=int)
 22+parser.add_argument('output_file', metavar='output_file', type=FileType('w'))
 23+parser.add_argument('-t', '--top', dest='maxlen', type=int, default=100,
 24+ help='Top users to list. default: %(default)d', metavar='NUM')
1825
 26+colors = 'bgrcmykw'
 27+styles = ['-', '--', '-.', ':']
 28+markers = 'ov^<>1234'
 29+
1930 if __name__ == '__main__':
2031
2132 ns = parser.parse_args()
22 -
 33+ databyns = {}
 34+
2335 with closing(open(ns.data_path)) as f:
2436 reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
2537 groupfunc = lambda row : (row['namespace'], row['year'])
2638 for key, subiter in groupby(reader, groupfunc):
2739 # smart way to keep only the tail
2840 users = deque((row['user_id'] for row in subiter ), maxlen=ns.maxlen)
29 - print '\t'.join(key + tuple(users))
30 - sys.stdout.flush()
 41+ print >> ns.output_file, '\t'.join(key + tuple(users))
 42+ ns.output_file.flush()
3143
 44+ NS, year = map(int, key)
 45+ try:
 46+ databyns[NS].append((year, set(users)))
 47+ except KeyError:
 48+ databyns[NS] = [ (year, set(users)) ]
 49+
 50+ figure = pp.figure(figsize=(8,4))
 51+ ax = figure.add_axes(pp.axes([.1,.1,.8,.8], axisbg='whitesmoke'))
 52+ i = 0
 53+ M = len(markers)
 54+ C = len(colors)
 55+ S = len(styles)
 56+
 57+ for key in databyns:
 58+ years, users = zip(*databyns[key])
 59+ years = [ date(year, 1, 1) for year in years ]
 60+ I = np.asfarray(map(len, map(set.intersection, users[1:], users[:-1])))
 61+ U = np.asfarray(map(len, map(set.union, users[1:], users[:-1])))
 62+ label = 'NS %s' % key
 63+
 64+ ax.plot(years[1:], I / U, label=label, marker=markers[i % M],
 65+ color=colors[i % C], linestyle=styles[i % S])
 66+ i += 1
 67+
 68+ pp.ylim(0,1)
 69+ pp.ylabel('similarity')
 70+ pp.title('Top %d contributors' % ns.maxlen)
 71+ pp.legend(loc='best', prop=FontProperties(size='small'))
 72+ pp.draw()
 73+
 74+ if not ns.output_file.isatty():
 75+ figure_path = os.path.splitext(ns.output_file.name)[0] + '.pdf'
 76+ pp.savefig(figure_path, fmt='pdf')
 77+ print 'figure saved to %s' % figure_path
 78+ print 'output saved to %s' % ns.output_file.name
 79+
 80+ pp.show()
3281
 82+ if not ns.output_file.isatty():
 83+ ns.output_file.close()
3384
 85+

Status & tagging log