Index: trunk/tools/wsor/editor_lifecycle/mkcohort
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+#:vim:ft=python
+
+''' finding better cohorts by real user activity '''
+
+import os
+import sys
+import csv
+import numpy as np
+
+from collections import deque
+from itertools import groupby
+from argparse import ArgumentParser, FileType
+from dateutil.parser import parser as DateParser
+
+__prog__ = os.path.basename(__file__)
+
+def yearkey(date):
+    return date.year,
+
+def monthkey(date):
+    return date.year, date.month
+
+def daykey(date):
+    return date.year, date.month, date.day
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data', type=FileType('r'))
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument('--day', dest='keyfunc', const=daykey, action='store_const')
+group.add_argument('--month', dest='keyfunc', const=monthkey, action='store_const')
+group.add_argument('--year', dest='keyfunc', const=yearkey, action='store_const')
+
+parser.add_argument('--minspan', default=3600, type=int)
+parser.add_argument('--mincount', default=2, type=int)
+
+date_parser = DateParser()
+fieldnames = ['user', 'max_timestamp', 'min_timestamp', 'editcount']
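+# input: headerless TSV rows of (user id, last edit time, first edit time,
+# edit count); each output line lists, for one (cohort date, rate magnitude)
+# pair, the matching user ids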
+
+def activity(rowiterator, mincount, minspan):
+    for row in rowiterator:
+        user = float(row['user'])
+        min_timestamp = date_parser.parse(row['min_timestamp'])
+        max_timestamp = date_parser.parse(row['max_timestamp'])
+        editcount = float(row['editcount'])
+        span = (max_timestamp - min_timestamp).total_seconds()
+        if (editcount > mincount) and (span > minspan):
+            yield user, editcount / span
+
+def magnitude(x):
+    return np.floor(np.log10(x))
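+# e.g. magnitude(0.02) == -2.0 and magnitude(1.5) == 0.0 (illustrative
+# values), so rates get binned by their order of magnitude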
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    reader = csv.DictReader(ns.data, fieldnames=fieldnames, delimiter='\t',
+            quoting=csv.QUOTE_NONE)
+
+    def cohortdatefunc(row):
+        return ns.keyfunc(date_parser.parse(row['min_timestamp']))
+
+    for cohortdate, subiter in groupby(reader, cohortdatefunc):
+        data = np.asarray(list(activity(subiter, ns.mincount, ns.minspan)))
+        if len(data) > 0:
+            user, rate = data.T
+            idx = rate.argsort()
+            data = list(data[idx])
+            kf = lambda k : magnitude(k[1])
+            for m, subsubiter in groupby(data, kf):
+                outrow = deque([ '-'.join(map(str, cohortdate)), '%d' % m ])
+                for u, e in subsubiter:
+                    outrow.append('%d' % u)
+                print '\t'.join(outrow)
+
Property changes on: trunk/tools/wsor/editor_lifecycle/mkcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/setup.py
@@ -0,0 +1,16 @@
+from distutils.core import setup
+
+setup(
+    name='lifecycle',
+    description='WMF summer of research project',
+    version='0.0.0',
+    author='Giovanni Luca Ciampaglia',
+    author_email='gciampaglia@wikimedia.org',
+    packages=['lifecycle'],
+    scripts=[
+        'fetchrates',
+        'fitting',
+        'relax',
+        'mkcohort',
+    ]
+)
Index: trunk/tools/wsor/editor_lifecycle/fetchrates
@@ -0,0 +1,126 @@
+#!/usr/bin/python
+
+import sys
+import os
+import numpy as np
+
+from zipfile import ZipFile
+from contextlib import closing
+from tempfile import mkdtemp
+from oursql import connect
+from argparse import ArgumentParser
+from datetime import datetime
+
+from lifecycle.rates import computerates
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_path', metavar='cohort')
+parser.add_argument('-config', dest='config_file')
+parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
+parser.add_argument('-v', '--verbose', action='store_true')
+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
+        dest='only')
+parser.add_argument('--save-cohort', dest='rates_only', action='store_false')
+
+query = """
+select unix_timestamp(rev_timestamp)/86400.0, page_namespace
+from revision r join page p
+on r.rev_page = p.page_id
+where rev_user = ?
+order by rev_timestamp
+"""
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+# def iterateoversets(cursor):
+#     yield list(cursor)
+#     while cursor.nextset():
+#         yield list(cursor)
+
+def process(rows, user_id, output_dir=os.path.curdir):
+    if len(rows) == 0:
+        print >> sys.stderr, '%s: error: empty revision history for user %d' % (__prog__,
+                user_id)
+        return
+
+    rev_time, rev_ns = np.asfarray(rows).T
+    m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
+    uns = sorted(np.unique(rev_ns))
+    bins_time = np.arange(m, M + 1)
+    bins_ns = uns + [uns[-1] + 1]
+    rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time, bins_ns))
+    I, J = np.nonzero(rates)
+    data = [ (days[i], uns[j], rates[i,j]) for i, j in zip(I, J) ] # in coordinate format
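+    # e.g. three main-namespace edits made on day 15000 since the epoch
+    # become the single record (15000, 0, 3) (hypothetical values)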
+    dtype = np.dtype([('day', int), ('ns', int), ('edits', int)])
+    data = np.asarray(data, dtype=dtype)
+    out_path = os.path.join(output_dir, '%d.npy' % user_id)
+    np.save(out_path, data)
+    return out_path
+
+def main(ns):
+    # get mysql client configuration file
+    mycnf = os.path.expanduser('~/.my.cnf')
+    if ns.config_file is None and not os.path.exists(mycnf):
+        print >> sys.stderr, ('%s: no config file specified and $HOME/.my.cnf'
+                ' not found') % __prog__
+        sys.exit(1)
+    elif ns.config_file is None:
+        ns.config_file = mycnf
+
+    # test that the output directory exists
+    if not os.path.exists(ns.output_dir):
+        print >> sys.stderr, '%s: output directory does not exist: %s' % (
+                __prog__, ns.output_dir)
+        sys.exit(1)
+    if not os.path.isdir(ns.output_dir):
+        print >> sys.stderr, '%s: not a directory: %s' % (__prog__, ns.output_dir)
+        sys.exit(1)
+
+    # read user ids from cohort file, create zip archive and temp dir
+    with closing(open(ns.input_path)) as f:
+        line = f.readline().strip()
+        if line:
+            user_ids = map(int, line.split('\t'))
+        else:
+            print >> sys.stderr, '%s: error: empty input file: %s' % (__prog__,
+                    ns.input_path)
+            sys.exit(1)
+    zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
+    zip_path = os.path.join(ns.output_dir, zip_path)
+    temp_dir = mkdtemp()
+
+    with closing(ZipFile(zip_path, 'w')) as zf:
+
+        # connect and run query
+        conn = connect(read_default_file=ns.config_file)
+        for uid in user_ids:
+            # compute rates and save to file
+            with conn.cursor() as cursor:
+                cursor.execute(query, (uid,))
+                rows = list(cursor)
+            path = process(rows, uid, temp_dir)
+            if path is None:
+                continue
+            zf.write(path, os.path.basename(path))
+            os.remove(path)
+
+    # remove temp dir
+    os.removedirs(temp_dir)
+
+    # compute rates for this cohort and save them to file
+    rate_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.tsv'
+    rate_path = os.path.join(ns.output_dir, rate_path)
+    rates = computerates(zip_path, ns.every, onlyns=ns.only)
+    np.savetxt(rate_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
+    if ns.rates_only:
+        os.remove(zip_path)
+        print '%s: output saved to %s' % (datetime.now(), rate_path)
+    else:
+        print '%s: output saved to %s, cohort data to %s' % (datetime.now(),
+                rate_path, zip_path)
+
+if __name__ == '__main__':
+    # get arguments from command line
+    ns = parser.parse_args()
+    main(ns)
+
Property changes on: trunk/tools/wsor/editor_lifecycle/fetchrates
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/MANIFEST.in
@@ -0,0 +1,5 @@
+include *.py
+include *.sh
+include *.sql
+include sge/*.sh
+include obsolete/*
Index: trunk/tools/wsor/editor_lifecycle/relax
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+#:vim:ft=python
+
+''' batch model fitting (usable with xargs) '''
+
+import re
+import os
+import sys
+import numpy as np
+from argparse import ArgumentParser
+
+from lifecycle.models import StretchedExpon
+
+__prog__ = os.path.basename(__file__)
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data')
+parser.add_argument('-m', '--min-size', type=int, default=0)
+parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
+parser.add_argument('--maxfev', type=int, default=10000)
+parser.add_argument('--debug', action='store_true')
+parser.add_argument('--split-name', action='store_true', help='split input file'
+        ' name into a date and a rate part.')
+
+if __name__ == '__main__':
+    # parse command line
+    ns = parser.parse_args()
+
+    # read data, filter data
+    x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
+    idx = (ye > 0) * (n > ns.min_size)
+    if idx.sum() == 0:
+        print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
+                % (__prog__, ns.data_file)
+        sys.exit(1)
+    if idx.sum() < 4:
+        print >> sys.stderr, '%s: error: non-identifiable data in %s'\
+                % (__prog__, ns.data_file)
+        sys.exit(1)
+
+    # create model, set constraints
+    model = StretchedExpon()
+    if ns.constrain in ['head', 'both']:
+        model.A = y[np.argmin(np.abs(x))]
+    if ns.constrain in ['tail', 'both']:
+        model.C = y.min()
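+    # e.g. passing -c both pins A to the rate observed nearest day zero and C
+    # to the smallest observed rate, leaving only tau and beta free in the fit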
+
+    # fit model
+    try:
+        pest, pcov = model.fit(x[idx], y[idx], ye[idx], maxfev=ns.maxfev, warning=False)
+    except ValueError, e:
+        print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
+                e.message, ns.data_file)
+        if ns.debug:
+            raise
+        else:
+            sys.exit(1)
+    if np.isscalar(pcov) or np.isinf(pcov).any():
+        print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (
+                __prog__, ns.data_file)
+        sys.exit(1)
+
+    # compute errors, MRT
+    perr = np.sqrt(np.diag(pcov)) / 2.
+    model.setparams(*zip(pest, perr))
+    mrt = model.mrt(model.tau, model.beta)
+    N = len(model.__params__)
+    params = np.empty((N * 2 + 1,), dtype=float)
+    params[:N] = [ model.A, model.tau, model.beta, model.C ]
+    params[N:2*N] = map(lambda k : k or np.nan, [ model.A_err, model.tau_err,
+            model.beta_err, model.C_err ])
+    params[-1] = mrt
+
+    # print output line
+    key, _ = os.path.splitext(ns.data_file)
+    if ns.split_name:
+        key = key.split('_')
+    else:
+        key = [ key ]
+    params = map(lambda k : '%12.5g' % k, params)
+    print '\t'.join(key + params)
Property changes on: trunk/tools/wsor/editor_lifecycle/relax
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+# coding: utf-8
+# :vim:ft=python
+
+# TODO: obsolete
+
+'''
+creates cohort files, filtering out bots
+
+This script reads two files: a ZIP archive file, and an index file, which is a
+tab-separated text file like the following:
+
+    34 WojPob 20010129110725 2524
+    94 AstroNomer 20010207222248 1532
+    43 Lee Daniel Crocker 20010314020407 4388
+    86 Stephen Gilbert 20010326191355 3599
+    3 Tobias Hoevekamp 20010326202105 1903
+    1273 Wathiik 20010510171751 1772
+    3371 Arno 20010721180708 2700
+    122 Ap 20010722201619 2137
+    182 Rjstott 20010726102546 2602
+    64 Uriyan 20010727141651 1634
+
+where the fields are: id, name, date, count. Dates are parsed using dateutil, so
+other formats are allowed too (e.g. 2010-01-29 11:07:25).
+
+The script aggregates users based on the date field and looks up files of the
+form <id>.npy in the archive file. Each of these files contains the daily edit
+counts of a single user, stored using the NumPy binary array format. A relative
+path within the ZIP archive can be specified from the command line with
+-P/--datapath. Once the data for a cohort (i.e. an aggregated group of users)
+have been collected, the script computes the average activity rate since the
+first day of activity for all users in that cohort.
+
+The script produces two files for each cohort: a tab-separated values file with
+the cohort average activity rate, and a compressed NumPy binary archive with the
+user data array files.
+
+For each discovered cohort, the script prints on the console the date of the
+cohort, how many users it contains, and how many suspected bot users it filtered
+out from the index. Use --bots to disable this check and always include them.
+The check is as follows: if the name contains the pattern 'bot' at the beginning
+or at the end of any word, the user is filtered out (e.g. "Botuser IV" will
+match, but "Francis Abbott" won't). If arguments -mincount or -maxcount (or
+both) are passed, the script processes only users whose edit count is above the
+minimum count, or below the maximum count, or both.
+
+Please note that the index file must already be sorted by date, in order for the
+group-by-date aggregation to work. You can use `sort' from the command line,
+e.g.:
+
+    $~ sort -t$'\t' -k3 -h unsorted.tsv
+
+should sort file unsorted.tsv.
+'''
+
+import re
+import os
+import sys
+import csv
+import numpy as np
+from argparse import ArgumentParser, FileType
+from contextlib import closing
+from itertools import groupby
+from dateutil.parser import parser as DateParser
+from datetime import datetime
+from zipfile import ZipFile
+
+from rates import computerates
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+_botpat = r'\bbot|bot\b'
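+# e.g. 'Botuser IV' and 'SpellBot' match the pattern, 'Francis Abbott' does not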
+_fields = ['id', 'name', 'date', 'count']
+
+def yearkey(date):
+    return date.year,
+
+def monthkey(date):
+    return date.year, date.month
+
+def daykey(date):
+    return date.year, date.month, date.day
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')
+parser.add_argument('archive_path', metavar='archive', help='data archive in ZIP '
+        'format')
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument('--year', help='group by year', action='store_const',
+        const=yearkey, dest='keyfunc')
+group.add_argument('--month', help='group by month', action='store_const',
+        const=monthkey, dest='keyfunc')
+group.add_argument('--day', help='group by day', action='store_const',
+        const=daykey, dest='keyfunc')
+parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
+parser.add_argument('-P', '--datapath', help='relative path of files within '
+        'archive', default='')
+parser.add_argument('-mincount', type=int)
+parser.add_argument('-maxcount', type=int)
+parser.add_argument('-minperyear', type=int)
+parser.add_argument('-maxperyear', type=int)
+parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
+        'console all actions, but do not produce any file')
+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
+        dest='only')
+
+dateparser = DateParser()
+
+# dummy ZipFile class in case we do not want to do anything!
+class DummyZipFile:
+    def __init__(self, fn, mode):
+        pass
+    def close(self):
+        pass
+    def write(self, fn, *args):
+        pass
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
+            delimiter='\t')
+    archive = ZipFile(ns.archive_path)
+
+    def _keyfunc(row):
+        try:
+            date = dateparser.parse(row['date'])
+        except:
+            print row
+            raise
+
+        return ns.keyfunc(date)
+
+    # group the index by date of registration
+    for key, subiter in groupby(reader, _keyfunc):
+
+        # reset counters and define output file names from the cohort period
+        tot_users = 0
+        tot_bots = 0
+        datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) -> '2010-01'
+        zipfn = '{}.npz'.format(datestr)
+        tsvfn = '{}.tsv'.format(datestr)
+
+        # if the user wants to do a dry run, replace the ZipFile class with the
+        # dummy one
+        if ns.dry_run:
+            ZipFile = DummyZipFile
+
+        # for each user, determine whether it may go in the cohort
+        with closing(ZipFile(zipfn, 'w')) as zf:
+            for row in subiter:
+
+                # compute user details (edit count, yearly activity rate, etc.)
+                # and other useful variables
+                user_id = row['id']
+                count = int(row['count'])
+                user_date = dateparser.parse(row['date'])
+                now_date = datetime.now()
+                activity_span = float((now_date - user_date).days) # in days
+                yearly_rate = count / activity_span * 365.0
+                bot_flag = re.search(_botpat, row['name'], re.I) is not None
+                tot_bots += bot_flag # update count of bot matches
+
+                # define paths
+                basepath = '{}.npy'.format(user_id)
+                archivepath = os.path.join(ns.datapath, basepath)
+
+                # check cohort membership (keep if the conjunction of all given
+                # criteria is true, that is, discard if any given criterion is
+                # false)
+                if ns.mincount is not None and count <= ns.mincount:
+                    continue
+                if ns.maxcount is not None and count >= ns.maxcount:
+                    continue
+                if ns.minperyear is not None and yearly_rate <= ns.minperyear:
+                    continue
+                if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
+                    continue
+                # the user can turn this test off by passing --bots
+                if not ns.bots and bot_flag:
+                    continue
+                try:
+                    zf.writestr(basepath, archive.read(archivepath))
+                except KeyError:
+                    print >> sys.stderr, '%s: warning: %s not in archive' %\
+                            (__prog__, archivepath)
+                tot_users += 1
+
+        if tot_users > 0:
+            rates = computerates(zipfn, ns.every, onlyns=ns.only)
+            np.savetxt(tsvfn, rates, fmt='%f')
+
+        print '%s: %s, %s created (users: %5d, skipped bots %5d)' % (
+                __prog__, tsvfn, zipfn, tot_users, tot_bots)
+        sys.stdout.flush()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+# vim:ft=python:
+# coding: utf-8
+
+# TODO: obsolete
+
+''' fetches a cohort based on year of registration and editing activity '''
+
+from argparse import ArgumentParser
+from oursql import connect
+import os
+import sys
+import datetime as dt
+import csv
+
+prog = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
+parser.add_argument('registration_year', metavar='year', type=int)
+parser.add_argument('min_activity', metavar='minedits', type=int)
+parser.add_argument('max_activity', metavar='maxedits', type=int)
+parser.add_argument('-c', '--config', dest='config_file')
+parser.add_argument('-l', '--limit', type=int)
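+# e.g. `fetchcohort 2005 100 1000 -l 50' prints at most 50 non-bot users who
+# registered in 2005 with an edit count strictly between 100 and 1000
+# (illustrative values)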
| 25 | + |
| 26 | +query = ''' |
| 27 | +select |
| 28 | + user_id, |
| 29 | + user_name, |
| 30 | + user_registration, |
| 31 | + user_editcount |
| 32 | +from user u left join user_groups ug |
| 33 | +on u.user_id = ug.ug_user |
| 34 | +where |
| 35 | + (ug_group <> 'bot' or ug_user is null) |
| 36 | + and year(user_registration) = ? |
| 37 | + and user_editcount > ? |
| 38 | + and user_editcount < ? |
| 39 | +''' |
| 40 | + |
| 41 | +if __name__ == '__main__': |
| 42 | + ns = parser.parse_args() |
| 43 | + if ns.min_activity >= ns.max_activity: |
| 44 | + print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog |
| 45 | + sys.exit(1) |
| 46 | + if ns.registration_year < 2001 or ns.registration_year > dt.datetime.now().year: |
| 47 | + print >> sys.stderr, '%s: error: illegal year: %d' % (prog, |
| 48 | + ns.registration_year) |
| 49 | + sys.exit(1) |
| 50 | + |
| 51 | + if ns.limit is not None: |
| 52 | + query += 'limit %d' % ns.limit |
| 53 | + |
| 54 | + if ns.config_file is None: |
| 55 | + ns.config_file = os.path.expanduser('~/.my.cnf') |
| 56 | + |
| 57 | + conn = connect(read_default_file=ns.config_file) |
| 58 | + writer = csv.writer(sys.stdout, dialect='excel-tab') |
| 59 | + cursor = conn.cursor() |
| 60 | + cursor.execute(query, (ns.registration_year, ns.min_activity, ns.max_activity)) |
| 61 | + for row in cursor: |
| 62 | + writer.writerow(row) |
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
@@ -0,0 +1,14 @@
+-- user ID, user name, timestamp of first edit, and edit count of all registered
+-- users that are not flagged bots. N.B. there might still be unflagged bots.
+
+select
+    rev_user as user_id,
+    rev_user_text as user_name,
+    min(rev_timestamp) as first_timestamp,
+    count(rev_timestamp) as editcount
+from
+    revision r use index (usertext_timestamp) left join user_groups g
+on r.rev_user = g.ug_user
+where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0
+group by rev_user_text
+-- limit 100
Index: trunk/tools/wsor/editor_lifecycle/obsolete/rates
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+#:vim:ts=python:
+
+''' compute editor lifecycle '''
+
+import re
+import os
+import sys
+from argparse import ArgumentParser
+import numpy as np
+from collections import deque
+import datetime as dt
+
+from lifecycle.rates import *
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data')
+parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
+parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
+parser.add_argument('-key')
+parser.add_argument('-every', type=int, help='default: %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-inactivity', type=int, default=180, help='default: '
+        '%(default)d days', metavar='NUM')
+parser.add_argument('-all', dest='dump_all', action='store_true')
+
+
+def main(ns):
+    if ns.key is None:
+        m = re.match(r'(.*?)\.npz', ns.data_file, re.I)
+        if m is not None:
+            ns.key = m.groups()[0]
+        else:
+            print >> sys.stderr, '%s: cannot determine key from file name: %s'\
+                    % (__prog__, ns.data_file)
+            sys.exit(1)
+    if ns.minimum_activity >= ns.maximum_activity:
+        print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
+        sys.exit(1)
+
+    # load data
+    npzarchive = np.load(ns.data_file)
+
+    if ns.dump_all:
+        # mkfn was undefined here; this mirrors the '%s_%s.%s' naming used below
+        fn = '%s_cycles.npz' % ns.key
+        values_iter = itercycles(npzarchive, ns.every)
+        keys = npzarchive.files
+        tmp = dict(zip(keys, list(values_iter)))
+        np.savez(fn, **tmp)
+        print '%s: output saved to %s' % (__prog__, fn)
+    else:
+        # compute lifetime distribution
+        lt = lifetimes(npzarchive)
+
+        # compute inactive subgroups
+        inactive_users = find_inactives(npzarchive, ns.inactivity, ns.minimum_activity,
+                ns.maximum_activity)
+
+        ratesbyday = groupbyday(npzarchive, ns.every)
+        ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
+
+        # `average' is the name defined in lifecycle.rates
+        avg_all = average(ratesbyday)
+        avg_inact = average(ratesbyday_inact)
+
+        lens = [ len(npzarchive.files), len(inactive_users) ]
+
+        names = ['lt', 'len', 'all', 'inact' ]
+        arrs = [ lt, lens, avg_all, avg_inact ]
+
+        for n, a in zip(names, arrs):
+            fn = '%s_%s.%s' % (ns.key, n, 'tsv')
+            np.savetxt(fn, a)
+            print '%s: output saved to %s' % (__prog__, fn)
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    main(ns)
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/rates
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+
+''' plot editor life cycle '''
+
+import sys
+import numpy as np
+from argparse import ArgumentParser
+import os
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_files', metavar='data', nargs='+')
+parser.add_argument('-l', '--label', metavar='TEXT', action='append',
+        dest='labels_list')
+parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
+parser.add_argument('-batch', action='store_true', help='uses PDF backend')
+parser.add_argument('-title')
+parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+
+    # checks
+    if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
+        print >> sys.stderr, '%s: error: please provide as many labels '\
+                'as data files' % __prog__
+        sys.exit(1)
+
+    # import pyplot, make lists of colors and markers
+    if ns.batch:
+        import matplotlib
+        matplotlib.use('PDF')
+    import matplotlib.pyplot as pp
+    from matplotlib.lines import lineMarkers as markers
+    markers = dict(filter(
+            lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing',
+            markers.items())).keys()
+    colors = 'krbgm'
+
+    # create figure and axes
+    fig = pp.figure()
+    ax = pp.axes([.1, .1, .85, .8])
+
+    # add lines
+    N = len(ns.data_files)
+    for i in xrange(N):
+        data_file = ns.data_files[i]
+        if ns.labels_list is not None:
+            label = ns.labels_list[i]
+        else:
+            label = 'line-%d' % (i + 1)
+        color = colors[i % len(colors)]
+        marker = markers[i % len(markers)]
+        x, y, ye = np.loadtxt(data_file, unpack=1)
+        ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
+                mec=color, ls=':', label=label)
+
+    ax.legend(loc=2)
+    ax.set_xlabel('days since registration')
+    ax.set_ylabel('edits/day')
+    if ns.title is not None:
+        ax.set_title(ns.title)
+    ax.axis('tight')
+
+    # plot hist of lifetimes in inset axes
+    if ns.inset_data_file is not None:
+        lt = np.loadtxt(ns.inset_data_file)
+        inax = pp.axes([.55, .6, .35, .25], axisbg='none')
+        inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
+        for l in inax.xaxis.get_ticklabels():
+            l.set_rotation(30)
+            l.set_fontsize('x-small')
+        for l in inax.yaxis.get_ticklabels():
+            l.set_fontsize('x-small')
+        inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
+        inax.set_ylabel('no. of users older\nthan $x$ days',
+                fontsize='small')
+        inax.set_title('account lifetime')
+        inax.axis('tight')
+
+    pp.draw()
+    if ns.title is not None:
+        fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
+    else:
+        fn = 'output.' + ns.fmt
+    print 'output saved to %s' % fn
+
+    pp.savefig(fn, format=ns.fmt)
+    pp.show()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This script writes to output a list of registered, not-flagged-as-bot users,
+# sorted by time of first edit. Each item in the list comprises:
+#
+# 1. user_id
+# 2. user_name
+# 3. first_timestamp
+# 4. editcount
+#
+# For the SQL query, check file userlist.sql.
+
+srcdir=`dirname $(type -p $0)`
+mysql -BN < $srcdir/userlist.sql | sort -h -k3 -t $'\t'
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Applies the `fitting' script to a batch of files
+#
+# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
+#
+# USAGE: fitting_batch.sh file1 file2 file3 ...
+#
+# This will produce the normal console output that fitting produces; PDF plots
+# will be stored in file fit.pdf (please note: the script refuses to run if
+# fit.pdf already exists, so existing versions are never overwritten)
+
+if [[ -z `type -p fitting` ]] ; then
+    echo 'error: could not find fitting script. Check your PATH'
+    exit 1
+fi
+
+if [[ -e fit.pdf ]] ; then
+    echo 'error: cannot overwrite file fit.pdf'
+    exit 1
+fi
+
+O=`mktemp -d`
+models="expon powerlaw stretchedexp"
+files="$@"
+
+for file in $files ; do
+    for model in $models ; do
+        fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
+        echo
+        echo
+    done
+done
+
+pdfs=`ls $O/*.pdf | sort`
+
+gs -dNOPAUSE -sDEVICE=pdfwrite -sOutputFile=fit.pdf -dBATCH $pdfs &>/dev/null
+
+if [[ $? = 0 ]] ; then
+    echo 'images saved in fit.pdf'
+else
+    echo "error: problem saving fit.pdf. Individual image files in $O"
+fi
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/sge/rates.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+#$ -N rates
+#$ -j y
+#$ -o query.log
+#$ -t 1-NNNNN
+#$ -tc 10
+#$ -l sqlprocs-s1=1
+#$ -m n
+
+# Use this script to run the query on the toolserver using its Sun Grid Engine
+# batch job scheduling system. SGE will run this script in parallel for each
+# input file in input/, and place the output in (surprise..) output/
+#
+# Each input file will be fed to this script by SGE. However, SGE does not have
+# any way to tell how many input files you have to process, so you have to
+# manually change the comment line on top of this file to reflect this
+# information:
+#
+#     #$ -t 1-NNNNN
+#
+# once you have done this, you can simply submit the job to SGE using qsub:
+#
+#     ~$ qsub < rates.sh
+#
+# See man qsub for more info on that.
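+#
+# For example, with 500 files in input/ the line should read "#$ -t 1-500";
+# `ls input/ | wc -l' is one quick way to count them.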
+
+uid=$((SGE_TASK_ID-1))
+infile=$uid.in
+export PATH=$HOME/local/bin:$PATH
+fetchrates -o $HOME/output $HOME/input/$infile
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/models.py
@@ -0,0 +1,371 @@
+# coding: utf8
+
+import numpy as np
+from scipy.stats import norm, chisqprob, normaltest
+from scipy.optimize import curve_fit
+from scipy.special import gamma
+from cStringIO import StringIO
+import datetime as dt
+#from scikits.statsmodels.api import OLS
+
+__all__ = ['Expon', 'PowerLaw', 'StretchedExpon' ]
+
+class Parameter(object):
+    '''
+    Class for parameter descriptors. Works with ParameterMixin
+    '''
+    def __init__(self, name, attrlist):
+        self.name = name # parameter name
+        for att in attrlist:
+            if att.name == name:
+                raise AttributeError('cannot add parameter {}'.format(name))
+        attrlist.append(self)
+    def __get__(self, instance, owner):
+        if instance is not None:
+            return instance.__dict__['_' + self.name]
+        return self
+    def __set__(self, instance, value):
+        try:
+            value, error = value
+        except TypeError:
+            value, error = value, None
+        instance.__dict__['_' + self.name] = value
+        instance.__dict__[self.name + '_err'] = error
+    def __repr__(self):
+        return '<Parameter {} at 0x{}>'.format(self.name, '%x' % id(self))
+
+class ParameterMixin(object):
+    '''
+    Class that lets you look up all Parameter instances in __params__
+    '''
+    def itererrors(self):
+        for p in self.__params__:
+            yield self.__getattribute__(p.name + '_err')
+    def errors(self):
+        return list(self.itererrors())
+    def iterparams(self):
+        '''
+        Returns an iterator over all parameters of this model
+        '''
+        for p in self.__params__:
+            yield self.__getattribute__(p.name)
+    def params(self):
+        '''
+        Returns a list of all parameters of this model
+        '''
+        return list(self.iterparams())
+    def setparams(self, *args):
+        '''
+        Sets the unset parameters of this model to *args. Parameters that
+        already have a value will *NOT* be modified by this method.
+        '''
+        keyf = lambda p : self.__getattribute__(p.name) is None
+        for p, a in zip(filter(keyf, self.__params__), args):
+            setattr(self, p.name, a)
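+    # e.g. on a StretchedExpon with A already set, setparams(1, 0.5, 0)
+    # assigns tau, beta and C, in declaration order (illustrative values)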
+
+def _orNA(val, fmt='%8.5g'):
+    if val is not None:
+        return fmt % val
+    else:
+        return 'N/A'
+
+class ParametricModel(ParameterMixin):
+    '''
+    Callable class with Parameter descriptors. Subclasses of ParametricModel
+    ought to define, as class attributes, any number of Parameter descriptors
+    at the class level, together with a list (conventional name: `__params__').
+    See Parameter.__init__ on how to instantiate a Parameter descriptor.
+
+    Subclasses ought also to define two static methods: `func' and `init'. The
+    first is the actual function that accepts an argument together with the same
+    number of parameters as in __params__. The second is used to get initial
+    estimates for the Levenberg-Marquardt leastsq minimizer used to fit this
+    model.
+
+    From that point on, any instance of this class acts as the function `func'
+    itself, with the only difference that it automatically performs partial
+    application for those Parameter attributes that are being assigned a value.
+    Example:
+
+    # expon.func(x, A, B) is A * exp(B * x)
+    >>> expon(1, 2, -1)
+    0.73575888234288467
+    >>> expon.A = 2
+    >>> expon(1, -1)
+    0.73575888234288467
+    '''
+    def __init__(self, *args, **kwargs):
+        keys = [p.name for p in self.__params__]
+        for k in keys:
+            if k not in kwargs:
+                kwargs[k] = None
+        kwargs.update(zip(keys, args)) # positional args set the leading parameters
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self.goftest = tuple([None] * 3)
+        self.residtest = tuple([None] * 2)
+        self.Rsquared = None
+    def __call__(self, x, *args):
+        '''
+        See class method `func'
+        '''
+        fargs = self.params()
+        N = len(filter(None, fargs))
+        if N + len(args) > len(fargs):
+            raise TypeError('{} accepts only {} '
+                    'parameters'.format(self.__class__.__name__, len(fargs)))
+        for a in args:
+            idx = fargs.index(None)
+            fargs[idx] = a
+        fargs = tuple(fargs)
+        return self.func(x, *fargs)
+    def fit(self, x, y, ye, **kwargs):
+        '''
+        Fits this parametric model to observations (x_i, y_i). Uncertainty in
+        the y-estimates can be specified with argument `ye'. Additional keyword
+        arguments are passed to scipy.optimize.curve_fit which in turn passes
+        them to scipy.optimize.leastsq.
+        '''
+        fp0 = self.init(x, y)
+        fargs = self.params()
+        p0 = []
+        for a, p in zip(fargs, fp0):
+            if a is None:
+                p0.append(p)
+        p0 = tuple(p0)
+        return curve_fit(self, x, y, sigma=ye, p0=p0, **kwargs)
+    def gof(self, x, y, ye):
+        '''
+        Computes GoF test statistics and other diagnostic tests
+
+        Returns:
+        --------
+        - GoF test: Chi^2, p-value, and ddof
+        - Normality of residuals: K^2 and p-value
+        - Coefficient of determination R^2
+        '''
+        res = {}
+        resid = y - self(x)
+        chisq = np.sum(((resid) / ye) ** 2)
+        ddof = len(x) - len(filter(None, self.errors())) # minus the number of estimated parameters
+        chisq_pvalue = chisqprob(chisq, ddof)
+        gof = (chisq, chisq_pvalue, ddof)
+        resid = normaltest(resid)
+        ym = y.mean()
+        SStot = np.sum((y - ym) ** 2)
+        SSerr = np.sum((y - self(x)) ** 2)
+        Rsquared = 1.0 - SSerr / SStot
+# Besides being buggy, this test for homoscedasticity is supposed to work only
+# for linear regressions, hence it is not suited for our case, but I'll keep it
+# here until I figure out an alternative. Remember to uncomment the import for
+# OLS on top.
+#        regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
+#        LM = regresults.rsquared
+#        LM_pvalue = chisqprob(LM, len(x) - ddof)
+#        white = (LM, LM_pvalue)
+#        return gof, resid, white
+        return gof, resid, Rsquared
+    def __str__(self):
+        name = self.__class__.__name__
+        prep = []
+        for p in self.params():
+            if p is not None:
+                prep.append('%3.4g' % p)
+            else:
+                prep.append('*')
+        return '{}({})'.format(name, ','.join(prep))
+    def __repr__(self):
+        return '<{} object at 0x{}>'.format(str(self), '%x' % id(self))
+    def summary(self, **kwargs):
+        '''
+        Returns a summary of this model
+        '''
+        s = StringIO()
+        print >> s, ''
+        print >> s, 'General information'
+        print >> s, '-------------------'
+        print >> s, 'model: %s' % self.name.capitalize()
+        print >> s, 'date: %s' % dt.datetime.now()
+        for item in kwargs.items():
+            print >> s, '{}: {}'.format(*map(str, item))
+        print >> s, ''
+        print >> s, 'Model parameters'
+        print >> s, '----------------'
+        for p, val, err in zip(self.__params__, self.params(), self.errors()):
+            print >> s, '{}: {} ± {}'.format(p.name, _orNA(val), _orNA(err))
+        chi, p, ddof = self.goftest
+        print >> s, ''
+        print >> s, 'Fit results'
+        print >> s, '-----------'
+        print >> s, 'Goodness-of-fit: Chi-squared = {}, p = {}, ddof = {}'.format(
+                _orNA(chi, '%5.2f'), _orNA(p, '%8.4e'), _orNA(ddof, '%d'))
+        D, p = self.residtest
+        print >> s, 'Normality of residuals: K-squared = {}, p = {}'.format(
+                _orNA(D, '%5.2f'), _orNA(p, '%8.4e'))
+        print >> s, 'Coefficient of Determination: {}'.format(
+                _orNA(self.Rsquared, '%5.2f'))
+        return s.getvalue()
+
+class Expon(ParametricModel):
+    '''
+    y = A * exp( -(x / B)) + C
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    B = Parameter('B', __params__)
+    C = Parameter('C', __params__)
+    name = 'exponential'
+    @staticmethod
+    def func(x, a, b, c):
+        return a * np.exp(-(x / b)) + c
+    @staticmethod
+    def init(x, y):
+        a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
+        b0 = x.max() / 10.0
+        c0 = y.min()
+        return (a0, b0, c0)
+    def fit(self, x, y, ye, **kwargs):
+        return super(Expon, self).fit(x, y, ye, **kwargs)
+
+class StretchedExpon(ParametricModel):
+    '''
+    y = A * exp(-(x / tau) ** beta) + C
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    tau = Parameter('tau', __params__)
+    beta = Parameter('beta', __params__)
+    C = Parameter('C', __params__)
+    name = 'stretched exponential'
+    @staticmethod
+    def func(x, a, tau, beta, c):
+        return a * np.exp(- (x / tau) ** beta) + c
+    @staticmethod
+    def init(x, y):
+        a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
+        tau0 = x.max() / 10.0
+        c0 = y.min()
+        return (a0, tau0, 0.5, c0)
+    def fit(self, x, y, ye, **kwargs):
+        return super(StretchedExpon, self).fit(x, y, ye, **kwargs)
+    def summary(self, **kwargs):
+        mrt = self.mrt(self.tau, self.beta)
+        kwargs['Mean relaxation time <tau>'] = '%5.2f days' % mrt
+        return super(StretchedExpon, self).summary(**kwargs)
+    def mrt(self, tau, beta):
+        return (tau / beta) * gamma(beta ** -1)
+
+class PowerLaw(ParametricModel):
+    '''
+    y = A * x ** B
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    B = Parameter('B', __params__)
+    name = 'power-law'
+    @staticmethod
+    def func(x, a, b):
+        return a * x ** b
+    @staticmethod
+    def init(x, y):
+        return (1, y.ptp() / x.ptp())
+# NR says this code is more robust against roundoff errors, but presently it
+# does not work. Bummer.
+#    def fit(self, x, y, ye, **kwargs):
+#        x, y, ye = self._removezeros(x, y, ye)
+#        ye = ye / y
+#        x = np.log(x)
+#        y = np.log(y)
+#        S = np.sum(ye ** -1)
+#        Sx = np.sum(x / ye)
+#        Sy = np.sum(y / ye)
+#        t = (ye ** -1) * (x - Sx / S)
+#        Stt = np.sum(t ** 2)
+#        b = Stt ** -1 * np.sum((y * t) / ye)
+#        a = np.exp((Sy - Sx * b) / S)
+#        a_var = S ** -1 * (1 + Sx ** 2 / (S * Stt))
+#        b_var = Stt ** -1
+#        ab_covar = - Sx / Stt
+#        pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
+#        return (a, b), pcov
+    def fit(self, x, y, ye, **kwargs):
+        '''
+        Fit by linear least squares of log-transformed data
+        '''
+        x, y, ye = self._removezeros(x, y, ye)
+        ye = (ye / y) ** 2
+        x = np.log(x)
+        y = np.log(y)
+        S = np.sum(ye ** -1)
+        Sx = np.sum(x / ye)
+        Sy = np.sum(y / ye)
+        Sxx = np.sum(x ** 2 / ye)
+        Sxy = np.sum((x * y) / ye)
+        Delta = S * Sxx - Sx ** 2
+        a = np.exp((Sxx * Sy - Sx * Sxy) / Delta)
+        b = (S * Sxy - Sx * Sy) / Delta
+        a_var = Sxx / Delta
+        b_var = S / Delta
+        ab_covar = - Sx / Delta
+        pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
+        return (a, b), pcov
+    def gof(self, x, y, ye):
+        '''
+        GoF of linear least squares of log-transformed data
+        '''
+        x, y, ye = self._removezeros(x, y, ye)
+        ye = (ye / y)
+        x = np.log(x)
+        y = np.log(y)
+        yp = np.log(self.A) + self.B * x
+        chisq = np.sum(((yp - y) / ye) ** 2)
+        ddof = len(x) - len(filter(None, self.errors())) # minus the number of estimated parameters
+        chisq_pvalue = chisqprob(chisq / 2., ddof)
+        resid = normaltest(y - yp)
+        ym = y.mean()
+        SStot = np.sum((y - ym) ** 2)
+        SSerr = np.sum((y - yp) ** 2)
+        Rsquared = 1.0 - SSerr / SStot
+        return (chisq, chisq_pvalue, ddof), resid, Rsquared
+    @staticmethod
+    def _removezeros(x, y, ye):
+        idx = x > 0
+        return x[idx], y[idx], ye[idx]
+
+if __name__ == '__main__':
+
+    import matplotlib.pyplot as pp
+    import scale
+
+    model = StretchedExpon()
+
+    a = 2
+    tau = 100
+    beta = .5
+    c = 0.
+    s = 0.1
+    xmax = 1000
+    x = np.linspace(0, xmax, 50)
+    y = model(x, a, tau, beta, c) + np.random.randn(len(x)) * s
+
+    pest, pcov = model.fit(x, y, s)
+
+    model.setparams(*zip(pest, np.sqrt(np.diag(pcov))))
+
+    xx = np.linspace(0, xmax, 1000)
+    yy = model(xx)
+
+    pp.errorbar(x, y, s, fmt='. ', color='k', ecolor='none', label='data')
+    pp.plot(xx, yy, 'r-', label='Stretch. Exp. fit')
+    pp.xscale('power', exponent=beta)
+    pp.yscale('log')
+
+    pp.legend()
+    gof, resid, Rsquared = model.gof(x, y, s)
+    model.goftest = gof
+    model.residtest = resid
+    model.Rsquared = Rsquared
+    print model.summary()
+    chi, p, ddof = gof
+    pp.text(200, 1, r'$\chi^2 = %.2f,\, p-{\rm value} = %5.2g,\,'
+            r'{\rm ddof} = %d,\, R^2 = %.2f$'
+            % (chi, p, ddof, Rsquared),
+            fontsize=16)
+    pp.show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/scale.py
@@ -0,0 +1,68 @@
+from matplotlib.scale import ScaleBase, register_scale
+from matplotlib.transforms import Transform, nonsingular
+from matplotlib.ticker import LinearLocator, Formatter
+from math import ceil, floor
+import numpy as np
+
+class PowerScale(ScaleBase):
+    name = 'power'
+    def __init__(self, axis, **kwargs):
+        ScaleBase.__init__(self)
+        exponent = kwargs.pop('exponent')
+        if exponent <= 0:
+            raise ValueError('exponent must be positive')
+        self.exponent = exponent
+    def get_transform(self):
+        return PowerTransform(self.exponent)
+    def set_default_locators_and_formatters(self, axis):
+        axis.set_major_locator(PowerLocator(self.exponent))
+        axis.set_major_formatter(PowerFormatter(self.exponent))
+        axis.set_minor_formatter(PowerFormatter(self.exponent))
+
+class PowerLocator(LinearLocator):
+    def __init__(self, exponent, **kwargs):
+        LinearLocator.__init__(self, **kwargs)
+        self.exponent = exponent
+        self.numticks = 5
+    def __call__(self):
+        vmin, vmax = self.axis.get_view_interval()
+        vmin, vmax = nonsingular(vmin, vmax, expander=0.05)
+        vmin = vmin ** self.exponent
+        vmax = vmax ** self.exponent
+        if vmax < vmin:
+            vmin, vmax = vmax, vmin
+
+        ticklocs = np.linspace(vmin, vmax, num=self.numticks, endpoint=True)
+        return self.raise_if_exceeds(ticklocs ** (1.0 / self.exponent))
+
+class PowerFormatter(Formatter):
+    def __init__(self, exponent):
+        self.exponent = exponent
+    def __call__(self, x, pos=None):
+        return u'%.2g' % (x ** (1.0 / self.exponent))
+
+class PowerTransform(Transform):
+    input_dims = 1
+    output_dims = 1
+    is_separable = True
+    def __init__(self, exponent):
+        Transform.__init__(self)
+        self.exponent = exponent
+    def transform(self, a):
+        return a ** self.exponent
+    def inverted(self):
+        return PowerTransform(1.0 / self.exponent)
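+    # round trip: with exponent 0.5, transform(9.0) == 3.0 and the inverted
+    # transform maps 3.0 back to 9.0 (a quick sanity check, not from the source)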
+
+register_scale(PowerScale)
+
+if __name__ == '__main__':
+    from pylab import *
+    import numpy as np
+    tau = 20
+    beta = 0.5
+    x = np.linspace(0, 100, num=10)
+    y = np.exp(-(x / tau) ** beta)
+    plot(x, y, 'o ', mfc='none', mew=2)
+    xscale('power', exponent=beta)
+    yscale('log', basey=10)
+    show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+#:vim:ts=python:
+
+''' functions for computing the average activity rate of a single cohort '''
+
+# TODO: adapt to sparse matrices!
+
+import os
+import sys
+from argparse import ArgumentParser, FileType
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+import datetime as dt
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+namespaces = {
+    'main': 0,
+    'talk': 1,
+    'user': 2,
+    'user_talk' : 3,
+    'wikipedia' : 4,
+    'wikipedia_talk' : 5,
+    'file' : 6,
+    'file_talk' : 7,
+    'mediawiki' : 8,
+    'mediawiki_talk' : 9,
+    'template' : 10,
+    'template_talk' : 11,
+    'help' : 12,
+    'help_talk' : 13,
+    'category' : 14,
+    'category_talk' : 15,
+    'portal' : 100,
+    'portal_talk' : 101,
+    'book' : 108,
+    'book_talk' : 109
+}
+
+MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
+
+def estimaterate(edits, step):
+    '''
+    This function takes the daily edit history of an individual editor and a
+    step parameter, and estimates the daily activity of the editor. It returns
+    the daily rates averaged over strides of `step' days.
+    '''
+    N = len(edits)
+    if N % step:
+        NN = np.ceil(N / float(step)) * step
+        tmp = np.zeros((NN,), dtype=edits.dtype)
+        tmp[:N] = edits
+        edits = tmp
+    return edits.reshape((-1, step)).sum(axis=-1) / float(step)
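+# e.g. estimaterate(np.array([1, 2, 3, 4, 5]), 2) gives [1.5, 3.5, 2.5]: the
+# history is zero-padded to a whole number of strides before averaging
+# (illustrative values)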
| 57 | + |
| 58 | +def itercycles(npzarchive, every, users=None, onlyns=None): |
| 59 | + ''' |
| 60 | + Iterates over the archive or over given list of users and returns estimated |
| 61 | + activity life cycle (see estimaterate()) |
| 62 | + |
| 63 | + For parameters, see computerates |
| 64 | + ''' |
| 65 | + for uid in (users or npzarchive.files): |
| 66 | + data = npzarchive[uid].view(np.recarray) |
| 67 | + idx = data.ns >= 0 # let's filter out junk (i.e. edits to virtual ns) |
| 68 | + days = data.day[idx] |
| 69 | + edits = data.edits[idx] |
| 70 | + ns = data.ns[idx] |
| 71 | + days = days - days.min() |
| 72 | + shape = (days.max() + 1, MAX_NS + 1) |
| 73 | + M = coo_matrix((edits, (days, ns)), shape=shape) |
| 74 | + if onlyns is not None: |
| 75 | + M = M.tocsc()[:, onlyns] |
| 76 | + M = M.tocsr() |
| 77 | + rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every) |
| 78 | + yield np.c_[np.arange(len(rates)) * every, rates] |
| 79 | + |
| 80 | +def average(ratesbyday): |
| 81 | + ''' |
| 82 | + Computes average cycle with standard errors. Takes in input a dictionary |
| 83 | + returned by groupbydayssince() |
| 84 | + ''' |
| 85 | + all_days = sorted(ratesbyday.keys()) |
| 86 | + result = deque() |
| 87 | + for d in all_days: |
| 88 | + s = ratesbyday[d] |
| 89 | + sqN = np.sqrt(len(s)) |
| 90 | + result.append((d, np.mean(s), np.std(s)/np.sqrt(len(s)), len(s))) |
| 91 | + return np.asarray(result) |
| 92 | + |
| 93 | +def groupbyday(npzarchive, every, users=None, onlyns=None): |
| 94 | + ''' |
| 95 | + Estimates editors' activity rates and groups the estimates by the number |
| 96 | + of days elapsed since editor registration (day 0). |
| 97 | + |
| 98 | + For parameters, see computerates(). |
| 99 | + ''' |
| 100 | + tmp = {} |
| 101 | + for cyclearr in itercycles(npzarchive, every, users, onlyns): |
| 102 | + for d, r in cyclearr: |
| 103 | + try: |
| 104 | + tmp[d].append(r) |
| 105 | + except KeyError: |
| 106 | + tmp[d] = deque([r]) |
| 107 | + return tmp |
| 108 | + |
| 109 | +# NOTE: not used right now |
| 110 | +def lifetimes(npzarchive, users=None): |
| 111 | + ''' |
| 112 | + Returns the distribution of account lifetimes over an archive. Can take |
| 113 | + an optional list of user ids to restrict the sample to a specific group |
| 114 | + of editors. |
| 115 | + ''' |
| 116 | + lt = deque() |
| 117 | + for uid in (users or npzarchive.files): |
| 118 | + days, edits = npzarchive[uid].T |
| 119 | + lt.append(days.ptp()) |
| 120 | + return np.asarray(lt) |
| 121 | + |
| 122 | +# NOTE: not used right now |
| 123 | +def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity): |
| 124 | + now = dt.datetime.now().toordinal() |
| 125 | + epoch = dt.datetime(1970,1,1).toordinal() |
| 126 | + unix_now = now - epoch |
| 127 | + inactives = deque() |
| 128 | + for uid in npzarchive.files: |
| 129 | + days, edits = npzarchive[uid].T |
| 130 | + if days.ptp() <= inactivity: |
| 131 | + continue |
| 132 | + unix_last = days[-1] |
| 133 | + if (unix_now - unix_last) > inactivity: |
| 134 | + tot_edits = float(edits.sum()) |
| 135 | + tot_days = float(days.ptp() - inactivity) |
| 136 | + activity = tot_edits / tot_days * 365.0 |
| 137 | + if minimum_activity < activity < maximum_activity: |
| 138 | + inactives.append(uid) |
| 139 | + return inactives |
| 140 | + |
| 141 | +def computerates(fn, every, users=None, onlyns=None): |
| 142 | + ''' |
| 143 | + Returns an array of average activity vs. days since registration, with |
| 144 | + standard errors of the average. |
| 145 | + |
| 146 | + Parameters |
| 147 | + ---------- |
| 148 | + every - average the daily rates over strides of `every' days |
| 149 | + users - compute rates only for these users (default: all in archive) |
| 150 | + onlyns - count edits only in this list of namespaces (default: all) |
| 151 | + ''' |
| 152 | + npzarchive = np.load(fn) |
| 153 | + rates = average(groupbyday(npzarchive, every, users, onlyns)) |
| 154 | + return rates |
| 155 | + |
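# Sketch of typical usage of computerates (the file name is hypothetical):
#
#   rates = computerates('cohort.npz', 30, onlyns=[0, 1])
#   # columns: day since registration, mean rate, standard error, sample size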
| 156 | +parser = ArgumentParser(description=__doc__) |
| 157 | +parser.add_argument('data_file', metavar='data') |
| 158 | +parser.add_argument('output_file', metavar='output', type=FileType('w')) |
| 159 | +parser.add_argument('-every', type=int, help='default: average over %(default)d days', |
| 160 | + default=30, metavar='NUM') |
| 161 | +parser.add_argument('-ns', type=int, action='append', help='select only these NS', |
| 162 | + dest='only') |
| 163 | + |
| 164 | +def main(ns): |
| 165 | + rates = computerates(ns.data_file, ns.every, onlyns=ns.only) |
| 166 | + if ns.output_file.isatty(): |
| 167 | + print >> sys.stderr, 'writing to standard output' |
| 168 | + np.savetxt(ns.output_file, rates, fmt='%f') |
| 169 | + if not ns.output_file.isatty(): |
| 170 | + print '%s: output saved to %s' % (__prog__, ns.output_file.name) |
| 171 | + |
| 172 | +if __name__ == '__main__': |
| 173 | + ns = parser.parse_args() |
| 174 | + main(ns) |
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/__init__.py |
Index: trunk/tools/wsor/editor_lifecycle/ratesnobots.sql |
— | — | @@ -0,0 +1,9 @@ |
| 2 | +select |
| 3 | + rev_user, |
| 4 | + min(rev_timestamp), |
| 5 | + max(rev_timestamp), |
| 6 | + count(*) |
| 7 | +from revision left join halfak.bot_20110711 |
| 8 | +on rev_user = user_id |
| 9 | +where user_id is null and rev_user > 0 |
| 10 | +group by rev_user; |
Index: trunk/tools/wsor/editor_lifecycle/rateperedits.py |
— | — | @@ -0,0 +1,69 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | +''' plots number of edits vs. activity rate ''' |
| 5 | + |
| 6 | +import re |
| 7 | +import os |
| 8 | +import sys |
| 9 | + |
| 10 | +from argparse import ArgumentParser |
| 11 | +from itertools import groupby |
| 12 | + |
| 13 | +import numpy as np |
| 14 | +import matplotlib.pyplot as pp |
| 15 | + |
| 16 | +__prog__ = os.path.basename(__file__) |
| 17 | + |
| 18 | +parser = ArgumentParser(description=__doc__) |
| 19 | +parser.add_argument('data_file', metavar='data') |
| 20 | + |
| 21 | +def main(ns): |
| 22 | + |
| 23 | + # detect extension and load data |
| 24 | + _, ext = os.path.splitext(ns.data_file) |
| 25 | + if re.match('^\.npy$', ext, re.I): |
| 26 | + data = np.load(ns.data_file) |
| 27 | + elif re.match('^\.tsv$', ext, re.I) or re.match('^\.txt$', ext, re.I): |
| 28 | + data = np.loadtxt(ns.data_file, delimiter='\t') |
| 29 | + else: |
| 30 | + print >> sys.stderr, '%s: error: could not determine file type '\ |
| 31 | + '(.npy, .tsv, .txt accepted)' % __prog__ |
| 32 | + sys.exit(1) |
| 33 | + # sort data |
| 34 | + data[:,1] = np.floor(np.log2(data[:,1])) |
| 35 | + idx = np.argsort(data[:,1]) |
| 36 | + data = data[idx] |
| 37 | + |
| 38 | + # group by exponential binning |
| 39 | + positions = [] |
| 40 | + x = [] |
| 41 | + for p, subiter in groupby(data, lambda k : k[1]): |
| 42 | + positions.append(p) |
| 43 | + x.append(np.asarray([r for r, e in subiter ])) |
| 44 | + positions = np.asarray(positions) |
| 45 | + |
| 46 | + # make boxplot on log-log scale |
| 47 | + widths = 2 ** np.arange(0, len(x)) |
| 48 | + ax = pp.gca() |
| 49 | + pp.boxplot(x, positions=2 ** positions, sym=' ', whis=1.8, bootstrap=5000, |
| 50 | + widths=widths) |
| 51 | + pp.xscale('log') |
| 52 | + pp.yscale('log') |
| 53 | + |
| 54 | + # add reference line |
| 55 | + x = np.array([1e2, 1e6]) |
| 56 | + m = 1e-8 |
| 57 | + pp.plot(x, m * x, 'k-', alpha=.75, lw=2) |
| 58 | + |
| 59 | + # decorate |
| 60 | + pp.setp(ax.lines, color='k') |
| 61 | + pp.xlim(1,2e6) |
| 62 | + pp.ylim(1e-9,.1) |
| 63 | + pp.xlabel('edits') |
| 64 | + pp.ylabel(r'activity (${\rm sec} ^ {-1}$)') |
| 65 | + pp.draw() |
| 66 | + pp.show() |
| 67 | + |
| 68 | +if __name__ == '__main__': |
| 69 | + ns = parser.parse_args() |
| 70 | + main(ns) |
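The binning step in `main` floors the base-2 logarithm of the edit-count
column, so editors whose edit counts share the same power of two fall into the
same box. A minimal sketch of that step in isolation, with made-up counts:

    import numpy as np

    edits = np.array([3, 5, 17, 40, 1000])
    print np.floor(np.log2(edits))   # -> [ 1.  2.  4.  5.  9.]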
Index: trunk/tools/wsor/editor_lifecycle/README.rst |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +============ |
| 3 | +README |
| 4 | +============ |
| 5 | + |
| 6 | +--------- |
| 7 | +workflow |
| 8 | +--------- |
| 9 | + |
| 10 | +This package is a collection of Python and shell scripts that assist in |
| 11 | +creating and analyzing data on the editor life cycle. |
| 12 | + |
| 13 | +Sample selection |
| 14 | +---------------- |
| 15 | + |
| 16 | +TBD |
| 17 | + |
| 18 | +Edit activity data collection |
| 19 | +----------------------------- |
| 20 | + |
| 21 | +First use `fetchrates` to download the rate data from the MySQL database. The |
| 22 | +script takes a user_id as input and stores the rate data in a file called |
| 23 | +<user_id>.npy; it can be run in parallel. The result is one NPY file per |
| 24 | +sampled user, as sketched below. |
| 25 | + |
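For example, assuming `fetchrates` takes the user id as its sole argument (a
sketch, not necessarily the script's exact interface)::

    import subprocess
    from multiprocessing import Pool

    def fetch(uid):
        # each call writes <uid>.npy in the current directory
        subprocess.check_call(['fetchrates', str(uid)])

    Pool(4).map(fetch, user_ids)  # user_ids: the sampled user ids
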
| 26 | +Cohort selection |
| 27 | +---------------- |
| 28 | + |
| 29 | +See the docstring in `mkcohort`. |
| 30 | + |
| 31 | +Cohort analysis |
| 32 | +--------------- |
| 33 | + |
| 34 | +See `graphlife`, `fitting`, `fitting_batch.sh`, and `relax`. |
| 35 | + |
Index: trunk/tools/wsor/editor_lifecycle/fitting |
— | — | @@ -0,0 +1,218 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# coding: utf-8 |
| 4 | +# :vim:ft=python |
| 5 | + |
| 6 | +''' editor lifecycle data fitting tool ''' |
| 7 | + |
| 8 | +import sys |
| 9 | +import os |
| 10 | +import numpy as np |
| 11 | +import matplotlib.pyplot as pp |
| 12 | +from matplotlib.font_manager import FontProperties |
| 13 | +from argparse import ArgumentParser |
| 14 | +from scipy.optimize import curve_fit |
| 15 | + |
| 16 | +from lifecycle.models import Expon, PowerLaw, StretchedExpon |
| 17 | +import lifecycle.scale |
| 18 | + |
| 19 | +__prog__ = os.path.basename(os.path.abspath(__file__)) |
| 20 | + |
| 21 | +_maxfev = 100000 |
| 22 | + |
| 23 | +parent = ArgumentParser(add_help=False) |
| 24 | +parent.add_argument('data_files', metavar='DATA', nargs='+') |
| 25 | +parent.add_argument('-output', dest='output_file', metavar='FILE') |
| 26 | +parent.add_argument('-title') |
| 27 | +group = parent.add_mutually_exclusive_group() |
| 28 | +group.add_argument('-loglog', action='store_true') |
| 29 | +group.add_argument('-logpow', action='store_true') |
| 30 | +parent.add_argument('-constrain', choices=['head', 'tail', 'both']) |
| 31 | +parent.add_argument('-batch', action='store_true', help='do not show graphics') |
| 32 | +parent.add_argument('-force', action='store_true', help='force overwrite') |
| 33 | +parent.add_argument('-minsize', type=int) |
| 34 | + |
| 35 | +parser = ArgumentParser(description=__doc__) |
| 36 | +subparsers = parser.add_subparsers(help='Parametric models supported') |
| 37 | + |
| 38 | +parser_expon = subparsers.add_parser('expon', parents=[parent]) |
| 39 | +parser_expon.set_defaults(modelclass=Expon) |
| 40 | + |
| 41 | +parser_stretch = subparsers.add_parser('stretchedexp', parents=[parent]) |
| 42 | +parser_stretch.set_defaults(modelclass=StretchedExpon) |
| 43 | + |
| 44 | +parser_power = subparsers.add_parser('powerlaw', parents=[parent]) |
| 45 | +parser_power.set_defaults(modelclass=PowerLaw) |
| 46 | + |
| 47 | +class DataError(Exception): |
| 48 | + pass |
| 49 | + |
| 50 | +class TooFewDataError(DataError): |
| 51 | + pass |
| 52 | + |
| 53 | +class EmptyDataError(DataError): |
| 54 | + pass |
| 55 | + |
| 56 | +def plotfit(model, x, y, ye, label='data', **kwargs): |
| 57 | + xx = np.linspace(x.min(), x.max(), endpoint=True, num=1000) |
| 58 | + yy = model(xx) |
| 59 | + kwargs['ecolor'] = 'none' |
| 60 | + kwargs['ls'] = 'none' |
| 61 | + pp.errorbar(x, y, ye / 2, label=label, **kwargs) |
| 62 | + model_label = model.name.split() |
| 63 | + if len(model_label) > 1: |
| 64 | + model_label[1] = model_label[1][:3] + '.' |
| 65 | + model_label = ' '.join(model_label[:2]).capitalize() |
| 66 | + c = kwargs.get('color', 'r') |
| 67 | + pp.plot(xx, yy, '--', color=c, label='{} fit'.format(model_label), lw=2.5) |
| 68 | + if ns.loglog: |
| 69 | + pp.xscale('log') |
| 70 | + pp.yscale('log') |
| 71 | + elif ns.logpow: |
| 72 | + pp.xscale('power', exponent=model.beta) |
| 73 | + pp.yscale('log') |
| 74 | + pp.legend(loc='best', prop=FontProperties(size='x-small')) |
| 75 | + if ns.title is not None: |
| 76 | + pp.title(ns.title) |
| 77 | + pp.xlabel('Days since registration') |
| 78 | + pp.ylabel('Edits/day') |
| 79 | + |
| 80 | +def plotresid(model, x, y, label='data', **kwargs): |
| 81 | + r = model(x) - y |
| 82 | +# rm = r[True - np.isinf(r)].max() |
| 83 | +# r /= np.abs(rm) |
| 84 | + pp.axhline(y=0, c='k') |
| 85 | + kwargs['ls'] = ':' |
| 86 | + pp.plot(x, r, label=label, **kwargs) |
| 87 | + pp.title('Fit residuals') |
| 88 | + pp.xlabel('Days since registration') |
| 89 | +# pp.ylabel(r'Relative residual $\xi / \max{|\xi|}$') |
| 90 | +# pp.ylim(-1,1) |
| 91 | + pp.draw() |
| 92 | + |
| 93 | +def _testoverwrite(*files): |
| 94 | + exit_flag = False |
| 95 | + for fn in files: |
| 96 | + if os.path.exists(fn): |
| 97 | + exit_flag = True |
| 98 | + print '%s: error: cannot overwrite %s' % (__prog__, fn) |
| 99 | + if exit_flag: |
| 100 | + sys.exit(1) |
| 101 | + |
| 102 | +# TODO add usecols and option for taking only observations based on minimum |
| 103 | +# sample size |
| 104 | + |
| 105 | +def fit(data_file, modelclass, constrain=None, minsize=None): |
| 106 | + ''' |
| 107 | + Fits a model to data. Data are read from `data_file` and the model is |
| 108 | + specified as a `modelclass` (see `lifecycle.models`). The data file should |
| 109 | + contain a (N,4) array where columns are x, y, ye (errors) and n (sample |
| 110 | + sizes -- number of users on which the average rate y is computed). |
| 111 | + |
| 112 | + Constraints |
| 113 | + ----------- |
| 114 | + If `modelclass` is `Expon` or `StretchedExpon`, the value of some model |
| 115 | + parameters can be fixed before performing the fit. To this end use the |
| 116 | + `constrain` argument, which accepts any of the strings 'head', 'tail', |
| 117 | + or 'both', with the following meaning: |
| 118 | + |
| 119 | + * head : fix A to the rate observed closest to x = 0 |
| 120 | + * tail : fix C to the minimum observed rate |
| 121 | + * both : apply both constraints |
| 122 | + ''' |
| 123 | + x, y, ye, n = np.loadtxt(data_file, unpack=True) |
| 124 | + idx = ye > 0 |
| 125 | + if minsize: |
| 126 | + idx = idx * (n > minsize) |
| 127 | + if not np.any(idx): |
| 128 | + raise EmptyDataError() |
| 129 | + if np.sum(idx) < len(modelclass.__params__): |
| 130 | + raise TooFewDataError() |
| 131 | + x = x[idx] |
| 132 | + y = y[idx] |
| 133 | + ye = ye[idx] |
| 134 | + model = modelclass() |
| 135 | + if constrain in ['head', 'both']: |
| 136 | + model.A = y[np.argmin(np.abs(x))] |
| 137 | + if constrain in ['tail', 'both']: |
| 138 | + model.C = y.min() |
| 139 | + pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev) |
| 140 | + if not np.isscalar(pcov): |
| 141 | + perr = np.sqrt(np.diag(pcov)) / 2. |
| 142 | + model.setparams(*zip(pest, perr)) |
| 143 | + else: |
| 144 | + model.setparams(*pest) |
| 145 | + gof, resid, Rsquared = model.gof(x, y, ye) |
| 146 | + model.goftest = gof |
| 147 | + model.residtest = resid |
| 148 | + model.Rsquared = Rsquared |
| 149 | + print model.summary(dataset=data_file, observations=len(x)) |
| 150 | + return x, y, ye, model |
| 151 | + |
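# Example call (the data file name is hypothetical): fit a stretched
# exponential with both endpoints pinned to the data, keeping only
# observations averaged over more than 10 users:
#
#   x, y, ye, model = fit('cohort.tsv', StretchedExpon, constrain='both',
#                         minsize=10)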
| 152 | +def main(ns): |
| 153 | + # collect the usable line markers from matplotlib |
| 154 | + from matplotlib.lines import lineMarkers as markers |
| 155 | + markers = dict(filter( |
| 156 | + lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing', |
| 157 | + markers.items())).keys() |
| 158 | + colors = 'bgcmykw' |
| 159 | + |
| 160 | + data = [] |
| 161 | + models = [] |
| 162 | + labels = [] |
| 163 | + |
| 164 | + # fit all datasets |
| 165 | + for f in ns.data_files: |
| 166 | + try: |
| 167 | + x, y, ye, model = fit(f, ns.modelclass, ns.constrain, ns.minsize) |
| 168 | + except TooFewDataError: |
| 169 | + print >> sys.stderr, '%s: warning: too few data: %s' % (__prog__, f) |
| 170 | + continue |
| 171 | + except EmptyDataError: |
| 172 | + print >> sys.stderr, '%s: warning: no usable data: %s'% (__prog__, f) |
| 173 | + continue |
| 174 | + data.append((x,y,ye)) |
| 175 | + models.append(model) |
| 176 | + labels.append(os.path.splitext(f)[0]) |
| 177 | + |
| 178 | + # plot fits |
| 179 | + pp.figure() |
| 180 | + pp.hold(1) |
| 181 | + for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)): |
| 182 | + m = markers[i % len(markers)] |
| 183 | + c = colors[i % len(colors)] |
| 184 | + plotfit(model, x, y, ye, label, marker=m, color=c) |
| 185 | + |
| 186 | + # plot residuals |
| 187 | + pp.figure() |
| 188 | + pp.hold(1) |
| 189 | + for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)): |
| 190 | + m = markers[i % len(markers)] |
| 191 | + c = colors[i % len(colors)] |
| 192 | + plotresid(model, x, y, label, marker=m, color=c) |
| 193 | + |
| 194 | + # save figures |
| 195 | + if ns.output_file is not None: |
| 196 | + fn, ext = os.path.splitext(ns.output_file) |
| 197 | + fmt = ext[1:] |
| 198 | + if ns.batch and fmt.lower() != 'pdf': |
| 199 | + print '%s: error: batch mode supports only PDF format' % __prog__ |
| 200 | + sys.exit(1) |
| 201 | + resid_output_file = fn + '_residuals' + ext |
| 202 | + if not ns.force: |
| 203 | + _testoverwrite(ns.output_file, resid_output_file) |
| 204 | + pp.figure(1) |
| 205 | + pp.savefig(ns.output_file, format=fmt) |
| 206 | + print '%s: output saved to %s' % (__prog__, ns.output_file) |
| 207 | + pp.figure(2) |
| 208 | + pp.savefig(resid_output_file, format=fmt) |
| 209 | + print '%s: output saved to %s' % (__prog__, resid_output_file) |
| 210 | + |
| 211 | + pp.show() |
| 212 | + |
| 213 | +if __name__ == '__main__': |
| 214 | + ns = parser.parse_args() |
| 215 | +# if ns.batch: |
| 216 | +# import matplotlib |
| 217 | +# matplotlib.use('PDF') |
| 218 | +# import matplotlib.pyplot as pp |
| 219 | + main(ns) |
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting |
___________________________________________________________________ |
Added: svn:executable |
1 | 220 | + * |