Index: trunk/tools/wsor/editor_lifecycle/mkcohort
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+#:vim:ft=python
+
+''' finding better cohorts by real user activity '''
+
+import os
+import sys
+import csv
+import numpy as np
+
+from collections import deque
+from itertools import groupby
+from argparse import ArgumentParser, FileType
+from dateutil.parser import parser as DateParser
+
+__prog__ = os.path.basename(__file__)
+
+def yearkey(date):
+    return date.year,
+
+def monthkey(date):
+    return date.year, date.month
+
+def daykey(date):
+    return date.year, date.month, date.day
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data', type=FileType('r'))
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument('--day', dest='keyfunc', const=daykey, action='store_const')
+group.add_argument('--month', dest='keyfunc', const=monthkey, action='store_const')
+group.add_argument('--year', dest='keyfunc', const=yearkey, action='store_const')
+
+parser.add_argument('--minspan', default=3600, type=int)
+parser.add_argument('--mincount', default=2, type=int)
+
+date_parser = DateParser()
+fieldnames = ['user', 'max_timestamp', 'min_timestamp', 'editcount']
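+# input: headerless TSV rows of (user id, last edit time, first edit time,
+# edit count); each output line lists, for one (cohort date, rate magnitude)
+# pair, the matching user ids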
+
+def activity(rowiterator, mincount, minspan):
+    for row in rowiterator:
+        user = float(row['user'])
+        min_timestamp = date_parser.parse(row['min_timestamp'])
+        max_timestamp = date_parser.parse(row['max_timestamp'])
+        editcount = float(row['editcount'])
+        span = (max_timestamp - min_timestamp).total_seconds()
+        if (editcount > mincount) and (span > minspan):
+            yield user, editcount / span
+
+def magnitude(x):
+    return np.floor(np.log10(x))
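+# e.g. magnitude(0.02) == -2.0 and magnitude(1.5) == 0.0 (illustrative
+# values), so rates get binned by their order of magnitude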
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    reader = csv.DictReader(ns.data, fieldnames=fieldnames, delimiter='\t',
+            quoting=csv.QUOTE_NONE)
+
+    def cohortdatefunc(row):
+        return ns.keyfunc(date_parser.parse(row['min_timestamp']))
+
+    for cohortdate, subiter in groupby(reader, cohortdatefunc):
+        data = np.asarray(list(activity(subiter, ns.mincount, ns.minspan)))
+        if len(data) > 0:
+            user, rate = data.T
+            idx = rate.argsort()
+            data = list(data[idx])
+            kf = lambda k : magnitude(k[1])
+            for m, subsubiter in groupby(data, kf):
+                outrow = deque([ '-'.join(map(str, cohortdate)), '%d' % m ])
+                for u, e in subsubiter:
+                    outrow.append('%d' % u)
+                print '\t'.join(outrow)
+
Property changes on: trunk/tools/wsor/editor_lifecycle/mkcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/setup.py
@@ -0,0 +1,16 @@
+from distutils.core import setup
+
+setup(
+    name='lifecycle',
+    description='WMF summer of research project',
+    version='0.0.0',
+    author='Giovanni Luca Ciampaglia',
+    author_email='gciampaglia@wikimedia.org',
+    packages=['lifecycle'],
+    scripts=[
+        'fetchrates',
+        'fitting',
+        'relax',
+        'mkcohort',
+    ]
+)
Index: trunk/tools/wsor/editor_lifecycle/fetchrates
@@ -0,0 +1,126 @@
+#!/usr/bin/python
+
+import sys
+import os
+import numpy as np
+
+from zipfile import ZipFile
+from contextlib import closing
+from tempfile import mkdtemp
+from oursql import connect
+from argparse import ArgumentParser
+from datetime import datetime
+
+from lifecycle.rates import computerates
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_path', metavar='cohort')
+parser.add_argument('-config', dest='config_file')
+parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
+parser.add_argument('-v', '--verbose', action='store_true')
+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
+        dest='only')
+parser.add_argument('--save-cohort', dest='rates_only', action='store_false')
+
+query = """
+select unix_timestamp(rev_timestamp)/86400.0, page_namespace
+from revision r join page p
+on r.rev_page = p.page_id
+where rev_user = ?
+order by rev_timestamp
+"""
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+# def iterateoversets(cursor):
+#     yield list(cursor)
+#     while cursor.nextset():
+#         yield list(cursor)
+
+def process(rows, user_id, output_dir=os.path.curdir):
+    if len(rows) == 0:
+        print >> sys.stderr, '%s: error: empty revision history for user %d' % (__prog__,
+                user_id)
+        return
+
+    rev_time, rev_ns = np.asfarray(rows).T
+    m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
+    uns = sorted(np.unique(rev_ns))
+    bins_time = np.arange(m, M + 1)
+    bins_ns = uns + [uns[-1] + 1]
+    rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time, bins_ns))
+    I, J = np.nonzero(rates)
+    data = [ (days[i], uns[j], rates[i,j]) for i, j in zip(I, J) ] # in coordinate format
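+    # e.g. three main-namespace edits made on day 15000 since the epoch
+    # become the single record (15000, 0, 3) (hypothetical values)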
+    dtype = np.dtype([('day', int), ('ns', int), ('edits', int)])
+    data = np.asarray(data, dtype=dtype)
+    out_path = os.path.join(output_dir, '%d.npy' % user_id)
+    np.save(out_path, data)
+    return out_path
+
+def main(ns):
+    # get mysql client configuration file
+    mycnf = os.path.expanduser('~/.my.cnf')
+    if ns.config_file is None and not os.path.exists(mycnf):
+        print >> sys.stderr, ('%s: no config file specified and $HOME/.my.cnf'
+                ' not found') % __prog__
+        sys.exit(1)
+    elif ns.config_file is None:
+        ns.config_file = mycnf
+
+    # test that the output directory exists
+    if not os.path.exists(ns.output_dir):
+        print >> sys.stderr, '%s: output directory does not exist: %s' % (
+                __prog__, ns.output_dir)
+        sys.exit(1)
+    if not os.path.isdir(ns.output_dir):
+        print >> sys.stderr, '%s: not a directory: %s' % (__prog__, ns.output_dir)
+        sys.exit(1)
+
+    # read user ids from cohort file, create zip archive and temp dir
+    with closing(open(ns.input_path)) as f:
+        line = f.readline().strip()
+        if line:
+            user_ids = map(int, line.split('\t'))
+        else:
+            print >> sys.stderr, '%s: error: empty input file: %s' % (__prog__,
+                    ns.input_path)
+            sys.exit(1)
+    zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
+    zip_path = os.path.join(ns.output_dir, zip_path)
+    temp_dir = mkdtemp()
+
+    with closing(ZipFile(zip_path, 'w')) as zf:
+
+        # connect and run query
+        conn = connect(read_default_file=ns.config_file)
+        for uid in user_ids:
+            # compute rates and save to file
+            with conn.cursor() as cursor:
+                cursor.execute(query, (uid,))
+                rows = list(cursor)
+            path = process(rows, uid, temp_dir)
+            if path is None:
+                continue
+            zf.write(path, os.path.basename(path))
+            os.remove(path)
+
+    # remove temp dir
+    os.removedirs(temp_dir)
+
+    # compute rates for this cohort and save them to file
+    rate_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.tsv'
+    rate_path = os.path.join(ns.output_dir, rate_path)
+    rates = computerates(zip_path, ns.every, onlyns=ns.only)
+    np.savetxt(rate_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
+    if ns.rates_only:
+        os.remove(zip_path)
+        print '%s: output saved to %s' % (datetime.now(), rate_path)
+    else:
+        print '%s: output saved to %s, cohort data to %s' % (datetime.now(),
+                rate_path, zip_path)
+
+if __name__ == '__main__':
+    # get arguments from command line
+    ns = parser.parse_args()
+    main(ns)
+
Property changes on: trunk/tools/wsor/editor_lifecycle/fetchrates
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/MANIFEST.in
@@ -0,0 +1,5 @@
+include *.py
+include *.sh
+include *.sql
+include sge/*.sh
+include obsolete/*
Index: trunk/tools/wsor/editor_lifecycle/relax
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+#:vim:ft=python
+
+''' batch model fitting (usable with xargs) '''
+
+import re
+import os
+import sys
+import numpy as np
+from argparse import ArgumentParser
+
+from lifecycle.models import StretchedExpon
+
+__prog__ = os.path.basename(__file__)
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data')
+parser.add_argument('-m', '--min-size', type=int, default=0)
+parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
+parser.add_argument('--maxfev', type=int, default=10000)
+parser.add_argument('--debug', action='store_true')
+parser.add_argument('--split-name', action='store_true', help='split input file'
+        ' name into a date and a rate part.')
+
+if __name__ == '__main__':
+    # parse command line
+    ns = parser.parse_args()
+
+    # read data, filter data
+    x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
+    idx = (ye > 0) * (n > ns.min_size)
+    if idx.sum() == 0:
+        print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
+                % (__prog__, ns.data_file)
+        sys.exit(1)
+    if idx.sum() < 4:
+        print >> sys.stderr, '%s: error: non-identifiable data in %s'\
+                % (__prog__, ns.data_file)
+        sys.exit(1)
+
+    # create model, set constraints
+    model = StretchedExpon()
+    if ns.constrain in ['head', 'both']:
+        model.A = y[np.argmin(np.abs(x))]
+    if ns.constrain in ['tail', 'both']:
+        model.C = y.min()
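+    # e.g. passing -c both pins A to the rate observed nearest day zero and C
+    # to the smallest observed rate, leaving only tau and beta free in the fit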
+
+    # fit model
+    try:
+        pest, pcov = model.fit(x[idx], y[idx], ye[idx], maxfev=ns.maxfev, warning=False)
+    except ValueError, e:
+        print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
+                e.message, ns.data_file)
+        if ns.debug:
+            raise
+        else:
+            sys.exit(1)
+    if np.isscalar(pcov) or np.isinf(pcov).any():
+        print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (
+                __prog__, ns.data_file)
+        sys.exit(1)
+
+    # compute errors, MRT
+    perr = np.sqrt(np.diag(pcov)) / 2.
+    model.setparams(*zip(pest, perr))
+    mrt = model.mrt(model.tau, model.beta)
+    N = len(model.__params__)
+    params = np.empty((N * 2 + 1,), dtype=float)
+    params[:N] = [ model.A, model.tau, model.beta, model.C ]
+    params[N:2*N] = map(lambda k : k or np.nan, [ model.A_err, model.tau_err,
+            model.beta_err, model.C_err ])
+    params[-1] = mrt
+
+    # print output line
+    key, _ = os.path.splitext(ns.data_file)
+    if ns.split_name:
+        key = key.split('_')
+    else:
+        key = [ key ]
+    params = map(lambda k : '%12.5g' % k, params)
+    print '\t'.join(key + params)
Property changes on: trunk/tools/wsor/editor_lifecycle/relax
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+# coding: utf-8
+# :vim:ft=python
+
+# TODO: obsolete
+
+'''
+creates cohort files, filtering out bots
+
+This script reads two files: a ZIP archive file, and an index file, which is a
+tab-separated text file like the following:
+
+    34 WojPob 20010129110725 2524
+    94 AstroNomer 20010207222248 1532
+    43 Lee Daniel Crocker 20010314020407 4388
+    86 Stephen Gilbert 20010326191355 3599
+    3 Tobias Hoevekamp 20010326202105 1903
+    1273 Wathiik 20010510171751 1772
+    3371 Arno 20010721180708 2700
+    122 Ap 20010722201619 2137
+    182 Rjstott 20010726102546 2602
+    64 Uriyan 20010727141651 1634
+
+where the fields are: id, name, date, count. Dates are parsed using dateutil, so
+other formats are allowed too (e.g. 2010-01-29 11:07:25).
+
+The script aggregates users based on the date field and looks up files of the
+form <id>.npy in the archive file. Each of these files contains the daily edit
+counts of a single user, stored using the NumPy binary array format. A relative
+path within the ZIP archive can be specified from the command line with
+-P/--datapath. Once the data for a cohort (i.e. an aggregated group of users)
+have been collected, the script computes the average activity rate since the
+first day of activity for all users in that cohort.
+
+The script produces two files for each cohort: a tab-separated values file with
+the cohort average activity rate, and a compressed NumPy binary archive with the
+user data array files.
+
+For each discovered cohort, the script prints on the console the date of the
+cohort, how many users it contains, and how many suspected bot users it filtered
+out from the index. Use --bots to disable this check and always include them.
+The check is as follows: if the name contains the pattern 'bot' at the beginning
+or at the end of any word, the user is filtered out (e.g. "Botuser IV" will
+match, but "Francis Abbott" won't). If arguments -mincount or -maxcount (or
+both) are passed, the script processes only users whose edit count is above the
+minimum count, or below the maximum count, or both.
+
+Please note that the index file must already be sorted by date, in order for the
+group-by-date aggregation to work. You can use `sort' from the command line,
+e.g.:
+
+    $~ sort -t$'\t' -k3 -h unsorted.tsv
+
+should sort file unsorted.tsv.
+'''
+
+import re
+import os
+import sys
+import csv
+import numpy as np
+from argparse import ArgumentParser, FileType
+from contextlib import closing
+from itertools import groupby
+from dateutil.parser import parser as DateParser
+from datetime import datetime
+from zipfile import ZipFile
+
+from rates import computerates
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+_botpat = r'\bbot|bot\b'
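+# e.g. 'Botuser IV' and 'SpellBot' match the pattern, 'Francis Abbott' does not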
+_fields = ['id', 'name', 'date', 'count']
+
+def yearkey(date):
+    return date.year,
+
+def monthkey(date):
+    return date.year, date.month
+
+def daykey(date):
+    return date.year, date.month, date.day
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')
+parser.add_argument('archive_path', metavar='archive', help='data archive in ZIP '
+        'format')
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument('--year', help='group by year', action='store_const',
+        const=yearkey, dest='keyfunc')
+group.add_argument('--month', help='group by month', action='store_const',
+        const=monthkey, dest='keyfunc')
+group.add_argument('--day', help='group by day', action='store_const',
+        const=daykey, dest='keyfunc')
+parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
+parser.add_argument('-P', '--datapath', help='relative path of files within '
+        'archive', default='')
+parser.add_argument('-mincount', type=int)
+parser.add_argument('-maxcount', type=int)
+parser.add_argument('-minperyear', type=int)
+parser.add_argument('-maxperyear', type=int)
+parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
+        'console all actions, but do not produce any file')
+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
+        dest='only')
+
+dateparser = DateParser()
+
+# dummy ZipFile class in case we do not want to do anything!
+class DummyZipFile:
+    def __init__(self, fn, mode):
+        pass
+    def close(self):
+        pass
+    def write(self, fn, *args):
+        pass
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
+            delimiter='\t')
+    archive = ZipFile(ns.archive_path)
+
+    def _keyfunc(row):
+        try:
+            date = dateparser.parse(row['date'])
+        except:
+            print row
+            raise
+
+        return ns.keyfunc(date)
+
+    # group the index by date of registration
+    for key, subiter in groupby(reader, _keyfunc):
+
+        # reset counters and define output file names from the cohort period
+        tot_users = 0
+        tot_bots = 0
+        datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) -> '2010-01'
+        zipfn = '{}.npz'.format(datestr)
+        tsvfn = '{}.tsv'.format(datestr)
+
+        # if the user wants to do a dry run, replace the ZipFile class with the
+        # dummy one
+        if ns.dry_run:
+            ZipFile = DummyZipFile
+
+        # for each user, determine whether it may go in the cohort
+        with closing(ZipFile(zipfn, 'w')) as zf:
+            for row in subiter:
+
+                # compute user details (edit count, yearly activity rate, etc.)
+                # and other useful variables
+                user_id = row['id']
+                count = int(row['count'])
+                user_date = dateparser.parse(row['date'])
+                now_date = datetime.now()
+                activity_span = float((now_date - user_date).days) # in days
+                yearly_rate = count / activity_span * 365.0
+                bot_flag = re.search(_botpat, row['name'], re.I) is not None
+                tot_bots += bot_flag # update count of bot matches
+
+                # define paths
+                basepath = '{}.npy'.format(user_id)
+                archivepath = os.path.join(ns.datapath, basepath)
+
+                # check cohort membership (keep if the conjunction of all given
+                # criteria is true, that is, discard if any given criterion is
+                # false)
+                if ns.mincount is not None and count <= ns.mincount:
+                    continue
+                if ns.maxcount is not None and count >= ns.maxcount:
+                    continue
+                if ns.minperyear is not None and yearly_rate <= ns.minperyear:
+                    continue
+                if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
+                    continue
+                # the user can turn this test off by passing --bots
+                if not ns.bots and bot_flag:
+                    continue
+                try:
+                    zf.writestr(basepath, archive.read(archivepath))
+                except KeyError:
+                    print >> sys.stderr, '%s: warning: %s not in archive' %\
+                            (__prog__, archivepath)
+                tot_users += 1
+
+        if tot_users > 0:
+            rates = computerates(zipfn, ns.every, onlyns=ns.only)
+            np.savetxt(tsvfn, rates, fmt='%f')
+
+        print '%s: %s, %s created (users: %5d, skipped bots %5d)' % (
+                __prog__, tsvfn, zipfn, tot_users, tot_bots)
+        sys.stdout.flush()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+# vim:ft=python:
+# coding: utf-8
+
+# TODO: obsolete
+
+''' fetches a cohort based on year of registration and editing activity '''
+
+from argparse import ArgumentParser
+from oursql import connect
+import os
+import sys
+import datetime as dt
+import csv
+
+prog = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
+parser.add_argument('registration_year', metavar='year', type=int)
+parser.add_argument('min_activity', metavar='minedits', type=int)
+parser.add_argument('max_activity', metavar='maxedits', type=int)
+parser.add_argument('-c', '--config', dest='config_file')
+parser.add_argument('-l', '--limit', type=int)
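+# e.g. `fetchcohort 2005 100 1000 -l 50' prints at most 50 non-bot users who
+# registered in 2005 with an edit count strictly between 100 and 1000
+# (illustrative values)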
| 25 | + |
| 26 | +query = ''' |
| 27 | +select |
| 28 | + user_id, |
| 29 | + user_name, |
| 30 | + user_registration, |
| 31 | + user_editcount |
| 32 | +from user u left join user_groups ug |
| 33 | +on u.user_id = ug.ug_user |
| 34 | +where |
| 35 | + (ug_group <> 'bot' or ug_user is null) |
| 36 | + and year(user_registration) = ? |
| 37 | + and user_editcount > ? |
| 38 | + and user_editcount < ? |
| 39 | +''' |
| 40 | + |
| 41 | +if __name__ == '__main__': |
| 42 | + ns = parser.parse_args() |
| 43 | + if ns.min_activity >= ns.max_activity: |
| 44 | + print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog |
| 45 | + sys.exit(1) |
| 46 | + if ns.registration_year < 2001 or ns.registration_year > dt.datetime.now().year: |
| 47 | + print >> sys.stderr, '%s: error: illegal year: %d' % (prog, |
| 48 | + ns.registration_year) |
| 49 | + sys.exit(1) |
| 50 | + |
| 51 | + if ns.limit is not None: |
| 52 | + query += 'limit %d' % ns.limit |
| 53 | + |
| 54 | + if ns.config_file is None: |
| 55 | + ns.config_file = os.path.expanduser('~/.my.cnf') |
| 56 | + |
| 57 | + conn = connect(read_default_file=ns.config_file) |
| 58 | + writer = csv.writer(sys.stdout, dialect='excel-tab') |
| 59 | + cursor = conn.cursor() |
| 60 | + cursor.execute(query, (ns.registration_year, ns.min_activity, ns.max_activity)) |
| 61 | + for row in cursor: |
| 62 | + writer.writerow(row) |
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
@@ -0,0 +1,14 @@
+-- user ID, user name, timestamp of first edit, and edit count of all registered
+-- users that are not flagged bots. N.B. there might still be unflagged bots.
+
+select
+    rev_user as user_id,
+    rev_user_text as user_name,
+    min(rev_timestamp) as first_timestamp,
+    count(rev_timestamp) as editcount
+from
+    revision r use index (usertext_timestamp) left join user_groups g
+on r.rev_user = g.ug_user
+where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0
+group by rev_user_text
+-- limit 100
Index: trunk/tools/wsor/editor_lifecycle/obsolete/rates
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+#:vim:ts=python:
+
+''' compute editor lifecycle '''
+
+import re
+import os
+import sys
+from argparse import ArgumentParser
+import numpy as np
+from collections import deque
+import datetime as dt
+
+from lifecycle.rates import *
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data')
+parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
+parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
+parser.add_argument('-key')
+parser.add_argument('-every', type=int, help='default: %(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-inactivity', type=int, default=180, help='default: '
+        '%(default)d days', metavar='NUM')
+parser.add_argument('-all', dest='dump_all', action='store_true')
+
+
+def main(ns):
+    if ns.key is None:
+        m = re.match(r'(.*?)\.npz', ns.data_file, re.I)
+        if m is not None:
+            ns.key = m.groups()[0]
+        else:
+            print >> sys.stderr, '%s: cannot determine key from file name: %s'\
+                    % (__prog__, ns.data_file)
+            sys.exit(1)
+    if ns.minimum_activity >= ns.maximum_activity:
+        print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
+        sys.exit(1)
+
+    # load data
+    npzarchive = np.load(ns.data_file)
+
+    if ns.dump_all:
+        # mkfn was undefined here; this mirrors the '%s_%s.%s' naming used below
+        fn = '%s_cycles.npz' % ns.key
+        values_iter = itercycles(npzarchive, ns.every)
+        keys = npzarchive.files
+        tmp = dict(zip(keys, list(values_iter)))
+        np.savez(fn, **tmp)
+        print '%s: output saved to %s' % (__prog__, fn)
+    else:
+        # compute lifetime distribution
+        lt = lifetimes(npzarchive)
+
+        # compute inactive subgroups
+        inactive_users = find_inactives(npzarchive, ns.inactivity, ns.minimum_activity,
+                ns.maximum_activity)
+
+        ratesbyday = groupbyday(npzarchive, ns.every)
+        ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
+
+        # `average' is the name defined in lifecycle.rates
+        avg_all = average(ratesbyday)
+        avg_inact = average(ratesbyday_inact)
+
+        lens = [ len(npzarchive.files), len(inactive_users) ]
+
+        names = ['lt', 'len', 'all', 'inact' ]
+        arrs = [ lt, lens, avg_all, avg_inact ]
+
+        for n, a in zip(names, arrs):
+            fn = '%s_%s.%s' % (ns.key, n, 'tsv')
+            np.savetxt(fn, a)
+            print '%s: output saved to %s' % (__prog__, fn)
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+    main(ns)
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/rates
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+
+''' plot editor life cycle '''
+
+import sys
+import numpy as np
+from argparse import ArgumentParser
+import os
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_files', metavar='data', nargs='+')
+parser.add_argument('-l', '--label', metavar='TEXT', action='append',
+        dest='labels_list')
+parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
+parser.add_argument('-batch', action='store_true', help='uses PDF backend')
+parser.add_argument('-title')
+parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+
+    # checks
+    if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
+        print >> sys.stderr, '%s: error: please provide as many labels '\
+                'as data files' % __prog__
+        sys.exit(1)
+
+    # import pyplot, make lists of colors and markers
+    if ns.batch:
+        import matplotlib
+        matplotlib.use('PDF')
+    import matplotlib.pyplot as pp
+    from matplotlib.lines import lineMarkers as markers
+    markers = dict(filter(
+            lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing',
+            markers.items())).keys()
+    colors = 'krbgm'
+
+    # create figure and axes
+    fig = pp.figure()
+    ax = pp.axes([.1, .1, .85, .8])
+
+    # add lines
+    N = len(ns.data_files)
+    for i in xrange(N):
+        data_file = ns.data_files[i]
+        if ns.labels_list is not None:
+            label = ns.labels_list[i]
+        else:
+            label = 'line-%d' % (i + 1)
+        color = colors[i % len(colors)]
+        marker = markers[i % len(markers)]
+        x, y, ye = np.loadtxt(data_file, unpack=1)
+        ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
+                mec=color, ls=':', label=label)
+
+    ax.legend(loc=2)
+    ax.set_xlabel('days since registration')
+    ax.set_ylabel('edits/day')
+    if ns.title is not None:
+        ax.set_title(ns.title)
+    ax.axis('tight')
+
+    # plot hist of lifetimes in inset axes
+    if ns.inset_data_file is not None:
+        lt = np.loadtxt(ns.inset_data_file)
+        inax = pp.axes([.55, .6, .35, .25], axisbg='none')
+        inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
+        for l in inax.xaxis.get_ticklabels():
+            l.set_rotation(30)
+            l.set_fontsize('x-small')
+        for l in inax.yaxis.get_ticklabels():
+            l.set_fontsize('x-small')
+        inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
+        inax.set_ylabel('no. of users older\nthan $x$ days',
+                fontsize='small')
+        inax.set_title('account lifetime')
+        inax.axis('tight')
+
+    pp.draw()
+    if ns.title is not None:
+        fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
+    else:
+        fn = 'output.' + ns.fmt
+    print 'output saved to %s' % fn
+
+    pp.savefig(fn, format=ns.fmt)
+    pp.show()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This script writes to output a list of registered, not-flagged-as-bot users,
+# sorted by time of first edit. Each item in the list comprises:
+#
+# 1. user_id
+# 2. user_name
+# 3. first_timestamp
+# 4. editcount
+#
+# For the SQL query, check file userlist.sql.
+
+srcdir=`dirname $(type -p $0)`
+mysql -BN < $srcdir/userlist.sql | sort -h -k3 -t $'\t'
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Applies the `fitting' script to a batch of files
+#
+# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
+#
+# USAGE: fitting_batch.sh file1 file2 file3 ...
+#
+# This will produce the normal console output that fitting produces; PDF plots
+# will be stored in file fit.pdf (please note: the script refuses to run if
+# fit.pdf already exists, so existing versions are never overwritten)
+
+if [[ -z `type -p fitting` ]] ; then
+    echo 'error: could not find fitting script. Check your PATH'
+    exit 1
+fi
+
+if [[ -e fit.pdf ]] ; then
+    echo 'error: cannot overwrite file fit.pdf'
+    exit 1
+fi
+
+O=`mktemp -d`
+models="expon powerlaw stretchedexp"
+files="$@"
+
+for file in $files ; do
+    for model in $models ; do
+        fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
+        echo
+        echo
+    done
+done
+
+pdfs=`ls $O/*.pdf | sort`
+
+gs -dNOPAUSE -sDEVICE=pdfwrite -sOutputFile=fit.pdf -dBATCH $pdfs &>/dev/null
+
+if [[ $? = 0 ]] ; then
+    echo 'images saved in fit.pdf'
+else
+    echo "error: problem saving fit.pdf. Individual image files in $O"
+fi
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
___________________________________________________________________
Added: svn:executable
   + *
Index: trunk/tools/wsor/editor_lifecycle/sge/rates.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+#$ -N rates
+#$ -j y
+#$ -o query.log
+#$ -t 1-NNNNN
+#$ -tc 10
+#$ -l sqlprocs-s1=1
+#$ -m n
+
+# Use this script to run the query on the toolserver using its Sun Grid Engine
+# batch job scheduling system. SGE will run this script in parallel for each
+# input file in input/, and place the output in (surprise..) output/
+#
+# Each input file will be fed to this script by SGE. However, SGE does not have
+# any way to tell how many input files you have to process, so you have to
+# manually change the comment line on top of this file to reflect this
+# information:
+#
+#     #$ -t 1-NNNNN
+#
+# once you have done this, you can simply submit the job to SGE using qsub:
+#
+#     ~$ qsub < rates.sh
+#
+# See man qsub for more info on that.
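+#
+# For example, with 500 files in input/ the line should read "#$ -t 1-500";
+# `ls input/ | wc -l' is one quick way to count them.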
+
+uid=$((SGE_TASK_ID-1))
+infile=$uid.in
+export PATH=$HOME/local/bin:$PATH
+fetchrates -o $HOME/output $HOME/input/$infile
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/models.py
@@ -0,0 +1,371 @@
+# coding: utf8
+
+import numpy as np
+from scipy.stats import norm, chisqprob, normaltest
+from scipy.optimize import curve_fit
+from scipy.special import gamma
+from cStringIO import StringIO
+import datetime as dt
+#from scikits.statsmodels.api import OLS
+
+__all__ = ['Expon', 'PowerLaw', 'StretchedExpon' ]
+
+class Parameter(object):
+    '''
+    Class for parameter descriptors. Works with ParameterMixin
+    '''
+    def __init__(self, name, attrlist):
+        self.name = name # parameter name
+        for att in attrlist:
+            if att.name == name:
+                raise AttributeError('cannot add parameter {}'.format(name))
+        attrlist.append(self)
+    def __get__(self, instance, owner):
+        if instance is not None:
+            return instance.__dict__['_' + self.name]
+        return self
+    def __set__(self, instance, value):
+        try:
+            value, error = value
+        except TypeError:
+            value, error = value, None
+        instance.__dict__['_' + self.name] = value
+        instance.__dict__[self.name + '_err'] = error
+    def __repr__(self):
+        return '<Parameter {} at 0x{}>'.format(self.name, '%x' % id(self))
+
+class ParameterMixin(object):
+    '''
+    Class that lets you look up all Parameter instances in __params__
+    '''
+    def itererrors(self):
+        for p in self.__params__:
+            yield self.__getattribute__(p.name + '_err')
+    def errors(self):
+        return list(self.itererrors())
+    def iterparams(self):
+        '''
+        Returns an iterator over all parameters of this model
+        '''
+        for p in self.__params__:
+            yield self.__getattribute__(p.name)
+    def params(self):
+        '''
+        Returns a list of all parameters of this model
+        '''
+        return list(self.iterparams())
+    def setparams(self, *args):
+        '''
+        Sets the unset parameters of this model to *args. Parameters that
+        already have a value will *NOT* be modified by this method.
+        '''
+        keyf = lambda p : self.__getattribute__(p.name) is None
+        for p, a in zip(filter(keyf, self.__params__), args):
+            setattr(self, p.name, a)
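+    # e.g. on a StretchedExpon with A already set, setparams(1, 0.5, 0)
+    # assigns tau, beta and C, in declaration order (illustrative values)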
+
+def _orNA(val, fmt='%8.5g'):
+    if val is not None:
+        return fmt % val
+    else:
+        return 'N/A'
+
+class ParametricModel(ParameterMixin):
+    '''
+    Callable class with Parameter descriptors. Subclasses of ParametricModel
+    ought to define, as class attributes, any number of Parameter descriptors
+    at the class level, together with a list (conventional name: `__params__').
+    See Parameter.__init__ on how to instantiate a Parameter descriptor.
+
+    Subclasses ought also to define two static methods: `func' and `init'. The
+    first is the actual function that accepts an argument together with the same
+    number of parameters as in __params__. The second is used to get initial
+    estimates for the Levenberg-Marquardt leastsq minimizer used to fit this
+    model.
+
+    From that point on, any instance of this class acts as the function `func'
+    itself, with the only difference that it automatically performs partial
+    application for those Parameter attributes that are being assigned a value.
+    Example:
+
+    # expon.func(x, A, B) is A * exp(B * x)
+    >>> expon(1, 2, -1)
+    0.73575888234288467
+    >>> expon.A = 2
+    >>> expon(1, -1)
+    0.73575888234288467
+    '''
+    def __init__(self, *args, **kwargs):
+        keys = [p.name for p in self.__params__]
+        for k in keys:
+            if k not in kwargs:
+                kwargs[k] = None
+        kwargs.update(zip(keys, args)) # positional args set the leading parameters
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self.goftest = tuple([None] * 3)
+        self.residtest = tuple([None] * 2)
+        self.Rsquared = None
+    def __call__(self, x, *args):
+        '''
+        See class method `func'
+        '''
+        fargs = self.params()
+        N = len(filter(None, fargs))
+        if N + len(args) > len(fargs):
+            raise TypeError('{} accepts only {} '
+                    'parameters'.format(self.__class__.__name__, len(fargs)))
+        for a in args:
+            idx = fargs.index(None)
+            fargs[idx] = a
+        fargs = tuple(fargs)
+        return self.func(x, *fargs)
+    def fit(self, x, y, ye, **kwargs):
+        '''
+        Fits this parametric model to observations (x_i, y_i). Uncertainty in
+        the y-estimates can be specified with argument `ye'. Additional keyword
+        arguments are passed to scipy.optimize.curve_fit which in turn passes
+        them to scipy.optimize.leastsq.
+        '''
+        fp0 = self.init(x, y)
+        fargs = self.params()
+        p0 = []
+        for a, p in zip(fargs, fp0):
+            if a is None:
+                p0.append(p)
+        p0 = tuple(p0)
+        return curve_fit(self, x, y, sigma=ye, p0=p0, **kwargs)
+    def gof(self, x, y, ye):
+        '''
+        Computes GoF test statistics and other diagnostic tests
+
+        Returns:
+        --------
+        - GoF test: Chi^2, p-value, and ddof
+        - Normality of residuals: K^2 and p-value
+        - Coefficient of determination R^2
+        '''
+        res = {}
+        resid = y - self(x)
+        chisq = np.sum(((resid) / ye) ** 2)
+        ddof = len(x) - len(filter(None, self.errors())) # minus the number of estimated parameters
+        chisq_pvalue = chisqprob(chisq, ddof)
+        gof = (chisq, chisq_pvalue, ddof)
+        resid = normaltest(resid)
+        ym = y.mean()
+        SStot = np.sum((y - ym) ** 2)
+        SSerr = np.sum((y - self(x)) ** 2)
+        Rsquared = 1.0 - SSerr / SStot
+# Besides being buggy, this test for homoscedasticity is supposed to work only
+# for linear regressions, hence it is not suited for our case, but I'll keep it
+# here until I figure out an alternative. Remember to uncomment the import for
+# OLS on top.
+#        regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
+#        LM = regresults.rsquared
+#        LM_pvalue = chisqprob(LM, len(x) - ddof)
+#        white = (LM, LM_pvalue)
+#        return gof, resid, white
+        return gof, resid, Rsquared
+    def __str__(self):
+        name = self.__class__.__name__
+        prep = []
+        for p in self.params():
+            if p is not None:
+                prep.append('%3.4g' % p)
+            else:
+                prep.append('*')
+        return '{}({})'.format(name, ','.join(prep))
+    def __repr__(self):
+        return '<{} object at 0x{}>'.format(str(self), '%x' % id(self))
+    def summary(self, **kwargs):
+        '''
+        Returns a summary of this model
+        '''
+        s = StringIO()
+        print >> s, ''
+        print >> s, 'General information'
+        print >> s, '-------------------'
+        print >> s, 'model: %s' % self.name.capitalize()
+        print >> s, 'date: %s' % dt.datetime.now()
+        for item in kwargs.items():
+            print >> s, '{}: {}'.format(*map(str, item))
+        print >> s, ''
+        print >> s, 'Model parameters'
+        print >> s, '----------------'
+        for p, val, err in zip(self.__params__, self.params(), self.errors()):
+            print >> s, '{}: {} ± {}'.format(p.name, _orNA(val), _orNA(err))
+        chi, p, ddof = self.goftest
+        print >> s, ''
+        print >> s, 'Fit results'
+        print >> s, '-----------'
+        print >> s, 'Goodness-of-fit: Chi-squared = {}, p = {}, ddof = {}'.format(
+                _orNA(chi, '%5.2f'), _orNA(p, '%8.4e'), _orNA(ddof, '%d'))
+        D, p = self.residtest
+        print >> s, 'Normality of residuals: K-squared = {}, p = {}'.format(
+                _orNA(D, '%5.2f'), _orNA(p, '%8.4e'))
+        print >> s, 'Coefficient of Determination: {}'.format(
+                _orNA(self.Rsquared, '%5.2f'))
+        return s.getvalue()
+
+class Expon(ParametricModel):
+    '''
+    y = A * exp( -(x / B)) + C
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    B = Parameter('B', __params__)
+    C = Parameter('C', __params__)
+    name = 'exponential'
+    @staticmethod
+    def func(x, a, b, c):
+        return a * np.exp(-(x / b)) + c
+    @staticmethod
+    def init(x, y):
+        a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
+        b0 = x.max() / 10.0
+        c0 = y.min()
+        return (a0, b0, c0)
+    def fit(self, x, y, ye, **kwargs):
+        return super(Expon, self).fit(x, y, ye, **kwargs)
+
+class StretchedExpon(ParametricModel):
+    '''
+    y = A * exp(-(x / tau) ** beta) + C
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    tau = Parameter('tau', __params__)
+    beta = Parameter('beta', __params__)
+    C = Parameter('C', __params__)
+    name = 'stretched exponential'
+    @staticmethod
+    def func(x, a, tau, beta, c):
+        return a * np.exp(- (x / tau) ** beta) + c
+    @staticmethod
+    def init(x, y):
+        a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
+        tau0 = x.max() / 10.0
+        c0 = y.min()
+        return (a0, tau0, 0.5, c0)
+    def fit(self, x, y, ye, **kwargs):
+        return super(StretchedExpon, self).fit(x, y, ye, **kwargs)
+    def summary(self, **kwargs):
+        mrt = self.mrt(self.tau, self.beta)
+        kwargs['Mean relaxation time <tau>'] = '%5.2f days' % mrt
+        return super(StretchedExpon, self).summary(**kwargs)
+    def mrt(self, tau, beta):
+        return (tau / beta) * gamma(beta ** -1)
+
+class PowerLaw(ParametricModel):
+    '''
+    y = A * x ** B
+    '''
+    __params__ = []
+    A = Parameter('A', __params__)
+    B = Parameter('B', __params__)
+    name = 'power-law'
+    @staticmethod
+    def func(x, a, b):
+        return a * x ** b
+    @staticmethod
+    def init(x, y):
+        return (1, y.ptp() / x.ptp())
+# NR says this code is more robust against roundoff errors, but presently it
+# does not work. Bummer.
+#    def fit(self, x, y, ye, **kwargs):
+#        x, y, ye = self._removezeros(x, y, ye)
+#        ye = ye / y
+#        x = np.log(x)
+#        y = np.log(y)
+#        S = np.sum(ye ** -1)
+#        Sx = np.sum(x / ye)
+#        Sy = np.sum(y / ye)
+#        t = (ye ** -1) * (x - Sx / S)
+#        Stt = np.sum(t ** 2)
+#        b = Stt ** -1 * np.sum((y * t) / ye)
+#        a = np.exp((Sy - Sx * b) / S)
+#        a_var = S ** -1 * (1 + Sx ** 2 / (S * Stt))
+#        b_var = Stt ** -1
+#        ab_covar = - Sx / Stt
+#        pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
+#        return (a, b), pcov
+    def fit(self, x, y, ye, **kwargs):
+        '''
+        Fit by linear least squares of log-transformed data
+        '''
+        x, y, ye = self._removezeros(x, y, ye)
+        ye = (ye / y) ** 2
+        x = np.log(x)
+        y = np.log(y)
+        S = np.sum(ye ** -1)
+        Sx = np.sum(x / ye)
+        Sy = np.sum(y / ye)
+        Sxx = np.sum(x ** 2 / ye)
+        Sxy = np.sum((x * y) / ye)
+        Delta = S * Sxx - Sx ** 2
+        a = np.exp((Sxx * Sy - Sx * Sxy) / Delta)
+        b = (S * Sxy - Sx * Sy) / Delta
+        a_var = Sxx / Delta
+        b_var = S / Delta
+        ab_covar = - Sx / Delta
+        pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
+        return (a, b), pcov
+    def gof(self, x, y, ye):
+        '''
+        GoF of linear least squares of log-transformed data
+        '''
+        x, y, ye = self._removezeros(x, y, ye)
+        ye = (ye / y)
+        x = np.log(x)
+        y = np.log(y)
+        yp = np.log(self.A) + self.B * x
+        chisq = np.sum(((yp - y) / ye) ** 2)
+        ddof = len(x) - len(filter(None, self.errors())) # minus the number of estimated parameters
+        chisq_pvalue = chisqprob(chisq / 2., ddof)
+        resid = normaltest(y - yp)
+        ym = y.mean()
+        SStot = np.sum((y - ym) ** 2)
+        SSerr = np.sum((y - yp) ** 2)
+        Rsquared = 1.0 - SSerr / SStot
+        return (chisq, chisq_pvalue, ddof), resid, Rsquared
+    @staticmethod
+    def _removezeros(x, y, ye):
+        idx = x > 0
+        return x[idx], y[idx], ye[idx]
+
+if __name__ == '__main__':
+
+    import matplotlib.pyplot as pp
+    import scale
+
+    model = StretchedExpon()
+
+    a = 2
+    tau = 100
+    beta = .5
+    c = 0.
+    s = 0.1
+    xmax = 1000
+    x = np.linspace(0, xmax, 50)
+    y = model(x, a, tau, beta, c) + np.random.randn(len(x)) * s
+
+    pest, pcov = model.fit(x, y, s)
+
+    model.setparams(*zip(pest, np.sqrt(np.diag(pcov))))
+
+    xx = np.linspace(0, xmax, 1000)
+    yy = model(xx)
+
+    pp.errorbar(x, y, s, fmt='. ', color='k', ecolor='none', label='data')
+    pp.plot(xx, yy, 'r-', label='Stretch. Exp. fit')
+    pp.xscale('power', exponent=beta)
+    pp.yscale('log')
+
+    pp.legend()
+    gof, resid, Rsquared = model.gof(x, y, s)
+    model.goftest = gof
+    model.residtest = resid
+    model.Rsquared = Rsquared
+    print model.summary()
+    chi, p, ddof = gof
+    pp.text(200, 1, r'$\chi^2 = %.2f,\, p-{\rm value} = %5.2g,\,'
+            r'{\rm ddof} = %d,\, R^2 = %.2f$'
+            % (chi, p, ddof, Rsquared),
+            fontsize=16)
+    pp.show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/scale.py
@@ -0,0 +1,68 @@
+from matplotlib.scale import ScaleBase, register_scale
+from matplotlib.transforms import Transform, nonsingular
+from matplotlib.ticker import LinearLocator, Formatter
+from math import ceil, floor
+import numpy as np
+
+class PowerScale(ScaleBase):
+    name = 'power'
+    def __init__(self, axis, **kwargs):
+        ScaleBase.__init__(self)
+        exponent = kwargs.pop('exponent')
+        if exponent <= 0:
+            raise ValueError('exponent must be positive')
+        self.exponent = exponent
+    def get_transform(self):
+        return PowerTransform(self.exponent)
+    def set_default_locators_and_formatters(self, axis):
+        axis.set_major_locator(PowerLocator(self.exponent))
+        axis.set_major_formatter(PowerFormatter(self.exponent))
+        axis.set_minor_formatter(PowerFormatter(self.exponent))
+
+class PowerLocator(LinearLocator):
+    def __init__(self, exponent, **kwargs):
+        LinearLocator.__init__(self, **kwargs)
+        self.exponent = exponent
+        self.numticks = 5
+    def __call__(self):
+        vmin, vmax = self.axis.get_view_interval()
+        vmin, vmax = nonsingular(vmin, vmax, expander=0.05)
+        vmin = vmin ** self.exponent
+        vmax = vmax ** self.exponent
+        if vmax < vmin:
+            vmin, vmax = vmax, vmin
+
+        ticklocs = np.linspace(vmin, vmax, num=self.numticks, endpoint=True)
+        return self.raise_if_exceeds(ticklocs ** (1.0 / self.exponent))
+
+class PowerFormatter(Formatter):
+    def __init__(self, exponent):
+        self.exponent = exponent
+    def __call__(self, x, pos=None):
+        return u'%.2g' % (x ** (1.0 / self.exponent))
+
+class PowerTransform(Transform):
+    input_dims = 1
+    output_dims = 1
+    is_separable = True
+    def __init__(self, exponent):
+        Transform.__init__(self)
+        self.exponent = exponent
+    def transform(self, a):
+        return a ** self.exponent
+    def inverted(self):
+        return PowerTransform(1.0 / self.exponent)
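+    # round trip: with exponent 0.5, transform(9.0) == 3.0 and the inverted
+    # transform maps 3.0 back to 9.0 (a quick sanity check, not from the source)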
+
+register_scale(PowerScale)
+
+if __name__ == '__main__':
+    from pylab import *
+    import numpy as np
+    tau = 20
+    beta = 0.5
+    x = np.linspace(0, 100, num=10)
+    y = np.exp(-(x / tau) ** beta)
+    plot(x, y, 'o ', mfc='none', mew=2)
+    xscale('power', exponent=beta)
+    yscale('log', basey=10)
+    show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+#:vim:ts=python:
+
+''' functions for computing the average activity rate of a single cohort '''
+
+# TODO: adapt to sparse matrices!
+
+import os
+import sys
+from argparse import ArgumentParser, FileType
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+import datetime as dt
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+namespaces = {
+    'main': 0,
+    'talk': 1,
+    'user': 2,
+    'user_talk' : 3,
+    'wikipedia' : 4,
+    'wikipedia_talk' : 5,
+    'file' : 6,
+    'file_talk' : 7,
+    'mediawiki' : 8,
+    'mediawiki_talk' : 9,
+    'template' : 10,
+    'template_talk' : 11,
+    'help' : 12,
+    'help_talk' : 13,
+    'category' : 14,
+    'category_talk' : 15,
+    'portal' : 100,
+    'portal_talk' : 101,
+    'book' : 108,
+    'book_talk' : 109
+}
+
+MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
+
+def estimaterate(edits, step):
+    '''
+    This function takes the daily edit history of an individual editor and a
+    step parameter, and estimates the daily activity of the editor. It returns
+    the daily rates averaged over strides of `step' days.
+    '''
+    N = len(edits)
+    if N % step:
+        NN = np.ceil(N / float(step)) * step
+        tmp = np.zeros((NN,), dtype=edits.dtype)
+        tmp[:N] = edits
+        edits = tmp
+    return edits.reshape((-1, step)).sum(axis=-1) / float(step)
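+# e.g. estimaterate(np.array([1, 2, 3, 4, 5]), 2) gives [1.5, 3.5, 2.5]: the
+# history is zero-padded to a whole number of strides before averaging
+# (illustrative values)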
| 57 | + |
| 58 | +def itercycles(npzarchive, every, users=None, onlyns=None): |
| 59 | + ''' |
| 60 | + Iterates over the archive or over given list of users and returns estimated |
| 61 | + activity life cycle (see estimaterate()) |
| 62 | + |
| 63 | + For parameters, see computerates |
| 64 | + ''' |
| 65 | + for uid in (users or npzarchive.files): |
| 66 | + data = npzarchive[uid].view(np.recarray) |
| 67 | + idx = data.ns >= 0 # let's filter out junk (i.e. edits to virtual ns) |
| 68 | + days = data.day[idx] |
| 69 | + edits = data.edits[idx] |
| 70 | + ns = data.ns[idx] |
| 71 | + days = days - days.min() |
| 72 | + shape = (days.max() + 1, MAX_NS + 1) |
| 73 | + M = coo_matrix((edits, (days, ns)), shape=shape) |
| 74 | + if onlyns is not None: |
| 75 | + M = M.tocsc()[:, onlyns] |
| 76 | + M = M.tocsr() |
| 77 | + rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every) |
| 78 | + yield np.c_[np.arange(len(rates)) * every, rates] |
| 79 | + |
| 80 | +def average(ratesbyday): |
| 81 | + ''' |
| 82 | + Computes average cycle with standard errors. Takes in input a dictionary |
| 83 | + returned by groupbydayssince() |
| 84 | + ''' |
| 85 | + all_days = sorted(ratesbyday.keys()) |
| 86 | + result = deque() |
| 87 | + for d in all_days: |
| 88 | + s = ratesbyday[d] |
| 89 | + sqN = np.sqrt(len(s)) |
| 90 | + result.append((d, np.mean(s), np.std(s)/np.sqrt(len(s)), len(s))) |
| 91 | + return np.asarray(result) |
| 92 | + |
| 93 | +def groupbyday(npzarchive, every, users=None, onlyns=None): |
| 94 | + ''' |
| 95 | + Estimates editors' activity rates and groups the estimates by the number |
| 96 | + of days elapsed since editor registration (day 0). |
| 97 | + |
| 98 | + For parameters, see computerates(). |
| 99 | + ''' |
| 100 | + tmp = {} |
| 101 | + for cyclearr in itercycles(npzarchive, every, users, onlyns): |
| 102 | + for d, r in cyclearr: |
| 103 | + try: |
| 104 | + tmp[d].append(r) |
| 105 | + except KeyError: |
| 106 | + tmp[d] = deque([r]) |
| 107 | + return tmp |
| 108 | + |
| 109 | +# NOTE: not used right now |
| 110 | +def lifetimes(npzarchive, users=None): |
| 111 | + ''' |
| 112 | + Returns the distribution of account lifetimes over an archive. Can take |
| 113 | + an optional list of user ids to restrict the sample to a specific group |
| 114 | + of editors. |
| 115 | + ''' |
| 116 | + lt = deque() |
| 117 | + for uid in (users or npzarchive.files): |
| 118 | + days, edits = npzarchive[uid].T |
| 119 | + lt.append(days.ptp()) |
| 120 | + return np.asarray(lt) |
| 121 | + |
| 122 | +# NOTE: not used right now |
| 123 | +def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity): |
| 124 | + now = dt.datetime.now().toordinal() |
| 125 | + epoch = dt.datetime(1970,1,1).toordinal() |
| 126 | + unix_now = now - epoch |
| 127 | + inactives = deque() |
| 128 | + for uid in npzarchive.files: |
| 129 | + days, edits = npzarchive[uid].T |
| 130 | + if days.ptp() <= inactivity: |
| 131 | + continue |
| 132 | + unix_last = days[-1] |
| 133 | + if (unix_now - unix_last) > inactivity: |
| 134 | + tot_edits = float(edits.sum()) |
| 135 | + tot_days = float(days.ptp() - inactivity) |
| 136 | + activity = tot_edits / tot_days * 365.0 |
| 137 | + if minimum_activity < activity < maximum_activity: |
| 138 | + inactives.append(uid) |
| 139 | + return inactives |
| 140 | + |
| 141 | +def computerates(fn, every, users=None, onlyns=None): |
| 142 | + ''' |
| 143 | + Returns an array of average activity vs. days since registration, with |
| 144 | + standard errors of the average. |
| 145 | + |
| 146 | + Parameters |
| 147 | + ---------- |
| 148 | + every - average the daily rates over strides of `every' days |
| 149 | + users - compute rates only for these users (default: all in archive) |
| 150 | + onlyns - count edits only in this list of namespaces (default: all) |
| 151 | + ''' |
| 152 | + npzarchive = np.load(fn) |
| 153 | + rates = average(groupbyday(npzarchive, every, users, onlyns)) |
| 154 | + return rates |
| 155 | + |
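# Sketch of typical usage of computerates (the file name is hypothetical):
#
#   rates = computerates('cohort.npz', 30, onlyns=[0, 1])
#   # columns: day since registration, mean rate, standard error, sample size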
| 156 | +parser = ArgumentParser(description=__doc__) |
| 157 | +parser.add_argument('data_file', metavar='data') |
| 158 | +parser.add_argument('output_file', metavar='output', type=FileType('w')) |
| 159 | +parser.add_argument('-every', type=int, help='default: average over %(default)d days', |
| 160 | + default=30, metavar='NUM') |
| 161 | +parser.add_argument('-ns', type=int, action='append', help='select only these NS', |
| 162 | + dest='only') |
| 163 | + |
| 164 | +def main(ns): |
| 165 | + rates = computerates(ns.data_file, ns.every, onlyns=ns.only) |
| 166 | + if ns.output_file.isatty(): |
| 167 | + print >> sys.stderr, 'writing to standard output' |
| 168 | + np.savetxt(ns.output_file, rates, fmt='%f') |
| 169 | + if not ns.output_file.isatty(): |
| 170 | + print '%s: output saved to %s' % (__prog__, ns.output_file.name) |
| 171 | + |
| 172 | +if __name__ == '__main__': |
| 173 | + ns = parser.parse_args() |
| 174 | + main(ns) |
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/__init__.py |
Index: trunk/tools/wsor/editor_lifecycle/ratesnobots.sql |
— | — | @@ -0,0 +1,9 @@ |
| 2 | +select |
| 3 | + rev_user, |
| 4 | + min(rev_timestamp), |
| 5 | + max(rev_timestamp), |
| 6 | + count(*) |
| 7 | +from revision left join halfak.bot_20110711 |
| 8 | +on rev_user = user_id |
| 9 | +where user_id is null and rev_user > 0 |
| 10 | +group by rev_user; |
Index: trunk/tools/wsor/editor_lifecycle/rateperedits.py |
— | — | @@ -0,0 +1,69 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | +''' plots number of edits vs. activity rate ''' |
| 5 | + |
| 6 | +import re |
| 7 | +import os |
| 8 | +import sys |
| 9 | + |
| 10 | +from argparse import ArgumentParser |
| 11 | +from itertools import groupby |
| 12 | + |
| 13 | +import numpy as np |
| 14 | +import matplotlib.pyplot as pp |
| 15 | + |
| 16 | +__prog__ = os.path.basename(__file__) |
| 17 | + |
| 18 | +parser = ArgumentParser(description=__doc__) |
| 19 | +parser.add_argument('data_file', metavar='data') |
| 20 | + |
| 21 | +def main(ns): |
| 22 | + |
| 23 | + # detect extension and load data |
| 24 | + _, ext = os.path.splitext(ns.data_file) |
| 25 | + if re.match('^\.npy$', ext, re.I): |
| 26 | + data = np.load(ns.data_file) |
| 27 | + elif re.match('^\.tsv$', ext, re.I) or re.match('^\.txt$', ext, re.I): |
| 28 | + data = np.loadtxt(ns.data_file, delimiter='\t') |
| 29 | + else: |
| 30 | + print >> sys.stderr, '%s: error: could not determine file type '\ |
| 31 | + '(.npy, .tsv, .txt accepted)' % __prog__ |
| 32 | + sys.exit(1) |
| 33 | + # sort data |
| 34 | + data[:,1] = np.floor(np.log2(data[:,1])) |
| 35 | + idx = np.argsort(data[:,1]) |
| 36 | + data = data[idx] |
| 37 | + |
| 38 | + # group by exponential binning |
| 39 | + positions = [] |
| 40 | + x = [] |
| 41 | + for p, subiter in groupby(data, lambda k : k[1]): |
| 42 | + positions.append(p) |
| 43 | + x.append(np.asarray([r for r, e in subiter ])) |
| 44 | + positions = np.asarray(positions) |
| 45 | + |
| 46 | + # make boxplot on log-log scale |
| 47 | + widths = 2 ** np.arange(0, len(x)) |
| 48 | + ax = pp.gca() |
| 49 | + pp.boxplot(x, positions=2 ** positions, sym=' ', whis=1.8, bootstrap=5000, |
| 50 | + widths=widths) |
| 51 | + pp.xscale('log') |
| 52 | + pp.yscale('log') |
| 53 | + |
| 54 | + # add reference line |
| 55 | + x = np.array([1e2, 1e6]) |
| 56 | + m = 1e-8 |
| 57 | + pp.plot(x, m * x, 'k-', alpha=.75, lw=2) |
| 58 | + |
| 59 | + # decorate |
| 60 | + pp.setp(ax.lines, color='k') |
| 61 | + pp.xlim(1,2e6) |
| 62 | + pp.ylim(1e-9,.1) |
| 63 | + pp.xlabel('edits') |
| 64 | + pp.ylabel(r'activity (${\rm sec} ^ {-1}$)') |
| 65 | + pp.draw() |
| 66 | + pp.show() |
| 67 | + |
| 68 | +if __name__ == '__main__': |
| 69 | + ns = parser.parse_args() |
| 70 | + main(ns) |
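The binning step in `main` floors the base-2 logarithm of the edit-count
column, so editors whose edit counts share the same power of two fall into the
same box. A minimal sketch of that step in isolation, with made-up counts:

    import numpy as np

    edits = np.array([3, 5, 17, 40, 1000])
    print np.floor(np.log2(edits))   # -> [ 1.  2.  4.  5.  9.]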
Index: trunk/tools/wsor/editor_lifecycle/README.rst |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +============ |
| 3 | +README |
| 4 | +============ |
| 5 | + |
| 6 | +--------- |
| 7 | +workflow |
| 8 | +--------- |
| 9 | + |
| 10 | +This package is a collection of Python and shell scripts that assist in |
| 11 | +creating and analyzing data on the editor life cycle. |
| 12 | + |
| 13 | +Sample selection |
| 14 | +---------------- |
| 15 | + |
| 16 | +TBD |
| 17 | + |
| 18 | +Edit activity data collection |
| 19 | +----------------------------- |
| 20 | + |
| 21 | +First use `fetchrates` to download the rate data from the MySQL database. The |
| 22 | +script takes a user_id as input and stores the rate data in a file called |
| 23 | +<user_id>.npy; it can be run in parallel. The result is one NPY file per |
| 24 | +sampled user, as sketched below. |
| 25 | + |
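For example, assuming `fetchrates` takes the user id as its sole argument (a
sketch, not necessarily the script's exact interface)::

    import subprocess
    from multiprocessing import Pool

    def fetch(uid):
        # each call writes <uid>.npy in the current directory
        subprocess.check_call(['fetchrates', str(uid)])

    Pool(4).map(fetch, user_ids)  # user_ids: the sampled user ids
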
| 26 | +Cohort selection |
| 27 | +---------------- |
| 28 | + |
| 29 | +See the docstring in `mkcohort`. |
| 30 | + |
| 31 | +Cohort analysis |
| 32 | +--------------- |
| 33 | + |
| 34 | +See `graphlife`, `fitting`, `fitting_batch.sh`, and `relax`. |
| 35 | + |
Index: trunk/tools/wsor/editor_lifecycle/fitting |
— | — | @@ -0,0 +1,218 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# coding: utf-8 |
| 4 | +# :vim:ft=python |
| 5 | + |
| 6 | +''' editor lifecycle data fitting tool ''' |
| 7 | + |
| 8 | +import sys |
| 9 | +import os |
| 10 | +import numpy as np |
| 11 | +import matplotlib.pyplot as pp |
| 12 | +from matplotlib.font_manager import FontProperties |
| 13 | +from argparse import ArgumentParser |
| 14 | +from scipy.optimize import curve_fit |
| 15 | + |
| 16 | +from lifecycle.models import Expon, PowerLaw, StretchedExpon |
| 17 | +import lifecycle.scale |
| 18 | + |
| 19 | +__prog__ = os.path.basename(os.path.abspath(__file__)) |
| 20 | + |
| 21 | +_maxfev = 100000 |
| 22 | + |
| 23 | +parent = ArgumentParser(add_help=False) |
| 24 | +parent.add_argument('data_files', metavar='DATA', nargs='+') |
| 25 | +parent.add_argument('-output', dest='output_file', metavar='FILE') |
| 26 | +parent.add_argument('-title') |
| 27 | +group = parent.add_mutually_exclusive_group() |
| 28 | +group.add_argument('-loglog', action='store_true') |
| 29 | +group.add_argument('-logpow', action='store_true') |
| 30 | +parent.add_argument('-constrain', choices=['head', 'tail', 'both']) |
| 31 | +parent.add_argument('-batch', action='store_true', help='do not show graphics') |
| 32 | +parent.add_argument('-force', action='store_true', help='force overwrite') |
| 33 | +parent.add_argument('-minsize', type=int) |
| 34 | + |
| 35 | +parser = ArgumentParser(description=__doc__) |
| 36 | +subparsers = parser.add_subparsers(help='Parametric models supported') |
| 37 | + |
| 38 | +parser_expon = subparsers.add_parser('expon', parents=[parent]) |
| 39 | +parser_expon.set_defaults(modelclass=Expon) |
| 40 | + |
| 41 | +parser_stretch = subparsers.add_parser('stretchedexp', parents=[parent]) |
| 42 | +parser_stretch.set_defaults(modelclass=StretchedExpon) |
| 43 | + |
| 44 | +parser_power = subparsers.add_parser('powerlaw', parents=[parent]) |
| 45 | +parser_power.set_defaults(modelclass=PowerLaw) |
| 46 | + |
| 47 | +class DataError(Exception): |
| 48 | + pass |
| 49 | + |
| 50 | +class TooFewDataError(DataError): |
| 51 | + pass |
| 52 | + |
| 53 | +class EmptyDataError(DataError): |
| 54 | + pass |
| 55 | + |
| 56 | +def plotfit(model, x, y, ye, label='data', **kwargs): |
| 57 | + xx = np.linspace(x.min(), x.max(), endpoint=True, num=1000) |
| 58 | + yy = model(xx) |
| 59 | + kwargs['ecolor'] = 'none' |
| 60 | + kwargs['ls'] = 'none' |
| 61 | + pp.errorbar(x, y, ye / 2, label=label, **kwargs) |
| 62 | + model_label = model.name.split() |
| 63 | + if len(model_label) > 1: |
| 64 | + model_label[1] = model_label[1][:3] + '.' |
| 65 | + model_label = ' '.join(model_label[:2]).capitalize() |
| 66 | + c = kwargs.get('color', 'r') |
| 67 | + pp.plot(xx, yy, '--', color=c, label='{} fit'.format(model_label), lw=2.5) |
| 68 | + if ns.loglog: |
| 69 | + pp.xscale('log') |
| 70 | + pp.yscale('log') |
| 71 | + elif ns.logpow: |
| 72 | + pp.xscale('power', exponent=model.beta) |
| 73 | + pp.yscale('log') |
| 74 | + pp.legend(loc='best', prop=FontProperties(size='x-small')) |
| 75 | + if ns.title is not None: |
| 76 | + pp.title(ns.title) |
| 77 | + pp.xlabel('Days since registration') |
| 78 | + pp.ylabel('Edits/day') |
| 79 | + |
| 80 | +def plotresid(model, x, y, label='data', **kwargs): |
| 81 | + r = model(x) - y |
| 82 | +# rm = r[True - np.isinf(r)].max() |
| 83 | +# r /= np.abs(rm) |
| 84 | + pp.axhline(y=0, c='k') |
| 85 | + kwargs['ls'] = ':' |
| 86 | + pp.plot(x, r, label=label, **kwargs) |
| 87 | + pp.title('Fit residuals') |
| 88 | + pp.xlabel('Days since registration') |
| 89 | +# pp.ylabel(r'Relative residual $\xi / \max{|\xi|}$') |
| 90 | +# pp.ylim(-1,1) |
| 91 | + pp.draw() |
| 92 | + |
| 93 | +def _testoverwrite(*files): |
| 94 | + exit_flag = False |
| 95 | + for fn in files: |
| 96 | + if os.path.exists(fn): |
| 97 | + exit_flag = True |
| 98 | + print '%s: error: cannot overwrite %s' % (__prog__, fn) |
| 99 | + if exit_flag: |
| 100 | + sys.exit(1) |
| 101 | + |
| 102 | +# TODO add usecols and option for taking only observations based on minimum |
| 103 | +# sample size |
| 104 | + |
| 105 | +def fit(data_file, modelclass, constrain=None, minsize=None): |
| 106 | + ''' |
| 107 | + Fits a model to data. Data are read from `data_file` and the model is |
| 108 | + specified as a `modelclass` (see `lifecycle.models`). The data file should |
| 109 | + contain a (N,4) array where columns are x, y, ye (errors) and n (sample |
| 110 | + sizes -- number of users on which the average rate y is computed). |
| 111 | + |
| 112 | + Constraints |
| 113 | + ----------- |
| 114 | + If `modelclass` is `Expon` or `StretchedExpon`, the value of some model |
| 115 | + parameters can be fixed before performing the fit. To this end use the |
| 116 | + `constrain` argument, which accepts any of the strings 'head', 'tail', |
| 117 | + or 'both', with the following meaning: |
| 118 | + |
| 119 | + * head : fix A to the rate observed closest to x = 0 |
| 120 | + * tail : fix C to the minimum observed rate |
| 121 | + * both : apply both constraints |
| 122 | + ''' |
| 123 | + x, y, ye, n = np.loadtxt(data_file, unpack=True) |
| 124 | + idx = ye > 0 |
| 125 | + if minsize: |
| 126 | + idx = idx * (n > minsize) |
| 127 | + if not np.any(idx): |
| 128 | + raise EmptyDataError() |
| 129 | + if np.sum(idx) < len(modelclass.__params__): |
| 130 | + raise TooFewDataError() |
| 131 | + x = x[idx] |
| 132 | + y = y[idx] |
| 133 | + ye = ye[idx] |
| 134 | + model = modelclass() |
| 135 | + if constrain in ['head', 'both']: |
| 136 | + model.A = y[np.argmin(np.abs(x))] |
| 137 | + if constrain in ['tail', 'both']: |
| 138 | + model.C = y.min() |
| 139 | + pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev) |
| 140 | + if not np.isscalar(pcov): |
| 141 | + perr = np.sqrt(np.diag(pcov)) / 2. |
| 142 | + model.setparams(*zip(pest, perr)) |
| 143 | + else: |
| 144 | + model.setparams(*pest) |
| 145 | + gof, resid, Rsquared = model.gof(x, y, ye) |
| 146 | + model.goftest = gof |
| 147 | + model.residtest = resid |
| 148 | + model.Rsquared = Rsquared |
| 149 | + print model.summary(dataset=data_file, observations=len(x)) |
| 150 | + return x, y, ye, model |
| 151 | + |
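# Example call (the data file name is hypothetical): fit a stretched
# exponential with both endpoints pinned to the data, keeping only
# observations averaged over more than 10 users:
#
#   x, y, ye, model = fit('cohort.tsv', StretchedExpon, constrain='both',
#                         minsize=10)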
| 152 | +def main(ns): |
| 153 | + # collect the usable line markers from matplotlib |
| 154 | + from matplotlib.lines import lineMarkers as markers |
| 155 | + markers = dict(filter( |
| 156 | + lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing', |
| 157 | + markers.items())).keys() |
| 158 | + colors = 'bgcmykw' |
| 159 | + |
| 160 | + data = [] |
| 161 | + models = [] |
| 162 | + labels = [] |
| 163 | + |
| 164 | + # fit all datasets |
| 165 | + for f in ns.data_files: |
| 166 | + try: |
| 167 | + x, y, ye, model = fit(f, ns.modelclass, ns.constrain, ns.minsize) |
| 168 | + except TooFewDataError: |
| 169 | + print >> sys.stderr, '%s: warning: too few data: %s' % (__prog__, f) |
| 170 | + continue |
| 171 | + except EmptyDataError: |
| 172 | + print >> sys.stderr, '%s: warning: no usable data: %s'% (__prog__, f) |
| 173 | + continue |
| 174 | + data.append((x,y,ye)) |
| 175 | + models.append(model) |
| 176 | + labels.append(os.path.splitext(f)[0]) |
| 177 | + |
| 178 | + # plot fits |
| 179 | + pp.figure() |
| 180 | + pp.hold(1) |
| 181 | + for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)): |
| 182 | + m = markers[i % len(markers)] |
| 183 | + c = colors[i % len(colors)] |
| 184 | + plotfit(model, x, y, ye, label, marker=m, color=c) |
| 185 | + |
| 186 | + # plot residuals |
| 187 | + pp.figure() |
| 188 | + pp.hold(1) |
| 189 | + for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)): |
| 190 | + m = markers[i % len(markers)] |
| 191 | + c = colors[i % len(colors)] |
| 192 | + plotresid(model, x, y, label, marker=m, color=c) |
| 193 | + |
| 194 | + # save figures |
| 195 | + if ns.output_file is not None: |
| 196 | + fn, ext = os.path.splitext(ns.output_file) |
| 197 | + fmt = ext[1:] |
| 198 | + if ns.batch and fmt.lower() != 'pdf': |
| 199 | + print '%s: error: batch mode supports only PDF format' % __prog__ |
| 200 | + sys.exit(1) |
| 201 | + resid_output_file = fn + '_residuals' + ext |
| 202 | + if not ns.force: |
| 203 | + _testoverwrite(ns.output_file, resid_output_file) |
| 204 | + pp.figure(1) |
| 205 | + pp.savefig(ns.output_file, format=fmt) |
| 206 | + print '%s: output saved to %s' % (__prog__, ns.output_file) |
| 207 | + pp.figure(2) |
| 208 | + pp.savefig(resid_output_file, format=fmt) |
| 209 | + print '%s: output saved to %s' % (__prog__, resid_output_file) |
| 210 | + |
| 211 | + pp.show() |
| 212 | + |
| 213 | +if __name__ == '__main__': |
| 214 | + ns = parser.parse_args() |
| 215 | +# if ns.batch: |
| 216 | +# import matplotlib |
| 217 | +# matplotlib.use('PDF') |
| 218 | +# import matplotlib.pyplot as pp |
| 219 | + main(ns) |
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting |
___________________________________________________________________ |
Added: svn:executable |
1 | 220 | + * |