r94723 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r94722 | r94723 | r94724 >
Date:01:49, 17 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
added again package
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/MANIFEST.in (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/README.rst (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/fetchrates (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/fitting (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/fitting_batch.sh (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle/__init__.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle/models.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/lifecycle/scale.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/mkcohort (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/graphlife (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/rates (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/rateperedits.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/ratesnobots.sql (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/relax (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/setup.py (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/sge (added) (history)
  • /trunk/tools/wsor/editor_lifecycle/sge/rates.sh (added) (history)

Diff

Index: trunk/tools/wsor/editor_lifecycle/mkcohort
@@ -0,0 +1,73 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+
 5+''' finding better cohorts by real user activity '''
 6+
 7+import os
 8+import sys
 9+import csv
 10+import numpy as np
 11+
 12+from collections import deque
 13+from itertools import groupby
 14+from argparse import ArgumentParser, FileType
 15+from dateutil.parser import parser as DateParser
 16+
 17+__prog__ = os.path.basename(__file__)
 18+
 19+def yearkey(date):
 20+ return date.year,
 21+
 22+def monthkey(date):
 23+ return date.year, date.month
 24+
 25+def daykey(date):
 26+ return date.year, date.month, date.day
 27+
 28+parser = ArgumentParser(description=__doc__)
 29+parser.add_argument('data', type=FileType('r'))
 30+group = parser.add_mutually_exclusive_group(required=1)
 31+group.add_argument('--day', dest='keyfunc', const=daykey, action='store_const')
 32+group.add_argument('--month', dest='keyfunc', const=monthkey, action='store_const')
 33+group.add_argument('--year', dest='keyfunc', const=yearkey, action='store_const')
 34+
 35+parser.add_argument('--minspan', default=3600, type=int)
 36+parser.add_argument('--mincount', default=2, type=int)
 37+
 38+date_parser = DateParser()
 39+fieldnames = ['user', 'max_timestamp', 'min_timestamp', 'editcount']
 40+
 41+def activity(rowiterator, mincount, minspan):
 42+ for row in rowiterator:
 43+ user = float(row['user'])
 44+ min_timestamp = date_parser.parse(row['min_timestamp'])
 45+ max_timestamp = date_parser.parse(row['max_timestamp'])
 46+ editcount = float(row['editcount'])
 47+ span = (max_timestamp - min_timestamp).total_seconds()
 48+ if (editcount > mincount) and (span > minspan):
 49+ yield user, editcount / span
 50+
 51+def magnitude(x):
 52+ return np.floor(np.log10(x))
 53+
 54+if __name__ == '__main__':
 55+ ns = parser.parse_args()
 56+ reader = csv.DictReader(ns.data, fieldnames=fieldnames, delimiter='\t',
 57+ quoting=csv.QUOTE_NONE)
 58+
 59+ def cohortdatefunc(row):
 60+ return ns.keyfunc(date_parser.parse(row['min_timestamp']))
 61+
 62+ for cohortdate, subiter in groupby(reader, cohortdatefunc):
 63+ data = np.asarray(list(activity(subiter, ns.mincount, ns.minspan)))
 64+ if len(data) > 0:
 65+ user, rate = data.T
 66+ idx = rate.argsort()
 67+ data = list(data[idx])
 68+ kf = lambda k : magnitude(k[1])
 69+ for m, subsubiter in groupby(data, kf):
 70+ outrow = deque([ '-'.join(map(str, cohortdate)), '%d' % m ])
 71+ for u, e in subsubiter:
 72+ outrow.append('%d' % u)
 73+ print '\t'.join(outrow)
 74+
Property changes on: trunk/tools/wsor/editor_lifecycle/mkcohort
___________________________________________________________________
Added: svn:executable
175 + *
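
A note on the binning above: mkcohort computes each user's activity rate as
editcount / span (edits per second over the active span) and buckets users by
the order of magnitude of that rate. A minimal sketch of the computation, not
part of this revision and using made-up row values:

    import numpy as np
    from dateutil.parser import parser as DateParser

    date_parser = DateParser()
    # hypothetical input row (tab-separated fields in the real data file)
    row = {'user': '42', 'min_timestamp': '2005-01-01 00:00:00',
           'max_timestamp': '2005-02-01 00:00:00', 'editcount': '270'}
    span = (date_parser.parse(row['max_timestamp'])
            - date_parser.parse(row['min_timestamp'])).total_seconds()
    rate = float(row['editcount']) / span   # ~1.0e-4 edits per second
    print np.floor(np.log10(rate))          # -4.0: the user lands in the 10^-4 bucket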
Index: trunk/tools/wsor/editor_lifecycle/setup.py
@@ -0,0 +1,16 @@
 2+from distutils.core import setup
 3+
 4+setup(
 5+ name='lifecycle',
 6+ description='WMF summer of research project',
 7+ version='0.0.0',
 8+ author='Giovanni Luca Ciampaglia',
 9+ author_email='gciampaglia@wikimedia.org',
 10+ packages=['lifecycle'],
 11+ scripts=[
 12+ 'fetchrates',
 13+ 'fitting',
 14+ 'relax',
 15+ 'mkcohort',
 16+ ]
 17+)
Index: trunk/tools/wsor/editor_lifecycle/fetchrates
@@ -0,0 +1,126 @@
 2+#!/usr/bin/python
 3+
 4+import sys
 5+import os
 6+import numpy as np
 7+
 8+from zipfile import ZipFile
 9+from contextlib import closing
 10+from tempfile import mkdtemp
 11+from oursql import connect
 12+from argparse import ArgumentParser
 13+from datetime import datetime
 14+
 15+from lifecycle.rates import computerates
 16+
 17+parser = ArgumentParser(description=__doc__)
 18+parser.add_argument('input_path', metavar='cohort')
 19+parser.add_argument('-config', dest='config_file')
 20+parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
 21+parser.add_argument('-v','--verbose', action='store_true')
 22+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
 23+ default=30, metavar='NUM')
 24+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
 25+ dest='only')
 26+parser.add_argument('--save-cohort', dest='rates_only', action='store_false')
 27+
 28+query = """
 29+select unix_timestamp(rev_timestamp)/86400.0, page_namespace
 30+from revision r join page p
 31+on r.rev_page = p.page_id
 32+where rev_user = ?
 33+order by rev_timestamp
 34+"""
 35+
 36+__prog__ = os.path.basename(os.path.abspath(__file__))
 37+
 38+# def iterateoversets(cursor):
 39+# yield list(cursor)
 40+# while cursor.nextset():
 41+# yield list(cursor)
 42+
 43+def process(rows, user_id, output_dir=os.path.curdir):
 44+ if len(rows) == 0:
 45+ print >> sys.stderr, '%s: error: empty revision history for user %d' % (__prog__,
 46+ user_id)
 47+ return
 48+
 49+ rev_time, rev_ns = np.asfarray(rows).T
 50+ m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
 51+ uns = sorted(np.unique(rev_ns))
 52+ bins_time = np.arange(m,M + 1)
 53+ bins_ns = uns + [uns[-1] + 1]
 54+ rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time, bins_ns))
 55+ I,J = np.nonzero(rates)
 56+ data = [ (days[i],uns[j],rates[i,j]) for i, j in zip(I,J) ] # in coordinate format
 57+ dtype=np.dtype([('day', int), ('ns', int), ('edits', int)])
 58+ data = np.asarray(data, dtype=dtype)
 59+ out_path = os.path.join(output_dir, '%d.npy' % user_id)
 60+ np.save(out_path, data)
 61+ return out_path
 62+
 63+def main(ns):
 64+ # get mysql client configuration file
 65+ mycnf = os.path.expanduser('~/.my.cnf')
 66+ if ns.config_file is None and not os.path.exists(mycnf):
 67+ print >> sys.stderr, '%s: no config file specified and $HOME/.my.cnf'\
 68+ ' not found' % __prog__
 69+ sys.exit(1)
 70+ elif ns.config_file is None:
 71+ ns.config_file = mycnf
 72+
 73+ # test output directory exists
 74+ if not os.path.exists(ns.output_dir):
 75+ print >> sys.stderr, '%s: output directory does not exist: %s' % (
 76+ __prog__, ns.output_dir)
 77+ sys.exit(1)
 78+ if not os.path.isdir(ns.output_dir):
 79+ print >> sys.stderr, '%s: not a directory: %s' % (__prog__, ns.output_dir)
 80+ sys.exit(1)
 81+ # read user ids from cohort file, create zip archive and temp dir
 82+ with closing(open(ns.input_path)) as f:
 83+ line = f.readline().strip()
 84+ if line:
 85+ user_ids = map(int, line.split('\t'))
 86+ else:
 87+ print >> sys.stderr, '%s: error: empty input file: %s' % (__prog__, ns.input_path)
 88+ sys.exit(1)
 89+ zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
 90+ zip_path = os.path.join(ns.output_dir, zip_path)
 91+ temp_dir = mkdtemp()
 92+
 93+ with closing(ZipFile(zip_path, 'w')) as zf:
 94+
 95+ # connect and run query
 96+ conn = connect(read_default_file=ns.config_file)
 97+ for uid in user_ids:
 98+ # compute rates and save to file
 99+ with conn.cursor() as cursor:
 100+ cursor.execute(query, (uid,))
 101+ rows = list(cursor)
 102+ path = process(rows, uid, temp_dir)
 103+ if path is None:
 104+ continue
 105+ zf.write(path, os.path.basename(path))
 106+ os.remove(path)
 107+
 108+ # remove temp dir
 109+ os.removedirs(temp_dir)
 110+
 111+ # compute rates for this cohort and save them to file
 112+ rate_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.tsv'
 113+ rate_path = os.path.join(ns.output_dir, rate_path)
 114+ rates = computerates(zip_path, ns.every, onlyns=ns.only)
 115+ np.savetxt(rate_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
 116+ if ns.rates_only:
 117+ os.remove(zip_path)
 118+ print '%s: output saved to %s' % (datetime.now(), rate_path)
 119+ else:
 120+ print '%s: output saved to %s, cohort data to %s' % (datetime.now(),
 121+ rate_path, zip_path)
 122+
 123+if __name__ == '__main__':
 124+ # get arguments from command line
 125+ ns = parser.parse_args()
 126+ main(ns)
 127+
Property changes on: trunk/tools/wsor/editor_lifecycle/fetchrates
___________________________________________________________________
Added: svn:executable
1128 + *
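
Each per-user file that fetchrates writes is a structured array in coordinate
format, with fields ('day', 'ns', 'edits') and one record per (day, namespace)
pair with a nonzero edit count; the files are then bundled into one .npz per
cohort. A minimal sketch of reading such an archive back (the file name is
hypothetical), not part of this revision:

    import numpy as np

    npz = np.load('2005-01.npz')            # per-user arrays keyed by user id
    uid = npz.files[0]
    data = npz[uid].view(np.recarray)       # fields: day, ns, edits
    print data.day.ptp(), data.edits.sum()  # active span in days, total edits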
Index: trunk/tools/wsor/editor_lifecycle/MANIFEST.in
@@ -0,0 +1,5 @@
 2+include *.py
 3+include *.sh
 4+include *.sql
 5+include sge/*.sh
 6+include obsolete/*
Index: trunk/tools/wsor/editor_lifecycle/relax
@@ -0,0 +1,81 @@
 2+#!/usr/bin/python
 3+#:vim:ft=python
 4+
 5+''' batch model fitting (usable with xargs)'''
 6+
 7+import re
 8+import os
 9+import sys
 10+import numpy as np
 11+from argparse import ArgumentParser
 12+
 13+from lifecycle.models import StretchedExpon
 14+
 15+__prog__ = os.path.basename(__file__)
 16+
 17+parser = ArgumentParser(description=__doc__)
 18+parser.add_argument('data_file', metavar='data')
 19+parser.add_argument('-m', '--min-size', type=int, default=0)
 20+parser.add_argument('-c', '--constrain', choices=['head', 'tail', 'both'])
 21+parser.add_argument('--maxfev', type=int, default=10000)
 22+parser.add_argument('--debug', action='store_true')
 23+parser.add_argument('--split-name', action='store_true', help='split input file'
 24+ ' name into date and a rate part.')
 25+
 26+if __name__ == '__main__':
 27+ # parse command line
 28+ ns = parser.parse_args()
 29+
 30+ # read data, filter data
 31+ x, y, ye, n = np.loadtxt(ns.data_file, unpack=1)
 32+ idx = (ye > 0) * (n > ns.min_size)
 33+ if idx.sum() == 0:
 34+ print >> sys.stderr, '%s: error: no data meeting requirements in %s'\
 35+ % (__prog__, ns.data_file)
 36+ sys.exit(1)
 37+ if idx.sum() < 4:
 38+ print >> sys.stderr, '%s: error: non identifiable data in %s'\
 39+ % (__prog__, ns.data_file)
 40+ sys.exit(1)
 41+
 42+ # create model, set constraints
 43+ model = StretchedExpon()
 44+ if ns.constrain in ['head', 'both']:
 45+ model.A = y[np.argmin(np.abs(x))]
 46+ if ns.constrain in ['tail', 'both']:
 47+ model.C = y.min()
 48+
 49+ # fit model
 50+ try:
 51+ pest, pcov = model.fit(x[idx], y[idx], ye[idx], maxfev=ns.maxfev, warning=False)
 52+ except ValueError, e:
 53+ print >> sys.stderr, '%s: error: "%s" when fitting %s' % (__prog__,
 54+ e.message, ns.data_file)
 55+ if ns.debug:
 56+ raise
 57+ else:
 58+ sys.exit(1)
 59+ if np.isscalar(pcov) or np.isinf(pcov).any():
 60+ print >> sys.stderr, '%s: error: bad covariance matrix in %s' % (\
 61+ __prog__, ns.data_file)
 62+ sys.exit(1)
 63+
 64+ # compute errors, MRT
 65+ perr = np.sqrt(np.diag(pcov)) / 2.
 66+ model.setparams(*zip(pest,perr))
 67+ mrt = model.mrt(model.tau, model.beta)
 68+ N = len(model.__params__)
 69+ params = np.empty((N * 2 + 1,), dtype=float)
 70+ params[:N] = [ model.A, model.tau, model.beta, model.C ]
 71+ params[N:2*N] = map(lambda k : k or np.nan, [ model.A_err, model.tau_err,
 72+ model.beta_err, model.C_err ])
 73+ params[-1] = mrt
 74+
 75+ # print output line
 76+ key, _ = os.path.splitext(ns.data_file)
 77+ if ns.split_name:
 78+ key = key.split('_')
 79+ else:
 80+ key = [ key ]
 81+ params = map(lambda k : '%12.5g' % k, params)
 82+ print '\t'.join(key + params)
Property changes on: trunk/tools/wsor/editor_lifecycle/relax
___________________________________________________________________
Added: svn:executable
183 + *
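
relax summarizes each fit with the mean relaxation time of the stretched
exponential, <tau> = (tau / beta) * Gamma(1 / beta), which equals the integral
of exp(-(t / tau) ** beta) over t from 0 to infinity. A quick numerical check
of that identity, not part of this revision:

    import numpy as np
    from scipy.special import gamma
    from scipy.integrate import quad

    tau, beta = 100.0, 0.5                  # example parameter values
    closed_form = (tau / beta) * gamma(1.0 / beta)
    numeric, _ = quad(lambda t: np.exp(-(t / tau) ** beta), 0, np.inf)
    print closed_form, numeric              # both evaluate to ~200.0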
Index: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
@@ -0,0 +1,196 @@
 2+#!/usr/bin/python
 3+# coding: utf-8
 4+# :vim:ft=python
 5+
 6+# TODO: obsolete
 7+
 8+''' creates cohort files, filtering out bots '''
 9+
 10+'''
 11+This script reads two files: a ZIP archive file and an index file, which is a
 12+tab-separated text file like the following:
 13+
 14+ 34 WojPob 20010129110725 2524
 15+ 94 AstroNomer 20010207222248 1532
 16+ 43 Lee Daniel Crocker 20010314020407 4388
 17+ 86 Stephen Gilbert 20010326191355 3599
 18+ 3 Tobias Hoevekamp 20010326202105 1903
 19+ 1273 Wathiik 20010510171751 1772
 20+ 3371 Arno 20010721180708 2700
 21+ 122 Ap 20010722201619 2137
 22+ 182 Rjstott 20010726102546 2602
 23+ 64 Uriyan 20010727141651 1634
 24+
 25+Where fields are: id, name, date, count. Dates are parsed using dateutil, so
 26+other formats are allowed too (e.g. 2010-01-29 11:07:25).
 27+
 28+The script will aggregate users based on the date field and will look up
 29+files of the form <id>.npy in the archive file. Each of these files contains the
 30+daily edits count for a single user, stored using the NumPy binary array
 31+format. A relative path within the ZIP archive can be specified from the command
 32+line with -P/--datapath. Once the data for a cohort (i.e. an aggregated group
 33+of users) have been collected, the script will compute the average activity rate
 34+since the first day of activity for all users in that cohort.
 35+
 36+The script produces two files for each cohort: a tab-separated values file with
 37+the cohort's average activity rate, and a compressed NumPy binary archive with the
 38+user data array files.
 39+
 40+For each discovered cohort, the script will print to the console the date of the
 41+cohort, how many users it contains, and how many suspected bot users it filtered
 42+out from the index. Use --bots to disable this check and always include them. The
 43+check is as follows: if the name contains the pattern 'bot' at the beginning or
 44+at the end of any word, it will be filtered out (e.g. "Botuser IV" will match,
 45+but "Francis Abbott" won't). If arguments -mincount or -maxcount (or both) are
 46+passed, the script will keep only users whose edit count is above the minimum
 47+count, or below the maximum count, or both.
 48+
 49+Please note that the index file must already be sorted by date, in order for the
 50+group-by-date aggregation to work. You can use `sort' from the command line,
 51+e.g.:
 52+
 53+ $~ sort -t$'\t' -k3 -h unsorted.tsv
 54+
 55+should sort file unsorted.tsv.
 56+'''
 57+
 58+import re
 59+import os
 60+import sys
 61+import csv
 62+import numpy as np
 63+from argparse import ArgumentParser, FileType
 64+from contextlib import closing
 65+from itertools import groupby
 66+from dateutil.parser import parser as DateParser
 67+from datetime import datetime
 68+from zipfile import ZipFile
 69+
 70+from rates import computerates
 71+
 72+__prog__ = os.path.basename(os.path.abspath(__file__))
 73+_botpat = r'\bbot|bot\b'
 74+_fields = ['id', 'name', 'date', 'count']
 75+
 76+def yearkey(date):
 77+ return date.year,
 78+
 79+def monthkey(date):
 80+ return date.year, date.month
 81+
 82+def daykey(date):
 83+ return date.year, date.month, date.day
 84+
 85+parser = ArgumentParser(description=__doc__)
 86+parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')
 87+parser.add_argument('archive_path', metavar='archive', help='data archive in ZIP '
 88+ 'format')
 89+group = parser.add_mutually_exclusive_group(required=1)
 90+group.add_argument('--year', help='group by year', action='store_const',
 91+ const=yearkey, dest='keyfunc')
 92+group.add_argument('--month', help='group by month', action='store_const',
 93+ const=monthkey, dest='keyfunc')
 94+group.add_argument('--day', help='group by day', action='store_const',
 95+ const=daykey, dest='keyfunc')
 96+parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
 97+parser.add_argument('-P', '--datapath', help='relative path of files within '
 98+ 'archive', default='')
 99+parser.add_argument('-mincount', type=int)
 100+parser.add_argument('-maxcount', type=int)
 101+parser.add_argument('-minperyear', type=int)
 102+parser.add_argument('-maxperyear', type=int)
 103+parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
 104+ 'console all actions, but do not produce any file')
 105+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
 106+ default=30, metavar='NUM')
 107+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
 108+ dest='only')
 109+
 110+dateparser = DateParser()
 111+
 112+# dummy ZipFile class in case we do not want do anything!
 113+class DummyZipFile:
 114+ def __init__(self, fn, mode):
 115+ pass
 116+ def close(self):
 117+ pass
 118+ def write(self, fn, *args):
 119+ pass
 120+
 121+if __name__ == '__main__':
 122+ ns = parser.parse_args()
 123+ reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
 124+ delimiter='\t')
 125+ archive = ZipFile(ns.archive_path)
 126+
 127+ def _keyfunc(row):
 128+ try:
 129+ date = dateparser.parse(row['date'])
 130+ except:
 131+ print row
 132+ raise
 133+
 134+ return ns.keyfunc(date)
 135+
 136+ # group by index by date of registration
 137+ for key, subiter in groupby(reader, _keyfunc):
 138+
 139+ # reset indices and define output file names from cohort period
 140+ tot_users = 0
 141+ tot_bots = 0
 142+ datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) -> '2010-01'
 143+ zipfn = '{}.npz'.format(datestr)
 144+ tsvfn = '{}.tsv'.format(datestr)
 145+
 146+ # if user wants to do a dry-run, replace the Zip files class with the
 147+ # dummy one
 148+ if ns.dry_run:
 149+ ZipFile = DummyZipFile
 150+
 151+ # for each user, determine if may go in cohort
 152+ with closing(ZipFile(zipfn, 'w')) as zf:
 153+ for row in subiter:
 154+
 155+ # compute user details (edit count, yearly activity rate, etc.)
 156+ # and other useful variables
 157+ user_id = row['id']
 158+ count = int(row['count'])
 159+ user_date = dateparser.parse(row['date'])
 160+ now_date = datetime.now()
 161+ activity_span = float((now_date - user_date).days) # in days
 162+ yearly_rate = count / activity_span * 365.0
 163+ bot_flag = re.search(_botpat, row['name'], re.I) is not None
 164+ tot_bots += bot_flag # update counts of bot matches
 165+
 166+ # define paths
 167+ basepath = '{}.npy'.format(user_id)
 168+ archivepath = os.path.join(ns.datapath, basepath)
 169+
 170+ # check cohort membership (keep if conjunction of all given
 171+ # criteria is true, that is, discard if any given criterion is
 172+ # false)
 173+ if ns.mincount is not None and count <= ns.mincount:
 174+ continue
 175+ if ns.maxcount is not None and count >= ns.maxcount:
 176+ continue
 177+ if ns.minperyear is not None and yearly_rate <= ns.minperyear:
 178+ continue
 179+ if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
 180+ continue
 181+ # user can turn this test off by passing --bots
 182+ if not ns.bots and bot_flag:
 183+ continue
 184+ try:
 185+ zf.writestr(basepath, archive.read(archivepath))
 186+ except KeyError:
 187+ print >> sys.stderr, '%s: warning: %s not in archive' %\
 188+ (__prog__, archivepath)
 189+ tot_users += 1
 190+
 191+ if tot_users > 0:
 192+ rates = computerates(zipfn, ns.every, onlyns=ns.only)
 193+ np.savetxt(tsvfn, rates, fmt='%f')
 194+
 195+ print '%s: %s, %s created (users: %5d, skipped bots %5d)' % (
 196+ __prog__, tsvfn, zipfn, tot_users, tot_bots)
 197+ sys.stdout.flush()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
___________________________________________________________________
Added: svn:executable
1198 + *
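
The bot filter described in the docstring above boils down to the regular
expression r'\bbot|bot\b' applied case-insensitively to the user name. A
minimal sketch, not part of this revision, run on the docstring's own examples:

    import re

    _botpat = r'\bbot|bot\b'
    for name in ['Botuser IV', 'rambot', 'Francis Abbott']:
        print name, re.search(_botpat, name, re.I) is not None
    # 'Botuser IV' and 'rambot' match; 'Francis Abbott' does not, because its
    # 'bot' substring has word characters on both sides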
Index: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
@@ -0,0 +1,61 @@
 2+#!/usr/bin/python
 3+# vim:ft=python:
 4+# coding : utf-8
 5+
 6+# TODO: obsolete
 7+
 8+''' fetches a cohort based on year of registration and editing activity '''
 9+
 10+from argparse import ArgumentParser
 11+from oursql import connect
 12+import os
 13+import sys
 14+import datetime as dt
 15+import csv
 16+
 17+prog = os.path.basename(os.path.abspath(__file__))
 18+
 19+parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
 20+parser.add_argument('registration_year', metavar='year', type=int)
 21+parser.add_argument('min_activity', metavar='minedits', type=int)
 22+parser.add_argument('max_activity', metavar='maxedits', type=int)
 23+parser.add_argument('-c', '--config', dest='config_file')
 24+parser.add_argument('-l', '--limit', type=int)
 25+
 26+query = '''
 27+select
 28+ user_id,
 29+ user_name,
 30+ user_registration,
 31+ user_editcount
 32+from user u left join user_groups ug
 33+on u.user_id = ug.ug_user
 34+where
 35+ (ug_group <> 'bot' or ug_user is null)
 36+ and year(user_registration) = ?
 37+ and user_editcount > ?
 38+ and user_editcount < ?
 39+'''
 40+
 41+if __name__ == '__main__':
 42+ ns = parser.parse_args()
 43+ if ns.min_activity >= ns.max_activity:
 44+ print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog
 45+ sys.exit(1)
 46+ if ns.registration_year < 2001 or ns.registration_year > dt.datetime.now().year:
 47+ print >> sys.stderr, '%s: error: illegal year: %d' % (prog,
 48+ ns.registration_year)
 49+ sys.exit(1)
 50+
 51+ if ns.limit is not None:
 52+ query += 'limit %d' % ns.limit
 53+
 54+ if ns.config_file is None:
 55+ ns.config_file = os.path.expanduser('~/.my.cnf')
 56+
 57+ conn = connect(read_default_file=ns.config_file)
 58+ writer = csv.writer(sys.stdout, dialect='excel-tab')
 59+ cursor = conn.cursor()
 60+ cursor.execute(query, (ns.registration_year, ns.min_activity, ns.max_activity))
 61+ for row in cursor:
 62+ writer.writerow(row)
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
___________________________________________________________________
Added: svn:executable
163 + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
@@ -0,0 +1,14 @@
 2+-- user ID, user name, timestamp of first edit and edit count of all registered
 3+-- users that are not flagged bots. N.B. there might still be unflagged bots.
 4+
 5+select
 6+ rev_user as user_id,
 7+ rev_user_text as user_name,
 8+ min(rev_timestamp) as first_timestamp,
 9+ count(rev_timestamp) as editcount
 10+from
 11+ revision r use index (usertext_timestamp) left join user_groups g
 12+on r.rev_user = g.ug_user
 13+where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0
 14+group by rev_user_text
 15+-- limit 100
Index: trunk/tools/wsor/editor_lifecycle/obsolete/rates
@@ -0,0 +1,78 @@
 2+#!/usr/bin/python
 3+#:vim:ts=python:
 4+
 5+''' compute editor lifecycle '''
 6+
 7+import re
 8+import os, sys # sys was missing; needed below for sys.stderr and sys.exit
 9+from argparse import ArgumentParser
 10+import numpy as np
 11+from collections import deque
 12+import datetime as dt
 13+
 14+from lifecycle.rates import *
 15+
 16+__prog__ = os.path.basename(os.path.abspath(__file__))
 17+
 18+parser = ArgumentParser(description=__doc__)
 19+parser.add_argument('data_file', metavar='data')
 20+parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
 21+parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
 22+parser.add_argument('-key')
 23+parser.add_argument('-every', type=int, help='default: %(default)d days',
 24+ default=30, metavar='NUM')
 25+parser.add_argument('-inactivity', type=int, default=180, help='default: '
 26+ '%(default)d days', metavar='NUM')
 27+parser.add_argument('-all', dest='dump_all', action='store_true')
 28+
 29+
 30+def main(ns):
 31+ if ns.key is None:
 32+ m = re.match('(.*?)\.npz', ns.data_file, re.I)
 33+ if m is not None:
 34+ ns.key = m.groups()[0]
 35+ else:
 36+ print >> sys.stderr, '%s: cannot determine key from file name: %s'\
 37+ % (__prog__, ns.data_file)
 38+ sys.exit(1)
 39+ if ns.minimum_activity >= ns.maximum_activity:
 40+ print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
 41+ sys.exit(1)
 42+
 43+ # load data
 44+ npzarchive = np.load(ns.data_file)
 45+
 46+ if ns.dump_all:
 47+ fn = mkfn('cycles', ns, 'npz')
 48+ values_iter = itercycles(npzarchive, ns.every)
 49+ keys = npzarchive.files
 50+ tmp = dict(zip(keys, list(values_iter)))
 51+ np.savez(fn, **tmp)
 52+ print '%s: output saved to %s' % (__prog__, fn)
 53+ else:
 54+ # compute lifetime distribution
 55+ lt = lifetimes(npzarchive)
 56+
 57+ # compute inactive subgroups
 58+ inactive_users = find_inactives(npzarchive, ns.inactivity, ns.minimum_activity,
 59+ ns.maximum_activity)
 60+
 61+ ratesbyday = groupbyday(npzarchive, ns.every)
 62+ ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
 63+
 64+ avg_all = averagecycle(ratesbyday)
 65+ avg_inact = averagecycle(ratesbyday_inact)
 66+
 67+ lens = [ len(npzarchive.files), len(inactive_users) ]
 68+
 69+ names = ['lt', 'len', 'all', 'inact' ]
 70+ arrs = [ lt, lens, avg_all, avg_inact ]
 71+
 72+ for n, a in zip(names, arrs):
 73+ fn = '%s_%s.%s' % (ns.key, n, 'tsv')
 74+ np.savetxt(fn, a)
 75+ print '%s: output saved to %s' % (__prog__, fn)
 76+
 77+if __name__ == '__main__':
 78+ ns = parser.parse_args()
 79+ main(ns)
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/rates
___________________________________________________________________
Added: svn:executable
180 + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
@@ -0,0 +1,90 @@
 2+#!/usr/bin/python
 3+
 4+''' plot editor life cycle '''
 5+
 6+import sys
 7+import numpy as np
 8+from argparse import ArgumentParser
 9+import os
 10+
 11+__prog__ = os.path.basename(os.path.abspath(__file__))
 12+
 13+parser = ArgumentParser(description=__doc__)
 14+parser.add_argument('data_files', metavar='data', nargs='+')
 15+parser.add_argument('-l', '--label', metavar='TEXT', action='append',
 16+ dest='labels_list')
 17+parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
 18+parser.add_argument('-batch', action='store_true', help='uses PDF backend')
 19+parser.add_argument('-title')
 20+parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
 21+
 22+if __name__ == '__main__':
 23+ ns = parser.parse_args()
 24+
 25+ # checks
 26+ if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
 27+ print >> sys.stderr, '%s: error: please provide as many labels '\
 28+ 'as data files' % __prog__
 29+ sys.exit(1)
 30+
 31+ # import pyplot, make lists of colors and markers
 32+ if ns.batch:
 33+ import matplotlib
 34+ matplotlib.use('PDF')
 35+ import matplotlib.pyplot as pp
 36+ from matplotlib.lines import lineMarkers as markers
 37+ markers = dict(filter(
 38+ lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing',
 39+ markers.items())).keys()
 40+ colors = 'krbgm'
 41+
 42+ # create figure and axes
 43+ fig = pp.figure()
 44+ ax = pp.axes([.1, .1, .85, .8])
 45+
 46+ # add lines
 47+ N = len(ns.data_files)
 48+ for i in xrange(N):
 49+ data_file = ns.data_files[i]
 50+ if ns.labels_list is not None:
 51+ label = ns.labels_list[i]
 52+ else:
 53+ label = 'line-%d' % (i + 1)
 54+ color = colors[i % len(colors)]
 55+ marker= markers[i % len(markers)]
 56+ x, y, ye = np.loadtxt(data_file, unpack=1)
 57+ ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
 58+ mec=color, ls=':', label=label)
 59+
 60+ ax.legend(loc=2)
 61+ ax.set_xlabel('days since registration')
 62+ ax.set_ylabel('edits/day')
 63+ if ns.title is not None:
 64+ ax.set_title(ns.title)
 65+ ax.axis('tight')
 66+
 67+ # plot hist of lifetimes in inset axes
 68+ if ns.inset_data_file is not None:
 69+ lt = np.loadtxt(ns.inset_data_file)
 70+ inax = pp.axes([.55, .6, .35, .25], axisbg='none')
 71+ inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
 72+ for l in inax.xaxis.get_ticklabels():
 73+ l.set_rotation(30)
 74+ l.set_fontsize('x-small')
 75+ for l in inax.yaxis.get_ticklabels():
 76+ l.set_fontsize('x-small')
 77+ inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
 78+ inax.set_ylabel('no. of users older\nthan $x$ days',
 79+ fontsize='small')
 80+ inax.set_title('account lifetime')
 81+ inax.axis('tight')
 82+
 83+ pp.draw()
 84+ if ns.title is not None:
 85+ fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
 86+ else:
 87+ fn = 'output.' + ns.fmt
 88+ print 'output saved to %s' % fn
 89+
 90+ pp.savefig(fn, fmt=ns.fmt)
 91+ pp.show()
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
___________________________________________________________________
Added: svn:executable
192 + *
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
@@ -0,0 +1,14 @@
 2+#!/bin/bash
 3+
 4+# This script writes to standard output a list of registered, not-flagged-as-bot users,
 5+# sorted by time of first edit. Each item in the list comprises:
 6+#
 7+# 1. user_id
 8+# 2. user_name
 9+# 3. first_timestamp
 10+# 4. editcount
 11+#
 12+# For the SQL query, check file userlist.sql.
 13+
 14+srcdir=`dirname $(type -p $0)`
 15+mysql -BN < $srcdir/userlist.sql | sort -h -k3 -t $'\t'
Property changes on: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
___________________________________________________________________
Added: svn:executable
116 + *
Index: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
@@ -0,0 +1,43 @@
 2+#!/bin/bash
 3+
 4+# Applies the `fitting' script to a batch of files
 5+#
 6+# author: Giovanni Luca Ciampaglia <gciampaglia@wikimedia.org>
 7+#
 8+# USAGE: fitting_batch.sh file1 file2 file3 ...
 9+#
 10+# This will produce the normal console output that fitting produces; PDF plots
 11+# will be stored in file fit.pdf (the script refuses to run if fit.pdf
 12+# already exists; see the check below)
 13+
 14+if [[ -z `type -p fitting` ]] ; then
 15+ echo 'error: could not find fitting script. Check your PATH'
 16+ exit 1
 17+fi
 18+
 19+if [[ -e fit.pdf ]] ; then
 20+ echo 'error: cannot overwrite file fit.pdf'
 21+ exit 1
 22+fi
 23+
 24+O=`mktemp -d`
 25+models="expon powerlaw stretchedexp"
 26+files="$@"
 27+
 28+for file in $files ; do
 29+ for model in $models ; do
 30+ fitting $model -force -loglog -batch $file -o $O/${file%.*}_$model.pdf
 31+ echo
 32+ echo
 33+ done
 34+done
 35+
 36+pdfs=`ls $O/*.pdf | sort`
 37+
 38+gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=fit.pdf -dBATCH $pdfs &>/dev/null
 39+
 40+if [[ $? = 0 ]] ; then
 41+ echo 'images saved in fit.pdf'
 42+else
 43+ echo "error: problem saving fit.pdf. Individual image files in $O"
 44+fi
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting_batch.sh
___________________________________________________________________
Added: svn:executable
145 + *
Index: trunk/tools/wsor/editor_lifecycle/sge/rates.sh
@@ -0,0 +1,30 @@
 2+#! /bin/bash
 3+#$ -N rates
 4+#$ -j y
 5+#$ -o query.log
 6+#$ -t 1-NNNNN
 7+#$ -tc 10
 8+#$ -l sqlprocs-s1=1
 9+#$ -m n
 10+
 11+# Use this script to run the query on the toolserver using its Sun Grid Engine
 12+# batch job scheduling system. SGE will run this script in parallel for each
 13+# input file in input/, and place the output in (surprise..) output/
 14+#
 15+# Each input file will be fed to this script by SGE. However, SGE does not have
 16+# any way to tell how many input files you have to process, so you have to
 17+# manually change the comment line on top of this file to reflect this
 18+# information:
 19+#
 20+# #$ -t 1-NNNNN
 21+#
 22+# once you have done this, you can simply submit the job to SGE using qsub:
 23+#
 24+# ~$ qsub < rates.sh
 25+#
 26+# See man qsub for more info on that.
 27+
 28+uid=$((SGE_TASK_ID-1))
 29+infile=$uid.in
 30+export PATH=$HOME/local/bin:$PATH
 31+fetchrates -o $HOME/output $HOME/input/$infile
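
A minimal sketch, not part of this revision, of laying out the input/ directory
that the array job above expects (user ids are made up; fetchrates reads a
cohort file as a single line of tab-separated user ids):

    import os

    cohorts = [[101, 202, 303], [404, 505]]  # two hypothetical cohorts
    os.mkdir('input')
    for i, ids in enumerate(cohorts):
        f = open(os.path.join('input', '%d.in' % i), 'w')
        f.write('\t'.join(map(str, ids)) + '\n')
        f.close()
    # SGE task ids start at 1, hence uid=$((SGE_TASK_ID-1)) maps task 1 to
    # 0.in; with two input files the directive would read: #$ -t 1-2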
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/models.py
@@ -0,0 +1,371 @@
 2+# coding: utf8
 3+
 4+import numpy as np
 5+from scipy.stats import norm, chisqprob, normaltest
 6+from scipy.optimize import curve_fit
 7+from scipy.special import gamma
 8+from cStringIO import StringIO
 9+import datetime as dt
 10+#from scikits.statsmodels.api import OLS
 11+
 12+__all__ = ['Expon', 'PowerLaw', 'StretchedExpon' ]
 13+
 14+class Parameter(object):
 15+ '''
 16+ Class for parameter descriptors. Works with ParameterMixin
 17+ '''
 18+ def __init__(self, name, attrlist):
 19+ self.name = name # parameter name
 20+ for att in attrlist:
 21+ if att.name == name:
 22+ raise AttributeError('cannot add parameter {}'.format(name))
 23+ attrlist.append(self)
 24+ def __get__(self, instance, owner):
 25+ if instance is not None:
 26+ return instance.__dict__['_' + self.name]
 27+ return self
 28+ def __set__(self, instance, value):
 29+ try:
 30+ value, error = value
 31+ except TypeError:
 32+ value, error = value, None
 33+ instance.__dict__['_' + self.name] = value
 34+ instance.__dict__[self.name + '_err'] = error
 35+ def __repr__(self):
 36+ return '<Parameter {} at 0x{}>'.format(self.name, '%x' % id(self))
 37+
 38+class ParameterMixin(object):
 39+ '''
 40+ Class that lets you look up all Parameter instances in __params__
 41+ '''
 42+ def itererrors(self):
 43+ for p in self.__params__:
 44+ yield self.__getattribute__(p.name + '_err')
 45+ def errors(self):
 46+ return list(self.itererrors())
 47+ def iterparams(self):
 48+ '''
 49+ Returns an iterator over all parameters of this model
 50+ '''
 51+ for p in self.__params__:
 52+ yield self.__getattribute__(p.name)
 53+ def params(self):
 54+ '''
 55+ Returns a tuple of all parameters of this model
 56+ '''
 57+ return list(self.iterparams())
 58+ def setparams(self, *args):
 59+ '''
 60+ Sets unset parameters of this model to *args. Parameters that already
 61+ have a value assigned will *NOT* be modified by this method.
 62+ '''
 63+ keyf = lambda p : self.__getattribute__(p.name) is None
 64+ for p, a in zip(filter(keyf, self.__params__), args):
 65+ setattr(self, p.name, a)
 66+
 67+def _orNA(val, fmt='%8.5g'):
 68+ if val is not None:
 69+ return fmt % val
 70+ else:
 71+ return 'N/A'
 72+
 73+class ParametricModel(ParameterMixin):
 74+ '''
 75+ Callable class with Parameter descriptors. Subclasses of ParametricModel
 76+ ought to define, as class attributes, any number of Parameter descriptors,
 77+ together with a list (conventional name: `__params__'). See
 78+ Parameter.__init__ on how to instantiate a Parameter descriptor.
 79+
 80+ Subclasses ought also to define two static methods: `func' and `init'. The
 81+ first is the actual function that accepts an argument together with the same
 82+ number of parameters as in __params__. The second is used to get initial
 83+ estimates for the Levenberg-Marquardt leastsq minimizer used to fit this
 84+ model.
 85+
 86+ From that point on, any instance of this class acts as the function `func'
 87+ itself, with the only differences that it automatically performs partial
 88+ application for those Parameter attributes that are being assigned a value.
 89+ Example:
 90+
 91+ # expon.func(x, A, B) is A * exp(B * x)
 92+ >>> expon(1, 2, -1) # -> 0.73575888234288467, i.e. 2 * exp(-1)
 93+ >>> expon.A = 2
 94+ >>> expon(1, -1) # -> 0.73575888234288467
 95+ '''
 96+ def __init__(self, *args, **kwargs):
 97+ keys = [p.name for p in self.__params__]
 98+ for k in keys:
 99+ if k not in kwargs:
 100+ kwargs[k] = None
 101+ kwargs.update(zip(keys, args)) # positional args bind to the leftmost parameters
 102+ for k, v in kwargs.items():
 103+ setattr(self, k, v)
 104+ self.goftest = tuple([None] * 3)
 105+ self.residtest = tuple([None] * 2)
 106+ self.Rsquared = None
 107+ def __call__(self, x, *args):
 108+ '''
 109+ See class method `func'
 110+ '''
 111+ fargs = self.params()
 112+ N = len(filter(None, fargs))
 113+ if N + len(args) > len(fargs):
 114+ raise TypeError('{} accepts only {} '
 115+ 'parameters'.format(self.__class__.__name__, len(fargs)))
 116+ for a in args:
 117+ idx = fargs.index(None)
 118+ fargs[idx] = a
 119+ fargs = tuple(fargs)
 120+ return self.func(x, *fargs)
 121+ def fit(self, x, y, ye, **kwargs):
 122+ '''
 123+ Fits this parametric model to observations (x_i, y_i). Uncertainty in
 124+ the y-estimates can be specified with argument `ye'. Additional keyword
 125+ arguments are passed to scipy.optimize.curve_fit which in turn passes
 126+ them to scipy.optimize.leastsq.
 127+ '''
 128+ fp0 = self.init(x, y)
 129+ fargs = self.params()
 130+ p0 = []
 131+ for a, p in zip(fargs, fp0):
 132+ if a is None:
 133+ p0.append(p)
 134+ p0 = tuple(p0)
 135+ return curve_fit(self, x, y, sigma=ye, p0=p0, **kwargs)
 136+ def gof(self, x, y, ye):
 137+ '''
 138+ Computes GoF test statistics and other diagnostic tests
 139+
 140+ Returns:
 141+ --------
 142+ - GoF test: Chi^2, p-value, and ddof
 143+ - Normality of residuals: K^2 and p-value
 144+ '''
 145+ res = {}
 146+ resid = y - self(x)
 147+ chisq = np.sum(((resid) / ye) ** 2)
 148+ ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters
 149+ chisq_pvalue = chisqprob(chisq, ddof)
 150+ gof = (chisq, chisq_pvalue, ddof)
 151+ resid = normaltest(resid)
 152+ ym = y.mean()
 153+ SStot = np.sum((y - ym) ** 2)
 154+ SSerr = np.sum((y - self(x)) ** 2)
 155+ Rsquared = 1.0 - SSerr / SStot
 156+# Besides being buggy, this test for homoscedasticity is supposed to work only
 157+# for linear regressions, hence is not suited for our case, but I'll keep it
 158+# here until I figure out an alternative. Remember to uncomment the import for
 159+# OLS ontop.
 160+# regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
 161+# LM =regresults.rsquared
 162+# LM_pvalue = chisqprob(LM, len(x) - ddof)
 163+# white = (LM, LM_pvalue)
 164+# return gof, resid, white
 165+ return gof, resid, Rsquared
 166+ def __str__(self):
 167+ name = self.__class__.__name__
 168+ prep = []
 169+ for p in self.params():
 170+ if p is not None:
 171+ prep.append('%3.4g' % p)
 172+ else:
 173+ prep.append('*')
 174+ return '{}({})'.format(name, ','.join(prep))
 175+ def __repr__(self):
 176+ return '<{} object at 0x{}>'.format(str(self), '%x' % id(self))
 177+ def summary(self, **kwargs):
 178+ '''
 179+ Returns a summary of this model
 180+ '''
 181+ s = StringIO()
 182+ print >> s, ''
 183+ print >> s, 'General information'
 184+ print >> s, '-------------------'
 185+ print >> s, 'model: %s' % self.name.capitalize()
 186+ print >> s, 'date: %s' % dt.datetime.now()
 187+ for item in kwargs.items():
 188+ print >> s, '{}: {}'.format(*map(str, item))
 189+ print >> s, ''
 190+ print >> s, 'Model parameters'
 191+ print >> s, '----------------'
 192+ for p, val, err in zip(self.__params__, self.params(), self.errors()):
 193+ print >> s, '{}: {} ± {}'.format(p.name, _orNA(val), _orNA(err))
 194+ chi, p, ddof = self.goftest
 195+ print >> s, ''
 196+ print >> s, 'Fit results'
 197+ print >> s, '-----------'
 198+ print >> s, 'Goodness-of-fit: Chi-squared = {}, p = {}, ddof = {}'.format(
 199+ _orNA(chi, '%5.2f'), _orNA(p, '%8.4e'), _orNA(ddof, '%d'))
 200+ D, p = self.residtest
 201+ print >> s, 'Normality of residuals: K-squared = {}, p = {}'.format(
 202+ _orNA(D, '%5.2f'), _orNA(p, '%8.4e'))
 203+ print >> s, 'Coefficient of Determination: {}'.format(
 204+ _orNA(self.Rsquared, '%5.2f'))
 205+ return s.getvalue()
 206+
 207+class Expon(ParametricModel):
 208+ '''
 209+ y = A * exp( -(x / B)) + C
 210+ '''
 211+ __params__ = []
 212+ A = Parameter('A', __params__)
 213+ B = Parameter('B', __params__)
 214+ C = Parameter('C', __params__)
 215+ name = 'exponential'
 216+ @staticmethod
 217+ def func(x, a, b, c):
 218+ return a * np.exp(-(x / b)) + c
 219+ @staticmethod
 220+ def init(x, y):
 221+ a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
 222+ b0 = x.max() / 10.0
 223+ c0 = y.min()
 224+ return (a0, b0, c0)
 225+ def fit(self, x, y, ye, **kwargs):
 226+ return super(Expon, self).fit(x, y, ye, **kwargs)
 227+
 228+class StretchedExpon(ParametricModel):
 229+ '''
 230+ y = A * exp (-(t / tau) ** beta) + C
 231+ '''
 232+ __params__ = []
 233+ A = Parameter('A', __params__)
 234+ tau = Parameter('tau', __params__)
 235+ beta = Parameter('beta', __params__)
 236+ C = Parameter('C', __params__)
 237+ name = 'stretched exponential'
 238+ @staticmethod
 239+ def func(x, a, tau, beta, c):
 240+ return a * np.exp(- (x / tau) ** beta) + c
 241+ @staticmethod
 242+ def init(x, y):
 243+ a0 = y[np.argmin(np.abs(x))] # estimate for A = f(0)
 244+ tau0 = x.max() / 10.0
 245+ c0 = y.min()
 246+ return (a0, tau0, 0.5, c0)
 247+ def fit(self, x, y, ye, **kwargs):
 248+ return super(StretchedExpon, self).fit(x, y, ye, **kwargs)
 249+ def summary(self, **kwargs):
 250+ mrt = self.mrt(self.tau, self.beta)
 251+ kwargs['Mean relaxation time <tau>'] = '%5.2f days' % mrt
 252+ return super(StretchedExpon, self).summary(**kwargs)
 253+ def mrt(self, tau, beta):
 254+ return (tau / beta) * gamma(beta ** -1)
 255+
 256+class PowerLaw(ParametricModel):
 257+ '''
 258+ y = A * x ** B
 259+ '''
 260+ __params__ = []
 261+ A = Parameter('A', __params__)
 262+ B = Parameter('B', __params__)
 263+ name = 'power-law'
 264+ @staticmethod
 265+ def func(x, a, b):
 266+ return a * x ** b
 267+ @staticmethod
 268+ def init(x, y):
 269+ return (1, y.ptp()/x.ptp())
 270+# NR says this code is more robust against roundoff errors, but presently it
 271+# does not work. Bummer.
 272+# def fit(self, x, y, ye, **kwargs):
 273+# x, y, ye = self._removezeros(x, y, ye)
 274+# ye = ye / y
 275+# x = np.log(x)
 276+# y = np.log(y)
 277+# S = np.sum(ye ** -1)
 278+# Sx = np.sum(x / ye)
 279+# Sy = np.sum(y / ye)
 280+# t = (ye ** -1) * (x - Sx / S)
 281+# Stt = np.sum(t ** 2)
 282+# b = Stt ** -1 * np.sum((y * t) / ye)
 283+# a = np.exp((Sy - Sx * b) / S)
 284+# a_var = S ** -1 * (1 + Sx ** 2 / (S * Stt))
 285+# b_var = Stt ** -1
 286+# ab_covar = - Sx / Stt
 287+# pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
 288+# return (a, b), pcov
 289+ def fit(self, x, y, ye, **kwargs):
 290+ '''
 291+ Fit by linear least squares of log-transformed data
 292+ '''
 293+ x, y, ye = self._removezeros(x, y, ye)
 294+ ye = (ye / y) ** 2
 295+ x = np.log(x)
 296+ y = np.log(y)
 297+ S = np.sum(ye ** -1)
 298+ Sx = np.sum(x / ye)
 299+ Sy = np.sum(y / ye)
 300+ Sxx = np.sum(x ** 2 / ye)
 301+ Sxy = np.sum((x * y) / ye)
 302+ Delta = S * Sxx - Sx ** 2
 303+ a = np.exp((Sxx * Sy - Sx * Sxy) / Delta)
 304+ b = (S * Sxy - Sx * Sy) / Delta
 305+ a_var = Sxx / Delta
 306+ b_var = S / Delta
 307+ ab_covar = - Sx / Delta
 308+ pcov = np.asarray([[a_var, ab_covar], [ab_covar, b_var]])
 309+ return (a, b), pcov
 310+ def gof(self, x, y, ye):
 311+ '''
 312+ GoF of linear least squares of log-transformed data
 313+ '''
 314+ x, y, ye = self._removezeros(x, y, ye)
 315+ ye = (ye / y)
 316+ x = np.log(x)
 317+ y = np.log(y)
 318+ yp = np.log(self.A) + self.B * x
 319+ chisq = np.sum(((yp - y) / ye) ** 2)
 320+ ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters
 321+ chisq_pvalue = chisqprob(chisq / 2., ddof)
 322+ resid = normaltest(y - yp)
 323+ ym = y.mean()
 324+ SStot = np.sum((y - ym) ** 2)
 325+ SSerr = np.sum((y - yp) ** 2)
 326+ Rsquared = 1.0 - SSerr / SStot
 327+ return (chisq, chisq_pvalue, ddof), resid, Rsquared
 328+ @staticmethod
 329+ def _removezeros(x, y, ye):
 330+ idx = x > 0
 331+ return x[idx], y[idx], ye[idx]
 332+
 333+if __name__ == '__main__':
 334+
 335+ import matplotlib.pyplot as pp
 336+ import scale
 337+
 338+ model = StretchedExpon()
 339+
 340+ a = 2
 341+ tau = 100
 342+ beta = .5
 343+ c = 0.
 344+ s = 0.1
 345+ xmax = 1000
 346+ x = np.linspace(0, xmax, 50)
 347+ y = model(x, a, tau, beta, c) + np.random.randn(len(x)) * s
 348+
 349+ pest, pcov = model.fit(x, y, s)
 350+
 351+ model.setparams(*zip(pest, np.sqrt(np.diag(pcov))))
 352+
 353+ xx = np.linspace(0, xmax, 1000)
 354+ yy = model(xx)
 355+
 356+ pp.errorbar(x, y, s, fmt='. ', color='k', ecolor='none', label='data')
 357+ pp.plot(xx, yy, 'r-', label='Stretch. Exp. fit')
 358+ pp.xscale('power', exponent=beta)
 359+ pp.yscale('log')
 360+
 361+ pp.legend()
 362+ gof, resid, Rsquared = model.gof(x, y, s)
 363+ model.goftest = gof
 364+ model.residtest = resid
 365+ model.Rsquared = Rsquared
 366+ print model.summary()
 367+ chi, p, ddof = gof
 368+ pp.text(200, 1, r'$\chi^2 = %.2f,\, p-{\rm value} = %5.2g,\,'
 369+ r'{\rm ddof} = %d,\, R^2 = %.2f$'
 370+ % (chi,p,ddof, Rsquared),
 371+ fontsize=16)
 372+ pp.show()
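
The ParametricModel docstring above describes partial application: assigning a
value to a Parameter removes it from the call signature. A minimal sketch using
Expon (y = A * exp(-(x / B)) + C), not part of this revision:

    from lifecycle.models import Expon

    e = Expon()                  # A, B, C all free
    print e(0.0, 2.0, 1.0, 0.5)  # func(0, A=2, B=1, C=0.5) = 2 * exp(0) + 0.5 = 2.5
    e.A = 2.0                    # fix A; calls now supply only B and C
    print e(0.0, 1.0, 0.5)       # 2.5 again
    print e                      # Expon(2,*,*)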
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/scale.py
@@ -0,0 +1,68 @@
 2+from matplotlib.scale import ScaleBase, register_scale
 3+from matplotlib.transforms import Transform, nonsingular
 4+from matplotlib.ticker import LinearLocator, Formatter
 5+from math import ceil, floor
 6+import numpy as np
 7+
 8+class PowerScale(ScaleBase):
 9+ name = 'power'
 10+ def __init__(self, axis, **kwargs):
 11+ ScaleBase.__init__(self)
 12+ exponent = kwargs.pop('exponent')
 13+ if exponent <= 0:
 14+ raise ValueError('exponent must be positive')
 15+ self.exponent = exponent
 16+ def get_transform(self):
 17+ return PowerTransform(self.exponent)
 18+ def set_default_locators_and_formatters(self, axis):
 19+ axis.set_major_locator(PowerLocator(self.exponent))
 20+ axis.set_major_formatter(PowerFormatter(self.exponent))
 21+ axis.set_minor_formatter(PowerFormatter(self.exponent))
 22+
 23+class PowerLocator(LinearLocator):
 24+ def __init__(self, exponent, **kwargs):
 25+ LinearLocator.__init__(self, **kwargs)
 26+ self.exponent = exponent
 27+ self.numticks = 5
 28+ def __call__(self):
 29+ vmin, vmax = self.axis.get_view_interval()
 30+ vmin, vmax = nonsingular(vmin, vmax, expander = 0.05)
 31+ vmin = vmin ** self.exponent
 32+ vmax = vmax ** self.exponent
 33+ if vmax < vmin:
 34+ vmin, vmax = vmax, vmin
 35+
 36+ ticklocs = np.linspace(vmin, vmax, num=self.numticks, endpoint=True)
 37+ return self.raise_if_exceeds(ticklocs ** (1.0 / self.exponent))
 38+
 39+class PowerFormatter(Formatter):
 40+ def __init__(self, exponent):
 41+ self.exponent = exponent
 42+ def __call__(self, x, pos=None):
 43+ return u'%.2g' % (x ** (1.0 / self.exponent))
 44+
 45+class PowerTransform(Transform):
 46+ input_dims = 1
 47+ output_dims = 1
 48+ is_separable = True
 49+ def __init__(self, exponent):
 50+ Transform.__init__(self)
 51+ self.exponent = exponent
 52+ def transform(self, a):
 53+ return a ** self.exponent
 54+ def inverted(self):
 55+ return PowerTransform(1.0 / self.exponent)
 56+
 57+register_scale(PowerScale)
 58+
 59+if __name__ == '__main__':
 60+ from pylab import *
 61+ import numpy as np
 62+ tau = 20
 63+ beta = 0.5
 64+ x = np.linspace(0,100, num=10)
 65+ y = np.exp(-(x / tau) ** beta)
 66+ plot(x, y, 'o ', mfc='none', mew=2)
 67+ xscale('power', exponent=beta)
 68+ yscale('log', basey=10)
 69+ show()
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
@@ -0,0 +1,173 @@
 2+#!/usr/bin/python
 3+#:vim:ts=python:
 4+
 5+''' functions for computing average activity rate of a single cohort '''
 6+
 7+# TODO: adapt to sparse matrices!
 8+
 9+import os
 10+import sys
 11+from argparse import ArgumentParser, FileType
 12+import numpy as np
 13+from scipy.sparse import coo_matrix
 14+from collections import deque
 15+import datetime as dt
 16+
 17+__prog__ = os.path.basename(os.path.abspath(__file__))
 18+
 19+namespaces = {
 20+ 'main': 0,
 21+ 'talk': 1,
 22+ 'user': 2,
 23+ 'user_talk' : 3,
 24+ 'wikipedia' : 4,
 25+ 'wikipedia_talk' : 5,
 26+ 'file' : 6,
 27+ 'file_talk' : 7,
 28+ 'mediawiki' : 8,
 29+ 'mediawiki_talk' : 9,
 30+ 'template' : 10,
 31+ 'template_talk' : 11,
 32+ 'help' : 12,
 33+ 'help_talk' : 13,
 34+ 'category' : 14,
 35+ 'category_talk' : 15,
 36+ 'portal' : 100,
 37+ 'portal_talk' : 101,
 38+ 'book' : 108,
 39+ 'book_talk' : 109
 40+}
 41+
 42+MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
 43+
 44+def estimaterate(edits, step):
 45+ '''
 46+ This function takes the daily edit history of an individual editor and a
 47+ step parameter; it estimates the editor's daily activity rate, averaged
 48+ over consecutive windows of `step' days.
 49+ '''
 50+ N = len(edits)
 51+ if N % step:
 52+ NN = np.ceil(N / float(step)) * step
 53+ tmp = np.zeros((NN,), dtype=edits.dtype)
 54+ tmp[:N] = edits
 55+ edits = tmp
 56+ return edits.reshape((-1, step)).sum(axis=-1) / float(step)
 57+
 58+def itercycles(npzarchive, every, users=None, onlyns=None):
 59+ '''
 60+ Iterates over the archive or over given list of users and returns estimated
 61+ activity life cycle (see estimaterate())
 62+
 63+ For parameters, see computerates
 64+ '''
 65+ for uid in (users or npzarchive.files):
 66+ data = npzarchive[uid].view(np.recarray)
 67+ idx = data.ns >= 0 # let's filter out junk (i.e. edits to virtual ns)
 68+ days = data.day[idx]
 69+ edits = data.edits[idx]
 70+ ns = data.ns[idx]
 71+ days = days - days.min()
 72+ shape = (days.max() + 1, MAX_NS + 1)
 73+ M = coo_matrix((edits, (days, ns)), shape=shape)
 74+ if onlyns is not None:
 75+ M = M.tocsc()[:, onlyns]
 76+ M = M.tocsr()
 77+ rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every)
 78+ yield np.c_[np.arange(len(rates)) * every, rates]
 79+
 80+def average(ratesbyday):
 81+ '''
 82+ Computes the average cycle with standard errors. Takes as input a dictionary
 83+ returned by groupbyday()
 84+ '''
 85+ all_days = sorted(ratesbyday.keys())
 86+ result = deque()
 87+ for d in all_days:
 88+ s = ratesbyday[d]
 89+ sqN = np.sqrt(len(s))
 90+ result.append((d, np.mean(s), np.std(s) / sqN, len(s)))
 91+ return np.asarray(result)
 92+
 93+def groupbyday(npzarchive, every, users=None, onlyns=None):
 94+ '''
 95+ This function estimates editors' activity rates and groups rate estimates by
 96+ number of days elapsed since editor registration (which corresponds to time = 0)
 97+
 98+ For parameters, see computerates
 99+ '''
 100+ tmp = {}
 101+ for cyclearr in itercycles(npzarchive, every, users, onlyns):
 102+ for d, r in cyclearr:
 103+ try:
 104+ tmp[d].append(r)
 105+ except KeyError:
 106+ tmp[d] = deque([r])
 107+ return tmp
 108+
 109+# NOTE: not used right now
 110+def lifetimes(npzarchive, users=None):
 111+ '''
 112+ Returns the distribution of account lifetimes over an archive. Can take an
 113+ optional list users ids to restrict the sample to a specific group of
 114+ editors
 115+ '''
 116+ lt = deque()
 117+ for uid in (users or npzarchive.files):
 118+ days, edits = npzarchive[uid].T
 119+ lt.append(days.ptp())
 120+ return np.asarray(lt)
 121+
 122+# NOTE: not used right now
 123+def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity):
 124+ now = dt.datetime.now().toordinal()
 125+ epoch = dt.datetime(1970,1,1).toordinal()
 126+ unix_now = now - epoch
 127+ inactives = deque()
 128+ for uid in npzarchive.files:
 129+ days, edits = npzarchive[uid].T
 130+ if days.ptp() <= inactivity:
 131+ continue
 132+ unix_last = days[-1]
 133+ if (unix_now - unix_last) > inactivity:
 134+ tot_edits = float(edits.sum())
 135+ tot_days = float(days.ptp() - inactivity)
 136+ activity = tot_edits / tot_days * 365.0
 137+ if minimum_activity < activity and maximum_activity > activity:
 138+ inactives.append(uid)
 139+ return inactives
 140+
 141+def computerates(fn, every, users=None, onlyns=None):
 142+ '''
 143+ Returns an array of average activity vs day since registration with standard
 144+ errors of the average
 145+
 146+ Parameters
 147+ ----------
 148+ onlyns - compute edit counts only over specified list of namespaces
 149+ users - compute rates only for these users
 150+ every - compute daily rates in strides of length `every'
 151+ '''
 152+ npzarchive = np.load(fn)
 153+ rates = average(groupbyday(npzarchive, every, users, onlyns))
 154+ return rates
 155+
 156+parser = ArgumentParser(description=__doc__)
 157+parser.add_argument('data_file', metavar='data')
 158+parser.add_argument('output_file', metavar='output', type=FileType('w'))
 159+parser.add_argument('-every', type=int, help='default: average over %(default)d days',
 160+ default=30, metavar='NUM')
 161+parser.add_argument('-ns', type=int, action='append', help='select only these NS',
 162+ dest='only')
 163+
 164+def main(ns):
 165+ rates = computerates(ns.data_file, ns.every, onlyns=ns.only)
 166+ if ns.output_file.isatty():
 167+ print >> sys.stderr, 'writing to standard output'
 168+ np.savetxt(ns.output_file, rates, fmt='%f')
 169+ if not ns.output_file.isatty():
 170+ print '%s: output saved to %s' % (__prog__, ns.output_file.name)
 171+
 172+if __name__ == '__main__':
 173+ ns = parser.parse_args()
 174+ main(ns)
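
A worked example of estimaterate(), not part of this revision: the history is
zero-padded to a multiple of `step', summed in strides of `step' days, and
divided by the stride length, yielding one average daily rate per stride.

    import numpy as np
    from lifecycle.rates import estimaterate

    edits = np.array([3, 0, 1, 2, 0])  # five days of edit counts
    print estimaterate(edits, 2)
    # -> [ 1.5  1.5  0. ] : padded to [3, 0, 1, 2, 0, 0], stride sums
    # [3, 3, 0], divided by step=2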
Index: trunk/tools/wsor/editor_lifecycle/lifecycle/__init__.py
Index: trunk/tools/wsor/editor_lifecycle/ratesnobots.sql
@@ -0,0 +1,9 @@
 2+select
 3+ rev_user,
 4+ min(rev_timestamp),
 5+ max(rev_timestamp),
 6+ count(*)
 7+from revision left join halfak.bot_20110711
 8+on rev_user = user_id
 9+where user_id is null and rev_user > 0
 10+group by rev_user;
Index: trunk/tools/wsor/editor_lifecycle/rateperedits.py
@@ -0,0 +1,69 @@
 2+#!/usr/bin/python
 3+
 4+'''plots number of edits vs activity rate '''
 5+
 6+import re
 7+import os
 8+import sys
 9+
 10+from argparse import ArgumentParser
 11+from itertools import groupby
 12+
 13+import numpy as np
 14+import matplotlib.pyplot as pp
 15+
 16+__prog__ = os.path.basename(__file__)
 17+
 18+parser = ArgumentParser(description=__doc__)
 19+parser.add_argument('data_file', metavar='data')
 20+
 21+def main(ns):
 22+
 23+ # detect extension and load data
 24+ _, ext = os.path.splitext(ns.data_file)
 25+ if re.match('^\.npy$', ext, re.I):
 26+ data = np.load(ns.data_file)
 27+ elif re.match('^\.tsv$', ext, re.I) or re.match('^\.txt$', ext, re.I):
 28+ data = np.loadtxt(ns.data_file, delimiter='\t')
 29+ else:
 30+ print >> sys.stderr, '%s: error: could not determine file type (.npy,'\
 31+ '.tsv, .txt accepted)' % __prog__
 32+
 33+ # sort data
 34+ data[:,1] = np.floor(np.log2(data[:,1]))
 35+ idx = np.argsort(data[:,1])
 36+ data = data[idx]
 37+
 38+ # group by exponential binning
 39+ positions = []
 40+ x = []
 41+ for p, subiter in groupby(data, lambda k : k[1]):
 42+ positions.append(p)
 43+ x.append(np.asarray([r for r, e in subiter ]))
 44+ positions = np.asarray(positions)
 45+
 46+ # make boxplot on log-log scale
 47+ widths = 2 ** np.arange(0, len(x))
 48+ ax = pp.gca()
 49+ pp.boxplot(x, positions= 2 ** positions, sym=' ', whis=1.8, bootstrap=5000,
 50+ widths=widths)
 51+ pp.xscale('log')
 52+ pp.yscale('log')
 53+
 54+ # add reference line
 55+ x = np.array([1e2, 1e6])
 56+ m = 1e-8
 57+ pp.plot(x, m * x, 'k-', alpha=.75, lw=2)
 58+
 59+ # decorate
 60+ pp.setp(ax.lines, color='k')
 61+ pp.xlim(1,2e6)
 62+ pp.ylim(1e-9,.1)
 63+ pp.xlabel('edits')
 64+ pp.ylabel(r'activity (${\rm sec} ^ {-1}$)')
 65+ pp.draw()
 66+ pp.show()
 67+
 68+if __name__ == '__main__':
 69+ ns = parser.parse_args()
 70+ main(ns)
Index: trunk/tools/wsor/editor_lifecycle/README.rst
@@ -0,0 +1,34 @@
 2+============
 3+README
 4+============
 5+
 6+---------
 7+workflow
 8+---------
 9+
 10+This package is a collection of Python and shell scripts that assist in
 11+creating and analyzing data on the editor life cycle.
 12+
 13+Sample selection
 14+----------------
 15+
 16+TBD
 17+
 18+Edit activity data collection
 19+-----------------------------
 20+
 21+First use `fetchrates` to download the rate data from the MySQL database. The
 22+script takes a cohort file as input and stores the rate data of each user in a
 23+file called <user_id>.npy. It can be parallelized. At the end you will have a
 24+collection of NPY files.
 25+
 26+Cohort selection
 27+----------------
 28+
 29+See the docstring in `mkcohort`.
 30+
 31+Cohort analysis
 32+---------------
 33+
 34+See `graphlife`, `fitting`, `fitting_batch.sh`, and `relax`.
 35+
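
For reference, the cohort-level TSV that fetchrates writes (and that relax and
fitting read back) has four columns: days since registration, mean edits/day,
standard error of the mean, and sample size. A minimal loading sketch with a
hypothetical file name, not part of this revision:

    import numpy as np

    x, y, ye, n = np.loadtxt('2005-01.tsv', unpack=1)
    print x[0], y[0], ye[0], n[0]  # day 0: mean rate, its error, cohort size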
Index: trunk/tools/wsor/editor_lifecycle/fitting
@@ -0,0 +1,218 @@
 2+#!/usr/bin/python
 3+# coding: utf-8
 4+# :vim:ft=python
 5+
 6+''' editor lifecycle data fitting tool '''
 7+
 8+import sys
 9+import os
 10+import numpy as np
 11+import matplotlib.pyplot as pp
 12+from matplotlib.font_manager import FontProperties
 13+from argparse import ArgumentParser
 14+from scipy.optimize import curve_fit
 15+
 16+from lifecycle.models import Expon, PowerLaw, StretchedExpon
 17+import lifecycle.scale # imported for its side effect: registers the 'power' axis scale
 18+
 19+__prog__ = os.path.basename(os.path.abspath(__file__))
 20+
 21+_maxfev = 100000
 22+
 23+parent = ArgumentParser(add_help=False)
 24+parent.add_argument('data_files', metavar='DATA', nargs='+')
 25+parent.add_argument('-output', dest='output_file', metavar='FILE')
 26+parent.add_argument('-title')
 27+group = parent.add_mutually_exclusive_group()
 28+group.add_argument('-loglog', action='store_true')
 29+group.add_argument('-logpow', action='store_true')
 30+parent.add_argument('-constrain', choices=['head', 'tail', 'both'])
 31+parent.add_argument('-batch', action='store_true', help='do not show graphics')
 32+parent.add_argument('-force', action='store_true', help='force overwrite')
 33+parent.add_argument('-minsize', type=int)
 34+
 35+parser = ArgumentParser(description=__doc__)
 36+subparsers = parser.add_subparsers(help='Parametric models supported')
 37+
 38+parser_expon = subparsers.add_parser('expon', parents=[parent])
 39+parser_expon.set_defaults(modelclass=Expon)
 40+
 41+parser_stretch = subparsers.add_parser('stretchedexp', parents=[parent])
 42+parser_stretch.set_defaults(modelclass=StretchedExpon)
 43+
 44+parser_power = subparsers.add_parser('powerlaw', parents=[parent])
 45+parser_power.set_defaults(modelclass=PowerLaw)
 46+
 47+class DataError(Exception):
 48+ pass
 49+
 50+class TooFewDataError(DataError):
 51+ pass
 52+
 53+class EmptyDataError(DataError):
 54+ pass
 55+
 56+def plotfit(model, x, y, ye, label='data', **kwargs):
 57+ xx = np.linspace(x.min(), x.max(), endpoint=True, num=1000)
 58+ yy = model(xx)
 59+ kwargs['ecolor'] = 'none'
 60+ kwargs['ls'] = 'none'
 61+ pp.errorbar(x, y, ye / 2, label=label, **kwargs)
 62+ model_label = model.name.split()
 63+ if len(model_label) > 1:
 64+ model_label[1] = model_label[1][:3] + '.'
 65+ model_label = ' '.join(model_label[:2]).capitalize()
 66+ c = kwargs.get('color', 'r')
 67+ pp.plot(xx, yy, '--', color=c, label='{} fit'.format(model_label), lw=2.5)
 68+ if ns.loglog:
 69+ pp.xscale('log')
 70+ pp.yscale('log')
 71+ elif ns.logpow:
 72+ pp.xscale('power', exponent=model.beta)
 73+ pp.yscale('log')
 74+ pp.legend(loc='best', prop=FontProperties(size='x-small'))
 75+ if ns.title is not None:
 76+ pp.title(ns.title)
 77+ pp.xlabel('Days since registration')
 78+ pp.ylabel('Edits/day')
 79+
 80+def plotresid(model, x, y, label='data', **kwargs):
 81+ r = model(x) - y
 82+# rm = r[True - np.isinf(r)].max()
 83+# r /= np.abs(rm)
 84+ pp.axhline(y=0, c='k')
 85+ kwargs['ls'] = ':'
 86+ pp.plot(x, r, label=label, **kwargs)
 87+ pp.title('Fit residuals')
 88+ pp.xlabel('Days since registration')
 89+# pp.ylabel(r'Relative residual $\xi / \max{|\xi|}$')
 90+# pp.ylim(-1,1)
 91+ pp.draw()
 92+
 93+def _testoverwrite(*files):
 94+ exit_flag = False
 95+ for fn in files:
 96+ if os.path.exists(fn):
 97+ exit_flag = True
 98+ print '%s: error: cannot overwrite %s' % (__prog__, fn)
 99+ if exit_flag:
 100+ sys.exit(1)
 101+
 102+# TODO add usecols and option for taking only observations based on minimum
 103+# sample size
 104+
 105+def fit(data_file, modelclass, constrain=None, minsize=None):
 106+ '''
 107+ Fits a model to data. Data are read from `data_file` and the model is
 108+ specified as a `modelclass` (see `lifecycle.models`). The data file should
 109+ contain a (N,4) array where columns are x, y, ye (errors) and n (sample
 110+ sizes -- number of users on which the average rate y is computed).
 111+
 112+ Parameters
 113+ ----------
 114+ If `modelclass` is either `Expon` or `StretchedExpon`, one can constrain the
 115+ value of some parameters before performing the fit. To this end, pass the
 116+ parameter `constrain`, which can be one of the following strings (see also
 117+ the sketch after this listing). Their meanings are:
 118+
 119+ * head : A = y[0]
 120+ * tail : C = y[-1]
 121+ * both : A = y[0] and C = y[-1]
 122+ '''
 123+ x, y, ye, n = np.loadtxt(data_file, unpack=True)
 124+ idx = ye > 0
 125+ if minsize:
 126+ idx = idx * (n > minsize)
 127+ if not np.any(idx):
 128+ raise EmptyDataError()
 129+ if np.sum(idx) < len(modelclass.__params__):
 130+ raise TooFewDataError()
 131+ x = x[idx]
 132+ y = y[idx]
 133+ ye = ye[idx]
 134+ model = modelclass()
 135+ if constrain in ['head', 'both']:
 136+ model.A = y[np.argmin(np.abs(x))]
 137+ if constrain in ['tail', 'both']:
 138+ model.C = y.min()
 139+ pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev)
 140+ if not np.isscalar(pcov):
 141+ perr = np.sqrt(np.diag(pcov)) / 2.
 142+ model.setparams(*zip(pest, perr))
 143+ else:
 144+ model.setparams(*pest)
 145+ gof, resid, Rsquared = model.gof(x, y, ye)
 146+ model.goftest = gof
 147+ model.residtest = resid
 148+ model.Rsquared = Rsquared
 149+ print model.summary(dataset=data_file, observations=len(x))
 150+ return x, y, ye, model
 151+
 152+def main(ns):
 153+ # build the list of usable plot markers from matplotlib
 154+ from matplotlib.lines import lineMarkers as markers
 155+ markers = dict(filter(
 156+ lambda k : isinstance(k[0], str) and k[1] != '_draw_nothing',
 157+ markers.items())).keys()
 158+ colors = 'bgcmykw'
 159+
 160+ data = []
 161+ models = []
 162+ labels = []
 163+
 164+ # fit all datasets
 165+ for f in ns.data_files:
 166+ try:
 167+ x, y, ye, model = fit(f, ns.modelclass, ns.constrain, ns.minsize)
 168+ except TooFewDataError:
 169+ print >> sys.stderr, '%s: warning: too few data: %s' % (__prog__, f)
 170+ continue
 171+ except EmptyDataError:
 172+ print >> sys.stderr, '%s: warning: no usable data: %s'% (__prog__, f)
 173+ continue
 174+ data.append((x,y,ye))
 175+ models.append(model)
 176+ labels.append(os.path.splitext(f)[0])
 177+
 178+ # plot fits
 179+ pp.figure()
 180+ pp.hold(1)
 181+ for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)):
 182+ m = markers[i % len(markers)]
 183+ c = colors[i % len(colors)]
 184+ plotfit(model, x, y, ye, label, marker=m, color=c)
 185+
 186+ # plot residuals
 187+ pp.figure()
 188+ pp.hold(1)
 189+ for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)):
 190+ m = markers[i % len(markers)]
 191+ c = colors[i % len(colors)]
 192+ plotresid(model, x, y, label, marker=m, color=c)
 193+
 194+ # save figures
 195+ if ns.output_file is not None:
 196+ fn, ext = os.path.splitext(ns.output_file)
 197+ fmt = ext[1:]
 198+ if ns.batch and fmt.lower() != 'pdf':
 199+ print '%s: error: batch mode supports only PDF format' % __prog__
 200+ sys.exit(1)
 201+ resid_output_file = fn + '_residuals' + ext
 202+ if not ns.force:
 203+ _testoverwrite(ns.output_file, resid_output_file)
 204+ pp.figure(1)
 205+ pp.savefig(ns.output_file, format=fmt)
 206+ print '%s: output saved to %s' % (__prog__, ns.output_file)
 207+ pp.figure(2)
 208+ pp.savefig(resid_output_file, format=fmt)
 209+ print '%s: output saved to %s' % (__prog__, resid_output_file)
 210+
 211+ pp.show()
 212+
 213+if __name__ == '__main__':
 214+ ns = parser.parse_args()
 215+# if ns.batch:
 216+# import matplotlib
 217+# matplotlib.use('PDF')
 218+# import matplotlib.pyplot as pp
 219+ main(ns)
Property changes on: trunk/tools/wsor/editor_lifecycle/fitting
___________________________________________________________________
Added: svn:executable
1220 + *
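The head/tail constraints described in the `fit` docstring amount to pinning the amplitude A and/or the offset C to values read off the data before estimating the remaining parameters. A standalone sketch of the 'both' case with scipy, assuming a stretched exponential of the form A * exp(-(x/tau)**beta) + C; the actual parametrization in `lifecycle.models` may differ, and the data file name is hypothetical:

    # sketch (Python 2): 'both'-constrained stretched-exponential fit;
    # A and C are fixed from the data, only tau and beta are estimated
    import numpy as np
    from scipy.optimize import curve_fit

    x, y, ye, n = np.loadtxt('cohort.tsv', unpack=True)  # hypothetical file

    A = y[np.argmin(np.abs(x))]  # head constraint: value at x closest to 0
    C = y.min()                  # tail constraint: asymptotic offset

    def model(x, tau, beta):
        return A * np.exp(-(x / tau) ** beta) + C

    pest, pcov = curve_fit(model, x, y, p0=(100., .5), sigma=ye, maxfev=100000)
    print 'tau = %g, beta = %g' % tuple(pest)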

Status & tagging log