r95709 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r95708‎ | r95709 | r95710 >
Date:22:07, 29 August 2011
Author:giovanni
Status:deferred
Tags:
Comment:
removed obsolete scripts
Modified paths:
  • /trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/graphlife (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/rates (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh (deleted) (history)
  • /trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql (deleted) (history)

Diff [purge]

Index: trunk/tools/wsor/editor_lifecycle/obsolete/rates
@@ -1,96 +0,0 @@
2 -#!/usr/bin/python
3 -# vim:ft=python:
4 -
5 -''' compute editor lifecycle '''
6 -
7 -'''
8 -Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, GCIAMPAGLIA@WIKIMEDIA.ORG
9 -This program is free software; you can redistribute it and/or modify
10 -it under the terms of the GNU General Public License as published by
11 -the Free Software Foundation; either version 2 of the License, or
12 -(at your option) any later version.
13 -
14 -This program is distributed in the hope that it will be useful,
15 -but WITHOUT ANY WARRANTY; without even the implied warranty of
16 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 -GNU General Public License for more details.
18 -
19 -You should have received a copy of the GNU General Public License along
20 -with this program; if not, write to the Free Software Foundation, Inc.,
21 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 -http://www.gnu.org/copyleft/gpl.html
23 -'''
24 -
25 -import re
26 -import os
27 -from argparse import ArgumentParser
28 -import numpy as np
29 -from collections import deque
30 -import datetime as dt
31 -
32 -from lifecycle.rates import *
33 -
# Script name, used to prefix every message printed to the console.
__prog__ = os.path.basename(os.path.abspath(__file__))

# Command-line interface. Long options are spelled with a single leading dash
# (-key, -every, ...); the two activity bounds are positional arguments.
parser = ArgumentParser(description=__doc__)
parser.add_argument('data_file', metavar='data',
                    help='NumPy archive (.npz) to load')
parser.add_argument(metavar='minact', type=int, dest='minimum_activity',
                    help='lower activity bound passed to the inactive-user '
                    'selection; must be smaller than maxact')
parser.add_argument(metavar='maxact', type=int, dest='maximum_activity',
                    help='upper activity bound passed to the inactive-user '
                    'selection')
parser.add_argument('-key', help='prefix of the output file names (default: '
                    'the data file name up to the .npz extension)')
parser.add_argument('-every', type=int, help='default: %(default)d days',
                    default=30, metavar='NUM')
parser.add_argument('-inactivity', type=int, default=180, help='default: '
                    '%(default)d days', metavar='NUM')
parser.add_argument('-all', dest='dump_all', action='store_true',
                    help='save the cycles of the whole archive to a single '
                    '.npz file instead of writing the summary .tsv files')
46 -
47 -
def main(ns):
    '''
    Compute editor lifecycle data from the archive named by `ns.data_file`.

    ns -- the namespace returned by `parser.parse_args()`.

    When `ns.key` is not given it is taken from the data file name (the part
    before `.npz`). With `ns.dump_all` the cycles of every archive entry are
    saved to one .npz file; otherwise four files are written:
    <key>_lt.tsv, <key>_len.tsv, <key>_all.tsv and <key>_inact.tsv.

    Exits with status 1 if no key can be determined or if minact >= maxact.
    '''
    # This script has no module-level `import sys`; without this import the
    # error paths below depend on the star import happening to provide it.
    import sys

    if ns.key is None:
        m = re.match(r'(.*?)\.npz', ns.data_file, re.I)
        if m is None:
            sys.stderr.write('%s: cannot determine key from file name: %s\n'
                             % (__prog__, ns.data_file))
            sys.exit(1)
        ns.key = m.group(1)
    if ns.minimum_activity >= ns.maximum_activity:
        sys.stderr.write('%s: error: minact >= maxact\n' % __prog__)
        sys.exit(1)

    # load data
    npzarchive = np.load(ns.data_file)

    if ns.dump_all:
        # pair every archive entry name with its cycle and save them together
        fn = mkfn('cycles', ns, 'npz')
        values_iter = itercycles(npzarchive, ns.every)
        np.savez(fn, **dict(zip(npzarchive.files, values_iter)))
        sys.stdout.write('%s: output saved to %s\n' % (__prog__, fn))
        return

    # compute lifetime distribution
    lt = lifetimes(npzarchive)

    # compute inactive subgroups
    inactive_users = find_inactives(npzarchive, ns.inactivity,
                                    ns.minimum_activity, ns.maximum_activity)

    ratesbyday = groupbyday(npzarchive, ns.every)
    ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)

    avg_all = averagecycle(ratesbyday)
    avg_inact = averagecycle(ratesbyday_inact)

    # sizes of the two groups: every archive entry, and the inactive subset
    lens = [len(npzarchive.files), len(inactive_users)]

    names = ['lt', 'len', 'all', 'inact']
    arrs = [lt, lens, avg_all, avg_inact]

    # one tab-separated file per result, named <key>_<name>.tsv
    for n, a in zip(names, arrs):
        fn = '%s_%s.%s' % (ns.key, n, 'tsv')
        np.savetxt(fn, a)
        sys.stdout.write('%s: output saved to %s\n' % (__prog__, fn))


if __name__ == '__main__':
    ns = parser.parse_args()
    main(ns)
Index: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
@@ -1,108 +0,0 @@
2 -#!/usr/bin/python
3 -
4 -''' plot editor life cycle '''
5 -
6 -'''
7 -Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, GCIAMPAGLIA@WIKIMEDIA.ORG
8 -This program is free software; you can redistribute it and/or modify
9 -it under the terms of the GNU General Public License as published by
10 -the Free Software Foundation; either version 2 of the License, or
11 -(at your option) any later version.
12 -
13 -This program is distributed in the hope that it will be useful,
14 -but WITHOUT ANY WARRANTY; without even the implied warranty of
15 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 -GNU General Public License for more details.
17 -
18 -You should have received a copy of the GNU General Public License along
19 -with this program; if not, write to the Free Software Foundation, Inc.,
20 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 -http://www.gnu.org/copyleft/gpl.html
22 -'''
23 -
24 -import sys
25 -import numpy as np
26 -from argparse import ArgumentParser
27 -import os
28 -
# Script name, used to prefix error messages.
__prog__ = os.path.basename(os.path.abspath(__file__))

# Command-line interface: one or more data files, each drawn as one line.
parser = ArgumentParser(description=__doc__)
parser.add_argument('data_files', metavar='data', nargs='+',
                    help='text file with three columns, plotted as x, y and '
                    'the error bar on y')
parser.add_argument('-l', '--label', metavar='TEXT', action='append',
                    dest='labels_list', help='legend label; repeat once per '
                    'data file, in the same order')
parser.add_argument('-inset', dest='inset_data_file', metavar='FILE',
                    help='file of values drawn as a histogram in an inset')
parser.add_argument('-batch', action='store_true', help='uses PDF backend')
parser.add_argument('-title', help='plot title; also used to build the '
                    'output file name')
parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
39 -
if __name__ == '__main__':
    ns = parser.parse_args()

    # checks: labels are optional, but if given there must be one per file
    if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
        sys.stderr.write('%s: error: please provide as many labels '
                         'as data files\n' % __prog__)
        sys.exit(1)

    # the backend has to be selected before pyplot is imported
    if ns.batch:
        import matplotlib
        matplotlib.use('PDF')
    import matplotlib.pyplot as pp
    from matplotlib.lines import lineMarkers

    # Keep the markers with a string code that are not mapped to
    # '_draw_nothing'. The value is compared with != (the original identity
    # test on a string literal is unreliable), and the codes are sorted so the
    # marker given to each line does not depend on dict ordering.
    markers = sorted(code for code, drawer in lineMarkers.items()
                     if isinstance(code, str) and drawer != '_draw_nothing')
    colors = 'krbgm'

    # create figure and axes
    fig = pp.figure()
    ax = pp.axes([.1, .1, .85, .8])

    # add lines: colors and markers are cycled independently
    for i, data_file in enumerate(ns.data_files):
        if ns.labels_list is not None:
            label = ns.labels_list[i]
        else:
            label = 'line-%d' % (i + 1)
        color = colors[i % len(colors)]
        marker = markers[i % len(markers)]
        x, y, ye = np.loadtxt(data_file, unpack=1)
        ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
                    mec=color, ls=':', label=label)

    ax.legend(loc=2)
    ax.set_xlabel('days since registration')
    ax.set_ylabel('edits/day')
    if ns.title is not None:
        ax.set_title(ns.title)
    ax.axis('tight')

    # plot hist of lifetimes in inset axes
    if ns.inset_data_file is not None:
        lt = np.loadtxt(ns.inset_data_file)
        # NOTE(review): `axisbg` and `normed` are the keyword names of the
        # matplotlib release this script was written for; newer releases
        # appear to use `facecolor` and `density` -- confirm against the
        # installed version before running.
        inax = pp.axes([.55, .6, .35, .25], axisbg='none')
        inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
        for l in inax.xaxis.get_ticklabels():
            l.set_rotation(30)
            l.set_fontsize('x-small')
        for l in inax.yaxis.get_ticklabels():
            l.set_fontsize('x-small')
        inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
        inax.set_ylabel('no. of users older\n more than $x$ days',
                        fontsize='small')
        inax.set_title('account lifetime')
        inax.axis('tight')

    pp.draw()

    # output name: the lower-cased title with underscores, or 'output'
    if ns.title is not None:
        fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
    else:
        fn = 'output.' + ns.fmt

    # `format` is the keyword savefig understands (the original passed `fmt`);
    # the success message is printed only once the file has been written
    pp.savefig(fn, format=ns.fmt)
    sys.stdout.write('output saved to %s\n' % fn)
    pp.show()
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
@@ -1,30 +0,0 @@
2 -#!/bin/bash
3 -
4 -# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, GCIAMPAGLIA@WIKIMEDIA.ORG
5 -# This program is free software; you can redistribute it and/or modify
6 -# it under the terms of the GNU General Public License as published by
7 -# the Free Software Foundation; either version 2 of the License, or
8 -# (at your option) any later version.
9 -#
10 -# This program is distributed in the hope that it will be useful,
11 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
12 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 -# GNU General Public License for more details.
14 -#
15 -# You should have received a copy of the GNU General Public License along
16 -# with this program; if not, write to the Free Software Foundation, Inc.,
17 -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 -# http://www.gnu.org/copyleft/gpl.html
19 -
20 -# This script writes to standard output a list of registered, not-flagged-as-bot users,
21 -# sorted by time of first edit. Each item in the list comprises:
22 -#
23 -# 1. user_id
24 -# 2. user_name
25 -# 3. first_timestamp
26 -# 4. editcount
27 -#
28 -# For the SQL query, check file userlist.sql.
29 -
# Directory holding this script, so that userlist.sql is found no matter what
# the caller's working directory is. Everything is quoted so that paths
# containing spaces survive word splitting.
srcdir=$(dirname "$(type -p "$0")")

# mysql: -B gives tab-separated batch output, -N suppresses the header row.
# sort: key starts at the third tab-separated field (first_timestamp).
mysql -BN < "$srcdir/userlist.sql" | sort -h -k3 -t $'\t'
Index: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
@@ -1,214 +0,0 @@
2 -#!/usr/bin/python
3 -# coding: utf-8
4 -# :vim:ft=python
5 -
6 -# TODO: obsolete
7 -
8 -''' creates cohort files, filtering out bots '''
9 -
10 -'''
11 -Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, GCIAMPAGLIA@WIKIMEDIA.ORG
12 -This program is free software; you can redistribute it and/or modify
13 -it under the terms of the GNU General Public License as published by
14 -the Free Software Foundation; either version 2 of the License, or
15 -(at your option) any later version.
16 -
17 -This program is distributed in the hope that it will be useful,
18 -but WITHOUT ANY WARRANTY; without even the implied warranty of
19 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 -GNU General Public License for more details.
21 -
22 -You should have received a copy of the GNU General Public License along
23 -with this program; if not, write to the Free Software Foundation, Inc.,
24 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 -http://www.gnu.org/copyleft/gpl.html
26 -'''
27 -
28 -'''
29 -This script reads two files: a ZIP archive file, and an index file, which is a
30 -tab-separated text file like the following:
31 -
32 - 34 WojPob 20010129110725 2524
33 - 94 AstroNomer 20010207222248 1532
34 - 43 Lee Daniel Crocker 20010314020407 4388
35 - 86 Stephen Gilbert 20010326191355 3599
36 - 3 Tobias Hoevekamp 20010326202105 1903
37 - 1273 Wathiik 20010510171751 1772
38 - 3371 Arno 20010721180708 2700
39 - 122 Ap 20010722201619 2137
40 - 182 Rjstott 20010726102546 2602
41 - 64 Uriyan 20010727141651 1634
42 -
43 -Where fields are: id, name, date, count. Dates are parsed using dateutil, so
44 -other formats are allowed too (e.g. 2010-01-29 11:07:25).
45 -
46 -The script will aggregate users based on the date field and will look for
47 -files of the form <id>.npy in the archive file. Each of these files contains the
48 -daily edits count for a single user, stored using the NumPy binary array
49 -format. A relative path within the ZIP archive can be specified from the command
50 -line with -P/--datapath. Once the data for a cohort (e.g. an aggregated group
51 -of users) have been collected, the script will compute the average activity rate
52 -since the first day of activity for all users in that cohort.
53 -
54 -The script produces two files per each cohort: a tab-separated values file with
55 -cohort average activity rate, and a compressed NumPy binary archive with the
56 -user data array files.
57 -
58 -For each discovered cohort, the script will print on the console the date of the
59 -cohort, how many users it contains, and how many suspected BOT users it filtered
60 -out from the index. Use --bots to disable this check and always include them. The
61 -check is as follows: if the name contains the pattern 'bot' at the beginning or
62 -at the end of any word, it will be filtered out (e.g. "Botuser IV" will match,
63 -but "Francis Abbott" won't). If arguments -mincount or -maxcount (or both) are
64 -passed, the script will process only users whose edit count is above the minimum
65 -count, or below the maximum count, or both.
66 -
67 -Please note that the index file must be already sorted by date, in order for the
68 -group by date aggregation to work. You can use `sort' from the command line,
69 -e.g.:
70 -
71 - $~ sort -t$'\t' -k3 -h unsorted.tsv
72 -
73 -should sort file unsorted.tsv.
74 -'''
75 -
76 -import re
77 -import os
78 -import sys
79 -import csv
80 -import numpy as np
81 -from argparse import ArgumentParser, FileType
82 -from contextlib import closing
83 -from itertools import groupby
84 -from dateutil.parser import parser as DateParser
85 -from datetime import datetime
86 -from zipfile import ZipFile
87 -
88 -from rates import computerates
89 -
90 -__prog__ = os.path.basename(os.path.abspath(__file__))
91 -_botpat = r'\bbot|bot\b'
92 -_fields = ['id', 'name', 'date', 'count']
93 -
94 -def yearkey(date):
95 - return date.year,
96 -
97 -def monthkey(date):
98 - return date.year, date.month
99 -
100 -def daykey(date):
101 - return date.year, date.month, date.day
102 -
# Command-line interface. Exactly one of --year/--month/--day is required; the
# chosen flag stores the function that turns a date into a cohort key.
parser = ArgumentParser(description=__doc__)
parser.add_argument('index', type=FileType('r'), help='*must* be already sorted')
parser.add_argument('archive_path', metavar='archive', help='data archive in ZIP '
                    'format')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--year', help='group by year', action='store_const',
                   const=yearkey, dest='keyfunc')
group.add_argument('--month', help='group by month', action='store_const',
                   const=monthkey, dest='keyfunc')
group.add_argument('--day', help='group by day', action='store_const',
                   const=daykey, dest='keyfunc')
parser.add_argument('--bots', action='store_true', help='do NOT filter out bots')
parser.add_argument('-P', '--datapath', help='relative path of files within '
                    'archive', default='')
# the four bounds below are exclusive: a user sitting exactly on a bound is
# discarded
parser.add_argument('-mincount', type=int, help='keep only users with more '
                    'edits than this')
parser.add_argument('-maxcount', type=int, help='keep only users with fewer '
                    'edits than this')
parser.add_argument('-minperyear', type=int, help='keep only users whose '
                    'yearly edit rate is above this')
parser.add_argument('-maxperyear', type=int, help='keep only users whose '
                    'yearly edit rate is below this')
parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
                    'console all actions, but do not produce any file')
parser.add_argument('-every', type=int, help='default: average over %(default)d days',
                    default=30, metavar='NUM')
parser.add_argument('-ns', type=int, action='append', help='select only these NS',
                    dest='only')

# one shared parser instance for all the dates read from the index file
dateparser = DateParser()
129 -
class DummyZipFile(object):
    '''
    Stand-in for `zipfile.ZipFile` used on dry runs: it accepts the same
    calls the script makes on a real archive but never touches the disk.
    '''

    def __init__(self, fn, mode):
        # nothing is opened or created
        pass

    def close(self):
        pass

    def write(self, fn, *args):
        pass

    def writestr(self, name, data, *args):
        # The script adds members with writestr(); without this method a dry
        # run fails with AttributeError on the first user.
        pass
138 -
if __name__ == '__main__':
    ns = parser.parse_args()
    reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
                            delimiter='\t')

    # On a dry run the cohort archives go to a writer that discards
    # everything. The class is chosen once, leaving the imported ZipFile name
    # untouched.
    writer_class = DummyZipFile if ns.dry_run else ZipFile

    # reference date for the activity spans, taken once for the whole run
    now_date = datetime.now()

    def _keyfunc(row):
        ''' cohort key of an index row, as selected by --year/--month/--day '''
        try:
            date = dateparser.parse(row['date'])
        except Exception:
            # show the offending row, then let the error propagate
            sys.stderr.write('%s: error: cannot parse date of row %r\n'
                             % (__prog__, row))
            raise
        return ns.keyfunc(date)

    # the source archive stays open for the whole run and is closed at the end
    with closing(ZipFile(ns.archive_path)) as archive:

        # group the index rows by cohort period (the index must be sorted)
        for key, subiter in groupby(reader, _keyfunc):

            # reset counters and define output file names from cohort period
            tot_users = 0
            tot_bots = 0
            datestr = '-'.join('%02d' % k for k in key)  # (2010,1) -> '2010-01'
            zipfn = '{}.npz'.format(datestr)
            tsvfn = '{}.tsv'.format(datestr)

            # for each user, determine if may go in cohort
            with closing(writer_class(zipfn, 'w')) as zf:
                for row in subiter:

                    # compute user details (edit count, yearly activity
                    # rate, etc.) and other useful variables
                    user_id = row['id']
                    count = int(row['count'])
                    user_date = dateparser.parse(row['date'])
                    # span in days, clamped to one day so that a date equal
                    # to today cannot cause a division by zero
                    activity_span = max(float((now_date - user_date).days), 1.0)
                    yearly_rate = count / activity_span * 365.0
                    bot_flag = re.search(_botpat, row['name'], re.I) is not None

                    # define paths
                    basepath = '{}.npy'.format(user_id)
                    archivepath = os.path.join(ns.datapath, basepath)

                    # check cohort membership (keep if conjunction of all
                    # given criteria is true, that is, discard if any given
                    # criterion is false)
                    if ns.mincount is not None and count <= ns.mincount:
                        continue
                    if ns.maxcount is not None and count >= ns.maxcount:
                        continue
                    if ns.minperyear is not None and yearly_rate <= ns.minperyear:
                        continue
                    if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
                        continue
                    # user can turn this test off by passing --bots
                    if not ns.bots and bot_flag:
                        # count only the users actually dropped as bots
                        tot_bots += 1
                        continue
                    try:
                        data = archive.read(archivepath)
                    except KeyError:
                        sys.stderr.write('%s: warning: %s not in archive\n'
                                         % (__prog__, archivepath))
                    else:
                        # only users whose data was found are counted
                        zf.writestr(basepath, data)
                        tot_users += 1

            # on a dry run no cohort archive exists, so there is nothing to
            # read back and no .tsv file may be produced
            if tot_users > 0 and not ns.dry_run:
                rates = computerates(zipfn, ns.every, onlyns=ns.only)
                np.savetxt(tsvfn, rates, fmt='%f')

            sys.stdout.write(
                '%s: %s, %s created (users: %5d, skipped bots %5d)\n' % (
                    __prog__, tsvfn, zipfn, tot_users, tot_bots))
            sys.stdout.flush()
Index: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
@@ -1,79 +0,0 @@
2 -#!/usr/bin/python
3 -# vim:ft=python:
4 -# coding : utf-8
5 -
6 -# TODO: obsolete
7 -
8 -'''
9 -Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, GCIAMPAGLIA@WIKIMEDIA.ORG
10 -This program is free software; you can redistribute it and/or modify
11 -it under the terms of the GNU General Public License as published by
12 -the Free Software Foundation; either version 2 of the License, or
13 -(at your option) any later version.
14 -
15 -This program is distributed in the hope that it will be useful,
16 -but WITHOUT ANY WARRANTY; without even the implied warranty of
17 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 -GNU General Public License for more details.
19 -
20 -You should have received a copy of the GNU General Public License along
21 -with this program; if not, write to the Free Software Foundation, Inc.,
22 -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 -http://www.gnu.org/copyleft/gpl.html
24 -'''
25 -
26 -''' fetches a cohort based on year of registration and editing activity '''
27 -
28 -from argparse import ArgumentParser
29 -from oursql import connect
30 -import os
31 -import sys
32 -import datetime as dt
33 -import csv
34 -
# name of this script, used to prefix error messages
prog = os.path.basename(os.path.abspath(__file__))

# The description is given explicitly: the first string literal of this script
# is the license notice, so __doc__ would print the GPL text in --help.
parser = ArgumentParser(
    description='fetches a cohort based on year of registration and editing '
    'activity',
    fromfile_prefix_chars='@')
parser.add_argument('registration_year', metavar='year', type=int,
                    help='year of registration of the cohort')
parser.add_argument('min_activity', metavar='minedits', type=int,
                    help='exclusive lower bound on the edit count')
parser.add_argument('max_activity', metavar='maxedits', type=int,
                    help='exclusive upper bound on the edit count')
parser.add_argument('-c', '--config', dest='config_file',
                    help='MySQL option file (default: ~/.my.cnf)')
parser.add_argument('-l', '--limit', type=int,
                    help='maximum number of rows to fetch')

# Placeholders, in order: registration year, minimum edits, maximum edits.
# Bots are excluded with NOT EXISTS rather than a left join on user_groups:
# the join returns one row per group membership, so users with several groups
# were listed more than once and a bot holding any other group slipped
# through. The text must end with a newline because the optional LIMIT clause
# is appended to it.
query = '''
select
    user_id,
    user_name,
    user_registration,
    user_editcount
from user u
where
    not exists (
        select 1
        from user_groups ug
        where ug.ug_user = u.user_id and ug.ug_group = 'bot'
    )
    and year(user_registration) = ?
    and user_editcount > ?
    and user_editcount < ?
'''
58 -
if __name__ == '__main__':
    ns = parser.parse_args()

    # sanity checks on the arguments; each failure exits with status 1
    if ns.min_activity >= ns.max_activity:
        sys.stderr.write('%s: error: min_activity >= max_activity\n' % prog)
        sys.exit(1)
    if not 2001 <= ns.registration_year <= dt.datetime.now().year:
        sys.stderr.write('%s: error: illegal year: %d\n'
                         % (prog, ns.registration_year))
        sys.exit(1)

    # ns.limit was parsed as an int, so formatting it into the SQL is safe
    if ns.limit is not None:
        query += 'limit %d' % ns.limit

    if ns.config_file is None:
        ns.config_file = os.path.expanduser('~/.my.cnf')

    conn = connect(read_default_file=ns.config_file)
    try:
        # rows are written to standard output as tab-separated values
        writer = csv.writer(sys.stdout, dialect='excel-tab')
        cursor = conn.cursor()
        cursor.execute(query, (ns.registration_year, ns.min_activity,
                               ns.max_activity))
        for row in cursor:
            writer.writerow(row)
    finally:
        # release the connection even if the query or the output fails
        conn.close()
Index: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
@@ -1,30 +0,0 @@
2 -
3 -
4 -select
5 - rev_user as user_id,
6 - rev_user_text as user_name,
7 - min(rev_timestamp) as first_timestamp,
8 - count(rev_timestamp) as editcount
9 -from
10 - revision r use index (usertext_timestamp) left join user_groups g
11 -on r.rev_user = g.ug_user
12 -where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0
13 -group by rev_user_text

Status & tagging log