Index: trunk/tools/wsor/trending_articles/detectbursts.py
— | — | @@ -7,19 +7,22 @@ |
8 | 8 | from datetime import datetime, timedelta |
9 | 9 | import argparse |
10 | 10 | import random |
11 | | -import gzip |
| 11 | +import myzip |
12 | 12 | import re |
13 | 13 | import os |
14 | 14 | import urllib2 |
15 | 15 | from collections import deque, namedtuple |
16 | 16 | import numpy as np |
17 | | -import gc |
18 | 17 | |
19 | 18 | pageview_tuple = namedtuple('Pageview', 'date count') |
20 | 19 | count_tuple = namedtuple('Count', 'pred real') |
21 | 20 | |
22 | 21 | def time_parse(x): |
23 | | - return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz') |
| 22 | + if x.endswith('.gz'): |
| 23 | + return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz') |
| 24 | +    elif x.endswith('.xz'):
| 25 | + return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.xz') |
| 26 | + |
24 | 27 | def time_format(x): |
25 | 28 | return datetime.strftime(x, '%Y/%m/%d %H:%M:%S') |
26 | 29 | def datetime2days(x): |
— | — | @@ -28,7 +31,7 @@ |
29 | 32 | def load_wikistats_file(f): |
30 | 33 | print >>sys.stderr, 'loading %s...' % f |
31 | 34 | ret = {} |
32 | | - for line in gzip.open(f): |
| 35 | + for line in myzip.open(f): |
33 | 36 | 	line = line.strip()
34 | 37 | (lang,title,count,bytes) = line.split(' ') |
35 | 38 | ret[(lang,title)] = count_tuple(float(count), int(count)) |
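
Note: ``myzip`` is not part of this changeset. Presumably it is a small local module that picks a decompressor from the file extension, so the scripts can read ``.gz`` and ``.xz`` pagecount files alike. A minimal sketch of such a module (the name, fallback, and mode handling are assumptions)::

    # myzip.py -- hypothetical stand-in for the module imported above.
    import gzip
    import io

    def open(path, mode='rb'):
        # Dispatch on extension so callers need not care how a
        # pagecounts file is compressed.
        if path.endswith('.gz'):
            return gzip.open(path, mode)
        if path.endswith('.xz'):
            import lzma  # stdlib in Python 3; 'backports.lzma' on Python 2
            return lzma.open(path, mode)
        return io.open(path, mode)  # uncompressed fallback
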
— | — | @@ -143,9 +146,6 @@ |
144 | 147 | if options.inclusive: |
145 | 148 | ls.insert(len(ls), bursting.has_key(page)) |
146 | 149 | writer.writerow([unicode(x) for x in ls]) |
147 | | - except UnicodeEncodeError, e: |
148 | | - print >>sys.stderr, '%s: %s' % (e, page) |
149 | | - continue |
150 | 150 | except UnicodeDecodeError, e: |
151 | 151 | print >>sys.stderr, '%s: %s' % (e, page) |
152 | 152 | continue |
Index: trunk/tools/wsor/trending_articles/README.rst |
— | — | @@ -1,4 +1,37 @@ |
2 | | -See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors |
| 2 | +These scripts are used to produce the results published on the `sprint |
| 3 | +page on meta`_ about editor behavior in trending articles.
3 | 4 | |
4 | | -Counts files are available at: |
5 | | -http://dammit.lt/wikistats/archive/2011/01/ |
| 5 | +Usage |
| 6 | +--------- |
| 7 | +We use the following directory names.
| 8 | + |
| 9 | +``pageview.all`` |
| 10 | + raw (hourly) page view count files |
| 11 | +``pageview.200907.en`` |
| 12 | + hourly page view count files for EN wiki in July 2009 only |
| 13 | +``pageview.200907.daily.en`` |
| 14 | + daily page view count files for EN wiki in July 2009 only |
| 15 | + |
| 16 | + |
| 17 | +1. Obtain the page view count files from `Domas's WikiStats`_ or from the ``stats`` directory on `Toolserver's user-store`_ (an example record is shown after this list).
| 18 | +2. (optional) Select only the page names you are interested in, to reduce processing time. For example, use this command::
| 19 | + |
| 20 | + for f in pageview.all/2009/07/pagecounts-200907*.gz ; do ggrep '^en [^ ]* ' $f | gzip > pageview.200907.en/`basename $f`; done |
| 21 | + |
| 22 | +3. (optional) Convert hourly page views into daily page views with::
| 23 | + |
| 24 | + ./accumulatedaily.py pageview.200907.en/pagecounts-200907*.gz -p 3 -f 'pageview.200907.daily.en/pagecounts-%Y%m%d-%H%M%S.gz' |
| 25 | + |
| 26 | +4. Detect bursts in page views with :: |
| 27 | + |
| 28 | + python -O detectbursts.py pageview.200907.daily.en/pagecounts-200907* -w 3 --rate=3 --min=1000 --max=10 --cutoff=20 -o bursts_200907_daily.tsv |
| 29 | + |
| 30 | + |
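
Each pagecounts file is a space-separated list of ``project title count bytes`` records, one page per line, which is the layout ``load_wikistats_file`` in ``detectbursts.py`` expects; for example (values invented)::

    en Main_Page 242332 4737756
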
| 31 | +Notes |
| 32 | +-------- |
| 33 | +Edit counts generated by these scripts may contain errors due to MediaWiki `bug 19311`_.
| 34 | + |
| 35 | +.. _Domas's WikiStats: http://dammit.lt/wikistats/archive |
| 36 | +.. _Toolserver's user-store: https://wiki.toolserver.org/view/User-store |
| 37 | +.. _sprint page on meta: http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors |
| 38 | +.. _bug 19311: https://bugzilla.wikimedia.org/show_bug.cgi?id=19311 |
Index: trunk/tools/wsor/trending_articles/chart.py |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import datetime |
14 | 14 | import math |
15 | 15 | import re |
16 | 17 | from collections import namedtuple |
17 | 18 | |
18 | 19 | counter_tuple = namedtuple('counter', 'name filter color explode') |
— | — | @@ -36,6 +37,9 @@ |
37 | 38 | parser.add_argument('-v', '--verbose', |
38 | 39 | dest='verbose', action='store_true', default=False, |
39 | 40 | help='turn on verbose message output') |
| 41 | + parser.add_argument('-X', '--exclude-semiprotect', |
| 42 | + dest='nosemiprotect', action='store_true', default=False, |
| 43 | +                        help='exclude edits made while articles were semi-protected')
40 | 44 | parser.add_argument('files', nargs='+') |
41 | 45 | options = parser.parse_args() |
42 | 46 | |
— | — | @@ -52,14 +56,19 @@ |
53 | 57 | counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
54 | 58 | ] |
55 | 59 | |
56 | | - # counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1), |
57 | | - # counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0), |
58 | | - # counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1), |
59 | | - # counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0), |
60 | | - # counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0), |
61 | | - # #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
| 60 | + # counters = [counter_tuple('w/ <30d edit history or IP', lambda x: x[10] == 'ANON' or x[13] == 'NEW', '#FF4444', 0.1), |
| 61 | + # counter_tuple('w/ >30d edit history and registered', lambda x: x, '#CCCCCC', 0.0), |
62 | 62 | # ] |
63 | 63 | |
| 64 | + if options.nosemiprotect: |
| 65 | + counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1), |
| 66 | + counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0), |
| 67 | + counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1), |
| 68 | + counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0), |
| 69 | + counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0), |
| 70 | + #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
| 71 | + ] |
| 72 | + |
64 | 73 | counters_map = {} |
65 | 74 | for x in counters: |
66 | 75 | counters_map[x.name] = x |
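
The ``x[10]``/``x[13]``/``x[14]`` subscripts in the filter lambdas index columns of the TSV emitted by ``find_revision_status.py``; per the header row that script writes, the relevant positions are::

    # 0-based column positions in the find_revision_status.py output
    USER_TYPE    = 10  # 'REG', 'ANON' or 'REG_BOT'
    USERNAME     = 11
    EDITCOUNT    = 12
    NEW          = 13  # 'NEW' or 'OLD'
    PROTECT      = 14  # e.g. 'SEMIPROTECT', 'NO_PROTECT'
    FUTURE_EDITS = 15  # the editcount_<N>d+<M>d column used by the retention chart
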
— | — | @@ -67,14 +76,17 @@ |
68 | 77 | ratios = [] |
69 | 78 | patt = re.compile('(\d+) / (\d+) / (\d+)') |
70 | 79 | for (i,fname) in enumerate(options.files): |
| 80 | + ratios.append(1.0) |
71 | 81 | for line in open(fname).readlines(): |
72 | 82 | m = patt.search(line) |
73 | 83 | if m: |
74 | | - ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5) |
| 84 | + ratios[i] = (float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5 |
75 | 85 | break |
| 86 | + |
76 | 87 | sum_ratio = sum(ratios) |
77 | 88 | counter_names = [x.name for x in counters] |
78 | 89 | |
| 90 | + # chart for breakdown of users |
79 | 91 | plots = [] |
80 | 92 | matplotlib.rc('font', size=options.fsize) |
81 | 93 | for (n,fname) in enumerate(options.files): |
— | — | @@ -91,7 +103,8 @@ |
92 | 104 | counts[c[0]].add(cols[options.field-1]) |
93 | 105 | break |
94 | 106 | |
95 | | - print counts#! |
| 107 | + for (name,value) in counts.items(): |
| 108 | + print name, len(value) |
96 | 109 | #plt.subplot(1, len(options.files), n+1) |
97 | 110 | plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio]) |
98 | 111 | plt.title(fname) |
— | — | @@ -105,4 +118,39 @@ |
106 | 119 | loc=(.8, .8)) |
107 | 120 | |
108 | 121 | base,ext = os.path.splitext(fname) |
| 122 | + print >>sys.stderr, 'output: ' + base |
109 | 123 | plt.savefig('.'.join([base, 'svg'])) |
| 124 | + |
| 125 | + # chart for new editor retention |
| 126 | + for (n,fname) in enumerate(options.files): |
| 127 | + plt.figure(figsize=(10,10)) |
| 128 | + table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t')) |
| 129 | + table = table[1:] |
| 130 | + filt = lambda x: x[10] == 'REG' and x[13] == 'NEW' |
| 131 | + bin = lambda x: min(int(10 * math.log10(int(x[15]) + 1)), int(10 * math.log10(3000))) |
| 132 | + username = lambda x: x[11] |
| 133 | + users = {} |
| 134 | + bins = {} |
| 135 | + for cols in table: |
| 136 | + if filt(cols) and not users.has_key(username(cols)): |
| 137 | + users[username(cols)] = True |
| 138 | + b = bin(cols) |
| 139 | + bins.setdefault(b, 0) |
| 140 | + bins[b] += 1 |
| 141 | + |
| 142 | + bins = sorted(bins.items(), key=lambda x: -x[0]) |
| 143 | + max_bin = max(x[0] for x in bins) |
| 144 | + |
| 145 | + if max_bin == 0: |
| 146 | + print >>sys.stderr, '%s: %s (no values)' % (fname, bins) |
| 147 | + continue |
| 148 | + print >>sys.stderr, '%s: %s' % (fname, bins) |
| 149 | + |
| 150 | + p = plt.pie([x[1] for x in bins], |
| 151 | + pctdistance=1.2, |
| 152 | + autopct='%1.1f%%', |
| 153 | + colors=['#' + 3 * ('%02X' % int(255 - 255 * float(x[0]) / max_bin)) for x in bins]) |
| 154 | + |
| 155 | + base,ext = os.path.splitext(fname) |
| 156 | + print >>sys.stderr, 'output: ' + base |
| 157 | + plt.savefig('.'.join([base, 'retention', 'svg'])) |
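
For reference, the ``bin`` lambda in the retention loop maps a future edit count onto a log10-scaled bucket capped at the 3000-edit bucket; a few sample values (computed here, not taken from the source)::

    >>> import math
    >>> binof = lambda c: min(int(10 * math.log10(c + 1)),
    ...                       int(10 * math.log10(3000)))  # same as 'bin' above
    >>> [binof(c) for c in (0, 9, 99, 999, 2999, 100000)]
    [0, 10, 20, 30, 34, 34]
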
Index: trunk/tools/wsor/trending_articles/find_revision_status.py |
— | — | @@ -10,7 +10,27 @@ |
11 | 11 | import urllib2 |
12 | 12 | import re |
13 | 13 | from datetime import datetime, timedelta |
| 14 | +from collections import namedtuple |
14 | 15 | |
| 16 | +revision_t = namedtuple('revision', 'oldid pageid textid comment userid usertext timestamp minor deleted length parentid') |
| 17 | +user_t = namedtuple('user', 'id name first editcount periodedits futureedits type') |
| 18 | +article_t = namedtuple('article', 'title protectlog older') |
| 19 | +edits_t = namedtuple('edits', 'before between') |
| 20 | +wikidate_t = namedtuple('wikidate', 'text datetime') |
| 21 | +log_t = namedtuple('log', 'title action params timestamp') |
| 22 | + |
| 23 | +botpat = re.compile('bot( |$)', re.IGNORECASE) |
| 24 | +protectpat = re.compile('\[edit=(.*?)\] \((.*?) \(UTC\)\)') |
| 25 | + |
| 26 | +def make_revision_t(*args): |
| 27 | + x = revision_t(*args) |
| 28 | + return x._replace(timestamp=wikidate_t(text=x.timestamp, |
| 29 | + datetime=parse_wikidate(x.timestamp))) |
| 30 | +def make_log_t(*args): |
| 31 | + x = log_t(*args) |
| 32 | + return x._replace(timestamp=wikidate_t(text=x.timestamp, |
| 33 | + datetime=parse_wikidate(x.timestamp))) |
| 34 | + |
15 | 35 | def parse_wikidate(x): |
16 | 36 | return datetime.strptime(str(x), '%Y%m%d%H%M%S') |
17 | 37 | |
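
``make_revision_t`` upgrades the raw ``rev_timestamp`` column into a ``wikidate_t`` so both the MediaWiki text form and the parsed ``datetime`` stay available downstream; for example (row values hypothetical)::

    row = (424242, 1000, 900, 'copyedit', 0, '127.0.0.1',
           '20090715123456', 0, 0, 1234, 424241)  # one revision-table row
    rev = make_revision_t(*row)
    rev.timestamp.text      # '20090715123456'
    rev.timestamp.datetime  # datetime.datetime(2009, 7, 15, 12, 34, 56)
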
— | — | @@ -47,57 +67,162 @@ |
48 | 68 | (title, rd_pid) = redirected(cursor, rd_pid, namespace) |
49 | 69 | return (title, rd_pid) |
50 | 70 | |
51 | | -def firstedits(cursor, uid, uname, delta, n): |
| 71 | +def allprotect(cursor, start, end, ns=0): |
| 72 | + cursor.execute(''' |
| 73 | + SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp |
| 74 | + FROM logging l |
| 75 | + WHERE |
| 76 | + l.log_type = "protect" |
| 77 | + AND l.log_timestamp BETWEEN ? AND ? |
| 78 | + AND l.log_namespace = ? |
| 79 | + ORDER BY l.log_timestamp DESC |
| 80 | + ; |
| 81 | + ''', (start, end, ns)) |
| 82 | + return [make_log_t(*x) for x in list(cursor)] |
| 83 | + |
| 84 | +def closestprotect(cursor, limit, start, title, ns=0): |
| 85 | + cursor.execute(''' |
| 86 | + SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp |
| 87 | + FROM logging l |
| 88 | + WHERE |
| 89 | + l.log_type = "protect" |
| 90 | + AND l.log_title = ? |
| 91 | + AND l.log_timestamp BETWEEN ? AND ? |
| 92 | + AND l.log_namespace = ? |
| 93 | + ORDER BY l.log_timestamp DESC |
| 94 | + LIMIT 1 |
| 95 | + ; |
| 96 | + ''', (title, limit, start, ns)) |
| 97 | + ls = list(cursor) |
| 98 | + if len(ls) == 0: |
| 99 | + return None |
| 100 | + return make_log_t(*(ls[0])) |
| 101 | + |
| 102 | +def firstedits(cursor, uid, uname, limit=1): |
52 | 103 | where = 'r.rev_user_text = ?' |
53 | 104 | uspec = uname |
54 | 105 | if uid != 0: |
55 | 106 | where = 'r.rev_user = ?' |
56 | 107 | uspec = uid |
57 | 108 | cursor.execute(''' |
58 | | - SELECT r.rev_timestamp |
| 109 | + SELECT * |
59 | 110 | FROM revision r |
60 | 111 | WHERE |
61 | 112 | r.rev_timestamp != "" |
62 | 113 | AND %s |
63 | | - ORDER BY r.rev_timestamp ASC |
64 | | - LIMIT 1 |
| 114 | + ORDER BY r.rev_timestamp ASC |
| 115 | + LIMIT ? |
65 | 116 | ; |
66 | | - ''' % (where,), (uspec,)) |
67 | | - first = list(cursor)[0][0] |
68 | | - first = parse_wikidate(first) |
| 117 | + ''' % (where,), (uspec,limit)) |
| 118 | + return [make_revision_t(*x) for x in cursor] |
| 119 | + |
| 120 | +def olderthan(cursor, title, timestamp): |
69 | 121 | cursor.execute(''' |
70 | | - SELECT r.rev_id |
| 122 | + SELECT r.rev_timestamp |
71 | 123 | FROM revision r |
| 124 | + INNER JOIN page p on p.page_id = r.rev_page |
72 | 125 | WHERE |
73 | | - %s |
74 | | - AND r.rev_timestamp BETWEEN ? AND ? |
75 | | - LIMIT ? |
| 126 | + r.rev_timestamp != "" |
| 127 | + AND p.page_title = ? |
| 128 | + AND r.rev_timestamp < ? |
| 129 | + LIMIT 1 |
76 | 130 | ; |
77 | | - ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n)) |
78 | | - return [int(x[0]) for x in list(cursor)] |
| 131 | + ''', (title,timestamp)) |
| 132 | + return len(list(cursor)) != 0 |
79 | 133 | |
80 | | -def editcount(cursor, uid, uname, timestamp): |
81 | | - where = 'r.rev_user_text = ?' |
82 | | - uspec = uname |
| 134 | +def editcount_before(cursor, uid, uname, timestamp): |
83 | 135 | if uid != 0: |
84 | | - where = 'r.rev_user = ?' |
85 | | - uspec = uid |
86 | | - |
| 136 | + cursor.execute(''' |
| 137 | + SELECT /* SLOW_OK */ count(*) |
| 138 | + FROM revision r |
| 139 | + WHERE |
| 140 | + r.rev_user = ? |
| 141 | + AND r.rev_timestamp > ? |
| 142 | + ; |
| 143 | + ''', (uid,timestamp)) |
| 144 | + newedits = list(cursor)[0][0] |
| 145 | + cursor.execute(''' |
| 146 | + SELECT u.user_editcount |
| 147 | + FROM user u |
| 148 | + WHERE |
| 149 | + u.user_id = ? |
| 150 | + ; |
| 151 | + ''', (uid,)) |
| 152 | + alledits = list(cursor)[0][0] |
| 153 | + return int(alledits) - int(newedits) |
| 154 | + else: |
| 155 | +        # an anonymous user's edit count can only be computed from the revision table
| 156 | + cursor.execute(''' |
| 157 | + SELECT /* SLOW_OK */ count(*) |
| 158 | + FROM revision r |
| 159 | + WHERE |
| 160 | + r.rev_user_text = ? |
| 161 | + AND r.rev_timestamp < ? |
| 162 | + ; |
| 163 | + ''', (uname,timestamp)) |
| 164 | + return int(list(cursor)[0][0]) |
| 165 | + |
| 166 | +def editcount_duration(cursor, uid, uname, timestamp1, timestamp2): |
| 167 | + uspec = 'r.rev_user = ?' |
| 168 | + uarg = uid |
| 169 | + if uid == 0: |
| 170 | + uspec = 'r.rev_user_text = ?' |
| 171 | + uarg = uname |
87 | 172 | cursor.execute(''' |
88 | | - SELECT count(*) |
| 173 | + SELECT /* SLOW_OK */ count(*) |
89 | 174 | FROM revision r |
90 | 175 | WHERE |
91 | 176 | %s |
92 | | - AND r.rev_timestamp < ? |
| 177 | + AND r.rev_timestamp BETWEEN ? AND ? |
93 | 178 | ; |
94 | | - ''' % (where,), (uspec,timestamp)) |
| 179 | + ''' % uspec, (uarg, timestamp1, timestamp2)) |
95 | 180 | return int(list(cursor)[0][0]) |
96 | 181 | |
| 182 | +def edits_duration(cursor, uid, uname, timestamp1, timestamp2): |
| 183 | + uspec = 'r.rev_user = ?' |
| 184 | + uarg = uid |
| 185 | + if uid == 0: |
| 186 | + uspec = 'r.rev_user_text = ?' |
| 187 | + uarg = uname |
| 188 | + cursor.execute(''' |
| 189 | + SELECT /* SLOW_OK */ * |
| 190 | + FROM revision r |
| 191 | + WHERE |
| 192 | + %s |
| 193 | + AND r.rev_timestamp BETWEEN ? AND ? |
| 194 | + ; |
| 195 | + ''' % uspec, (uarg, timestamp1, timestamp2)) |
| 196 | + return [make_revision_t(*x) for x in list(cursor)] |
| 197 | + |
97 | 198 | if __name__ == '__main__': |
98 | 199 | parser = argparse.ArgumentParser() |
99 | 200 | parser.add_argument('-f', '--field', metavar='N', |
100 | 201 | dest='field', type=int, default=1, |
101 | 202 | help='') |
| 203 | + parser.add_argument('-H', '--host', metavar='HOST', |
| 204 | + dest='host', type=str, default='', |
| 205 | + help='mysql host name') |
| 206 | +    parser.add_argument('-R', '--hours', metavar='N',
| 207 | +                        dest='hours', type=int, default=1,
| 208 | +                        help='length in hours of the edit window ending at each pageview timestamp')
| 209 | +    parser.add_argument('-a', '--activity-delta', metavar='DAYS',
| 210 | +                        dest='activedelta', type=lambda x: timedelta(days=int(x)), default=timedelta(days=120),
| 211 | +                        help='days after an edit before the follow-up activity window opens')
| 212 | +    parser.add_argument('-D', '--activity-duration', metavar='DAYS',
| 213 | +                        dest='activedur', type=lambda x: timedelta(days=int(x)), default=timedelta(days=90),
| 214 | +                        help='length in days of the follow-up activity window')
| 215 | + parser.add_argument('-O', '--threshold', metavar='DATE', |
| 216 | + dest='olderthan', type=lambda x: parse_wikidate(x), default=None, |
| 217 | +                        help='cutoff date (yyyymmddhhmmss); articles with revisions older than this count as established')
| 218 | + parser.add_argument('-L', '--limit', metavar='N', |
| 219 | + dest='limit', type=int, default=30, |
| 220 | + help='') |
| 221 | + parser.add_argument('-o', '--output', metavar='FILE', |
| 222 | + dest='output', type=lambda x: open(x, 'w'), default=sys.stdout, |
| 223 | +                        help='output file (default: stdout)')
| 224 | + parser.add_argument('-b', '--include-bots', |
| 225 | + dest='include_bots', action='store_true', default=False, |
| 226 | +                        help='include edits made by bots')
102 | 227 | parser.add_argument('-d', '--db', metavar='DBNAME', required=True, |
103 | 228 | dest='db', type=str, default='hywiki-p', |
104 | 229 | help='target wiki name') |
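
For registered users, ``editcount_before`` avoids scanning the whole revision history: it takes the cached ``user.user_editcount`` and subtracts the count of revisions newer than the window start, which is usually the far smaller query. A usage sketch of the three helpers (identifiers and dates hypothetical)::

    uid, uname = 12345, 'Example_user'
    before = editcount_before(cursor, uid, uname, '20090701000000')
    during = editcount_duration(cursor, uid, uname,
                                '20090701000000', '20090801000000')
    future = edits_duration(cursor, uid, uname,      # full revision_t rows
                            '20091101000000', '20100201000000')
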
— | — | @@ -105,21 +230,20 @@ |
106 | 231 | options = parser.parse_args() |
107 | 232 | options.db = options.db.replace('_','-') |
108 | 233 | |
109 | | - host = options.db + '.rrdb.toolserver.org' |
110 | | - conn = oursql.connect(host = host, |
| 234 | + if options.host == '': |
| 235 | + options.host = options.db + '.rrdb.toolserver.org' |
| 236 | + conn = oursql.connect(host = options.host, |
111 | 237 | read_default_file=os.path.expanduser('~/.my.cnf'), |
112 | 238 | db = options.db.replace('-','_'), |
113 | 239 | charset=None, |
114 | 240 | use_unicode=False) |
115 | 241 | |
116 | 242 | cursor = conn.cursor() |
117 | | - |
118 | 243 | csv.field_size_limit(1000000000) |
119 | 244 | table = list(csv.reader(open(options.input), delimiter='\t')) |
120 | 245 | table = table[1:] |
121 | 246 | |
122 | 247 | output = [] |
123 | | - hours = {} |
124 | 248 | for cols in table: |
125 | 249 | cursor.execute(''' |
126 | 250 | SELECT p.page_id, p.page_title, page_is_redirect |
— | — | @@ -134,8 +258,7 @@ |
135 | 259 | print >>sys.stderr, 'error 1 %s' % cols |
136 | 260 | continue |
137 | 261 | redirect = int(res[0][2]) == 1 |
138 | | - cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE') |
139 | | - cols.insert(options.field, str(res[0][0])) |
| 262 | + cols[options.field:options.field] = ['REDIRECT' if redirect else 'ARTICLE', str(res[0][0])] |
140 | 263 | output.append(cols) |
141 | 264 | if redirect: |
142 | 265 | (title,pageid) = redirected(cursor, res[0][0]) |
— | — | @@ -143,11 +266,8 @@ |
144 | 267 | print >>sys.stderr, 'error 2 %s' % cols |
145 | 268 | continue |
146 | 269 | a = [x for x in cols] |
147 | | - a[0] = title |
148 | | - a[1] = str(pageid) |
149 | | - a[2] = 'REDIRECT_RESOLVED' |
| 270 | + a[options.field-1:options.field+2] = (title,str(pageid),'REDIRECT_RESOLVED') |
150 | 271 | output.append(a) |
151 | | - hours[cols[3]] = True |
152 | 272 | |
153 | 273 | # cursor.executemany(''' |
154 | 274 | # SELECT p.page_title, p.page_id |
— | — | @@ -158,16 +278,25 @@ |
159 | 279 | # ''', [(urllib2.quote(x[options.field-1]),) for x in table]) |
160 | 280 | # print list(cursor) |
161 | 281 | |
162 | | - print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?']) |
163 | | - |
164 | | - botpat = re.compile('bot( |$)', re.IGNORECASE) |
165 | | - edits = 0 |
| 282 | + edits = {} |
166 | 283 | articles = {} |
| 284 | + users = {} |
| 285 | + timestamps = {} |
167 | 286 | for cols in output: |
168 | | - start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
169 | | - end = start + timedelta(hours=1) |
| 287 | + ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
| 288 | + timestamps[ts] = True |
| 289 | + duration = sorted(timestamps.keys()) |
| 290 | + duration = (wikidate_t(format_wikidate(duration[0]), duration[0]), |
| 291 | +                wikidate_t(format_wikidate(duration[-1]), duration[-1]))
| 292 | +    if options.olderthan is None:
| 293 | + options.olderthan = duration[0].datetime - timedelta(days=365) |
| 294 | + |
| 295 | + for cols in output: |
| 296 | + ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
| 297 | + start = ts + timedelta(hours=-options.hours) |
| 298 | + end = start + timedelta(hours=options.hours) |
170 | 299 | cursor.execute(''' |
171 | | - SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text |
| 300 | + SELECT * |
172 | 301 | FROM revision r |
173 | 302 | WHERE |
174 | 303 | r.rev_page = ? |
— | — | @@ -175,21 +304,96 @@ |
176 | 305 | ; |
177 | 306 | |
178 | 307 | ''', (cols[1], |
179 | | - datetime.strftime(start, '%Y%m%d$H%M%S'), |
180 | | - datetime.strftime(end, '%Y%m%d$H%M%S'), |
| 308 | + datetime.strftime(start, '%Y%m%d%H%M%S'), |
| 309 | + datetime.strftime(end, '%Y%m%d%H%M%S'), |
181 | 310 | )) |
182 | | - ls = list(cursor) |
| 311 | + ls = [make_revision_t(*x) for x in cursor] |
183 | 312 | if len(ls) == 0: |
184 | | - print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end) |
185 | | - for (rev,ts,uid,username) in ls: |
186 | | - usertype = 'ANON' if uid == 0 else 'REG' |
187 | | - if uid != 0 and botpat.search(username): |
188 | | - usertype += '_BOT' |
189 | | - output = cols + [str(x) for x in [rev, ts, usertype, username, |
190 | | - editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])), |
191 | | - 'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']] |
192 | | - print '\t'.join(output) |
193 | | - edits +=1 |
194 | | - articles[cols[1]] = True |
| 313 | + print >>sys.stderr, 'no revision: %s %s %s' % (start, end, cols[0]) |
| 314 | + for rev in ls: |
| 315 | + usertype = 'ANON' if rev.userid == 0 else 'REG' |
| 316 | + if rev.userid != 0 and botpat.search(rev.usertext): |
| 317 | + if options.include_bots: |
| 318 | + usertype += '_BOT' |
| 319 | + else: |
| 320 | + print >>sys.stderr, 'rev %s is by bot (%s)' % (rev.oldid, rev.usertext) |
| 321 | + continue |
| 322 | + if not users.has_key((rev.userid,rev.usertext)): |
| 323 | + users[(rev.userid,rev.usertext)] = user_t(id=rev.userid, name=rev.usertext, |
| 324 | + first=firstedits(cursor, rev.userid, rev.usertext), |
| 325 | + editcount=editcount_before(cursor, rev.userid, rev.usertext, duration[0].text), |
| 326 | + periodedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].text, duration[1].text), |
| 327 | +                                                         futureedits=edits_duration(cursor, rev.userid, rev.usertext, format_wikidate(duration[0].datetime + options.activedelta), format_wikidate(duration[1].datetime + options.activedelta + options.activedur)),
| 328 | + type=usertype) |
| 329 | + edits[rev.oldid] = (cols,rev) |
| 330 | + print >>sys.stderr, rev.oldid |
| 331 | + if not articles.has_key(cols[0]): |
| 332 | +            articles[cols[0]] = article_t(title=cols[0], protectlog=[], older=olderthan(cursor, cols[0], format_wikidate(options.olderthan)))
195 | 333 | |
196 | | - print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys())) |
| 334 | + # collect protect logs |
| 335 | + print >>sys.stderr, 'collecting protection log entries for %s - %s...' % (duration[0].text, duration[1].text) |
| 336 | + protectlog = allprotect(cursor, duration[0].text, duration[1].text) |
| 337 | + |
| 338 | + # collect protect information |
| 339 | + print >>sys.stderr, 'collecting protection log entries of %d articles for %s - %s...' % (len(articles.items()), duration[0].text, duration[1].text) |
| 340 | + for (title,article) in articles.items(): |
| 341 | + article.protectlog.extend(filter(lambda x: x.title == title, protectlog)) |
| 342 | + closest = None |
| 343 | + if article.older: |
| 344 | + closest = closestprotect(cursor, format_wikidate(options.olderthan), duration[0].text, article.title) |
| 345 | + else: |
| 346 | + closest = closestprotect(cursor, '0', duration[0].text, article.title) |
| 347 | + if closest: |
| 348 | + article.protectlog.append(closest) |
| 349 | + print >>sys.stderr, '%s %d' % (title, len(article.protectlog)) |
| 350 | + |
| 351 | + options.output.write('\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?', 'protect', 'editcount_%dd+%dd' % (options.activedelta.days, options.activedur.days)]) + '\n') |
| 352 | + |
| 353 | +    # write one output row per edit
| 354 | + print >>sys.stderr, 'writing %d edits...' % (len(edits.items())) |
| 355 | + for (revid,(cols,rev)) in sorted(edits.items(), key=lambda x: x[0]): |
| 356 | + new = 'OLD' |
| 357 | + user = users[(rev.userid, rev.usertext)] |
| 358 | + if len(user.first) == 0 or user.first[0].timestamp.datetime > rev.timestamp.datetime + timedelta(days=-30): |
| 359 | + new = 'NEW' |
| 360 | + |
| 361 | + revdate = rev.timestamp.datetime |
| 362 | + |
| 363 | + article = articles[cols[0]] |
| 364 | + protect = None |
| 365 | + if len(article.protectlog) > 0: |
| 366 | + f = filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, article.protectlog) |
| 367 | + if len(f) > 0: |
| 368 | + protect = f[0] |
| 369 | + |
| 370 | +        if protect is None:
| 371 | + protect = 'NO_PROTECT' |
| 372 | + else: |
| 373 | + m = protectpat.search(protect.params) |
| 374 | + if m: |
| 375 | + lv = m.group(1) |
| 376 | + try: |
| 377 | +                    expire = datetime.strptime(m.group(2), 'expires %H:%M, %d %B %Y')
| 378 | + if lv == 'autoconfirmed' and expire > revdate: |
| 379 | + protect = 'SEMIPROTECT' |
| 380 | + elif lv == 'admin' and expire > revdate: |
| 381 | + protect = 'PROTECT' |
| 382 | + else: |
| 383 | + protect = 'OTHER_PROTECT' |
| 384 | + except ValueError, e: |
| 385 | +                        if 'indefinite' in m.group(2):
| 386 | + protect = 'INDEFINITE' |
| 387 | + else: |
| 388 | + protect = 'OTHER_PROTECT' |
| 389 | + else: |
| 390 | + protect = 'UNKNOWN' |
| 391 | + |
| 392 | + output = cols + [str(x) for x in [revid, rev.timestamp.text, user.type, user.name, |
| 393 | + user.editcount + len(filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, user.periodedits)), |
| 394 | + new, |
| 395 | + protect, |
| 396 | + len(filter(lambda x: rev.timestamp.datetime + options.activedelta < x.timestamp.datetime and x.timestamp.datetime < rev.timestamp.datetime + options.activedelta + options.activedur, user.futureedits)) |
| 397 | + ]] |
| 398 | + line = '\t'.join(output) |
| 399 | + options.output.write(line + '\n') |
| 400 | + options.output.write('# %s / %s edits/article\n' % (len(edits.keys()), len(articles.keys()))) |
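
For reference, ``protectpat`` expects ``log_params`` in the shape MediaWiki writes for protection events; its two groups feed the level check and the expiry parse above. On a typical entry (sample string, not from the source)::

    >>> m = protectpat.search('[edit=autoconfirmed] '
    ...                       '(expires 16:45, 27 July 2011 (UTC))')
    >>> m.group(1)
    'autoconfirmed'
    >>> m.group(2)
    'expires 16:45, 27 July 2011'
    >>> datetime.strptime(m.group(2), 'expires %H:%M, %d %B %Y')
    datetime.datetime(2011, 7, 27, 16, 45)
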