r95806 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r95805 | r95806 | r95807 >
Date:19:25, 30 August 2011
Author:whym
Status:deferred
Tags:
Comment:
Updates to trending article scripts, adding docs and some optimizations
Modified paths:
  • /trunk/tools/wsor/trending_articles/README.rst (modified) (history)
  • /trunk/tools/wsor/trending_articles/chart.py (modified) (history)
  • /trunk/tools/wsor/trending_articles/detectbursts.py (modified) (history)
  • /trunk/tools/wsor/trending_articles/find_revision_status.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/trending_articles/detectbursts.py
@@ -7,19 +7,22 @@
88 from datetime import datetime, timedelta
99 import argparse
1010 import random
11 -import gzip
 11+import myzip
1212 import re
1313 import os
1414 import urllib2
1515 from collections import deque, namedtuple
1616 import numpy as np
17 -import gc
1817
1918 pageview_tuple = namedtuple('Pageview', 'date count')
2019 count_tuple = namedtuple('Count', 'pred real')
2120
2221 def time_parse(x):
23 - return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
 22+ if x.endswith('.gz'):
 23+ return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
 24+ elif x.endswith('.xz'):
 25+ return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.xz')
 26+
2427 def time_format(x):
2528 return datetime.strftime(x, '%Y/%m/%d %H:%M:%S')
2629 def datetime2days(x):
@@ -28,7 +31,7 @@
2932 def load_wikistats_file(f):
3033 print >>sys.stderr, 'loading %s...' % f
3134 ret = {}
32 - for line in gzip.open(f):
 35+ for line in myzip.open(f):
3336 line.strip()
3437 (lang,title,count,bytes) = line.split(' ')
3538 ret[(lang,title)] = count_tuple(float(count), int(count))
@@ -143,9 +146,6 @@
144147 if options.inclusive:
145148 ls.insert(len(ls), bursting.has_key(page))
146149 writer.writerow([unicode(x) for x in ls])
147 - except UnicodeEncodeError, e:
148 - print >>sys.stderr, '%s: %s' % (e, page)
149 - continue
150150 except UnicodeDecodeError, e:
151151 print >>sys.stderr, '%s: %s' % (e, page)
152152 continue
Index: trunk/tools/wsor/trending_articles/README.rst
@@ -1,4 +1,37 @@
2 -See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
 2+These scripts are used to produce the results published on the `sprint
 3+page on meta`_ on the editor behavior in trending articles.
34
4 -Counts files are available at:
5 -http://dammit.lt/wikistats/archive/2011/01/
 5+Usage
 6+---------
 7+We use the following directory names.
 8+
 9+``pageview.all``
 10+ raw (hourly) page view count files
 11+``pageview.200907.en``
 12+ hourly page view count files for EN wiki in July 2009 only
 13+``pageview.200907.daily.en``
 14+ daily page view count files for EN wiki in July 2009 only
 15+
 16+
 17+1. Obtain the page view count files from `Domas's WikiStats`_ or from the ``stats`` directory of `Toolserver's user-store`_.
 18+2. (optional) Select only the page names you are interested in, to reduce the processing time. For example, use this command ::
 19+
 20+ for f in pageview.all/2009/07/pagecounts-200907*.gz ; do ggrep '^en [^ ]* ' $f | gzip > pageview.200907.en/`basename $f`; done
 21+
 22+3. (optional) Convert hourly page views into daily page views with ::
 23+
 24+ ./accumulatedaily.py pageview.200907.en/pagecounts-200907*.gz -p 3 -f 'pageview.200907.daily.en/pagecounts-%Y%m%d-%H%M%S.gz'
 25+
 26+4. Detect bursts in page views with ::
 27+
 28+ python -O detectbursts.py pageview.200907.daily.en/pagecounts-200907* -w 3 --rate=3 --min=1000 --max=10 --cutoff=20 -o bursts_200907_daily.tsv
 29+
 30+
 31+Notes
 32+--------
 33+Edit counts generated by these scripts may contain errors due to MediaWiki's `bug 19311`_.
 34+
 35+.. _Domas's WikiStats: http://dammit.lt/wikistats/archive
 36+.. _Toolserver's user-store: https://wiki.toolserver.org/view/User-store
 37+.. _sprint page on meta: http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
 38+.. _bug 19311: https://bugzilla.wikimedia.org/show_bug.cgi?id=19311
Index: trunk/tools/wsor/trending_articles/chart.py
@@ -12,6 +12,7 @@
1313 import datetime
1414 import math
1515 import re
 16+import math
1617 from collections import namedtuple
1718
1819 counter_tuple = namedtuple('counter', 'name filter color explode')
@@ -36,6 +37,9 @@
3738 parser.add_argument('-v', '--verbose',
3839 dest='verbose', action='store_true', default=False,
3940 help='turn on verbose message output')
 41+ parser.add_argument('-X', '--exclude-semiprotect',
 42+ dest='nosemiprotect', action='store_true', default=False,
 43+ help='')
4044 parser.add_argument('files', nargs='+')
4145 options = parser.parse_args()
4246
@@ -52,14 +56,19 @@
5357 counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
5458 ]
5559
56 - # counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
57 - # counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
58 - # counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
59 - # counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
60 - # counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0),
61 - # #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
 60+ # counters = [counter_tuple('w/ <30d edit history or IP', lambda x: x[10] == 'ANON' or x[13] == 'NEW', '#FF4444', 0.1),
 61+ # counter_tuple('w/ >30d edit history and registered', lambda x: x, '#CCCCCC', 0.0),
6262 # ]
6363
 64+ if options.nosemiprotect:
 65+ counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
 66+ counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
 67+ counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
 68+ counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
 69+ counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0),
 70+ #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
 71+ ]
 72+
6473 counters_map = {}
6574 for x in counters:
6675 counters_map[x.name] = x
@@ -67,14 +76,17 @@
6877 ratios = []
6978 patt = re.compile('(\d+) / (\d+) / (\d+)')
7079 for (i,fname) in enumerate(options.files):
 80+ ratios.append(1.0)
7181 for line in open(fname).readlines():
7282 m = patt.search(line)
7383 if m:
74 - ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5)
 84+ ratios[i] = (float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5
7585 break
 86+
7687 sum_ratio = sum(ratios)
7788 counter_names = [x.name for x in counters]
7889
 90+ # chart for breakdown of users
7991 plots = []
8092 matplotlib.rc('font', size=options.fsize)
8193 for (n,fname) in enumerate(options.files):
@@ -91,7 +103,8 @@
92104 counts[c[0]].add(cols[options.field-1])
93105 break
94106
95 - print counts#!
 107+ for (name,value) in counts.items():
 108+ print name, len(value)
96109 #plt.subplot(1, len(options.files), n+1)
97110 plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
98111 plt.title(fname)
@@ -105,4 +118,39 @@
106119 loc=(.8, .8))
107120
108121 base,ext = os.path.splitext(fname)
 122+ print >>sys.stderr, 'output: ' + base
109123 plt.savefig('.'.join([base, 'svg']))
 124+
 125+ # chart for new editor retention
 126+ for (n,fname) in enumerate(options.files):
 127+ plt.figure(figsize=(10,10))
 128+ table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t'))
 129+ table = table[1:]
 130+ filt = lambda x: x[10] == 'REG' and x[13] == 'NEW'
 131+ bin = lambda x: min(int(10 * math.log10(int(x[15]) + 1)), int(10 * math.log10(3000)))
 132+ username = lambda x: x[11]
 133+ users = {}
 134+ bins = {}
 135+ for cols in table:
 136+ if filt(cols) and not users.has_key(username(cols)):
 137+ users[username(cols)] = True
 138+ b = bin(cols)
 139+ bins.setdefault(b, 0)
 140+ bins[b] += 1
 141+
 142+ bins = sorted(bins.items(), key=lambda x: -x[0])
 143+ max_bin = max(x[0] for x in bins)
 144+
 145+ if max_bin == 0:
 146+ print >>sys.stderr, '%s: %s (no values)' % (fname, bins)
 147+ continue
 148+ print >>sys.stderr, '%s: %s' % (fname, bins)
 149+
 150+ p = plt.pie([x[1] for x in bins],
 151+ pctdistance=1.2,
 152+ autopct='%1.1f%%',
 153+ colors=['#' + 3 * ('%02X' % int(255 - 255 * float(x[0]) / max_bin)) for x in bins])
 154+
 155+ base,ext = os.path.splitext(fname)
 156+ print >>sys.stderr, 'output: ' + base
 157+ plt.savefig('.'.join([base, 'retention', 'svg']))
Index: trunk/tools/wsor/trending_articles/find_revision_status.py
@@ -10,7 +10,27 @@
1111 import urllib2
1212 import re
1313 from datetime import datetime, timedelta
 14+from collections import namedtuple
1415
 16+revision_t = namedtuple('revision', 'oldid pageid textid comment userid usertext timestamp minor deleted length parentid')
 17+user_t = namedtuple('user', 'id name first editcount periodedits futureedits type')
 18+article_t = namedtuple('article', 'title protectlog older')
 19+edits_t = namedtuple('edits', 'before between')
 20+wikidate_t = namedtuple('wikidate', 'text datetime')
 21+log_t = namedtuple('log', 'title action params timestamp')
 22+
 23+botpat = re.compile('bot( |$)', re.IGNORECASE)
 24+protectpat = re.compile('\[edit=(.*?)\] \((.*?) \(UTC\)\)')
 25+
 26+def make_revision_t(*args):
 27+ x = revision_t(*args)
 28+ return x._replace(timestamp=wikidate_t(text=x.timestamp,
 29+ datetime=parse_wikidate(x.timestamp)))
 30+def make_log_t(*args):
 31+ x = log_t(*args)
 32+ return x._replace(timestamp=wikidate_t(text=x.timestamp,
 33+ datetime=parse_wikidate(x.timestamp)))
 34+
1535 def parse_wikidate(x):
1636 return datetime.strptime(str(x), '%Y%m%d%H%M%S')
1737
@@ -47,57 +67,162 @@
4868 (title, rd_pid) = redirected(cursor, rd_pid, namespace)
4969 return (title, rd_pid)
5070
51 -def firstedits(cursor, uid, uname, delta, n):
 71+def allprotect(cursor, start, end, ns=0):
 72+ cursor.execute('''
 73+ SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
 74+ FROM logging l
 75+ WHERE
 76+ l.log_type = "protect"
 77+ AND l.log_timestamp BETWEEN ? AND ?
 78+ AND l.log_namespace = ?
 79+ ORDER BY l.log_timestamp DESC
 80+ ;
 81+ ''', (start, end, ns))
 82+ return [make_log_t(*x) for x in list(cursor)]
 83+
 84+def closestprotect(cursor, limit, start, title, ns=0):
 85+ cursor.execute('''
 86+ SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
 87+ FROM logging l
 88+ WHERE
 89+ l.log_type = "protect"
 90+ AND l.log_title = ?
 91+ AND l.log_timestamp BETWEEN ? AND ?
 92+ AND l.log_namespace = ?
 93+ ORDER BY l.log_timestamp DESC
 94+ LIMIT 1
 95+ ;
 96+ ''', (title, limit, start, ns))
 97+ ls = list(cursor)
 98+ if len(ls) == 0:
 99+ return None
 100+ return make_log_t(*(ls[0]))
 101+
 102+def firstedits(cursor, uid, uname, limit=1):
52103 where = 'r.rev_user_text = ?'
53104 uspec = uname
54105 if uid != 0:
55106 where = 'r.rev_user = ?'
56107 uspec = uid
57108 cursor.execute('''
58 - SELECT r.rev_timestamp
 109+ SELECT *
59110 FROM revision r
60111 WHERE
61112 r.rev_timestamp != ""
62113 AND %s
63 - ORDER BY r.rev_timestamp ASC
64 - LIMIT 1
 114+ ORDER BY r.rev_timestamp ASC
 115+ LIMIT ?
65116 ;
66 - ''' % (where,), (uspec,))
67 - first = list(cursor)[0][0]
68 - first = parse_wikidate(first)
 117+ ''' % (where,), (uspec,limit))
 118+ return [make_revision_t(*x) for x in cursor]
 119+
 120+def olderthan(cursor, title, timestamp):
69121 cursor.execute('''
70 - SELECT r.rev_id
 122+ SELECT r.rev_timestamp
71123 FROM revision r
 124+ INNER JOIN page p on p.page_id = r.rev_page
72125 WHERE
73 - %s
74 - AND r.rev_timestamp BETWEEN ? AND ?
75 - LIMIT ?
 126+ r.rev_timestamp != ""
 127+ AND p.page_title = ?
 128+ AND r.rev_timestamp < ?
 129+ LIMIT 1
76130 ;
77 - ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n))
78 - return [int(x[0]) for x in list(cursor)]
 131+ ''', (title,timestamp))
 132+ return len(list(cursor)) != 0
79133
80 -def editcount(cursor, uid, uname, timestamp):
81 - where = 'r.rev_user_text = ?'
82 - uspec = uname
 134+def editcount_before(cursor, uid, uname, timestamp):
83135 if uid != 0:
84 - where = 'r.rev_user = ?'
85 - uspec = uid
86 -
 136+ cursor.execute('''
 137+ SELECT /* SLOW_OK */ count(*)
 138+ FROM revision r
 139+ WHERE
 140+ r.rev_user = ?
 141+ AND r.rev_timestamp > ?
 142+ ;
 143+ ''', (uid,timestamp))
 144+ newedits = list(cursor)[0][0]
 145+ cursor.execute('''
 146+ SELECT u.user_editcount
 147+ FROM user u
 148+ WHERE
 149+ u.user_id = ?
 150+ ;
 151+ ''', (uid,))
 152+ alledits = list(cursor)[0][0]
 153+ return int(alledits) - int(newedits)
 154+ else:
 155+ # anonymous user's edit count only can be found from revision
 156+ cursor.execute('''
 157+ SELECT /* SLOW_OK */ count(*)
 158+ FROM revision r
 159+ WHERE
 160+ r.rev_user_text = ?
 161+ AND r.rev_timestamp < ?
 162+ ;
 163+ ''', (uname,timestamp))
 164+ return int(list(cursor)[0][0])
 165+
 166+def editcount_duration(cursor, uid, uname, timestamp1, timestamp2):
 167+ uspec = 'r.rev_user = ?'
 168+ uarg = uid
 169+ if uid == 0:
 170+ uspec = 'r.rev_user_text = ?'
 171+ uarg = uname
87172 cursor.execute('''
88 - SELECT count(*)
 173+ SELECT /* SLOW_OK */ count(*)
89174 FROM revision r
90175 WHERE
91176 %s
92 - AND r.rev_timestamp < ?
 177+ AND r.rev_timestamp BETWEEN ? AND ?
93178 ;
94 - ''' % (where,), (uspec,timestamp))
 179+ ''' % uspec, (uarg, timestamp1, timestamp2))
95180 return int(list(cursor)[0][0])
96181
 182+def edits_duration(cursor, uid, uname, timestamp1, timestamp2):
 183+ uspec = 'r.rev_user = ?'
 184+ uarg = uid
 185+ if uid == 0:
 186+ uspec = 'r.rev_user_text = ?'
 187+ uarg = uname
 188+ cursor.execute('''
 189+ SELECT /* SLOW_OK */ *
 190+ FROM revision r
 191+ WHERE
 192+ %s
 193+ AND r.rev_timestamp BETWEEN ? AND ?
 194+ ;
 195+ ''' % uspec, (uarg, timestamp1, timestamp2))
 196+ return [make_revision_t(*x) for x in list(cursor)]
 197+
97198 if __name__ == '__main__':
98199 parser = argparse.ArgumentParser()
99200 parser.add_argument('-f', '--field', metavar='N',
100201 dest='field', type=int, default=1,
101202 help='')
 203+ parser.add_argument('-H', '--host', metavar='HOST',
 204+ dest='host', type=str, default='',
 205+ help='mysql host name')
 206+ parser.add_argument('-R', '--hours', metavar='N',
 207+ dest='hours', type=int, default=1,
 208+ help='')
 209+ parser.add_argument('-a', '--activity-delta', metavar='DAYS',
 210+ dest='activedelta', type=lambda x: timedelta(days=x), default=timedelta(days=120),
 211+ help='')
 212+ parser.add_argument('-D', '--activity-duration', metavar='DAYS',
 213+ dest='activedur', type=lambda x: timedelta(days=x), default=timedelta(days=90),
 214+ help='')
 215+ parser.add_argument('-O', '--threshold', metavar='DATE',
 216+ dest='olderthan', type=lambda x: parse_wikidate(x), default=None,
 217+ help='')
 218+ parser.add_argument('-L', '--limit', metavar='N',
 219+ dest='limit', type=int, default=30,
 220+ help='')
 221+ parser.add_argument('-o', '--output', metavar='FILE',
 222+ dest='output', type=lambda x: open(x, 'w'), default=sys.stdout,
 223+ help='')
 224+ parser.add_argument('-b', '--include-bots',
 225+ dest='include_bots', action='store_true', default=False,
 226+ help='')
102227 parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
103228 dest='db', type=str, default='hywiki-p',
104229 help='target wiki name')
@@ -105,21 +230,20 @@
106231 options = parser.parse_args()
107232 options.db = options.db.replace('_','-')
108233
109 - host = options.db + '.rrdb.toolserver.org'
110 - conn = oursql.connect(host = host,
 234+ if options.host == '':
 235+ options.host = options.db + '.rrdb.toolserver.org'
 236+ conn = oursql.connect(host = options.host,
111237 read_default_file=os.path.expanduser('~/.my.cnf'),
112238 db = options.db.replace('-','_'),
113239 charset=None,
114240 use_unicode=False)
115241
116242 cursor = conn.cursor()
117 -
118243 csv.field_size_limit(1000000000)
119244 table = list(csv.reader(open(options.input), delimiter='\t'))
120245 table = table[1:]
121246
122247 output = []
123 - hours = {}
124248 for cols in table:
125249 cursor.execute('''
126250 SELECT p.page_id, p.page_title, page_is_redirect
@@ -134,8 +258,7 @@
135259 print >>sys.stderr, 'error 1 %s' % cols
136260 continue
137261 redirect = int(res[0][2]) == 1
138 - cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
139 - cols.insert(options.field, str(res[0][0]))
 262+ cols[options.field:options.field] = ['REDIRECT' if redirect else 'ARTICLE', str(res[0][0])]
140263 output.append(cols)
141264 if redirect:
142265 (title,pageid) = redirected(cursor, res[0][0])
@@ -143,11 +266,8 @@
144267 print >>sys.stderr, 'error 2 %s' % cols
145268 continue
146269 a = [x for x in cols]
147 - a[0] = title
148 - a[1] = str(pageid)
149 - a[2] = 'REDIRECT_RESOLVED'
 270+ a[options.field-1:options.field+2] = (title,str(pageid),'REDIRECT_RESOLVED')
150271 output.append(a)
151 - hours[cols[3]] = True
152272
153273 # cursor.executemany('''
154274 # SELECT p.page_title, p.page_id
@@ -158,16 +278,25 @@
159279 # ''', [(urllib2.quote(x[options.field-1]),) for x in table])
160280 # print list(cursor)
161281
162 - print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])
163 -
164 - botpat = re.compile('bot( |$)', re.IGNORECASE)
165 - edits = 0
 282+ edits = {}
166283 articles = {}
 284+ users = {}
 285+ timestamps = {}
167286 for cols in output:
168 - start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
169 - end = start + timedelta(hours=1)
 287+ ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
 288+ timestamps[ts] = True
 289+ duration = sorted(timestamps.keys())
 290+ duration = (wikidate_t(format_wikidate(duration[0]), duration[0]),
 291+ wikidate_t(format_wikidate(duration[-1]), duration[-2]))
 292+ if options.olderthan == None:
 293+ options.olderthan = duration[0].datetime - timedelta(days=365)
 294+
 295+ for cols in output:
 296+ ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
 297+ start = ts + timedelta(hours=-options.hours)
 298+ end = start + timedelta(hours=options.hours)
170299 cursor.execute('''
171 - SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
 300+ SELECT *
172301 FROM revision r
173302 WHERE
174303 r.rev_page = ?
@@ -175,21 +304,96 @@
176305 ;
177306
178307 ''', (cols[1],
179 - datetime.strftime(start, '%Y%m%d$H%M%S'),
180 - datetime.strftime(end, '%Y%m%d$H%M%S'),
 308+ datetime.strftime(start, '%Y%m%d%H%M%S'),
 309+ datetime.strftime(end, '%Y%m%d%H%M%S'),
181310 ))
182 - ls = list(cursor)
 311+ ls = [make_revision_t(*x) for x in cursor]
183312 if len(ls) == 0:
184 - print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
185 - for (rev,ts,uid,username) in ls:
186 - usertype = 'ANON' if uid == 0 else 'REG'
187 - if uid != 0 and botpat.search(username):
188 - usertype += '_BOT'
189 - output = cols + [str(x) for x in [rev, ts, usertype, username,
190 - editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
191 - 'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']]
192 - print '\t'.join(output)
193 - edits +=1
194 - articles[cols[1]] = True
 313+ print >>sys.stderr, 'no revision: %s %s %s' % (start, end, cols[0])
 314+ for rev in ls:
 315+ usertype = 'ANON' if rev.userid == 0 else 'REG'
 316+ if rev.userid != 0 and botpat.search(rev.usertext):
 317+ if options.include_bots:
 318+ usertype += '_BOT'
 319+ else:
 320+ print >>sys.stderr, 'rev %s is by bot (%s)' % (rev.oldid, rev.usertext)
 321+ continue
 322+ if not users.has_key((rev.userid,rev.usertext)):
 323+ users[(rev.userid,rev.usertext)] = user_t(id=rev.userid, name=rev.usertext,
 324+ first=firstedits(cursor, rev.userid, rev.usertext),
 325+ editcount=editcount_before(cursor, rev.userid, rev.usertext, duration[0].text),
 326+ periodedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].text, duration[1].text),
 327+ futureedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].datetime + options.activedelta, duration[1].datetime + options.activedelta + options.activedur),
 328+ type=usertype)
 329+ edits[rev.oldid] = (cols,rev)
 330+ print >>sys.stderr, rev.oldid
 331+ if not articles.has_key(cols[0]):
 332+ articles[cols[0]] = article_t(title=cols[0], protectlog=[], older=olderthan(cursor, cols[0], options.olderthan))
195333
196 - print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys()))
 334+ # collect protect logs
 335+ print >>sys.stderr, 'collecting protection log entries for %s - %s...' % (duration[0].text, duration[1].text)
 336+ protectlog = allprotect(cursor, duration[0].text, duration[1].text)
 337+
 338+ # collect protect information
 339+ print >>sys.stderr, 'collecting protection log entries of %d articles for %s - %s...' % (len(articles.items()), duration[0].text, duration[1].text)
 340+ for (title,article) in articles.items():
 341+ article.protectlog.extend(filter(lambda x: x.title == title, protectlog))
 342+ closest = None
 343+ if article.older:
 344+ closest = closestprotect(cursor, format_wikidate(options.olderthan), duration[0].text, article.title)
 345+ else:
 346+ closest = closestprotect(cursor, '0', duration[0].text, article.title)
 347+ if closest:
 348+ article.protectlog.append(closest)
 349+ print >>sys.stderr, '%s %d' % (title, len(article.protectlog))
 350+
 351+ options.output.write('\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?', 'protect', 'editcount_%dd+%dd' % (options.activedelta.days, options.activedur.days)]) + '\n')
 352+
 353+ # collect protect information
 354+ print >>sys.stderr, 'writing %d edits...' % (len(edits.items()))
 355+ for (revid,(cols,rev)) in sorted(edits.items(), key=lambda x: x[0]):
 356+ new = 'OLD'
 357+ user = users[(rev.userid, rev.usertext)]
 358+ if len(user.first) == 0 or user.first[0].timestamp.datetime > rev.timestamp.datetime + timedelta(days=-30):
 359+ new = 'NEW'
 360+
 361+ revdate = rev.timestamp.datetime
 362+
 363+ article = articles[cols[0]]
 364+ protect = None
 365+ if len(article.protectlog) > 0:
 366+ f = filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, article.protectlog)
 367+ if len(f) > 0:
 368+ protect = f[0]
 369+
 370+ if protect == None or len(protect) == 0:
 371+ protect = 'NO_PROTECT'
 372+ else:
 373+ m = protectpat.search(protect.params)
 374+ if m:
 375+ lv = m.group(1)
 376+ try:
 377+ expire = datetime.strptime(m.group(2), 'expires %M:%S, %d %B %Y')
 378+ if lv == 'autoconfirmed' and expire > revdate:
 379+ protect = 'SEMIPROTECT'
 380+ elif lv == 'admin' and expire > revdate:
 381+ protect = 'PROTECT'
 382+ else:
 383+ protect = 'OTHER_PROTECT'
 384+ except ValueError, e:
 385+ if m.group(2).find('indefinite'):
 386+ protect = 'INDEFINITE'
 387+ else:
 388+ protect = 'OTHER_PROTECT'
 389+ else:
 390+ protect = 'UNKNOWN'
 391+
 392+ output = cols + [str(x) for x in [revid, rev.timestamp.text, user.type, user.name,
 393+ user.editcount + len(filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, user.periodedits)),
 394+ new,
 395+ protect,
 396+ len(filter(lambda x: rev.timestamp.datetime + options.activedelta < x.timestamp.datetime and x.timestamp.datetime < rev.timestamp.datetime + options.activedelta + options.activedur, user.futureedits))
 397+ ]]
 398+ line = '\t'.join(output)
 399+ options.output.write(line + '\n')
 400+ options.output.write('# %s / %s edits/article\n' % (len(edits.keys()), len(articles.keys())))

Status & tagging log