Index: trunk/tools/wsor/trending_articles/detectbursts.py
— | — | @@ -7,19 +7,22 @@ |
8 | 8 | from datetime import datetime, timedelta |
9 | 9 | import argparse |
10 | 10 | import random |
11 | | -import gzip |
| 11 | +import myzip |
12 | 12 | import re |
13 | 13 | import os |
14 | 14 | import urllib2 |
15 | 15 | from collections import deque, namedtuple |
16 | 16 | import numpy as np |
17 | | -import gc |
18 | 17 | |
19 | 18 | pageview_tuple = namedtuple('Pageview', 'date count') |
20 | 19 | count_tuple = namedtuple('Count', 'pred real') |
21 | 20 | |
22 | 21 | def time_parse(x): |
23 | | - return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz') |
| 22 | + if x.endswith('.gz'): |
| 23 | + return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz') |
| 24 | +    elif x.endswith('.xz'):
| 25 | + return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.xz') |
| 26 | + |
24 | 27 | def time_format(x): |
25 | 28 | return datetime.strftime(x, '%Y/%m/%d %H:%M:%S') |
26 | 29 | def datetime2days(x): |
— | — | @@ -28,7 +31,7 @@ |
29 | 32 | def load_wikistats_file(f): |
30 | 33 | print >>sys.stderr, 'loading %s...' % f |
31 | 34 | ret = {} |
32 | | - for line in gzip.open(f): |
| 35 | + for line in myzip.open(f): |
33 | 36 | 	line = line.strip()
34 | 37 | (lang,title,count,bytes) = line.split(' ') |
35 | 38 | ret[(lang,title)] = count_tuple(float(count), int(count)) |
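
Note: ``myzip`` is not part of this changeset. Presumably it is a small local module that picks a decompressor from the file extension, so the scripts can read ``.gz`` and ``.xz`` pagecount files alike. A minimal sketch of such a module (the name, fallback, and mode handling are assumptions)::

    # myzip.py -- hypothetical stand-in for the module imported above.
    import gzip
    import io

    def open(path, mode='rb'):
        # Dispatch on extension so callers need not care how a
        # pagecounts file is compressed.
        if path.endswith('.gz'):
            return gzip.open(path, mode)
        if path.endswith('.xz'):
            import lzma  # stdlib in Python 3; 'backports.lzma' on Python 2
            return lzma.open(path, mode)
        return io.open(path, mode)  # uncompressed fallback
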
— | — | @@ -143,9 +146,6 @@ |
144 | 147 | if options.inclusive: |
145 | 148 | ls.insert(len(ls), bursting.has_key(page)) |
146 | 149 | writer.writerow([unicode(x) for x in ls]) |
147 | | - except UnicodeEncodeError, e: |
148 | | - print >>sys.stderr, '%s: %s' % (e, page) |
149 | | - continue |
150 | 150 | except UnicodeDecodeError, e: |
151 | 151 | print >>sys.stderr, '%s: %s' % (e, page) |
152 | 152 | continue |
Index: trunk/tools/wsor/trending_articles/README.rst |
— | — | @@ -1,4 +1,37 @@ |
2 | | -See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors |
| 2 | +These scripts are used to produce the results published on the `sprint |
| 3 | +page on meta`_ about editor behavior in trending articles.
3 | 4 | |
4 | | -Counts files are available at: |
5 | | -http://dammit.lt/wikistats/archive/2011/01/ |
| 5 | +Usage |
| 6 | +--------- |
| 7 | +We use the following directory names.
| 8 | + |
| 9 | +``pageview.all`` |
| 10 | + raw (hourly) page view count files |
| 11 | +``pageview.200907.en`` |
| 12 | + hourly page view count files for EN wiki in July 2009 only |
| 13 | +``pageview.200907.daily.en`` |
| 14 | + daily page view count files for EN wiki in July 2009 only |
| 15 | + |
| 16 | + |
| 17 | +1. Obtain the page view count files from `Domas's WikiStats`_ or from the ``stats`` directory on `Toolserver's user-store`_ (an example record is shown after this list).
| 18 | +2. (optional) Select only the page names you are interested in, to reduce processing time. For example, use this command::
| 19 | + |
| 20 | + for f in pageview.all/2009/07/pagecounts-200907*.gz ; do ggrep '^en [^ ]* ' $f | gzip > pageview.200907.en/`basename $f`; done |
| 21 | + |
| 22 | +3. (optional) Convert hourly page views into daily page views with::
| 23 | + |
| 24 | + ./accumulatedaily.py pageview.200907.en/pagecounts-200907*.gz -p 3 -f 'pageview.200907.daily.en/pagecounts-%Y%m%d-%H%M%S.gz' |
| 25 | + |
| 26 | +4. Detect bursts in page views with :: |
| 27 | + |
| 28 | + python -O detectbursts.py pageview.200907.daily.en/pagecounts-200907* -w 3 --rate=3 --min=1000 --max=10 --cutoff=20 -o bursts_200907_daily.tsv |
| 29 | + |
| 30 | + |
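
Each pagecounts file is a space-separated list of ``project title count bytes`` records, one page per line, which is the layout ``load_wikistats_file`` in ``detectbursts.py`` expects; for example (values invented)::

    en Main_Page 242332 4737756
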
| 31 | +Notes |
| 32 | +-------- |
| 33 | +Edit counts generated by these scripts may contain errors due to MediaWiki `bug 19311`_.
| 34 | + |
| 35 | +.. _Domas's WikiStats: http://dammit.lt/wikistats/archive |
| 36 | +.. _Toolserver's user-store: https://wiki.toolserver.org/view/User-store |
| 37 | +.. _sprint page on meta: http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors |
| 38 | +.. _bug 19311: https://bugzilla.wikimedia.org/show_bug.cgi?id=19311 |
Index: trunk/tools/wsor/trending_articles/chart.py |
— | — | @@ -12,6 +12,7 @@ |
13 | 13 | import datetime |
14 | 14 | import math |
15 | 15 | import re |
16 | 17 | from collections import namedtuple |
17 | 18 | |
18 | 19 | counter_tuple = namedtuple('counter', 'name filter color explode') |
— | — | @@ -36,6 +37,9 @@ |
37 | 38 | parser.add_argument('-v', '--verbose', |
38 | 39 | dest='verbose', action='store_true', default=False, |
39 | 40 | help='turn on verbose message output') |
| 41 | + parser.add_argument('-X', '--exclude-semiprotect', |
| 42 | + dest='nosemiprotect', action='store_true', default=False, |
| 43 | +                        help='exclude edits made while articles were semi-protected')
40 | 44 | parser.add_argument('files', nargs='+') |
41 | 45 | options = parser.parse_args() |
42 | 46 | |
— | — | @@ -52,14 +56,19 @@ |
53 | 57 | counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
54 | 58 | ] |
55 | 59 | |
56 | | - # counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1), |
57 | | - # counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0), |
58 | | - # counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1), |
59 | | - # counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0), |
60 | | - # counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0), |
61 | | - # #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
| 60 | + # counters = [counter_tuple('w/ <30d edit history or IP', lambda x: x[10] == 'ANON' or x[13] == 'NEW', '#FF4444', 0.1), |
| 61 | + # counter_tuple('w/ >30d edit history and registered', lambda x: x, '#CCCCCC', 0.0), |
62 | 62 | # ] |
63 | 63 | |
| 64 | + if options.nosemiprotect: |
| 65 | + counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1), |
| 66 | + counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0), |
| 67 | + counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1), |
| 68 | + counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0), |
| 69 | + counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0), |
| 70 | + #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0), |
| 71 | + ] |
| 72 | + |
64 | 73 | counters_map = {} |
65 | 74 | for x in counters: |
66 | 75 | counters_map[x.name] = x |
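
The ``x[10]``/``x[13]``/``x[14]`` subscripts in the filter lambdas index columns of the TSV emitted by ``find_revision_status.py``; per the header row that script writes, the relevant positions are::

    # 0-based column positions in the find_revision_status.py output
    USER_TYPE    = 10  # 'REG', 'ANON' or 'REG_BOT'
    USERNAME     = 11
    EDITCOUNT    = 12
    NEW          = 13  # 'NEW' or 'OLD'
    PROTECT      = 14  # e.g. 'SEMIPROTECT', 'NO_PROTECT'
    FUTURE_EDITS = 15  # the editcount_<N>d+<M>d column used by the retention chart
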
— | — | @@ -67,14 +76,17 @@ |
68 | 77 | ratios = [] |
69 | 78 | patt = re.compile('(\d+) / (\d+) / (\d+)') |
70 | 79 | for (i,fname) in enumerate(options.files): |
| 80 | + ratios.append(1.0) |
71 | 81 | for line in open(fname).readlines(): |
72 | 82 | m = patt.search(line) |
73 | 83 | if m: |
74 | | - ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5) |
| 84 | + ratios[i] = (float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5 |
75 | 85 | break |
| 86 | + |
76 | 87 | sum_ratio = sum(ratios) |
77 | 88 | counter_names = [x.name for x in counters] |
78 | 89 | |
| 90 | + # chart for breakdown of users |
79 | 91 | plots = [] |
80 | 92 | matplotlib.rc('font', size=options.fsize) |
81 | 93 | for (n,fname) in enumerate(options.files): |
— | — | @@ -91,7 +103,8 @@ |
92 | 104 | counts[c[0]].add(cols[options.field-1]) |
93 | 105 | break |
94 | 106 | |
95 | | - print counts#! |
| 107 | + for (name,value) in counts.items(): |
| 108 | + print name, len(value) |
96 | 109 | #plt.subplot(1, len(options.files), n+1) |
97 | 110 | plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio]) |
98 | 111 | plt.title(fname) |
— | — | @@ -105,4 +118,39 @@ |
106 | 119 | loc=(.8, .8)) |
107 | 120 | |
108 | 121 | base,ext = os.path.splitext(fname) |
| 122 | + print >>sys.stderr, 'output: ' + base |
109 | 123 | plt.savefig('.'.join([base, 'svg'])) |
| 124 | + |
| 125 | + # chart for new editor retention |
| 126 | + for (n,fname) in enumerate(options.files): |
| 127 | + plt.figure(figsize=(10,10)) |
| 128 | + table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t')) |
| 129 | + table = table[1:] |
| 130 | + filt = lambda x: x[10] == 'REG' and x[13] == 'NEW' |
| 131 | + bin = lambda x: min(int(10 * math.log10(int(x[15]) + 1)), int(10 * math.log10(3000))) |
| 132 | + username = lambda x: x[11] |
| 133 | + users = {} |
| 134 | + bins = {} |
| 135 | + for cols in table: |
| 136 | + if filt(cols) and not users.has_key(username(cols)): |
| 137 | + users[username(cols)] = True |
| 138 | + b = bin(cols) |
| 139 | + bins.setdefault(b, 0) |
| 140 | + bins[b] += 1 |
| 141 | + |
| 142 | + bins = sorted(bins.items(), key=lambda x: -x[0]) |
| 143 | + max_bin = max(x[0] for x in bins) |
| 144 | + |
| 145 | + if max_bin == 0: |
| 146 | + print >>sys.stderr, '%s: %s (no values)' % (fname, bins) |
| 147 | + continue |
| 148 | + print >>sys.stderr, '%s: %s' % (fname, bins) |
| 149 | + |
| 150 | + p = plt.pie([x[1] for x in bins], |
| 151 | + pctdistance=1.2, |
| 152 | + autopct='%1.1f%%', |
| 153 | + colors=['#' + 3 * ('%02X' % int(255 - 255 * float(x[0]) / max_bin)) for x in bins]) |
| 154 | + |
| 155 | + base,ext = os.path.splitext(fname) |
| 156 | + print >>sys.stderr, 'output: ' + base |
| 157 | + plt.savefig('.'.join([base, 'retention', 'svg'])) |
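
For reference, the ``bin`` lambda in the retention loop maps a future edit count onto a log10-scaled bucket capped at the 3000-edit bucket; a few sample values (computed here, not taken from the source)::

    >>> import math
    >>> binof = lambda c: min(int(10 * math.log10(c + 1)),
    ...                       int(10 * math.log10(3000)))  # same as 'bin' above
    >>> [binof(c) for c in (0, 9, 99, 999, 2999, 100000)]
    [0, 10, 20, 30, 34, 34]
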
Index: trunk/tools/wsor/trending_articles/find_revision_status.py |
— | — | @@ -10,7 +10,27 @@ |
11 | 11 | import urllib2 |
12 | 12 | import re |
13 | 13 | from datetime import datetime, timedelta |
| 14 | +from collections import namedtuple |
14 | 15 | |
| 16 | +revision_t = namedtuple('revision', 'oldid pageid textid comment userid usertext timestamp minor deleted length parentid') |
| 17 | +user_t = namedtuple('user', 'id name first editcount periodedits futureedits type') |
| 18 | +article_t = namedtuple('article', 'title protectlog older') |
| 19 | +edits_t = namedtuple('edits', 'before between') |
| 20 | +wikidate_t = namedtuple('wikidate', 'text datetime') |
| 21 | +log_t = namedtuple('log', 'title action params timestamp') |
| 22 | + |
| 23 | +botpat = re.compile('bot( |$)', re.IGNORECASE) |
| 24 | +protectpat = re.compile('\[edit=(.*?)\] \((.*?) \(UTC\)\)') |
| 25 | + |
| 26 | +def make_revision_t(*args): |
| 27 | + x = revision_t(*args) |
| 28 | + return x._replace(timestamp=wikidate_t(text=x.timestamp, |
| 29 | + datetime=parse_wikidate(x.timestamp))) |
| 30 | +def make_log_t(*args): |
| 31 | + x = log_t(*args) |
| 32 | + return x._replace(timestamp=wikidate_t(text=x.timestamp, |
| 33 | + datetime=parse_wikidate(x.timestamp))) |
| 34 | + |
15 | 35 | def parse_wikidate(x): |
16 | 36 | return datetime.strptime(str(x), '%Y%m%d%H%M%S') |
17 | 37 | |
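
``make_revision_t`` upgrades the raw ``rev_timestamp`` column into a ``wikidate_t`` so both the MediaWiki text form and the parsed ``datetime`` stay available downstream; for example (row values hypothetical)::

    row = (424242, 1000, 900, 'copyedit', 0, '127.0.0.1',
           '20090715123456', 0, 0, 1234, 424241)  # one revision-table row
    rev = make_revision_t(*row)
    rev.timestamp.text      # '20090715123456'
    rev.timestamp.datetime  # datetime.datetime(2009, 7, 15, 12, 34, 56)
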
— | — | @@ -47,57 +67,162 @@ |
48 | 68 | (title, rd_pid) = redirected(cursor, rd_pid, namespace) |
49 | 69 | return (title, rd_pid) |
50 | 70 | |
51 | | -def firstedits(cursor, uid, uname, delta, n): |
| 71 | +def allprotect(cursor, start, end, ns=0): |
| 72 | + cursor.execute(''' |
| 73 | + SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp |
| 74 | + FROM logging l |
| 75 | + WHERE |
| 76 | + l.log_type = "protect" |
| 77 | + AND l.log_timestamp BETWEEN ? AND ? |
| 78 | + AND l.log_namespace = ? |
| 79 | + ORDER BY l.log_timestamp DESC |
| 80 | + ; |
| 81 | + ''', (start, end, ns)) |
| 82 | + return [make_log_t(*x) for x in list(cursor)] |
| 83 | + |
| 84 | +def closestprotect(cursor, limit, start, title, ns=0): |
| 85 | + cursor.execute(''' |
| 86 | + SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp |
| 87 | + FROM logging l |
| 88 | + WHERE |
| 89 | + l.log_type = "protect" |
| 90 | + AND l.log_title = ? |
| 91 | + AND l.log_timestamp BETWEEN ? AND ? |
| 92 | + AND l.log_namespace = ? |
| 93 | + ORDER BY l.log_timestamp DESC |
| 94 | + LIMIT 1 |
| 95 | + ; |
| 96 | + ''', (title, limit, start, ns)) |
| 97 | + ls = list(cursor) |
| 98 | + if len(ls) == 0: |
| 99 | + return None |
| 100 | + return make_log_t(*(ls[0])) |
| 101 | + |
| 102 | +def firstedits(cursor, uid, uname, limit=1): |
52 | 103 | where = 'r.rev_user_text = ?' |
53 | 104 | uspec = uname |
54 | 105 | if uid != 0: |
55 | 106 | where = 'r.rev_user = ?' |
56 | 107 | uspec = uid |
57 | 108 | cursor.execute(''' |
58 | | - SELECT r.rev_timestamp |
| 109 | + SELECT * |
59 | 110 | FROM revision r |
60 | 111 | WHERE |
61 | 112 | r.rev_timestamp != "" |
62 | 113 | AND %s |
63 | | - ORDER BY r.rev_timestamp ASC |
64 | | - LIMIT 1 |
| 114 | + ORDER BY r.rev_timestamp ASC |
| 115 | + LIMIT ? |
65 | 116 | ; |
66 | | - ''' % (where,), (uspec,)) |
67 | | - first = list(cursor)[0][0] |
68 | | - first = parse_wikidate(first) |
| 117 | + ''' % (where,), (uspec,limit)) |
| 118 | + return [make_revision_t(*x) for x in cursor] |
| 119 | + |
| 120 | +def olderthan(cursor, title, timestamp): |
69 | 121 | cursor.execute(''' |
70 | | - SELECT r.rev_id |
| 122 | + SELECT r.rev_timestamp |
71 | 123 | FROM revision r |
| 124 | + INNER JOIN page p on p.page_id = r.rev_page |
72 | 125 | WHERE |
73 | | - %s |
74 | | - AND r.rev_timestamp BETWEEN ? AND ? |
75 | | - LIMIT ? |
| 126 | + r.rev_timestamp != "" |
| 127 | + AND p.page_title = ? |
| 128 | + AND r.rev_timestamp < ? |
| 129 | + LIMIT 1 |
76 | 130 | ; |
77 | | - ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n)) |
78 | | - return [int(x[0]) for x in list(cursor)] |
| 131 | + ''', (title,timestamp)) |
| 132 | + return len(list(cursor)) != 0 |
79 | 133 | |
80 | | -def editcount(cursor, uid, uname, timestamp): |
81 | | - where = 'r.rev_user_text = ?' |
82 | | - uspec = uname |
| 134 | +def editcount_before(cursor, uid, uname, timestamp): |
83 | 135 | if uid != 0: |
84 | | - where = 'r.rev_user = ?' |
85 | | - uspec = uid |
86 | | - |
| 136 | + cursor.execute(''' |
| 137 | + SELECT /* SLOW_OK */ count(*) |
| 138 | + FROM revision r |
| 139 | + WHERE |
| 140 | + r.rev_user = ? |
| 141 | + AND r.rev_timestamp > ? |
| 142 | + ; |
| 143 | + ''', (uid,timestamp)) |
| 144 | + newedits = list(cursor)[0][0] |
| 145 | + cursor.execute(''' |
| 146 | + SELECT u.user_editcount |
| 147 | + FROM user u |
| 148 | + WHERE |
| 149 | + u.user_id = ? |
| 150 | + ; |
| 151 | + ''', (uid,)) |
| 152 | + alledits = list(cursor)[0][0] |
| 153 | + return int(alledits) - int(newedits) |
| 154 | + else: |
| 155 | +        # an anonymous user's edit count can only be computed from the revision table
| 156 | + cursor.execute(''' |
| 157 | + SELECT /* SLOW_OK */ count(*) |
| 158 | + FROM revision r |
| 159 | + WHERE |
| 160 | + r.rev_user_text = ? |
| 161 | + AND r.rev_timestamp < ? |
| 162 | + ; |
| 163 | + ''', (uname,timestamp)) |
| 164 | + return int(list(cursor)[0][0]) |
| 165 | + |
| 166 | +def editcount_duration(cursor, uid, uname, timestamp1, timestamp2): |
| 167 | + uspec = 'r.rev_user = ?' |
| 168 | + uarg = uid |
| 169 | + if uid == 0: |
| 170 | + uspec = 'r.rev_user_text = ?' |
| 171 | + uarg = uname |
87 | 172 | cursor.execute(''' |
88 | | - SELECT count(*) |
| 173 | + SELECT /* SLOW_OK */ count(*) |
89 | 174 | FROM revision r |
90 | 175 | WHERE |
91 | 176 | %s |
92 | | - AND r.rev_timestamp < ? |
| 177 | + AND r.rev_timestamp BETWEEN ? AND ? |
93 | 178 | ; |
94 | | - ''' % (where,), (uspec,timestamp)) |
| 179 | + ''' % uspec, (uarg, timestamp1, timestamp2)) |
95 | 180 | return int(list(cursor)[0][0]) |
96 | 181 | |
| 182 | +def edits_duration(cursor, uid, uname, timestamp1, timestamp2): |
| 183 | + uspec = 'r.rev_user = ?' |
| 184 | + uarg = uid |
| 185 | + if uid == 0: |
| 186 | + uspec = 'r.rev_user_text = ?' |
| 187 | + uarg = uname |
| 188 | + cursor.execute(''' |
| 189 | + SELECT /* SLOW_OK */ * |
| 190 | + FROM revision r |
| 191 | + WHERE |
| 192 | + %s |
| 193 | + AND r.rev_timestamp BETWEEN ? AND ? |
| 194 | + ; |
| 195 | + ''' % uspec, (uarg, timestamp1, timestamp2)) |
| 196 | + return [make_revision_t(*x) for x in list(cursor)] |
| 197 | + |
97 | 198 | if __name__ == '__main__': |
98 | 199 | parser = argparse.ArgumentParser() |
99 | 200 | parser.add_argument('-f', '--field', metavar='N', |
100 | 201 | dest='field', type=int, default=1, |
101 | 202 | help='') |
| 203 | + parser.add_argument('-H', '--host', metavar='HOST', |
| 204 | + dest='host', type=str, default='', |
| 205 | + help='mysql host name') |
| 206 | +    parser.add_argument('-R', '--hours', metavar='N',
| 207 | +                        dest='hours', type=int, default=1,
| 208 | +                        help='length in hours of the edit window ending at each pageview timestamp')
| 209 | +    parser.add_argument('-a', '--activity-delta', metavar='DAYS',
| 210 | +                        dest='activedelta', type=lambda x: timedelta(days=int(x)), default=timedelta(days=120),
| 211 | +                        help='days after an edit before the follow-up activity window opens')
| 212 | +    parser.add_argument('-D', '--activity-duration', metavar='DAYS',
| 213 | +                        dest='activedur', type=lambda x: timedelta(days=int(x)), default=timedelta(days=90),
| 214 | +                        help='length in days of the follow-up activity window')
| 215 | + parser.add_argument('-O', '--threshold', metavar='DATE', |
| 216 | + dest='olderthan', type=lambda x: parse_wikidate(x), default=None, |
| 217 | +                        help='cutoff date (yyyymmddhhmmss); articles with revisions older than this count as established')
| 218 | + parser.add_argument('-L', '--limit', metavar='N', |
| 219 | + dest='limit', type=int, default=30, |
| 220 | + help='') |
| 221 | + parser.add_argument('-o', '--output', metavar='FILE', |
| 222 | + dest='output', type=lambda x: open(x, 'w'), default=sys.stdout, |
| 223 | +                        help='output file (default: stdout)')
| 224 | + parser.add_argument('-b', '--include-bots', |
| 225 | + dest='include_bots', action='store_true', default=False, |
| 226 | +                        help='include edits made by bots')
102 | 227 | parser.add_argument('-d', '--db', metavar='DBNAME', required=True, |
103 | 228 | dest='db', type=str, default='hywiki-p', |
104 | 229 | help='target wiki name') |
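
For registered users, ``editcount_before`` avoids scanning the whole revision history: it takes the cached ``user.user_editcount`` and subtracts the count of revisions newer than the window start, which is usually the far smaller query. A usage sketch of the three helpers (identifiers and dates hypothetical)::

    uid, uname = 12345, 'Example_user'
    before = editcount_before(cursor, uid, uname, '20090701000000')
    during = editcount_duration(cursor, uid, uname,
                                '20090701000000', '20090801000000')
    future = edits_duration(cursor, uid, uname,      # full revision_t rows
                            '20091101000000', '20100201000000')
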
— | — | @@ -105,21 +230,20 @@ |
106 | 231 | options = parser.parse_args() |
107 | 232 | options.db = options.db.replace('_','-') |
108 | 233 | |
109 | | - host = options.db + '.rrdb.toolserver.org' |
110 | | - conn = oursql.connect(host = host, |
| 234 | + if options.host == '': |
| 235 | + options.host = options.db + '.rrdb.toolserver.org' |
| 236 | + conn = oursql.connect(host = options.host, |
111 | 237 | read_default_file=os.path.expanduser('~/.my.cnf'), |
112 | 238 | db = options.db.replace('-','_'), |
113 | 239 | charset=None, |
114 | 240 | use_unicode=False) |
115 | 241 | |
116 | 242 | cursor = conn.cursor() |
117 | | - |
118 | 243 | csv.field_size_limit(1000000000) |
119 | 244 | table = list(csv.reader(open(options.input), delimiter='\t')) |
120 | 245 | table = table[1:] |
121 | 246 | |
122 | 247 | output = [] |
123 | | - hours = {} |
124 | 248 | for cols in table: |
125 | 249 | cursor.execute(''' |
126 | 250 | SELECT p.page_id, p.page_title, page_is_redirect |
— | — | @@ -134,8 +258,7 @@ |
135 | 259 | print >>sys.stderr, 'error 1 %s' % cols |
136 | 260 | continue |
137 | 261 | redirect = int(res[0][2]) == 1 |
138 | | - cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE') |
139 | | - cols.insert(options.field, str(res[0][0])) |
| 262 | + cols[options.field:options.field] = ['REDIRECT' if redirect else 'ARTICLE', str(res[0][0])] |
140 | 263 | output.append(cols) |
141 | 264 | if redirect: |
142 | 265 | (title,pageid) = redirected(cursor, res[0][0]) |
— | — | @@ -143,11 +266,8 @@ |
144 | 267 | print >>sys.stderr, 'error 2 %s' % cols |
145 | 268 | continue |
146 | 269 | a = [x for x in cols] |
147 | | - a[0] = title |
148 | | - a[1] = str(pageid) |
149 | | - a[2] = 'REDIRECT_RESOLVED' |
| 270 | + a[options.field-1:options.field+2] = (title,str(pageid),'REDIRECT_RESOLVED') |
150 | 271 | output.append(a) |
151 | | - hours[cols[3]] = True |
152 | 272 | |
153 | 273 | # cursor.executemany(''' |
154 | 274 | # SELECT p.page_title, p.page_id |
— | — | @@ -158,16 +278,25 @@ |
159 | 279 | # ''', [(urllib2.quote(x[options.field-1]),) for x in table]) |
160 | 280 | # print list(cursor) |
161 | 281 | |
162 | | - print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?']) |
163 | | - |
164 | | - botpat = re.compile('bot( |$)', re.IGNORECASE) |
165 | | - edits = 0 |
| 282 | + edits = {} |
166 | 283 | articles = {} |
| 284 | + users = {} |
| 285 | + timestamps = {} |
167 | 286 | for cols in output: |
168 | | - start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
169 | | - end = start + timedelta(hours=1) |
| 287 | + ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
| 288 | + timestamps[ts] = True |
| 289 | + duration = sorted(timestamps.keys()) |
| 290 | + duration = (wikidate_t(format_wikidate(duration[0]), duration[0]), |
| 291 | +                wikidate_t(format_wikidate(duration[-1]), duration[-1]))
| 292 | +    if options.olderthan is None:
| 293 | + options.olderthan = duration[0].datetime - timedelta(days=365) |
| 294 | + |
| 295 | + for cols in output: |
| 296 | + ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S') |
| 297 | + start = ts + timedelta(hours=-options.hours) |
| 298 | + end = start + timedelta(hours=options.hours) |
170 | 299 | cursor.execute(''' |
171 | | - SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text |
| 300 | + SELECT * |
172 | 301 | FROM revision r |
173 | 302 | WHERE |
174 | 303 | r.rev_page = ? |
— | — | @@ -175,21 +304,96 @@ |
176 | 305 | ; |
177 | 306 | |
178 | 307 | ''', (cols[1], |
179 | | - datetime.strftime(start, '%Y%m%d$H%M%S'), |
180 | | - datetime.strftime(end, '%Y%m%d$H%M%S'), |
| 308 | + datetime.strftime(start, '%Y%m%d%H%M%S'), |
| 309 | + datetime.strftime(end, '%Y%m%d%H%M%S'), |
181 | 310 | )) |
182 | | - ls = list(cursor) |
| 311 | + ls = [make_revision_t(*x) for x in cursor] |
183 | 312 | if len(ls) == 0: |
184 | | - print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end) |
185 | | - for (rev,ts,uid,username) in ls: |
186 | | - usertype = 'ANON' if uid == 0 else 'REG' |
187 | | - if uid != 0 and botpat.search(username): |
188 | | - usertype += '_BOT' |
189 | | - output = cols + [str(x) for x in [rev, ts, usertype, username, |
190 | | - editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])), |
191 | | - 'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']] |
192 | | - print '\t'.join(output) |
193 | | - edits +=1 |
194 | | - articles[cols[1]] = True |
| 313 | + print >>sys.stderr, 'no revision: %s %s %s' % (start, end, cols[0]) |
| 314 | + for rev in ls: |
| 315 | + usertype = 'ANON' if rev.userid == 0 else 'REG' |
| 316 | + if rev.userid != 0 and botpat.search(rev.usertext): |
| 317 | + if options.include_bots: |
| 318 | + usertype += '_BOT' |
| 319 | + else: |
| 320 | + print >>sys.stderr, 'rev %s is by bot (%s)' % (rev.oldid, rev.usertext) |
| 321 | + continue |
| 322 | + if not users.has_key((rev.userid,rev.usertext)): |
| 323 | + users[(rev.userid,rev.usertext)] = user_t(id=rev.userid, name=rev.usertext, |
| 324 | + first=firstedits(cursor, rev.userid, rev.usertext), |
| 325 | + editcount=editcount_before(cursor, rev.userid, rev.usertext, duration[0].text), |
| 326 | + periodedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].text, duration[1].text), |
| 327 | +                                                         futureedits=edits_duration(cursor, rev.userid, rev.usertext, format_wikidate(duration[0].datetime + options.activedelta), format_wikidate(duration[1].datetime + options.activedelta + options.activedur)),
| 328 | + type=usertype) |
| 329 | + edits[rev.oldid] = (cols,rev) |
| 330 | + print >>sys.stderr, rev.oldid |
| 331 | + if not articles.has_key(cols[0]): |
| 332 | +            articles[cols[0]] = article_t(title=cols[0], protectlog=[], older=olderthan(cursor, cols[0], format_wikidate(options.olderthan)))
195 | 333 | |
196 | | - print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys())) |
| 334 | + # collect protect logs |
| 335 | + print >>sys.stderr, 'collecting protection log entries for %s - %s...' % (duration[0].text, duration[1].text) |
| 336 | + protectlog = allprotect(cursor, duration[0].text, duration[1].text) |
| 337 | + |
| 338 | + # collect protect information |
| 339 | + print >>sys.stderr, 'collecting protection log entries of %d articles for %s - %s...' % (len(articles.items()), duration[0].text, duration[1].text) |
| 340 | + for (title,article) in articles.items(): |
| 341 | + article.protectlog.extend(filter(lambda x: x.title == title, protectlog)) |
| 342 | + closest = None |
| 343 | + if article.older: |
| 344 | + closest = closestprotect(cursor, format_wikidate(options.olderthan), duration[0].text, article.title) |
| 345 | + else: |
| 346 | + closest = closestprotect(cursor, '0', duration[0].text, article.title) |
| 347 | + if closest: |
| 348 | + article.protectlog.append(closest) |
| 349 | + print >>sys.stderr, '%s %d' % (title, len(article.protectlog)) |
| 350 | + |
| 351 | + options.output.write('\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?', 'protect', 'editcount_%dd+%dd' % (options.activedelta.days, options.activedur.days)]) + '\n') |
| 352 | + |
| 353 | +    # write one output row per edit
| 354 | + print >>sys.stderr, 'writing %d edits...' % (len(edits.items())) |
| 355 | + for (revid,(cols,rev)) in sorted(edits.items(), key=lambda x: x[0]): |
| 356 | + new = 'OLD' |
| 357 | + user = users[(rev.userid, rev.usertext)] |
| 358 | + if len(user.first) == 0 or user.first[0].timestamp.datetime > rev.timestamp.datetime + timedelta(days=-30): |
| 359 | + new = 'NEW' |
| 360 | + |
| 361 | + revdate = rev.timestamp.datetime |
| 362 | + |
| 363 | + article = articles[cols[0]] |
| 364 | + protect = None |
| 365 | + if len(article.protectlog) > 0: |
| 366 | + f = filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, article.protectlog) |
| 367 | + if len(f) > 0: |
| 368 | + protect = f[0] |
| 369 | + |
| 370 | +        if protect is None:
| 371 | + protect = 'NO_PROTECT' |
| 372 | + else: |
| 373 | + m = protectpat.search(protect.params) |
| 374 | + if m: |
| 375 | + lv = m.group(1) |
| 376 | + try: |
| 377 | +                    expire = datetime.strptime(m.group(2), 'expires %H:%M, %d %B %Y')
| 378 | + if lv == 'autoconfirmed' and expire > revdate: |
| 379 | + protect = 'SEMIPROTECT' |
| 380 | + elif lv == 'admin' and expire > revdate: |
| 381 | + protect = 'PROTECT' |
| 382 | + else: |
| 383 | + protect = 'OTHER_PROTECT' |
| 384 | + except ValueError, e: |
| 385 | +                        if 'indefinite' in m.group(2):
| 386 | + protect = 'INDEFINITE' |
| 387 | + else: |
| 388 | + protect = 'OTHER_PROTECT' |
| 389 | + else: |
| 390 | + protect = 'UNKNOWN' |
| 391 | + |
| 392 | + output = cols + [str(x) for x in [revid, rev.timestamp.text, user.type, user.name, |
| 393 | + user.editcount + len(filter(lambda x: x.timestamp.datetime < rev.timestamp.datetime, user.periodedits)), |
| 394 | + new, |
| 395 | + protect, |
| 396 | + len(filter(lambda x: rev.timestamp.datetime + options.activedelta < x.timestamp.datetime and x.timestamp.datetime < rev.timestamp.datetime + options.activedelta + options.activedur, user.futureedits)) |
| 397 | + ]] |
| 398 | + line = '\t'.join(output) |
| 399 | + options.output.write(line + '\n') |
| 400 | + options.output.write('# %s / %s edits/article\n' % (len(edits.keys()), len(articles.keys()))) |
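
For reference, ``protectpat`` expects ``log_params`` in the shape MediaWiki writes for protection events; its two groups feed the level check and the expiry parse above. On a typical entry (sample string, not from the source)::

    >>> m = protectpat.search('[edit=autoconfirmed] '
    ...                       '(expires 16:45, 27 July 2011 (UTC))')
    >>> m.group(1)
    'autoconfirmed'
    >>> m.group(2)
    'expires 16:45, 27 July 2011'
    >>> datetime.strptime(m.group(2), 'expires %H:%M, %d %B %Y')
    datetime.datetime(2011, 7, 27, 16, 45)
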