Index: trunk/tools/wsor/trending_articles/detectbursts.py |
— | — | @@ -0,0 +1,155 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | + |
| 5 | +import codecs |
| 6 | +import csv |
| 7 | +import sys |
| 8 | +from datetime import datetime, timedelta |
| 9 | +import argparse |
| 10 | +import random |
| 11 | +import gzip |
| 12 | +import re |
| 13 | +import os |
| 14 | +import urllib2 |
| 15 | +from collections import deque, namedtuple |
| 16 | +import numpy as np |
| 17 | +import gc |
| 18 | + |
# Pageview: one hourly wikistats dump -- `date` is the datetime parsed from
# the filename, `count` maps (lang, title) -> Count.
pageview_tuple = namedtuple('Pageview', 'date count')
# Count: `pred` is the predicted viewcount, `real` the observed one.
count_tuple = namedtuple('Count', 'pred real')
| 21 | + |
def time_parse(x):
    """Extract the timestamp embedded in a wikistats dump basename.

    `x` looks like 'pagecounts-20110115-120000.gz'.
    """
    fmt = 'pagecounts-%Y%m%d-%H%M%S.gz'
    return datetime.strptime(x, fmt)
def time_format(x):
    """Render datetime `x` as 'YYYY/MM/DD HH:MM:SS' for TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta `x` into a fractional number of days.

    Bug fix: the original microsecond term divided by 6000*60*24
    (= 8.64e6) instead of 1e6*60*60*24 (= 8.64e10), overstating the
    microsecond contribution by a factor of 10**4.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
| 28 | + |
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple: `date` parsed from the file's basename, `count`
    mapping (lang, title) -> Count(pred, real) with the prediction seeded
    to the observed count.

    Fixes: the original called line.strip() without assigning the result
    (a no-op), and used the Python-2-only 'print >>' statement.
    """
    # sys.stderr.write works identically under Python 2 and 3.
    sys.stderr.write('loading %s...\n' % f)
    ret = {}
    for line in gzip.open(f):
        # The trailing <bytes> field is parsed but unused.
        (lang, title, count, nbytes) = line.strip().split(' ')
        ret[(lang, title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
| 37 | + |
def slices(ls, size):
    """Return every contiguous window of `ls` of odd width 2*(size//2)+1.

    `size` is halved with floor division to get a half-window width.
    Fixes: the original relied on Python 2's integer '/' and on map()
    returning a list; '//' and a list comprehension behave identically
    under Python 2 and correctly under Python 3.
    """
    half = size // 2
    return [ls[i:i + 2 * half + 1] for i in range(0, len(ls) - 2 * half)]
| 41 | + |
def predict(ls, new):
    """Predict each page's expected viewcount from its recent history.

    `ls` is a list of historical {(lang, title): Count} dicts, oldest
    first, and `new` is the current hour's dict.  For every page seen in
    either, an hourly series is assembled (zero-filled where the page is
    missing), a least-squares line is fitted, and the line is
    extrapolated one step past the window.  Returns a dict mapping
    page -> Count(pred=extrapolated value, real=current count or 0).

    Fixes: dict.has_key() (removed in Python 3) replaced by the 'in'
    operator; behavior is identical under Python 2.
    """
    hist = {}
    # Seed with every page present in the current hour.
    for (page, count) in new.items():
        hist[page] = count_tuple([], count.real)
    for (i, cnts) in enumerate(ls):
        for (page, c) in cnts.items():
            if page in hist:
                # Zero-fill hours in which the page had no entry.
                if len(hist[page].pred) < i:
                    hist[page].pred.extend([0] * (i - len(hist[page].pred)))
                hist[page].pred.append(c.real)
            else:
                # Page only in history: start a zero-padded series; its
                # real count is 0 unless it also appears in `new`.
                a = [0] * (i + 1)
                a[-1] = c.real
                hist[page] = count_tuple(a, new[page].real if page in new else 0)
    ret = {}
    for (page, count) in hist.items():
        # Pad series that end before the last historical hour.
        if len(hist[page].pred) < len(ls):
            hist[page].pred.extend([0] * (len(ls) - len(hist[page].pred)))
        # Fit y = slope*t + intercept over the series, then extrapolate
        # one step past its end.
        slope, intercept = np.linalg.lstsq(
            np.transpose([np.array(range(0, len(count.pred))),
                          np.ones(len(count.pred))]),
            np.array(count.pred))[0]
        ret[page] = count._replace(pred=slope * (len(count.pred) + 1) + intercept)
    return ret
| 67 | + |
def moving_accumurate(ls, n, extract=lambda x: x, accumurate=lambda acc, x: acc + x):
    """Generator applying `accumurate` over a sliding window of `ls`.

    The first n-1 extracted values only prime the window; every value
    after that yields accumurate(window, value), where `window` is a
    deque of the n-1 previous extracted values, and then the window
    slides forward by one.  With n <= 1, extracted values are yielded
    unchanged.  (Name kept as-is -- sic -- since callers use it.)
    """
    if n <= 1:
        for item in ls:
            yield extract(item)
        return
    window = deque()
    for (idx, item) in enumerate(ls):
        value = extract(item)
        if idx < n - 1:
            # Still priming: just collect the first n-1 values.
            window.append(value)
            continue
        yield accumurate(window, value)
        window.popleft()
        window.append(value)
| 83 | + |
# Entry point: scan chronologically sorted hourly wikistats dumps, fit a
# per-page trend over a sliding window of past hours (see predict()), and
# write TSV rows for pages whose observed viewcount exceeds the predicted
# one by more than --rate (detected "bursts").
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: codecs.open(x, 'w', 'utf-8'), default=sys.stdout,
                        help='')
    # Burst threshold: minimum relative excess over the predicted count.
    parser.add_argument('-R', '--rate', metavar='RATE',
                        dest='rate', type=float, default=8.0,
                        help='')
    # Maximum time a page may stay flagged as bursting, in hours.
    parser.add_argument('-m', '--max-duration', metavar='HOURS',
                        dest='max', type=float, default=5,
                        help='')
    # Width of the sliding prediction window, in hourly files.
    parser.add_argument('-w', '--window', metavar='HOURS',
                        dest='window', type=int, default=5,
                        help='')
    # Pages below this hourly viewcount are skipped entirely.
    parser.add_argument('-M', '--min-count', metavar='N',
                        dest='min', type=int, default=2000,
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('-i', '--inclusive',
                        dest='inclusive', action='store_true', default=False,
                        help='include the items below the threshold, add a binary indicator column')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = options.output
    writer = csv.writer(fh, delimiter='\t')
    # Header is written directly (not through csv.writer).
    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    # Filenames embed their timestamp, so a lexical sort is chronological.
    options.files.sort()
    # Each yielded item is (timestamp, {page: Count(pred, real)}), the
    # prediction coming from the preceding --window hours.
    gen_sums = moving_accumurate(options.files, options.window, extract=load_wikistats_file,
                                 accumurate=lambda hist,cur: (cur.date, predict([x.count for x in hist],cur.count)))
    bursting = {}  # page -> number of hours the page has been flagged
    for (newtime,new) in gen_sums:
        for (page,count) in new.items():
            if count.real < options.min:
                continue
            # r = relative excess of the real count over the prediction.
            r = 0
            if count.pred == 0:
                # No meaningful prediction: any count above the minimum is
                # treated as an effectively infinite burst.
                if count.real > options.min:
                    r = 9999999
            else:
                r = float(count.real - count.pred) / count.pred
            if r > options.rate:
                bursting.setdefault(page, 0)
            # Once flagged, the burst-hour counter grows every hour until
            # the removal condition at the bottom of the loop fires.
            if bursting.has_key(page):
                bursting[page] += 1
            if bursting.has_key(page) or options.inclusive:
                b = bursting[page] if bursting.has_key(page) else 0
                try:
                    # page is a (lang, percent-encoded title) tuple.
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(newtime),
                          count.pred,
                          count.real,
                          b,
                          r]
                    if options.inclusive:
                        # insert at len(ls) == append: the burst indicator column.
                        ls.insert(len(ls), bursting.has_key(page))
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
            # Unflag the page when it drops sharply below the trend or has
            # been bursting longer than --max-duration allows.
            if bursting.has_key(page) and (r < -options.rate or bursting[page] > float(options.max) / options.window):
                bursting.pop(page)
| 155 | + |
| 156 | + |
Property changes on: trunk/tools/wsor/trending_articles/detectbursts.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 157 | + * |
Index: trunk/tools/wsor/trending_articles/README.rst |
— | — | @@ -0,0 +1,4 @@ |
| 2 | +See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors |
| 3 | + |
| 4 | +Counts files are available at: |
| 5 | +http://dammit.lt/wikistats/archive/2011/01/ |
Index: trunk/tools/wsor/trending_articles/chart.py |
— | — | @@ -0,0 +1,108 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +# |
| 5 | + |
| 6 | +import numpy |
| 7 | +import pylab as plt |
| 8 | +import matplotlib |
| 9 | +import os |
| 10 | +import argparse |
| 11 | +import sys |
| 12 | +import csv |
| 13 | +import datetime |
| 14 | +import math |
| 15 | +import re |
| 16 | +from collections import namedtuple |
| 17 | + |
| 18 | +counter_tuple = namedtuple('counter', 'name filter color explode') |
| 19 | + |
def str_to_time(x):
    """Parse a compact 'YYYYMMDDHHMMSS' timestamp string."""
    fmt = '%Y%m%d%H%M%S'
    return datetime.datetime.strptime(x, fmt)
| 22 | + |
# Entry point: for each input TSV (output of find_revision_status.py),
# bucket edits by editor category and draw one pie chart per file, scaled
# by the edits/article/hour ratio parsed from the file's trailing
# '# a / b / c' summary line.  Saves each chart as <input-base>.svg.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--font-size', metavar='POINT',
                        dest='fsize', type=int, default=15,
                        help='')
    # 1-based index of the column whose distinct values are counted.
    parser.add_argument('-f', '--field', metavar='N',
                        dest='field', type=int, default=9,
                        help='')
    # NOTE(review): type=int with a float default -- passing '-w 0.3' on
    # the command line would fail int() conversion; confirm intended type.
    parser.add_argument('-w', '--width', metavar='SIZE',
                        dest='width', type=int, default=0.3,
                        help='')
    # NOTE(review): options.ylim is parsed but never used below.
    parser.add_argument('-y', '--ylimit', metavar='LIMITS',
                        dest='ylim', type=str, default='-300,1500',
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()


    csv.field_size_limit(1000000000)

    # Editor categories, checked in order; the first matching filter wins.
    # Column indices (0-based): 10 = user type, 13 = NEW/OLD, 14 = protection.
    counters = [counter_tuple('new reg. users', lambda x: x[10] == 'REG' and x[13] == 'NEW', '#4444FF', 0.1),
                counter_tuple('old reg. users (semiprotected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'SEMIPROTECT', '#99DD99', 0.0),
                counter_tuple('old reg. users (not protected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'NO_PROTECT', '#8888EE', 0.0),
                counter_tuple('old reg. users (other)', lambda x: x[10] == 'REG' and x[13] == 'OLD', '#FFFFFF', 0.0),
                counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW', '#FF4444', 0.1),
                counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD', '#EE8888', 0.0),
                counter_tuple('bots', lambda x: x[10] == 'REG_BOT', '#666666', 0.0),
                counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),  # catch-all bucket
                ]

    # Alternative category set excluding semiprotected pages, kept for reference.
    # counters = [counter_tuple('new registered users', lambda x: x[10] == 'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
    #             counter_tuple('old registered users', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
    #             counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
    #             counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
    #             counter_tuple('bots', lambda x: x[10] == 'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0),
    #             #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
    #             ]

    counters_map = {}
    for x in counters:
        counters_map[x.name] = x

    # One scale ratio per input file, from the '# edits / articles / hours'
    # summary line; sqrt so the pie's *area* tracks the rate.
    ratios = []
    patt = re.compile('(\d+) / (\d+) / (\d+)')
    for (i,fname) in enumerate(options.files):
        for line in open(fname).readlines():
            m = patt.search(line)
            if m:
                ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5)
                break
    sum_ratio = sum(ratios)
    counter_names = [x.name for x in counters]

    plots = []
    matplotlib.rc('font', size=options.fsize)
    for (n,fname) in enumerate(options.files):
        plt.figure(figsize=(10,10))
        # Skip '#'-comment lines; then drop the header row.
        table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t'))
        table = table[1:]

        # counts[name] = set of distinct values of the --field column
        # matched by that category (first matching category wins).
        counts = {}
        for name in counter_names:
            counts[name] = set()
        for cols in table:
            for c in counters:
                if c.filter(cols):
                    counts[c[0]].add(cols[options.field-1])
                    break

        print counts#!
        #plt.subplot(1, len(options.files), n+1)
        # Scale this pie's axes so chart areas are comparable across files.
        plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
        plt.title(fname)
        p = plt.pie([len(counts[x]) for x in counter_names],
                    explode=[counters_map[x].explode for x in counter_names],
                    autopct='%1.1f%%',
                    pctdistance=1.2,
                    colors=[x.color for x in counters])

        # NOTE(review): counts[x] is a set, so 'counts[x] == 0' is never
        # true and empty categories still get a legend label; confirm
        # whether 'len(counts[x]) == 0' was intended.
        plt.legend(p[0], ['' if counts[x] == 0 else x for x in counter_names],
                   loc=(.8, .8))

        base,ext = os.path.splitext(fname)
        plt.savefig('.'.join([base, 'svg']))
Property changes on: trunk/tools/wsor/trending_articles/chart.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 110 | + * |
Index: trunk/tools/wsor/trending_articles/find_revision_status.py |
— | — | @@ -0,0 +1,195 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +# |
| 5 | + |
| 6 | +import oursql |
| 7 | +import os |
| 8 | +import argparse |
| 9 | +import sys |
| 10 | +import csv |
| 11 | +import urllib2 |
| 12 | +import re |
| 13 | +from datetime import datetime, timedelta |
| 14 | + |
def parse_wikidate(x):
    """Parse a MediaWiki 'YYYYMMDDHHMMSS' timestamp (str or int-like)."""
    text = str(x)
    return datetime.strptime(text, '%Y%m%d%H%M%S')
| 17 | + |
def format_wikidate(x):
    """Format datetime `x` back into MediaWiki's 'YYYYMMDDHHMMSS' form."""
    return x.strftime('%Y%m%d%H%M%S')
| 20 | + |
def title2pageid(cursor, title, namespace=0):
    """Look up (page_id, page_is_redirect) for `title` in `namespace`.

    Returns (None, None) when no such page exists; with multiple rows,
    the first is used.
    """
    cursor.execute('''
    SELECT p.page_id, page_is_redirect
    FROM page p
    WHERE
      p.page_title = ?
      AND p.page_namespace = ?
    ;
    ''', (title,namespace))
    rows = list(cursor)
    if not rows:
        return (None, None)
    return tuple(rows[0])
| 34 | + |
def redirected(cursor, pid, namespace=0):
    """Follow the redirect chain starting from page id `pid`.

    Returns (final_title, final_page_id), chasing redirects
    transitively via title2pageid(); (None, None) when a target title
    has no page row.  Assumes `pid` has a row in the `redirect` table
    (raises IndexError otherwise).
    """
    cursor.execute('''
    SELECT rd_title
    FROM redirect r
    WHERE
      r.rd_from = ?
    ;
    ''', (pid,))
    target = list(cursor)[0][0]
    (next_pid, is_redirect) = title2pageid(cursor, target)
    if next_pid is None:
        return (None, None)
    if is_redirect and next_pid != pid:
        # Target is itself a redirect: keep following the chain.
        return redirected(cursor, next_pid, namespace)
    return (target, next_pid)
| 50 | + |
def firstedits(cursor, uid, uname, delta, n):
    """Return up to `n` rev_ids made within `delta` of the user's first edit.

    The user's earliest revision timestamp is located first; revisions
    between it and first+delta are then fetched (at most `n`).
    Registered users (uid != 0) are matched on rev_user, anonymous
    users on rev_user_text.
    """
    if uid != 0:
        where, uspec = 'r.rev_user = ?', uid
    else:
        where, uspec = 'r.rev_user_text = ?', uname
    cursor.execute('''
    SELECT r.rev_timestamp
    FROM revision r
    WHERE
      r.rev_timestamp != ""
      AND %s
    ORDER BY r.rev_timestamp ASC
    LIMIT 1
    ;
    ''' % (where,), (uspec,))
    first = parse_wikidate(list(cursor)[0][0])
    cursor.execute('''
    SELECT r.rev_id
    FROM revision r
    WHERE
      %s
      AND r.rev_timestamp BETWEEN ? AND ?
    LIMIT ?
    ;
    ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n))
    return [int(row[0]) for row in list(cursor)]
| 79 | + |
def editcount(cursor, uid, uname, timestamp):
    """Count a user's revisions with rev_timestamp strictly before `timestamp`.

    Registered users (uid != 0) are matched on rev_user, anonymous
    users on rev_user_text.
    """
    if uid != 0:
        where, uspec = 'r.rev_user = ?', uid
    else:
        where, uspec = 'r.rev_user_text = ?', uname

    cursor.execute('''
    SELECT count(*)
    FROM revision r
    WHERE
      %s
      AND r.rev_timestamp < ?
    ;
    ''' % (where,), (uspec,timestamp))
    return int(list(cursor)[0][0])
| 96 | + |
# Entry point: annotate burst rows (output of detectbursts.py) with page
# ids, redirect resolution, and per-revision editor information pulled
# from the toolserver replica database, emitting an extended TSV.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # 1-based index of the column holding the (percent-encoded) title.
    parser.add_argument('-f', '--field', metavar='N',
                        dest='field', type=int, default=1,
                        help='')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str, default='hywiki-p',
                        help='target wiki name')
    parser.add_argument('input')
    options = parser.parse_args()
    options.db = options.db.replace('_','-')

    # Toolserver naming: host uses dashes, database name uses underscores.
    host = options.db + '.rrdb.toolserver.org'
    conn = oursql.connect(host = host,
                          read_default_file=os.path.expanduser('~/.my.cnf'),
                          db = options.db.replace('-','_'),
                          charset=None,
                          use_unicode=False)

    cursor = conn.cursor()

    csv.field_size_limit(1000000000)
    table = list(csv.reader(open(options.input), delimiter='\t'))
    table = table[1:]  # drop the header row

    # Pass 1: resolve page ids (and redirect targets) for every input row.
    output = []
    hours = {}  # distinct pageview timestamps seen (for the summary line)
    for cols in table:
        cursor.execute('''
        SELECT p.page_id, p.page_title, page_is_redirect
        FROM page p
        WHERE
          p.page_title = ?
          AND p.page_namespace = 0
        ;
        ''', (cols[options.field-1],))
        res = list(cursor)
        if res == None or res == []:
            print >>sys.stderr, 'error 1 %s' % cols
            continue
        redirect = int(res[0][2]) == 1
        # Insert page_id and ARTICLE/REDIRECT flag right after the title column.
        cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
        cols.insert(options.field, str(res[0][0]))
        output.append(cols)
        if redirect:
            # Also emit a duplicate row for the redirect's final target.
            (title,pageid) = redirected(cursor, res[0][0])
            if title == None:
                print >>sys.stderr, 'error 2 %s' % cols
                continue
            a = [x for x in cols]
            a[0] = title
            a[1] = str(pageid)
            a[2] = 'REDIRECT_RESOLVED'
            output.append(a)
        # After the two inserts, cols[3] is the pageview timestamp.
        hours[cols[3]] = True

    # Dead batch-query experiment, kept for reference.
    # cursor.executemany('''
    # SELECT p.page_title, p.page_id
    # FROM page p
    # WHERE
    # p.page_title = ?
    # AND p.page_namespace = 0
    # ''', [(urllib2.quote(x[options.field-1]),) for x in table])
    # print list(cursor)

    print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])

    botpat = re.compile('bot( |$)', re.IGNORECASE)
    edits = 0
    articles = {}
    # Pass 2: for each page row, fetch revisions made within one hour of
    # the pageview timestamp and classify their authors.
    for cols in output:
        start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
        end = start + timedelta(hours=1)
        cursor.execute('''
        SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
        FROM revision r
        WHERE
          r.rev_page = ?
          AND rev_timestamp BETWEEN ? AND ?
        ;

        ''', (cols[1],
              # NOTE(review): '$H' looks like a typo for '%H' -- strftime
              # emits a literal '$H', so the BETWEEN bounds are malformed
              # timestamps; confirm and fix the format string.
              datetime.strftime(start, '%Y%m%d$H%M%S'),
              datetime.strftime(end, '%Y%m%d$H%M%S'),
              ))
        ls = list(cursor)
        if len(ls) == 0:
            print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
        for (rev,ts,uid,username) in ls:
            usertype = 'ANON' if uid == 0 else 'REG'
            if uid != 0 and botpat.search(username):
                usertype += '_BOT'
            # NOTE(review): rebinding 'output' here shadows the list being
            # iterated (safe in CPython since the iterator is already
            # bound, but confusing) -- consider a different name.
            output = cols + [str(x) for x in [rev, ts, usertype, username,
                                              editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
                                              'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']]
            print '\t'.join(output)
            edits +=1
            articles[cols[1]] = True

    print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys()))
Property changes on: trunk/tools/wsor/trending_articles/find_revision_status.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 197 | + * |
Index: trunk/tools/wsor/trending_articles/filter.py |
— | — | @@ -0,0 +1,40 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | + |
| 4 | +import random |
| 5 | +import argparse |
| 6 | +import time |
| 7 | +import sys |
| 8 | +import os |
| 9 | +import gzip |
| 10 | + |
def parse(f):
    """Yield (line, key) pairs from an open wikistats file object.

    The key is everything before the last two space-separated fields,
    i.e. '<lang> <title>' of a '<lang> <title> <count> <bytes>' line.
    Lines with fewer than two spaces are reported to stderr and skipped.

    Fixes: Python-2-only 'print >>' replaced by sys.stderr.write; the
    file is iterated lazily instead of via readlines().
    """
    for line in f:
        i = None
        try:
            # Index of the second-to-last space: end of the title field.
            i = line.rindex(' ', 0, line.rindex(' '))
        except ValueError:
            sys.stderr.write('%s\n' % line)
        if i != None and i > 0:
            yield (line, line[0:i])
| 20 | + |
# Entry point: copy each input wikistats .gz file into --directory,
# keeping only lines whose '<lang> <title>' prefix appears in --list.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--list', metavar='FILE',
                        dest='filter', type=str, required=True,
                        help='')
    parser.add_argument('-d', '--directory', metavar='DIR',
                        dest='dir', type=str, required=True,
                        help='')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    # Accepted '<lang> <title>' prefixes, one per line of the --list file.
    accepts = {}

    for line in open(options.filter).readlines():
        accepts[line.strip()] = True
    for f in options.files:
        print >>sys.stderr, f
        # Output file keeps the input's basename, placed under --directory.
        w = gzip.open(os.path.sep.join([options.dir, os.path.basename(f)]), 'w')
        for (line,p) in parse(gzip.open(f)):
            if accepts.has_key(p):
                # Trailing comma: 'line' already carries its own newline.
                print >>w, line,
Property changes on: trunk/tools/wsor/trending_articles/filter.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 42 | + * |
Index: trunk/tools/wsor/trending_articles/filter_random.py |
— | — | @@ -0,0 +1,22 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | +import random |
| 4 | +import argparse |
| 5 | +import time |
| 6 | +import sys |
| 7 | + |
# Entry point: randomly subsample stdin, passing each line through with
# probability --acceptance-rate (reproducible via --random-seed).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.02,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    options = parser.parse_args()

    random.seed(options.seed)

    for line in sys.stdin.readlines():
        if random.random() > options.accept:
            continue
        # Trailing comma: the line keeps its own newline.
        print line,
Property changes on: trunk/tools/wsor/trending_articles/filter_random.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 24 | + * |
Index: trunk/tools/wsor/trending_articles/detectnonbursts_random.py |
— | — | @@ -0,0 +1,86 @@ |
| 2 | +#! /usr/bin/env python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | + |
| 5 | +import pymongo |
| 6 | +import codecs |
| 7 | +import csv |
| 8 | +import sys |
| 9 | +from datetime import datetime, timedelta |
| 10 | +from pymongo.master_slave_connection import MasterSlaveConnection |
| 11 | +import argparse |
| 12 | +import random |
| 13 | +import gzip |
| 14 | +import re |
| 15 | +import os |
| 16 | +import time |
| 17 | +import urllib2 |
| 18 | +from collections import deque, namedtuple |
| 19 | +import numpy as np |
| 20 | + |
# Pageview: one hourly wikistats dump -- `date` is the datetime parsed from
# the filename, `count` maps (lang, title) -> Count.
pageview_tuple = namedtuple('Pageview', 'date count')
# Count: `pred` is the predicted viewcount, `real` the observed one.
count_tuple = namedtuple('Count', 'pred real')
| 23 | + |
def time_parse(x):
    """Extract the timestamp embedded in a wikistats dump basename."""
    return datetime.strptime(x, 'pagecounts-' + '%Y%m%d-%H%M%S' + '.gz')
def time_format(x):
    """Render datetime `x` as 'YYYY/MM/DD HH:MM:SS' for TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta `x` into a fractional number of days.

    Bug fix (same as in detectbursts.py): the microsecond term divided
    by 6000*60*24 instead of 1e6*60*60*24, overstating the microsecond
    contribution by a factor of 10**4.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
| 30 | + |
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple: `date` parsed from the basename, `count` mapping
    (lang, title) -> Count(pred, real) seeded with the observed count.

    Fixes (same as in detectbursts.py): line.strip() result was
    discarded; Python-2-only 'print >>' replaced by sys.stderr.write.
    """
    sys.stderr.write('loading %s...\n' % f)
    ret = {}
    for line in gzip.open(f):
        # The trailing <bytes> field is parsed but unused.
        (lang, title, count, nbytes) = line.strip().split(' ')
        ret[(lang, title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
| 39 | + |
# Entry point: emit a random sample of (page, hour) viewcount rows in the
# same TSV layout as detectbursts.py, to serve as a non-burst control set.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=str, required=True,
                        help='')
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.001,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = codecs.open(options.output, 'w', 'utf-8')
    writer = csv.writer(fh, delimiter='\t')
    random.seed(options.seed)

    # Same header as detectbursts.py so downstream tools can mix outputs.
    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    options.files.sort()
    for fname in options.files:
        pvs = load_wikistats_file(fname)
        for (page,count) in pvs.count.items():
            if random.random() < options.accept:
                ls = []
                try:
                    # page is a (lang, percent-encoded title) tuple; the
                    # prediction/rate columns are None for control rows.
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(pvs.date),
                          None,
                          count.real,
                          0,
                          None]
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
| 86 | + |
| 87 | + |
Property changes on: trunk/tools/wsor/trending_articles/detectnonbursts_random.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 88 | + * |