r92537 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92536 | r92537 | r92538 >
Date:14:37, 19 July 2011
Author:whym
Status:deferred
Tags:
Comment:
Add sources and README of trending articles sprint
Modified paths:
  • /trunk/tools/wsor/trending_articles/README.rst (added) (history)
  • /trunk/tools/wsor/trending_articles/chart.py (added) (history)
  • /trunk/tools/wsor/trending_articles/detectbursts.py (added) (history)
  • /trunk/tools/wsor/trending_articles/detectnonbursts_random.py (added) (history)
  • /trunk/tools/wsor/trending_articles/filter.py (added) (history)
  • /trunk/tools/wsor/trending_articles/filter_random.py (added) (history)
  • /trunk/tools/wsor/trending_articles/find_revision_status.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/trending_articles/detectbursts.py
@@ -0,0 +1,155 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+
 5+import codecs
 6+import csv
 7+import sys
 8+from datetime import datetime, timedelta
 9+import argparse
 10+import random
 11+import gzip
 12+import re
 13+import os
 14+import urllib2
 15+from collections import deque, namedtuple
 16+import numpy as np
 17+import gc
 18+
 19+pageview_tuple = namedtuple('Pageview', 'date count')
 20+count_tuple = namedtuple('Count', 'pred real')
 21+
def time_parse(x):
    """Parse a wikistats dump file name like 'pagecounts-20110101-120000.gz' into a datetime."""
    name_format = 'pagecounts-%Y%m%d-%H%M%S.gz'
    return datetime.strptime(x, name_format)
def time_format(x):
    """Format a datetime as 'YYYY/MM/DD HH:MM:SS' for the TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta *x* to a float number of days.

    Sums the days, seconds and microseconds components.  Bug fix: the
    original divided microseconds by 6000*60*24 (8.64e6) instead of
    1e6*60*60*24 (8.64e10), overstating the sub-second contribution by a
    factor of 10000.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
 28+
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple whose .date is parsed from the file's basename and
    whose .count maps (lang, title) -> Count(pred=float(count), real=int(count)).
    """
    print >>sys.stderr, 'loading %s...' % f
    ret = {}
    for line in gzip.open(f):
        # NOTE(review): str.strip() returns a new string and this result is
        # discarded, so the trailing newline survives in the last (unused)
        # 'bytes' field; the count field is unaffected.
        line.strip()
        (lang,title,count,bytes) = line.split(' ')
        ret[(lang,title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
 37+
def slices(ls, size):
    """Return every contiguous window of ls of odd length 2*(size//2)+1.

    `size` is halved with Python 2 integer division, so the effective
    window is centred and always odd; returns a list (py2 map).
    """
    size /= 2
    return map(lambda i: ls[i:i+2*size+1], xrange(0, len(ls) - 2*size))
 41+
def predict(ls, new):
    """Predict each page's current pageview count by linear extrapolation.

    ls  -- list of past {(lang, title): Count} dicts, oldest first
    new -- the current {(lang, title): Count} dict
    Returns {(lang, title): Count(pred=extrapolated count, real=observed)}.
    """
    hist = {}
    # Build per-page history: hist[page].pred is the list of past real
    # counts, zero-filled for hours in which the page did not appear.
    for (page,count) in new.items():
        hist[page] = count_tuple([], count.real)
    for (i,cnts) in enumerate(ls):
        for (page,c) in cnts.items():
            if hist.has_key(page):
                if len(hist[page].pred) < i:
                    # pad with zeros for hours the page was missing
                    a = [0]*(i-len(hist[page].pred))
                    hist[page].pred.extend(a)
                hist[page].pred.append(c.real)
            else:
                a = [0]*(i+1)
                a[-1] = c.real
                hist[page] = count_tuple(a, new[page].real if new.has_key(page) else 0)
    ret = {}
    for (page, count) in hist.items():
        if len(hist[page].pred) < len(ls):
            a = [0]*(len(ls)-len(hist[page].pred))
            hist[page].pred.extend(a)
        # Least-squares line fit of the history against time, extrapolated
        # one step beyond the end of the window.
        slope,intercept = np.linalg.lstsq(np.transpose([np.array(range(0,len(count.pred))),
                                                        np.ones(len(count.pred))]),
                                          np.array(count.pred))[0]
        ret[page] = count._replace(pred=slope * (len(count.pred) + 1) + intercept)
    return ret
 67+
def moving_accumurate(ls, n, extract=lambda x: x, accumurate=lambda sum,x: sum+x):
    """Slide a window of size n over ls, yielding accumurate(window, item).

    Every element is first passed through *extract*.  For n <= 1 the
    extracted elements are yielded unchanged.  Otherwise the first n-1
    extracted elements only fill the window; from the n-th element on,
    accumurate(window, current) is yielded, then the window advances by
    dropping its oldest entry and appending the current one.
    """
    if n <= 1:
        for item in ls:
            yield extract(item)
        return
    window = deque()
    for idx, raw in enumerate(ls):
        value = extract(raw)
        if idx < n - 1:
            window.append(value)
        else:
            yield accumurate(window, value)
            window.popleft()
            window.append(value)
 83+
# Command-line entry point: scan a chronologically sorted list of wikistats
# dump files and report pages whose observed hourly pageview count exceeds
# the linearly predicted count by more than --rate ("bursting" articles).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: codecs.open(x, 'w', 'utf-8'), default=sys.stdout,
                        help='')
    parser.add_argument('-R', '--rate', metavar='RATE',
                        dest='rate', type=float, default=8.0,
                        help='')
    parser.add_argument('-m', '--max-duration', metavar='HOURS',
                        dest='max', type=float, default=5,
                        help='')
    parser.add_argument('-w', '--window', metavar='HOURS',
                        dest='window', type=int, default=5,
                        help='')
    parser.add_argument('-M', '--min-count', metavar='N',
                        dest='min', type=int, default=2000,
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('-i', '--inclusive',
                        dest='inclusive', action='store_true', default=False,
                        help='include the items below the threshold, add a binary indicator column')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = options.output
    writer = csv.writer(fh, delimiter='\t')
    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    # Dump file names sort chronologically; feed them through a sliding
    # window so each hour is predicted from the preceding --window hours.
    options.files.sort()
    gen_sums = moving_accumurate(options.files, options.window, extract=load_wikistats_file,
                                 accumurate=lambda hist,cur: (cur.date, predict([x.count for x in hist],cur.count)))
    bursting = {}  # page -> number of hours the page has been tracked as bursting
    for (newtime,new) in gen_sums:
        for (page,count) in new.items():
            if count.real < options.min:
                continue
            # r is the relative "surprisedness" of the observed count.
            r = 0
            if count.pred == 0:
                if count.real > options.min:
                    r = 9999999  # sentinel for growth from a zero prediction
            else:
                r = float(count.real - count.pred) / count.pred
            if r > options.rate:
                bursting.setdefault(page, 0)
            if bursting.has_key(page):
                bursting[page] += 1
            if bursting.has_key(page) or options.inclusive:
                b = bursting[page] if bursting.has_key(page) else 0
                try:
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(newtime),
                          count.pred,
                          count.real,
                          b,
                          r]
                    if options.inclusive:
                        # extra boolean column: is the page currently bursting?
                        ls.insert(len(ls), bursting.has_key(page))
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
            # Stop tracking once the burst decays sharply or has lasted
            # longer than --max-duration (scaled by the window size).
            if bursting.has_key(page) and (r < -options.rate or bursting[page] > float(options.max) / options.window):
                bursting.pop(page)
 155+
 156+
Property changes on: trunk/tools/wsor/trending_articles/detectbursts.py
___________________________________________________________________
Added: svn:executable
1157 + *
Index: trunk/tools/wsor/trending_articles/README.rst
@@ -0,0 +1,4 @@
 2+See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
 3+
 4+Counts files are available at:
 5+http://dammit.lt/wikistats/archive/2011/01/
Index: trunk/tools/wsor/trending_articles/chart.py
@@ -0,0 +1,108 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+#
 5+
 6+import numpy
 7+import pylab as plt
 8+import matplotlib
 9+import os
 10+import argparse
 11+import sys
 12+import csv
 13+import datetime
 14+import math
 15+import re
 16+from collections import namedtuple
 17+
 18+counter_tuple = namedtuple('counter', 'name filter color explode')
 19+
def str_to_time(x):
    """Parse a 'YYYYMMDDHHMMSS' timestamp string into a datetime."""
    stamp_format = '%Y%m%d%H%M%S'
    return datetime.datetime.strptime(x, stamp_format)
 22+
# Command-line entry point: draw one pie chart (saved as SVG) per input TSV
# file, breaking edits down by editor category; each chart's side length is
# scaled by the square root of the edits/article/hour ratio parsed from the
# file's trailing '# a / b / c' comment line, so chart AREA tracks activity.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--font-size', metavar='POINT',
                        dest='fsize', type=int, default=15,
                        help='')
    parser.add_argument('-f', '--field', metavar='N',
                        dest='field', type=int, default=9,
                        help='')
    parser.add_argument('-w', '--width', metavar='SIZE',
                        dest='width', type=int, default=0.3,
                        help='')
    parser.add_argument('-y', '--ylimit', metavar='LIMITS',
                        dest='ylim', type=str, default='-300,1500',
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()


    csv.field_size_limit(1000000000)

    # Editor categories, tested in order; the first matching filter wins.
    # Columns (0-based): 10 = user type, 13 = NEW/OLD, 14 = protection
    # status -- presumably matching find_revision_status.py output; confirm.
    counters = [counter_tuple('new reg. users', lambda x: x[10] == 'REG' and x[13] == 'NEW', '#4444FF', 0.1),
                counter_tuple('old reg. users (semiprotected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'SEMIPROTECT', '#99DD99', 0.0),
                counter_tuple('old reg. users (not protected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'NO_PROTECT', '#8888EE', 0.0),
                counter_tuple('old reg. users (other)', lambda x: x[10] == 'REG' and x[13] == 'OLD', '#FFFFFF', 0.0),
                counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW', '#FF4444', 0.1),
                counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD', '#EE8888', 0.0),
                counter_tuple('bots', lambda x: x[10] == 'REG_BOT', '#666666', 0.0),
                counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
                ]

    # (An alternative category set excluding semiprotected pages existed
    # here as commented-out code; removed for clarity.)

    counters_map = {}
    for x in counters:
        counters_map[x.name] = x

    # Pull the '<edits> / <articles> / <hours>' ratio from each file.
    ratios = []
    patt = re.compile('(\d+) / (\d+) / (\d+)')
    for (i,fname) in enumerate(options.files):
        for line in open(fname).readlines():
            m = patt.search(line)
            if m:
                ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5)
                break
    sum_ratio = sum(ratios)
    counter_names = [x.name for x in counters]

    plots = []
    matplotlib.rc('font', size=options.fsize)
    for (n,fname) in enumerate(options.files):
        plt.figure(figsize=(10,10))
        # Skip '#'-prefixed comment lines and the header row.
        table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t'))
        table = table[1:]

        # Bucket the values of column --field by editor category; the set
        # deduplicates repeated values.
        counts = {}
        for name in counter_names:
            counts[name] = set()
        for cols in table:
            for c in counters:
                if c.filter(cols):
                    counts[c[0]].add(cols[options.field-1])
                    break

        print counts#!
        #plt.subplot(1, len(options.files), n+1)
        plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
        plt.title(fname)
        p = plt.pie([len(counts[x]) for x in counter_names],
                    explode=[counters_map[x].explode for x in counter_names],
                    autopct='%1.1f%%',
                    pctdistance=1.2,
                    colors=[x.color for x in counters])

        plt.legend(p[0], ['' if counts[x] == 0 else x for x in counter_names],
                   loc=(.8, .8))

        base,ext = os.path.splitext(fname)
        plt.savefig('.'.join([base, 'svg']))
Property changes on: trunk/tools/wsor/trending_articles/chart.py
___________________________________________________________________
Added: svn:executable
1110 + *
Index: trunk/tools/wsor/trending_articles/find_revision_status.py
@@ -0,0 +1,195 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+#
 5+
 6+import oursql
 7+import os
 8+import argparse
 9+import sys
 10+import csv
 11+import urllib2
 12+import re
 13+from datetime import datetime, timedelta
 14+
def parse_wikidate(x):
    """Parse a MediaWiki timestamp ('YYYYMMDDHHMMSS', str or int) into a datetime."""
    text = str(x)
    return datetime.strptime(text, '%Y%m%d%H%M%S')
 17+
def format_wikidate(x):
    """Format a datetime as a MediaWiki 'YYYYMMDDHHMMSS' timestamp string."""
    return x.strftime('%Y%m%d%H%M%S')
 20+
def title2pageid(cursor, title, namespace=0):
    """Look up a page by title and namespace.

    Returns (page_id, page_is_redirect) for the first matching row, or
    (None, None) when no such page exists.
    """
    query = '''
    SELECT p.page_id, page_is_redirect
    FROM page p
    WHERE
    p.page_title = ?
    AND p.page_namespace = ?
    ;
    '''
    cursor.execute(query, (title, namespace))
    rows = list(cursor)
    if not rows:
        return (None, None)
    return tuple(rows[0])
 34+
def redirected(cursor, pid, namespace=0):
    """Resolve the redirect target of page *pid*.

    Follows chained redirects recursively and returns (title, page_id) of
    the final target, or (None, None) if a target title has no page row.
    NOTE(review): assumes *pid* has a row in `redirect` -- list(cursor)[0]
    raises IndexError otherwise.  The intermediate title lookup calls
    title2pageid without the *namespace* argument, so chains are always
    resolved in namespace 0; confirm this is intended.
    """
    cursor.execute('''
    SELECT rd_title
    FROM redirect r
    WHERE
    r.rd_from = ?
    ;
    ''', (pid,))
    title = list(cursor)[0][0]
    rd_pid,rd = title2pageid(cursor, title)
    if rd_pid == None:
        return (None,None)
    if rd and rd_pid != pid:
        # target is itself a redirect: follow the chain
        (title, rd_pid) = redirected(cursor, rd_pid, namespace)
    return (title, rd_pid)
 50+
def firstedits(cursor, uid, uname, delta, n):
    """Return up to *n* rev_ids of a user's edits within *delta* of their first edit.

    Registered users (uid != 0) are matched on rev_user; anonymous users
    on rev_user_text.  NOTE(review): list(cursor)[0] raises IndexError for
    a user with no revisions at all.
    """
    # Choose the WHERE clause by user kind; the %s interpolation below only
    # inserts this fixed string, values still go through ? placeholders.
    where = 'r.rev_user_text = ?'
    uspec = uname
    if uid != 0:
        where = 'r.rev_user = ?'
        uspec = uid
    # First, find the timestamp of the user's earliest edit.
    cursor.execute('''
    SELECT r.rev_timestamp
    FROM revision r
    WHERE
    r.rev_timestamp != ""
    AND %s
    ORDER BY r.rev_timestamp ASC
    LIMIT 1
    ;
    ''' % (where,), (uspec,))
    first = list(cursor)[0][0]
    first = parse_wikidate(first)
    # Then collect revisions in the window [first, first + delta].
    cursor.execute('''
    SELECT r.rev_id
    FROM revision r
    WHERE
    %s
    AND r.rev_timestamp BETWEEN ? AND ?
    LIMIT ?
    ;
    ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n))
    return [int(x[0]) for x in list(cursor)]
 79+
def editcount(cursor, uid, uname, timestamp):
    """Count a user's revisions made strictly before *timestamp*.

    Registered users (uid != 0) are matched on rev_user; anonymous users
    on rev_user_text.  The interpolated fragment is a fixed string; values
    still go through ? placeholders.
    """
    if uid != 0:
        condition = 'r.rev_user = ?'
        user_key = uid
    else:
        condition = 'r.rev_user_text = ?'
        user_key = uname

    sql = '''
    SELECT count(*)
    FROM revision r
    WHERE
    %s
    AND r.rev_timestamp < ?
    ;
    ''' % (condition,)
    cursor.execute(sql, (user_key, timestamp))
    return int(list(cursor)[0][0])
 96+
 97+if __name__ == '__main__':
 98+ parser = argparse.ArgumentParser()
 99+ parser.add_argument('-f', '--field', metavar='N',
 100+ dest='field', type=int, default=1,
 101+ help='')
 102+ parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
 103+ dest='db', type=str, default='hywiki-p',
 104+ help='target wiki name')
 105+ parser.add_argument('input')
 106+ options = parser.parse_args()
 107+ options.db = options.db.replace('_','-')
 108+
 109+ host = options.db + '.rrdb.toolserver.org'
 110+ conn = oursql.connect(host = host,
 111+ read_default_file=os.path.expanduser('~/.my.cnf'),
 112+ db = options.db.replace('-','_'),
 113+ charset=None,
 114+ use_unicode=False)
 115+
 116+ cursor = conn.cursor()
 117+
 118+ csv.field_size_limit(1000000000)
 119+ table = list(csv.reader(open(options.input), delimiter='\t'))
 120+ table = table[1:]
 121+
 122+ output = []
 123+ hours = {}
 124+ for cols in table:
 125+ cursor.execute('''
 126+ SELECT p.page_id, p.page_title, page_is_redirect
 127+ FROM page p
 128+ WHERE
 129+ p.page_title = ?
 130+ AND p.page_namespace = 0
 131+ ;
 132+ ''', (cols[options.field-1],))
 133+ res = list(cursor)
 134+ if res == None or res == []:
 135+ print >>sys.stderr, 'error 1 %s' % cols
 136+ continue
 137+ redirect = int(res[0][2]) == 1
 138+ cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
 139+ cols.insert(options.field, str(res[0][0]))
 140+ output.append(cols)
 141+ if redirect:
 142+ (title,pageid) = redirected(cursor, res[0][0])
 143+ if title == None:
 144+ print >>sys.stderr, 'error 2 %s' % cols
 145+ continue
 146+ a = [x for x in cols]
 147+ a[0] = title
 148+ a[1] = str(pageid)
 149+ a[2] = 'REDIRECT_RESOLVED'
 150+ output.append(a)
 151+ hours[cols[3]] = True
 152+
 153+ # cursor.executemany('''
 154+ # SELECT p.page_title, p.page_id
 155+ # FROM page p
 156+ # WHERE
 157+ # p.page_title = ?
 158+ # AND p.page_namespace = 0
 159+ # ''', [(urllib2.quote(x[options.field-1]),) for x in table])
 160+ # print list(cursor)
 161+
 162+ print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])
 163+
 164+ botpat = re.compile('bot( |$)', re.IGNORECASE)
 165+ edits = 0
 166+ articles = {}
 167+ for cols in output:
 168+ start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
 169+ end = start + timedelta(hours=1)
 170+ cursor.execute('''
 171+ SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
 172+ FROM revision r
 173+ WHERE
 174+ r.rev_page = ?
 175+ AND rev_timestamp BETWEEN ? AND ?
 176+ ;
 177+
 178+ ''', (cols[1],
 179+ datetime.strftime(start, '%Y%m%d$H%M%S'),
 180+ datetime.strftime(end, '%Y%m%d$H%M%S'),
 181+ ))
 182+ ls = list(cursor)
 183+ if len(ls) == 0:
 184+ print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
 185+ for (rev,ts,uid,username) in ls:
 186+ usertype = 'ANON' if uid == 0 else 'REG'
 187+ if uid != 0 and botpat.search(username):
 188+ usertype += '_BOT'
 189+ output = cols + [str(x) for x in [rev, ts, usertype, username,
 190+ editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
 191+ 'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']]
 192+ print '\t'.join(output)
 193+ edits +=1
 194+ articles[cols[1]] = True
 195+
 196+ print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys()))
Property changes on: trunk/tools/wsor/trending_articles/find_revision_status.py
___________________________________________________________________
Added: svn:executable
1197 + *
Index: trunk/tools/wsor/trending_articles/filter.py
@@ -0,0 +1,40 @@
 2+#! /usr/bin/env python
 3+
 4+import random
 5+import argparse
 6+import time
 7+import sys
 8+import os
 9+import gzip
 10+
def parse(f):
    """Yield (line, key) pairs from a wikistats file object.

    The key is the line up to the space before its second-to-last field,
    i.e. the '<lang> <title>' prefix of a '<lang> <title> <count> <bytes>'
    line.  Lines with fewer than two spaces are printed to stderr and
    skipped.
    """
    for line in f.readlines():
        i = None
        try:
            # index of the space separating <title> from <count>
            i = line.rindex(' ', 0, line.rindex(' '))
        except ValueError:
            print >>sys.stderr, line
        if i != None and i > 0:
            yield (line,line[0:i])
 20+
# Command-line entry point: copy each gzipped wikistats file into
# --directory, keeping only lines whose '<lang> <title>' key appears in
# the --list file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--list', metavar='FILE',
                        dest='filter', type=str, required=True,
                        help='')
    parser.add_argument('-d', '--directory', metavar='DIR',
                        dest='dir', type=str, required=True,
                        help='')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    accepts = {}  # accepted '<lang> <title>' keys (dict used as a set)

    for line in open(options.filter).readlines():
        accepts[line.strip()] = True
    for f in options.files:
        print >>sys.stderr, f
        # NOTE(review): the gzip handle is never explicitly closed; flushing
        # relies on interpreter exit.
        w = gzip.open(os.path.sep.join([options.dir, os.path.basename(f)]), 'w')
        for (line,p) in parse(gzip.open(f)):
            if accepts.has_key(p):
                # trailing comma: line already ends with '\n'
                print >>w, line,
Property changes on: trunk/tools/wsor/trending_articles/filter.py
___________________________________________________________________
Added: svn:executable
142 + *
Index: trunk/tools/wsor/trending_articles/filter_random.py
@@ -0,0 +1,22 @@
 2+#! /usr/bin/env python
 3+import random
 4+import argparse
 5+import time
 6+import sys
 7+
# Command-line entry point: randomly sample lines from stdin, keeping each
# line independently with probability --acceptance-rate.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.02,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    options = parser.parse_args()

    random.seed(options.seed)

    for line in sys.stdin.readlines():
        if random.random() > options.accept:
            continue
        # trailing comma: line already ends with '\n'
        print line,
Property changes on: trunk/tools/wsor/trending_articles/filter_random.py
___________________________________________________________________
Added: svn:executable
124 + *
Index: trunk/tools/wsor/trending_articles/detectnonbursts_random.py
@@ -0,0 +1,86 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+
 5+import pymongo
 6+import codecs
 7+import csv
 8+import sys
 9+from datetime import datetime, timedelta
 10+from pymongo.master_slave_connection import MasterSlaveConnection
 11+import argparse
 12+import random
 13+import gzip
 14+import re
 15+import os
 16+import time
 17+import urllib2
 18+from collections import deque, namedtuple
 19+import numpy as np
 20+
 21+pageview_tuple = namedtuple('Pageview', 'date count')
 22+count_tuple = namedtuple('Count', 'pred real')
 23+
def time_parse(x):
    """Parse a wikistats dump file name like 'pagecounts-20110101-120000.gz' into a datetime."""
    name_format = 'pagecounts-%Y%m%d-%H%M%S.gz'
    return datetime.strptime(x, name_format)
def time_format(x):
    """Format a datetime as 'YYYY/MM/DD HH:MM:SS' for the TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta *x* to a float number of days.

    Sums the days, seconds and microseconds components.  Bug fix: the
    original divided microseconds by 6000*60*24 (8.64e6) instead of
    1e6*60*60*24 (8.64e10), overstating the sub-second contribution by a
    factor of 10000.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
 30+
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple whose .date is parsed from the file's basename and
    whose .count maps (lang, title) -> Count(pred=float(count), real=int(count)).
    """
    print >>sys.stderr, 'loading %s...' % f
    ret = {}
    for line in gzip.open(f):
        # NOTE(review): str.strip() returns a new string and this result is
        # discarded, so the trailing newline survives in the last (unused)
        # 'bytes' field; the count field is unaffected.
        line.strip()
        (lang,title,count,bytes) = line.split(' ')
        ret[(lang,title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
 39+
# Command-line entry point: sample random (page, hour) observations from
# wikistats dumps as a non-bursting control set, written in the same TSV
# format as detectbursts.py output (pred and rate columns are None).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=str, required=True,
                        help='')
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.001,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = codecs.open(options.output, 'w', 'utf-8')
    writer = csv.writer(fh, delimiter='\t')
    random.seed(options.seed)

    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    options.files.sort()
    for fname in options.files:
        pvs = load_wikistats_file(fname)
        for (page,count) in pvs.count.items():
            # Keep each (page, hour) with probability --acceptance-rate.
            if random.random() < options.accept:
                ls = []
                try:
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(pvs.date),
                          None,
                          count.real,
                          0,
                          None]
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
 86+
 87+
Property changes on: trunk/tools/wsor/trending_articles/detectnonbursts_random.py
___________________________________________________________________
Added: svn:executable
188 + *

Status & tagging log