r92537 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92536 | r92537 | r92538 >
Date:14:37, 19 July 2011
Author:whym
Status:deferred
Tags:
Comment:
Add sources and README of trending articles sprint
Modified paths:
  • /trunk/tools/wsor/trending_articles/README.rst (added) (history)
  • /trunk/tools/wsor/trending_articles/chart.py (added) (history)
  • /trunk/tools/wsor/trending_articles/detectbursts.py (added) (history)
  • /trunk/tools/wsor/trending_articles/detectnonbursts_random.py (added) (history)
  • /trunk/tools/wsor/trending_articles/filter.py (added) (history)
  • /trunk/tools/wsor/trending_articles/filter_random.py (added) (history)
  • /trunk/tools/wsor/trending_articles/find_revision_status.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/trending_articles/detectbursts.py
@@ -0,0 +1,155 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+
 5+import codecs
 6+import csv
 7+import sys
 8+from datetime import datetime, timedelta
 9+import argparse
 10+import random
 11+import gzip
 12+import re
 13+import os
 14+import urllib2
 15+from collections import deque, namedtuple
 16+import numpy as np
 17+import gc
 18+
 19+pageview_tuple = namedtuple('Pageview', 'date count')
 20+count_tuple = namedtuple('Count', 'pred real')
 21+
def time_parse(x):
    """Parse a wikistats dump file name like 'pagecounts-20110101-120000.gz' into a datetime."""
    name_format = 'pagecounts-%Y%m%d-%H%M%S.gz'
    return datetime.strptime(x, name_format)
def time_format(x):
    """Format a datetime as 'YYYY/MM/DD HH:MM:SS' for the TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta *x* to a float number of days.

    Sums the days, seconds and microseconds components.  Bug fix: the
    original divided microseconds by 6000*60*24 (8.64e6) instead of
    1e6*60*60*24 (8.64e10), overstating the sub-second contribution by a
    factor of 10000.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
 28+
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple whose .date is parsed from the file's basename and
    whose .count maps (lang, title) -> Count(pred=float(count), real=int(count)).
    """
    print >>sys.stderr, 'loading %s...' % f
    ret = {}
    for line in gzip.open(f):
        # NOTE(review): str.strip() returns a new string and this result is
        # discarded, so the trailing newline survives in the last (unused)
        # 'bytes' field; the count field is unaffected.
        line.strip()
        (lang,title,count,bytes) = line.split(' ')
        ret[(lang,title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
 37+
def slices(ls, size):
    """Return every contiguous window of ls of odd length 2*(size//2)+1.

    `size` is halved with Python 2 integer division, so the effective
    window is centred and always odd; returns a list (py2 map).
    """
    size /= 2
    return map(lambda i: ls[i:i+2*size+1], xrange(0, len(ls) - 2*size))
 41+
def predict(ls, new):
    """Predict each page's current pageview count by linear extrapolation.

    ls  -- list of past {(lang, title): Count} dicts, oldest first
    new -- the current {(lang, title): Count} dict
    Returns {(lang, title): Count(pred=extrapolated count, real=observed)}.
    """
    hist = {}
    # Build per-page history: hist[page].pred is the list of past real
    # counts, zero-filled for hours in which the page did not appear.
    for (page,count) in new.items():
        hist[page] = count_tuple([], count.real)
    for (i,cnts) in enumerate(ls):
        for (page,c) in cnts.items():
            if hist.has_key(page):
                if len(hist[page].pred) < i:
                    # pad with zeros for hours the page was missing
                    a = [0]*(i-len(hist[page].pred))
                    hist[page].pred.extend(a)
                hist[page].pred.append(c.real)
            else:
                a = [0]*(i+1)
                a[-1] = c.real
                hist[page] = count_tuple(a, new[page].real if new.has_key(page) else 0)
    ret = {}
    for (page, count) in hist.items():
        if len(hist[page].pred) < len(ls):
            a = [0]*(len(ls)-len(hist[page].pred))
            hist[page].pred.extend(a)
        # Least-squares line fit of the history against time, extrapolated
        # one step beyond the end of the window.
        slope,intercept = np.linalg.lstsq(np.transpose([np.array(range(0,len(count.pred))),
                                                        np.ones(len(count.pred))]),
                                          np.array(count.pred))[0]
        ret[page] = count._replace(pred=slope * (len(count.pred) + 1) + intercept)
    return ret
 67+
def moving_accumurate(ls, n, extract=lambda x: x, accumurate=lambda sum,x: sum+x):
    """Slide a window of size n over ls, yielding accumurate(window, item).

    Every element is first passed through *extract*.  For n <= 1 the
    extracted elements are yielded unchanged.  Otherwise the first n-1
    extracted elements only fill the window; from the n-th element on,
    accumurate(window, current) is yielded, then the window advances by
    dropping its oldest entry and appending the current one.
    """
    if n <= 1:
        for item in ls:
            yield extract(item)
        return
    window = deque()
    for idx, raw in enumerate(ls):
        value = extract(raw)
        if idx < n - 1:
            window.append(value)
        else:
            yield accumurate(window, value)
            window.popleft()
            window.append(value)
 83+
# Command-line entry point: scan a chronologically sorted list of wikistats
# dump files and report pages whose observed hourly pageview count exceeds
# the linearly predicted count by more than --rate ("bursting" articles).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: codecs.open(x, 'w', 'utf-8'), default=sys.stdout,
                        help='')
    parser.add_argument('-R', '--rate', metavar='RATE',
                        dest='rate', type=float, default=8.0,
                        help='')
    parser.add_argument('-m', '--max-duration', metavar='HOURS',
                        dest='max', type=float, default=5,
                        help='')
    parser.add_argument('-w', '--window', metavar='HOURS',
                        dest='window', type=int, default=5,
                        help='')
    parser.add_argument('-M', '--min-count', metavar='N',
                        dest='min', type=int, default=2000,
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('-i', '--inclusive',
                        dest='inclusive', action='store_true', default=False,
                        help='include the items below the threshold, add a binary indicator column')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = options.output
    writer = csv.writer(fh, delimiter='\t')
    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    # Dump file names sort chronologically; feed them through a sliding
    # window so each hour is predicted from the preceding --window hours.
    options.files.sort()
    gen_sums = moving_accumurate(options.files, options.window, extract=load_wikistats_file,
                                 accumurate=lambda hist,cur: (cur.date, predict([x.count for x in hist],cur.count)))
    bursting = {}  # page -> number of hours the page has been tracked as bursting
    for (newtime,new) in gen_sums:
        for (page,count) in new.items():
            if count.real < options.min:
                continue
            # r is the relative "surprisedness" of the observed count.
            r = 0
            if count.pred == 0:
                if count.real > options.min:
                    r = 9999999  # sentinel for growth from a zero prediction
            else:
                r = float(count.real - count.pred) / count.pred
            if r > options.rate:
                bursting.setdefault(page, 0)
            if bursting.has_key(page):
                bursting[page] += 1
            if bursting.has_key(page) or options.inclusive:
                b = bursting[page] if bursting.has_key(page) else 0
                try:
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(newtime),
                          count.pred,
                          count.real,
                          b,
                          r]
                    if options.inclusive:
                        # extra boolean column: is the page currently bursting?
                        ls.insert(len(ls), bursting.has_key(page))
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
            # Stop tracking once the burst decays sharply or has lasted
            # longer than --max-duration (scaled by the window size).
            if bursting.has_key(page) and (r < -options.rate or bursting[page] > float(options.max) / options.window):
                bursting.pop(page)
 155+
 156+
Property changes on: trunk/tools/wsor/trending_articles/detectbursts.py
___________________________________________________________________
Added: svn:executable
1157 + *
Index: trunk/tools/wsor/trending_articles/README.rst
@@ -0,0 +1,4 @@
 2+See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
 3+
 4+Counts files are available at:
 5+http://dammit.lt/wikistats/archive/2011/01/
Index: trunk/tools/wsor/trending_articles/chart.py
@@ -0,0 +1,108 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+#
 5+
 6+import numpy
 7+import pylab as plt
 8+import matplotlib
 9+import os
 10+import argparse
 11+import sys
 12+import csv
 13+import datetime
 14+import math
 15+import re
 16+from collections import namedtuple
 17+
 18+counter_tuple = namedtuple('counter', 'name filter color explode')
 19+
def str_to_time(x):
    """Parse a 'YYYYMMDDHHMMSS' timestamp string into a datetime."""
    stamp_format = '%Y%m%d%H%M%S'
    return datetime.datetime.strptime(x, stamp_format)
 22+
# Command-line entry point: draw one pie chart (saved as SVG) per input TSV
# file, breaking edits down by editor category; each chart's side length is
# scaled by the square root of the edits/article/hour ratio parsed from the
# file's trailing '# a / b / c' comment line, so chart AREA tracks activity.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--font-size', metavar='POINT',
                        dest='fsize', type=int, default=15,
                        help='')
    parser.add_argument('-f', '--field', metavar='N',
                        dest='field', type=int, default=9,
                        help='')
    parser.add_argument('-w', '--width', metavar='SIZE',
                        dest='width', type=int, default=0.3,
                        help='')
    parser.add_argument('-y', '--ylimit', metavar='LIMITS',
                        dest='ylim', type=str, default='-300,1500',
                        help='')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()


    csv.field_size_limit(1000000000)

    # Editor categories, tested in order; the first matching filter wins.
    # Columns (0-based): 10 = user type, 13 = NEW/OLD, 14 = protection
    # status -- presumably matching find_revision_status.py output; confirm.
    counters = [counter_tuple('new reg. users', lambda x: x[10] == 'REG' and x[13] == 'NEW', '#4444FF', 0.1),
                counter_tuple('old reg. users (semiprotected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'SEMIPROTECT', '#99DD99', 0.0),
                counter_tuple('old reg. users (not protected)', lambda x: x[10] == 'REG' and x[13] == 'OLD' and x[14] == 'NO_PROTECT', '#8888EE', 0.0),
                counter_tuple('old reg. users (other)', lambda x: x[10] == 'REG' and x[13] == 'OLD', '#FFFFFF', 0.0),
                counter_tuple('new IP users', lambda x: x[10] == 'ANON' and x[13] == 'NEW', '#FF4444', 0.1),
                counter_tuple('old IP users', lambda x: x[10] == 'ANON' and x[13] == 'OLD', '#EE8888', 0.0),
                counter_tuple('bots', lambda x: x[10] == 'REG_BOT', '#666666', 0.0),
                counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
                ]

    # (An alternative category set excluding semiprotected pages existed
    # here as commented-out code; removed for clarity.)

    counters_map = {}
    for x in counters:
        counters_map[x.name] = x

    # Pull the '<edits> / <articles> / <hours>' ratio from each file.
    ratios = []
    patt = re.compile('(\d+) / (\d+) / (\d+)')
    for (i,fname) in enumerate(options.files):
        for line in open(fname).readlines():
            m = patt.search(line)
            if m:
                ratios.append((float(m.group(1)) / float(m.group(2)) / float(m.group(3))) ** 0.5)
                break
    sum_ratio = sum(ratios)
    counter_names = [x.name for x in counters]

    plots = []
    matplotlib.rc('font', size=options.fsize)
    for (n,fname) in enumerate(options.files):
        plt.figure(figsize=(10,10))
        # Skip '#'-prefixed comment lines and the header row.
        table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), delimiter='\t'))
        table = table[1:]

        # Bucket the values of column --field by editor category; the set
        # deduplicates repeated values.
        counts = {}
        for name in counter_names:
            counts[name] = set()
        for cols in table:
            for c in counters:
                if c.filter(cols):
                    counts[c[0]].add(cols[options.field-1])
                    break

        print counts#!
        #plt.subplot(1, len(options.files), n+1)
        plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
        plt.title(fname)
        p = plt.pie([len(counts[x]) for x in counter_names],
                    explode=[counters_map[x].explode for x in counter_names],
                    autopct='%1.1f%%',
                    pctdistance=1.2,
                    colors=[x.color for x in counters])

        plt.legend(p[0], ['' if counts[x] == 0 else x for x in counter_names],
                   loc=(.8, .8))

        base,ext = os.path.splitext(fname)
        plt.savefig('.'.join([base, 'svg']))
Property changes on: trunk/tools/wsor/trending_articles/chart.py
___________________________________________________________________
Added: svn:executable
1110 + *
Index: trunk/tools/wsor/trending_articles/find_revision_status.py
@@ -0,0 +1,195 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+#
 5+
 6+import oursql
 7+import os
 8+import argparse
 9+import sys
 10+import csv
 11+import urllib2
 12+import re
 13+from datetime import datetime, timedelta
 14+
def parse_wikidate(x):
    """Parse a MediaWiki timestamp ('YYYYMMDDHHMMSS', str or int) into a datetime."""
    text = str(x)
    return datetime.strptime(text, '%Y%m%d%H%M%S')
 17+
def format_wikidate(x):
    """Format a datetime as a MediaWiki 'YYYYMMDDHHMMSS' timestamp string."""
    return x.strftime('%Y%m%d%H%M%S')
 20+
def title2pageid(cursor, title, namespace=0):
    """Look up a page by title and namespace.

    Returns (page_id, page_is_redirect) for the first matching row, or
    (None, None) when no such page exists.
    """
    query = '''
    SELECT p.page_id, page_is_redirect
    FROM page p
    WHERE
    p.page_title = ?
    AND p.page_namespace = ?
    ;
    '''
    cursor.execute(query, (title, namespace))
    rows = list(cursor)
    if not rows:
        return (None, None)
    return tuple(rows[0])
 34+
def redirected(cursor, pid, namespace=0):
    """Resolve the redirect target of page *pid*.

    Follows chained redirects recursively and returns (title, page_id) of
    the final target, or (None, None) if a target title has no page row.
    NOTE(review): assumes *pid* has a row in `redirect` -- list(cursor)[0]
    raises IndexError otherwise.  The intermediate title lookup calls
    title2pageid without the *namespace* argument, so chains are always
    resolved in namespace 0; confirm this is intended.
    """
    cursor.execute('''
    SELECT rd_title
    FROM redirect r
    WHERE
    r.rd_from = ?
    ;
    ''', (pid,))
    title = list(cursor)[0][0]
    rd_pid,rd = title2pageid(cursor, title)
    if rd_pid == None:
        return (None,None)
    if rd and rd_pid != pid:
        # target is itself a redirect: follow the chain
        (title, rd_pid) = redirected(cursor, rd_pid, namespace)
    return (title, rd_pid)
 50+
def firstedits(cursor, uid, uname, delta, n):
    """Return up to *n* rev_ids of a user's edits within *delta* of their first edit.

    Registered users (uid != 0) are matched on rev_user; anonymous users
    on rev_user_text.  NOTE(review): list(cursor)[0] raises IndexError for
    a user with no revisions at all.
    """
    # Choose the WHERE clause by user kind; the %s interpolation below only
    # inserts this fixed string, values still go through ? placeholders.
    where = 'r.rev_user_text = ?'
    uspec = uname
    if uid != 0:
        where = 'r.rev_user = ?'
        uspec = uid
    # First, find the timestamp of the user's earliest edit.
    cursor.execute('''
    SELECT r.rev_timestamp
    FROM revision r
    WHERE
    r.rev_timestamp != ""
    AND %s
    ORDER BY r.rev_timestamp ASC
    LIMIT 1
    ;
    ''' % (where,), (uspec,))
    first = list(cursor)[0][0]
    first = parse_wikidate(first)
    # Then collect revisions in the window [first, first + delta].
    cursor.execute('''
    SELECT r.rev_id
    FROM revision r
    WHERE
    %s
    AND r.rev_timestamp BETWEEN ? AND ?
    LIMIT ?
    ;
    ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + delta), n))
    return [int(x[0]) for x in list(cursor)]
 79+
def editcount(cursor, uid, uname, timestamp):
    """Count a user's revisions made strictly before *timestamp*.

    Registered users (uid != 0) are matched on rev_user; anonymous users
    on rev_user_text.  The interpolated fragment is a fixed string; values
    still go through ? placeholders.
    """
    if uid != 0:
        condition = 'r.rev_user = ?'
        user_key = uid
    else:
        condition = 'r.rev_user_text = ?'
        user_key = uname

    sql = '''
    SELECT count(*)
    FROM revision r
    WHERE
    %s
    AND r.rev_timestamp < ?
    ;
    ''' % (condition,)
    cursor.execute(sql, (user_key, timestamp))
    return int(list(cursor)[0][0])
 96+
 97+if __name__ == '__main__':
 98+ parser = argparse.ArgumentParser()
 99+ parser.add_argument('-f', '--field', metavar='N',
 100+ dest='field', type=int, default=1,
 101+ help='')
 102+ parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
 103+ dest='db', type=str, default='hywiki-p',
 104+ help='target wiki name')
 105+ parser.add_argument('input')
 106+ options = parser.parse_args()
 107+ options.db = options.db.replace('_','-')
 108+
 109+ host = options.db + '.rrdb.toolserver.org'
 110+ conn = oursql.connect(host = host,
 111+ read_default_file=os.path.expanduser('~/.my.cnf'),
 112+ db = options.db.replace('-','_'),
 113+ charset=None,
 114+ use_unicode=False)
 115+
 116+ cursor = conn.cursor()
 117+
 118+ csv.field_size_limit(1000000000)
 119+ table = list(csv.reader(open(options.input), delimiter='\t'))
 120+ table = table[1:]
 121+
 122+ output = []
 123+ hours = {}
 124+ for cols in table:
 125+ cursor.execute('''
 126+ SELECT p.page_id, p.page_title, page_is_redirect
 127+ FROM page p
 128+ WHERE
 129+ p.page_title = ?
 130+ AND p.page_namespace = 0
 131+ ;
 132+ ''', (cols[options.field-1],))
 133+ res = list(cursor)
 134+ if res == None or res == []:
 135+ print >>sys.stderr, 'error 1 %s' % cols
 136+ continue
 137+ redirect = int(res[0][2]) == 1
 138+ cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
 139+ cols.insert(options.field, str(res[0][0]))
 140+ output.append(cols)
 141+ if redirect:
 142+ (title,pageid) = redirected(cursor, res[0][0])
 143+ if title == None:
 144+ print >>sys.stderr, 'error 2 %s' % cols
 145+ continue
 146+ a = [x for x in cols]
 147+ a[0] = title
 148+ a[1] = str(pageid)
 149+ a[2] = 'REDIRECT_RESOLVED'
 150+ output.append(a)
 151+ hours[cols[3]] = True
 152+
 153+ # cursor.executemany('''
 154+ # SELECT p.page_title, p.page_id
 155+ # FROM page p
 156+ # WHERE
 157+ # p.page_title = ?
 158+ # AND p.page_namespace = 0
 159+ # ''', [(urllib2.quote(x[options.field-1]),) for x in table])
 160+ # print list(cursor)
 161+
 162+ print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])
 163+
 164+ botpat = re.compile('bot( |$)', re.IGNORECASE)
 165+ edits = 0
 166+ articles = {}
 167+ for cols in output:
 168+ start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
 169+ end = start + timedelta(hours=1)
 170+ cursor.execute('''
 171+ SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
 172+ FROM revision r
 173+ WHERE
 174+ r.rev_page = ?
 175+ AND rev_timestamp BETWEEN ? AND ?
 176+ ;
 177+
 178+ ''', (cols[1],
 179+ datetime.strftime(start, '%Y%m%d$H%M%S'),
 180+ datetime.strftime(end, '%Y%m%d$H%M%S'),
 181+ ))
 182+ ls = list(cursor)
 183+ if len(ls) == 0:
 184+ print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
 185+ for (rev,ts,uid,username) in ls:
 186+ usertype = 'ANON' if uid == 0 else 'REG'
 187+ if uid != 0 and botpat.search(username):
 188+ usertype += '_BOT'
 189+ output = cols + [str(x) for x in [rev, ts, usertype, username,
 190+ editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
 191+ 'NEW' if firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 'OLD']]
 192+ print '\t'.join(output)
 193+ edits +=1
 194+ articles[cols[1]] = True
 195+
 196+ print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), len(hours.keys()))
Property changes on: trunk/tools/wsor/trending_articles/find_revision_status.py
___________________________________________________________________
Added: svn:executable
1197 + *
Index: trunk/tools/wsor/trending_articles/filter.py
@@ -0,0 +1,40 @@
 2+#! /usr/bin/env python
 3+
 4+import random
 5+import argparse
 6+import time
 7+import sys
 8+import os
 9+import gzip
 10+
def parse(f):
    """Yield (line, key) pairs from a wikistats file object.

    The key is the line up to the space before its second-to-last field,
    i.e. the '<lang> <title>' prefix of a '<lang> <title> <count> <bytes>'
    line.  Lines with fewer than two spaces are printed to stderr and
    skipped.
    """
    for line in f.readlines():
        i = None
        try:
            # index of the space separating <title> from <count>
            i = line.rindex(' ', 0, line.rindex(' '))
        except ValueError:
            print >>sys.stderr, line
        if i != None and i > 0:
            yield (line,line[0:i])
 20+
# Command-line entry point: copy each gzipped wikistats file into
# --directory, keeping only lines whose '<lang> <title>' key appears in
# the --list file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--list', metavar='FILE',
                        dest='filter', type=str, required=True,
                        help='')
    parser.add_argument('-d', '--directory', metavar='DIR',
                        dest='dir', type=str, required=True,
                        help='')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    accepts = {}  # accepted '<lang> <title>' keys (dict used as a set)

    for line in open(options.filter).readlines():
        accepts[line.strip()] = True
    for f in options.files:
        print >>sys.stderr, f
        # NOTE(review): the gzip handle is never explicitly closed; flushing
        # relies on interpreter exit.
        w = gzip.open(os.path.sep.join([options.dir, os.path.basename(f)]), 'w')
        for (line,p) in parse(gzip.open(f)):
            if accepts.has_key(p):
                # trailing comma: line already ends with '\n'
                print >>w, line,
Property changes on: trunk/tools/wsor/trending_articles/filter.py
___________________________________________________________________
Added: svn:executable
142 + *
Index: trunk/tools/wsor/trending_articles/filter_random.py
@@ -0,0 +1,22 @@
 2+#! /usr/bin/env python
 3+import random
 4+import argparse
 5+import time
 6+import sys
 7+
# Command-line entry point: randomly sample lines from stdin, keeping each
# line independently with probability --acceptance-rate.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.02,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    options = parser.parse_args()

    random.seed(options.seed)

    for line in sys.stdin.readlines():
        if random.random() > options.accept:
            continue
        # trailing comma: line already ends with '\n'
        print line,
Property changes on: trunk/tools/wsor/trending_articles/filter_random.py
___________________________________________________________________
Added: svn:executable
124 + *
Index: trunk/tools/wsor/trending_articles/detectnonbursts_random.py
@@ -0,0 +1,86 @@
 2+#! /usr/bin/env python
 3+# -*- coding: utf-8 -*-
 4+
 5+import pymongo
 6+import codecs
 7+import csv
 8+import sys
 9+from datetime import datetime, timedelta
 10+from pymongo.master_slave_connection import MasterSlaveConnection
 11+import argparse
 12+import random
 13+import gzip
 14+import re
 15+import os
 16+import time
 17+import urllib2
 18+from collections import deque, namedtuple
 19+import numpy as np
 20+
 21+pageview_tuple = namedtuple('Pageview', 'date count')
 22+count_tuple = namedtuple('Count', 'pred real')
 23+
def time_parse(x):
    """Parse a wikistats dump file name like 'pagecounts-20110101-120000.gz' into a datetime."""
    name_format = 'pagecounts-%Y%m%d-%H%M%S.gz'
    return datetime.strptime(x, name_format)
def time_format(x):
    """Format a datetime as 'YYYY/MM/DD HH:MM:SS' for the TSV output."""
    return x.strftime('%Y/%m/%d %H:%M:%S')
def datetime2days(x):
    """Convert a timedelta *x* to a float number of days.

    Sums the days, seconds and microseconds components.  Bug fix: the
    original divided microseconds by 6000*60*24 (8.64e6) instead of
    1e6*60*60*24 (8.64e10), overstating the sub-second contribution by a
    factor of 10000.
    """
    return (x.days
            + x.seconds / 60.0 / 60.0 / 24.0
            + x.microseconds / 1000000.0 / 60.0 / 60.0 / 24.0)
 30+
def load_wikistats_file(f):
    """Load one gzipped wikistats pagecount file.

    Each line has the form '<lang> <title> <count> <bytes>'.  Returns a
    Pageview namedtuple whose .date is parsed from the file's basename and
    whose .count maps (lang, title) -> Count(pred=float(count), real=int(count)).
    """
    print >>sys.stderr, 'loading %s...' % f
    ret = {}
    for line in gzip.open(f):
        # NOTE(review): str.strip() returns a new string and this result is
        # discarded, so the trailing newline survives in the last (unused)
        # 'bytes' field; the count field is unaffected.
        line.strip()
        (lang,title,count,bytes) = line.split(' ')
        ret[(lang,title)] = count_tuple(float(count), int(count))
    return pageview_tuple(time_parse(os.path.basename(f)), ret)
 39+
# Command-line entry point: sample random (page, hour) observations from
# wikistats dumps as a non-bursting control set, written in the same TSV
# format as detectbursts.py output (pred and rate columns are None).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=str, required=True,
                        help='')
    parser.add_argument('-a', '--acceptance-rate', metavar='RATE',
                        dest='accept', type=float, default=0.001,
                        help='')
    parser.add_argument('-r', '--random-seed', metavar='SEED',
                        dest='seed', type=int, default=int(time.time()),
                        help='random number seed')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    parser.add_argument('files', nargs='+')
    options = parser.parse_args()

    fh = codecs.open(options.output, 'w', 'utf-8')
    writer = csv.writer(fh, delimiter='\t')
    random.seed(options.seed)

    fh.write('title\ttime\tcount_pred\tcount_\tcont\trate\n')

    if options.verbose:
        print >>sys.stderr, options

    options.files.sort()
    for fname in options.files:
        pvs = load_wikistats_file(fname)
        for (page,count) in pvs.count.items():
            # Keep each (page, hour) with probability --acceptance-rate.
            if random.random() < options.accept:
                ls = []
                try:
                    ls = [urllib2.unquote(page[1]).decode('utf-8'),
                          time_format(pvs.date),
                          None,
                          count.real,
                          0,
                          None]
                    writer.writerow([unicode(x) for x in ls])
                except UnicodeEncodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
                except UnicodeDecodeError, e:
                    print >>sys.stderr, '%s: %s' % (e, page)
                    continue
 86+
 87+
Property changes on: trunk/tools/wsor/trending_articles/detectnonbursts_random.py
___________________________________________________________________
Added: svn:executable
188 + *

Status & tagging log