r76202 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r76201‎ | r76202 | r76203 >
Date:17:44, 6 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Statistics module.
Modified paths:
  • /trunk/tools/editor_trends/statistics (added) (history)
  • /trunk/tools/editor_trends/statistics/__init__.py (added) (history)
  • /trunk/tools/editor_trends/statistics/dataset.py (added) (history)
  • /trunk/tools/editor_trends/statistics/median.py (added) (history)
  • /trunk/tools/editor_trends/statistics/stata (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/wiki.do (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/statistics/median.py
@@ -0,0 +1,130 @@
 2+from collections import deque
 3+from random import random
 4+from math import log
 5+
 6+class Infinity(object):
 7+ 'Sentinel object that always compares greater than another object'
 8+ def __cmp__(self, other):
 9+ return 1
 10+
 11+def running_median(n, iterable, len=len, min=min, int=int, log=log, random=random):
 12+ 'Fast running median with O(lg n) updates where n is the window size'
 13+
 14+ maxlevels = int(log(n, 2)) + 1
 15+ bottom_to_top = list(range(maxlevels))
 16+ top_to_bottom = bottom_to_top[::-1]
 17+
 18+ VALUE, NEXT, WIDTH = 0, 1, 2 # Components of a node list
 19+ NIL = [Infinity(), [], []] # Singleton terminator node
 20+ head = ['HEAD', [NIL] * maxlevels, [1] * maxlevels]
 21+ staircase = [None] * maxlevels
 22+
 23+ queue = deque()
 24+ queue_append, queue_popleft = queue.append, queue.popleft
 25+ midpoint = n // 2
 26+ oldnode = None
 27+ for newelem in iterable:
 28+ # staircase: first node on each level where node[NEXT][level][VALUE] > newelem
 29+ queue_append(newelem)
 30+ stair_width = [0] * maxlevels
 31+ node = head
 32+ for level in top_to_bottom:
 33+ while not newelem < node[NEXT][level][VALUE]:
 34+ stair_width[level] += node[WIDTH][level]
 35+ node = node[NEXT][level]
 36+ staircase[level] = node
 37+
 38+ # make a new node or reuse one that was previously removed
 39+ if oldnode is None:
 40+ d = min(maxlevels, 1 - int(log(random(), 2.0)))
 41+ newnode = [newelem, [None] * d, [None] * d]
 42+ else:
 43+ newnode = oldnode
 44+ newnode[VALUE] = newelem
 45+ d = len(newnode[NEXT])
 46+
 47+ # insert a link to the newnode at each level
 48+ steps = 0
 49+ for level in bottom_to_top[:d]:
 50+ prevnode = staircase[level]
 51+ newnode[NEXT][level] = prevnode[NEXT][level]
 52+ prevnode[NEXT][level] = newnode
 53+ newnode[WIDTH][level] = prevnode[WIDTH][level] - steps
 54+ prevnode[WIDTH][level] = steps
 55+ steps += stair_width[level]
 56+ for level in bottom_to_top:
 57+ prevnode = staircase[level]
 58+ prevnode[WIDTH][level] += 1
 59+
 60+ if len(queue) >= n:
 61+ # find and yield the midpoint value
 62+ i = midpoint + 1
 63+ node = head
 64+ for level in top_to_bottom:
 65+ while node[WIDTH][level] <= i:
 66+ i -= node[WIDTH][level]
 67+ node = node[NEXT][level]
 68+ yield node[VALUE]
 69+
 70+ # staircase: first node on each level where node[NEXT][level][VALUE] >= oldelem
 71+ oldelem = queue_popleft()
 72+ node = head
 73+ for level in top_to_bottom:
 74+ while node[NEXT][level][VALUE] < oldelem:
 75+ node = node[NEXT][level]
 76+ staircase[level] = node
 77+ oldnode = staircase[0][NEXT][0] # node where oldnode[VALUE] is oldelem
 78+
 79+ # remove links to the oldnode
 80+ d = len(oldnode[NEXT])
 81+ for level in bottom_to_top[:d]:
 82+ prevnode = staircase[level]
 83+ prevnode[WIDTH][level] += oldnode[WIDTH][level]
 84+ prevnode[NEXT][level] = oldnode[NEXT][level]
 85+ for level in bottom_to_top:
 86+ prevnode = staircase[level]
 87+ prevnode[WIDTH][level] -= 1
 88+
 89+
 90+if __name__ == '__main__':
 91+
 92+ ###########################################################################
 93+ # Demonstrate the running_median() generator
 94+ # Compare results to an alternative generator
 95+ # implemented by sorting a regular list.
 96+
 97+ from bisect import insort
 98+ from random import randrange
 99+ from itertools import islice
 100+ import datetime
 101+
 102+ def running_median_slow(n, iterable):
 103+ 'Slow running-median with O(n) updates where n is the window size'
 104+ it = iter(iterable)
 105+ queue = deque(islice(it, n))
 106+ sortedlist = sorted(queue)
 107+ midpoint = len(queue) // 2
 108+ yield sortedlist[midpoint]
 109+ for newelem in it:
 110+ oldelem = queue.popleft()
 111+ sortedlist.remove(oldelem)
 112+ queue.append(newelem)
 113+ insort(sortedlist, newelem)
 114+ yield sortedlist[midpoint]
 115+
 116+ M, N, window = 9000, 80000, 101
 117+
 118+ data = [randrange(M) for i in range(N)]
 119+ t1 = datetime.datetime.now()
 120+ result = list(running_median(window, data))
 121+ t2 = datetime.datetime.now()
 122+ expected = list(running_median_slow(window, data))
 123+ t3 = datetime.datetime.now()
 124+ assert result == expected
 125+ d1 = t2 - t1
 126+ d2 = t3 - t2
 127+ print result
 128+ print expected
 129+ print 'Fast median: %s; slow median: %s' % (d1, d2)
 130+ print 'Successful test of RunningMedian() with', N,
 131+ print 'items and a window of size', window, '\n'
Property changes on: trunk/tools/editor_trends/statistics/median.py
___________________________________________________________________
Added: svn:eol-style
1132 + native
Index: trunk/tools/editor_trends/statistics/__init__.py
Property changes on: trunk/tools/editor_trends/statistics/__init__.py
___________________________________________________________________
Added: svn:eol-style
2133 + native
Index: trunk/tools/editor_trends/statistics/dataset.py
@@ -0,0 +1,13 @@
 2+
 3+
 4+def determine_editor_is_new_wikipedian(edits, definition):
 5+
 6+ if definition == 'traditional':
 7+ if len(edits) < 10:
 8+ return False
 9+ else:
 10+ return True
 11+ else:
 12+ raise NotImplementedError
 13+
 14+
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/statistics/dataset.py
___________________________________________________________________
Added: svn:eol-style
115 + native
Index: trunk/tools/editor_trends/statistics/stata/wiki.do
@@ -0,0 +1,72 @@
 2+local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit"
 3+
 4+foreach edit of local first_ten {
 5+ gen date2 = date(`edit', "YMDhms")
 6+ drop `edit'
 7+ rename date2 `edit'
 8+ format `edit' %td
 9+}
 10+
 11+generate year_left = year(final_edit)
 12+sort year_joined
 13+by year_joined: gen community_size_t = _N
 14+
 15+
 16+forvalues year = 1(1)10{
 17+ gen active200`year' = 0
 18+ replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
 19+ egen community_size_200`year' = total(active200`year')
 20+}
 21+
 22+forvalues t = 1(1)10{
 23+ local t1 = `t'+1
 24+ gen retention200`t' = community_size_200`t1' / community_size_200`t'
 25+}
 26+
 27+
 28+
 29+
 30+
 31+
 32+
 33+generate time_to_new_wp = edits_10 - edits_1
 34+generate active_time_wp = final_edit - edits_10
 35+label time_to_new_wp "Number of days it took to become a new wikipedian"
 36+label active_time_wp "Number of days active once becoming a new wikipedian"
 37+
 38+
 39+
 40+compress
 41+
 42+graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
 43+histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
 44+graph box time_to_new_wp, over(year_joined) nooutsides
 45+glcurve edit_count, by( year_joined) split lorenz
 46+
 47+
 48+
 49+insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
 50+// 0 = False
 51+// 1 = True
 52+
 53+rename v1 id
 54+rename v2 date
 55+format date2 %td
 56+gen date2 = date(date, "MD20Y")
 57+sort id
 58+by id: generate n = _n
 59+by id: egen first_obs = min(date2)
 60+by id: egen last_obs = max(date2)
 61+by id: generate time_required = last_obs - first_obs
 62+by id: generate year= year(last_obs)
 63+
 64+gen made_ten_edits =0
 65+by id: egen temp = max(n)
 66+by id: replace made_ten_edits=1 if(temp==10)
 67+drop temp
 68+
 69+
 70+
 71+by year, sort: egen time_to_new_wikipedian = mean( time_required)
 72+
 73+compress
Property changes on: trunk/tools/editor_trends/statistics/stata/wiki.do
___________________________________________________________________
Added: svn:eol-style
174 + native

Status & tagging log