Index: trunk/tools/editor_trends/statistics/median.py |
— | — | @@ -0,0 +1,130 @@ |
| 2 | +from collections import deque |
| 3 | +from random import random |
| 4 | +from math import log |
| 5 | + |
class Infinity(object):
    """Sentinel object that always compares greater than any other object.

    Used as the skip-list terminator (NIL) in running_median() so the
    search loops never run off the end of a level.
    """

    def __cmp__(self, other):
        # Python 2 three-way comparison hook: always report "greater".
        return 1

    # Rich comparison methods so the sentinel also behaves correctly under
    # Python 3 (which ignores __cmp__) and under reflected comparisons
    # such as `x < Infinity()`.
    def __lt__(self, other):
        return False

    def __le__(self, other):
        return False

    def __gt__(self, other):
        return True

    def __ge__(self, other):
        return True
| 10 | + |
def running_median(n, iterable, len=len, min=min, int=int, log=log, random=random):
    'Fast running median with O(lg n) updates where n is the window size'
    # Implementation: an indexable skip list holds the current window in
    # sorted order, while a deque records arrival order so the oldest
    # element can be evicted.  Each skip-list node is a triple
    # [VALUE, NEXT, WIDTH] where NEXT[level] is the successor on that
    # level and WIDTH[level] counts how many bottom-level links that hop
    # spans -- the widths are what make the list indexable, so the
    # midpoint can be located in O(lg n).
    # Binding builtins as default arguments is a CPython speed idiom:
    # inside the loop they resolve as fast local lookups.

    maxlevels = int(log(n, 2)) + 1          # enough levels for ~n nodes
    bottom_to_top = list(range(maxlevels))
    top_to_bottom = bottom_to_top[::-1]

    VALUE, NEXT, WIDTH = 0, 1, 2            # Components of a node list
    NIL = [Infinity(), [], []]              # Singleton terminator node
    head = ['HEAD', [NIL] * maxlevels, [1] * maxlevels]
    staircase = [None] * maxlevels          # scratch: rightmost node visited on each level

    queue = deque()                         # window elements in arrival order
    queue_append, queue_popleft = queue.append, queue.popleft
    midpoint = n // 2
    oldnode = None                          # node freed by the last removal, recycled on insert
    for newelem in iterable:
        # staircase: first node on each level where node[NEXT][level][VALUE] > newelem
        queue_append(newelem)
        stair_width = [0] * maxlevels       # bottom-level links traversed per level
        node = head
        for level in top_to_bottom:
            while not newelem < node[NEXT][level][VALUE]:
                stair_width[level] += node[WIDTH][level]
                node = node[NEXT][level]
            staircase[level] = node

        # make a new node or reuse one that was previously removed
        if oldnode is None:
            # Random height: P(d >= k) == 2**-(k-1), capped at maxlevels.
            # NOTE(review): random() may (very rarely) return exactly 0.0,
            # which would make log() raise ValueError -- confirm acceptable.
            d = min(maxlevels, 1 - int(log(random(), 2.0)))
            newnode = [newelem, [None] * d, [None] * d]
        else:
            newnode = oldnode
            newnode[VALUE] = newelem
            d = len(newnode[NEXT])

        # insert a link to the newnode at each level
        steps = 0                           # bottom-level distance from staircase[level] to newnode
        for level in bottom_to_top[:d]:
            prevnode = staircase[level]
            newnode[NEXT][level] = prevnode[NEXT][level]
            prevnode[NEXT][level] = newnode
            newnode[WIDTH][level] = prevnode[WIDTH][level] - steps
            prevnode[WIDTH][level] = steps
            steps += stair_width[level]
        # Every predecessor on every level now spans one more element.
        for level in bottom_to_top:
            prevnode = staircase[level]
            prevnode[WIDTH][level] += 1

        if len(queue) >= n:
            # find and yield the midpoint value
            i = midpoint + 1                # 1-based rank of the median element
            node = head
            for level in top_to_bottom:
                while node[WIDTH][level] <= i:
                    i -= node[WIDTH][level]
                    node = node[NEXT][level]
            yield node[VALUE]

            # staircase: first node on each level where node[NEXT][level][VALUE] >= oldelem
            oldelem = queue_popleft()       # evict the oldest window element
            node = head
            for level in top_to_bottom:
                while node[NEXT][level][VALUE] < oldelem:
                    node = node[NEXT][level]
                staircase[level] = node
            oldnode = staircase[0][NEXT][0] # node where oldnode[VALUE] is oldelem

            # remove links to the oldnode
            d = len(oldnode[NEXT])
            for level in bottom_to_top[:d]:
                prevnode = staircase[level]
                prevnode[WIDTH][level] += oldnode[WIDTH][level]
                prevnode[NEXT][level] = oldnode[NEXT][level]
            # Every predecessor on every level now spans one fewer element.
            for level in bottom_to_top:
                prevnode = staircase[level]
                prevnode[WIDTH][level] -= 1
| 89 | + |
if __name__ == '__main__':

    ###########################################################################
    # Demonstrate the running_median() generator
    # Compare results to an alternative generator
    # implemented by sorting a regular list.

    from bisect import insort
    from random import randrange
    from itertools import islice
    import datetime

    def running_median_slow(n, iterable):
        'Slow running-median with O(n) updates where n is the window size'
        # Reference implementation: keep the window in a deque (arrival
        # order) and a parallel sorted list; each update does an O(n)
        # list.remove() plus an O(n) insort().
        it = iter(iterable)
        queue = deque(islice(it, n))
        sortedlist = sorted(queue)
        midpoint = len(queue) // 2
        yield sortedlist[midpoint]
        for newelem in it:
            oldelem = queue.popleft()
            sortedlist.remove(oldelem)
            queue.append(newelem)
            insort(sortedlist, newelem)
            yield sortedlist[midpoint]

    # M: value range, N: stream length, window: median window size.
    M, N, window = 9000, 80000, 101

    data = [randrange(M) for i in range(N)]
    t1 = datetime.datetime.now()
    result = list(running_median(window, data))
    t2 = datetime.datetime.now()
    expected = list(running_median_slow(window, data))
    t3 = datetime.datetime.now()
    # The fast skip-list version must agree with the brute-force version.
    assert result == expected
    d1 = t2 - t1
    d2 = t3 - t2
    # FIX: the original used Python 2-only print statements; the print()
    # call form below is valid under both Python 2 and Python 3.
    print(result)
    print(expected)
    print('Fast median: %s; slow median: %s' % (d1, d2))
    print('Successful test of RunningMedian() with %s items and a window of size %s \n' % (N, window))
Property changes on: trunk/tools/editor_trends/statistics/median.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 132 | + native |
Index: trunk/tools/editor_trends/statistics/__init__.py |
Property changes on: trunk/tools/editor_trends/statistics/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 133 | + native |
Index: trunk/tools/editor_trends/statistics/dataset.py |
— | — | @@ -0,0 +1,13 @@ |
| 2 | + |
| 3 | + |
def determine_editor_is_new_wikipedian(edits, definition):
    """Return True if an editor qualifies as a "new wikipedian".

    edits      -- sequence of the editor's edits (only its length is used)
    definition -- qualification rule to apply; only 'traditional'
                  (at least 10 edits) is implemented

    Raises NotImplementedError for any other definition.
    """
    if definition == 'traditional':
        # Traditional cutoff: 10 or more edits makes a new wikipedian.
        # (Collapses the original redundant if/else over boolean literals.)
        return len(edits) >= 10
    raise NotImplementedError
| 13 | + |
| 14 | + |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/statistics/dataset.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 15 | + native |
Index: trunk/tools/editor_trends/statistics/stata/wiki.do |
— | — | @@ -0,0 +1,72 @@ |
// wiki.do -- descriptive statistics on editor trends ("new wikipedians").
// Converts edit timestamps to Stata dates, computes community size and
// retention per cohort, and produces summary graphs.

// Convert each timestamp string variable to a Stata daily date in place.
local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit"

foreach edit of local first_ten {
    gen date2 = date(`edit', "YMDhms")
    drop `edit'
    rename date2 `edit'
    format `edit' %td
}

// NOTE(review): year_joined is used below (sort, by, graphs) but is never
// generated in this script -- presumably created upstream; verify.
generate year_left = year(final_edit)
sort year_joined
by year_joined: gen community_size_t = _N

// Flag editors still active `year' years (365-day years) after their 10th
// edit, and compute the community size at each horizon (active2001..active20010).
forvalues year = 1(1)10{
    gen active200`year' = 0
    replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
    egen community_size_200`year' = total(active200`year')
}

// Year-over-year retention ratios.
// FIX: the original loop ran t = 1..10, so at t = 10 it referenced
// community_size_20011, which is never generated; stop at 9.
forvalues t = 1(1)9{
    local t1 = `t'+1
    gen retention200`t' = community_size_200`t1' / community_size_200`t'
}

generate time_to_new_wp = edits_10 - edits_1
generate active_time_wp = final_edit - edits_10
// FIX: bare `label varname "..."` is invalid syntax; the command to attach
// a variable label is `label variable`.
label variable time_to_new_wp "Number of days it took to become a new wikipedian"
label variable active_time_wp "Number of days active once becoming a new wikipedian"

compress

graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
graph box time_to_new_wp, over(year_joined) nooutsides
glcurve edit_count, by( year_joined) split lorenz

// Second analysis: per-editor time to reach 10 edits, from a CSV export.
insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
// 0 = False
// 1 = True

rename v1 id
rename v2 date
// FIX: the original ran `format date2 %td` before generating date2,
// which fails because the variable does not exist yet; generate first.
gen date2 = date(date, "MD20Y")
format date2 %td
sort id
by id: generate n = _n
by id: egen first_obs = min(date2)
by id: egen last_obs = max(date2)
by id: generate time_required = last_obs - first_obs
by id: generate year= year(last_obs)

// An editor "made ten edits" when their observation count reaches 10.
gen made_ten_edits =0
by id: egen temp = max(n)
by id: replace made_ten_edits=1 if(temp==10)
drop temp

by year, sort: egen time_to_new_wikipedian = mean( time_required)

compress
Property changes on: trunk/tools/editor_trends/statistics/stata/wiki.do |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |