Index: trunk/tools/editor_trends/statistics/median.py
@@ -1,130 +0,0 @@
-from collections import deque
-from random import random
-from math import log
-
-class Infinity(object):
-    'Sentinel object that always compares greater than another object'
-    def __cmp__(self, other):
-        return 1
-
-def running_median(n, iterable, len=len, min=min, int=int, log=log, random=random):
-    'Fast running median with O(lg n) updates where n is the window size'
-
-    maxlevels = int(log(n, 2)) + 1
-    bottom_to_top = list(range(maxlevels))
-    top_to_bottom = bottom_to_top[::-1]
-
-    VALUE, NEXT, WIDTH = 0, 1, 2        # Components of a node list
-    NIL = [Infinity(), [], []]          # Singleton terminator node
-    head = ['HEAD', [NIL] * maxlevels, [1] * maxlevels]
-    staircase = [None] * maxlevels
-
-    queue = deque()
-    queue_append, queue_popleft = queue.append, queue.popleft
-    midpoint = n // 2
-    oldnode = None
-    for newelem in iterable:
-        # staircase: first node on each level where node[NEXT][level][VALUE] > newelem
-        queue_append(newelem)
-        stair_width = [0] * maxlevels
-        node = head
-        for level in top_to_bottom:
-            while not newelem < node[NEXT][level][VALUE]:
-                stair_width[level] += node[WIDTH][level]
-                node = node[NEXT][level]
-            staircase[level] = node
-
-        # make a new node or reuse one that was previously removed
-        if oldnode is None:
-            d = min(maxlevels, 1 - int(log(random(), 2.0)))
-            newnode = [newelem, [None] * d, [None] * d]
-        else:
-            newnode = oldnode
-            newnode[VALUE] = newelem
-            d = len(newnode[NEXT])
-
-        # insert a link to the newnode at each level
-        steps = 0
-        for level in bottom_to_top[:d]:
-            prevnode = staircase[level]
-            newnode[NEXT][level] = prevnode[NEXT][level]
-            prevnode[NEXT][level] = newnode
-            newnode[WIDTH][level] = prevnode[WIDTH][level] - steps
-            prevnode[WIDTH][level] = steps
-            steps += stair_width[level]
-        for level in bottom_to_top:
-            prevnode = staircase[level]
-            prevnode[WIDTH][level] += 1
-
-        if len(queue) >= n:
-            # find and yield the midpoint value
-            i = midpoint + 1
-            node = head
-            for level in top_to_bottom:
-                while node[WIDTH][level] <= i:
-                    i -= node[WIDTH][level]
-                    node = node[NEXT][level]
-            yield node[VALUE]
-
-            # staircase: first node on each level where node[NEXT][level][VALUE] >= oldelem
-            oldelem = queue_popleft()
-            node = head
-            for level in top_to_bottom:
-                while node[NEXT][level][VALUE] < oldelem:
-                    node = node[NEXT][level]
-                staircase[level] = node
-            oldnode = staircase[0][NEXT][0]     # node where oldnode[VALUE] is oldelem
-
-            # remove links to the oldnode
-            d = len(oldnode[NEXT])
-            for level in bottom_to_top[:d]:
-                prevnode = staircase[level]
-                prevnode[WIDTH][level] += oldnode[WIDTH][level]
-                prevnode[NEXT][level] = oldnode[NEXT][level]
-            for level in bottom_to_top:
-                prevnode = staircase[level]
-                prevnode[WIDTH][level] -= 1
-
-
-if __name__ == '__main__':
-
-    ###########################################################################
-    # Demonstrate the running_median() generator
-    # Compare results to an alternative generator
-    # implemented by sorting a regular list.
-
-    from bisect import insort
-    from random import randrange
-    from itertools import islice
-    import datetime
-
-    def running_median_slow(n, iterable):
-        'Slow running-median with O(n) updates where n is the window size'
-        it = iter(iterable)
-        queue = deque(islice(it, n))
-        sortedlist = sorted(queue)
-        midpoint = len(queue) // 2
-        yield sortedlist[midpoint]
-        for newelem in it:
-            oldelem = queue.popleft()
-            sortedlist.remove(oldelem)
-            queue.append(newelem)
-            insort(sortedlist, newelem)
-            yield sortedlist[midpoint]
-
-    M, N, window = 9000, 80000, 101
-
-    data = [randrange(M) for i in range(N)]
-    t1 = datetime.datetime.now()
-    result = list(running_median(window, data))
-    t2 = datetime.datetime.now()
-    expected = list(running_median_slow(window, data))
-    t3 = datetime.datetime.now()
-    assert result == expected
-    d1 = t2 - t1
-    d2 = t3 - t2
-    print result
-    print expected
-    print 'Fast median: %s; slow median: %s' % (d1, d2)
-    print 'Successful test of RunningMedian() with', N,
-    print 'items and a window of size', window, '\n'
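Note: the deleted median.py is a self-contained running (sliding-window) median built on an indexable skip list; it yields one median per window position once the first n values have been consumed. A minimal usage sketch follows. The import path and sample data are assumptions for illustration; only the running_median(n, iterable) signature and behaviour come from the deleted code (Python 2, matching the module's print syntax).

# Hypothetical usage of the generator removed above.
from median import running_median   # assumed import path

data = [5, 2, 8, 1, 9, 4, 7, 3, 6]
print list(running_median(3, data))  # window of 3 -> should print [5, 2, 8, 4, 7, 4, 6]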
Index: trunk/tools/editor_trends/statistics/dataset.py
@@ -1 +0,0 @@
-
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do
@@ -5,12 +5,17 @@
 local projects "enwiki"
 //local projects "enwiki ruwiki dewiki eswiki jawiki"
 foreach proj of local projects {
+    clear
     //di "`loc'"
     //di "`proj'"
-    local p = "`source'" + "`proj'" + "_cohort_data_backward.txt"
+    //local p = "`source'" + "`proj'" + "_cohort_dataset_backward_bar.csv"
+    ren year experience
+    local p = "`source'" + "cohort_dataset_backward_bar.csv"
     //di "`p'"
     insheet using `p'
-
+    split date, p("-")
+    destring date1, replace
+    ren date1 year
     sort year

     by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
@@ -84,7 +89,7 @@
     label var more_one_year_abs "Editors with more than one year experience"

     twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
-    local f = "`loc'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
+    local f = "`target'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
     graph export `f', replace
     //subtitle(Editors are getting older and influx of new editors has stagnated)

@@ -99,7 +104,7 @@
     graph export `f', replace


-    clear
+
 }
 set more on

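Note: the change above points cohort_charts_backward.do at a combined cohort_dataset_backward_bar.csv, renames the original year column to experience, and rebuilds year by splitting the date column on "-" and destringing the first piece. As a reading aid only, a rough Python/pandas equivalent of that reshaping is sketched below; pandas is not part of these scripts, and only the file and column names are taken from the diff.

import pandas as pd

# insheet using cohort_dataset_backward_bar.csv
df = pd.read_csv('cohort_dataset_backward_bar.csv')

# ren year experience
df = df.rename(columns={'year': 'experience'})

# split date, p("-")  /  destring date1, replace  /  ren date1 year
df['year'] = df['date'].str.split('-').str[0].astype(int)

# sort year
df = df.sort_values('year')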
Index: trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do
@@ -1,12 +1,13 @@
 clear
 set more off
 local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
-local projects "ruwiki dewiki eswiki jawiki enwiki"
-
+//local projects "ruwiki dewiki eswiki jawiki enwiki"
+local projects "enwiki"
 foreach proj of local projects {
     clear

-    local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
+    //local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
+    local p = "`loc'" + "cohort_dataset_forward_histogram.csv"
     insheet using `p'
     label var experience "Number of months active"
     gen date = date(_time, "YMD")
Index: trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do
@@ -1,18 +1,21 @@
 clear
 set more off
 local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
-local projects "ruwiki dewiki eswiki jawiki enwiki"
-
+//local projects "ruwiki dewiki eswiki jawiki enwiki"
+local projects "enwiki"
 foreach proj of local projects {
     clear
-    local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
+    //local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
+    local p = "`loc'" + "cohort_dataset_forward_histogram.csv"
     insheet using `p'
-    ren v1 raw_date
-    ren v2 experience
-    ren v3 count
-
-    gen date = date(raw_date, "MY")
-    format date %td
+    ren date raw_date
+    ren month experience
+    //ren count count
+    split raw_date, p(" ")
+    drop raw_date
+    ren raw_date1 raw_date
+    gen date = date(raw_date, "YMD")
+    //format date %tC

     egen min_year= min(year(date))
     egen max_year= max(year(date))
@@ -40,7 +43,7 @@

     replace count = . if count ==0

-    forvalues year = `min_year'(1)`max_year' {
+    forvalues year = `min_year'(1)`max_year' {
     di `year'
     //local end_date = "1,31," + "`year'"
     //di `end_date'