r80940 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r80939 | r80940 | r80941 >
Date:23:40, 24 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Cleanup of files.
Modified paths:
  • /trunk/tools/editor_trends/statistics/dataset.py (deleted) (history)
  • /trunk/tools/editor_trends/statistics/median.py (deleted) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do (modified) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do (modified) (history)
  • /trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/statistics/median.py
@@ -1,130 +0,0 @@
2 -from collections import deque
3 -from random import random
4 -from math import log
5 -
6 -class Infinity(object):
7 - 'Sentinel object that always compares greater than another object'
8 - def __cmp__(self, other):
9 - return 1
10 -
11 -def running_median(n, iterable, len=len, min=min, int=int, log=log, random=random):
12 - 'Fast running median with O(lg n) updates where n is the window size'
13 -
14 - maxlevels = int(log(n, 2)) + 1
15 - bottom_to_top = list(range(maxlevels))
16 - top_to_bottom = bottom_to_top[::-1]
17 -
18 - VALUE, NEXT, WIDTH = 0, 1, 2 # Components of a node list
19 - NIL = [Infinity(), [], []] # Singleton terminator node
20 - head = ['HEAD', [NIL] * maxlevels, [1] * maxlevels]
21 - staircase = [None] * maxlevels
22 -
23 - queue = deque()
24 - queue_append, queue_popleft = queue.append, queue.popleft
25 - midpoint = n // 2
26 - oldnode = None
27 - for newelem in iterable:
28 - # staircase: first node on each level where node[NEXT][level][VALUE] > newelem
29 - queue_append(newelem)
30 - stair_width = [0] * maxlevels
31 - node = head
32 - for level in top_to_bottom:
33 - while not newelem < node[NEXT][level][VALUE]:
34 - stair_width[level] += node[WIDTH][level]
35 - node = node[NEXT][level]
36 - staircase[level] = node
37 -
38 - # make a new node or reuse one that was previously removed
39 - if oldnode is None:
40 - d = min(maxlevels, 1 - int(log(random(), 2.0)))
41 - newnode = [newelem, [None] * d, [None] * d]
42 - else:
43 - newnode = oldnode
44 - newnode[VALUE] = newelem
45 - d = len(newnode[NEXT])
46 -
47 - # insert a link to the newnode at each level
48 - steps = 0
49 - for level in bottom_to_top[:d]:
50 - prevnode = staircase[level]
51 - newnode[NEXT][level] = prevnode[NEXT][level]
52 - prevnode[NEXT][level] = newnode
53 - newnode[WIDTH][level] = prevnode[WIDTH][level] - steps
54 - prevnode[WIDTH][level] = steps
55 - steps += stair_width[level]
56 - for level in bottom_to_top:
57 - prevnode = staircase[level]
58 - prevnode[WIDTH][level] += 1
59 -
60 - if len(queue) >= n:
61 - # find and yield the midpoint value
62 - i = midpoint + 1
63 - node = head
64 - for level in top_to_bottom:
65 - while node[WIDTH][level] <= i:
66 - i -= node[WIDTH][level]
67 - node = node[NEXT][level]
68 - yield node[VALUE]
69 -
70 - # staircase: first node on each level where node[NEXT][level][VALUE] >= oldelem
71 - oldelem = queue_popleft()
72 - node = head
73 - for level in top_to_bottom:
74 - while node[NEXT][level][VALUE] < oldelem:
75 - node = node[NEXT][level]
76 - staircase[level] = node
77 - oldnode = staircase[0][NEXT][0] # node where oldnode[VALUE] is oldelem
78 -
79 - # remove links to the oldnode
80 - d = len(oldnode[NEXT])
81 - for level in bottom_to_top[:d]:
82 - prevnode = staircase[level]
83 - prevnode[WIDTH][level] += oldnode[WIDTH][level]
84 - prevnode[NEXT][level] = oldnode[NEXT][level]
85 - for level in bottom_to_top:
86 - prevnode = staircase[level]
87 - prevnode[WIDTH][level] -= 1
88 -
89 -
90 -if __name__ == '__main__':
91 -
92 - ###########################################################################
93 - # Demonstrate the running_median() generator
94 - # Compare results to an alternative generator
95 - # implemented by sorting a regular list.
96 -
97 - from bisect import insort
98 - from random import randrange
99 - from itertools import islice
100 - import datetime
101 -
102 - def running_median_slow(n, iterable):
103 - 'Slow running-median with O(n) updates where n is the window size'
104 - it = iter(iterable)
105 - queue = deque(islice(it, n))
106 - sortedlist = sorted(queue)
107 - midpoint = len(queue) // 2
108 - yield sortedlist[midpoint]
109 - for newelem in it:
110 - oldelem = queue.popleft()
111 - sortedlist.remove(oldelem)
112 - queue.append(newelem)
113 - insort(sortedlist, newelem)
114 - yield sortedlist[midpoint]
115 -
116 - M, N, window = 9000, 80000, 101
117 -
118 - data = [randrange(M) for i in range(N)]
119 - t1 = datetime.datetime.now()
120 - result = list(running_median(window, data))
121 - t2 = datetime.datetime.now()
122 - expected = list(running_median_slow(window, data))
123 - t3 = datetime.datetime.now()
124 - assert result == expected
125 - d1 = t2 - t1
126 - d2 = t3 - t2
127 - print result
128 - print expected
129 - print 'Fast median: %s; slow median: %s' % (d1, d2)
130 - print 'Successful test of RunningMedian() with', N,
131 - print 'items and a window of size', window, '\n'
Index: trunk/tools/editor_trends/statistics/dataset.py
@@ -1 +0,0 @@
2 -
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do
@@ -5,12 +5,17 @@
66 local projects "enwiki"
77 //local projects "enwiki ruwiki dewiki eswiki jawiki"
88 foreach proj of local projects {
 9+ clear
910 //di "`loc'"
1011 //di "`proj'"
11 - local p = "`source'" + "`proj'" + "_cohort_data_backward.txt"
 12+ //local p = "`source'" + "`proj'" + "_cohort_dataset_backward_bar.csv"
 13+ ren year experience
 14+ local p = "`source'" + "cohort_dataset_backward_bar.csv"
1215 //di "`p'"
1316 insheet using `p'
14 -
 17+ split date, p("-")
 18+ destring date1, replace
 19+ ren date1 year
1520 sort year
1621
1722 by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
@@ -84,7 +89,7 @@
8590 label var more_one_year_abs "Editors with more than one year experience"
8691
8792 twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
88 - local f = "`loc'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
 93+ local f = "`target'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
8994 graph export `f', replace
9095 //subtitle(Editors are getting older and influx of new editors has stagnated)
9196
@@ -99,7 +104,7 @@
100105 graph export `f', replace
101106
102107
103 - clear
 108+
104109 }
105110 set more on
106111
Index: trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do
@@ -1,12 +1,13 @@
22 clear
33 set more off
44 local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
5 -local projects "ruwiki dewiki eswiki jawiki enwiki"
6 -
 5+//local projects "ruwiki dewiki eswiki jawiki enwiki"
 6+local projects "enwiki"
77 foreach proj of local projects {
88 clear
99
10 - local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
 10+ //local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
 11+ local p = "`loc'" + "cohort_dataset_forward_histogram.csv"
1112 insheet using `p'
1213 label var experience "Number of months active"
1314 gen date = date(_time, "YMD")
Index: trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do
@@ -1,18 +1,21 @@
22 clear
33 set more off
44 local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
5 -local projects "ruwiki dewiki eswiki jawiki enwiki"
6 -
 5+//local projects "ruwiki dewiki eswiki jawiki enwiki"
 6+local projects "enwiki"
77 foreach proj of local projects {
88 clear
9 - local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
 9+ //local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
 10+ local p = "`loc'" + "cohort_dataset_forward_histogram.csv"
1011 insheet using `p'
11 - ren v1 raw_date
12 - ren v2 experience
13 - ren v3 count
14 -
15 - gen date = date(raw_date, "MY")
16 - format date %td
 12+ ren date raw_date
 13+ ren month experience
 14+ //ren count count
 15+ split raw_date, p(" ")
 16+ drop raw_date
 17+ ren raw_date1 raw_date
 18+ gen date = date(raw_date, "YMD")
 19+ //format date %tC
1720
1821 egen min_year= min(year(date))
1922 egen max_year= max(year(date))
@@ -40,7 +43,7 @@
4144
4245 replace count = . if count ==0
4346
44 - forvalues year = `min_year'(1)`max_year' {
 47+ forvalues year = `min_year'(1)`max_year' {
4548 di `year'
4649 //local end_date = "1,31," + "`year'"
4750 //di `end_date'

Status & tagging log