Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -58,6 +58,9 @@ |
59 | 59 | for location in locations: |
60 | 60 | setattr(self, location, locations[location]) |
61 | 61 | |
| 62 | + def __str__(self): |
| 63 | + return 'Configurator' |
| 64 | + |
62 | 65 | def __iter__(self): |
63 | 66 | for item in self.__dict__: |
64 | 67 | yield item |
— | — | @@ -66,6 +69,9 @@ |
67 | 70 | def __init__(self): |
68 | 71 | self.t0 = datetime.datetime.now() |
69 | 72 | |
| 73 | + def __str__(self): |
| 74 | + return 'Timer started: %s' % self.t0 |
| 75 | + |
70 | 76 | def stop(self): |
71 | 77 | self.t1 = datetime.datetime.now() |
72 | 78 | |
— | — | @@ -167,16 +173,18 @@ |
168 | 174 | config['ignore'] = get_value(args, 'except') |
169 | 175 | config['clean'] = get_value(args, 'new') |
170 | 176 | |
| 177 | + config['project'] = project |
| 178 | + config['full_project'] = get_projectname(args) |
| 179 | + config['filename'] = generate_wikidump_filename(language_code, project, args) |
| 180 | + config['namespaces'] = get_namespaces(args) |
| 181 | + |
171 | 182 | config['dataset'] = os.path.join(settings.dataset_location, config['full_project']) |
| 183 | + config['charts'] = os.path.join(settings.chart_location, config['full_project']) |
172 | 184 | config['location'] = os.path.join(location, language_code, project) |
173 | 185 | config['txt'] = os.path.join(config['location'], 'txt') |
174 | 186 | config['sorted'] = os.path.join(config['location'], 'sorted') |
175 | 187 | |
176 | | - config['project'] = project |
177 | | - config['full_project'] = get_projectname(args) |
178 | | - config['filename'] = generate_wikidump_filename(language_code, project, args) |
179 | | - config['namespaces'] = get_namespaces(args) |
180 | | - config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset']] |
| 188 | + config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset'], config['charts']] |
181 | 189 | config['path'] = '/%s/latest/' % config['full_project'] |
182 | 190 | config['targets'] = targets.split(',') |
183 | 191 | |
— | — | @@ -211,7 +219,7 @@ |
212 | 220 | extension = utils.determine_file_extension(config.filename) |
213 | 221 | filemode = utils.determine_file_mode(extension) |
214 | 222 | log.log_to_mongo(config.full_project, 'download', timer, type='start') |
215 | | - task_queue = dump_downloader.create_list_dumpfiles('%s%s' % (settings.wp_dump_location, path), config.filename, extension) |
| 223 | + task_queue = dump_downloader.create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename, extension) |
216 | 224 | while True: |
217 | 225 | filename = task_queue.get(block=False) |
218 | 226 | if filename == None: |
— | — | @@ -297,8 +305,8 @@ |
298 | 306 | timer = Timer() |
299 | 307 | log.log_to_mongo(config.full_project, 'store', timer, type='start') |
300 | 308 | db.cleanup_database(config.project, logger) |
301 | | - write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.input, project=config.project, collection=config.collection) |
302 | | - store.launcher(config.input, config.project, config.collection) |
| 309 | + write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.sorted, project=config.full_project, collection=config.collection) |
| 310 | + store.launcher(config.sorted, config.full_project, config.collection) |
303 | 311 | timer.elapsed() |
304 | 312 | log.log_to_mongo(full_project, 'store', timer, type='finish') |
305 | 313 | |
— | — | @@ -321,6 +329,7 @@ |
322 | 330 | for target in config.targets: |
323 | 331 | write_message_to_log(logger, args, None, message=None, verb='Exporting', target=target, dbname=config.full_project, collection=config.collection) |
324 | 332 | target = datasets[target] |
| 333 | + print 'Dataset is created by: %s' % target |
325 | 334 | exporter.dataset_launcher(config.full_project, config.collection, target) |
326 | 335 | timer.elapsed() |
327 | 336 | log.log_to_mongo(config.full_project, 'export', timer, type='finish') |
Index: trunk/tools/editor_trends/analyses/count_editors.py |
— | — | @@ -70,6 +70,7 @@ |
71 | 71 | data = shaper.create_datacontainer('list') |
72 | 72 | elif unit == 'year_dict': |
73 | 73 | data = shaper.create_datacontainer('dict') |
| 74 | + data = shaper.add_years_to_datacontainer(data, 0) |
74 | 75 | else: |
75 | 76 | data = {} |
76 | 77 | |
— | — | @@ -82,6 +83,47 @@ |
83 | 84 | return data, prop |
84 | 85 | |
85 | 86 | |
| 87 | +def cohort_dataset_forward_histogram(data, editor, prop): |
| 88 | + if prop == None: |
| 89 | + final_year = datetime.datetime.now().year + 1 |
| 90 | + headers = ['year', 'edits'] |
| 91 | + prop = ChartProperties(headers, False, 'long') |
| 92 | + prop.final_year = final_year |
| 93 | + |
| 94 | + new_wikipedian = editor['new_wikipedian'] |
| 95 | + yearly_edits = editor['edits_by_year'] |
| 96 | + for year in xrange(new_wikipedian.year, prop.final_year): |
| 97 | + data[new_wikipedian.year].append(yearly_edits.get(str(year), 0)) |
| 98 | + return data, prop |
| 99 | + |
| 100 | + |
| 101 | +def cohort_dataset_forward_bar(data, editor, prop): |
| 102 | + if prop == None: |
| 103 | + final_year = datetime.datetime.now().year + 1 |
| 104 | + headers = ['experience'] + [y for y in xrange(2001, final_year)] |
| 105 | + prop = ChartProperties(headers, False, 'wide') |
| 106 | + |
| 107 | + prop.final_year = datetime.datetime.now().year + 1 |
| 108 | + prop.cutoff_value = 5 |
| 109 | + |
| 110 | + new_wikipedian = editor['new_wikipedian'] |
| 111 | + last_edit = editor['final_edit'] |
| 112 | + monthly_edits = editor['monthly_edits'] |
| 113 | + yearly_edits = editor['edits_by_year'] |
| 114 | + active = [] |
| 115 | + for year in xrange(new_wikipedian.year, prop.final_year): |
| 116 | + max_edits = max(monthly_edits.get(str(year), {0:0}).values()) |
| 117 | + if yearly_edits.get(str(year), 0) == 0 or max_edits < prop.cutoff_value: |
| 118 | + continue |
| 119 | + else: |
| 120 | + active.append(year) |
| 121 | + |
| 122 | + if active != []: |
| 123 | + year = max(active) |
| 124 | + data[new_wikipedian.year][year] += 1 |
| 125 | + return data, prop |
| 126 | + |
| 127 | + |
86 | 128 | def new_editor_count(data, editor, prop): |
87 | 129 | ''' |
88 | 130 | Summary: This function generates an overview of the number of |
— | — | @@ -90,7 +132,7 @@ |
91 | 133 | stats.download.org to make sure that we are using the same numbers. |
92 | 134 | ''' |
93 | 135 | if prop == None: |
94 | | - headers = ['time', 'count'] |
| 136 | + headers = ['year', 'month', 'count'] |
95 | 137 | prop = ChartProperties(headers, False, 'long') |
96 | 138 | new_wikipedian = editor['new_wikipedian'] |
97 | 139 | data[new_wikipedian.year][new_wikipedian.month] += 1 |
— | — | @@ -167,5 +209,6 @@ |
168 | 210 | |
169 | 211 | |
170 | 212 | if __name__ == '__main__': |
171 | | - generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list') |
| 213 | + generate_chart_data('enwiki', 'editors', cohort_dataset_forward_bar, unit='year_dict') |
| 214 | + #generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list') |
172 | 215 | #generate_chart_data('enwiki', 'editors', time_to_new_wikipedian, unit='year_list') |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -272,29 +272,68 @@ |
273 | 273 | if new_wikipedian.month not in data[new_wikipedian.year]: |
274 | 274 | data[new_wikipedian.year][new_wikipedian.month] = {} |
275 | 275 | for i, year in enumerate(xrange(new_wikipedian.year, final_year)): |
| 276 | + min_edits = min(obs['monthly_edits'].values()) |
| 277 | + if min_edits < 5: |
| 278 | + continue |
276 | 279 | months = edits.get(str(year), []) |
277 | | - if i == 0: |
278 | | - months = months.keys() |
279 | | - months = [int(m) for m in months] |
280 | | - months.sort() |
281 | | - months = months[new_wikipedian.month - 1:] |
282 | | - months = [str(m) for m in months] |
| 280 | +# if i == 0: |
| 281 | +# months = months.keys() |
| 282 | +# months = [int(m) for m in months] |
| 283 | +# months.sort() |
| 284 | +# months = months[new_wikipedian.month - 1:] |
| 285 | +# months = [str(m) for m in months] |
283 | 286 | for month in months: |
284 | 287 | experience = str(i * 12 + (int(month) - 1)) |
285 | 288 | if experience not in data[new_wikipedian.year][new_wikipedian.month]: |
286 | 289 | data[new_wikipedian.year][new_wikipedian.month][experience] = 0 |
287 | 290 | data[new_wikipedian.year][new_wikipedian.month][experience] += 1 if edits[str(year)][month] > 0 else 0 |
288 | 291 | |
289 | | - fh = utils.create_txt_filehandle(settings.dataset_location, '%s_cohort_data_forward.csv' % (dbname), 'w', settings.encoding) |
| 292 | + filename = 'cohort_data_forward.bin' |
| 293 | + print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename)) |
| 294 | + utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename)) |
| 295 | + cohort_charts.prepare_cohort_dataset(dbname, filename) |
| 296 | + |
| 297 | + filename = 'cohort_data_forward_histogram.csv' |
| 298 | + fh = utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', settings.encoding) |
290 | 299 | for year in data: |
291 | 300 | for month in data[year]: |
292 | 301 | obs = data[year][month].keys() |
293 | 302 | obs.sort() |
294 | 303 | for o in obs: |
295 | | - utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, newline=True) |
| 304 | + utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, format='long') |
296 | 305 | fh.close() |
297 | 306 | |
| 307 | +def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs): |
| 308 | + mongo = db.init_mongo_db(dbname) |
| 309 | + editors = mongo[collection + '_dataset'] |
| 310 | + windows = create_windows(break_down_first_year=False) |
| 311 | + data = shaper.create_datacontainer('dict') |
| 312 | + data = shaper.add_windows_to_datacontainer(data, windows) |
298 | 313 | |
| 314 | + while True: |
| 315 | + id = tasks.get(block=False) |
| 316 | + tasks.task_done() |
| 317 | + if id == None: |
| 318 | + break |
| 319 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1}) |
| 320 | + first_edit = obs['first_edit'] |
| 321 | + for year in xrange(2001, datetime.datetime.now().year + 1): |
| 322 | + year = str(year) |
| 323 | + if obs['edits_by_year'][year] > 0: |
| 324 | + last_edit = obs['last_edit_by_year'][year] |
| 325 | + editor_dt = relativedelta(last_edit, first_edit) |
| 326 | + editor_dt = (editor_dt.years * 12) + editor_dt.months |
| 327 | + for w in windows: |
| 328 | + if w >= editor_dt: |
| 329 | + data[int(year)][w] += 1 |
| 330 | + break |
| 331 | + filename = 'cohort_data_backward.bin' |
| 332 | + print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename)) |
| 333 | + utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename)) |
| 334 | + cohort_charts.prepare_cohort_dataset(dbname, filename) |
| 335 | + |
| 336 | + |
| 337 | + |
299 | 338 | def generate_cohort_dataset_backward_custom(tasks, dbname, collection): |
300 | 339 | mongo = db.init_mongo_db(dbname) |
301 | 340 | editors = mongo[collection + '_dataset'] |
— | — | @@ -326,36 +365,8 @@ |
327 | 366 | |
328 | 367 | |
329 | 368 | |
330 | | -def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs): |
331 | | - mongo = db.init_mongo_db(dbname) |
332 | | - editors = mongo[collection + '_dataset'] |
333 | | - windows = create_windows(break_down_first_year=False) |
334 | | - data = shaper.create_datacontainer('dict') |
335 | | - data = shaper.add_windows_to_datacontainer(data, windows) |
336 | 369 | |
337 | | - while True: |
338 | | - id = tasks.get(block=False) |
339 | | - tasks.task_done() |
340 | | - if id == None: |
341 | | - break |
342 | | - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1}) |
343 | | - first_edit = obs['first_edit'] |
344 | | - for year in xrange(2001, datetime.datetime.now().year + 1): |
345 | | - year = str(year) |
346 | | - if obs['edits_by_year'][year] > 0: |
347 | | - last_edit = obs['last_edit_by_year'][year] |
348 | | - editor_dt = relativedelta(last_edit, first_edit) |
349 | | - editor_dt = (editor_dt.years * 12) + editor_dt.months |
350 | | - for w in windows: |
351 | | - if w >= editor_dt: |
352 | | - data[int(year)][w] += 1 |
353 | | - break |
354 | | - filename = 'cohort_data_backward.bin' |
355 | | - print 'Storing data as %s' % os.path.join(settings.binary_location, '%s%s' % (dbname, filename)) |
356 | | - utils.store_object(data, settings.binary_location, '%s%s' % (dbname, filename)) |
357 | | - cohort_charts.prepare_cohort_dataset(dbname, filename) |
358 | 370 | |
359 | | - |
360 | 371 | def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs): |
361 | 372 | mongo = db.init_mongo_db(dbname) |
362 | 373 | editors = mongo[collection + '_dataset'] |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -31,6 +31,8 @@ |
32 | 32 | |
33 | 33 | |
34 | 34 | def store_editors(tasks, dbname, collection, input): |
| 35 | + mongo = db.init_mongo_db(dbname) |
| 36 | + collection = mongo[collection] |
35 | 37 | editor_cache = cache.EditorCache(collection) |
36 | 38 | prev_contributor = -1 |
37 | 39 | edits = 0 |
— | — | @@ -44,6 +46,7 @@ |
45 | 47 | |
46 | 48 | fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
47 | 49 | for line in utils.readline(fh): |
| 50 | + print line |
48 | 51 | if len(line) == 0: |
49 | 52 | continue |
50 | 53 | contributor = line[0] |
— | — | @@ -68,15 +71,21 @@ |
69 | 72 | |
70 | 73 | |
71 | 74 | def launcher(input, dbname, collection): |
| 75 | + hack = True |
72 | 76 | mongo = db.init_mongo_db(dbname) |
73 | | - collection = mongo[collection] |
74 | | - collection.ensure_index('editor') |
75 | | - collection.create_index('editor') |
76 | | - files = utils.retrieve_file_list(input, 'csv') |
| 77 | + coll = mongo[collection] |
| 78 | + coll.ensure_index('editor') |
| 79 | + coll.create_index('editor') |
| 80 | + |
| 81 | + if hack: |
| 82 | + input = 'C:\wikimedia\en\wiki\dbready' |
| 83 | + files = utils.retrieve_file_list(input, 'txt') |
| 84 | + else: |
| 85 | + files = utils.retrieve_file_list(input, 'csv') |
77 | 86 | print files |
78 | 87 | print input |
79 | 88 | tasks = multiprocessing.JoinableQueue() |
80 | | - consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(settings.number_of_processes)] |
| 89 | + consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(1)] |
81 | 90 | for file in files: |
82 | 91 | tasks.put(file) |
83 | 92 | |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -298,9 +298,10 @@ |
299 | 299 | ''' |
300 | 300 | try: |
301 | 301 | return int(id) % settings.max_filehandles |
302 | | - except: |
| 302 | + except ValueError: |
303 | 303 | return sum([ord(i) for i in id]) % settings.max_filehandles |
304 | 304 | |
| 305 | + |
305 | 306 | if __name__ == '__main__': |
306 | 307 | project = 'wiki' |
307 | 308 | language_code = 'en' |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -68,7 +68,15 @@ |
69 | 69 | |
70 | 70 | return datacontainer |
71 | 71 | |
| 72 | +def add_years_to_datacontainer(datacontainer, datatype): |
| 73 | + final_year = datetime.datetime.now().year + 1 |
| 74 | + for dc in datacontainer: |
| 75 | + datacontainer[dc] = {} |
| 76 | + for x in range(2001, final_year): |
| 77 | + datacontainer[dc][x] = datatype |
| 78 | + return datacontainer |
72 | 79 | |
| 80 | + |
73 | 81 | def get_standard_deviation(numberList): |
74 | 82 | mean = get_mean(numberList) |
75 | 83 | std = 0 |
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do |
— | — | @@ -1,123 +0,0 @@ |
2 | | -clear |
3 | | -set more off |
4 | | -local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\" |
5 | | -//local projects "enwiki" |
6 | | -local projects "enwiki ruwiki dewiki eswiki jawiki" |
7 | | -foreach proj of local projects { |
8 | | - //di "`loc'" |
9 | | - //di "`proj'" |
10 | | - local p = "`loc'" + "`proj'" + "_cohort_data.txt" |
11 | | - //di "`p'" |
12 | | - insheet using `p' |
13 | | - |
14 | | - sort year |
15 | | - |
16 | | - by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108 |
17 | | - by year: egen obs = sum(n) |
18 | | - by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100 |
19 | | - |
20 | | - sum(obs) |
21 | | - return list |
22 | | - local obs = r(max) |
23 | | - |
24 | | - di "`n'" |
25 | | - |
26 | | - |
27 | | - gen months_3_rel = (months_3 / n) * 100 |
28 | | - gen months_6_rel = (months_6 / n) * 100 |
29 | | - gen months_9_rel = (months_9 / n) * 100 |
30 | | - gen months_12_rel = (months_12 / n) * 100 |
31 | | - gen months_24_rel = (months_24 / n) * 100 |
32 | | - gen months_36_rel = (months_36 / n) * 100 |
33 | | - gen months_48_rel = (months_48 / n) * 100 |
34 | | - gen months_60_rel = (months_60 / n) * 100 |
35 | | - gen months_72_rel = (months_72 / n) * 100 |
36 | | - gen months_84_rel = (months_84 / n) * 100 |
37 | | - gen months_96_rel = (months_96 / n) * 100 |
38 | | - gen months_108_rel = (months_108 / n) * 100 |
39 | | - //local values "3 6 9 12 24 36 48 60 72 84 96 108" |
40 | | - //foreach value of local values { |
41 | | - // local new_var = "months_" + "`value'" + "_rel" |
42 | | - // local var = "months_" + "`value'" |
43 | | - // generate `new_var' = `var' / |
44 | | - //} |
45 | | - |
46 | | - label var months_3 "3 months" |
47 | | - label var months_6 "6 months" |
48 | | - label var months_9 "9 months" |
49 | | - label var months_12 "1 year" |
50 | | - label var months_24 "2 years" |
51 | | - label var months_36 "3 years" |
52 | | - label var months_48 "4 years" |
53 | | - label var months_60 "5 years" |
54 | | - label var months_72 "6 years" |
55 | | - label var months_84 "7 years" |
56 | | - label var months_96 "8 years" |
57 | | - label var months_108 "9 years" |
58 | | - |
59 | | - label var months_3_rel "3 months" |
60 | | - label var months_6_rel "6 months" |
61 | | - label var months_9_rel "9 months" |
62 | | - label var months_12_rel "1 year" |
63 | | - label var months_24_rel "2 years" |
64 | | - label var months_36_rel "3 years" |
65 | | - label var months_48_rel "4 years" |
66 | | - label var months_60_rel "5 years" |
67 | | - label var months_72_rel "6 years" |
68 | | - label var months_84_rel "7 years" |
69 | | - label var months_96_rel "8 years" |
70 | | - label var months_108_rel "9 years" |
71 | | - |
72 | | - |
73 | | - |
74 | | - //drop if(year==2010) |
75 | | - generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12 |
76 | | - generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108 |
77 | | - label var fewer_one_year_abs "Editors with less than one year experience" |
78 | | - label var more_one_year_abs "Editors with more than one year experience" |
79 | | - |
80 | | - twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall)) |
81 | | - local f = "`loc'" + "`proj'" + "_line_rel_one_vs_multi_years.png" |
82 | | - graph export `f', replace |
83 | | - //subtitle(Editors are getting older and influx of new editors has stagnated) |
84 | | - |
85 | | - |
86 | | - graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall)) |
87 | | - local f = "`loc'" + "`proj'" + "_bar_abs_one_vs_multi_years.png" |
88 | | - graph export `f', replace |
89 | | - |
90 | | - graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
91 | | - local f = "`loc'" + "`proj'" + "_bar_cohort.png" |
92 | | - graph export `f', replace |
93 | | - |
94 | | - |
95 | | - clear |
96 | | -} |
97 | | -set more on |
98 | | - |
99 | | -//label var months_3 "3 Months" |
100 | | -//label var months_6 "6 Months" |
101 | | -//label var months_9 "9 Months" |
102 | | -//label var months_12 "1 Year" |
103 | | -//label var months_24 "2 Years" |
104 | | -//label var months_36 "3 Years" |
105 | | -//label var months_48 "4 Years" |
106 | | -//label var months_60 "5 Years" |
107 | | -//label var months_72 "6 Years" |
108 | | -//label var months_84 "7 Years" |
109 | | -//label var months_96 "8 Years" |
110 | | -//label var months_108 "9 Years" |
111 | | -//generate one_year_exp = months_3+ months_6+ months_9+ months_12 |
112 | | - |
113 | | -//generate fewer_one_year_abs = (one_year_exp/100) * n |
114 | | -//generate more_one_year_abs = n - fewer_one_year_abs |
115 | | -//label var fewer_one_year_abs "Editors with less than one year experience" |
116 | | -//label var more_one_year_abs "Editors with more than one year experience" |
117 | | - |
118 | | -//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
119 | | - |
120 | | -//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall)) |
121 | | - |
122 | | - |
123 | | -//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1)) |
124 | | - |
Index: trunk/tools/editor_trends/statistics/stata/wiki.do |
— | — | @@ -1,68 +0,0 @@ |
2 | | -insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv" |
3 | | -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit" |
4 | | - |
5 | | -foreach edit of local first_ten { |
6 | | - gen date2 = date(`edit', "YMDhms") |
7 | | - drop `edit' |
8 | | - rename date2 `edit' |
9 | | - format `edit' %td |
10 | | -} |
11 | | - |
12 | | -generate year_left = year(final_edit) |
13 | | -generate year_joined = year(first_edit) |
14 | | -sort year_joined |
15 | | -by year_joined: gen community_size_t = _N |
16 | | - |
17 | | - |
18 | | -forvalues year = 1(1)10{ |
19 | | - gen active200`year' = 0 |
20 | | - replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit)) |
21 | | - egen community_size_200`year' = total(active200`year') |
22 | | -} |
23 | | - |
24 | | -forvalues t = 1(1)10{ |
25 | | - local t1 = `t'+1 |
26 | | - gen retention200`t' = community_size_200`t1' / community_size_200`t' |
27 | | -} |
28 | | - |
29 | | -generate time_to_new_wp = edits_10 - edits_1 |
30 | | -generate active_time_wp = final_edit - edits_10 |
31 | | -label time_to_new_wp "Number of days it took to become a new wikipedian" |
32 | | -label active_time_wp "Number of days active once becoming a new wikipedian" |
33 | | - |
34 | | - |
35 | | - |
36 | | -compress |
37 | | - |
38 | | -graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall)) |
39 | | -histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined) |
40 | | -graph box time_to_new_wp, over(year_joined) nooutsides |
41 | | -glcurve edit_count, by( year_joined) split lorenz |
42 | | - |
43 | | - |
44 | | - |
45 | | -insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv" |
46 | | -// 0 = False |
47 | | -// 1 = True |
48 | | - |
49 | | -rename v1 id |
50 | | -rename v2 date |
51 | | -format date2 %td |
52 | | -gen date2 = date(date, "MD20Y") |
53 | | -sort id |
54 | | -by id: generate n = _n |
55 | | -by id: egen first_obs = min(date2) |
56 | | -by id: egen last_obs = max(date2) |
57 | | -by id: generate time_required = last_obs - first_obs |
58 | | -by id: generate year= year(last_obs) |
59 | | - |
60 | | -gen made_ten_edits =0 |
61 | | -by id: egen temp = max(n) |
62 | | -by id: replace made_ten_edits=1 if(temp==10) |
63 | | -drop temp |
64 | | - |
65 | | - |
66 | | - |
67 | | -by year, sort: egen time_to_new_wikipedian = mean( time_required) |
68 | | - |
69 | | -compress |
Index: trunk/tools/editor_trends/statistics/stata/combined_line_chart_experience.do |
— | — | @@ -0,0 +1,14 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 6 | +
|
| 7 | +sort year
|
| 8 | +by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 9 | +by year: generate one_year_exp = ((months_12) / n) * 100
|
| 10 | +
|
| 11 | +twoway (line one_year_exp year if project=="enwiki") (line one_year_exp year if project=="ruwiki") (line one_year_exp year if project=="eswiki") (line one_year_exp year if project=="jawiki") (line one_year_exp year if project=="frwiki") (line one_year_exp year if project=="dewiki"), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) legend(order(1 "Enwiki" 2 "Ruwiki" 3 "Eswiki" 4 "Jawiki" 5 "Frwiki" 6 "Dewiki"))
|
| 12 | +//twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
|
| 13 | +local f = "`target'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
|
| 14 | +graph export `f', replace
|
| 15 | +
|
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do |
— | — | @@ -0,0 +1,131 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 6 | +local projects "enwiki"
|
| 7 | +//local projects "enwiki ruwiki dewiki eswiki jawiki"
|
| 8 | +foreach proj of local projects {
|
| 9 | + //di "`loc'"
|
| 10 | + //di "`proj'"
|
| 11 | + local p = "`source'" + "`proj'" + "_cohort_data_backward.txt"
|
| 12 | + //di "`p'"
|
| 13 | + insheet using `p'
|
| 14 | +
|
| 15 | + sort year
|
| 16 | +
|
| 17 | + by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 18 | + //by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 19 | + by year: egen obs = sum(n)
|
| 20 | + by year: generate one_year_exp = ((months_12) / n) * 100
|
| 21 | + //by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100
|
| 22 | +
|
| 23 | + sum(obs)
|
| 24 | + return list
|
| 25 | + local obs = r(max)
|
| 26 | +
|
| 27 | + di "`n'"
|
| 28 | +
|
| 29 | +
|
| 30 | + //gen months_3_rel = (months_3 / n) * 100
|
| 31 | + //gen months_6_rel = (months_6 / n) * 100
|
| 32 | + //gen months_9_rel = (months_9 / n) * 100
|
| 33 | + gen months_12_rel = (months_12 / n) * 100
|
| 34 | + gen months_24_rel = (months_24 / n) * 100
|
| 35 | + gen months_36_rel = (months_36 / n) * 100
|
| 36 | + gen months_48_rel = (months_48 / n) * 100
|
| 37 | + gen months_60_rel = (months_60 / n) * 100
|
| 38 | + gen months_72_rel = (months_72 / n) * 100
|
| 39 | + gen months_84_rel = (months_84 / n) * 100
|
| 40 | + gen months_96_rel = (months_96 / n) * 100
|
| 41 | + gen months_108_rel = (months_108 / n) * 100
|
| 42 | + //local values "3 6 9 12 24 36 48 60 72 84 96 108"
|
| 43 | + //foreach value of local values {
|
| 44 | + // local new_var = "months_" + "`value'" + "_rel"
|
| 45 | + // local var = "months_" + "`value'"
|
| 46 | + // generate `new_var' = `var' /
|
| 47 | + //}
|
| 48 | +
|
| 49 | + //label var months_3 "3 months"
|
| 50 | + //label var months_6 "6 months"
|
| 51 | + //label var months_9 "9 months"
|
| 52 | + label var months_12 "1 year"
|
| 53 | + label var months_24 "2 years"
|
| 54 | + label var months_36 "3 years"
|
| 55 | + label var months_48 "4 years"
|
| 56 | + label var months_60 "5 years"
|
| 57 | + label var months_72 "6 years"
|
| 58 | + label var months_84 "7 years"
|
| 59 | + label var months_96 "8 years"
|
| 60 | + label var months_108 "9 years"
|
| 61 | +
|
| 62 | + //label var months_3_rel "3 months"
|
| 63 | + //label var months_6_rel "6 months"
|
| 64 | + //label var months_9_rel "9 months"
|
| 65 | + label var months_12_rel "1 year"
|
| 66 | + label var months_24_rel "2 years"
|
| 67 | + label var months_36_rel "3 years"
|
| 68 | + label var months_48_rel "4 years"
|
| 69 | + label var months_60_rel "5 years"
|
| 70 | + label var months_72_rel "6 years"
|
| 71 | + label var months_84_rel "7 years"
|
| 72 | + label var months_96_rel "8 years"
|
| 73 | + label var months_108_rel "9 years"
|
| 74 | +
|
| 75 | +
|
| 76 | + local obs = "."
|
| 77 | + drop if(year==2011)
|
| 78 | + drop if(year==2012)
|
| 79 | +
|
| 80 | + generate fewer_one_year_abs = months_12
|
| 81 | + //generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12
|
| 82 | +
|
| 83 | + generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 84 | + label var fewer_one_year_abs "Editors with less than one year experience"
|
| 85 | + label var more_one_year_abs "Editors with more than one year experience"
|
| 86 | +
|
| 87 | + twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
|
| 88 | + local f = "`loc'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
|
| 89 | + graph export `f', replace
|
| 90 | + //subtitle(Editors are getting older and influx of new editors has stagnated)
|
| 91 | +
|
| 92 | +
|
| 93 | + graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
|
| 94 | + local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_abs_one_vs_multi_years.png"
|
| 95 | + graph export `f', replace
|
| 96 | +
|
| 97 | + graph bar (asis) months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 98 | + //graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 99 | + local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_cohort.png"
|
| 100 | + graph export `f', replace
|
| 101 | +
|
| 102 | +
|
| 103 | + clear
|
| 104 | +}
|
| 105 | +set more on
|
| 106 | +
|
| 107 | +//label var months_3 "3 Months"
|
| 108 | +//label var months_6 "6 Months"
|
| 109 | +//label var months_9 "9 Months"
|
| 110 | +//label var months_12 "1 Year"
|
| 111 | +//label var months_24 "2 Years"
|
| 112 | +//label var months_36 "3 Years"
|
| 113 | +//label var months_48 "4 Years"
|
| 114 | +//label var months_60 "5 Years"
|
| 115 | +//label var months_72 "6 Years"
|
| 116 | +//label var months_84 "7 Years"
|
| 117 | +//label var months_96 "8 Years"
|
| 118 | +//label var months_108 "9 Years"
|
| 119 | +//generate one_year_exp = months_3+ months_6+ months_9+ months_12
|
| 120 | +
|
| 121 | +//generate fewer_one_year_abs = (one_year_exp/100) * n
|
| 122 | +//generate more_one_year_abs = n - fewer_one_year_abs
|
| 123 | +//label var fewer_one_year_abs "Editors with less than one year experience"
|
| 124 | +//label var more_one_year_abs "Editors with more than one year experience"
|
| 125 | +
|
| 126 | +//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 127 | +
|
| 128 | +//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
|
| 129 | +
|
| 130 | +
|
| 131 | +//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
|
| 132 | +
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian.do |
— | — | @@ -0,0 +1,14 @@ |
| 2 | +clear
|
| 3 | +local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 4 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_histogram_edits.csv"
|
| 5 | +
|
| 6 | +sum(year)
|
| 7 | +return list
|
| 8 | +local max_year = r(max)
|
| 9 | +local min_year = r(min)
|
| 10 | +
|
| 11 | +forvalues year = `min_year'(1)`max_year' {
|
| 12 | + histogram num_edits if year==`year', percent addlabel addlabopts(mlabsize(tiny) mlabangle(forty_five)) xtitle(Number of edits) title("Histogram Number of Edits" "New Wikipedians Made in `year'")
|
| 13 | + local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_edits.png"
|
| 14 | + graph export `f', replace
|
| 15 | +}
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local projects "ruwiki dewiki eswiki jawiki enwiki"
|
| 6 | +
|
| 7 | +foreach proj of local projects {
|
| 8 | + clear
|
| 9 | +
|
| 10 | + local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
|
| 11 | + insheet using `p'
|
| 12 | + label var experience "Number of months active"
|
| 13 | + gen date = date(_time, "YMD")
|
| 14 | + format date %td
|
| 15 | +
|
| 16 | + egen min_year= min(year(date))
|
| 17 | + egen max_year= max(year(date))
|
| 18 | + gen month = month(date)
|
| 19 | + gen day = day(date)
|
| 20 | +
|
| 21 | + sum(max_year)
|
| 22 | + return list
|
| 23 | + local max_year = r(max)
|
| 24 | +
|
| 25 | + sum(min_year)
|
| 26 | + return list
|
| 27 | + local min_year = r(min)
|
| 28 | +
|
| 29 | + gen first_year = 0
|
| 30 | + replace first_year =1 if year(date)==`min_year'
|
| 31 | +
|
| 32 | + sum(month) if first_year ==1
|
| 33 | + return list
|
| 34 | + local m = r(min)
|
| 35 | +
|
| 36 | + sum(day) if(first_year ==1 & month==`m')
|
| 37 | + return list
|
| 38 | + local d = r(min)
|
| 39 | +
|
| 40 | + di `min_year'
|
| 41 | + di `m'
|
| 42 | + di `d'
|
| 43 | +
|
| 44 | + forvalues year = `min_year'(1)`max_year' {
|
| 45 | + di `year'
|
| 46 | + //local end_date = "1,31," + "`year'"
|
| 47 | + //di `end_date'
|
| 48 | + //list date if date==mdy("`m'", "`d'", "`year'")
|
| 49 | + if mdy(`m', `d', `year') < mdy(`m',`d', `max_year') {
|
| 50 | + histogram experience if date==mdy(`m', `d', `year'), discrete percent ylabel(0(5)100, labsize(vsmall)) title("How long do editors stay who entered `m'/`year'?") subtitle("Project `proj'") note("Based on the `proj' project." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
|
| 51 | + local f = "`loc'" + "`proj'" + "_" + "`year'" + "_histogram_cohort_forward.png"
|
| 52 | + graph export `f', replace
|
| 53 | + }
|
| 54 | + }
|
| 55 | +}
|
| 56 | +
|
| 57 | +
|
| 58 | +set more on
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_time_to_new_wikipedian.do |
— | — | @@ -0,0 +1,18 @@ |
| 2 | +clear
|
| 3 | +local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 4 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_time_to_new_wikipedian.csv"
|
| 5 | +
|
| 6 | +//egen min_year = min(year)
|
| 7 | +//egen max_year = max(year)
|
| 8 | +
|
| 9 | +sum(year)
|
| 10 | +return list
|
| 11 | +local max_year = r(max)
|
| 12 | +local min_year = r(min)
|
| 13 | +
|
| 14 | +forvalues year = `min_year'(1)`max_year' {
|
| 15 | + histogram time_to_new_wikipedian if year==`year', discrete percent xtitle(Number of days) note("An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) title("Histogram Number of Days it Takes" "to Become a New Wikipedian in `year'")
|
| 16 | + local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_time_to_new_wikipedian.png"
|
| 17 | + graph export `f', replace
|
| 18 | +}
|
| 19 | +
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian_outdated.do |
— | — | @@ -0,0 +1,68 @@ |
| 2 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv"
|
| 3 | +local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit"
|
| 4 | +
|
| 5 | +foreach edit of local first_ten {
|
| 6 | + gen date2 = date(`edit', "YMDhms")
|
| 7 | + drop `edit'
|
| 8 | + rename date2 `edit'
|
| 9 | + format `edit' %td
|
| 10 | +}
|
| 11 | +
|
| 12 | +generate year_left = year(final_edit)
|
| 13 | +generate year_joined = year(first_edit)
|
| 14 | +sort year_joined
|
| 15 | +by year_joined: gen community_size_t = _N
|
| 16 | +
|
| 17 | +
|
| 18 | +forvalues year = 1(1)10{
|
| 19 | + gen active200`year' = 0
|
| 20 | + replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
|
| 21 | + egen community_size_200`year' = total(active200`year')
|
| 22 | +}
|
| 23 | +
|
| 24 | +forvalues t = 1(1)10{
|
| 25 | + local t1 = `t'+1
|
| 26 | + gen retention200`t' = community_size_200`t1' / community_size_200`t'
|
| 27 | +}
|
| 28 | +
|
| 29 | +generate time_to_new_wp = edits_10 - edits_1
|
| 30 | +generate active_time_wp = final_edit - edits_10
|
| 31 | +label time_to_new_wp "Number of days it took to become a new wikipedian"
|
| 32 | +label active_time_wp "Number of days active once becoming a new wikipedian"
|
| 33 | +
|
| 34 | +
|
| 35 | +
|
| 36 | +compress
|
| 37 | +
|
| 38 | +graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
|
| 39 | +histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
|
| 40 | +graph box time_to_new_wp, over(year_joined) nooutsides
|
| 41 | +glcurve edit_count, by( year_joined) split lorenz
|
| 42 | +
|
| 43 | +
|
| 44 | +
|
| 45 | +insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
|
| 46 | +// 0 = False
|
| 47 | +// 1 = True
|
| 48 | +
|
| 49 | +rename v1 id
|
| 50 | +rename v2 date
|
| 51 | +format date2 %td
|
| 52 | +gen date2 = date(date, "MD20Y")
|
| 53 | +sort id
|
| 54 | +by id: generate n = _n
|
| 55 | +by id: egen first_obs = min(date2)
|
| 56 | +by id: egen last_obs = max(date2)
|
| 57 | +by id: generate time_required = last_obs - first_obs
|
| 58 | +by id: generate year= year(last_obs)
|
| 59 | +
|
| 60 | +gen made_ten_edits =0
|
| 61 | +by id: egen temp = max(n)
|
| 62 | +by id: replace made_ten_edits=1 if(temp==10)
|
| 63 | +drop temp
|
| 64 | +
|
| 65 | +
|
| 66 | +
|
| 67 | +by year, sort: egen time_to_new_wikipedian = mean( time_required)
|
| 68 | +
|
| 69 | +compress
|
Index: trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local projects "ruwiki dewiki eswiki jawiki enwiki"
|
| 6 | +
|
| 7 | +foreach proj of local projects {
|
| 8 | + clear
|
| 9 | + local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
|
| 10 | + insheet using `p'
|
| 11 | + ren v1 raw_date
|
| 12 | + ren v2 experience
|
| 13 | + ren v3 count
|
| 14 | +
|
| 15 | + gen date = date(raw_date, "MY")
|
| 16 | + format date %td
|
| 17 | +
|
| 18 | + egen min_year= min(year(date))
|
| 19 | + egen max_year= max(year(date))
|
| 20 | + gen month = month(date)
|
| 21 | + gen day = day(date)
|
| 22 | +
|
| 23 | + sum(max_year)
|
| 24 | + return list
|
| 25 | + local max_year = r(max)
|
| 26 | +
|
| 27 | + sum(min_year)
|
| 28 | + return list
|
| 29 | + local min_year = r(min)
|
| 30 | +
|
| 31 | + gen first_year = 0
|
| 32 | + replace first_year =1 if year(date)==`min_year'
|
| 33 | +
|
| 34 | + sum(month) if first_year ==1
|
| 35 | + return list
|
| 36 | + local m = r(min)
|
| 37 | +
|
| 38 | + sum(day) if(first_year ==1 & month==`m')
|
| 39 | + return list
|
| 40 | + local d = r(min)
|
| 41 | +
|
| 42 | + replace count = . if count ==0
|
| 43 | +
|
| 44 | + forvalues year = `min_year'(1)`max_year' {
|
| 45 | + di `year'
|
| 46 | + //local end_date = "1,31," + "`year'"
|
| 47 | + //di `end_date'
|
| 48 | + //list date if date==mdy("`m'", "`d'", "`year'")
|
| 49 | +
|
| 50 | + if mdy(`m', 1, `year') < mdy(`m', 1, `max_year') {
|
| 51 | + twoway (line count experience if date==mdy(1,1,`year'), sort cmissing(n)), ytitle(Number of New Wikipedians) xtitle(Number of months active) xlabel(0(4)108, labsize(vsmall)) title("The number of New Wikipedians active who entered 1/`year'") subtitle("Project `proj'")
|
| 52 | +
|
| 53 | + local f = "`loc'" + "`proj'" + "_" + "`year'" + "_line_cohort_forward.png"
|
| 54 | + graph export `f', replace
|
| 55 | + }
|
| 56 | + }
|
| 57 | +
|
| 58 | +}
|
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -194,3 +194,4 @@ |
195 | 195 | 'data', 'objects') |
196 | 196 | self.namespace_location = os.path.join(self.working_directory, |
197 | 197 | 'namespaces') |
| 198 | + self.chart_location = os.path.join(self.working_directory, 'statistics', 'charts') |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -195,7 +195,7 @@ |
196 | 196 | return 'wb' |
197 | 197 | |
198 | 198 | |
199 | | -def write_list_to_csv(data, fh, recursive=False, newline=True): |
| 199 | +def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'): |
200 | 200 | ''' |
201 | 201 | @data is a list which can contain other lists that will be written as a |
202 | 202 | single line to a textfile |
— | — | @@ -218,7 +218,7 @@ |
219 | 219 | if len(d) == len(data[x]): |
220 | 220 | fh.write('\n') |
221 | 221 | elif type(d) == type({}): |
222 | | - tab = write_dict_to_csv(d, fh, write_key=False, newline=newline) |
| 222 | + tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format) |
223 | 223 | else: |
224 | 224 | fh.write('%s' % d) |
225 | 225 | tab = True |
— | — | @@ -245,8 +245,19 @@ |
246 | 246 | fh.write('%s\t%s\t%s\n' % (key, d, data[key][d])) |
247 | 247 | else: |
248 | 248 | fh.write('%s\n' % (data[key])) |
249 | | - else: |
250 | | - print 'not yet implemented' |
| 249 | + elif format == 'wide': |
| 250 | + for key in keys: |
| 251 | + if write_key: |
| 252 | + fh.write('%s\t' % key) |
| 253 | + if type(data[key]) == type([]): |
| 254 | + for d in data[key]: |
| 255 | + fh.write('%s\t') |
| 256 | + elif type(data[key]) == type({}): |
| 257 | + write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format) |
| 258 | + else: |
| 259 | + fh.write('%s\t' % (data[key])) |
| 260 | + fh.write('\n') |
| 261 | + |
251 | 262 | #if type(data[key]) == type([]): |
252 | 263 | # write_list_to_csv(data[key], fh, recursive=False, newline=True) |
253 | 264 | |
— | — | @@ -367,6 +378,14 @@ |
368 | 379 | return d |
369 | 380 | |
370 | 381 | |
| 382 | +def determine_canonical_name(filename): |
| 383 | + while filename.find('.') > -1: |
| 384 | + ext = determine_file_extension(filename) |
| 385 | + ext = '.%s' % ext |
| 386 | + filename = filename.replace(ext, '') |
| 387 | + return filename |
| 388 | + |
| 389 | + |
371 | 390 | def retrieve_file_list(location, extension, mask=None): |
372 | 391 | ''' |
373 | 392 | Retrieve a list of files from a specified location. |
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -29,35 +29,38 @@ |
30 | 30 | import utils |
31 | 31 | |
32 | 32 | |
33 | | -def create_list_dumpfiles(url, canonical_filename, ext): |
| 33 | +def create_list_dumpfiles(domain, path, filename, ext): |
34 | 34 | ''' |
35 | 35 | Wikipedia offers the option to download one dump file in separate batches. |
36 | 36 | This function determines how many files there are for a giving dump and puts |
37 | 37 | them in a queue. |
38 | 38 | ''' |
39 | 39 | task_queue = multiprocessing.JoinableQueue() |
| 40 | + canonical_filename = utils.determine_canonical_name(filename) |
40 | 41 | for x in xrange(1, 100): |
41 | | - f = '%s%s%s' % (canonical_filename, x, ext) |
42 | | - res = check_remote_file_exists(url, f) |
| 42 | + f = '%s%s.xml.%s' % (canonical_filename, x, ext) |
| 43 | + res = check_remote_file_exists(domain, path, f) |
43 | 44 | if res == None or res.status != 200: |
44 | 45 | break |
45 | 46 | else: |
46 | | - task_queue.add(f) |
| 47 | + print 'Added chunk to download: %s' % f |
| 48 | + task_queue.put(f) |
47 | 49 | for x in xrange(settings.number_of_processes): |
48 | | - task_queue.add(None) |
| 50 | + task_queue.put(None) |
49 | 51 | return task_queue |
50 | 52 | |
51 | 53 | |
52 | | -def check_remote_file_exists(url, filename): |
| 54 | +def check_remote_file_exists(domain, path, filename): |
53 | 55 | ''' |
54 | 56 | @url is the full path of the file to be downloaded |
55 | 57 | @filename is the name of the file to be downloaded |
56 | 58 | ''' |
57 | 59 | try: |
58 | | - if url.startswith('http://'): |
59 | | - url = url[7:] |
60 | | - conn = httplib.HTTPConnection(url) |
61 | | - conn.request('HEAD', filename) |
| 60 | + if domain.startswith('http://'): |
| 61 | + domain = domain[7:] |
| 62 | + conn = httplib.HTTPConnection(domain) |
| 63 | + url = '%s%s' % (path, filename) |
| 64 | + conn.request('HEAD', url) |
62 | 65 | res = conn.getresponse() |
63 | 66 | conn.close() |
64 | 67 | return res |