Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -58,6 +58,9 @@ |
59 | 59 | for location in locations: |
60 | 60 | setattr(self, location, locations[location]) |
61 | 61 | |
| 62 | + def __str__(self): |
| 63 | + return 'Configurator' |
| 64 | + |
62 | 65 | def __iter__(self): |
63 | 66 | for item in self.__dict__: |
64 | 67 | yield item |
— | — | @@ -66,6 +69,9 @@ |
67 | 70 | def __init__(self): |
68 | 71 | self.t0 = datetime.datetime.now() |
69 | 72 | |
| 73 | + def __str__(self): |
| 74 | + return 'Timer started: %s' % self.t0 |
| 75 | + |
70 | 76 | def stop(self): |
71 | 77 | self.t1 = datetime.datetime.now() |
72 | 78 | |
— | — | @@ -167,16 +173,18 @@ |
168 | 174 | config['ignore'] = get_value(args, 'except') |
169 | 175 | config['clean'] = get_value(args, 'new') |
170 | 176 | |
| 177 | + config['project'] = project |
| 178 | + config['full_project'] = get_projectname(args) |
| 179 | + config['filename'] = generate_wikidump_filename(language_code, project, args) |
| 180 | + config['namespaces'] = get_namespaces(args) |
| 181 | + |
171 | 182 | config['dataset'] = os.path.join(settings.dataset_location, config['full_project']) |
| 183 | + config['charts'] = os.path.join(settings.chart_location, config['full_project']) |
172 | 184 | config['location'] = os.path.join(location, language_code, project) |
173 | 185 | config['txt'] = os.path.join(config['location'], 'txt') |
174 | 186 | config['sorted'] = os.path.join(config['location'], 'sorted') |
175 | 187 | |
176 | | - config['project'] = project |
177 | | - config['full_project'] = get_projectname(args) |
178 | | - config['filename'] = generate_wikidump_filename(language_code, project, args) |
179 | | - config['namespaces'] = get_namespaces(args) |
180 | | - config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset']] |
| 188 | + config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset'], config['charts']] |
181 | 189 | config['path'] = '/%s/latest/' % config['full_project'] |
182 | 190 | config['targets'] = targets.split(',') |
183 | 191 | |
— | — | @@ -211,7 +219,7 @@ |
212 | 220 | extension = utils.determine_file_extension(config.filename) |
213 | 221 | filemode = utils.determine_file_mode(extension) |
214 | 222 | log.log_to_mongo(config.full_project, 'download', timer, type='start') |
215 | | - task_queue = dump_downloader.create_list_dumpfiles('%s%s' % (settings.wp_dump_location, path), config.filename, extension) |
| 223 | + task_queue = dump_downloader.create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename, extension) |
216 | 224 | while True: |
217 | 225 | filename = task_queue.get(block=False) |
218 | 226 | if filename == None: |
— | — | @@ -297,8 +305,8 @@ |
298 | 306 | timer = Timer() |
299 | 307 | log.log_to_mongo(config.full_project, 'store', timer, type='start') |
300 | 308 | db.cleanup_database(config.project, logger) |
301 | | - write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.input, project=config.project, collection=config.collection) |
302 | | - store.launcher(config.input, config.project, config.collection) |
| 309 | + write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.sorted, project=config.full_project, collection=config.collection) |
| 310 | + store.launcher(config.sorted, config.full_project, config.collection) |
303 | 311 | timer.elapsed() |
304 | 312 | log.log_to_mongo(full_project, 'store', timer, type='finish') |
305 | 313 | |
— | — | @@ -321,6 +329,7 @@ |
322 | 330 | for target in config.targets: |
323 | 331 | write_message_to_log(logger, args, None, message=None, verb='Exporting', target=target, dbname=config.full_project, collection=config.collection) |
324 | 332 | target = datasets[target] |
| 333 | + print 'Dataset is created by: %s' % target |
325 | 334 | exporter.dataset_launcher(config.full_project, config.collection, target) |
326 | 335 | timer.elapsed() |
327 | 336 | log.log_to_mongo(config.full_project, 'export', timer, type='finish') |
Index: trunk/tools/editor_trends/analyses/count_editors.py |
— | — | @@ -70,6 +70,7 @@ |
71 | 71 | data = shaper.create_datacontainer('list') |
72 | 72 | elif unit == 'year_dict': |
73 | 73 | data = shaper.create_datacontainer('dict') |
| 74 | + data = shaper.add_years_to_datacontainer(data, 0) |
74 | 75 | else: |
75 | 76 | data = {} |
76 | 77 | |
— | — | @@ -82,6 +83,47 @@ |
83 | 84 | return data, prop |
84 | 85 | |
85 | 86 | |
| 87 | +def cohort_dataset_forward_histogram(data, editor, prop): |
| 88 | + if prop == None: |
| 89 | + final_year = datetime.datetime.now().year + 1 |
| 90 | + headers = ['year', 'edits'] |
| 91 | + prop = ChartProperties(headers, False, 'long') |
| 92 | + prop.final_year = final_year |
| 93 | + |
| 94 | + new_wikipedian = editor['new_wikipedian'] |
| 95 | + yearly_edits = editor['edits_by_year'] |
| 96 | + for year in xrange(new_wikipedian.year, prop.final_year): |
| 97 | + data[new_wikipedian.year].append(yearly_edits.get(str(year), 0)) |
| 98 | + return data, prop |
| 99 | + |
| 100 | + |
| 101 | +def cohort_dataset_forward_bar(data, editor, prop): |
| 102 | + if prop == None: |
| 103 | + final_year = datetime.datetime.now().year + 1 |
| 104 | + headers = ['experience'] + [y for y in xrange(2001, final_year)] |
| 105 | + prop = ChartProperties(headers, False, 'wide') |
| 106 | + |
| 107 | + prop.final_year = datetime.datetime.now().year + 1 |
| 108 | + prop.cutoff_value = 5 |
| 109 | + |
| 110 | + new_wikipedian = editor['new_wikipedian'] |
| 111 | + last_edit = editor['final_edit'] |
| 112 | + monthly_edits = editor['monthly_edits'] |
| 113 | + yearly_edits = editor['edits_by_year'] |
| 114 | + active = [] |
| 115 | + for year in xrange(new_wikipedian.year, prop.final_year): |
| 116 | + max_edits = max(monthly_edits.get(str(year), {0:0}).values()) |
| 117 | + if yearly_edits.get(str(year), 0) == 0 or max_edits < prop.cutoff_value: |
| 118 | + continue |
| 119 | + else: |
| 120 | + active.append(year) |
| 121 | + |
| 122 | + if active != []: |
| 123 | + year = max(active) |
| 124 | + data[new_wikipedian.year][year] += 1 |
| 125 | + return data, prop |
| 126 | + |
| 127 | + |
86 | 128 | def new_editor_count(data, editor, prop): |
87 | 129 | ''' |
88 | 130 | Summary: This function generates an overview of the number of |
— | — | @@ -90,7 +132,7 @@ |
91 | 133 | stats.download.org to make sure that we are using the same numbers. |
92 | 134 | ''' |
93 | 135 | if prop == None: |
94 | | - headers = ['time', 'count'] |
| 136 | + headers = ['year', 'month', 'count'] |
95 | 137 | prop = ChartProperties(headers, False, 'long') |
96 | 138 | new_wikipedian = editor['new_wikipedian'] |
97 | 139 | data[new_wikipedian.year][new_wikipedian.month] += 1 |
— | — | @@ -167,5 +209,6 @@ |
168 | 210 | |
169 | 211 | |
170 | 212 | if __name__ == '__main__': |
171 | | - generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list') |
| 213 | + generate_chart_data('enwiki', 'editors', cohort_dataset_forward_bar, unit='year_dict') |
| 214 | + #generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list') |
172 | 215 | #generate_chart_data('enwiki', 'editors', time_to_new_wikipedian, unit='year_list') |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -272,29 +272,68 @@ |
273 | 273 | if new_wikipedian.month not in data[new_wikipedian.year]: |
274 | 274 | data[new_wikipedian.year][new_wikipedian.month] = {} |
275 | 275 | for i, year in enumerate(xrange(new_wikipedian.year, final_year)): |
| 276 | + min_edits = min(obs['monthly_edits'].values()) |
| 277 | + if min_edits < 5: |
| 278 | + continue |
276 | 279 | months = edits.get(str(year), []) |
277 | | - if i == 0: |
278 | | - months = months.keys() |
279 | | - months = [int(m) for m in months] |
280 | | - months.sort() |
281 | | - months = months[new_wikipedian.month - 1:] |
282 | | - months = [str(m) for m in months] |
| 280 | +# if i == 0: |
| 281 | +# months = months.keys() |
| 282 | +# months = [int(m) for m in months] |
| 283 | +# months.sort() |
| 284 | +# months = months[new_wikipedian.month - 1:] |
| 285 | +# months = [str(m) for m in months] |
283 | 286 | for month in months: |
284 | 287 | experience = str(i * 12 + (int(month) - 1)) |
285 | 288 | if experience not in data[new_wikipedian.year][new_wikipedian.month]: |
286 | 289 | data[new_wikipedian.year][new_wikipedian.month][experience] = 0 |
287 | 290 | data[new_wikipedian.year][new_wikipedian.month][experience] += 1 if edits[str(year)][month] > 0 else 0 |
288 | 291 | |
289 | | - fh = utils.create_txt_filehandle(settings.dataset_location, '%s_cohort_data_forward.csv' % (dbname), 'w', settings.encoding) |
| 292 | + filename = 'cohort_data_forward.bin' |
| 293 | + print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename)) |
| 294 | + utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename)) |
| 295 | + cohort_charts.prepare_cohort_dataset(dbname, filename) |
| 296 | + |
| 297 | + filename = 'cohort_data_forward_histogram.csv' |
| 298 | + fh = utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', settings.encoding) |
290 | 299 | for year in data: |
291 | 300 | for month in data[year]: |
292 | 301 | obs = data[year][month].keys() |
293 | 302 | obs.sort() |
294 | 303 | for o in obs: |
295 | | - utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, newline=True) |
| 304 | + utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, format='long') |
296 | 305 | fh.close() |
297 | 306 | |
| 307 | +def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs): |
| 308 | + mongo = db.init_mongo_db(dbname) |
| 309 | + editors = mongo[collection + '_dataset'] |
| 310 | + windows = create_windows(break_down_first_year=False) |
| 311 | + data = shaper.create_datacontainer('dict') |
| 312 | + data = shaper.add_windows_to_datacontainer(data, windows) |
298 | 313 | |
| 314 | + while True: |
| 315 | + id = tasks.get(block=False) |
| 316 | + tasks.task_done() |
| 317 | + if id == None: |
| 318 | + break |
| 319 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1}) |
| 320 | + first_edit = obs['first_edit'] |
| 321 | + for year in xrange(2001, datetime.datetime.now().year + 1): |
| 322 | + year = str(year) |
| 323 | + if obs['edits_by_year'][year] > 0: |
| 324 | + last_edit = obs['last_edit_by_year'][year] |
| 325 | + editor_dt = relativedelta(last_edit, first_edit) |
| 326 | + editor_dt = (editor_dt.years * 12) + editor_dt.months |
| 327 | + for w in windows: |
| 328 | + if w >= editor_dt: |
| 329 | + data[int(year)][w] += 1 |
| 330 | + break |
| 331 | + filename = 'cohort_data_backward.bin' |
| 332 | + print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename)) |
| 333 | + utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename)) |
| 334 | + cohort_charts.prepare_cohort_dataset(dbname, filename) |
| 335 | + |
| 336 | + |
| 337 | + |
299 | 338 | def generate_cohort_dataset_backward_custom(tasks, dbname, collection): |
300 | 339 | mongo = db.init_mongo_db(dbname) |
301 | 340 | editors = mongo[collection + '_dataset'] |
— | — | @@ -326,36 +365,8 @@ |
327 | 366 | |
328 | 367 | |
329 | 368 | |
330 | | -def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs): |
331 | | - mongo = db.init_mongo_db(dbname) |
332 | | - editors = mongo[collection + '_dataset'] |
333 | | - windows = create_windows(break_down_first_year=False) |
334 | | - data = shaper.create_datacontainer('dict') |
335 | | - data = shaper.add_windows_to_datacontainer(data, windows) |
336 | 369 | |
337 | | - while True: |
338 | | - id = tasks.get(block=False) |
339 | | - tasks.task_done() |
340 | | - if id == None: |
341 | | - break |
342 | | - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1}) |
343 | | - first_edit = obs['first_edit'] |
344 | | - for year in xrange(2001, datetime.datetime.now().year + 1): |
345 | | - year = str(year) |
346 | | - if obs['edits_by_year'][year] > 0: |
347 | | - last_edit = obs['last_edit_by_year'][year] |
348 | | - editor_dt = relativedelta(last_edit, first_edit) |
349 | | - editor_dt = (editor_dt.years * 12) + editor_dt.months |
350 | | - for w in windows: |
351 | | - if w >= editor_dt: |
352 | | - data[int(year)][w] += 1 |
353 | | - break |
354 | | - filename = 'cohort_data_backward.bin' |
355 | | - print 'Storing data as %s' % os.path.join(settings.binary_location, '%s%s' % (dbname, filename)) |
356 | | - utils.store_object(data, settings.binary_location, '%s%s' % (dbname, filename)) |
357 | | - cohort_charts.prepare_cohort_dataset(dbname, filename) |
358 | 370 | |
359 | | - |
360 | 371 | def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs): |
361 | 372 | mongo = db.init_mongo_db(dbname) |
362 | 373 | editors = mongo[collection + '_dataset'] |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -31,6 +31,8 @@ |
32 | 32 | |
33 | 33 | |
34 | 34 | def store_editors(tasks, dbname, collection, input): |
| 35 | + mongo = db.init_mongo_db(dbname) |
| 36 | + collection = mongo[collection] |
35 | 37 | editor_cache = cache.EditorCache(collection) |
36 | 38 | prev_contributor = -1 |
37 | 39 | edits = 0 |
— | — | @@ -44,6 +46,7 @@ |
45 | 47 | |
46 | 48 | fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
47 | 49 | for line in utils.readline(fh): |
| 50 | + print line |
48 | 51 | if len(line) == 0: |
49 | 52 | continue |
50 | 53 | contributor = line[0] |
— | — | @@ -68,15 +71,21 @@ |
69 | 72 | |
70 | 73 | |
71 | 74 | def launcher(input, dbname, collection): |
| 75 | + hack = True |
72 | 76 | mongo = db.init_mongo_db(dbname) |
73 | | - collection = mongo[collection] |
74 | | - collection.ensure_index('editor') |
75 | | - collection.create_index('editor') |
76 | | - files = utils.retrieve_file_list(input, 'csv') |
| 77 | + coll = mongo[collection] |
| 78 | + coll.ensure_index('editor') |
| 79 | + coll.create_index('editor') |
| 80 | + |
| 81 | + if hack: |
| 82 | + input = 'C:\wikimedia\en\wiki\dbready' |
| 83 | + files = utils.retrieve_file_list(input, 'txt') |
| 84 | + else: |
| 85 | + files = utils.retrieve_file_list(input, 'csv') |
77 | 86 | print files |
78 | 87 | print input |
79 | 88 | tasks = multiprocessing.JoinableQueue() |
80 | | - consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(settings.number_of_processes)] |
| 89 | + consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(1)] |
81 | 90 | for file in files: |
82 | 91 | tasks.put(file) |
83 | 92 | |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -298,9 +298,10 @@ |
299 | 299 | ''' |
300 | 300 | try: |
301 | 301 | return int(id) % settings.max_filehandles |
302 | | - except: |
| 302 | + except ValueError: |
303 | 303 | return sum([ord(i) for i in id]) % settings.max_filehandles |
304 | 304 | |
| 305 | + |
305 | 306 | if __name__ == '__main__': |
306 | 307 | project = 'wiki' |
307 | 308 | language_code = 'en' |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -68,7 +68,15 @@ |
69 | 69 | |
70 | 70 | return datacontainer |
71 | 71 | |
| 72 | +def add_years_to_datacontainer(datacontainer, datatype): |
| 73 | + final_year = datetime.datetime.now().year + 1 |
| 74 | + for dc in datacontainer: |
| 75 | + datacontainer[dc] = {} |
| 76 | + for x in range(2001, final_year): |
| 77 | + datacontainer[dc][x] = datatype |
| 78 | + return datacontainer |
72 | 79 | |
| 80 | + |
73 | 81 | def get_standard_deviation(numberList): |
74 | 82 | mean = get_mean(numberList) |
75 | 83 | std = 0 |
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do |
— | — | @@ -1,123 +0,0 @@ |
2 | | -clear |
3 | | -set more off |
4 | | -local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\" |
5 | | -//local projects "enwiki" |
6 | | -local projects "enwiki ruwiki dewiki eswiki jawiki" |
7 | | -foreach proj of local projects { |
8 | | - //di "`loc'" |
9 | | - //di "`proj'" |
10 | | - local p = "`loc'" + "`proj'" + "_cohort_data.txt" |
11 | | - //di "`p'" |
12 | | - insheet using `p' |
13 | | - |
14 | | - sort year |
15 | | - |
16 | | - by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108 |
17 | | - by year: egen obs = sum(n) |
18 | | - by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100 |
19 | | - |
20 | | - sum(obs) |
21 | | - return list |
22 | | - local obs = r(max) |
23 | | - |
24 | | - di "`n'" |
25 | | - |
26 | | - |
27 | | - gen months_3_rel = (months_3 / n) * 100 |
28 | | - gen months_6_rel = (months_6 / n) * 100 |
29 | | - gen months_9_rel = (months_9 / n) * 100 |
30 | | - gen months_12_rel = (months_12 / n) * 100 |
31 | | - gen months_24_rel = (months_24 / n) * 100 |
32 | | - gen months_36_rel = (months_36 / n) * 100 |
33 | | - gen months_48_rel = (months_48 / n) * 100 |
34 | | - gen months_60_rel = (months_60 / n) * 100 |
35 | | - gen months_72_rel = (months_72 / n) * 100 |
36 | | - gen months_84_rel = (months_84 / n) * 100 |
37 | | - gen months_96_rel = (months_96 / n) * 100 |
38 | | - gen months_108_rel = (months_108 / n) * 100 |
39 | | - //local values "3 6 9 12 24 36 48 60 72 84 96 108" |
40 | | - //foreach value of local values { |
41 | | - // local new_var = "months_" + "`value'" + "_rel" |
42 | | - // local var = "months_" + "`value'" |
43 | | - // generate `new_var' = `var' / |
44 | | - //} |
45 | | - |
46 | | - label var months_3 "3 months" |
47 | | - label var months_6 "6 months" |
48 | | - label var months_9 "9 months" |
49 | | - label var months_12 "1 year" |
50 | | - label var months_24 "2 years" |
51 | | - label var months_36 "3 years" |
52 | | - label var months_48 "4 years" |
53 | | - label var months_60 "5 years" |
54 | | - label var months_72 "6 years" |
55 | | - label var months_84 "7 years" |
56 | | - label var months_96 "8 years" |
57 | | - label var months_108 "9 years" |
58 | | - |
59 | | - label var months_3_rel "3 months" |
60 | | - label var months_6_rel "6 months" |
61 | | - label var months_9_rel "9 months" |
62 | | - label var months_12_rel "1 year" |
63 | | - label var months_24_rel "2 years" |
64 | | - label var months_36_rel "3 years" |
65 | | - label var months_48_rel "4 years" |
66 | | - label var months_60_rel "5 years" |
67 | | - label var months_72_rel "6 years" |
68 | | - label var months_84_rel "7 years" |
69 | | - label var months_96_rel "8 years" |
70 | | - label var months_108_rel "9 years" |
71 | | - |
72 | | - |
73 | | - |
74 | | - //drop if(year==2010) |
75 | | - generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12 |
76 | | - generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108 |
77 | | - label var fewer_one_year_abs "Editors with less than one year experience" |
78 | | - label var more_one_year_abs "Editors with more than one year experience" |
79 | | - |
80 | | - twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall)) |
81 | | - local f = "`loc'" + "`proj'" + "_line_rel_one_vs_multi_years.png" |
82 | | - graph export `f', replace |
83 | | - //subtitle(Editors are getting older and influx of new editors has stagnated) |
84 | | - |
85 | | - |
86 | | - graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall)) |
87 | | - local f = "`loc'" + "`proj'" + "_bar_abs_one_vs_multi_years.png" |
88 | | - graph export `f', replace |
89 | | - |
90 | | - graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
91 | | - local f = "`loc'" + "`proj'" + "_bar_cohort.png" |
92 | | - graph export `f', replace |
93 | | - |
94 | | - |
95 | | - clear |
96 | | -} |
97 | | -set more on |
98 | | - |
99 | | -//label var months_3 "3 Months" |
100 | | -//label var months_6 "6 Months" |
101 | | -//label var months_9 "9 Months" |
102 | | -//label var months_12 "1 Year" |
103 | | -//label var months_24 "2 Years" |
104 | | -//label var months_36 "3 Years" |
105 | | -//label var months_48 "4 Years" |
106 | | -//label var months_60 "5 Years" |
107 | | -//label var months_72 "6 Years" |
108 | | -//label var months_84 "7 Years" |
109 | | -//label var months_96 "8 Years" |
110 | | -//label var months_108 "9 Years" |
111 | | -//generate one_year_exp = months_3+ months_6+ months_9+ months_12 |
112 | | - |
113 | | -//generate fewer_one_year_abs = (one_year_exp/100) * n |
114 | | -//generate more_one_year_abs = n - fewer_one_year_abs |
115 | | -//label var fewer_one_year_abs "Editors with less than one year experience" |
116 | | -//label var more_one_year_abs "Editors with more than one year experience" |
117 | | - |
118 | | -//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
119 | | - |
120 | | -//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall)) |
121 | | - |
122 | | - |
123 | | -//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1)) |
124 | | - |
Index: trunk/tools/editor_trends/statistics/stata/wiki.do |
— | — | @@ -1,68 +0,0 @@ |
2 | | -insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv" |
3 | | -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit" |
4 | | - |
5 | | -foreach edit of local first_ten { |
6 | | - gen date2 = date(`edit', "YMDhms") |
7 | | - drop `edit' |
8 | | - rename date2 `edit' |
9 | | - format `edit' %td |
10 | | -} |
11 | | - |
12 | | -generate year_left = year(final_edit) |
13 | | -generate year_joined = year(first_edit) |
14 | | -sort year_joined |
15 | | -by year_joined: gen community_size_t = _N |
16 | | - |
17 | | - |
18 | | -forvalues year = 1(1)10{ |
19 | | - gen active200`year' = 0 |
20 | | - replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit)) |
21 | | - egen community_size_200`year' = total(active200`year') |
22 | | -} |
23 | | - |
24 | | -forvalues t = 1(1)10{ |
25 | | - local t1 = `t'+1 |
26 | | - gen retention200`t' = community_size_200`t1' / community_size_200`t' |
27 | | -} |
28 | | - |
29 | | -generate time_to_new_wp = edits_10 - edits_1 |
30 | | -generate active_time_wp = final_edit - edits_10 |
31 | | -label time_to_new_wp "Number of days it took to become a new wikipedian" |
32 | | -label active_time_wp "Number of days active once becoming a new wikipedian" |
33 | | - |
34 | | - |
35 | | - |
36 | | -compress |
37 | | - |
38 | | -graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall)) |
39 | | -histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined) |
40 | | -graph box time_to_new_wp, over(year_joined) nooutsides |
41 | | -glcurve edit_count, by( year_joined) split lorenz |
42 | | - |
43 | | - |
44 | | - |
45 | | -insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv" |
46 | | -// 0 = False |
47 | | -// 1 = True |
48 | | - |
49 | | -rename v1 id |
50 | | -rename v2 date |
51 | | -format date2 %td |
52 | | -gen date2 = date(date, "MD20Y") |
53 | | -sort id |
54 | | -by id: generate n = _n |
55 | | -by id: egen first_obs = min(date2) |
56 | | -by id: egen last_obs = max(date2) |
57 | | -by id: generate time_required = last_obs - first_obs |
58 | | -by id: generate year= year(last_obs) |
59 | | - |
60 | | -gen made_ten_edits =0 |
61 | | -by id: egen temp = max(n) |
62 | | -by id: replace made_ten_edits=1 if(temp==10) |
63 | | -drop temp |
64 | | - |
65 | | - |
66 | | - |
67 | | -by year, sort: egen time_to_new_wikipedian = mean( time_required) |
68 | | - |
69 | | -compress |
Index: trunk/tools/editor_trends/statistics/stata/combined_line_chart_experience.do |
— | — | @@ -0,0 +1,14 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 6 | +
|
| 7 | +sort year
|
| 8 | +by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 9 | +by year: generate one_year_exp = ((months_12) / n) * 100
|
| 10 | +
|
| 11 | +twoway (line one_year_exp year if project=="enwiki") (line one_year_exp year if project=="ruwiki") (line one_year_exp year if project=="eswiki") (line one_year_exp year if project=="jawiki") (line one_year_exp year if project=="frwiki") (line one_year_exp year if project=="dewiki"), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) legend(order(1 "Enwiki" 2 "Ruwiki" 3 "Eswiki" 4 "Jawiki" 5 "Frwiki" 6 "Dewiki"))
|
| 12 | +//twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
|
| 13 | +local f = "`target'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
|
| 14 | +graph export `f', replace
|
| 15 | +
|
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do |
— | — | @@ -0,0 +1,131 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 6 | +local projects "enwiki"
|
| 7 | +//local projects "enwiki ruwiki dewiki eswiki jawiki"
|
| 8 | +foreach proj of local projects {
|
| 9 | + //di "`loc'"
|
| 10 | + //di "`proj'"
|
| 11 | + local p = "`source'" + "`proj'" + "_cohort_data_backward.txt"
|
| 12 | + //di "`p'"
|
| 13 | + insheet using `p'
|
| 14 | +
|
| 15 | + sort year
|
| 16 | +
|
| 17 | + by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 18 | + //by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 19 | + by year: egen obs = sum(n)
|
| 20 | + by year: generate one_year_exp = ((months_12) / n) * 100
|
| 21 | + //by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100
|
| 22 | +
|
| 23 | + sum(obs)
|
| 24 | + return list
|
| 25 | + local obs = r(max)
|
| 26 | +
|
| 27 | + di "`n'"
|
| 28 | +
|
| 29 | +
|
| 30 | + //gen months_3_rel = (months_3 / n) * 100
|
| 31 | + //gen months_6_rel = (months_6 / n) * 100
|
| 32 | + //gen months_9_rel = (months_9 / n) * 100
|
| 33 | + gen months_12_rel = (months_12 / n) * 100
|
| 34 | + gen months_24_rel = (months_24 / n) * 100
|
| 35 | + gen months_36_rel = (months_36 / n) * 100
|
| 36 | + gen months_48_rel = (months_48 / n) * 100
|
| 37 | + gen months_60_rel = (months_60 / n) * 100
|
| 38 | + gen months_72_rel = (months_72 / n) * 100
|
| 39 | + gen months_84_rel = (months_84 / n) * 100
|
| 40 | + gen months_96_rel = (months_96 / n) * 100
|
| 41 | + gen months_108_rel = (months_108 / n) * 100
|
| 42 | + //local values "3 6 9 12 24 36 48 60 72 84 96 108"
|
| 43 | + //foreach value of local values {
|
| 44 | + // local new_var = "months_" + "`value'" + "_rel"
|
| 45 | + // local var = "months_" + "`value'"
|
| 46 | + // generate `new_var' = `var' /
|
| 47 | + //}
|
| 48 | +
|
| 49 | + //label var months_3 "3 months"
|
| 50 | + //label var months_6 "6 months"
|
| 51 | + //label var months_9 "9 months"
|
| 52 | + label var months_12 "1 year"
|
| 53 | + label var months_24 "2 years"
|
| 54 | + label var months_36 "3 years"
|
| 55 | + label var months_48 "4 years"
|
| 56 | + label var months_60 "5 years"
|
| 57 | + label var months_72 "6 years"
|
| 58 | + label var months_84 "7 years"
|
| 59 | + label var months_96 "8 years"
|
| 60 | + label var months_108 "9 years"
|
| 61 | +
|
| 62 | + //label var months_3_rel "3 months"
|
| 63 | + //label var months_6_rel "6 months"
|
| 64 | + //label var months_9_rel "9 months"
|
| 65 | + label var months_12_rel "1 year"
|
| 66 | + label var months_24_rel "2 years"
|
| 67 | + label var months_36_rel "3 years"
|
| 68 | + label var months_48_rel "4 years"
|
| 69 | + label var months_60_rel "5 years"
|
| 70 | + label var months_72_rel "6 years"
|
| 71 | + label var months_84_rel "7 years"
|
| 72 | + label var months_96_rel "8 years"
|
| 73 | + label var months_108_rel "9 years"
|
| 74 | +
|
| 75 | +
|
| 76 | + local obs = "."
|
| 77 | + drop if(year==2011)
|
| 78 | + drop if(year==2012)
|
| 79 | +
|
| 80 | + generate fewer_one_year_abs = months_12
|
| 81 | + //generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12
|
| 82 | +
|
| 83 | + generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
|
| 84 | + label var fewer_one_year_abs "Editors with less than one year experience"
|
| 85 | + label var more_one_year_abs "Editors with more than one year experience"
|
| 86 | +
|
| 87 | + twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
|
| 88 | + local f = "`loc'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
|
| 89 | + graph export `f', replace
|
| 90 | + //subtitle(Editors are getting older and influx of new editors has stagnated)
|
| 91 | +
|
| 92 | +
|
| 93 | + graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
|
| 94 | + local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_abs_one_vs_multi_years.png"
|
| 95 | + graph export `f', replace
|
| 96 | +
|
| 97 | + graph bar (asis) months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 98 | + //graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 99 | + local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_cohort.png"
|
| 100 | + graph export `f', replace
|
| 101 | +
|
| 102 | +
|
| 103 | + clear
|
| 104 | +}
|
| 105 | +set more on
|
| 106 | +
|
| 107 | +//label var months_3 "3 Months"
|
| 108 | +//label var months_6 "6 Months"
|
| 109 | +//label var months_9 "9 Months"
|
| 110 | +//label var months_12 "1 Year"
|
| 111 | +//label var months_24 "2 Years"
|
| 112 | +//label var months_36 "3 Years"
|
| 113 | +//label var months_48 "4 Years"
|
| 114 | +//label var months_60 "5 Years"
|
| 115 | +//label var months_72 "6 Years"
|
| 116 | +//label var months_84 "7 Years"
|
| 117 | +//label var months_96 "8 Years"
|
| 118 | +//label var months_108 "9 Years"
|
| 119 | +//generate one_year_exp = months_3+ months_6+ months_9+ months_12
|
| 120 | +
|
| 121 | +//generate fewer_one_year_abs = (one_year_exp/100) * n
|
| 122 | +//generate more_one_year_abs = n - fewer_one_year_abs
|
| 123 | +//label var fewer_one_year_abs "Editors with less than one year experience"
|
| 124 | +//label var more_one_year_abs "Editors with more than one year experience"
|
| 125 | +
|
| 126 | +//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
|
| 127 | +
|
| 128 | +//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
|
| 129 | +
|
| 130 | +
|
| 131 | +//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
|
| 132 | +
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian.do |
— | — | @@ -0,0 +1,14 @@ |
| 2 | +clear
|
| 3 | +local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 4 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_histogram_edits.csv"
|
| 5 | +
|
| 6 | +sum(year)
|
| 7 | +return list
|
| 8 | +local max_year = r(max)
|
| 9 | +local min_year = r(min)
|
| 10 | +
|
| 11 | +forvalues year = `min_year'(1)`max_year' {
|
| 12 | + histogram num_edits if year==`year', percent addlabel addlabopts(mlabsize(tiny) mlabangle(forty_five)) xtitle(Number of edits) title("Histogram Number of Edits" "New Wikipedians Made in `year'")
|
| 13 | + local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_edits.png"
|
| 14 | + graph export `f', replace
|
| 15 | +}
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local projects "ruwiki dewiki eswiki jawiki enwiki"
|
| 6 | +
|
| 7 | +foreach proj of local projects {
|
| 8 | + clear
|
| 9 | +
|
| 10 | + local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
|
| 11 | + insheet using `p'
|
| 12 | + label var experience "Number of months active"
|
| 13 | + gen date = date(_time, "YMD")
|
| 14 | + format date %td
|
| 15 | +
|
| 16 | + egen min_year= min(year(date))
|
| 17 | + egen max_year= max(year(date))
|
| 18 | + gen month = month(date)
|
| 19 | + gen day = day(date)
|
| 20 | +
|
| 21 | + sum(max_year)
|
| 22 | + return list
|
| 23 | + local max_year = r(max)
|
| 24 | +
|
| 25 | + sum(min_year)
|
| 26 | + return list
|
| 27 | + local min_year = r(min)
|
| 28 | +
|
| 29 | + gen first_year = 0
|
| 30 | + replace first_year =1 if year(date)==`min_year'
|
| 31 | +
|
| 32 | + sum(month) if first_year ==1
|
| 33 | + return list
|
| 34 | + local m = r(min)
|
| 35 | +
|
| 36 | + sum(day) if(first_year ==1 & month==`m')
|
| 37 | + return list
|
| 38 | + local d = r(min)
|
| 39 | +
|
| 40 | + di `min_year'
|
| 41 | + di `m'
|
| 42 | + di `d'
|
| 43 | +
|
| 44 | + forvalues year = `min_year'(1)`max_year' {
|
| 45 | + di `year'
|
| 46 | + //local end_date = "1,31," + "`year'"
|
| 47 | + //di `end_date'
|
| 48 | + //list date if date==mdy("`m'", "`d'", "`year'")
|
| 49 | + if mdy(`m', `d', `year') < mdy(`m',`d', `max_year') {
|
| 50 | + histogram experience if date==mdy(`m', `d', `year'), discrete percent ylabel(0(5)100, labsize(vsmall)) title("How long do editors stay who entered `m'/`year'?") subtitle("Project `proj'") note("Based on the `proj' project." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
|
| 51 | + local f = "`loc'" + "`proj'" + "_" + "`year'" + "_histogram_cohort_forward.png"
|
| 52 | + graph export `f', replace
|
| 53 | + }
|
| 54 | + }
|
| 55 | +}
|
| 56 | +
|
| 57 | +
|
| 58 | +set more on
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_time_to_new_wikipedian.do |
— | — | @@ -0,0 +1,18 @@ |
| 2 | +clear
|
| 3 | +local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
|
| 4 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_time_to_new_wikipedian.csv"
|
| 5 | +
|
| 6 | +//egen min_year = min(year)
|
| 7 | +//egen max_year = max(year)
|
| 8 | +
|
| 9 | +sum(year)
|
| 10 | +return list
|
| 11 | +local max_year = r(max)
|
| 12 | +local min_year = r(min)
|
| 13 | +
|
| 14 | +forvalues year = `min_year'(1)`max_year' {
|
| 15 | + histogram time_to_new_wikipedian if year==`year', discrete percent xtitle(Number of days) note("An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) title("Histogram Number of Days it Takes" "to Become a New Wikipedian in `year'")
|
| 16 | + local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_time_to_new_wikipedian.png"
|
| 17 | + graph export `f', replace
|
| 18 | +}
|
| 19 | +
|
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian_outdated.do |
— | — | @@ -0,0 +1,68 @@ |
| 2 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv"
|
| 3 | +local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit"
|
| 4 | +
|
| 5 | +foreach edit of local first_ten {
|
| 6 | + gen date2 = date(`edit', "YMDhms")
|
| 7 | + drop `edit'
|
| 8 | + rename date2 `edit'
|
| 9 | + format `edit' %td
|
| 10 | +}
|
| 11 | +
|
| 12 | +generate year_left = year(final_edit)
|
| 13 | +generate year_joined = year(first_edit)
|
| 14 | +sort year_joined
|
| 15 | +by year_joined: gen community_size_t = _N
|
| 16 | +
|
| 17 | +
|
| 18 | +forvalues year = 1(1)10{
|
| 19 | + gen active200`year' = 0
|
| 20 | + replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
|
| 21 | + egen community_size_200`year' = total(active200`year')
|
| 22 | +}
|
| 23 | +
|
| 24 | +forvalues t = 1(1)10{
|
| 25 | + local t1 = `t'+1
|
| 26 | + gen retention200`t' = community_size_200`t1' / community_size_200`t'
|
| 27 | +}
|
| 28 | +
|
| 29 | +generate time_to_new_wp = edits_10 - edits_1
|
| 30 | +generate active_time_wp = final_edit - edits_10
|
| 31 | +label time_to_new_wp "Number of days it took to become a new wikipedian"
|
| 32 | +label active_time_wp "Number of days active once becoming a new wikipedian"
|
| 33 | +
|
| 34 | +
|
| 35 | +
|
| 36 | +compress
|
| 37 | +
|
| 38 | +graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
|
| 39 | +histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
|
| 40 | +graph box time_to_new_wp, over(year_joined) nooutsides
|
| 41 | +glcurve edit_count, by( year_joined) split lorenz
|
| 42 | +
|
| 43 | +
|
| 44 | +
|
| 45 | +insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
|
| 46 | +// 0 = False
|
| 47 | +// 1 = True
|
| 48 | +
|
| 49 | +rename v1 id
|
| 50 | +rename v2 date
|
| 51 | +format date2 %td
|
| 52 | +gen date2 = date(date, "MD20Y")
|
| 53 | +sort id
|
| 54 | +by id: generate n = _n
|
| 55 | +by id: egen first_obs = min(date2)
|
| 56 | +by id: egen last_obs = max(date2)
|
| 57 | +by id: generate time_required = last_obs - first_obs
|
| 58 | +by id: generate year= year(last_obs)
|
| 59 | +
|
| 60 | +gen made_ten_edits =0
|
| 61 | +by id: egen temp = max(n)
|
| 62 | +by id: replace made_ten_edits=1 if(temp==10)
|
| 63 | +drop temp
|
| 64 | +
|
| 65 | +
|
| 66 | +
|
| 67 | +by year, sort: egen time_to_new_wikipedian = mean( time_required)
|
| 68 | +
|
| 69 | +compress
|
Index: trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do |
— | — | @@ -0,0 +1,57 @@ |
| 2 | +clear
|
| 3 | +set more off
|
| 4 | +local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
|
| 5 | +local projects "ruwiki dewiki eswiki jawiki enwiki"
|
| 6 | +
|
| 7 | +foreach proj of local projects {
|
| 8 | + clear
|
| 9 | + local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
|
| 10 | + insheet using `p'
|
| 11 | + ren v1 raw_date
|
| 12 | + ren v2 experience
|
| 13 | + ren v3 count
|
| 14 | +
|
| 15 | + gen date = date(raw_date, "MY")
|
| 16 | + format date %td
|
| 17 | +
|
| 18 | + egen min_year= min(year(date))
|
| 19 | + egen max_year= max(year(date))
|
| 20 | + gen month = month(date)
|
| 21 | + gen day = day(date)
|
| 22 | +
|
| 23 | + sum(max_year)
|
| 24 | + return list
|
| 25 | + local max_year = r(max)
|
| 26 | +
|
| 27 | + sum(min_year)
|
| 28 | + return list
|
| 29 | + local min_year = r(min)
|
| 30 | +
|
| 31 | + gen first_year = 0
|
| 32 | + replace first_year =1 if year(date)==`min_year'
|
| 33 | +
|
| 34 | + sum(month) if first_year ==1
|
| 35 | + return list
|
| 36 | + local m = r(min)
|
| 37 | +
|
| 38 | + sum(day) if(first_year ==1 & month==`m')
|
| 39 | + return list
|
| 40 | + local d = r(min)
|
| 41 | +
|
| 42 | + replace count = . if count ==0
|
| 43 | +
|
| 44 | + forvalues year = `min_year'(1)`max_year' {
|
| 45 | + di `year'
|
| 46 | + //local end_date = "1,31," + "`year'"
|
| 47 | + //di `end_date'
|
| 48 | + //list date if date==mdy("`m'", "`d'", "`year'")
|
| 49 | +
|
| 50 | + if mdy(`m', 1, `year') < mdy(`m', 1, `max_year') {
|
| 51 | + twoway (line count experience if date==mdy(1,1,`year'), sort cmissing(n)), ytitle(Number of New Wikipedians) xtitle(Number of months active) xlabel(0(4)108, labsize(vsmall)) title("The number of New Wikipedians active who entered 1/`year'") subtitle("Project `proj'")
|
| 52 | +
|
| 53 | + local f = "`loc'" + "`proj'" + "_" + "`year'" + "_line_cohort_forward.png"
|
| 54 | + graph export `f', replace
|
| 55 | + }
|
| 56 | + }
|
| 57 | +
|
| 58 | +}
|
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -194,3 +194,4 @@ |
195 | 195 | 'data', 'objects') |
196 | 196 | self.namespace_location = os.path.join(self.working_directory, |
197 | 197 | 'namespaces') |
| 198 | + self.chart_location = os.path.join(self.working_directory, 'statistics', 'charts') |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -195,7 +195,7 @@ |
196 | 196 | return 'wb' |
197 | 197 | |
198 | 198 | |
199 | | -def write_list_to_csv(data, fh, recursive=False, newline=True): |
| 199 | +def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'): |
200 | 200 | ''' |
201 | 201 | @data is a list which can contain other lists that will be written as a |
202 | 202 | single line to a textfile |
— | — | @@ -218,7 +218,7 @@ |
219 | 219 | if len(d) == len(data[x]): |
220 | 220 | fh.write('\n') |
221 | 221 | elif type(d) == type({}): |
222 | | - tab = write_dict_to_csv(d, fh, write_key=False, newline=newline) |
| 222 | + tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format) |
223 | 223 | else: |
224 | 224 | fh.write('%s' % d) |
225 | 225 | tab = True |
— | — | @@ -245,8 +245,19 @@ |
246 | 246 | fh.write('%s\t%s\t%s\n' % (key, d, data[key][d])) |
247 | 247 | else: |
248 | 248 | fh.write('%s\n' % (data[key])) |
249 | | - else: |
250 | | - print 'not yet implemented' |
| 249 | + elif format == 'wide': |
| 250 | + for key in keys: |
| 251 | + if write_key: |
| 252 | + fh.write('%s\t' % key) |
| 253 | + if type(data[key]) == type([]): |
| 254 | + for d in data[key]: |
| 255 | + fh.write('%s\t') |
| 256 | + elif type(data[key]) == type({}): |
| 257 | + write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format) |
| 258 | + else: |
| 259 | + fh.write('%s\t' % (data[key])) |
| 260 | + fh.write('\n') |
| 261 | + |
251 | 262 | #if type(data[key]) == type([]): |
252 | 263 | # write_list_to_csv(data[key], fh, recursive=False, newline=True) |
253 | 264 | |
— | — | @@ -367,6 +378,14 @@ |
368 | 379 | return d |
369 | 380 | |
370 | 381 | |
| 382 | +def determine_canonical_name(filename): |
| 383 | + while filename.find('.') > -1: |
| 384 | + ext = determine_file_extension(filename) |
| 385 | + ext = '.%s' % ext |
| 386 | + filename = filename.replace(ext, '') |
| 387 | + return filename |
| 388 | + |
| 389 | + |
371 | 390 | def retrieve_file_list(location, extension, mask=None): |
372 | 391 | ''' |
373 | 392 | Retrieve a list of files from a specified location. |
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -29,35 +29,38 @@ |
30 | 30 | import utils |
31 | 31 | |
32 | 32 | |
33 | | -def create_list_dumpfiles(url, canonical_filename, ext): |
| 33 | +def create_list_dumpfiles(domain, path, filename, ext): |
34 | 34 | ''' |
35 | 35 | Wikipedia offers the option to download one dump file in separate batches. |
36 | 36 | This function determines how many files there are for a giving dump and puts |
37 | 37 | them in a queue. |
38 | 38 | ''' |
39 | 39 | task_queue = multiprocessing.JoinableQueue() |
| 40 | + canonical_filename = utils.determine_canonical_name(filename) |
40 | 41 | for x in xrange(1, 100): |
41 | | - f = '%s%s%s' % (canonical_filename, x, ext) |
42 | | - res = check_remote_file_exists(url, f) |
| 42 | + f = '%s%s.xml.%s' % (canonical_filename, x, ext) |
| 43 | + res = check_remote_file_exists(domain, path, f) |
43 | 44 | if res == None or res.status != 200: |
44 | 45 | break |
45 | 46 | else: |
46 | | - task_queue.add(f) |
| 47 | + print 'Added chunk to download: %s' % f |
| 48 | + task_queue.put(f) |
47 | 49 | for x in xrange(settings.number_of_processes): |
48 | | - task_queue.add(None) |
| 50 | + task_queue.put(None) |
49 | 51 | return task_queue |
50 | 52 | |
51 | 53 | |
52 | | -def check_remote_file_exists(url, filename): |
| 54 | +def check_remote_file_exists(domain, path, filename): |
53 | 55 | ''' |
54 | 56 | @url is the full path of the file to be downloaded |
55 | 57 | @filename is the name of the file to be downloaded |
56 | 58 | ''' |
57 | 59 | try: |
58 | | - if url.startswith('http://'): |
59 | | - url = url[7:] |
60 | | - conn = httplib.HTTPConnection(url) |
61 | | - conn.request('HEAD', filename) |
| 60 | + if domain.startswith('http://'): |
| 61 | + domain = domain[7:] |
| 62 | + conn = httplib.HTTPConnection(domain) |
| 63 | + url = '%s%s' % (path, filename) |
| 64 | + conn.request('HEAD', url) |
62 | 65 | res = conn.getresponse() |
63 | 66 | conn.close() |
64 | 67 | return res |