r80219 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80218 | r80219 | r80220 >
Date:00:45, 14 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
1) Stata do-files to generate charts.
2) Another batch of small improvements.
Modified paths:
  • /trunk/tools/editor_trends/analyses/count_editors.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/datasets/enwiki (added) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_charts.do (deleted) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/combined_line_chart_experience.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian_outdated.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/histogram_time_to_new_wikipedian.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/wiki.do (deleted) (history)
  • /trunk/tools/editor_trends/utils/dump_downloader.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -58,6 +58,9 @@
5959 for location in locations:
6060 setattr(self, location, locations[location])
6161
 62+ def __str__(self):
 63+ return 'Configurator'
 64+
6265 def __iter__(self):
6366 for item in self.__dict__:
6467 yield item
@@ -66,6 +69,9 @@
6770 def __init__(self):
6871 self.t0 = datetime.datetime.now()
6972
 73+ def __str__(self):
 74+ return 'Timer started: %s' % self.t0
 75+
7076 def stop(self):
7177 self.t1 = datetime.datetime.now()
7278
@@ -167,16 +173,18 @@
168174 config['ignore'] = get_value(args, 'except')
169175 config['clean'] = get_value(args, 'new')
170176
 177+ config['project'] = project
 178+ config['full_project'] = get_projectname(args)
 179+ config['filename'] = generate_wikidump_filename(language_code, project, args)
 180+ config['namespaces'] = get_namespaces(args)
 181+
171182 config['dataset'] = os.path.join(settings.dataset_location, config['full_project'])
 183+ config['charts'] = os.path.join(settings.chart_location, config['full_project'])
172184 config['location'] = os.path.join(location, language_code, project)
173185 config['txt'] = os.path.join(config['location'], 'txt')
174186 config['sorted'] = os.path.join(config['location'], 'sorted')
175187
176 - config['project'] = project
177 - config['full_project'] = get_projectname(args)
178 - config['filename'] = generate_wikidump_filename(language_code, project, args)
179 - config['namespaces'] = get_namespaces(args)
180 - config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset']]
 188+ config['directories'] = [config['location'], config['txt'], config['sorted'], config['dataset'], config['charts']]
181189 config['path'] = '/%s/latest/' % config['full_project']
182190 config['targets'] = targets.split(',')
183191
@@ -211,7 +219,7 @@
212220 extension = utils.determine_file_extension(config.filename)
213221 filemode = utils.determine_file_mode(extension)
214222 log.log_to_mongo(config.full_project, 'download', timer, type='start')
215 - task_queue = dump_downloader.create_list_dumpfiles('%s%s' % (settings.wp_dump_location, path), config.filename, extension)
 223+ task_queue = dump_downloader.create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename, extension)
216224 while True:
217225 filename = task_queue.get(block=False)
218226 if filename == None:
@@ -297,8 +305,8 @@
298306 timer = Timer()
299307 log.log_to_mongo(config.full_project, 'store', timer, type='start')
300308 db.cleanup_database(config.project, logger)
301 - write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.input, project=config.project, collection=config.collection)
302 - store.launcher(config.input, config.project, config.collection)
 309+ write_message_to_log(logger, args, None, message=None, verb='Storing', location=config.location, input=config.sorted, project=config.full_project, collection=config.collection)
 310+ store.launcher(config.sorted, config.full_project, config.collection)
303311 timer.elapsed()
304312 log.log_to_mongo(full_project, 'store', timer, type='finish')
305313
@@ -321,6 +329,7 @@
322330 for target in config.targets:
323331 write_message_to_log(logger, args, None, message=None, verb='Exporting', target=target, dbname=config.full_project, collection=config.collection)
324332 target = datasets[target]
 333+ print 'Dataset is created by: %s' % target
325334 exporter.dataset_launcher(config.full_project, config.collection, target)
326335 timer.elapsed()
327336 log.log_to_mongo(config.full_project, 'export', timer, type='finish')
Index: trunk/tools/editor_trends/analyses/count_editors.py
@@ -70,6 +70,7 @@
7171 data = shaper.create_datacontainer('list')
7272 elif unit == 'year_dict':
7373 data = shaper.create_datacontainer('dict')
 74+ data = shaper.add_years_to_datacontainer(data, 0)
7475 else:
7576 data = {}
7677
@@ -82,6 +83,47 @@
8384 return data, prop
8485
8586
 87+def cohort_dataset_forward_histogram(data, editor, prop):
 88+ if prop == None:
 89+ final_year = datetime.datetime.now().year + 1
 90+ prop = ChartProperties(headers, False, 'long')
 91+ headers = ['year', 'edits']
 92+ prop.final_year = final_year
 93+
 94+ new_wikipedian = editor['new_wikipedian']
 95+ yearly_edits = editor['edits_by_year']
 96+ for year in xrange(new_wikipedian.year, prop.final_year):
 97+ data[new_wikipedian.year].append(yearly_edits[year])
 98+ return data, prop
 99+
 100+
 101+def cohort_dataset_forward_bar(data, editor, prop):
 102+ if prop == None:
 103+ final_year = datetime.datetime.now().year + 1
 104+ headers = ['experience'] + [y for y in xrange(2001, final_year)]
 105+ prop = ChartProperties(headers, False, 'wide')
 106+
 107+ prop.final_year = final_year
 108+ prop.cutoff_value = 5
 109+
 110+ new_wikipedian = editor['new_wikipedian']
 111+ last_edit = editor['final_edit']
 112+ monthly_edits = editor['monthly_edits']
 113+ yearly_edits = editor['edits_by_year']
 114+ active = []
 115+ for year in xrange(new_wikipedian.year, prop.final_year):
 116+ max_edits = max(monthly_edits.get(str(year), {0:0}).values())
 117+ if yearly_edits.get(str(year), 0) == 0 or max_edits < prop.cutoff_value:
 118+ continue
 119+ else:
 120+ active.append(year)
 121+
 122+ if active != []:
 123+ year = max(active)
 124+ data[new_wikipedian.year][year] += 1
 125+ return data, prop
 126+
 127+
86128 def new_editor_count(data, editor, prop):
87129 '''
88130 Summary: This function generates an overview of the number of
@@ -90,7 +132,7 @@
91133 stats.download.org to make sure that we are using the same numbers.
92134 '''
93135 if prop == None:
94 - headers = ['time', 'count']
 136+ headers = ['year', 'month', 'count']
95137 prop = ChartProperties(headers, False, 'long')
96138 new_wikipedian = editor['new_wikipedian']
97139 data[new_wikipedian.year][new_wikipedian.month] += 1
@@ -167,5 +209,6 @@
168210
169211
170212 if __name__ == '__main__':
171 - generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list')
 213+ generate_chart_data('enwiki', 'editors', cohort_dataset_forward_bar, unit='year_dict')
 214+ #generate_chart_data('enwiki', 'editors', histogram_edits, unit='year_list')
172215 #generate_chart_data('enwiki', 'editors', time_to_new_wikipedian, unit='year_list')
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -272,29 +272,68 @@
273273 if new_wikipedian.month not in data[new_wikipedian.year]:
274274 data[new_wikipedian.year][new_wikipedian.month] = {}
275275 for i, year in enumerate(xrange(new_wikipedian.year, final_year)):
 276+ min_edits = min(obs['monthly_edits'].values())
 277+ if min_edits < 5:
 278+ continue
276279 months = edits.get(str(year), [])
277 - if i == 0:
278 - months = months.keys()
279 - months = [int(m) for m in months]
280 - months.sort()
281 - months = months[new_wikipedian.month - 1:]
282 - months = [str(m) for m in months]
 280+# if i == 0:
 281+# months = months.keys()
 282+# months = [int(m) for m in months]
 283+# months.sort()
 284+# months = months[new_wikipedian.month - 1:]
 285+# months = [str(m) for m in months]
283286 for month in months:
284287 experience = str(i * 12 + (int(month) - 1))
285288 if experience not in data[new_wikipedian.year][new_wikipedian.month]:
286289 data[new_wikipedian.year][new_wikipedian.month][experience] = 0
287290 data[new_wikipedian.year][new_wikipedian.month][experience] += 1 if edits[str(year)][month] > 0 else 0
288291
289 - fh = utils.create_txt_filehandle(settings.dataset_location, '%s_cohort_data_forward.csv' % (dbname), 'w', settings.encoding)
 292+ filename = 'cohort_data_forward.bin'
 293+ print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename))
 294+ utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename))
 295+ cohort_charts.prepare_cohort_dataset(dbname, filename)
 296+
 297+ filename = '_cohort_data_forward_histogram.csv'
 298+ fh = utils.create_txt_filehandle(settings.dataset_location, '%s_%s' % (dbname, filename), 'w', settings.encoding)
290299 for year in data:
291300 for month in data[year]:
292301 obs = data[year][month].keys()
293302 obs.sort()
294303 for o in obs:
295 - utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, newline=True)
 304+ utils.write_list_to_csv(['%s-%s' % (month, year), o, data[year][month][o]], fh, recursive=False, format='long')
296305 fh.close()
297306
 307+def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs):
 308+ mongo = db.init_mongo_db(dbname)
 309+ editors = mongo[collection + '_dataset']
 310+ windows = create_windows(break_down_first_year=False)
 311+ data = shaper.create_datacontainer('dict')
 312+ data = shaper.add_windows_to_datacontainer(data, windows)
298313
 314+ while True:
 315+ id = tasks.get(block=False)
 316+ tasks.task_done()
 317+ if id == None:
 318+ break
 319+ obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1})
 320+ first_edit = obs['first_edit']
 321+ for year in xrange(2001, datetime.datetime.now().year + 1):
 322+ year = str(year)
 323+ if obs['edits_by_year'][year] > 0:
 324+ last_edit = obs['last_edit_by_year'][year]
 325+ editor_dt = relativedelta(last_edit, first_edit)
 326+ editor_dt = (editor_dt.years * 12) + editor_dt.months
 327+ for w in windows:
 328+ if w >= editor_dt:
 329+ data[int(year)][w] += 1
 330+ break
 331+ filename = 'cohort_data_backward.bin'
 332+ print 'Storing data as %s' % os.path.join(settings.binary_location, '%s_%s' % (dbname, filename))
 333+ utils.store_object(data, settings.binary_location, '%s_%s' % (dbname, filename))
 334+ cohort_charts.prepare_cohort_dataset(dbname, filename)
 335+
 336+
 337+
299338 def generate_cohort_dataset_backward_custom(tasks, dbname, collection):
300339 mongo = db.init_mongo_db(dbname)
301340 editors = mongo[collection + '_dataset']
@@ -326,36 +365,8 @@
327366
328367
329368
330 -def generate_cohort_dataset_backward(tasks, dbname, collection, **kwargs):
331 - mongo = db.init_mongo_db(dbname)
332 - editors = mongo[collection + '_dataset']
333 - windows = create_windows(break_down_first_year=False)
334 - data = shaper.create_datacontainer('dict')
335 - data = shaper.add_windows_to_datacontainer(data, windows)
336369
337 - while True:
338 - id = tasks.get(block=False)
339 - tasks.task_done()
340 - if id == None:
341 - break
342 - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1})
343 - first_edit = obs['first_edit']
344 - for year in xrange(2001, datetime.datetime.now().year + 1):
345 - year = str(year)
346 - if obs['edits_by_year'][year] > 0:
347 - last_edit = obs['last_edit_by_year'][year]
348 - editor_dt = relativedelta(last_edit, first_edit)
349 - editor_dt = (editor_dt.years * 12) + editor_dt.months
350 - for w in windows:
351 - if w >= editor_dt:
352 - data[int(year)][w] += 1
353 - break
354 - filename = 'cohort_data_backward.bin'
355 - print 'Storing data as %s' % os.path.join(settings.binary_location, '%s%s' % (dbname, filename))
356 - utils.store_object(data, settings.binary_location, '%s%s' % (dbname, filename))
357 - cohort_charts.prepare_cohort_dataset(dbname, filename)
358370
359 -
360371 def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs):
361372 mongo = db.init_mongo_db(dbname)
362373 editors = mongo[collection + '_dataset']
Index: trunk/tools/editor_trends/etl/store.py
@@ -31,6 +31,8 @@
3232
3333
3434 def store_editors(tasks, dbname, collection, input):
 35+ mongo = db.init_mongo_db(dbname)
 36+ collection = mongo[collection]
3537 editor_cache = cache.EditorCache(collection)
3638 prev_contributor = -1
3739 edits = 0
@@ -44,6 +46,7 @@
4547
4648 fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
4749 for line in utils.readline(fh):
 50+ print line
4851 if len(line) == 0:
4952 continue
5053 contributor = line[0]
@@ -68,15 +71,21 @@
6972
7073
7174 def launcher(input, dbname, collection):
 75+ hack = True
7276 mongo = db.init_mongo_db(dbname)
73 - collection = mongo[collection]
74 - collection.ensure_index('editor')
75 - collection.create_index('editor')
76 - files = utils.retrieve_file_list(input, 'csv')
 77+ coll = mongo[collection]
 78+ coll.ensure_index('editor')
 79+ coll.create_index('editor')
 80+
 81+ if hack:
 82+ input = 'C:\wikimedia\en\wiki\dbready'
 83+ files = utils.retrieve_file_list(input, 'txt')
 84+ else:
 85+ files = utils.retrieve_file_list(input, 'csv')
7786 print files
7887 print input
7988 tasks = multiprocessing.JoinableQueue()
80 - consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(settings.number_of_processes)]
 89+ consumers = [multiprocessing.Process(target=store_editors, args=(tasks, dbname, collection, input)) for i in xrange(1)]
8190 for file in files:
8291 tasks.put(file)
8392
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -298,9 +298,10 @@
299299 '''
300300 try:
301301 return int(id) % settings.max_filehandles
302 - except:
 302+ except ValueError:
303303 return sum([ord(i) for i in id]) % settings.max_filehandles
304304
 305+
305306 if __name__ == '__main__':
306307 project = 'wiki'
307308 language_code = 'en'
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -68,7 +68,15 @@
6969
7070 return datacontainer
7171
 72+def add_years_to_datacontainer(datacontainer, datatype):
 73+ final_year = datetime.datetime.now().year
 74+ for dc in datacontainer:
 75+ datacontainer[dc] = {}
 76+ for x in range(2001, final_year):
 77+ datacontainer[dc][x] = datatype
 78+ return datacontainer
7279
 80+
7381 def get_standard_deviation(numberList):
7482 mean = get_mean(numberList)
7583 std = 0
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do
@@ -1,123 +0,0 @@
2 -clear
3 -set more off
4 -local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
5 -//local projects "enwiki"
6 -local projects "enwiki ruwiki dewiki eswiki jawiki"
7 -foreach proj of local projects {
8 - //di "`loc'"
9 - //di "`proj'"
10 - local p = "`loc'" + "`proj'" + "_cohort_data.txt"
11 - //di "`p'"
12 - insheet using `p'
13 -
14 - sort year
15 -
16 - by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
17 - by year: egen obs = sum(n)
18 - by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100
19 -
20 - sum(obs)
21 - return list
22 - local obs = r(max)
23 -
24 - di "`n'"
25 -
26 -
27 - gen months_3_rel = (months_3 / n) * 100
28 - gen months_6_rel = (months_6 / n) * 100
29 - gen months_9_rel = (months_9 / n) * 100
30 - gen months_12_rel = (months_12 / n) * 100
31 - gen months_24_rel = (months_24 / n) * 100
32 - gen months_36_rel = (months_36 / n) * 100
33 - gen months_48_rel = (months_48 / n) * 100
34 - gen months_60_rel = (months_60 / n) * 100
35 - gen months_72_rel = (months_72 / n) * 100
36 - gen months_84_rel = (months_84 / n) * 100
37 - gen months_96_rel = (months_96 / n) * 100
38 - gen months_108_rel = (months_108 / n) * 100
39 - //local values "3 6 9 12 24 36 48 60 72 84 96 108"
40 - //foreach value of local values {
41 - // local new_var = "months_" + "`value'" + "_rel"
42 - // local var = "months_" + "`value'"
43 - // generate `new_var' = `var' /
44 - //}
45 -
46 - label var months_3 "3 months"
47 - label var months_6 "6 months"
48 - label var months_9 "9 months"
49 - label var months_12 "1 year"
50 - label var months_24 "2 years"
51 - label var months_36 "3 years"
52 - label var months_48 "4 years"
53 - label var months_60 "5 years"
54 - label var months_72 "6 years"
55 - label var months_84 "7 years"
56 - label var months_96 "8 years"
57 - label var months_108 "9 years"
58 -
59 - label var months_3_rel "3 months"
60 - label var months_6_rel "6 months"
61 - label var months_9_rel "9 months"
62 - label var months_12_rel "1 year"
63 - label var months_24_rel "2 years"
64 - label var months_36_rel "3 years"
65 - label var months_48_rel "4 years"
66 - label var months_60_rel "5 years"
67 - label var months_72_rel "6 years"
68 - label var months_84_rel "7 years"
69 - label var months_96_rel "8 years"
70 - label var months_108_rel "9 years"
71 -
72 -
73 -
74 - //drop if(year==2010)
75 - generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12
76 - generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
77 - label var fewer_one_year_abs "Editors with less than one year experience"
78 - label var more_one_year_abs "Editors with more than one year experience"
79 -
80 - twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
81 - local f = "`loc'" + "`proj'" + "_line_rel_one_vs_multi_years.png"
82 - graph export `f', replace
83 - //subtitle(Editors are getting older and influx of new editors has stagnated)
84 -
85 -
86 - graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
87 - local f = "`loc'" + "`proj'" + "_bar_abs_one_vs_multi_years.png"
88 - graph export `f', replace
89 -
90 - graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
91 - local f = "`loc'" + "`proj'" + "_bar_cohort.png"
92 - graph export `f', replace
93 -
94 -
95 - clear
96 -}
97 -set more on
98 -
99 -//label var months_3 "3 Months"
100 -//label var months_6 "6 Months"
101 -//label var months_9 "9 Months"
102 -//label var months_12 "1 Year"
103 -//label var months_24 "2 Years"
104 -//label var months_36 "3 Years"
105 -//label var months_48 "4 Years"
106 -//label var months_60 "5 Years"
107 -//label var months_72 "6 Years"
108 -//label var months_84 "7 Years"
109 -//label var months_96 "8 Years"
110 -//label var months_108 "9 Years"
111 -//generate one_year_exp = months_3+ months_6+ months_9+ months_12
112 -
113 -//generate fewer_one_year_abs = (one_year_exp/100) * n
114 -//generate more_one_year_abs = n - fewer_one_year_abs
115 -//label var fewer_one_year_abs "Editors with less than one year experience"
116 -//label var more_one_year_abs "Editors with more than one year experience"
117 -
118 -//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
119 -
120 -//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
121 -
122 -
123 -//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
124 -
Index: trunk/tools/editor_trends/statistics/stata/wiki.do
@@ -1,68 +0,0 @@
2 -insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv"
3 -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit"
4 -
5 -foreach edit of local first_ten {
6 - gen date2 = date(`edit', "YMDhms")
7 - drop `edit'
8 - rename date2 `edit'
9 - format `edit' %td
10 -}
11 -
12 -generate year_left = year(final_edit)
13 -generate year_joined = year(first_edit)
14 -sort year_joined
15 -by year_joined: gen community_size_t = _N
16 -
17 -
18 -forvalues year = 1(1)10{
19 - gen active200`year' = 0
20 - replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
21 - egen community_size_200`year' = total(active200`year')
22 -}
23 -
24 -forvalues t = 1(1)10{
25 - local t1 = `t'+1
26 - gen retention200`t' = community_size_200`t1' / community_size_200`t'
27 -}
28 -
29 -generate time_to_new_wp = edits_10 - edits_1
30 -generate active_time_wp = final_edit - edits_10
31 -label time_to_new_wp "Number of days it took to become a new wikipedian"
32 -label active_time_wp "Number of days active once becoming a new wikipedian"
33 -
34 -
35 -
36 -compress
37 -
38 -graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
39 -histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
40 -graph box time_to_new_wp, over(year_joined) nooutsides
41 -glcurve edit_count, by( year_joined) split lorenz
42 -
43 -
44 -
45 -insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
46 -// 0 = False
47 -// 1 = True
48 -
49 -rename v1 id
50 -rename v2 date
51 -format date2 %td
52 -gen date2 = date(date, "MD20Y")
53 -sort id
54 -by id: generate n = _n
55 -by id: egen first_obs = min(date2)
56 -by id: egen last_obs = max(date2)
57 -by id: generate time_required = last_obs - first_obs
58 -by id: generate year= year(last_obs)
59 -
60 -gen made_ten_edits =0
61 -by id: egen temp = max(n)
62 -by id: replace made_ten_edits=1 if(temp==10)
63 -drop temp
64 -
65 -
66 -
67 -by year, sort: egen time_to_new_wikipedian = mean( time_required)
68 -
69 -compress
Index: trunk/tools/editor_trends/statistics/stata/combined_line_chart_experience.do
@@ -0,0 +1,14 @@
 2+clear
 3+set more off
 4+local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
 5+local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
 6+
 7+sort year
 8+by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
 9+by year: generate one_year_exp = ((months_12) / n) * 100
 10+
 11+twoway (line one_year_exp year if project=="enwiki") (line one_year_exp year if project=="ruwiki") (line one_year_exp year if project=="eswiki") (line one_year_exp year if project=="jawiki") (line one_year_exp year if project=="frwiki") (line one_year_exp year if project=="dewiki"), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) legend(order(1 "Enwiki" 2 "Ruwiki" 3 "Eswiki" 4 "Jawiki" 5 "Frwiki" 6 "Dewiki"))
 12+//twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
 13+local f = "`target'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
 14+graph export `f', replace
 15+
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts_backward.do
@@ -0,0 +1,131 @@
 2+clear
 3+set more off
 4+local source "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
 5+local target "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
 6+local projects "enwiki"
 7+//local projects "enwiki ruwiki dewiki eswiki jawiki"
 8+foreach proj of local projects {
 9+ //di "`loc'"
 10+ //di "`proj'"
 11+ local p = "`source'" + "`proj'" + "_cohort_data_backward.txt"
 12+ //di "`p'"
 13+ insheet using `p'
 14+
 15+ sort year
 16+
 17+ by year: generate n = months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
 18+ //by year: generate n = months_3 + months_6 + months_9 + months_12 + months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
 19+ by year: egen obs = sum(n)
 20+ by year: generate one_year_exp = ((months_12) / n) * 100
 21+ //by year: generate one_year_exp = ((months_3 + months_6 + months_9 + months_12) / n) * 100
 22+
 23+ sum(obs)
 24+ return list
 25+ local obs = r(max)
 26+
 27+ di "`n'"
 28+
 29+
 30+ //gen months_3_rel = (months_3 / n) * 100
 31+ //gen months_6_rel = (months_6 / n) * 100
 32+ //gen months_9_rel = (months_9 / n) * 100
 33+ gen months_12_rel = (months_12 / n) * 100
 34+ gen months_24_rel = (months_24 / n) * 100
 35+ gen months_36_rel = (months_36 / n) * 100
 36+ gen months_48_rel = (months_48 / n) * 100
 37+ gen months_60_rel = (months_60 / n) * 100
 38+ gen months_72_rel = (months_72 / n) * 100
 39+ gen months_84_rel = (months_84 / n) * 100
 40+ gen months_96_rel = (months_96 / n) * 100
 41+ gen months_108_rel = (months_108 / n) * 100
 42+ //local values "3 6 9 12 24 36 48 60 72 84 96 108"
 43+ //foreach value of local values {
 44+ // local new_var = "months_" + "`value'" + "_rel"
 45+ // local var = "months_" + "`value'"
 46+ // generate `new_var' = `var' /
 47+ //}
 48+
 49+ //label var months_3 "3 months"
 50+ //label var months_6 "6 months"
 51+ //label var months_9 "9 months"
 52+ label var months_12 "1 year"
 53+ label var months_24 "2 years"
 54+ label var months_36 "3 years"
 55+ label var months_48 "4 years"
 56+ label var months_60 "5 years"
 57+ label var months_72 "6 years"
 58+ label var months_84 "7 years"
 59+ label var months_96 "8 years"
 60+ label var months_108 "9 years"
 61+
 62+ //label var months_3_rel "3 months"
 63+ //label var months_6_rel "6 months"
 64+ //label var months_9_rel "9 months"
 65+ label var months_12_rel "1 year"
 66+ label var months_24_rel "2 years"
 67+ label var months_36_rel "3 years"
 68+ label var months_48_rel "4 years"
 69+ label var months_60_rel "5 years"
 70+ label var months_72_rel "6 years"
 71+ label var months_84_rel "7 years"
 72+ label var months_96_rel "8 years"
 73+ label var months_108_rel "9 years"
 74+
 75+
 76+ local obs = "."
 77+ drop if(year==2011)
 78+ drop if(year==2012)
 79+
 80+ generate fewer_one_year_abs = months_12
 81+ //generate fewer_one_year_abs = months_3 + months_6 + months_9 + months_12
 82+
 83+ generate more_one_year_abs = months_24 + months_36 + months_48 + months_60 + months_72 + months_84 + months_96 + months_108
 84+ label var fewer_one_year_abs "Editors with less than one year experience"
 85+ label var more_one_year_abs "Editors with more than one year experience"
 86+
 87+ twoway (line one_year_exp year), ylabel(0(10)100, labsize(vsmall)) ytitle(%, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the `proj' project, dataset `obs' editors.", size(vsmall))
 88+ local f = "`loc'" + "\`proj'\" + "`proj'" + "_line_rel_one_vs_multi_years.png"
 89+ graph export `f', replace
 90+ //subtitle(Editors are getting older and influx of new editors has stagnated)
 91+
 92+
 93+ graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) subtitle(Project `proj') legend(colfirst cols(1)) note("Based on the `proj' project, dataset `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
 94+ local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_abs_one_vs_multi_years.png"
 95+ graph export `f', replace
 96+
 97+ graph bar (asis) months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
 98+ //graph bar (asis) months_3_rel months_6_rel months_9_rel months_12_rel months_24_rel months_36_rel months_48_rel months_60_rel months_72_rel months_84_rel months_96_rel months_108_rel, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) note("Based on the `proj' project, `obs' editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
 99+ local f = "`target'" + "\`proj'\" + "`proj'" + "_bar_cohort.png"
 100+ graph export `f', replace
 101+
 102+
 103+ clear
 104+}
 105+set more on
 106+
 107+//label var months_3 "3 Months"
 108+//label var months_6 "6 Months"
 109+//label var months_9 "9 Months"
 110+//label var months_12 "1 Year"
 111+//label var months_24 "2 Years"
 112+//label var months_36 "3 Years"
 113+//label var months_48 "4 Years"
 114+//label var months_60 "5 Years"
 115+//label var months_72 "6 Years"
 116+//label var months_84 "7 Years"
 117+//label var months_96 "8 Years"
 118+//label var months_108 "9 Years"
 119+//generate one_year_exp = months_3+ months_6+ months_9+ months_12
 120+
 121+//generate fewer_one_year_abs = (one_year_exp/100) * n
 122+//generate more_one_year_abs = n - fewer_one_year_abs
 123+//label var fewer_one_year_abs "Editors with less than one year experience"
 124+//label var more_one_year_abs "Editors with more than one year experience"
 125+
 126+//graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
 127+
 128+//twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
 129+
 130+
 131+//graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
 132+
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian.do
@@ -0,0 +1,14 @@
 2+clear
 3+local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
 4+insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_histogram_edits.csv"
 5+// Draw one histogram of num_edits for every year in the data and export each one as a PNG into loc.
 6+sum(year)
 7+return list
 8+local max_year = r(max)
 9+local min_year = r(min)
 10+// summarize left the smallest and largest year in r(min) and r(max); loop over that range.
 11+forvalues year = `min_year'(1)`max_year' {
 12+ histogram num_edits if year==`year', percent addlabel addlabopts(mlabsize(tiny) mlabangle(forty_five)) xtitle(Number of edits) title("Histogram Number of Edits" "New Wikipedians Made in `year'")
 13+ local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_edits.png"
 14+ graph export `f', replace
 15+}
Index: trunk/tools/editor_trends/statistics/stata/histogram_how_long_new_wikipedian_stays_active.do
@@ -0,0 +1,57 @@
 2+clear
 3+set more off
 4+local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
 5+local projects "ruwiki dewiki eswiki jawiki enwiki"
 6+// For each project: load its forward cohort CSV and export one histogram of experience per yearly cohort (PNGs go to loc).
 7+foreach proj of local projects {
 8+ clear
 9+ // Build the CSV path for this project and load it.
 10+ local p = "`loc'" + "`proj'" + "_forward_cohort.csv"
 11+ insheet using `p'
 12+ label var experience "Number of months active"
 13+ gen date = date(_time, "YMD")
 14+ format date %td
 15+ // Year range of the data, plus the month and day parts of every date.
 16+ egen min_year= min(year(date))
 17+ egen max_year= max(year(date))
 18+ gen month = month(date)
 19+ gen day = day(date)
 20+ // Copy the year bounds from the egen variables into local macros of the same name.
 21+ sum(max_year)
 22+ return list
 23+ local max_year = r(max)
 24+
 25+ sum(min_year)
 26+ return list
 27+ local min_year = r(min)
 28+ // m and d: the earliest month in the first year, and the earliest day within that month.
 29+ gen first_year = 0
 30+ replace first_year =1 if year(date)==`min_year'
 31+
 32+ sum(month) if first_year ==1
 33+ return list
 34+ local m = r(min)
 35+
 36+ sum(day) if(first_year ==1 & month==`m')
 37+ return list
 38+ local d = r(min)
 39+ // Echo the chosen start year, month and day.
 40+ di `min_year'
 41+ di `m'
 42+ di `d'
 43+ // One histogram per year for the cohort dated m/d of that year; the guard below skips the final year.
 44+ forvalues year = `min_year'(1)`max_year' {
 45+ di `year'
 46+ //local end_date = "1,31," + "`year'"
 47+ //di `end_date'
 48+ //list date if date==mdy("`m'", "`d'", "`year'")
 49+ if mdy(`m', `d', `year') < mdy(`m',`d', `max_year') {
 50+ histogram experience if date==mdy(`m', `d', `year'), discrete percent ylabel(0(5)100, labsize(vsmall)) title("How long do editors stay who entered `m'/`year'?") subtitle("Project `proj'") note("Based on the `proj' project." "An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall))
 51+ local f = "`loc'" + "`proj'" + "_" + "`year'" + "_histogram_cohort_forward.png"
 52+ graph export `f', replace
 53+ }
 54+ }
 55+}
 56+
 57+// Restore the pager setting that was switched off at the top of the script.
 58+set more on
Index: trunk/tools/editor_trends/statistics/stata/histogram_time_to_new_wikipedian.do
@@ -0,0 +1,18 @@
 2+clear
 3+local loc = "C:\Users\diederik.vanliere\workspace\editor_trends\statistics\charts\"
 4+insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_time_to_new_wikipedian.csv"
 5+// Draw one histogram of time_to_new_wikipedian for every year in the data and export each one as a PNG into loc.
 6+//egen min_year = min(year)
 7+//egen max_year = max(year)
 8+// The year bounds are taken from summarize (r(min), r(max)) instead of the egen variables above.
 9+sum(year)
 10+return list
 11+local max_year = r(max)
 12+local min_year = r(min)
 13+
 14+forvalues year = `min_year'(1)`max_year' {
 15+ histogram time_to_new_wikipedian if year==`year', discrete percent xtitle(Number of days) note("An editor is a person who has made at least 10 edits in the main namespace.", size(vsmall)) title("Histogram Number of Days it Takes" "to Become a New Wikipedian in `year'")
 16+ local f = "`loc'" + "enwiki_" + "`year'" + "_histogram_time_to_new_wikipedian.png"
 17+ graph export `f', replace
 18+}
 19+
Index: trunk/tools/editor_trends/statistics/stata/histogram_edits_new_wikipedian_outdated.do
@@ -0,0 +1,68 @@
 2+insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv"
 3+local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit"
 4+
 5+foreach edit of local first_ten {
 6+ gen date2 = date(`edit', "YMDhms")
 7+ drop `edit'
 8+ rename date2 `edit'
 9+ format `edit' %td
 10+}
 11+
 12+generate year_left = year(final_edit)
 13+generate year_joined = year(first_edit)
 14+sort year_joined
 15+by year_joined: gen community_size_t = _N
 16+
 17+
 18+forvalues year = 1(1)10{
 19+ gen active200`year' = 0
 20+ replace active200`year' =1 if((edits_10+(`year'*365)<=final_edit))
 21+ egen community_size_200`year' = total(active200`year')
 22+}
 23+
 24+forvalues t = 1(1)10{
 25+ local t1 = `t'+1
 26+ gen retention200`t' = community_size_200`t1' / community_size_200`t'
 27+}
 28+
 29+generate time_to_new_wp = edits_10 - edits_1
 30+generate active_time_wp = final_edit - edits_10
 31+label time_to_new_wp "Number of days it took to become a new wikipedian"
 32+label active_time_wp "Number of days active once becoming a new wikipedian"
 33+
 34+
 35+
 36+compress
 37+
 38+graph hbar (mean) time_to_new_wp, over(year_joined, label(labsize(small))) blabel(bar, size(tiny) format(%9.0f)) ytitle(Average number of days) ytitle(, size(vsmall)) ylabel(, labsize(small)) title("The average number of days to become" "a new wikipedian increases.") note("A new wikipedian is defined as somebody who has made at least 10 edits." "The year in which the 10th edit was made determines in which year an editor became a new wikipedian." "Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits.", size(vsmall))
 39+histogram time_to_new_wp, percent ytitle(Percentage (%)) ytitle(, size(small)) xtitle(Number of days) xtitle(, size(small)) by(, title("Histograms of number of days it took" " to become a new wikipedian by year") subtitle(The pace by which contributors are becoming a new wikipedian is slowing down., size(small)) note("Sample is based on 83.265 new wikipedians who contributed 18,327,260 edits." "A new wikipedian is somebody who has contributed at least 10 edits.", size(vsmall))) by(year_joined)
 40+graph box time_to_new_wp, over(year_joined) nooutsides
 41+glcurve edit_count, by( year_joined) split lorenz
 42+
 43+
 44+
 45+insheet using "C:\Users\diederik.vanliere\Desktop\dataset.csv"
 46+// 0 = False
 47+// 1 = True
 48+
 49+rename v1 id
 50+rename v2 date
 51+format date2 %td
 52+gen date2 = date(date, "MD20Y")
 53+sort id
 54+by id: generate n = _n
 55+by id: egen first_obs = min(date2)
 56+by id: egen last_obs = max(date2)
 57+by id: generate time_required = last_obs - first_obs
 58+by id: generate year= year(last_obs)
 59+
 60+gen made_ten_edits =0
 61+by id: egen temp = max(n)
 62+by id: replace made_ten_edits=1 if(temp==10)
 63+drop temp
 64+
 65+
 66+
 67+by year, sort: egen time_to_new_wikipedian = mean( time_required)
 68+
 69+compress
Index: trunk/tools/editor_trends/statistics/stata/cohort_line_charts_forward.do
@@ -0,0 +1,57 @@
 2+clear
 3+set more off
 4+local loc "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\"
 5+local projects "ruwiki dewiki eswiki jawiki enwiki"
 6+// For each project: load its forward cohort counts and export one line chart per yearly cohort (PNGs go to loc).
 7+foreach proj of local projects {
 8+ clear
 9+ local p = "`loc'" + "`proj'" + "_cohort_data_forward.csv"
 10+ insheet using `p'
 11+ ren v1 raw_date
 12+ ren v2 experience
 13+ ren v3 count
 14+ // Parse the raw month/year string into a Stata daily date.
 15+ gen date = date(raw_date, "MY")
 16+ format date %td
 17+ // Year range of the data, plus the month and day parts of every date.
 18+ egen min_year= min(year(date))
 19+ egen max_year= max(year(date))
 20+ gen month = month(date)
 21+ gen day = day(date)
 22+ // Copy the year bounds from the egen variables into local macros of the same name.
 23+ sum(max_year)
 24+ return list
 25+ local max_year = r(max)
 26+
 27+ sum(min_year)
 28+ return list
 29+ local min_year = r(min)
 30+
 31+ gen first_year = 0
 32+ replace first_year =1 if year(date)==`min_year'
 33+ // m and d: the earliest month in the first year, and the earliest day within that month (d is not used below).
 34+ sum(month) if first_year ==1
 35+ return list
 36+ local m = r(min)
 37+
 38+ sum(day) if(first_year ==1 & month==`m')
 39+ return list
 40+ local d = r(min)
 41+ // Zero counts become missing; the line is drawn with cmissing(n).
 42+ replace count = . if count ==0
 43+ // One chart per year; the guard below skips the final year.
 44+ forvalues year = `min_year'(1)`max_year' {
 45+ di `year'
 46+ //local end_date = "1,31," + "`year'"
 47+ //di `end_date'
 48+ //list date if date==mdy("`m'", "`d'", "`year'")
 49+ // NOTE(review): the guard uses month m, but the plot filter and title are fixed to 1/1 (January) - confirm this is intended.
 50+ if mdy(`m', 1, `year') < mdy(`m', 1, `max_year') {
 51+ twoway (line count experience if date==mdy(1,1,`year'), sort cmissing(n)), ytitle(Number of New Wikipedians) xtitle(Number of months active) xlabel(0(4)108, labsize(vsmall)) title("The number of New Wikipedians active who entered 1/`year'") subtitle("Project `proj'")
 52+
 53+ local f = "`loc'" + "`proj'" + "_" + "`year'" + "_line_cohort_forward.png"
 54+ graph export `f', replace
 55+ }
 56+ }
 57+
 58+}
Index: trunk/tools/editor_trends/configuration.py
@@ -194,3 +194,4 @@
195195 'data', 'objects')
196196 self.namespace_location = os.path.join(self.working_directory,
197197 'namespaces')
 198+ self.chart_location = os.path.join(self.working_directory, 'statistics', 'charts')
Index: trunk/tools/editor_trends/utils/utils.py
@@ -195,7 +195,7 @@
196196 return 'wb'
197197
198198
199 -def write_list_to_csv(data, fh, recursive=False, newline=True):
 199+def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
200200 '''
201201 @data is a list which can contain other lists that will be written as a
202202 single line to a textfile
@@ -218,7 +218,7 @@
219219 if len(d) == len(data[x]):
220220 fh.write('\n')
221221 elif type(d) == type({}):
222 - tab = write_dict_to_csv(d, fh, write_key=False, newline=newline)
 222+ tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
223223 else:
224224 fh.write('%s' % d)
225225 tab = True
@@ -245,8 +245,19 @@
246246 fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
247247 else:
248248 fh.write('%s\n' % (data[key]))
249 - else:
250 - print 'not yet implemented'
 249+ elif format == 'wide':
 250+ for key in keys:
 251+ if write_key:
 252+ fh.write('%s\t' % key)
 253+ if type(data[key]) == type([]):
 254+ for d in data[key]:
 255+ fh.write('%s\t')
 256+ elif type(data[key]) == type({}):
 257+ write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
 258+ else:
 259+ fh.write('%s\t' % (data[key]))
 260+ fh.write('\n')
 261+
251262 #if type(data[key]) == type([]):
252263 # write_list_to_csv(data[key], fh, recursive=False, newline=True)
253264
@@ -367,6 +378,14 @@
368379 return d
369380
370381
 382+def determine_canonical_name(filename):  # Strip extensions from filename until it contains no '.' and return what is left.
 383+ while filename.find('.') > -1:  # repeat while any dot remains in the name
 384+ ext = determine_file_extension(filename)  # helper not shown here; presumably returns the text after the last dot -- TODO confirm
 385+ ext = '.%s' % ext  # put the dot back so it is removed together with the extension
 386+ filename = filename.replace(ext, '')  # NOTE(review): str.replace removes every occurrence of ext, not only the trailing one
 387+ return filename
 388+
 389+
371390 def retrieve_file_list(location, extension, mask=None):
372391 '''
373392 Retrieve a list of files from a specified location.
Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -29,35 +29,38 @@
3030 import utils
3131
3232
33 -def create_list_dumpfiles(url, canonical_filename, ext):
 33+def create_list_dumpfiles(domain, path, filename, ext):
3434 '''
3535 Wikipedia offers the option to download one dump file in separate batches.
3636 This function determines how many files there are for a giving dump and puts
3737 them in a queue.
3838 '''
3939 task_queue = multiprocessing.JoinableQueue()
 40+ canonical_filename = utils.determine_canonical_name(filename)
4041 for x in xrange(1, 100):
41 - f = '%s%s%s' % (canonical_filename, x, ext)
42 - res = check_remote_file_exists(url, f)
 42+ f = '%s%s.xml.%s' % (canonical_filename, x, ext)
 43+ res = check_remote_file_exists(domain, path, f)
4344 if res == None or res.status != 200:
4445 break
4546 else:
46 - task_queue.add(f)
 47+ print 'Added chunk to download: %s' % f
 48+ task_queue.put(f)
4749 for x in xrange(settings.number_of_processes):
48 - task_queue.add(None)
 50+ task_queue.put(None)
4951 return task_queue
5052
5153
52 -def check_remote_file_exists(url, filename):
 54+def check_remote_file_exists(domain, path, filename):
5355 '''
5456 @url is the full path of the file to be downloaded
5557 @filename is the name of the file to be downloaded
5658 '''
5759 try:
58 - if url.startswith('http://'):
59 - url = url[7:]
60 - conn = httplib.HTTPConnection(url)
61 - conn.request('HEAD', filename)
 60+ if domain.startswith('http://'):
 61+ domain = domain[7:]
 62+ conn = httplib.HTTPConnection(domain)
 63+ url = '%s%s' % (path, filename)
 64+ conn.request('HEAD', url)
6265 res = conn.getresponse()
6366 conn.close()
6467 return res

Status & tagging log