r78149 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r78148 | r78149 | r78150 >
Date:22:28, 9 December 2010
Author:diederik
Status:deferred
Tags:
Comment:
Rewrote export cohort data functionality.
Modified paths:
  • /trunk/tools/editor_trends/analyses/cohort_charts.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/loader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/cohort_charts.py
@@ -33,21 +33,14 @@
3434 years.sort()
3535 periods = dataset[2001].keys()
3636 periods.sort()
37 - periods.remove('n')
3837 headers = ['months_%s' % i for i in periods]
39 - headers.extend(['months_%s_abs' % i for i in periods])
4038 headers.insert(0, 'year')
4139 utils.write_list_to_csv(headers, fh)
 40+
4241 for year in years:
43 - n = float(dataset[year].pop('n'))
44 - obs = [100 * float(dataset[year][p]) / n if dataset[year][p] != 0 else '.' for p in periods]
45 - raw = [dataset[year][p] for p in periods]
46 - #print sum(obs)
 42+ obs = [dataset[year][p] for p in periods]
4743 obs.insert(0, year)
48 - obs.extend(raw)
49 - assert len(headers) == len(obs)
5044 utils.write_list_to_csv(obs, fh, newline=True)
51 - #utils.write_list_to_csv(raw, fh)
5245 fh.close()
5346
5447 if __name__ == '__main__':
Index: trunk/tools/editor_trends/manage.py
@@ -262,7 +262,7 @@
263263 write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection)
264264 num_editors = loader.store_editors(input, project, collection)
265265 cnt_editors = db.count_records(project, collection)
266 - assert num_editors == cnt_editors
 266+ #assert num_editors == cnt_editors
267267 timer.elapsed()
268268
269269
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -103,8 +103,8 @@
104104 for m in months:
105105 #d = calendar.monthrange(int(year), int(m))[1] #determines the number of days in a given month/year
106106 #date = datetime.date(int(year), int(m), d)
107 - if id not in ds.time[year][m] and obs[var][year][str(m)] > 0:
108 - ds.time[year][m][id] = obs[var][year][str(m)]
 107+ if id not in ds.time[year][m] and obs[var][year][m] > 0:
 108+ ds.time[year][m][id] = obs[var][year][m]
109109
110110 def write_longitudinal_data(self):
111111 fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
@@ -148,7 +148,7 @@
149149 keys.sort()
150150 edits = []
151151 for key in keys:
152 - edits.append(str(obs[var][key]))
 152+ edits.append(obs[var][key])
153153 obs[var] = edits
154154 return obs
155155
@@ -191,59 +191,97 @@
192192 ld.write_longitudinal_data()
193193
194194
 195+def create_windows():
 196+ years = (datetime.datetime.now().year + 1) - 2001
 197+ p = [3, 6, 9]
 198+ windows = [y * 12 for y in xrange(1, years)]
 199+ windows = p + windows
 200+ return windows
 201+
 202+
195203 def generate_cohort_dataset(tasks, dbname, collection, **kwargs):
196204 mongo = db.init_mongo_db(dbname)
197205 editors = mongo[collection + '_dataset']
198 - year = datetime.datetime.now().year + 1
199 - begin = year - 2001
200 - p = [3, 6, 9]
201 - periods = [y * 12 for y in xrange(1, begin)]
202 - periods = p + periods
203206 data = {}
204 - while True:
205 - try:
206 - id = tasks.get(block=False)
207 - tasks.task_done()
208 - if id == None:
209 - break
210 - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
211 - if obs == None:
 207+# while True:
 208+# id = tasks.get(block=False)
 209+# tasks.task_done()
 210+# if id == None:
 211+# break
 212+# editor = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
 213+ windows = create_windows()
 214+ data = shaper.create_datacontainer('dict')
 215+ data = shaper.add_windows_to_datacontainer(data, windows)
 216+
 217+ for editor in tasks:
 218+ obs = tasks[editor]
 219+ first_edit = obs['first_edit']
 220+ last_edit = obs['final_edit']
 221+ editor_dt = relativedelta(last_edit, first_edit)
 222+ editor_dt = (editor_dt.years * 12) + editor_dt.months
 223+ edits = []
 224+ for year in xrange(2001, datetime.datetime.now().year + 1):
 225+ #if year == 2009 and editor == '2':
 226+ # print 'debug'
 227+ if first_edit.year > year or last_edit.year < year:
212228 continue
213 - first_edit = obs['first_edit']
214 - last_edit = obs['final_edit']
215 - for y in xrange(2001, year):
216 - if y not in data:
217 - data[y] = {}
218 - data[y]['n'] = 0
219 - window_end = datetime.datetime(y, 12, 31)
220 - if window_end > datetime.datetime.now():
221 - now = datetime.datetime.now()
222 - m = now.month - 1 #Dump files are always lagging at least one month....
223 - d = now.day
224 - window_end = datetime.datetime(y, m, d)
225 - edits = []
226 - for period in periods:
227 - if period not in data[y]:
228 - data[y][period] = 0
229 - window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period)
230 - if first_edit.year > y or last_edit.year < y:
231 - continue
232 - if window_start < datetime.datetime(2001, 1, 1):
233 - window_start = datetime.datetime(2001, 1, 1)
 229+ window_end = datetime.datetime(year, 12, 31)
 230+ for window in windows:
 231+ window_start = window_end - relativedelta(months=window)
 232+ if window_start < datetime.datetime(2001, 1, 1):
 233+ window_start = datetime.datetime(2001, 1, 1)
 234+
 235+ if editor_dt > 11:
234236 if date_falls_in_window(window_start, window_end, first_edit):
235 - edits.append(period)
236 - if edits != []:
237 - p = min(edits)
238 - data[y][p] += 1
239 - data[y]['n'] += 1
 237+ edits.append(window)
 238+ elif window > editor_dt:
 239+ data[year][window] += 1
 240+ break
240241
241 - except Empty:
 242+ if edits != []:
 243+ w = min(edits)
 244+ data[year][w] += 1
 245+ edits = []
 246+
 247+
 248+ print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin')
 249+ utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin')
 250+ cohort_charts.prepare_cohort_dataset(dbname)
 251+
 252+
 253+
 254+
 255+def generate_cohort_dataset_howie(tasks, dbname, collection, **kwargs):
 256+ mongo = db.init_mongo_db(dbname)
 257+ editors = mongo[collection + '_dataset']
 258+ windows = create_windows()
 259+ data = shaper.create_datacontainer('dict')
 260+ data = shaper.add_windows_to_datacontainer(data, windows)
 261+
 262+ while True:
 263+ id = tasks.get(block=False)
 264+ tasks.task_done()
 265+ if id == None:
242266 break
 267+ obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1})
 268+ first_edit = obs['first_edit']
 269+ for year in xrange(2001, datetime.datetime.now().year + 1):
 270+ year = str(year)
 271+ if obs['edits_by_year'][year] > 0:
 272+ last_edit = obs['last_edit_by_year'][year]
 273+ editor_dt = relativedelta(last_edit, first_edit)
 274+ editor_dt = (editor_dt.years * 12) + editor_dt.months
 275+ for w in windows:
 276+ if w >= editor_dt:
 277+ data[int(year)][w] += 1
 278+ break
243279 print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin')
244280 utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin')
245281 cohort_charts.prepare_cohort_dataset(dbname)
246282
247283
 284+
 285+
248286 def date_falls_in_window(window_start, window_end, first_edit):
249287 if first_edit >= window_start and first_edit <= window_end:
250288 return True
@@ -293,6 +331,7 @@
294332 for editor in editors:
295333 tasks.put(editor)
296334 print 'The queue contains %s editors.' % tasks.qsize()
 335+ tasks.put(None)
297336 target(tasks, dbname, collection)
298337
299338 #for x in xrange(settings.number_of_processes):
@@ -304,9 +343,27 @@
305344 #tasks.join()
306345
307346
 347+def debug(dbname, collection):
 348+ editors = {
 349+ '1':{'first_edit': datetime.datetime(2009, 10, 1), 'final_edit': datetime.datetime(2009, 11, 30)},
 350+ '2':{'first_edit': datetime.datetime(2009, 12, 1), 'final_edit': datetime.datetime(2010, 2, 27)},
 351+ '3':{'first_edit': datetime.datetime(2009, 3, 1), 'final_edit': datetime.datetime(2009, 11, 30)},
 352+ '4':{'first_edit': datetime.datetime(2007, 1, 1), 'final_edit': datetime.datetime(2008, 4, 30)},
 353+ '5':{'first_edit': datetime.datetime(2006, 5, 1), 'final_edit': datetime.datetime(2009, 7, 30)},
 354+ '6':{'first_edit': datetime.datetime(2008, 11, 1), 'final_edit': datetime.datetime(2009, 6, 30)},
 355+ '7':{'first_edit': datetime.datetime(2009, 1, 1), 'final_edit': datetime.datetime(2009, 10, 30)},
 356+ '8':{'first_edit': datetime.datetime(2009, 7, 1), 'final_edit': datetime.datetime(2009, 7, 30)},
 357+ '9':{'first_edit': datetime.datetime(2009, 12, 1), 'final_edit': datetime.datetime(2010, 11, 30)},
 358+ '10':{'first_edit': datetime.datetime(2008, 5, 1), 'final_edit': datetime.datetime(2010, 11, 30)},
 359+ '11':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2010, 3, 30)},
 360+ '12':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2008, 2, 27)},
 361+ '13':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2009, 4, 30)},
 362+ }
 363+ generate_cohort_dataset(editors, dbname, collection)
308364 if __name__ == '__main__':
309365 dbname = 'enwiki'
310366 collection = 'editors'
311 - dataset_launcher(dbname, collection, generate_cohort_dataset)
312 - dataset_launcher(dbname, collection, generate_long_editor_dataset)
313 - dataset_launcher(dbname, collection, generate_wide_editor_dataset)
 367+ #debug(dbname, collection)
 368+ dataset_launcher(dbname, collection, generate_cohort_dataset_howie)
 369+ #dataset_launcher(dbname, collection, generate_long_editor_dataset)
 370+ #dataset_launcher(dbname, collection, generate_wide_editor_dataset)
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -30,22 +30,28 @@
3131 def create_datacontainer(datatype):
3232 '''
3333 This function initializes an empty dictionary with as key the year (starting
34 - 2001 and running through) and as value @init_value, in most cases this will
 34+ 2001 and running through) and as value @datatype, in most cases this will
3535 be zero so the dictionary will act as a running tally for a variable but
36 - @init_value can also a list, [], or a dictionary, {}, or a set, set().
 36+ @datatype can also a list, [], or a dictionary, {}, or a set, set().
3737 '''
3838 data = {}
3939 year = datetime.datetime.now().year + 1
4040 for x in xrange(2001, year):
41 - data[str(x)] = add_datatype(datatype)
 41+ data[x] = add_datatype(datatype)
4242 return data
4343
 44+def add_windows_to_datacontainer(datacontainer, windows):
 45+ for dc in datacontainer:
 46+ for w in windows:
 47+ datacontainer[dc][w] = add_datatype()
4448
 49+ return datacontainer
 50+
4551 def add_months_to_datacontainer(datacontainer, datatype):
4652 for dc in datacontainer:
4753 datacontainer[dc] = {}
4854 for x in xrange(1, 13):
49 - datacontainer[dc][str(x)] = add_datatype(datatype)
 55+ datacontainer[dc][x] = add_datatype(datatype)
5056
5157 return datacontainer
5258
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -64,14 +64,6 @@
6565 return '%s' % (self.id)
6666
6767 def __call__(self):
68 - #self.mongo = db.init_mongo_db(self.dbname)
69 -# input_db = self.mongo[self.collection]
70 -# output_db = self.mongo[self.collection + '_dataset']
71 -#
72 -# output_db.ensure_index('editor')
73 -# output_db.create_index('editor')
74 -# output_db.ensure_index('year_joined')
75 -# output_db.create_index('year_joined')
7668
7769 editor = self.input_db.find_one({'editor': self.id})
7870 if editor == None:
@@ -79,14 +71,20 @@
8072 edits = editor['edits']
8173 username = editor['username']
8274 monthly_edits = determine_edits_by_month(edits)
 75+ monthly_edits = db.stringify_keys(monthly_edits)
8376 edits = sort_edits(edits)
8477 edit_count = len(edits)
8578 new_wikipedian = edits[9]['date']
8679 first_edit = edits[0]['date']
8780 final_edit = edits[-1]['date']
8881 edits_by_year = determine_edits_by_year(edits)
 82+ edits_by_year = db.stringify_keys(edits_by_year)
 83+ last_edit_by_year = determine_last_edit_by_year(edits)
 84+ last_edit_by_year = db.stringify_keys(last_edit_by_year)
8985 articles_by_year = determine_articles_by_year(edits)
 86+ articles_by_year = db.stringify_keys(articles_by_year)
9087 edits = edits[:10]
 88+
9189 self.output_db.insert({'editor': self.id,
9290 'edits': edits,
9391 'edits_by_year': edits_by_year,
@@ -96,17 +94,28 @@
9795 'first_edit': first_edit,
9896 'articles_by_year': articles_by_year,
9997 'monthly_edits': monthly_edits,
 98+ 'last_edit_by_year': last_edit_by_year,
10099 'username': username
101100 })
102101
103102
 103+def determine_last_edit_by_year(edits):
 104+ datacontainer = shaper.create_datacontainer(0)
 105+ for edit in edits:
 106+ edit = edit['date']
 107+ if datacontainer[edit.year] == 0:
 108+ datacontainer[edit.year] = edit
 109+ elif datacontainer[edit.year] < edit:
 110+ datacontainer[edit.year] = edit
 111+ return datacontainer
 112+
104113 def determine_edits_by_month(edits):
105114 datacontainer = shaper.create_datacontainer(0.0)
106115 datacontainer = shaper.add_months_to_datacontainer(datacontainer, 0.0)
107116 for year in edits:
108117 for edit in edits[year]:
109 - m = str(edit['date'].month)
110 - datacontainer[year][m] += 1
 118+ m = edit['date'].month
 119+ datacontainer[int(year)][m] += 1
111120 return datacontainer
112121
113122
@@ -116,7 +125,7 @@
117126 '''
118127 edits = shaper.create_datacontainer(0.0)
119128 for date in dates:
120 - year = str(date['date'].year)
 129+ year = date['date'].year
121130 edits[year] += 1
122131 return edits
123132
@@ -128,7 +137,7 @@
129138 '''
130139 articles = shaper.create_datacontainer('set')
131140 for date in dates:
132 - year = str(date['date'].year)
 141+ year = date['date'].year
133142 articles[year].add(date['article'])
134143 for year in articles:
135144 articles[year] = len(articles[year])
Index: trunk/tools/editor_trends/etl/loader.py
@@ -32,7 +32,10 @@
3333
3434
3535 def store_editors(input, dbname, collection):
36 - filename = utils.retrieve_file_list(input, 'txt', mask=None)[0]
 36+ filename = utils.retrieve_file_list(input, 'txt', mask=None)
 37+ if len(filename) > 1:
 38+ filename = [f for f in filename if f.find('final') > -1]
 39+ filename = ''.join(filename)
3740 fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
3841 mongo = db.init_mongo_db(dbname)
3942 collection = mongo[collection]
Index: trunk/tools/editor_trends/database/cache.py
@@ -20,6 +20,7 @@
2121
2222 import sys
2323 sys.path.append('..')
 24+import bson
2425
2526 import configuration
2627 settings = configuration.Settings()
@@ -44,7 +45,8 @@
4546 def add(self, key, value):
4647 if value == 'NEXT':
4748 self.n += 1
48 - self.insert(key, self.editors[key]['edits'], self.editors[key]['username'])
 49+ edits = db.stringify_keys(self.editors[key]['edits'])
 50+ self.insert(key, edits, self.editors[key]['username'])
4951 del self.editors[key]
5052 else:
5153 if key not in self.editors:
@@ -55,11 +57,10 @@
5658 else:
5759 value.pop('username')
5860
59 - year = str(value['date'].year)
 61+ year = value['date'].year
6062 self.editors[key]['edits'][year].append(value)
6163 self.editors[key]['obs'] += 1
6264
63 -
6465 def update(self, editor, values):
6566 self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
6667
@@ -68,9 +69,10 @@
6970 Adding the safe=True statement slows down the insert process but this assures that all data
7071 will be written.
7172 '''
72 - self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)
73 - #except:
74 - # return False
 73+ try:
 74+ self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)
 75+ except bson.errors.InvalidDocument:
 76+ print 'BSON document too large'
7577
7678 def store(self):
7779 utils.store_object(self, settings.binary_location, self.__repr__())
Index: trunk/tools/editor_trends/database/db.py
@@ -74,11 +74,27 @@
7575 mongo.collection.ensure_index(key)
7676
7777
 78+def stringify_keys(obj):
 79+ '''
 80+ @obj should be a dictionary where the keys are not yet strings. this function
 81+ is called just prior any insert / update query in mongo because mongo only
 82+ accepts strings as keys.
 83+ '''
 84+ d = {}
 85+ for o in obj:
 86+ if type(obj[o]) == type({}):
 87+ obj[o] = stringify_keys(obj[o])
 88+ d[str(o)] = obj[o]
 89+ return d
 90+
7891 def retrieve_distinct_keys(dbname, collection, field):
7992 #mongo = init_mongo_db(dbname)
8093 #editors = mongo[collection]
8194 #ids = retrieve_distinct_keys_mapreduce(editors, field)
82 -
 95+ '''
 96+ TODO: figure how big the index is and then take appropriate action, index < 4mb
 97+ just do a distinct query, index > 4mb do a map reduce.
 98+ '''
8399 if utils.check_file_exists(settings.binary_location, '%s_%s.bin' % (dbname, field)):
84100 ids = utils.load_object(settings.binary_location, '%s_%s.bin' % (dbname, field))
85101 else:

Status & tagging log