Index: trunk/tools/editor_trends/analyses/cohort_charts.py |
— | — | @@ -33,21 +33,14 @@ |
34 | 34 | years.sort() |
35 | 35 | periods = dataset[2001].keys() |
36 | 36 | periods.sort() |
37 | | - periods.remove('n') |
38 | 37 | headers = ['months_%s' % i for i in periods] |
39 | | - headers.extend(['months_%s_abs' % i for i in periods]) |
40 | 38 | headers.insert(0, 'year') |
41 | 39 | utils.write_list_to_csv(headers, fh) |
| 40 | + |
42 | 41 | for year in years: |
43 | | - n = float(dataset[year].pop('n')) |
44 | | - obs = [100 * float(dataset[year][p]) / n if dataset[year][p] != 0 else '.' for p in periods] |
45 | | - raw = [dataset[year][p] for p in periods] |
46 | | - #print sum(obs) |
| 42 | + obs = [dataset[year][p] for p in periods] |
47 | 43 | obs.insert(0, year) |
48 | | - obs.extend(raw) |
49 | | - assert len(headers) == len(obs) |
50 | 44 | utils.write_list_to_csv(obs, fh, newline=True) |
51 | | - #utils.write_list_to_csv(raw, fh) |
52 | 45 | fh.close() |
53 | 46 | |
54 | 47 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -262,7 +262,7 @@ |
263 | 263 | write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection) |
264 | 264 | num_editors = loader.store_editors(input, project, collection) |
265 | 265 | cnt_editors = db.count_records(project, collection) |
266 | | - assert num_editors == cnt_editors |
| 266 | + #assert num_editors == cnt_editors |
267 | 267 | timer.elapsed() |
268 | 268 | |
269 | 269 | |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -103,8 +103,8 @@ |
104 | 104 | for m in months: |
105 | 105 | #d = calendar.monthrange(int(year), int(m))[1] #determines the number of days in a given month/year |
106 | 106 | #date = datetime.date(int(year), int(m), d) |
107 | | - if id not in ds.time[year][m] and obs[var][year][str(m)] > 0: |
108 | | - ds.time[year][m][id] = obs[var][year][str(m)] |
| 107 | + if id not in ds.time[year][m] and obs[var][year][m] > 0: |
| 108 | + ds.time[year][m][id] = obs[var][year][m] |
109 | 109 | |
110 | 110 | def write_longitudinal_data(self): |
111 | 111 | fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding) |
— | — | @@ -148,7 +148,7 @@ |
149 | 149 | keys.sort() |
150 | 150 | edits = [] |
151 | 151 | for key in keys: |
152 | | - edits.append(str(obs[var][key])) |
| 152 | + edits.append(obs[var][key]) |
153 | 153 | obs[var] = edits |
154 | 154 | return obs |
155 | 155 | |
— | — | @@ -191,59 +191,97 @@ |
192 | 192 | ld.write_longitudinal_data() |
193 | 193 | |
194 | 194 | |
| 195 | +def create_windows(): |
| 196 | + years = (datetime.datetime.now().year + 1) - 2001 |
| 197 | + p = [3, 6, 9] |
| 198 | + windows = [y * 12 for y in xrange(1, years)] |
| 199 | + windows = p + windows |
| 200 | + return windows |
| 201 | + |
| 202 | + |
195 | 203 | def generate_cohort_dataset(tasks, dbname, collection, **kwargs): |
196 | 204 | mongo = db.init_mongo_db(dbname) |
197 | 205 | editors = mongo[collection + '_dataset'] |
198 | | - year = datetime.datetime.now().year + 1 |
199 | | - begin = year - 2001 |
200 | | - p = [3, 6, 9] |
201 | | - periods = [y * 12 for y in xrange(1, begin)] |
202 | | - periods = p + periods |
203 | 206 | data = {} |
204 | | - while True: |
205 | | - try: |
206 | | - id = tasks.get(block=False) |
207 | | - tasks.task_done() |
208 | | - if id == None: |
209 | | - break |
210 | | - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
211 | | - if obs == None: |
| 207 | +# while True: |
| 208 | +# id = tasks.get(block=False) |
| 209 | +# tasks.task_done() |
| 210 | +# if id == None: |
| 211 | +# break |
| 212 | +# editor = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 213 | + windows = create_windows() |
| 214 | + data = shaper.create_datacontainer('dict') |
| 215 | + data = shaper.add_windows_to_datacontainer(data, windows) |
| 216 | + |
| 217 | + for editor in tasks: |
| 218 | + obs = tasks[editor] |
| 219 | + first_edit = obs['first_edit'] |
| 220 | + last_edit = obs['final_edit'] |
| 221 | + editor_dt = relativedelta(last_edit, first_edit) |
| 222 | + editor_dt = (editor_dt.years * 12) + editor_dt.months |
| 223 | + edits = [] |
| 224 | + for year in xrange(2001, datetime.datetime.now().year + 1): |
| 225 | + #if year == 2009 and editor == '2': |
| 226 | + # print 'debug' |
| 227 | + if first_edit.year > year or last_edit.year < year: |
212 | 228 | continue |
213 | | - first_edit = obs['first_edit'] |
214 | | - last_edit = obs['final_edit'] |
215 | | - for y in xrange(2001, year): |
216 | | - if y not in data: |
217 | | - data[y] = {} |
218 | | - data[y]['n'] = 0 |
219 | | - window_end = datetime.datetime(y, 12, 31) |
220 | | - if window_end > datetime.datetime.now(): |
221 | | - now = datetime.datetime.now() |
222 | | - m = now.month - 1 #Dump files are always lagging at least one month.... |
223 | | - d = now.day |
224 | | - window_end = datetime.datetime(y, m, d) |
225 | | - edits = [] |
226 | | - for period in periods: |
227 | | - if period not in data[y]: |
228 | | - data[y][period] = 0 |
229 | | - window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
230 | | - if first_edit.year > y or last_edit.year < y: |
231 | | - continue |
232 | | - if window_start < datetime.datetime(2001, 1, 1): |
233 | | - window_start = datetime.datetime(2001, 1, 1) |
| 229 | + window_end = datetime.datetime(year, 12, 31) |
| 230 | + for window in windows: |
| 231 | + window_start = window_end - relativedelta(months=window) |
| 232 | + if window_start < datetime.datetime(2001, 1, 1): |
| 233 | + window_start = datetime.datetime(2001, 1, 1) |
| 234 | + |
| 235 | + if editor_dt > 11: |
234 | 236 | if date_falls_in_window(window_start, window_end, first_edit): |
235 | | - edits.append(period) |
236 | | - if edits != []: |
237 | | - p = min(edits) |
238 | | - data[y][p] += 1 |
239 | | - data[y]['n'] += 1 |
| 237 | + edits.append(window) |
| 238 | + elif window > editor_dt: |
| 239 | + data[year][window] += 1 |
| 240 | + break |
240 | 241 | |
241 | | - except Empty: |
| 242 | + if edits != []: |
| 243 | + w = min(edits) |
| 244 | + data[year][w] += 1 |
| 245 | + edits = [] |
| 246 | + |
| 247 | + |
| 248 | + print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin') |
| 249 | + utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin') |
| 250 | + cohort_charts.prepare_cohort_dataset(dbname) |
| 251 | + |
| 252 | + |
| 253 | + |
| 254 | + |
| 255 | +def generate_cohort_dataset_howie(tasks, dbname, collection, **kwargs): |
| 256 | + mongo = db.init_mongo_db(dbname) |
| 257 | + editors = mongo[collection + '_dataset'] |
| 258 | + windows = create_windows() |
| 259 | + data = shaper.create_datacontainer('dict') |
| 260 | + data = shaper.add_windows_to_datacontainer(data, windows) |
| 261 | + |
| 262 | + while True: |
| 263 | + id = tasks.get(block=False) |
| 264 | + tasks.task_done() |
| 265 | + if id == None: |
242 | 266 | break |
| 267 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1, 'edits_by_year': 1, 'last_edit_by_year': 1}) |
| 268 | + first_edit = obs['first_edit'] |
| 269 | + for year in xrange(2001, datetime.datetime.now().year + 1): |
| 270 | + year = str(year) |
| 271 | + if obs['edits_by_year'][year] > 0: |
| 272 | + last_edit = obs['last_edit_by_year'][year] |
| 273 | + editor_dt = relativedelta(last_edit, first_edit) |
| 274 | + editor_dt = (editor_dt.years * 12) + editor_dt.months |
| 275 | + for w in windows: |
| 276 | + if w >= editor_dt: |
| 277 | + data[int(year)][w] += 1 |
| 278 | + break |
243 | 279 | print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin') |
244 | 280 | utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin') |
245 | 281 | cohort_charts.prepare_cohort_dataset(dbname) |
246 | 282 | |
247 | 283 | |
| 284 | + |
| 285 | + |
248 | 286 | def date_falls_in_window(window_start, window_end, first_edit): |
249 | 287 | if first_edit >= window_start and first_edit <= window_end: |
250 | 288 | return True |
— | — | @@ -293,6 +331,7 @@ |
294 | 332 | for editor in editors: |
295 | 333 | tasks.put(editor) |
296 | 334 | print 'The queue contains %s editors.' % tasks.qsize() |
| 335 | + tasks.put(None) |
297 | 336 | target(tasks, dbname, collection) |
298 | 337 | |
299 | 338 | #for x in xrange(settings.number_of_processes): |
— | — | @@ -304,9 +343,27 @@ |
305 | 344 | #tasks.join() |
306 | 345 | |
307 | 346 | |
| 347 | +def debug(dbname, collection): |
| 348 | + editors = { |
| 349 | + '1':{'first_edit': datetime.datetime(2009, 10, 1), 'final_edit': datetime.datetime(2009, 11, 30)}, |
| 350 | + '2':{'first_edit': datetime.datetime(2009, 12, 1), 'final_edit': datetime.datetime(2010, 2, 27)}, |
| 351 | + '3':{'first_edit': datetime.datetime(2009, 3, 1), 'final_edit': datetime.datetime(2009, 11, 30)}, |
| 352 | + '4':{'first_edit': datetime.datetime(2007, 1, 1), 'final_edit': datetime.datetime(2008, 4, 30)}, |
| 353 | + '5':{'first_edit': datetime.datetime(2006, 5, 1), 'final_edit': datetime.datetime(2009, 7, 30)}, |
| 354 | + '6':{'first_edit': datetime.datetime(2008, 11, 1), 'final_edit': datetime.datetime(2009, 6, 30)}, |
| 355 | + '7':{'first_edit': datetime.datetime(2009, 1, 1), 'final_edit': datetime.datetime(2009, 10, 30)}, |
| 356 | + '8':{'first_edit': datetime.datetime(2009, 7, 1), 'final_edit': datetime.datetime(2009, 7, 30)}, |
| 357 | + '9':{'first_edit': datetime.datetime(2009, 12, 1), 'final_edit': datetime.datetime(2010, 11, 30)}, |
| 358 | + '10':{'first_edit': datetime.datetime(2008, 5, 1), 'final_edit': datetime.datetime(2010, 11, 30)}, |
| 359 | + '11':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2010, 3, 30)}, |
| 360 | + '12':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2008, 2, 27)}, |
| 361 | + '13':{'first_edit': datetime.datetime(2007, 2, 1), 'final_edit': datetime.datetime(2009, 4, 30)}, |
| 362 | + } |
| 363 | + generate_cohort_dataset(editors, dbname, collection) |
308 | 364 | if __name__ == '__main__': |
309 | 365 | dbname = 'enwiki' |
310 | 366 | collection = 'editors' |
311 | | - dataset_launcher(dbname, collection, generate_cohort_dataset) |
312 | | - dataset_launcher(dbname, collection, generate_long_editor_dataset) |
313 | | - dataset_launcher(dbname, collection, generate_wide_editor_dataset) |
| 367 | + #debug(dbname, collection) |
| 368 | + dataset_launcher(dbname, collection, generate_cohort_dataset_howie) |
| 369 | + #dataset_launcher(dbname, collection, generate_long_editor_dataset) |
| 370 | + #dataset_launcher(dbname, collection, generate_wide_editor_dataset) |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -30,22 +30,28 @@ |
31 | 31 | def create_datacontainer(datatype): |
32 | 32 | ''' |
33 | 33 | This function initializes an empty dictionary with as key the year (starting |
34 | | - 2001 and running through) and as value @init_value, in most cases this will |
| 34 | + 2001 and running through) and as value @datatype, in most cases this will |
35 | 35 | be zero so the dictionary will act as a running tally for a variable but |
36 | | - @init_value can also a list, [], or a dictionary, {}, or a set, set(). |
| 36 | + @datatype can also be a list, [], or a dictionary, {}, or a set, set(). |
37 | 37 | ''' |
38 | 38 | data = {} |
39 | 39 | year = datetime.datetime.now().year + 1 |
40 | 40 | for x in xrange(2001, year): |
41 | | - data[str(x)] = add_datatype(datatype) |
| 41 | + data[x] = add_datatype(datatype) |
42 | 42 | return data |
43 | 43 | |
| 44 | +def add_windows_to_datacontainer(datacontainer, windows): |
| 45 | + for dc in datacontainer: |
| 46 | + for w in windows: |
| 47 | + datacontainer[dc][w] = add_datatype() |
44 | 48 | |
| 49 | + return datacontainer |
| 50 | + |
45 | 51 | def add_months_to_datacontainer(datacontainer, datatype): |
46 | 52 | for dc in datacontainer: |
47 | 53 | datacontainer[dc] = {} |
48 | 54 | for x in xrange(1, 13): |
49 | | - datacontainer[dc][str(x)] = add_datatype(datatype) |
| 55 | + datacontainer[dc][x] = add_datatype(datatype) |
50 | 56 | |
51 | 57 | return datacontainer |
52 | 58 | |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -64,14 +64,6 @@ |
65 | 65 | return '%s' % (self.id) |
66 | 66 | |
67 | 67 | def __call__(self): |
68 | | - #self.mongo = db.init_mongo_db(self.dbname) |
69 | | -# input_db = self.mongo[self.collection] |
70 | | -# output_db = self.mongo[self.collection + '_dataset'] |
71 | | -# |
72 | | -# output_db.ensure_index('editor') |
73 | | -# output_db.create_index('editor') |
74 | | -# output_db.ensure_index('year_joined') |
75 | | -# output_db.create_index('year_joined') |
76 | 68 | |
77 | 69 | editor = self.input_db.find_one({'editor': self.id}) |
78 | 70 | if editor == None: |
— | — | @@ -79,14 +71,20 @@ |
80 | 72 | edits = editor['edits'] |
81 | 73 | username = editor['username'] |
82 | 74 | monthly_edits = determine_edits_by_month(edits) |
| 75 | + monthly_edits = db.stringify_keys(monthly_edits) |
83 | 76 | edits = sort_edits(edits) |
84 | 77 | edit_count = len(edits) |
85 | 78 | new_wikipedian = edits[9]['date'] |
86 | 79 | first_edit = edits[0]['date'] |
87 | 80 | final_edit = edits[-1]['date'] |
88 | 81 | edits_by_year = determine_edits_by_year(edits) |
| 82 | + edits_by_year = db.stringify_keys(edits_by_year) |
| 83 | + last_edit_by_year = determine_last_edit_by_year(edits) |
| 84 | + last_edit_by_year = db.stringify_keys(last_edit_by_year) |
89 | 85 | articles_by_year = determine_articles_by_year(edits) |
| 86 | + articles_by_year = db.stringify_keys(articles_by_year) |
90 | 87 | edits = edits[:10] |
| 88 | + |
91 | 89 | self.output_db.insert({'editor': self.id, |
92 | 90 | 'edits': edits, |
93 | 91 | 'edits_by_year': edits_by_year, |
— | — | @@ -96,17 +94,28 @@ |
97 | 95 | 'first_edit': first_edit, |
98 | 96 | 'articles_by_year': articles_by_year, |
99 | 97 | 'monthly_edits': monthly_edits, |
| 98 | + 'last_edit_by_year': last_edit_by_year, |
100 | 99 | 'username': username |
101 | 100 | }) |
102 | 101 | |
103 | 102 | |
| 103 | +def determine_last_edit_by_year(edits): |
| 104 | + datacontainer = shaper.create_datacontainer(0) |
| 105 | + for edit in edits: |
| 106 | + edit = edit['date'] |
| 107 | + if datacontainer[edit.year] == 0: |
| 108 | + datacontainer[edit.year] = edit |
| 109 | + elif datacontainer[edit.year] < edit: |
| 110 | + datacontainer[edit.year] = edit |
| 111 | + return datacontainer |
| 112 | + |
104 | 113 | def determine_edits_by_month(edits): |
105 | 114 | datacontainer = shaper.create_datacontainer(0.0) |
106 | 115 | datacontainer = shaper.add_months_to_datacontainer(datacontainer, 0.0) |
107 | 116 | for year in edits: |
108 | 117 | for edit in edits[year]: |
109 | | - m = str(edit['date'].month) |
110 | | - datacontainer[year][m] += 1 |
| 118 | + m = edit['date'].month |
| 119 | + datacontainer[int(year)][m] += 1 |
111 | 120 | return datacontainer |
112 | 121 | |
113 | 122 | |
— | — | @@ -116,7 +125,7 @@ |
117 | 126 | ''' |
118 | 127 | edits = shaper.create_datacontainer(0.0) |
119 | 128 | for date in dates: |
120 | | - year = str(date['date'].year) |
| 129 | + year = date['date'].year |
121 | 130 | edits[year] += 1 |
122 | 131 | return edits |
123 | 132 | |
— | — | @@ -128,7 +137,7 @@ |
129 | 138 | ''' |
130 | 139 | articles = shaper.create_datacontainer('set') |
131 | 140 | for date in dates: |
132 | | - year = str(date['date'].year) |
| 141 | + year = date['date'].year |
133 | 142 | articles[year].add(date['article']) |
134 | 143 | for year in articles: |
135 | 144 | articles[year] = len(articles[year]) |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -32,7 +32,10 @@ |
33 | 33 | |
34 | 34 | |
35 | 35 | def store_editors(input, dbname, collection): |
36 | | - filename = utils.retrieve_file_list(input, 'txt', mask=None)[0] |
| 36 | + filename = utils.retrieve_file_list(input, 'txt', mask=None) |
| 37 | + if len(filename) > 1: |
| 38 | + filename = [f for f in filename if f.find('final') > -1] |
| 39 | + filename = ''.join(filename) |
37 | 40 | fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding) |
38 | 41 | mongo = db.init_mongo_db(dbname) |
39 | 42 | collection = mongo[collection] |
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | |
22 | 22 | import sys |
23 | 23 | sys.path.append('..') |
| 24 | +import bson |
24 | 25 | |
25 | 26 | import configuration |
26 | 27 | settings = configuration.Settings() |
— | — | @@ -44,7 +45,8 @@ |
45 | 46 | def add(self, key, value): |
46 | 47 | if value == 'NEXT': |
47 | 48 | self.n += 1 |
48 | | - self.insert(key, self.editors[key]['edits'], self.editors[key]['username']) |
| 49 | + edits = db.stringify_keys(self.editors[key]['edits']) |
| 50 | + self.insert(key, edits, self.editors[key]['username']) |
49 | 51 | del self.editors[key] |
50 | 52 | else: |
51 | 53 | if key not in self.editors: |
— | — | @@ -55,11 +57,10 @@ |
56 | 58 | else: |
57 | 59 | value.pop('username') |
58 | 60 | |
59 | | - year = str(value['date'].year) |
| 61 | + year = value['date'].year |
60 | 62 | self.editors[key]['edits'][year].append(value) |
61 | 63 | self.editors[key]['obs'] += 1 |
62 | 64 | |
63 | | - |
64 | 65 | def update(self, editor, values): |
65 | 66 | self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
66 | 67 | |
— | — | @@ -68,9 +69,10 @@ |
69 | 70 | Adding the safe=True statement slows down the insert process but this assures that all data |
70 | 71 | will be written. |
71 | 72 | ''' |
72 | | - self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
73 | | - #except: |
74 | | - # return False |
| 73 | + try: |
| 74 | + self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
| 75 | + except bson.errors.InvalidDocument: |
| 76 | + print 'BSON document too large' |
75 | 77 | |
76 | 78 | def store(self): |
77 | 79 | utils.store_object(self, settings.binary_location, self.__repr__()) |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -74,11 +74,27 @@ |
75 | 75 | mongo.collection.ensure_index(key) |
76 | 76 | |
77 | 77 | |
| 78 | +def stringify_keys(obj): |
| 79 | + ''' |
| 80 | + @obj should be a dictionary where the keys are not yet strings. This function |
| 81 | + is called just prior to any insert / update query in mongo because mongo only |
| 82 | + accepts strings as keys. |
| 83 | + ''' |
| 84 | + d = {} |
| 85 | + for o in obj: |
| 86 | + if type(obj[o]) == type({}): |
| 87 | + obj[o] = stringify_keys(obj[o]) |
| 88 | + d[str(o)] = obj[o] |
| 89 | + return d |
| 90 | + |
78 | 91 | def retrieve_distinct_keys(dbname, collection, field): |
79 | 92 | #mongo = init_mongo_db(dbname) |
80 | 93 | #editors = mongo[collection] |
81 | 94 | #ids = retrieve_distinct_keys_mapreduce(editors, field) |
82 | | - |
| 95 | + ''' |
| 96 | + TODO: figure out how big the index is and then take appropriate action, index < 4mb |
| 97 | + just do a distinct query, index > 4mb do a map reduce. |
| 98 | + ''' |
83 | 99 | if utils.check_file_exists(settings.binary_location, '%s_%s.bin' % (dbname, field)): |
84 | 100 | ids = utils.load_object(settings.binary_location, '%s_%s.bin' % (dbname, field)) |
85 | 101 | else: |