Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -17,8 +17,10 @@ |
18 | 18 | __date__ = '2010-12-10' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | - |
| 21 | +from multiprocessing import JoinableQueue, Lock, Manager, RLock |
| 22 | +from Queue import Empty |
22 | 23 | import sys |
| 24 | +import cPickle |
23 | 25 | import os |
24 | 26 | import progressbar |
25 | 27 | import datetime |
— | — | @@ -27,83 +29,151 @@ |
28 | 30 | sys.path.append('..') |
29 | 31 | |
30 | 32 | import inventory |
| 33 | +import manage as manager |
31 | 34 | from classes import dataset |
32 | | -from classes import settings |
33 | | -settings = settings.Settings() |
| 35 | +from classes import runtime_settings |
| 36 | +from classes import consumers |
34 | 37 | from database import db |
35 | 38 | from utils import timer |
36 | 39 | from utils import log |
37 | 40 | |
| 41 | +class Analyzer(consumers.BaseConsumer): |
38 | 42 | |
| 43 | + def __init__(self, rts, tasks, result, var): |
| 44 | + super(Analyzer, self).__init__(rts, tasks, result) |
| 45 | + self.var = var |
39 | 46 | |
40 | | -def generate_chart_data(project, collection, language_code, func, encoder, **kwargs): |
41 | | - ''' |
42 | | - This is the entry function to be called to generate data for creating charts. |
43 | | - ''' |
44 | | - stopwatch = timer.Timer() |
45 | | - res = True |
46 | | - dbname = '%s%s' % (language_code, project) |
| 47 | + def convert_synchronized_objects(self): |
| 48 | + for obs in self.var: |
| 49 | + obs = self.var[obs] |
| 50 | + obs.data = obs.data.value |
| 51 | + |
| 52 | + def store(self): |
| 53 | + #self.convert_synchronized_objects() |
| 54 | + location = os.path.join(self.rts.binary_location, '%s_%s.bin' % (self.var.name, self.name)) |
| 55 | + fh = open(location, 'wb') |
| 56 | + cPickle.dump(self.var, fh) |
| 57 | + fh.close() |
| 58 | + |
| 59 | + def run(self): |
| 60 | + ''' |
| 61 | + Generic loop function that loops over all the editors of a Wikipedia |
| 62 | + project and then calls the function that does the actual aggregation. |
| 63 | + ''' |
| 64 | + mongo = db.init_mongo_db(self.rts.dbname) |
| 65 | + coll = mongo[self.rts.editors_dataset] |
| 66 | + while True: |
| 67 | + try: |
| 68 | + task = self.tasks.get(block=False) |
| 69 | + self.tasks.task_done() |
| 70 | + if task == None: |
| 71 | + #print self.var.number_of_obs(), len(self.var.obs) |
| 72 | + #self.store() |
| 73 | + self.result.put(self.var) |
| 74 | + break |
| 75 | + editor = coll.find_one({'editor': task.editor}) |
| 76 | + |
| 77 | + task.plugin(self.var, editor, dbname=self.rts.dbname) |
| 78 | + self.result.put(True) |
| 79 | + except Empty: |
| 80 | + pass |
| 81 | + |
| 82 | +class Task: |
| 83 | + def __init__(self, plugin, editor): |
| 84 | + self.plugin = plugin |
| 85 | + self.editor = editor |
| 86 | + |
| 87 | + |
| 88 | +def retrieve_plugin(func): |
47 | 89 | functions = inventory.available_analyses() |
48 | 90 | try: |
49 | | - func = functions[func] |
| 91 | + return functions[func] |
50 | 92 | except KeyError: |
51 | 93 | return False |
52 | 94 | |
53 | | - print 'Exporting data for chart: %s' % func.func_name |
54 | | - print 'Project: %s' % dbname |
55 | | - print 'Dataset: %s' % collection |
56 | 95 | |
57 | | - ds = loop_editors(dbname, project, collection, language_code, func, encoder, **kwargs) |
| 96 | +def feedback(plugin, rts): |
| 97 | + print 'Exporting data for chart: %s' % plugin.func_name |
| 98 | + print 'Project: %s' % rts.dbname |
| 99 | + print 'Dataset: %s' % rts.editors_dataset |
| 100 | + |
| 101 | + |
| 102 | +def write_output(ds, rts, stopwatch): |
58 | 103 | ds.create_filename() |
59 | | - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, ds.filename) |
| 104 | + print 'Storing dataset: %s' % os.path.join(rts.dataset_location, |
| 105 | + ds.filename) |
60 | 106 | ds.write(format='csv') |
61 | | - |
62 | | - print 'Serializing dataset to %s_%s' % (dbname, 'charts') |
63 | | - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start') |
| 107 | + print 'Serializing dataset to %s_%s' % (rts.dbname, 'charts') |
| 108 | + log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='start') |
64 | 109 | ds.write(format='mongo') |
65 | | - stopwatch.elapsed() |
66 | | - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
| 110 | + log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='finish') |
67 | 111 | |
68 | | - ds.summary() |
69 | | - return res |
70 | 112 | |
71 | | - |
72 | | -def loop_editors(dbname, project, collection, language_code, func, encoder, **kwargs): |
| 113 | +def generate_chart_data(rts, func, **kwargs): |
73 | 114 | ''' |
74 | | - Generic loop function that loops over all the editors of a Wikipedia project |
75 | | - and then calls the function that does the actual aggregation. |
| 115 | + This is the entry function to be called to generate data for creating |
| 116 | + charts. |
76 | 117 | ''' |
77 | | - mongo = db.init_mongo_db(dbname) |
78 | | - coll = mongo[collection] |
79 | | - editors = db.retrieve_distinct_keys(dbname, collection, 'editor') |
| 118 | + stopwatch = timer.Timer() |
| 119 | + plugin = retrieve_plugin(func) |
| 120 | + feedback(plugin, rts) |
80 | 121 | |
81 | 122 | |
82 | | - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian') |
83 | | - pbar = progressbar.ProgressBar(maxval=len(editors)).start() |
84 | | - print 'Number of editors: %s' % len(editors) |
85 | | - |
| 123 | + tasks = JoinableQueue() |
| 124 | + result = JoinableQueue() |
| 125 | + mgr = Manager() |
| 126 | + lock = mgr.RLock() |
| 127 | + editors = db.retrieve_distinct_keys(rts.dbname, rts.editors_dataset, 'editor') |
| 128 | + min_year, max_year = determine_project_year_range(rts.dbname, |
| 129 | + rts.editors_dataset, |
| 130 | + 'new_wikipedian') |
86 | 131 | fmt = kwargs.pop('format', 'long') |
| 132 | + time_unit = kwargs.pop('time_unit', 'year') |
87 | 133 | kwargs['min_year'] = min_year |
88 | 134 | kwargs['max_year'] = max_year |
89 | | - variables = [] |
90 | | - ds = dataset.Dataset(func.func_name, |
91 | | - project, |
92 | | - coll.name, |
93 | | - language_code, |
94 | | - encoder, |
95 | | - variables, |
96 | | - format=fmt) |
97 | | - var = dataset.Variable('count', **kwargs) |
98 | 135 | |
| 136 | + pbar = progressbar.ProgressBar(maxval=len(editors)).start() |
| 137 | + var = dataset.Variable('count', time_unit, lock, **kwargs) |
| 138 | + |
99 | 139 | for editor in editors: |
100 | | - editor = coll.find_one({'editor': editor}) |
101 | | - var = func(var, editor, dbname=dbname) |
102 | | - pbar.update(pbar.currval + 1) |
| 140 | + tasks.put(Task(plugin, editor)) |
103 | 141 | |
| 142 | + consumers = [Analyzer(rts, tasks, result, var) for |
| 143 | + x in xrange(rts.number_of_processes)] |
| 144 | + |
| 145 | + for x in xrange(rts.number_of_processes): |
| 146 | + tasks.put(None) |
| 147 | + |
| 148 | + for w in consumers: |
| 149 | + w.start() |
| 150 | + |
| 151 | + ppills = rts.number_of_processes |
| 152 | + while True: |
| 153 | + while ppills > 0: |
| 154 | + try: |
| 155 | + res = result.get(block=True) |
| 156 | + if res == True: |
| 157 | + pbar.update(pbar.currval + 1) |
| 158 | + else: |
| 159 | + ppills -= 1 |
| 160 | + var = res |
| 161 | + except Empty: |
| 162 | + pass |
| 163 | + break |
| 164 | + |
| 165 | + |
| 166 | + tasks.join() |
| 167 | + ds = dataset.Dataset(plugin.func_name, rts, format=fmt) |
| 168 | + #var = consumers[0].var |
104 | 169 | ds.add_variable(var) |
105 | | - return ds |
106 | 170 | |
| 171 | + stopwatch.elapsed() |
| 172 | + write_output(ds, rts, stopwatch) |
107 | 173 | |
| 174 | + ds.summary() |
| 175 | + return True |
| 176 | + |
| 177 | + |
108 | 178 | def determine_project_year_range(dbname, collection, var): |
109 | 179 | ''' |
110 | 180 | Determine the first and final year for the observed data |
— | — | @@ -120,16 +190,24 @@ |
121 | 191 | |
122 | 192 | |
123 | 193 | if __name__ == '__main__': |
124 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
125 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
126 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year') |
127 | | - #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year') |
128 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year') |
129 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0) |
130 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
131 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
132 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', 'to_bar_json', time_unit='year', cutoff=0) |
133 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', 'to_bar_json', time_unit='year', cutoff=0) |
134 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', 'to_bar_json', time_unit='month', cutoff=0) |
| 194 | + project, language, parser = manager.init_args_parser() |
| 195 | + args = parser.parse_args(['django']) |
| 196 | + rts = runtime_settings.init_environment('wiki', 'en', args) |
135 | 197 | |
136 | | - #available_analyses() |
| 198 | + #TEMP FIX, REMOVE |
| 199 | + rts.dbname = 'enwiki' |
| 200 | + rts.editors_dataset = 'editors_dataset' |
| 201 | + #END TEMP FIX |
| 202 | + |
| 203 | + generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10) |
| 204 | +# generate_chart_data(rts, 'edit_patterns', time_unit='year', cutoff=5) |
| 205 | +# generate_chart_data(rts, 'total_number_of_new_wikipedians', time_unit='year') |
| 206 | +# generate_chart_data(rts, 'total_number_of_articles', time_unit='year') |
| 207 | +# generate_chart_data(rts, 'total_cumulative_edits', time_unit='year') |
| 208 | +# generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=5, cum_cutoff=0) |
| 209 | +# generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
| 210 | +# generate_chart_data(rts, 'cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
| 211 | +# generate_chart_data(rts, 'histogram_edits', time_unit='year', cutoff=0) |
| 212 | +# generate_chart_data(rts, 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
| 213 | +# generate_chart_data(rts, 'new_editor_count', time_unit='month', cutoff=0) |
| 214 | +# #available_analyses() |
Index: trunk/tools/editor_trends/classes/consumers.py |
— | — | @@ -26,5 +26,3 @@ |
27 | 27 | self.rts = rts |
28 | 28 | self.tasks = tasks |
29 | 29 | self.result = result |
30 | | - |
31 | | - |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -25,8 +25,9 @@ |
26 | 26 | import sys |
27 | 27 | import hashlib |
28 | 28 | from pymongo.son_manipulator import SONManipulator |
29 | | -from multiprocessing import Lock |
| 29 | +from multiprocessing import RLock, Array, Value |
30 | 30 | from texttable import Texttable |
| 31 | +from datetime import timedelta |
31 | 32 | |
32 | 33 | |
33 | 34 | if '..' not in sys.path: |
— | — | @@ -90,6 +91,7 @@ |
91 | 92 | to lookup an Observation or Variable. |
92 | 93 | ''' |
93 | 94 | id = '_'.join([str(var) for var in vars]) |
| 95 | + #return id |
94 | 96 | m = hashlib.md5() |
95 | 97 | m.update(id) |
96 | 98 | #print id, m.hexdigest() |
— | — | @@ -121,7 +123,8 @@ |
122 | 124 | of the date |
123 | 125 | ''' |
124 | 126 | assert self.time_unit == 'year' or self.time_unit == 'month' \ |
125 | | - or self.time_unit == 'day', 'Time unit should either be year, month or day.' |
| 127 | + or self.time_unit == 'day', \ |
| 128 | + 'Time unit should either be year, month or day.' |
126 | 129 | |
127 | 130 | if self.time_unit == 'year': |
128 | 131 | datum = datetime.datetime(date.year, 1, 1) |
— | — | @@ -139,24 +142,29 @@ |
140 | 143 | Determine the width of a date range for an observation. |
141 | 144 | ''' |
142 | 145 | if self.time_unit == 'year': |
143 | | - return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1) |
| 146 | + return datetime.datetime(date.year, 12, 31), \ |
| 147 | + datetime.datetime(date.year, 1, 1) |
144 | 148 | elif self.time_unit == 'month': |
145 | 149 | day = calendar.monthrange(date.year, date.month)[1] |
146 | | - return datetime.datetime(date.year, date.month, day), datetime.datetime(date.year, date.month, 1) |
| 150 | + return datetime.datetime(date.year, date.month, day), \ |
| 151 | + datetime.datetime(date.year, date.month, 1) |
147 | 152 | else: |
148 | | - return datetime.datetime(date.year, date.month, date.day), datetime.datetime(date.year, date.month, date.day) |
| 153 | + return datetime.datetime(date.year, date.month, date.day), \ |
| 154 | + datetime.datetime(date.year, date.month, date.day) |
149 | 155 | |
150 | 156 | |
151 | 157 | class Observation(Data): |
152 | | - lock = Lock() |
153 | 158 | ''' |
154 | 159 | The smallest unit, here the actual data is being stored. |
155 | 160 | Time_unit should either be 'year', 'month' or 'day'. |
156 | 161 | ''' |
157 | 162 | def __init__(self, date, time_unit, id, meta): |
158 | | - assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.' |
| 163 | + assert isinstance(date, datetime.datetime), '''Date variable should be |
| 164 | + a datetime.datetime instance.''' |
| 165 | + #self.lock = lock #Lock() |
159 | 166 | self.date = date |
160 | 167 | self.data = 0 |
| 168 | + #self.data = Value('i', 0) |
161 | 169 | self.time_unit = time_unit |
162 | 170 | self.t1, self.t0 = self.set_date_range(date) |
163 | 171 | self.id = id |
— | — | @@ -164,7 +172,8 @@ |
165 | 173 | self.count = 0 |
166 | 174 | for mt in meta: |
167 | 175 | if isinstance(mt, float): |
168 | | - raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.' |
| 176 | + raise Exception, '''Mongo does not allow a dot "." in the name |
| 177 | + of a key, please use an integer or string as key.''' |
169 | 178 | elif not isinstance(mt, list): |
170 | 179 | setattr(self, mt, meta[mt]) |
171 | 180 | self.props.append(mt) |
— | — | @@ -174,7 +183,9 @@ |
175 | 184 | return '%s' % self.date |
176 | 185 | |
177 | 186 | def __str__(self): |
178 | | - return 'range: %s:%s' % (self.t0, self.t1) |
| 187 | + return 'range: %s-%s-%s : %s-%s-%s' % (self.t0.month, self.t0.day, \ |
| 188 | + self.t0.year, self.t1.month, \ |
| 189 | + self.t1.day, self.t1.year) |
179 | 190 | |
180 | 191 | def __iter__(self): |
181 | 192 | for obs in self.data: |
— | — | @@ -186,17 +197,19 @@ |
187 | 198 | def add(self, value): |
188 | 199 | ''' |
189 | 200 | ''' |
190 | | - self.lock.acquire() |
191 | | - try: |
192 | | - if isinstance(value, list): |
193 | | - if self.count == 0: |
194 | | - self.data = [] |
195 | | - self.data.append(value) |
196 | | - else: |
197 | | - self.data += value |
198 | | - finally: |
199 | | - self.count += 1 |
200 | | - self.lock.release() |
| 201 | + #self.lock.acquire() |
| 202 | + #try: |
| 203 | + if isinstance(value, list): |
| 204 | + if self.count == 0: |
| 205 | + self.data = [] |
| 206 | + #self.data = Array('i', 0) |
| 207 | + self.data.append(value) |
| 208 | + else: |
| 209 | + self.data += value |
| 210 | + #self.data.value += value |
| 211 | + #finally: |
| 212 | + self.count += 1 |
| 213 | + #self.lock.release() |
201 | 214 | |
202 | 215 | |
203 | 216 | def get_date_range(self): |
— | — | @@ -207,10 +220,9 @@ |
208 | 221 | ''' |
209 | 222 | This class constructs a time-based variable. |
210 | 223 | ''' |
211 | | - |
212 | | - def __init__(self, name, time_unit, **kwargs): |
| 224 | + def __init__(self, name, time_unit, lock, **kwargs): |
213 | 225 | self.name = name |
214 | | - self.lock = Lock() |
| 226 | + self.lock = lock |
215 | 227 | self.obs = {} |
216 | 228 | self.time_unit = time_unit |
217 | 229 | self.groupbys = [] |
— | — | @@ -249,7 +261,6 @@ |
250 | 262 | for key in self: |
251 | 263 | yield (key, self.obs[key]) |
252 | 264 | |
253 | | - |
254 | 265 | def get_data(self): |
255 | 266 | return [o for o in self.itervalues()] |
256 | 267 | |
— | — | @@ -257,6 +268,8 @@ |
258 | 269 | self.lock.acquire() |
259 | 270 | try: |
260 | 271 | obs = self.obs.get(id, Observation(date, self.time_unit, id, meta)) |
| 272 | + #self.obs[id] = obs |
| 273 | + x = len(self.obs) |
261 | 274 | finally: |
262 | 275 | self.lock.release() |
263 | 276 | return obs |
— | — | @@ -264,10 +277,10 @@ |
265 | 278 | def add(self, date, value, meta={}): |
266 | 279 | ''' |
267 | 280 | The add function is used to add an observation to a variable. An |
268 | | - observation is always grouped by the combination of the date and time_unit. |
269 | | - Time_unit is a property of a Variable and indicates how granular the |
270 | | - observations should be grouped. For example, if time_unit == year then |
271 | | - all observations in a given year will be grouped. |
| 281 | + observation is always grouped by the combination of the date and |
| 282 | + time_unit. Time_unit is a property of a Variable and indicates how |
| 283 | + granular the observations should be grouped. For example, if |
| 284 | + time_unit == year then all observations in a given year will be grouped. |
272 | 285 | When calling add you should supply at least two variables: |
273 | 286 | 1) date: when did the observation happen |
274 | 287 | 2) value: an integer or float that was observed on that date |
— | — | @@ -276,25 +289,25 @@ |
277 | 290 | For example, if you add {'experience': 3} as the meta dict when calling |
278 | 291 | add then you will create an extra grouping called experience and all |
279 | 292 | future observations who fall in the same date range and the same |
280 | | - exerience level will be grouped by that particular observation. You |
281 | | - can use as many extra groupings as you want but usually one extra grouping |
282 | | - should be enough. |
| 293 | + experience level, in this case 3, will be grouped by that particular |
| 294 | + observation. You can use as many extra groupings as you want but |
| 295 | + usually one extra grouping should be enough. |
283 | 296 | ''' |
284 | | - assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.' |
| 297 | + assert isinstance(meta, dict), '''The meta variable should be a dict |
| 298 | + (either empty or with variables to group by.''' |
285 | 299 | start, end = self.set_date_range(date) |
286 | 300 | values = meta.values() |
287 | 301 | values.insert(0, end) |
288 | 302 | values.insert(0, start) |
289 | 303 | id = self.__hash__(values) |
290 | | -# print values |
291 | | - self.lock.acquire() |
| 304 | + obs = self.get_observation(id, date, meta) |
| 305 | + obs.add(value) |
292 | 306 | try: |
293 | | - obs = self.get_observation(id, date, meta) |
294 | | - obs.add(value) |
| 307 | + self.lock.acquire() |
295 | 308 | self.obs[id] = obs |
296 | 309 | finally: |
297 | 310 | self.lock.release() |
298 | | - print len(self.obs) |
| 311 | + #print date, id, meta.values(), obs.count, len(self.obs) |
299 | 312 | |
300 | 313 | def number_of_obs(self): |
301 | 314 | n = 0 |
— | — | @@ -341,7 +354,6 @@ |
342 | 355 | ''' |
343 | 356 | |
344 | 357 | def __init__(self, chart, rts, vars=None, **kwargs): |
345 | | - #project, collection, language_code |
346 | 358 | self.encoder, chart, charts = json_encoders.get_json_encoder(chart) |
347 | 359 | if self.encoder == None: |
348 | 360 | raise exceptions.UnknownChartError(chart, charts) |
— | — | @@ -377,8 +389,8 @@ |
378 | 390 | print 'Project: %s%s' % (self.language_code, self.project) |
379 | 391 | print 'JSON encoder: %s' % self.encoder |
380 | 392 | print 'Raw data was retrieved from: %s%s/%s' % (self.language_code, |
381 | | - self.project, |
382 | | - self.collection) |
| 393 | + self.project, |
| 394 | + self.collection) |
383 | 395 | |
384 | 396 | def create_filename(self): |
385 | 397 | ''' |
— | — | @@ -422,7 +434,7 @@ |
423 | 435 | self.variables.append(var.name) |
424 | 436 | setattr(self, var.name, var) |
425 | 437 | else: |
426 | | - raise TypeError('You can only instance of Variable to a dataset.') |
| 438 | + raise TypeError('You can only add an instance of Variable to a dataset.') |
427 | 439 | |
428 | 440 | def write(self, format='csv'): |
429 | 441 | ''' |
— | — | @@ -483,14 +495,26 @@ |
484 | 496 | float_nums = [float(x) for x in number_list] |
485 | 497 | return sum(float_nums) / len(number_list) |
486 | 498 | |
| 499 | + def get_min(self, number_list): |
| 500 | + if number_list == []: |
| 501 | + return '.' |
| 502 | + else: |
| 503 | + return min(number_list) |
| 504 | + |
| 505 | + def get_max(self, number_list): |
| 506 | + if number_list == []: |
| 507 | + return '.' |
| 508 | + else: |
| 509 | + return max(number_list) |
| 510 | + |
487 | 511 | def descriptives(self): |
488 | 512 | for variable in self: |
489 | 513 | data = variable.get_data() |
490 | 514 | variable.mean = self.get_mean(data) |
491 | 515 | variable.median = self.get_median(data) |
492 | 516 | variable.sds = self.get_standard_deviation(data) |
493 | | - variable.min = min(data) |
494 | | - variable.max = max(data) |
| 517 | + variable.min = self.get_min(data) |
| 518 | + variable.max = self.get_max(data) |
495 | 519 | variable.num_obs = variable.number_of_obs() |
496 | 520 | variable.num_dates = len(variable) |
497 | 521 | variable.first_obs, variable.last_obs = variable.get_date_range() |
— | — | @@ -499,7 +523,7 @@ |
500 | 524 | self.descriptives() |
501 | 525 | table = Texttable(max_width=0) |
502 | 526 | vars = ['Variable', 'Mean', 'Median', 'SD', 'Minimum', 'Maximum', |
503 | | - 'Num Obs', 'Num of\nUnique Dates', 'First Obs', 'Final Obs'] |
| 527 | + 'Num Obs', 'Num of\nUnique Groups', 'First Obs', 'Final Obs'] |
504 | 528 | table.add_row([var for var in vars]) |
505 | 529 | table.set_cols_align(['r' for v in vars]) |
506 | 530 | table.set_cols_valign(['m' for v in vars]) |
— | — | @@ -521,29 +545,41 @@ |
522 | 546 | |
523 | 547 | d1 = datetime.datetime.today() |
524 | 548 | d2 = datetime.datetime(2007, 6, 7) |
525 | | - ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [ |
526 | | - {'name': 'count', 'time_unit': 'year'}, |
527 | | - # {'name': 'testest', 'time_unit': 'year'} |
528 | | - ]) |
529 | | - ds.count.add(d1, 10, {'exp': 3}) |
530 | | - ds.count.add(d1, 135, {'exp': 3}) |
531 | | - ds.count.add(d2, 1, {'exp': 4}) |
532 | | - #ds.testest.add(d1, 135) |
533 | | - #ds.testest.add(d2, 535) |
534 | | - ds.summary() |
535 | | - ds.write(format='csv') |
536 | | -# v = Variable('test', 'year') |
537 | | - ds.encode() |
| 549 | +# ds = Dataset('histogram', rts, [{'name': 'count', 'time_unit': 'year'}, |
| 550 | +# #{'name': 'testest', 'time_unit': 'year'} |
| 551 | +# ]) |
| 552 | +# ds.count.add(d1, 10, {'exp': 3}) |
| 553 | +# ds.count.add(d1, 135, {'exp': 3}) |
| 554 | +# ds.count.add(d2, 1, {'exp': 4}) |
| 555 | +# #ds.testest.add(d1, 135) |
| 556 | +# #ds.testest.add(d2, 535) |
| 557 | +# ds.summary() |
| 558 | +# ds.write(format='csv') |
| 559 | +# |
| 560 | +# ds.encode() |
| 561 | + #name, time_unit, lock, **kwargs |
| 562 | + lock = RLock() |
| 563 | + v = Variable('test', 'year', lock) |
| 564 | + v.add(d1, 10, {'exp': 3, 'test': 10}) |
| 565 | + v.add(d1, 135, {'exp': 3, 'test': 10}) |
| 566 | + v.add(d2, 1, {'exp': 4, 'test': 10}) |
| 567 | + v.add(d2, 1, {'exp': 4, 'test': 10}) |
| 568 | + v.add(d2 , 1, {'exp': 3, 'test': 8}) |
| 569 | + v.add(d2 , 1, {'exp': 2, 'test': 10}) |
| 570 | + v.add(d2 , 1, {'exp': 4, 'test': 11}) |
| 571 | + v.add(d2 , 1, {'exp': 8, 'test': 13}) |
| 572 | + v.add(d2 , 1, {'exp': 9, 'test': 12}) |
538 | 573 | |
539 | 574 | |
540 | | - # mongo.test.insert({'variables': ds}) |
| 575 | +# v.add(d2 + timedelta(days=400), 1, {'exp': 4, 'test': 10}) |
| 576 | +# v.add(d2 + timedelta(days=900), 1, {'exp': 3, 'test': 8}) |
| 577 | +# v.add(d2 + timedelta(days=1200), 1, {'exp': 2, 'test': 10}) |
| 578 | +# v.add(d2 + timedelta(days=1600), 1, {'exp': 4, 'test': 11}) |
| 579 | +# v.add(d2 + timedelta(days=2000), 1, {'exp': 8, 'test': 13}) |
| 580 | +# v.add(d2 + timedelta(days=2400), 1, {'exp': 9, 'test': 12}) |
541 | 581 | |
542 | | - # v.add(d2 , 5) |
543 | | - #o = v.get_observation(d2) |
544 | | -# ds = rawdata.find_one({'project': 'wiki', |
545 | | -# 'language_code': 'en', |
546 | | -# 'hash': 'cohort_dataset_backward_bar'}) |
| 582 | + print len(v), v.number_of_obs() |
547 | 583 | |
548 | | - |
| 584 | + # mongo.test.insert({'variables': ds}) |
549 | 585 | if __name__ == '__main__': |
550 | 586 | debug() |