Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -14,13 +14,14 @@ |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | 17 | __email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-12-10' |
| 18 | +__date__ = '2010-12-10' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | 21 | from multiprocessing import JoinableQueue, Manager, RLock, Process |
22 | 22 | from multiprocessing.managers import BaseManager |
23 | 23 | from Queue import Empty |
24 | 24 | |
| 25 | +import types |
25 | 26 | import sys |
26 | 27 | import cPickle |
27 | 28 | import os |
— | — | @@ -54,7 +55,8 @@ |
55 | 56 | keys = var.obs.keys() |
56 | 57 | d = {} |
57 | 58 | for key in keys: |
58 | | - d[key] = cPickle.loads(var.obs[key]) |
| 59 | + d[key] = var.obs[key] |
| 60 | + #d[key] = cPickle.loads(var.obs[key]) |
59 | 61 | var.obs = d |
60 | 62 | return var |
61 | 63 | |
— | — | @@ -107,6 +109,7 @@ |
108 | 110 | |
109 | 111 | db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
110 | 112 | editors = db.retrieve_distinct_keys('editor') |
| 113 | + editors = editors[:500] |
111 | 114 | min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
112 | 115 | |
113 | 116 | fmt = kwargs.pop('format', 'long') |
— | — | @@ -159,7 +162,7 @@ |
160 | 163 | |
161 | 164 | tasks.join() |
162 | 165 | |
163 | | - reconstruct_observations(var) |
| 166 | + var = reconstruct_observations(var) |
164 | 167 | ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs) |
165 | 168 | ds.add_variable(var) |
166 | 169 | |
— | — | @@ -168,7 +171,28 @@ |
169 | 172 | |
170 | 173 | ds.summary() |
171 | 174 | |
| 175 | + for n, c in get_refcounts()[:100]: |
| 176 | + print '%10d %s' % (n, c.__name__) |
172 | 177 | |
| 178 | + |
| 179 | +def get_refcounts(): |
| 180 | +    '''Return a list of (refcount, class) pairs for loaded old-style classes, highest refcount first.''' |
| 181 | +    d = {} |
| 182 | +    # collect every old-style class (types.ClassType) exposed as an attribute of a loaded module |
| 183 | +    for m in sys.modules.values(): |
| 184 | +        for sym in dir(m): |
| 185 | +            o = getattr(m, sym, None)  # dir() may list names that getattr cannot resolve |
| 186 | +            if type(o) is types.ClassType: |
| 187 | +                d[o] = sys.getrefcount(o) |
| 188 | +    # swap to (refcount, class) so the list orders by refcount; |
| 189 | +    # ties fall back to comparing the class objects themselves |
| 190 | +    pairs = [(n, c) for c, n in d.items()] |
| 191 | +    pairs.sort(reverse=True) |
| 192 | +    return pairs |
| 193 | + |
| 194 | + |
| 195 | + |
| 196 | + |
173 | 197 | def determine_project_year_range(db, var): |
174 | 198 | ''' |
175 | 199 | Determine the first and final year for the observed data |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -27,7 +27,7 @@ |
28 | 28 | from multiprocessing import RLock |
29 | 29 | from texttable import Texttable |
30 | 30 | from datetime import timedelta |
31 | | - |
| 31 | +import cProfile |
32 | 32 | if '..' not in sys.path: |
33 | 33 | sys.path.append('..') |
34 | 34 | |
— | — | @@ -277,8 +277,8 @@ |
278 | 278 | return [o for o in self.itervalues()] |
279 | 279 | |
280 | 280 | def get_observation(self, key, date, meta): |
281 | | - '''Get a single observation based on a date key and posssibly meta data''' |
282 | | - return self.obs.get(key, Observation(date, self.time_unit, key, meta).serialize()) |
| 281 | + '''Get a single observation based on a date key and possibly meta data''' |
| 282 | + return self.obs.get(key, Observation(date, self.time_unit, key, meta)) |
283 | 283 | |
284 | 284 | def add(self, date, value, meta={}): |
285 | 285 | ''' |
— | — | @@ -295,12 +295,12 @@ |
296 | 296 | For example, if you add {'experience': 3} as the meta dict when calling |
297 | 297 | add then you will create an extra grouping called experience and all |
298 | 298 | future observations who fall in the same date range and the same |
299 | | - exerience level, in this case 3, will be grouped by that particular |
| 299 | + experience level, in this case 3, will be grouped by that particular |
300 | 300 | observation. You can use as many extra groupings as you want but |
301 | 301 | usually one extra grouping should be enough. |
302 | 302 | ''' |
303 | 303 | assert isinstance(meta, dict), '''The meta variable should be a dict |
304 | | - (either empty or with variables to group by.''' |
| 304 | + (either empty) or with variables to group by.''' |
305 | 305 | start, end = self.set_date_range(date) |
306 | 306 | values = meta.values() |
307 | 307 | values.insert(0, end) |
— | — | @@ -310,9 +310,9 @@ |
311 | 311 | self.lock.acquire() |
312 | 312 | try: |
313 | 313 | obs = self.get_observation(id, date, meta) |
314 | | - obs = cPickle.loads(obs) |
| 314 | + #obs = cPickle.loads(obs) |
315 | 315 | obs.add(value) |
316 | | - obs = obs.serialize() |
| 316 | + #obs = obs.serialize() |
317 | 317 | self.obs[id] = obs |
318 | 318 | finally: |
319 | 319 | self.lock.release() |
— | — | @@ -580,9 +580,10 @@ |
581 | 581 | else: |
582 | 582 | return max(number_list) |
583 | 583 | |
| 584 | + |
584 | 585 | def debug(): |
585 | 586 | db = storage.init_database('mongo', 'wikilytics', 'enwiki_charts') |
586 | | - db.add_son_manipulator(Transform()) |
| 587 | + #db.add_son_manipulator(Transform()) |
587 | 588 | |
588 | 589 | d1 = datetime.datetime.today() |
589 | 590 | d2 = datetime.datetime(2007, 6, 7) |
— | — | @@ -600,7 +601,7 @@ |
601 | 602 | # ds.encode() |
602 | 603 | #name, time_unit, lock, **kwargs |
603 | 604 | lock = RLock() |
604 | | - v = Variable('test', 'year', lock) |
| 605 | + v = Variable('test', 'year', lock, {}) |
605 | 606 | v.add(d1, 10, {'exp': 3, 'test': 10}) |
606 | 607 | v.add(d1, 135, {'exp': 3, 'test': 10}) |
607 | 608 | v.add(d2, 1, {'exp': 4, 'test': 10}) |
— | — | @@ -611,6 +612,7 @@ |
612 | 613 | v.add(d2 , 1, {'exp': 8, 'test': 13}) |
613 | 614 | v.add(d2 , 1, {'exp': 9, 'test': 12}) |
614 | 615 | |
| 616 | + #mem = get_refcounts() |
615 | 617 | |
616 | 618 | # v.add(d2 + timedelta(days=400), 1, {'exp': 4, 'test': 10}) |
617 | 619 | # v.add(d2 + timedelta(days=900), 1, {'exp': 3, 'test': 8}) |
— | — | @@ -619,8 +621,8 @@ |
620 | 622 | # v.add(d2 + timedelta(days=2000), 1, {'exp': 8, 'test': 13}) |
621 | 623 | # v.add(d2 + timedelta(days=2400), 1, {'exp': 9, 'test': 12}) |
622 | 624 | |
623 | | - print len(v), v.number_of_obs() |
| 625 | +# print len(v), v.number_of_obs() |
624 | 626 | |
625 | 627 | # mongo.test.insert({'variables': ds}) |
626 | 628 | if __name__ == '__main__': |
627 | | - debug() |
| 629 | + cProfile.run('debug()') |