Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -0,0 +1,129 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | + |
| 5 | +''' |
| 6 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 7 | +This program is free software; you can redistribute it and/or |
| 8 | +modify it under the terms of the GNU General Public License version 2 |
| 9 | +as published by the Free Software Foundation. |
| 10 | +This program is distributed in the hope that it will be useful, |
| 11 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 13 | +See the GNU General Public License for more details, at |
| 14 | +http://www.fsf.org/licenses/gpl.html |
| 15 | +''' |
| 16 | + |
# Module metadata. The author string is built with a join so that further
# authors can be appended, one per line.
__author__ = '\n'.join(('Diederik van Liere (dvanliere@gmail.com)',))
__author__email = 'dvanliere at gmail dot com'
__date__ = 'Oct 24, 2010'
__version__ = '0.1'
| 21 | + |
| 22 | +''' |
| 23 | +This module provides a simple caching mechanism to speed up the process of |
| 24 | +inserting records into MongoDB. The caching object works as follows: |
| 25 | +1) Each edit from an author is added to a dictionary |
| 26 | +2) Every 25000 edits, the object returns the x% of editors with the most cached edits, and these are |
| 27 | +then stored in MongoDB. By packaging multiple edits in a single commit, |
| 28 | +processing time is significantly reduced. |
| 29 | + |
| 30 | +This caching mechanism does not create any benefits for authors with single or |
| 31 | +very few edits. |
| 32 | + |
| 33 | +''' |
| 34 | + |
| 35 | + |
| 36 | +import sys |
| 37 | +import datetime |
| 38 | + |
| 39 | +import settings |
| 40 | +import db |
| 41 | + |
| 42 | + |
class EditorCache(object):
    '''
    Buffer edits per editor in memory and write them to MongoDB in batches.

    Edits are collected with add(). Every `flush_interval` calls to add(),
    the `flush_percentage` percent of editors holding the most buffered edits
    are written to `collection` and their buffers are reset. Call
    empty_all(100) to write out everything that is still buffered.

    Parameters:
        collection       -- collection object; this class calls its insert()
                            and update() methods.
        flush_interval   -- number of add() calls between automatic flushes
                            (default 25000); 0 or None disables them.
        flush_percentage -- percentage of editors written per automatic flush
                            (default 5.0).
    '''

    def __init__(self, collection, flush_interval=25000, flush_percentage=5.0):
        self.collection = collection
        # editor key -> {'obs': number of buffered edits, 'edits': [edit, ...]}
        self.editors = {}
        # Shallow size of this instance at construction; never refreshed.
        self.size = self.__sizeof__()
        # Total number of add() calls since construction.
        self.cumulative_n = 0
        self.time_started = datetime.datetime.now()
        # Number of buffered edits as measured at the start of the last flush.
        self.n = self.current_cache_size()
        # Ordinal of the next flush (only used in the progress message).
        self.emptied = 1
        self.flush_interval = flush_interval
        self.flush_percentage = flush_percentage

    def __repr__(self):
        '''Return a short summary string describing the state of the cache.'''
        return '<%s: %d editors, %d buffered edits, %d edits seen>' % (
            self.__class__.__name__, len(self.editors),
            self.current_cache_size(), self.cumulative_n)

    def _store_editor(self, key, value):
        '''
        Insert an empty document for editor `key` and remember its id.

        `value` is accepted for interface compatibility but is not used.
        '''
        # 'edits' is created as an array because update() appends to it
        # with $pushAll, which only operates on array fields.
        editor = self.collection.insert({'editor': key, 'edits': []})
        # setdefault avoids a KeyError when the editor is not cached yet.
        entry = self.editors.setdefault(key, {'obs': 0, 'edits': []})
        entry['id'] = str(editor)

    def current_cache_size(self):
        '''Return the total number of edits currently buffered.'''
        return sum(entry.get('obs', 0) for entry in self.editors.values())

    def add(self, key, value):
        '''
        Buffer edit `value` for editor `key`.

        Triggers an automatic partial flush every `flush_interval` calls.
        '''
        self.cumulative_n += 1
        entry = self.editors.get(key)
        if entry is None:
            entry = self.editors[key] = {'obs': 0, 'edits': []}
        # Always record the edit, including the very first one of an editor.
        entry['edits'].append(value)
        entry['obs'] += 1

        if self.flush_interval and self.cumulative_n % self.flush_interval == 0:
            self.empty_all(self.flush_percentage)

    def retrieve_top_k_editors(self, percentage):
        '''
        Return the keys of the `percentage` percent of cached editors that
        hold the most buffered edits, largest first (ties are ordered by key,
        descending). At least one key is returned unless the cache is empty.
        '''
        # Rank on the raw counts: dividing by the total cache size does not
        # change the order and would fail when nothing is buffered.
        ranked = sorted(((entry.get('obs', 0), key)
                         for key, entry in self.editors.items()),
                        reverse=True)
        limit = int((len(ranked) / 100.0) * percentage)
        if limit == 0:
            limit = 1
        return [key for _, key in ranked[:limit]]

    def update(self, editor, values):
        '''
        Append all `values` to the 'edits' field of the document of `editor`;
        upsert=True is passed so a missing document is created.
        '''
        self.collection.update({'editor': editor},
                               {'$pushAll': {'edits': values}}, upsert=True)

    def empty_all(self, percentage):
        '''
        Write buffered edits to the collection and reset the written buffers.

        With `percentage` below 100 only the top `percentage` percent of
        editors (see retrieve_top_k_editors) are written; otherwise all are.
        Editors stay in the cache with an empty buffer.
        '''
        self.n = self.current_cache_size()
        if percentage < 100.0:
            keys = self.retrieve_top_k_editors(percentage)
        else:
            keys = list(self.editors)
        print('Emptying cache %s time' % self.emptied)
        self.emptied += 1
        for key in keys:
            entry = self.editors[key]
            # Skip editors without buffered edits: no empty writes/upserts.
            if entry['edits']:
                self.update(key, entry['edits'])
            entry['edits'] = []
            entry['obs'] = 0
| 117 | + |
| 118 | + |
def debug():
    '''
    Manual smoke test for EditorCache.

    Obtains a database handle from db.init_mongo_db('test'), wraps its 'test'
    collection in an EditorCache, adds 100000 dummy edits spread randomly
    over five editor keys ('0'..'4') and finally flushes the whole cache.
    '''
    import random

    database = db.init_mongo_db('test')
    editor_cache = EditorCache(database['test'])
    for _ in xrange(100000):
        editor_key = str(random.randrange(5))
        # A fresh dict per edit, so the buffered edits are distinct objects.
        editor_cache.add(editor_key, {'date': 'woensaag', 'article': '3252'})
    editor_cache.empty_all(100)


# Run the smoke test when the module is executed as a script.
if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/database/cache.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 131 | + native |
Added: svn:mime-type |
2 | 132 | + text/plain |