r75339 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75338‎ | r75339 | r75340 >
Date:20:17, 24 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
Added simple caching mechanism to improve speed of inserting records in MongoDB.
Modified paths:
  • /trunk/tools/editor_trends/database/cache.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/database/cache.py
@@ -0,0 +1,129 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+
 5+'''
 6+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 7+This program is free software; you can redistribute it and/or
 8+modify it under the terms of the GNU General Public License version 2
 9+as published by the Free Software Foundation.
 10+This program is distributed in the hope that it will be useful,
 11+but WITHOUT ANY WARRANTY; without even the implied warranty of
 12+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 13+See the GNU General Public License for more details, at
 14+http://www.fsf.org/licenses/gpl.html
 15+'''
 16+
# Module metadata.
__author__ = '\n'.join(['Diederik van Liere (dvanliere@gmail.com)'])
__author__email = 'dvanliere at gmail dot com'
__date__ = 'Oct 24, 2010'
__version__ = '0.1'
 21+
 22+'''
 23+This module provides a simple caching mechanism to speed-up the process of
 24+inserting records into MongoDB. The caching object works as follows:
 25+1) Each edit from an author is added to a dictionary.
 26+2) Every 25000 edits, the object selects the top 5% of editors with the most
 27+edits, and their cached edits are then stored in MongoDB. By packaging multiple edits in a single commit,
 28+processing time is significantly reduced.
 29+
 30+This caching mechanism does not create any benefits for authors with single or
 31+very few edits.
 32+
 33+'''
 34+
 35+
 36+import sys
 37+import datetime
 38+
 39+import settings
 40+import db
 41+
 42+
class EditorCache(object):
    '''
    Buffer edits per editor in memory and write them to MongoDB in batches.

    Every edit passed to add() is appended to an in-memory list for its
    editor. After every `flush_interval` calls to add(), the
    `flush_percentage` percent of editors with the most buffered edits are
    written to the collection with one update per editor, and their buffers
    are reset.

    Arguments:
    collection       -- collection object exposing insert() and update(), as
                        called by _store_editor() and update() below.
    flush_interval   -- number of add() calls between automatic flushes.
    flush_percentage -- percentage of editors (ranked by buffered edits)
                        written out on each automatic flush.
    '''

    def __init__(self, collection, flush_interval=25000, flush_percentage=5.0):
        if flush_interval < 1:
            # add() uses the interval as a modulus, so it must be positive.
            raise ValueError('flush_interval must be a positive integer')
        self.collection = collection
        self.flush_interval = flush_interval
        self.flush_percentage = flush_percentage
        # editor key -> {'obs': <buffered edit count>, 'edits': [<edit>, ...]}
        self.editors = {}
        self.size = self.__sizeof__()
        # Total number of add() calls over the lifetime of the cache.
        self.cumulative_n = 0
        self.time_started = datetime.datetime.now()
        # Buffered edit count as measured at the start of the last flush.
        self.n = self.current_cache_size()
        # 1-based ordinal of the next flush, only used in the log message.
        self.emptied = 1

    def __repr__(self):
        '''Return a short summary string; __repr__ must not return None.'''
        return '<%s editors=%d cached_edits=%d total_added=%d>' % (
            self.__class__.__name__, len(self.editors),
            self.current_cache_size(), self.cumulative_n)

    def _new_entry(self):
        '''Return an empty buffer record for one editor.'''
        return {'obs': 0, 'edits': []}

    def _store_editor(self, key, value):
        '''
        Insert an empty document for editor `key` and remember its id.

        `value` is accepted for interface compatibility and is not used.
        '''
        editor = self.collection.insert({'editor': key, 'edits': {}})
        # Create the buffer record on demand so an unseen editor does not
        # raise KeyError and add() still finds a complete record later.
        self.editors.setdefault(key, self._new_entry())['id'] = str(editor)

    def current_cache_size(self):
        '''Return the number of edits currently buffered over all editors.'''
        return sum(entry.get('obs', 0) for entry in self.editors.values())

    def add(self, key, value):
        '''
        Buffer edit `value` for editor `key`.

        Triggers an automatic partial flush after every `flush_interval`
        calls.
        '''
        self.cumulative_n += 1
        entry = self.editors.get(key)
        if entry is None:
            entry = self.editors[key] = self._new_entry()
        # Record the edit for new and known editors alike; the first edit of
        # a new editor must not be dropped.
        entry['edits'].append(value)
        entry['obs'] += 1

        if self.cumulative_n % self.flush_interval == 0:
            self.empty_all(self.flush_percentage)

    def retrieve_top_k_editors(self, percentage):
        '''
        Return the keys of the top `percentage` percent of editors, ranked by
        number of buffered edits (descending, ties broken by key descending).

        At least one key is returned whenever the cache holds any editor.
        '''
        # Rank on the raw counts: dividing every count by the same total does
        # not change the order and fails when the total is zero.
        ranked = sorted(
            ((entry.get('obs', 0), key)
             for key, entry in self.editors.items()),
            reverse=True)
        cutoff = max(1, int((len(ranked) / 100.0) * percentage))
        return [key for _, key in ranked[:cutoff]]

    def update(self, editor, values):
        '''Append `values` to the stored edits of `editor` (upsert).'''
        self.collection.update({'editor': editor},
                               {'$pushAll': {'edits': values}}, upsert=True)

    def empty_all(self, percentage):
        '''
        Write buffered edits to the collection and reset those buffers.

        With `percentage` below 100 only the top `percentage` percent of
        editors are written; otherwise every editor is written.
        '''
        self.n = self.current_cache_size()
        if percentage < 100.0:
            keys = self.retrieve_top_k_editors(percentage)
        else:
            keys = list(self.editors)
        # Single-argument call form prints identically on Python 2 and 3.
        print('Emptying cache %s time' % self.emptied)
        self.emptied += 1
        for key in keys:
            entry = self.editors[key]
            # Skip the database round trip when nothing is buffered.
            if entry.get('edits'):
                self.update(key, entry['edits'])
            entry['edits'] = []
            entry['obs'] = 0
 117+
 118+
def debug():
    '''
    Manual smoke test: push 100000 random edits for five editors through an
    EditorCache bound to the 'test' collection of the 'test' database, then
    flush whatever is still buffered.
    '''
    import random

    cache = EditorCache(db.init_mongo_db('test')['test'])
    for _ in xrange(100000):
        editor = str(random.randrange(0, 5))
        cache.add(editor, {'date': 'woensaag', 'article': '3252'})
    cache.empty_all(100)
 128+
# Run the smoke test only when the module is executed as a script.
if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/database/cache.py
___________________________________________________________________
Added: svn:eol-style
1131 + native
Added: svn:mime-type
2132 + text/plain

Status & tagging log