Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py |
— | — | @@ -37,11 +37,11 @@ |
38 | 38 | for year in years: |
39 | 39 | #for year in xrange(new_wikipedian.year, new_wikipedian.year + 2): |
40 | 40 | obs = [False for x in xrange(13)] |
41 | | - months = edit[year].keys() |
42 | | - for month in xrange(13): |
| 41 | + months = edits[year].keys() |
| 42 | + for month in xrange(1, 13): |
43 | 43 | count = edits[year].get(month, {}).get('0', 0) |
44 | | - date = datetime(int(year), int(month), 1) |
45 | 44 | if count >= var.cutoff: |
46 | 45 | obs[month] = True |
| 46 | + date = datetime(int(year), int(month), 1) |
47 | 47 | var.add(date, obs) |
48 | 48 | return var |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -36,7 +36,7 @@ |
37 | 37 | from classes import consumers |
38 | 38 | from classes import exceptions |
39 | 39 | from classes import analytics |
40 | | -from database import db |
| 40 | +from classes import storage |
41 | 41 | from utils import timer |
42 | 42 | from utils import log |
43 | 43 | |
— | — | @@ -104,10 +104,10 @@ |
105 | 105 | lock = mgr.RLock() |
106 | 106 | obs_proxy = mgr.dict(obs) |
107 | 107 | |
108 | | - editors = db.retrieve_distinct_keys(rts.dbname, rts.editors_dataset, 'editor') |
109 | | - min_year, max_year = determine_project_year_range(rts.dbname, |
110 | | - rts.editors_dataset, |
111 | | - 'new_wikipedian') |
| 108 | + db = storage.Database('mongo', rts.dbname, rts.editors_dataset) |
| 109 | + editors = db.retrieve_distinct_keys('editor') |
| 110 | + min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
| 111 | + |
112 | 112 | fmt = kwargs.pop('format', 'long') |
113 | 113 | time_unit = kwargs.pop('time_unit', 'year') |
114 | 114 | kwargs['min_year'] = min_year |
— | — | @@ -155,19 +155,17 @@ |
156 | 156 | write_output(ds, rts, stopwatch) |
157 | 157 | |
158 | 158 | ds.summary() |
159 | | - #return True |
160 | 159 | |
161 | 160 | |
162 | | -def determine_project_year_range(dbname, collection, var): |
| 161 | +def determine_project_year_range(db, var): |
163 | 162 | ''' |
164 | 163 | Determine the first and final year for the observed data |
165 | 164 | ''' |
166 | | - print dbname, collection, var |
167 | 165 | try: |
168 | | - max_year = db.run_query(dbname, collection, var, 'max') |
169 | | - max_year = max_year[var].year + 1 |
170 | | - min_year = db.run_query(dbname, collection, var, 'min') |
171 | | - min_year = min_year[var].year |
| 166 | + obs = db.find(var, 'max') |
| 167 | + max_year = obs[var].year + 1 |
| 168 | + obs = db.find(var, 'min') |
| 169 | + min_year = obs[var].year |
172 | 170 | except KeyError: |
173 | 171 | min_year = 2001 |
174 | 172 | max_year = datetime.datetime.today().year + 1 |
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py |
— | — | @@ -0,0 +1,274 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-25' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import os |
| 23 | +import cStringIO |
| 24 | +import multiprocessing |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | +import sys |
| 27 | +from Queue import Empty |
| 28 | + |
| 29 | +if '..' not in sys.path: |
| 30 | + sys.path.append('..') |
| 31 | + |
| 32 | +from classes import settings |
| 33 | +settings = settings.Settings() |
| 34 | + |
| 35 | + |
| 36 | +from classes import storage |
| 37 | +from utils import file_utils |
| 38 | +from utils import messages |
| 39 | + |
| 40 | +from classes import consumers |
| 41 | +from classes import bots |
| 42 | + |
| 43 | + |
| 44 | +def read_bots_csv_file(location, filename, encoding, manager=False): |
| 45 | + ''' |
| 46 | + Constructs a dictionary from Bots.csv |
| 47 | + key is language |
| 48 | + value is a list of bot names |
| 49 | + ''' |
| 50 | + if manager: |
| 51 | + bot_dict = manager.dict() |
| 52 | + else: |
| 53 | + bot_dict = dict() |
| 54 | + for line in file_utils.read_data_from_csv(location, filename, encoding): |
| 55 | + line = line.strip() |
| 56 | + language, bots = line.split(',') |
| 57 | + bots = bots.split('|') |
| 58 | + for bot in bots: |
| 59 | + if bot not in bot_dict: |
| 60 | + b = botconsumers.Bot(bot) |
| 61 | + b.id = None |
| 62 | + else: |
| 63 | + b = bot_dict[bot] |
| 64 | + b.projects.append(language) |
| 65 | + bot_dict[bot] = b |
| 66 | + return bot_dict |
| 67 | + |
| 68 | + |
| 69 | +def retrieve_bots(language_code): |
| 70 | + ''' |
| 71 | + Loader function to retrieve list of id's of known Wikipedia bots. |
| 72 | + ''' |
| 73 | + ids = [] |
| 74 | + db = storage.Database('mongo', 'bots', 'ids') |
| 75 | + cursor = db.find() |
| 76 | + for bot in cursor: |
| 77 | + if bot['verified'] == 'True' and language_code in bot['projects']: |
| 78 | + ids.append(bot['name']) |
| 79 | + return ids |
| 80 | + |
| 81 | + |
| 82 | +def store_bots(): |
| 83 | + ''' |
| 84 | + This file reads the results from the lookup_bot_userid function and stores |
| 85 | + it in a MongoDB collection. |
| 86 | + ''' |
| 87 | + keys = ['name', 'verified', 'projects'] |
| 88 | + bots = file_utils.create_dict_from_csv_file(settings.csv_location, |
| 89 | + 'bots_ids.csv', |
| 90 | + 'utf-8', |
| 91 | + keys) |
| 92 | + db = storage.Database('mongo', 'wikilytics', 'bots') |
| 93 | + db.drop_collection() |
| 94 | + for id in bots: |
| 95 | + bot = bots[id] |
| 96 | + data = dict([(k, bot[k]) for k in keys]) |
| 97 | + data['id'] = id |
| 98 | + db.insert(data) |
| 99 | + |
| 100 | + print 'Stored %s bots' % db.count() |
| 101 | + |
| 102 | + |
| 103 | +def convert_object_to_dict(obj, exclude=[]): |
| 104 | + ''' |
| 105 | + @obj is an arbitray object where the properties need to be translated to |
| 106 | + keys and values to ease writing to a csv file. |
| 107 | + ''' |
| 108 | + d = {} |
| 109 | + for kw in obj.__dict__.keys(): |
| 110 | + if kw not in exclude: |
| 111 | + d[kw] = getattr(obj, kw) |
| 112 | + return d |
| 113 | + |
| 114 | + |
| 115 | +def write_bot_list_to_csv(bots, keys): |
| 116 | + fh = file_utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', |
| 117 | + 'w', 'utf-8') |
| 118 | + bot_dict = convert_object_to_dict(bots, exclude=['time', 'written']) |
| 119 | + for bot in bot_dict: |
| 120 | + bot = bot_dict[bot] |
| 121 | + file_utils.write_dict_to_csv(bot, fh, keys, write_key=False, |
| 122 | + newline=True) |
| 123 | + fh.close() |
| 124 | + |
| 125 | + |
| 126 | +def lookup_bot_userid(data, fh, bots, keys): |
| 127 | + ''' |
| 128 | + This function is used to find the id's belonging to the different bots that |
| 129 | + are patrolling the Wikipedia sites. |
| 130 | + @xml_nodes is a list of xml elements that need to be parsed |
| 131 | + @bots is a dictionary containing the names of the bots to lookup |
| 132 | + ''' |
| 133 | + username = data[3] |
| 134 | + if username in bots: |
| 135 | + bot = bots.pop(username) |
| 136 | + setattr(bot, 'id', data[0]) |
| 137 | + setattr(bot, 'verified', True) |
| 138 | + bot = convert_object_to_dict(bot, exclude=['time']) |
| 139 | + file_utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
| 140 | + return bots |
| 141 | + |
| 142 | + |
| 143 | +def create_bot_validation_dataset(xml_nodes, fh, bots): |
| 144 | + revisions = xml_nodes.findall('revision') |
| 145 | + for revision in revisions: |
| 146 | + contributor = revision.find('contributor') |
| 147 | + #contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 148 | + username = contributor.find('username') |
| 149 | + if username == None or username.text == None: |
| 150 | + continue |
| 151 | + else: |
| 152 | + username = username.text.lower() |
| 153 | + |
| 154 | + #print username.encode('utf-8') |
| 155 | + if username.find('bot') > -1 or username.find('script') > -1: |
| 156 | + bot = bots.get(username, botconsumers.Bot(username, verified=False)) |
| 157 | + bot.id = contributor.find('id').text |
| 158 | + timestamp = revision.find('timestamp').text |
| 159 | + if timestamp != None: |
| 160 | + timestamp = file_utils.convert_timestamp_to_datetime_naive(timestamp, settings.timestamp_format) |
| 161 | + bot.time[str(timestamp.year)].append(timestamp) |
| 162 | + bots[username] = bot |
| 163 | + |
| 164 | + return bots |
| 165 | + |
| 166 | + #bot = bots.get('PseudoBot') |
| 167 | + #bot.hours_active() |
| 168 | + #bot.avg_lag_between_edits() |
| 169 | + |
| 170 | + |
| 171 | +def bot_launcher(language_code, project, target, action, single=False, manager=False): |
| 172 | + ''' |
| 173 | + This function sets the stage to launch bot id detection and collecting data |
| 174 | + to discover new bots. |
| 175 | + ''' |
| 176 | + file_utils.delete_file(settings.csv_location, 'bots_ids.csv') |
| 177 | + location = os.path.join(settings.input_location, language_code, project) |
| 178 | + input_xml = os.path.join(location, 'chunks') |
| 179 | + input_txt = os.path.join(location, 'txt') |
| 180 | + |
| 181 | + tasks = multiprocessing.JoinableQueue() |
| 182 | + mgr = multiprocessing.Manager() |
| 183 | + keys = ['id', 'name', 'verified', 'projects'] |
| 184 | + |
| 185 | + if action == 'lookup': |
| 186 | + output_file = 'bots_ids.csv' |
| 187 | + files = file_utils.retrieve_file_list(input_txt, 'txt', mask=None) |
| 188 | + input_queue = pc.load_queue(files, poison_pill=True) |
| 189 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', 'utf-8', manager=manager) |
| 190 | + for file in files: |
| 191 | + tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 192 | + |
| 193 | + else: |
| 194 | + output_file = 'bots_predictionset.csv' |
| 195 | + files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None) |
| 196 | + bots = {} |
| 197 | + for file in files: |
| 198 | + tasks.put(consumers.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 199 | + |
| 200 | + #lock = mgr.Lock() |
| 201 | + if manager: |
| 202 | + manager = mgr |
| 203 | + |
| 204 | + tracker = {} |
| 205 | + if single: |
| 206 | + while True: |
| 207 | + try: |
| 208 | + print '%s files left in the queue...' % messages.show(tasks.qsize) |
| 209 | + task = tasks.get(block=False) |
| 210 | + bots = task(bots) |
| 211 | + except Empty: |
| 212 | + break |
| 213 | + else: |
| 214 | + bot_launcher_multi(tasks) |
| 215 | + |
| 216 | + file_utils.store_object(bots, settings.binary_location, 'bots.bin') |
| 217 | + if action == 'lookup': |
| 218 | + store_bots() |
| 219 | + if bots != {}: |
| 220 | + print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots) |
| 221 | + keys = bots.keys() |
| 222 | + for key in keys: |
| 223 | + try: |
| 224 | + print '%s' % key.encode('utf-8') |
| 225 | + except: |
| 226 | + pass |
| 227 | + else: |
| 228 | + bot_training_dataset(bots) |
| 229 | + #write_bot_list_to_csv(bots, keys) |
| 230 | + |
| 231 | + |
| 232 | +def bot_training_dataset(bots): |
| 233 | + fh = file_utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', 'utf-8') |
| 234 | + keys = bots.keys() |
| 235 | + for key in keys: |
| 236 | + bot = bots.get(key) |
| 237 | + bot.hours_active() |
| 238 | + bot.avg_lag_between_edits() |
| 239 | + bot.write_training_dataset(fh) |
| 240 | + |
| 241 | + fh.close() |
| 242 | + |
| 243 | + |
| 244 | +def bot_launcher_multi(tasks): |
| 245 | + ''' |
| 246 | + This is the launcher that uses multiprocesses. |
| 247 | + ''' |
| 248 | + consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
| 249 | + for x in xrange(settings.number_of_processes): |
| 250 | + tasks.put(None) |
| 251 | + |
| 252 | + for w in consumers: |
| 253 | + w.start() |
| 254 | + |
| 255 | + tasks.join() |
| 256 | + |
| 257 | + |
| 258 | +def debug_bots_dict(): |
| 259 | + bots = file_utils.load_object(settings.binary_location, 'bots.bin') |
| 260 | + for bot in bots: |
| 261 | + bot = bots[bot] |
| 262 | + edits = sum([len(bot.time[t]) for t in bot.time]) |
| 263 | + print bot.name, bot.verified, edits |
| 264 | + print 'done' |
| 265 | + return bots |
| 266 | + |
| 267 | + |
| 268 | +if __name__ == '__main__': |
| 269 | + language_code = 'en' |
| 270 | + project = 'wiki' |
| 271 | + #store_bots() |
| 272 | + #bots = debug_bots_dict() |
| 273 | + #write_bot_list_to_csv(bots) |
| 274 | + #language_code, project, lookup_bot_userid, single = False, manager = False |
| 275 | + bot_launcher(language_code, project, create_bot_validation_dataset, action='training', single=True, manager=False) |
Property changes on: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 276 | + native |
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
— | — | @@ -23,7 +23,7 @@ |
24 | 24 | |
25 | 25 | from classes import settings |
26 | 26 | settings = settings.Settings() |
27 | | -from database import db |
| 27 | +from classes import storage |
28 | 28 | from utils import file_utils |
29 | 29 | |
30 | 30 | try: |
— | — | @@ -42,8 +42,8 @@ |
43 | 43 | |
44 | 44 | |
45 | 45 | def create_edgelist(project, collection): |
46 | | - ids = db.retrieve_distinct_keys(project, collection, 'editor') |
47 | | - conn = db.init_mongo_db(project) |
| 46 | + db = storage.Database('mongo', project, collection) |
| 47 | + ids = db.retrieve_distinct_keys('editor') |
48 | 48 | ids.sort() |
49 | 49 | fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8') |
50 | 50 | for i in ids: |
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | from utils import ordered_dict |
34 | 34 | from utils import log |
35 | 35 | from utils import timer |
36 | | -from database import db |
| 36 | +from classes import storage |
37 | 37 | from etl import downloader |
38 | 38 | from etl import enricher |
39 | 39 | from etl import store |
— | — | @@ -62,6 +62,12 @@ |
63 | 63 | pjc = projects.ProjectContainer() |
64 | 64 | rts = runtime_settings.RunTimeSettings(project, language) |
65 | 65 | |
| 66 | + file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
| 67 | + 'meta-current': 'stub-meta-current.xml.gz', |
| 68 | + 'history-full': 'pages-meta-history.xml.7z', |
| 69 | + 'history-current': 'pages-meta-current.xml.bz2' |
| 70 | + } |
| 71 | + |
66 | 72 | #Init Argument Parser |
67 | 73 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
68 | 74 | subparsers = parser.add_subparsers(help='sub - command help') |
— | — | @@ -131,7 +137,7 @@ |
132 | 138 | |
133 | 139 | #ALL |
134 | 140 | parser_all = subparsers.add_parser('all', |
135 | | - help='The all sub command runs the download, split, store and dataset \ |
| 141 | + help='The all sub command runs the download, extract, store and dataset \ |
136 | 142 | commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE \ |
137 | 143 | CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
138 | 144 | parser_all.set_defaults(func=all_launcher) |
— | — | @@ -141,12 +147,6 @@ |
142 | 148 | executing all.', |
143 | 149 | default=[]) |
144 | 150 | |
145 | | - parser_all.add_argument('-n', '--new', |
146 | | - action='store_true', |
147 | | - help='This will delete all previous output and starts from scratch. \ |
148 | | - Mostly useful for debugging purposes.', |
149 | | - default=False) |
150 | | - |
151 | 151 | #DJANGO |
152 | 152 | parser_django = subparsers.add_parser('django') |
153 | 153 | parser_django.add_argument('-e', '--except', |
— | — | @@ -179,23 +179,23 @@ |
180 | 180 | help='Name of MongoDB collection', |
181 | 181 | default='editors_raw') |
182 | 182 | |
183 | | - parser.add_argument('-o', '--location', |
184 | | - action='store', |
185 | | - help='Indicate where you want to store the downloaded file.', |
186 | | - #default=settings.input_location) |
187 | | - default=rts.input_location) |
188 | 183 | |
189 | 184 | parser.add_argument('-ns', '--namespace', |
190 | 185 | action='store', |
191 | 186 | help='A list of namespaces to include for analysis.', |
192 | 187 | default='0') |
193 | 188 | |
| 189 | + parser.add_argument('-db', '--database', |
| 190 | + action='store', |
| 191 | + help='Specify the database that you want to use. Valid choices are mongo and cassandra.', |
| 192 | + default='mongo') |
| 193 | + |
194 | 194 | parser.add_argument('-f', '--file', |
195 | 195 | action='store', |
196 | | - choices=rts.file_choices, |
| 196 | + choices=file_choices, |
197 | 197 | help='Indicate which dump you want to download. Valid choices are:\n \ |
198 | | - %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
199 | | - default='stub-meta-history.xml.gz') |
| 198 | + %s' % ''.join([f + ',\n' for f in file_choices]), |
| 199 | + default=file_choices['meta-full']) |
200 | 200 | |
201 | 201 | return project, language, parser |
202 | 202 | |
— | — | @@ -247,7 +247,7 @@ |
248 | 248 | rts.input_location = config.get('file_locations', 'input_location') |
249 | 249 | rts.output_location = config.get('file_locations', 'output_location') |
250 | 250 | |
251 | | - log.log_to_csv(logger, rts, 'New configuration', 'Creating', |
| 251 | + log.to_csv(logger, rts, 'New configuration', 'Creating', |
252 | 252 | config_launcher, |
253 | 253 | working_directory=working_directory, |
254 | 254 | input_location=input_location, |
— | — | @@ -262,10 +262,10 @@ |
263 | 263 | ''' |
264 | 264 | print 'Start downloading' |
265 | 265 | stopwatch = timer.Timer() |
266 | | - log.log_to_mongo(rts, 'dataset', 'download', stopwatch, event='start') |
| 266 | + log.to_db(rts, 'dataset', 'download', stopwatch, event='start') |
267 | 267 | downloader.launcher(rts, logger) |
268 | 268 | stopwatch.elapsed() |
269 | | - log.log_to_mongo(rts, 'dataset', 'download', stopwatch, event='finish') |
| 269 | + log.to_db(rts, 'dataset', 'download', stopwatch, event='finish') |
270 | 270 | |
271 | 271 | |
272 | 272 | def extract_launcher(rts, logger): |
— | — | @@ -276,12 +276,12 @@ |
277 | 277 | ''' |
278 | 278 | print 'Extracting data from XML' |
279 | 279 | stopwatch = timer.Timer() |
280 | | - log.log_to_mongo(rts, 'dataset', 'extract', stopwatch, event='start') |
281 | | - log.log_to_csv(logger, rts, 'Start', 'Extract', extract_launcher) |
| 280 | + log.to_db(rts, 'dataset', 'extract', stopwatch, event='start') |
| 281 | + log.to_csv(logger, rts, 'Start', 'Extract', extract_launcher) |
282 | 282 | enricher.launcher(rts) |
283 | 283 | stopwatch.elapsed() |
284 | | - log.log_to_mongo(rts, 'dataset', 'extract', stopwatch, event='finish') |
285 | | - log.log_to_csv(logger, rts, 'Finish', 'Extract', extract_launcher) |
| 284 | + log.to_db(rts, 'dataset', 'extract', stopwatch, event='finish') |
| 285 | + log.to_csv(logger, rts, 'Finish', 'Extract', extract_launcher) |
286 | 286 | |
287 | 287 | |
288 | 288 | def sort_launcher(rts, logger): |
— | — | @@ -291,12 +291,12 @@ |
292 | 292 | ''' |
293 | 293 | print 'Start sorting data' |
294 | 294 | stopwatch = timer.Timer() |
295 | | - log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='start') |
296 | | - log.log_to_csv(logger, rts, 'Start', 'Sort', sort_launcher) |
| 295 | + log.to_db(rts, 'dataset', 'sort', stopwatch, event='start') |
| 296 | + log.to_csv(logger, rts, 'Start', 'Sort', sort_launcher) |
297 | 297 | sort.launcher(rts) |
298 | 298 | stopwatch.elapsed() |
299 | | - log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='finish') |
300 | | - log.log_to_csv(logger, rts, 'Finish', 'Sort', sort_launcher) |
| 299 | + log.to_db(rts, 'dataset', 'sort', stopwatch, event='finish') |
| 300 | + log.to_csv(logger, rts, 'Finish', 'Sort', sort_launcher) |
301 | 301 | |
302 | 302 | |
303 | 303 | def store_launcher(rts, logger): |
— | — | @@ -306,13 +306,12 @@ |
307 | 307 | ''' |
308 | 308 | print 'Start storing data in MongoDB' |
309 | 309 | stopwatch = timer.Timer() |
310 | | - log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='start') |
311 | | - log.log_to_csv(logger, rts, 'Start', 'Store', store_launcher) |
312 | | - db.cleanup_database(rts.dbname, logger) |
| 310 | + log.to_db(rts, 'dataset', 'store', stopwatch, event='start') |
| 311 | + log.to_csv(logger, rts, 'Start', 'Store', store_launcher) |
313 | 312 | store.launcher(rts) |
314 | 313 | stopwatch.elapsed() |
315 | | - log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='finish') |
316 | | - log.log_to_csv(logger, rts, 'Finish', 'Store', store_launcher) |
| 314 | + log.to_db(rts, 'dataset', 'store', stopwatch, event='finish') |
| 315 | + log.to_csv(logger, rts, 'Finish', 'Store', store_launcher) |
317 | 316 | |
318 | 317 | |
319 | 318 | def transformer_launcher(rts, logger): |
— | — | @@ -322,13 +321,12 @@ |
323 | 322 | ''' |
324 | 323 | print 'Start transforming dataset' |
325 | 324 | stopwatch = timer.Timer() |
326 | | - log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='start') |
327 | | - log.log_to_csv(logger, rts, 'Start', 'Transform', transformer_launcher) |
328 | | - db.cleanup_database(rts.dbname, logger, 'dataset') |
| 325 | + log.to_db(rts, 'dataset', 'transform', stopwatch, event='start') |
| 326 | + log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher) |
329 | 327 | transformer.transform_editors_single_launcher(rts) |
330 | 328 | stopwatch.elapsed() |
331 | | - log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='finish') |
332 | | - log.log_to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher) |
| 329 | + log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish') |
| 330 | + log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher) |
333 | 331 | |
334 | 332 | |
335 | 333 | def dataset_launcher(rts, logger): |
— | — | @@ -338,17 +336,17 @@ |
339 | 337 | ''' |
340 | 338 | print 'Start generating dataset' |
341 | 339 | stopwatch = timer.Timer() |
342 | | - log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start') |
| 340 | + log.to_db(rts, 'dataset', 'export', stopwatch, event='start') |
343 | 341 | |
344 | 342 | for chart in rts.charts: |
345 | 343 | analyzer.generate_chart_data(rts, chart, **rts.keywords) |
346 | | - log.log_to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher, |
| 344 | + log.to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher, |
347 | 345 | chart=chart, |
348 | 346 | dbname=rts.dbname, |
349 | 347 | collection=rts.editors_dataset) |
350 | 348 | stopwatch.elapsed() |
351 | | - log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish') |
352 | | - log.log_to_csv(logger, rts, 'Finish', 'Dataset', dataset_launcher) |
| 349 | + log.to_db(rts, 'dataset', 'export', stopwatch, event='finish') |
| 350 | + log.to_csv(logger, rts, 'Finish', 'Dataset', dataset_launcher) |
353 | 351 | |
354 | 352 | |
355 | 353 | def cleanup(rts, logger): |
— | — | @@ -360,20 +358,20 @@ |
361 | 359 | #remove directories |
362 | 360 | for directory in directories: |
363 | 361 | file_utils.delete_file(directory, '', directory=True) |
364 | | - log.log_to_csv(logger, rts, |
| 362 | + log.to_csv(logger, rts, |
365 | 363 | message='Deleting %s' % directory, |
366 | 364 | verb='Deleting', |
367 | 365 | function=cleanup) |
368 | 366 | |
369 | 367 | #create directories |
370 | 368 | rts.verify_environment(directories) |
371 | | - log.log_to_csv(logger, rts, message='Deleting %s' % directory, |
| 369 | + log.to_csv(logger, rts, message='Deleting %s' % directory, |
372 | 370 | verb='Creating', function=rts.verify_environment) |
373 | 371 | |
374 | 372 | #remove binary files |
375 | 373 | filename = '%s%s' % (rts.full_project, '_editor.bin') |
376 | 374 | file_utils.delete_file(rts.binary_location, filename) |
377 | | - log.log_to_csv(logger, rts, message='Deleting %s' % filename, |
| 375 | + log.to_csv(logger, rts, message='Deleting %s' % filename, |
378 | 376 | verb='Deleting', |
379 | 377 | function=file_utils.delete_file) |
380 | 378 | |
— | — | @@ -386,13 +384,9 @@ |
387 | 385 | ''' |
388 | 386 | |
389 | 387 | stopwatch = timer.Timer() |
390 | | - log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='start') |
| 388 | + log.to_db(rts, 'dataset', 'all', stopwatch, event='start') |
391 | 389 | print 'Start of building %s %s dataset.' % (rts.language.name, rts.project) |
392 | 390 | |
393 | | - if rts.clean: |
394 | | - print 'Removing previous datasets...' |
395 | | - cleanup(rts, logger) |
396 | | - |
397 | 391 | functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'), |
398 | 392 | (extract_launcher, 'extract'), |
399 | 393 | (sort_launcher, 'sort'), |
— | — | @@ -408,23 +402,9 @@ |
409 | 403 | elif res == None: |
410 | 404 | pass |
411 | 405 | stopwatch.elapsed() |
412 | | - log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='finish') |
| 406 | + log.to_db(rts, 'dataset', 'all', stopwatch, event='finish') |
413 | 407 | |
414 | 408 | |
415 | | - |
416 | | -def about_statement(): |
417 | | - ''' |
418 | | - prints generic version information. |
419 | | - ''' |
420 | | - print '' |
421 | | - print 'Wikilytics is (c) 2010-2011 by the Wikimedia Foundation.' |
422 | | - print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
423 | | - print '''This software comes with ABSOLUTELY NO WARRANTY. This is free |
424 | | - software, and you are welcome to distribute it under certain conditions.''' |
425 | | - print 'See the README.1ST file for more information.' |
426 | | - print '' |
427 | | - |
428 | | - |
429 | 409 | def main(): |
430 | 410 | ''' |
431 | 411 | This function initializes the command line parser. |
— | — | @@ -449,10 +429,6 @@ |
450 | 430 | logger.debug('Chosen language: \t%s' % rts.language) |
451 | 431 | |
452 | 432 | #start manager |
453 | | - #detect_python_version(logger) |
454 | | - about_statement() |
455 | | - #config.create_configuration(settings, args) |
456 | | - |
457 | 433 | rts.show_settings() |
458 | 434 | args.func(rts, logger) |
459 | 435 | |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -25,7 +25,7 @@ |
26 | 26 | from utils import file_utils |
27 | 27 | from utils import text_utils |
28 | 28 | from database import cache |
29 | | -from database import db |
| 29 | +from classes import storage |
30 | 30 | from classes import consumers |
31 | 31 | |
32 | 32 | |
— | — | @@ -38,10 +38,8 @@ |
39 | 39 | The treshold is currently more than 9 edits and is not yet configurable. |
40 | 40 | ''' |
41 | 41 | def run(self): |
42 | | - mongo = db.init_mongo_db(self.rts.dbname) |
43 | | - collection = mongo[self.rts.editors_raw] |
44 | | - |
45 | | - editor_cache = cache.EditorCache(collection) |
| 42 | + db = storage.Database('mongo', self.rts.dbname, self.rts.editors_raw) |
| 43 | + editor_cache = cache.EditorCache(db) |
46 | 44 | prev_editor = -1 |
47 | 45 | while True: |
48 | 46 | try: |
— | — | @@ -102,17 +100,12 @@ |
103 | 101 | * category (if any) |
104 | 102 | * article id |
105 | 103 | ''' |
106 | | - mongo = db.init_mongo_db(rts.dbname) |
107 | | - db.drop_collection(rts.dbname, rts.articles_raw) |
108 | | - collection = mongo[rts.articles_raw] |
109 | | - collection.create_index('id') |
110 | | - collection.create_index('title') |
111 | | - collection.create_index('ns') |
112 | | - collection.create_index('category') |
113 | | - collection.ensure_index('id') |
114 | | - collection.ensure_index('title') |
115 | | - collection.ensure_index('ns') |
116 | | - collection.ensure_index('category') |
| 104 | + db = storage.Database('mongo', rts.dbname, rts.articles_raw) |
| 105 | + db.drop_collection() |
| 106 | + db.add_index('id') |
| 107 | + db.add_index('title') |
| 108 | + db.add_index('ns') |
| 109 | + db.add_index('category') |
117 | 110 | |
118 | 111 | location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
119 | 112 | fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8') |
— | — | @@ -128,7 +121,7 @@ |
129 | 122 | data[key] = value |
130 | 123 | x += 2 |
131 | 124 | y += 2 |
132 | | - collection.insert(data) |
| 125 | + db.insert(data) |
133 | 126 | fh.close() |
134 | 127 | print 'Done...' |
135 | 128 | |
— | — | @@ -140,11 +133,9 @@ |
141 | 134 | ''' |
142 | 135 | store_articles(rts) |
143 | 136 | print 'Input directory is: %s ' % rts.sorted |
144 | | - db.drop_collection(rts.dbname, rts.editors_dataset) |
145 | | - mongo = db.init_mongo_db(rts.dbname) |
146 | | - coll = mongo[rts.editors_raw] |
147 | | - coll.ensure_index('editor') |
148 | | - coll.create_index('editor') |
| 137 | + db = storage.Database('mongo', rts.dbname, rts.editors_raw) |
| 138 | + db.drop_collection() |
| 139 | + db.add_index('editor') |
149 | 140 | |
150 | 141 | files = file_utils.retrieve_file_list(rts.sorted, 'csv') |
151 | 142 | pbar = progressbar.ProgressBar(maxval=len(files)).start() |
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -117,7 +117,4 @@ |
118 | 118 | w.start() |
119 | 119 | |
120 | 120 | tasks.join() |
121 | | -# for consumer in consumers: |
122 | | -# if consumer.exitcode != 0: |
123 | | -# result = False |
124 | 121 | |
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -32,13 +32,7 @@ |
33 | 33 | if '..' not in sys.path: |
34 | 34 | sys.path.append('..') |
35 | 35 | |
36 | | -try: |
37 | | - from database import cassandra |
38 | | - import pycassa |
39 | | -except ImportError: |
40 | | - pass |
41 | | - |
42 | | -from database import db |
| 36 | +from classes import storage |
43 | 37 | from bots import detector |
44 | 38 | from utils import file_utils |
45 | 39 | |
— | — | @@ -759,14 +753,14 @@ |
760 | 754 | def multiprocessor_launcher(function, dataset, storage, locks, rts): |
761 | 755 | input_queue = JoinableQueue() |
762 | 756 | |
763 | | - files = file_utils.retrieve_file_list(rts.location) |
| 757 | + files = file_utils.retrieve_file_list(rts.input_location) |
764 | 758 | if len(files) > cpu_count(): |
765 | 759 | processors = cpu_count() - 1 |
766 | 760 | else: |
767 | 761 | processors = len(files) |
768 | 762 | |
769 | 763 | for filename in files: |
770 | | - filename = os.path.join(rts.location, filename) |
| 764 | + filename = os.path.join(rts.input_location, filename) |
771 | 765 | print filename |
772 | 766 | input_queue.put(filename) |
773 | 767 | |
— | — | @@ -818,7 +812,6 @@ |
819 | 813 | This is the generic entry point for regular Wikilytics usage. |
820 | 814 | ''' |
821 | 815 | # launcher for creating regular mongo dataset |
822 | | - path = rts.location |
823 | 816 | function = create_variables |
824 | 817 | storage = 'csv' |
825 | 818 | dataset = 'training' |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -17,28 +17,20 @@ |
18 | 18 | __date__ = '2010-11-02' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -import progressbar |
| 21 | +import sys |
| 22 | +import datetime |
22 | 23 | import multiprocessing |
23 | 24 | from Queue import Empty |
24 | 25 | from operator import itemgetter |
25 | | -import datetime |
26 | | -import sys |
27 | 26 | from copy import deepcopy |
28 | 27 | |
29 | | -from database import db |
| 28 | +import progressbar |
| 29 | +from classes import storage |
30 | 30 | from utils import file_utils |
31 | 31 | from utils import messages |
32 | 32 | from utils import data_converter |
33 | 33 | from classes import consumers |
34 | 34 | |
35 | | - |
36 | | -try: |
37 | | - import psyco |
38 | | - psyco.full() |
39 | | -except ImportError: |
40 | | - pass |
41 | | - |
42 | | - |
43 | 35 | class EditorConsumer(consumers.BaseConsumer): |
44 | 36 | |
45 | 37 | def run(self): |
— | — | @@ -52,10 +44,10 @@ |
53 | 45 | |
54 | 46 | |
55 | 47 | class Editor(object): |
56 | | - def __init__(self, id, input_db, output_db, **kwargs): |
| 48 | + def __init__(self, id, db_raw, db_dataset, **kwargs): |
57 | 49 | self.id = id |
58 | | - self.input_db = input_db |
59 | | - self.output_db = output_db |
| 50 | + self.db_raw = db_raw |
| 51 | + self.db_dataset = db_dataset |
60 | 52 | for kw in kwargs: |
61 | 53 | setattr(self, kw, kwargs[kw]) |
62 | 54 | |
— | — | @@ -64,7 +56,7 @@ |
65 | 57 | |
66 | 58 | def __call__(self): |
67 | 59 | cutoff = 9 |
68 | | - editor = self.input_db.find_one({'editor': self.id}) |
| 60 | + editor = self.db_raw.find_one('editor', self.id) |
69 | 61 | if editor == None: |
70 | 62 | return |
71 | 63 | edits = editor['edits'] |
— | — | @@ -75,7 +67,6 @@ |
76 | 68 | last_edit_by_year = determine_last_edit_by_year(edits, first_year, final_year) |
77 | 69 | articles_edited = determine_articles_workedon(edits, first_year, final_year) |
78 | 70 | article_count = determine_article_count(articles_edited, first_year, final_year) |
79 | | - articles_edited = db.stringify_keys(articles_edited) |
80 | 71 | |
81 | 72 | namespaces_edited = determine_namespaces_workedon(edits, first_year, final_year) |
82 | 73 | character_count = determine_edit_volume(edits, first_year, final_year) |
— | — | @@ -90,7 +81,6 @@ |
91 | 82 | totals = calculate_totals(totals, counts, revert_count, 'revert_count') |
92 | 83 | totals = calculate_totals(totals, counts, article_count, 'article_count') |
93 | 84 | totals = calculate_totals(totals, counts, edit_count, 'edit_count') |
94 | | - totals = db.stringify_keys(totals) |
95 | 85 | |
96 | 86 | if len(edits) > cutoff: |
97 | 87 | new_wikipedian = edits[cutoff]['date'] |
— | — | @@ -100,24 +90,23 @@ |
101 | 91 | first_edit = edits[0]['date'] |
102 | 92 | final_edit = edits[-1]['date'] |
103 | 93 | |
104 | | - self.output_db.insert({'editor': self.id, |
105 | | - 'username': username, |
106 | | - 'new_wikipedian': new_wikipedian, |
107 | | - 'cum_edit_count': cum_edit_count, |
108 | | - 'final_edit': final_edit, |
109 | | - 'first_edit': first_edit, |
110 | | - 'last_edit_by_year': last_edit_by_year, |
111 | | - 'articles_edited': articles_edited, |
112 | | - 'edit_count': edit_count, |
113 | | - 'namespaces_edited': namespaces_edited, |
114 | | - 'article_count': article_count, |
115 | | - 'character_count': character_count, |
116 | | - 'revert_count': revert_count, |
117 | | - 'totals': totals, |
118 | | - }, |
119 | | - safe=True) |
| 94 | + data = {'editor': self.id, |
| 95 | + 'username': username, |
| 96 | + 'new_wikipedian': new_wikipedian, |
| 97 | + 'cum_edit_count': cum_edit_count, |
| 98 | + 'final_edit': final_edit, |
| 99 | + 'first_edit': first_edit, |
| 100 | + 'last_edit_by_year': last_edit_by_year, |
| 101 | + 'articles_edited': articles_edited, |
| 102 | + 'edit_count': edit_count, |
| 103 | + 'namespaces_edited': namespaces_edited, |
| 104 | + 'article_count': article_count, |
| 105 | + 'character_count': character_count, |
| 106 | + 'revert_count': revert_count, |
| 107 | + 'totals': totals, |
| 108 | + } |
| 109 | + self.db_dataset.insert(data) |
120 | 110 | |
121 | | - |
122 | 111 | def cleanup_datacontainer(dc, variable_type): |
123 | 112 | ''' |
124 | 113 | valid variable_type are either a {}, a [] or 0. |
— | — | @@ -159,7 +148,6 @@ |
160 | 149 | dc[year][month].setdefault(ns, 0) |
161 | 150 | dc[year][month][ns] += 1 |
162 | 151 | dc = cleanup_datacontainer(dc, {}) |
163 | | - dc = db.stringify_keys(dc) |
164 | 152 | return dc |
165 | 153 | |
166 | 154 | |
— | — | @@ -192,7 +180,6 @@ |
193 | 181 | for month in dc[year]: |
194 | 182 | dc[year][month] = list(dc[year][month]) |
195 | 183 | dc = cleanup_datacontainer(dc, []) |
196 | | - dc = db.stringify_keys(dc) |
197 | 184 | return dc |
198 | 185 | |
199 | 186 | |
— | — | @@ -207,7 +194,6 @@ |
208 | 195 | dc[year][month].setdefault(ns, 0) |
209 | 196 | dc[year][month][ns] += 1 |
210 | 197 | dc = cleanup_datacontainer(dc, {}) |
211 | | - dc = db.stringify_keys(dc) |
212 | 198 | return dc |
213 | 199 | |
214 | 200 | |
— | — | @@ -225,13 +211,11 @@ |
226 | 212 | dc[year][month].setdefault(ns, {}) |
227 | 213 | dc[year][month][ns].setdefault('added', 0) |
228 | 214 | dc[year][month][ns].setdefault('removed', 0) |
229 | | - print edit |
230 | 215 | if edit['delta'] < 0: |
231 | 216 | dc[year][month][ns]['removed'] += edit['delta'] |
232 | 217 | elif edit['delta'] > 0: |
233 | 218 | dc[year][month][ns]['added'] += edit['delta'] |
234 | 219 | dc = cleanup_datacontainer(dc, {}) |
235 | | - dc = db.stringify_keys(dc) |
236 | 220 | return dc |
237 | 221 | |
238 | 222 | |
— | — | @@ -251,7 +235,6 @@ |
252 | 236 | dc[date] = edit |
253 | 237 | elif dc[date] < edit: |
254 | 238 | dc[date] = edit |
255 | | - dc = db.stringify_keys(dc) |
256 | 239 | return dc |
257 | 240 | |
258 | 241 | |
— | — | @@ -266,7 +249,6 @@ |
267 | 250 | for month in articles_edited[year]: |
268 | 251 | for ns in articles_edited[year][month]: |
269 | 252 | dc[year][month][ns] = len(articles_edited[year][month][ns]) |
270 | | - dc = db.stringify_keys(dc) |
271 | 253 | return dc |
272 | 254 | |
273 | 255 | |
— | — | @@ -276,7 +258,6 @@ |
277 | 259 | |
278 | 260 | |
279 | 261 | def transform_editors_multi_launcher(rts): |
280 | | - ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
281 | 262 | tasks = multiprocessing.JoinableQueue() |
282 | 263 | consumers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)] |
283 | 264 | |
— | — | @@ -293,23 +274,19 @@ |
294 | 275 | |
295 | 276 | |
296 | 277 | def setup_database(rts): |
297 | | - mongo = db.init_mongo_db(rts.dbname) |
298 | | - input_db = mongo[rts.editors_raw] |
299 | | - db.drop_collection(rts.dbname, rts.editors_dataset) |
300 | | - output_db = mongo[rts.editors_dataset] |
| 278 | + db_raw = storage.Database('mongo', rts.dbname, rts.editors_raw) |
| 279 | + db_dataset = storage.Database('mongo', rts.dbname, rts.editors_dataset) |
| 280 | + db_dataset.drop_collection() |
| 281 | + ids = db_dataset.retrieve_distinct_keys('editor') |
| 282 | + db_dataset.add_index('editor') |
| 283 | + db_dataset.add_index('new_wikipedian') |
301 | 284 | |
302 | | - output_db.ensure_index('editor') |
303 | | - output_db.create_index('editor') |
304 | | - output_db.ensure_index('new_wikipedian') |
305 | | - output_db.create_index('new_wikipedian') |
306 | | - return input_db, output_db |
| 285 | + return db_raw, db_dataset, ids |
307 | 286 | |
308 | 287 | |
309 | 288 | def transform_editors_single_launcher(rts): |
310 | 289 | print rts.dbname, rts.editors_raw |
311 | | - ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
312 | | - print len(ids) |
313 | | - input_db, output_db = setup_database(rts) |
| 290 | + input_db, output_db, ids = setup_database(rts) |
314 | 291 | pbar = progressbar.ProgressBar(maxval=len(ids)).start() |
315 | 292 | for x, id in enumerate(ids): |
316 | 293 | editor = Editor(id, input_db, output_db) |
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -97,13 +97,6 @@ |
98 | 98 | self.binary_location = os.path.join(self.working_directory, |
99 | 99 | 'data', 'objects') |
100 | 100 | |
101 | | - self.chart_location = os.path.join(self.working_directory, 'statistics', |
102 | | - 'charts') |
103 | | - self.file_choices = ('stub-meta-history.xml.gz', |
104 | | - 'stub-meta-current.xml.gz', |
105 | | - 'pages-meta-history.xml.7z', |
106 | | - 'pages-meta-current.xml.bz2',) |
107 | | - |
108 | 101 | def load_configuration(self): |
109 | 102 | if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
110 | 103 | config = ConfigParser.RawConfigParser() |
— | — | @@ -117,7 +110,6 @@ |
118 | 111 | else: |
119 | 112 | return False |
120 | 113 | |
121 | | - |
122 | 114 | def determine_working_directory(self): |
123 | 115 | cwd = os.getcwd() |
124 | 116 | slashes = cwd.count(os.sep) |
Index: trunk/tools/editor_trends/classes/exceptions.py |
— | — | @@ -71,14 +71,12 @@ |
72 | 72 | return 'There is no JSON encoder called %s, please make sure that you \ |
73 | 73 | entered the right name' % self.func |
74 | 74 | |
75 | | -class UnknownChartError(Error): |
76 | | - def __init__(self, chart, charts): |
77 | | - self.chart = chart |
78 | | - self.charts = charts |
| 75 | +class NoDatabaseProviderInstalled(Error): |
| 76 | + def __init__(self): |
| 77 | + pass |
79 | 78 | |
80 | | - def __str__(self): |
81 | | - return 'Currently, chart type %s is not supported. Please choose one of \ |
82 | | - the following charts: %s' % (self.chart, self.charts) |
| 79 | + def __str__(self): |
| 80 | + return 'You need either to install Mongo or Cassandra to use Wikiltyics.' |
83 | 81 | |
84 | 82 | class UnknownPluginError(Error): |
85 | 83 | def __init__(self, plugin, plugins): |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -66,7 +66,6 @@ |
67 | 67 | self.function = self.get_value('func') |
68 | 68 | |
69 | 69 | self.ignore = self.get_value('except') |
70 | | - self.clean = self.get_value('new') |
71 | 70 | self.force = self.get_value('force') |
72 | 71 | self.location = self.get_project_location() |
73 | 72 | self.filename = self.generate_wikidump_filename() |
— | — | @@ -92,8 +91,8 @@ |
93 | 92 | self.verify_environment(self.directories) |
94 | 93 | |
95 | 94 | def __str__(self): |
96 | | - return 'Runtime Settings for project %s%s' % (self.language.name, |
97 | | - self.project.name) |
| 95 | + return 'Runtime Settings for project %s %s' % (self.language.name, |
| 96 | + self.project.full_name) |
98 | 97 | |
99 | 98 | def __iter__(self): |
100 | 99 | for item in self.__dict__: |
— | — | @@ -144,12 +143,8 @@ |
145 | 144 | ''' |
146 | 145 | Construct the full project location |
147 | 146 | ''' |
148 | | - if self.kaggle: |
149 | | - return os.path.join(self.input_location, self.language.code, |
| 147 | + return os.path.join(self.output_location, self.language.code, |
150 | 148 | self.project.name) |
151 | | - else: |
152 | | - return os.path.join(self.input_location, self.language.code, |
153 | | - self.project.name) |
154 | 149 | |
155 | 150 | def show_settings(self): |
156 | 151 | ''' |
— | — | @@ -164,6 +159,15 @@ |
165 | 160 | about['Output directory'] = '%s and subdirectories' % self.location |
166 | 161 | |
167 | 162 | max_length_key = max([len(key) for key in about.keys()]) |
| 163 | + |
| 164 | + print '' |
| 165 | + print 'Wikilytics is (c) 2010-2011 by the Wikimedia Foundation.' |
| 166 | + print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
| 167 | + print '''This software comes with ABSOLUTELY NO WARRANTY. This is free |
| 168 | + software, and you are welcome to distribute it under certain conditions.''' |
| 169 | + print 'See the README.1ST file for more information.' |
| 170 | + print '\nPlatform: %s' % self.platform |
| 171 | + |
168 | 172 | print 'Final settings after parsing command line arguments:' |
169 | 173 | for ab in about: |
170 | 174 | print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode('utf-8')) |
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -0,0 +1,243 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)']) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-04-05' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +from abc import ABCMeta, abstractmethod |
| 23 | +if '..' not in sys.path: |
| 24 | + sys.path.append('..') |
| 25 | + |
| 26 | +from classes import settings |
| 27 | +from classes import exceptions |
| 28 | +settings = settings.Settings() |
| 29 | +import file_utils |
| 30 | + |
| 31 | +error = 0 |
| 32 | +try: |
| 33 | + import pymongo |
| 34 | + import bson |
| 35 | + from bson.code import Code |
| 36 | + from pymongo.errors import OperationFailure |
| 37 | +except ImportError: |
| 38 | + error += 1 |
| 39 | +try: |
| 40 | + import pycassa |
| 41 | +except ImportError: |
| 42 | + error += 1 |
| 43 | +if error == 2: |
| 44 | + raise exceptions.NoDatabaseProviderInstalled() |
| 45 | + |
| 46 | + |
| 47 | +class AbstractDatabase: |
| 48 | + __metaclass__ = ABCMeta |
| 49 | + |
| 50 | + def __init__(self, dbname, collection): |
| 51 | + self.dbname = dbname |
| 52 | + self.collection = collection |
| 53 | + self.db = self.connect() |
| 54 | + |
| 55 | + def __repr__(self): |
| 56 | + return '%s:%s' % (self.dbname, self.collection) |
| 57 | + |
| 58 | + @abstractmethod |
| 59 | + def connect(self): |
| 60 | + '''Initializes a connection to a specific collection in a database''' |
| 61 | + |
| 62 | + @abstractmethod |
| 63 | + def drop_collection(self): |
| 64 | + '''Deletes a collection from a database''' |
| 65 | + |
| 66 | + @abstractmethod |
| 67 | + def add_index(self, key): |
| 68 | + '''Add index to a collection''' |
| 69 | + |
| 70 | + @abstractmethod |
| 71 | + def insert(self, data): |
| 72 | + '''Insert observation to a collection''' |
| 73 | + |
| 74 | + @abstractmethod |
| 75 | + def update(self, key, data): |
| 76 | + '''Update an observation in a collection''' |
| 77 | + |
| 78 | + @abstractmethod |
| 79 | + def find(self, key, qualifier=None): |
| 80 | + '''Find an observationi in a collection''' |
| 81 | + |
| 82 | + @abstractmethod |
| 83 | + def save(self, data): |
| 84 | + '''Saves an observation and returns the id from the db''' |
| 85 | + |
| 86 | + @abstractmethod |
| 87 | + def count(self): |
| 88 | + '''Counts the number of observations in a collection''' |
| 89 | + |
| 90 | +class Mongo(AbstractDatabase): |
| 91 | + @classmethod |
| 92 | + def is_registrar_for(cls, storage): |
| 93 | + return storage == 'mongo' |
| 94 | + |
| 95 | + def connect(self): |
| 96 | + db = pymongo.Connection() |
| 97 | + return db[self.dbname] |
| 98 | + |
| 99 | + def save(self, data): |
| 100 | + assert isinstance(data, dict), 'You need to feed me dictionaries.' |
| 101 | + return self.db[self.collection].save(data) |
| 102 | + |
| 103 | + def insert(self, data, qualifiers=None): |
| 104 | + assert isinstance(data, dict), 'You need to feed me dictionaries.' |
| 105 | + data = self.stringify_keys(data) |
| 106 | + try: |
| 107 | + if qualifiers: |
| 108 | + self.db[self.collection].insert(data, qualifiers, safe=True) |
| 109 | + else: |
| 110 | + self.db[self.collection].insert(data, safe=True) |
| 111 | + except bson.errors.InvalidDocument, error: |
| 112 | + print error |
| 113 | + print 'BSON document too large, unable to store %s' % \ |
| 114 | + (data.keys()[0]) |
| 115 | + except OperationFailure, error: |
| 116 | + print 'It seems that you are running out of disk space. \ |
| 117 | + Error message: %s' % error |
| 118 | + sys.exit(-1) |
| 119 | + |
| 120 | + def update(self, key, value, data): |
| 121 | + assert isinstance(data, dict), 'You need to feed me dictionaries.' |
| 122 | + self.db[self.collection].update({key: value}, data, upsert=True) |
| 123 | + |
| 124 | + def find(self, key=None, qualifier=None): |
| 125 | + if qualifier == 'min': |
| 126 | + return self.db[self.collection].find({ |
| 127 | + key : {'$ne' : False}}).sort(key, pymongo.ASCENDING).limit(1)[0] |
| 128 | + elif qualifier == 'max': |
| 129 | + return self.db[self.collection].find({ |
| 130 | + key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0] |
| 131 | + elif key != None: |
| 132 | + return self.db[self.collection].find({key: 1}) |
| 133 | + else: |
| 134 | + return self.db[self.collection].find() |
| 135 | + |
| 136 | + def find_one(self, key, value): |
| 137 | + return self.db[self.collection].find_one({key: value}) |
| 138 | + |
| 139 | + def drop_collection(self): |
| 140 | + self.db.drop_collection(self.collection) |
| 141 | + |
| 142 | + def add_index(self, key): |
| 143 | + self.db[self.collection].create_index(key) |
| 144 | + self.db[self.collection].ensure_index(key) |
| 145 | + |
| 146 | + def count(self): |
| 147 | + return self.db[self.collection].count() |
| 148 | + |
| 149 | + def retrieve_distinct_keys(self, key, force_new=False): |
| 150 | + ''' |
| 151 | + TODO: figure out how big the index is and then take appropriate action, |
| 152 | + index < 4mb just do a distinct query, index > 4mb do a map reduce. |
| 153 | + ''' |
| 154 | + if force_new == False and \ |
| 155 | + file_utils.check_file_exists(settings.binary_location, '%s_%s_%s.bin' |
| 156 | + % (self.dbname, self.collection, key)): |
| 157 | + ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' |
| 158 | + % (self.dbname, self.collection, key)) |
| 159 | + else: |
| 160 | + #TODO this is a bit arbitrary, should check if index > 4Mb. |
| 161 | + if self.db[self.collection].count() < 200000: |
| 162 | + ids = self.db[self.collection].distinct(key) |
| 163 | + else: |
| 164 | + ids = self.retrieve_distinct_keys_mapreduce(key) |
| 165 | + file_utils.store_object(ids, settings.binary_location, \ |
| 166 | + '%s_%s_%s.bin' % (self.dbname, |
| 167 | + self.collection, key)) |
| 168 | + return ids |
| 169 | + |
| 170 | + def retrieve_distinct_keys_mapreduce(self, key): |
| 171 | + emit = 'function () { emit(this.%s, 1)};' % key |
| 172 | + map = Code(emit) |
| 173 | + reduce = Code("function()") |
| 174 | + |
| 175 | + ids = [] |
| 176 | + cursor = self.db[self.collection].map_reduce(map, reduce) |
| 177 | + for c in cursor.find(): |
| 178 | + ids.append(c['_id']) |
| 179 | + return ids |
| 180 | + |
| 181 | + def stringify_keys(self, data): |
| 182 | + ''' |
| 183 | + @data should be a dictionary where the keys are not yet strings. This |
| 184 | + function is called just prior any insert / update query in mongo |
| 185 | + because mongo only accepts strings as keys. |
| 186 | + ''' |
| 187 | + new_data = {} |
| 188 | + for key, value in data.iteritems(): |
| 189 | + if isinstance(value, dict): |
| 190 | + new_data[str(key)] = self.stringify_keys(value) |
| 191 | + else: |
| 192 | + new_data[str(key)] = value |
| 193 | + return new_data |
| 194 | + |
| 195 | + def start_server(self, port, path): |
| 196 | + default_port = 27017 |
| 197 | + port = default_port + port |
| 198 | + if settings.platform == 'Windows': |
| 199 | + p = subprocess.Popen([path, '--port %s', port, '--dbpath', |
| 200 | + 'c:\data\db', '--logpath', 'c:\mongodb\logs']) |
| 201 | + elif settings.platform == 'Linux': |
| 202 | + subprocess.Popen([path, '--port %s' % port]) |
| 203 | + elif settings.platform == 'OSX': |
| 204 | + raise exceptions.NotImplementedError |
| 205 | + else: |
| 206 | + raise exceptions.PlatformNotSupportedError(platform) |
| 207 | + |
| 208 | + |
| 209 | +class Cassandra(AbstractDatabase): |
| 210 | + @classmethod |
| 211 | + def is_registrar_for(cls, storage): |
| 212 | + return storage == 'cassandra' |
| 213 | + |
| 214 | + def install_schema(self, drop_first=False): |
| 215 | + sm = pycassa.system_manager.SystemManager('127.0.0.1:9160') |
| 216 | + if drop_first: |
| 217 | + sm.drop_keyspace(keyspace_name) |
| 218 | + |
| 219 | + sm.create_keyspace(keyspace_name, replication_factor=1) |
| 220 | + |
| 221 | + sm.create_column_family(self.dbname, self.collection, |
| 222 | + comparator_type=pycassa.system_manager.UTF8_TYPE, |
| 223 | + default_validation_class=pycassa.system_manager.UTF8_TYPE) |
| 224 | + |
| 225 | + sm.create_index(self.dbname, self.collection, 'article', pycassa.system_manager.UTF8_TYPE) |
| 226 | + sm.create_index(self.dbname, self.collection, 'username', pycassa.system_manager.UTF8_TYPE) |
| 227 | + sm.create_index(self.dbname, self.collection, 'user_id', pycassa.system_manager.LONG_TYPE) |
| 228 | + |
| 229 | + |
| 230 | + |
| 231 | +def Database(storage, dbname, collection): |
| 232 | + for cls in AbstractDatabase.__subclasses__(): |
| 233 | + if cls.is_registrar_for(storage): |
| 234 | + return cls(dbname, collection) |
| 235 | + raise ValueError |
| 236 | + |
| 237 | + |
| 238 | +if __name__ == '__main__': |
| 239 | + db = Database('mongo', 'wikilytics', 'zhwiki_editors_raw') |
| 240 | + ids = db.retrieve_distinct_keys('editor', force_new=True) |
| 241 | + #db.insert({'foo':'bar'}) |
| 242 | + #db.update('foo', 'bar', {}) |
| 243 | + #db.drop_collection() |
| 244 | + print db |
Property changes on: trunk/tools/editor_trends/classes/storage.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 245 | + native |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | |
41 | 41 | from utils import file_utils |
42 | 42 | from utils import data_converter |
43 | | -from database import db |
| 43 | +from classes import storage |
44 | 44 | from analyses import json_encoders |
45 | 45 | from classes import exceptions |
46 | 46 | |
— | — | @@ -445,12 +445,11 @@ |
446 | 446 | |
447 | 447 | def to_mongo(self): |
448 | 448 | dbname = '%s%s' % (self.language_code, self.project) |
449 | | - mongo = db.init_mongo_db(dbname) |
450 | | - coll = mongo['%s_%s' % (dbname, 'charts')] |
451 | | - mongo.add_son_manipulator(Transform()) |
452 | | - coll.remove({'hash':self.hash, 'project':self.project, |
| 449 | + db = storage.Database('mongo', dbname, 'charts') |
| 450 | + db.add_son_manipulator(Transform()) |
| 451 | + db.remove({'hash':self.hash, 'project':self.project, |
453 | 452 | 'language_code':self.language_code}) |
454 | | - coll.insert({'variables': self}) |
| 453 | + db.insert({'variables': self}) |
455 | 454 | |
456 | 455 | def to_csv(self): |
457 | 456 | data = data_converter.convert_dataset_to_lists(self, 'manage') |
— | — | @@ -542,8 +541,7 @@ |
543 | 542 | |
544 | 543 | |
545 | 544 | def debug(): |
546 | | - mongo = db.init_mongo_db('enwiki') |
547 | | - rawdata = mongo['enwiki_charts'] |
| 545 | + db = storage.Database('mongo', 'wikilytics', 'enwiki_charts') |
548 | 546 | mongo.add_son_manipulator(Transform()) |
549 | 547 | |
550 | 548 | d1 = datetime.datetime.today() |
Index: trunk/tools/editor_trends/classes/analytics.py |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | sys.path.append('..') |
26 | 26 | |
27 | 27 | from classes import consumers |
28 | | -from database import db |
| 28 | +from classes import storage |
29 | 29 | |
30 | 30 | class Replicator: |
31 | 31 | def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs): |
— | — | @@ -56,13 +56,9 @@ |
57 | 57 | |
58 | 58 | def __call__(self): |
59 | 59 | project = 'wiki' |
60 | | - |
61 | 60 | #rts = runtime_settings.init_environment('wiki', 'en', args) |
62 | | - |
63 | 61 | for lang in self.languages: |
64 | 62 | self.rts = runtime_settings.init_environment(project, lang, self.args) |
65 | | - #TEMP FIX, REMOVE |
66 | | - #rts.dbname = 'enwiki' |
67 | 63 | self.rts.editors_dataset = 'editors_dataset' |
68 | 64 | |
69 | 65 | self.rts.dbname = '%s%s' % (lang, project) |
— | — | @@ -70,7 +66,8 @@ |
71 | 67 | for cutoff in self.cutoff: |
72 | 68 | generate_chart_data(self.rts, self.plugin, |
73 | 69 | time_unit=self.time_unit, |
74 | | - cutoff=cutoff, cum_cutoff=cum_cutoff, |
| 70 | + cutoff=cutoff, |
| 71 | + cum_cutoff=cum_cutoff, |
75 | 72 | **self.kwargs) |
76 | 73 | |
77 | 74 | |
— | — | @@ -84,8 +81,7 @@ |
85 | 82 | Generic loop function that loops over all the editors of a Wikipedia |
86 | 83 | project and then calls the plugin that does the actual mapping. |
87 | 84 | ''' |
88 | | - mongo = db.init_mongo_db(self.rts.dbname) |
89 | | - coll = mongo[self.rts.editors_dataset] |
| 85 | + db = storage.Database('mongo', self.rts.dbname, self.rts.editors_dataset) |
90 | 86 | while True: |
91 | 87 | try: |
92 | 88 | task = self.tasks.get(block=False) |
— | — | @@ -93,7 +89,7 @@ |
94 | 90 | if task == None: |
95 | 91 | self.result.put(self.var) |
96 | 92 | break |
97 | | - editor = coll.find_one({'editor': task.editor}) |
| 93 | + editor = db.find_one('editor', task.editor) |
98 | 94 | |
99 | 95 | task.plugin(self.var, editor, dbname=self.rts.dbname) |
100 | 96 | self.result.put(True) |
Index: trunk/tools/editor_trends/utils/inventory.py |
— | — | @@ -27,7 +27,7 @@ |
28 | 28 | from classes import settings |
29 | 29 | settings = settings.Settings() |
30 | 30 | |
31 | | -from database import db |
| 31 | +from classes import storage |
32 | 32 | from utils import http_utils |
33 | 33 | from classes import runtime_settings |
34 | 34 | from classes import languages |
— | — | @@ -65,11 +65,9 @@ |
66 | 66 | |
67 | 67 | |
68 | 68 | def store_available_dumps(self): |
69 | | - mongo = db.init_mongo_db('wikilytics') |
70 | | - coll = mongo['available_dumps'] |
| 69 | + db = storage.Database('mongo', 'wikilytics', 'available_dumps') |
| 70 | + db.save({'project': self.project, 'dumps': self.data}) |
71 | 71 | |
72 | | - coll.save({'project': self.project, 'dumps': self.data}) |
73 | | - |
74 | 72 | def run(self): |
75 | 73 | project = self.props.projects[self.project] |
76 | 74 | langs = self.props.project_supports_language(project) |
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -136,11 +136,11 @@ |
137 | 137 | ''' |
138 | 138 | print 'Unzipping zip file' |
139 | 139 | stopwatch = timer.Timer() |
140 | | - log.log_to_mongo(properties, 'dataset', 'unpack', stopwatch, event='start') |
| 140 | + log.to_db(properties, 'dataset', 'unpack', stopwatch, event='start') |
141 | 141 | compressor = Compressor(location, filename) |
142 | 142 | retcode = compressor.extract() |
143 | 143 | stopwatch.elapsed() |
144 | | - log.log_to_mongo(properties, 'dataset', 'unpack', stopwatch, event='finish') |
| 144 | + log.to_db(properties, 'dataset', 'unpack', stopwatch, event='finish') |
145 | 145 | return retcode |
146 | 146 | |
147 | 147 | |
Index: trunk/tools/editor_trends/utils/log.py |
— | — | @@ -26,35 +26,33 @@ |
27 | 27 | from classes import settings |
28 | 28 | settings = settings.Settings() |
29 | 29 | |
30 | | -from database import db |
| 30 | +from classes import storage |
31 | 31 | |
32 | | -def log_to_mongo(rts, jobtype, task, timer, event='start'): |
33 | | - conn = db.init_mongo_db(rts.dbname) |
| 32 | +def to_db(rts, jobtype, task, timer, event='start'): |
| 33 | + db = storage.Database('mongo', rts.dbname, 'jobs') |
34 | 34 | created = datetime.datetime.now() |
35 | 35 | hash = '%s_%s' % (rts.project, rts.hash) |
36 | | - coll = conn['jobs'] |
37 | 36 | |
38 | | - job = coll.find_one({'hash': hash}) |
| 37 | + job = db.find_one('hash', hash) |
39 | 38 | |
| 39 | + data = {'hash': hash, |
| 40 | + 'created': created, |
| 41 | + 'jobtype': jobtype, |
| 42 | + 'in_progress': True, |
| 43 | + 'language_code': rts.language.code, |
| 44 | + 'project': rts.project.name, |
| 45 | + 'tasks': {}, |
| 46 | + } |
| 47 | + |
40 | 48 | if job == None: |
41 | 49 | if jobtype == 'dataset': |
42 | | - _id = coll.save({'hash': hash, 'created': created, 'finished': False, |
43 | | - 'language_code': rts.language.code, |
44 | | - 'project': rts.project.name, |
45 | | - 'in_progress': True, 'jobtype': jobtype, |
46 | | - 'tasks': {}}) |
47 | | - |
48 | | - |
| 50 | + data['finished'] = False |
| 51 | + _id = db.save(data) |
49 | 52 | elif jobtype == 'chart': |
50 | | - _id = coll.save({'hash': hash, 'created': created, |
51 | | - 'jobtype': jobtype, |
52 | | - 'finished': True, |
53 | | - 'in_progress': True, |
54 | | - 'project': rts.project.name, |
55 | | - 'language_code': rts.language.code, |
56 | | - 'tasks': {}}) |
| 53 | + data['finished'] = True |
| 54 | + _id = db.save(data) |
57 | 55 | |
58 | | - job = coll.find_one({'_id': _id}) |
| 56 | + job = db.find_one('_id', _id) |
59 | 57 | |
60 | 58 | tasks = job['tasks'] |
61 | 59 | t = tasks.get(task, {}) |
— | — | @@ -62,20 +60,22 @@ |
63 | 61 | t['start'] = timer.t0 |
64 | 62 | t['in_progress'] = True |
65 | 63 | tasks[task] = t |
66 | | - coll.update({'hash': hash}, {'$set': {'tasks': tasks}}) |
| 64 | + db.update('hash', hash, {'$set': {'tasks': tasks}}) |
| 65 | + #coll.update({'hash': hash}, {'$set': {'tasks': tasks}}) |
67 | 66 | elif event == 'finish': |
68 | 67 | t['finish'] = timer.t1 |
69 | 68 | t['in_progress'] = False |
70 | 69 | tasks[task] = t |
71 | | - if task == 'transform' or jobtype == 'chart': #final task, set entire task to finished |
72 | | - coll.update({'hash': hash}, {'$set': {'tasks': tasks, |
73 | | - 'in_progress': False, |
74 | | - 'finished': True}}) |
| 70 | + if task == 'transform' or jobtype == 'chart': |
| 71 | + #final task, set entire task to finished |
| 72 | + db.update('hash', hash, {'$set': {'tasks': tasks, |
| 73 | + 'in_progress': False, |
| 74 | + 'finished': True}}) |
75 | 75 | else: |
76 | | - coll.update({'hash': hash}, {'$set': {'tasks': tasks}}) |
| 76 | + db.update('hash', hash, {'$set': {'tasks': tasks}}) |
77 | 77 | |
78 | 78 | |
79 | | -def log_to_csv(logger, settings, message, verb, function, **kwargs): |
| 79 | +def to_csv(logger, settings, message, verb, function, **kwargs): |
80 | 80 | ''' |
81 | 81 | Writes detailed log information to logs / projectname_date.csv |
82 | 82 | ''' |
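The classes.storage module these hunks migrate to is not part of this excerpt. Judging purely from the call sites introduced above (find_one, find, save, update, insert, remove, add_son_manipulator, retrieve_distinct_keys), its Mongo backend presumably wraps a pymongo collection roughly as sketched below; every method body here is inferred from usage and may differ from the actual classes/storage.py.

    import pymongo

    class Database:
        '''Sketch of the interface implied by the new call sites; not the real implementation.'''

        def __init__(self, backend, dbname, collection):
            assert backend == 'mongo'  # only the Mongo backend appears in this changeset
            self.conn = pymongo.Connection()
            self.db = self.conn[dbname]
            self.collection = self.db[collection]

        def add_son_manipulator(self, manipulator):
            # dataset.py registers the Transform SON manipulator this way
            self.db.add_son_manipulator(manipulator)

        def find_one(self, key, value):
            return self.collection.find_one({key: value})

        def find(self, key=None, qualifier=None):
            # analyzer.py calls find(var, 'min'/'max'); cronjobs.py passes a full query dict
            if qualifier == 'min':
                return self.collection.find({key: {'$ne': False}}).sort(key, pymongo.ASCENDING).limit(1)[0]
            elif qualifier == 'max':
                return self.collection.find({key: {'$ne': False}}).sort(key, pymongo.DESCENDING).limit(1)[0]
            elif isinstance(key, dict):
                return self.collection.find(key)
            return self.collection.find()

        def retrieve_distinct_keys(self, field):
            return self.collection.distinct(field)

        def save(self, data):
            return self.collection.save(data)

        def insert(self, data):
            return self.collection.insert(data)

        def update(self, key, value, data):
            return self.collection.update({key: value}, data)

        def remove(self, query):
            return self.collection.remove(query)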
Index: trunk/tools/editor_trends/database/cassandra.py |
— | — | @@ -1,36 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-02-25' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import pycassa |
22 | | - |
23 | | -def install_schema(keyspace_name, drop_first=False): |
24 | | - |
25 | | - sm = pycassa.system_manager.SystemManager('127.0.0.1:9160') |
26 | | - if drop_first: |
27 | | - sm.drop_keyspace(keyspace_name) |
28 | | - |
29 | | - sm.create_keyspace(keyspace_name, replication_factor=1) |
30 | | - |
31 | | - sm.create_column_family(keyspace_name, 'revisions', |
32 | | - comparator_type=pycassa.system_manager.UTF8_TYPE, |
33 | | - default_validation_class=pycassa.system_manager.UTF8_TYPE) |
34 | | - |
35 | | - sm.create_index(keyspace_name, 'revisions', 'article', pycassa.system_manager.UTF8_TYPE) |
36 | | - sm.create_index(keyspace_name, 'revisions', 'username', pycassa.system_manager.UTF8_TYPE) |
37 | | - sm.create_index(keyspace_name, 'revisions', 'user_id', pycassa.system_manager.LONG_TYPE) |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -1,168 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -import pymongo |
23 | | -from bson.code import Code |
24 | | - |
25 | | -if '..' not in sys.path: |
26 | | - sys.path.append('..') |
27 | | - |
28 | | -from classes import settings |
29 | | -settings = settings.Settings() |
30 | | -import file_utils |
31 | | - |
32 | | - |
33 | | -def init_mongo_db(dbname): |
34 | | - connection = pymongo.Connection() |
35 | | - db = connection[dbname] |
36 | | - return db |
37 | | - |
38 | | - |
39 | | -def drop_collection(dbname, collection): |
40 | | - db = init_mongo_db(dbname) |
41 | | - db.drop_collection(collection) |
42 | | - |
43 | | - |
44 | | -def get_collections(dbname): |
45 | | - db = init_mongo_db(dbname) |
46 | | - return db.collection_names() |
47 | | - |
48 | | - |
49 | | -def count_records(dbname, collection): |
50 | | - db = init_mongo_db(dbname) |
51 | | - return db[collection].count() |
52 | | - |
53 | | - |
54 | | -def cleanup_database(dbname, logger, endswith=None): |
55 | | - coll = get_collections(dbname) |
56 | | - for c in coll: |
57 | | - if not c.startswith('system'): |
58 | | - if endswith != None and c.endswith(endswith): |
59 | | - drop_collection(dbname, c) |
60 | | - logger.debug('Deleting collection %s from database %s.' % (c, dbname)) |
61 | | - |
62 | | - |
63 | | -def remove_documents_from_mongo_db(collection, ids): |
64 | | - collection.remove(ids) |
65 | | - |
66 | | - |
67 | | -def add_index_to_collection(db, collection, key): |
68 | | - ''' |
69 | | - @db is the name of the mongodb |
70 | | - @collection is the name of the 'table' in mongodb |
71 | | - @key name of the field to create the index |
72 | | - ''' |
73 | | - |
74 | | - mongo = init_mongo_db(db) |
75 | | - collection = mongo[collection] |
76 | | - mongo.collection.create_index(key) |
77 | | - mongo.collection.ensure_index(key) |
78 | | - |
79 | | - |
80 | | -def run_query(dbname, collection, var, qualifier=None): |
81 | | - mongo = init_mongo_db(dbname) |
82 | | - collection = mongo[collection] |
83 | | - if qualifier == 'min': |
84 | | - return collection.find({var : {'$ne' : False}}).sort(var, pymongo.ASCENDING).limit(1)[0] |
85 | | - elif qualifier == 'max': |
86 | | - return collection.find({var : {'$ne' : False}}).sort(var, pymongo.DESCENDING).limit(1)[0] |
87 | | - else: |
88 | | - return collection.find({var: 1}) |
89 | | - |
90 | | - |
91 | | -def stringify_keys(obj): |
92 | | - ''' |
93 | | - @obj should be a dictionary where the keys are not yet strings. this function |
94 | | - is called just prior any insert / update query in mongo because mongo only |
95 | | - accepts strings as keys. |
96 | | - ''' |
97 | | - d = {} |
98 | | - for o in obj: |
99 | | - if isinstance(obj[o], dict): |
100 | | - #if type(obj[o]) == type({}): |
101 | | - obj[o] = stringify_keys(obj[o]) |
102 | | - d[str(o)] = obj[o] |
103 | | - return d |
104 | | - |
105 | | - |
106 | | -def retrieve_min_value(dbname, collection, var): |
107 | | - mongo = init_mongo_db(dbname) |
108 | | - coll = mongo[collection] |
109 | | - emit = 'function () {emit(this.editor, this.edit_count);}' |
110 | | - map = Code(emit) |
111 | | - reduce = Code("function()") |
112 | | -# reduce = Code("""reduce = function (key, value) { |
113 | | -# return Math.max(value); |
114 | | -# } |
115 | | -# """) |
116 | | - cursor = coll.map_reduce(map, reduce) |
117 | | - |
118 | | - data = [] |
119 | | - for c in cursor.find(): |
120 | | - data.append(c) |
121 | | - return data |
122 | | - |
123 | | - |
124 | | -def retrieve_distinct_keys(dbname, collection, field, force_new=False): |
125 | | - #mongo = init_mongo_db(dbname) |
126 | | - #editors = mongo[collection] |
127 | | - #ids = retrieve_distinct_keys_mapreduce(editors, field) |
128 | | - ''' |
129 | | - TODO: figure how big the index is and then take appropriate action, index |
130 | | - < 4mb just do a distinct query, index > 4mb do a map reduce. |
131 | | - ''' |
132 | | - if force_new == False and file_utils.check_file_exists(settings.binary_location, |
133 | | - '%s_%s_%s.bin' % (dbname, collection, field)): |
134 | | - ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
135 | | - else: |
136 | | - mongo = init_mongo_db(dbname) |
137 | | - editors = mongo[collection] |
138 | | - #index_size = mongo.stats() |
139 | | - if editors.count () < 200000: #TODO this is a bit arbitrary, should check if index > 4Mb. |
140 | | - ids = editors.distinct(field) |
141 | | - else: |
142 | | - #params = {} |
143 | | - #params['size'] = 'size' |
144 | | - #size = editors.find_one({'size': 1}) |
145 | | - ids = retrieve_distinct_keys_mapreduce(editors, field) |
146 | | - file_utils.store_object(ids, settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
147 | | - return ids |
148 | | - |
149 | | - |
150 | | -def retrieve_distinct_keys_mapreduce(collection, field): |
151 | | - emit = 'function () { emit(this.%s, 1)};' % field |
152 | | - map = Code(emit) |
153 | | - |
154 | | - reduce = Code("function()") |
155 | | - |
156 | | - ids = [] |
157 | | - cursor = collection.map_reduce(map, reduce) |
158 | | - for c in cursor.find(): |
159 | | - ids.append(c['_id']) |
160 | | - return ids |
161 | | - |
162 | | - |
163 | | -def debug(): |
164 | | - #retrieve_distinct_keys('enwiki', 'editors_dataset', 'editor') |
165 | | - retrieve_min_value('enwiki', 'editors_dataset', 'new_wikipedian') |
166 | | - |
167 | | - |
168 | | -if __name__ == '__main__': |
169 | | - debug() |
Index: trunk/tools/editor_trends/database/launcher.py |
— | — | @@ -28,15 +28,15 @@ |
29 | 29 | from utils import file_utils |
30 | 30 | |
31 | 31 | |
32 | | -def start_mongodb_server(x, path): |
| 32 | +def start_mongodb_server(port, path): |
33 | 33 | default_port = 27017 |
34 | | - port = default_port + x |
| 34 | + port = default_port + port |
35 | 35 | if settings.platform == 'Windows': |
36 | 36 | p = subprocess.Popen([path, '--port', str(port), '--dbpath', 'c:\data\db', '--logpath', 'c:\mongodb\logs']) |
37 | 37 | elif settings.platform == 'Linux': |
38 | 38 | subprocess.Popen([path, '--port %s' % port]) |
39 | 39 | elif settings.platform == 'OSX': |
40 | | - raise NotImplementedError |
| 40 | + raise exceptions.NotImplementedError |
41 | 41 | else: |
42 | 42 | raise exceptions.PlatformNotSupportedError(platform) |
43 | 43 | |
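The renamed first argument of start_mongodb_server is still an offset that gets added to MongoDB's default port, not an absolute port number. A hypothetical call (the mongod path is an assumption):

    # starts mongod on port 27018, i.e. 27017 + 1
    start_mongodb_server(1, '/usr/local/bin/mongod')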
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -20,8 +20,6 @@ |
21 | 21 | |
22 | 22 | import datetime |
23 | 23 | import sys |
24 | | -import bson |
25 | | -from pymongo.errors import OperationFailure |
26 | 24 | |
27 | 25 | if '..' not in sys.path: |
28 | 26 | sys.path.append('..') |
— | — | @@ -29,13 +27,12 @@ |
30 | 28 | from classes import settings |
31 | 29 | settings = settings.Settings() |
32 | 30 | |
33 | | -import db |
34 | 31 | from utils import file_utils |
35 | 32 | from utils import data_converter |
36 | 33 | |
37 | | -class EditorCache(object): |
38 | | - def __init__(self, collection): |
39 | | - self.collection = collection |
| 34 | +class EditorCache: |
| 35 | + def __init__(self, db): |
| 36 | + self.db = db |
40 | 37 | self.editors = {} |
41 | 38 | self.final_year = datetime.datetime.now().year + 1 |
42 | 39 | self.n = 0 |
— | — | @@ -57,8 +54,7 @@ |
58 | 55 | def add(self, key, value): |
59 | 56 | if value == 'NEXT': |
60 | 57 | self.n += 1 |
61 | | - edits = db.stringify_keys(self.editors[key]['edits']) |
62 | | - edits = self.drop_years_no_obs(edits) |
| 58 | + edits = self.drop_years_no_obs(self.editors[key]['edits']) |
63 | 59 | self.insert(key, edits, self.editors[key]['username']) |
64 | 60 | del self.editors[key] |
65 | 61 | else: |
— | — | @@ -74,22 +70,21 @@ |
75 | 71 | self.editors[key]['edits'][year].append(value) |
76 | 72 | self.editors[key]['obs'] += 1 |
77 | 73 | |
78 | | - def update(self, editor, values): |
79 | | - self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
80 | | - |
81 | 74 | def insert(self, editor, values, username): |
82 | | - ''' |
83 | | - Adding the safe=True statement slows down the insert process but this |
84 | | - assures that all data will be written. |
85 | | - ''' |
86 | | - try: |
87 | | - self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
88 | | - except bson.errors.InvalidDocument: |
89 | | - print 'BSON document too large, unable to store %s' % (username) |
90 | | - except OperationFailure, error: |
91 | | - print error |
92 | | - print 'It seems that you are running out of disk space.' |
93 | | - sys.exit(-1) |
| 75 | + data = {'editor': editor, 'edits': values, 'username': username} |
| 76 | + self.db.insert(data) |
| 77 | +# ''' |
| 78 | +# Adding the safe=True statement slows down the insert process but this |
| 79 | +# assures that all data will be written. |
| 80 | +# ''' |
| 81 | +# try: |
| 82 | +# self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
| 83 | +# except bson.errors.InvalidDocument: |
| 84 | +# print 'BSON document too large, unable to store %s' % (username) |
| 85 | +# except OperationFailure, error: |
| 86 | +# print error |
| 87 | +# print 'It seems that you are running out of disk space.' |
| 88 | +# sys.exit(-1) |
94 | 89 | |
95 | 90 | def store(self): |
96 | 91 | file_utils.store_object(self, settings.binary_location, self.__repr__()) |
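With the constructor now taking a storage.Database handle instead of a raw pymongo collection, wiring up the cache presumably looks roughly like this; the database and collection names and the editor id are illustrative, and how the per-editor username is seeded is not visible in this hunk.

    db = storage.Database('mongo', 'enwiki', 'editors_dataset')
    cache = EditorCache(db)
    # ... feed per-editor edits via cache.add(editor_id, edit_dict) ...
    cache.add('12345', 'NEXT')  # the 'NEXT' sentinel flushes editor 12345 to MongoDB via db.insert()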
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -22,7 +22,7 @@ |
23 | 23 | |
24 | 24 | import manage as manager |
25 | 25 | |
26 | | -from database import db |
| 26 | +from classes import storage |
27 | 27 | from classes import languages |
28 | 28 | from classes import projects |
29 | 29 | from classes import runtime_settings |
— | — | @@ -86,12 +86,11 @@ |
87 | 87 | This is the main entry point, it creates a queue with jobs and determines |
88 | 88 | the type of job and fires it off |
89 | 89 | ''' |
90 | | - mongo = db.init_mongo_db('wikilytics') |
91 | | - coll = mongo['jobs'] |
| 90 | + db = storage.Database('mongo', 'wikilytics', 'jobs') |
92 | 91 | tasks = [] |
93 | 92 | project, language, parser = manager.init_args_parser() |
94 | 93 | args = parser.parse_args(['django']) |
95 | | - jobs = coll.find({'finished': False, 'in_progress': False, 'error': False}) |
| 94 | + jobs = db.find({'finished': False, 'in_progress': False, 'error': False}) |
96 | 95 | for job in jobs: |
97 | 96 | tasks.append(job) |
98 | 97 | |
— | — | @@ -99,20 +98,19 @@ |
100 | 99 | if task['jobtype'] == 'dataset': |
101 | 100 | print 'Launching the Editor Trends Analytics Toolkit.' |
102 | 101 | res = launch_editor_trends_toolkit(task, args) |
103 | | - #res = False |
104 | 102 | else: |
105 | 103 | print 'Launching %s.' % task['jobtype'] |
106 | 104 | res = launch_chart(task, args) |
107 | 105 | |
108 | 106 | if res: |
109 | | - coll.update({'_id': task['_id']}, {'$set': {'finished': True}}) |
| 107 + db.update('_id', task['_id'], {'$set': {'finished': True}})
110 | 108 | else: |
111 | 109 | ''' |
112 | 110 | To prevent jobs from recurring non-stop, set error to True. These |
113 | 111 | jobs will be excluded and need to be investigated to see what's |
114 | 112 | happening. |
115 | 113 | ''' |
116 | | - coll.update({'_id': task['_id']}, {'$set': {'error': True}}) |
| 114 | + db.update('_id', task['_id'], {'$set': {'error': True}}) |
117 | 115 | |
118 | 116 | |
119 | 117 | def debug(): |
Index: trunk/tools/editor_trends/bots/detector.py |
— | — | @@ -1,283 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-25' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | - |
22 | | -import os |
23 | | -import cStringIO |
24 | | -import multiprocessing |
25 | | -import xml.etree.cElementTree as cElementTree |
26 | | -import sys |
27 | | -from Queue import Empty |
28 | | - |
29 | | -if '..' not in sys.path: |
30 | | - sys.path.append('..') |
31 | | - |
32 | | -from classes import settings |
33 | | -settings = settings.Settings() |
34 | | - |
35 | | - |
36 | | -from database import db |
37 | | -from utils import file_utils |
38 | | -from utils import messages |
39 | | - |
40 | | -from classes import consumers |
41 | | -from classes import bots |
42 | | - |
43 | | -try: |
44 | | - import psyco |
45 | | - psyco.full() |
46 | | -except ImportError: |
47 | | - pass |
48 | | - |
49 | | - |
50 | | -def read_bots_csv_file(location, filename, encoding, manager=False): |
51 | | - ''' |
52 | | - Constructs a dictionary from Bots.csv |
53 | | - key is language |
54 | | - value is a list of bot names |
55 | | - ''' |
56 | | - if manager: |
57 | | - bot_dict = manager.dict() |
58 | | - else: |
59 | | - bot_dict = dict() |
60 | | - for line in file_utils.read_data_from_csv(location, filename, encoding): |
61 | | - line = line.strip() |
62 | | - language, bots = line.split(',') |
63 | | - bots = bots.split('|') |
64 | | - for bot in bots: |
65 | | - if bot not in bot_dict: |
66 | | - b = botconsumers.Bot(bot) |
67 | | - b.id = None |
68 | | - else: |
69 | | - b = bot_dict[bot] |
70 | | - b.projects.append(language) |
71 | | - bot_dict[bot] = b |
72 | | - return bot_dict |
73 | | - |
74 | | - |
75 | | -def retrieve_bots(language_code): |
76 | | - ''' |
77 | | - Loader function to retrieve list of id's of known Wikipedia bots. |
78 | | - ''' |
79 | | - ids = [] |
80 | | - mongo = db.init_mongo_db('bots') |
81 | | - bots = mongo['ids'] |
82 | | - cursor = bots.find() |
83 | | - for bot in cursor: |
84 | | - if bot['verified'] == 'True' and language_code in bot['projects']: |
85 | | - ids.append(bot['name']) |
86 | | - return ids |
87 | | - |
88 | | - |
89 | | -def store_bots(): |
90 | | - ''' |
91 | | - This file reads the results from the lookup_bot_userid function and stores |
92 | | - it in a MongoDB collection. |
93 | | - ''' |
94 | | - keys = ['name', 'verified', 'projects'] |
95 | | - bots = file_utils.create_dict_from_csv_file(settings.csv_location, |
96 | | - 'bots_ids.csv', |
97 | | - 'utf-8', |
98 | | - keys) |
99 | | - mongo = db.init_mongo_db('bots') |
100 | | - collection = mongo['ids'] |
101 | | - db.remove_documents_from_mongo_db(collection, None) |
102 | | - |
103 | | - for id in bots: |
104 | | - bot = bots[id] |
105 | | - data = dict([(k, bot[k]) for k in keys]) |
106 | | - data['id'] = id |
107 | | - collection.insert(data) |
108 | | - |
109 | | - print 'Stored %s bots' % collection.count() |
110 | | - |
111 | | - |
112 | | -def convert_object_to_dict(obj, exclude=[]): |
113 | | - ''' |
114 | | - @obj is an arbitray object where the properties need to be translated to |
115 | | - keys and values to ease writing to a csv file. |
116 | | - ''' |
117 | | - d = {} |
118 | | - for kw in obj.__dict__.keys(): |
119 | | - if kw not in exclude: |
120 | | - d[kw] = getattr(obj, kw) |
121 | | - return d |
122 | | - |
123 | | - |
124 | | -def write_bot_list_to_csv(bots, keys): |
125 | | - fh = file_utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', |
126 | | - 'w', 'utf-8') |
127 | | - bot_dict = convert_object_to_dict(bots, exclude=['time', 'written']) |
128 | | - for bot in bot_dict: |
129 | | - bot = bot_dict[bot] |
130 | | - file_utils.write_dict_to_csv(bot, fh, keys, write_key=False, |
131 | | - newline=True) |
132 | | - fh.close() |
133 | | - |
134 | | - |
135 | | -def lookup_bot_userid(data, fh, bots, keys): |
136 | | - ''' |
137 | | - This function is used to find the id's belonging to the different bots that |
138 | | - are patrolling the Wikipedia sites. |
139 | | - @xml_nodes is a list of xml elements that need to be parsed |
140 | | - @bots is a dictionary containing the names of the bots to lookup |
141 | | - ''' |
142 | | - username = data[3] |
143 | | - if username in bots: |
144 | | - bot = bots.pop(username) |
145 | | - setattr(bot, 'id', data[0]) |
146 | | - setattr(bot, 'verified', True) |
147 | | - bot = convert_object_to_dict(bot, exclude=['time']) |
148 | | - file_utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
149 | | - return bots |
150 | | - |
151 | | - |
152 | | -def create_bot_validation_dataset(xml_nodes, fh, bots): |
153 | | - revisions = xml_nodes.findall('revision') |
154 | | - for revision in revisions: |
155 | | - contributor = revision.find('contributor') |
156 | | - #contributor = xml.retrieve_xml_node(revision, 'contributor') |
157 | | - username = contributor.find('username') |
158 | | - if username == None or username.text == None: |
159 | | - continue |
160 | | - else: |
161 | | - username = username.text.lower() |
162 | | - |
163 | | - #print username.encode('utf-8') |
164 | | - if username.find('bot') > -1 or username.find('script') > -1: |
165 | | - bot = bots.get(username, botconsumers.Bot(username, verified=False)) |
166 | | - bot.id = contributor.find('id').text |
167 | | - timestamp = revision.find('timestamp').text |
168 | | - if timestamp != None: |
169 | | - timestamp = file_utils.convert_timestamp_to_datetime_naive(timestamp, settings.timestamp_format) |
170 | | - bot.time[str(timestamp.year)].append(timestamp) |
171 | | - bots[username] = bot |
172 | | - |
173 | | - return bots |
174 | | - |
175 | | - #bot = bots.get('PseudoBot') |
176 | | - #bot.hours_active() |
177 | | - #bot.avg_lag_between_edits() |
178 | | - |
179 | | - |
180 | | -def bot_launcher(language_code, project, target, action, single=False, manager=False): |
181 | | - ''' |
182 | | - This function sets the stage to launch bot id detection and collecting data |
183 | | - to discover new bots. |
184 | | - ''' |
185 | | - file_utils.delete_file(settings.csv_location, 'bots_ids.csv') |
186 | | - location = os.path.join(settings.input_location, language_code, project) |
187 | | - input_xml = os.path.join(location, 'chunks') |
188 | | - input_txt = os.path.join(location, 'txt') |
189 | | - |
190 | | - tasks = multiprocessing.JoinableQueue() |
191 | | - mgr = multiprocessing.Manager() |
192 | | - keys = ['id', 'name', 'verified', 'projects'] |
193 | | - |
194 | | - if action == 'lookup': |
195 | | - output_file = 'bots_ids.csv' |
196 | | - files = file_utils.retrieve_file_list(input_txt, 'txt', mask=None) |
197 | | - input_queue = pc.load_queue(files, poison_pill=True) |
198 | | - bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', 'utf-8', manager=manager) |
199 | | - for file in files: |
200 | | - tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
201 | | - |
202 | | - else: |
203 | | - output_file = 'bots_predictionset.csv' |
204 | | - files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None) |
205 | | - bots = {} |
206 | | - for file in files: |
207 | | - tasks.put(consumers.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
208 | | - |
209 | | - #lock = mgr.Lock() |
210 | | - if manager: |
211 | | - manager = mgr |
212 | | - |
213 | | - tracker = {} |
214 | | - if single: |
215 | | - while True: |
216 | | - try: |
217 | | - print '%s files left in the queue...' % messages.show(tasks.qsize) |
218 | | - task = tasks.get(block=False) |
219 | | - bots = task(bots) |
220 | | - except Empty: |
221 | | - break |
222 | | - else: |
223 | | - bot_launcher_multi(tasks) |
224 | | - |
225 | | - file_utils.store_object(bots, settings.binary_location, 'bots.bin') |
226 | | - if action == 'lookup': |
227 | | - store_bots() |
228 | | - if bots != {}: |
229 | | - print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots) |
230 | | - keys = bots.keys() |
231 | | - for key in keys: |
232 | | - try: |
233 | | - print '%s' % key.encode('utf-8') |
234 | | - except: |
235 | | - pass |
236 | | - else: |
237 | | - bot_training_dataset(bots) |
238 | | - #write_bot_list_to_csv(bots, keys) |
239 | | - |
240 | | - |
241 | | -def bot_training_dataset(bots): |
242 | | - fh = file_utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', 'utf-8') |
243 | | - keys = bots.keys() |
244 | | - for key in keys: |
245 | | - bot = bots.get(key) |
246 | | - bot.hours_active() |
247 | | - bot.avg_lag_between_edits() |
248 | | - bot.write_training_dataset(fh) |
249 | | - |
250 | | - fh.close() |
251 | | - |
252 | | - |
253 | | -def bot_launcher_multi(tasks): |
254 | | - ''' |
255 | | - This is the launcher that uses multiprocesses. |
256 | | - ''' |
257 | | - consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
258 | | - for x in xrange(settings.number_of_processes): |
259 | | - tasks.put(None) |
260 | | - |
261 | | - for w in consumers: |
262 | | - w.start() |
263 | | - |
264 | | - tasks.join() |
265 | | - |
266 | | - |
267 | | -def debug_bots_dict(): |
268 | | - bots = file_utils.load_object(settings.binary_location, 'bots.bin') |
269 | | - for bot in bots: |
270 | | - bot = bots[bot] |
271 | | - edits = sum([len(bot.time[t]) for t in bot.time]) |
272 | | - print bot.name, bot.verified, edits |
273 | | - print 'done' |
274 | | - return bots |
275 | | - |
276 | | - |
277 | | -if __name__ == '__main__': |
278 | | - language_code = 'en' |
279 | | - project = 'wiki' |
280 | | - #store_bots() |
281 | | - #bots = debug_bots_dict() |
282 | | - #write_bot_list_to_csv(bots) |
283 | | - #language_code, project, lookup_bot_userid, single = False, manager = False |
284 | | - bot_launcher(language_code, project, create_bot_validation_dataset, action='training', single=True, manager=False) |
Index: trunk/tools/editor_trends/code-snippets/mongodb/store.py |
— | — | @@ -6,7 +6,7 @@ |
7 | 7 | import datetime |
8 | 8 | import calendar |
9 | 9 | import time |
10 | | -from database import db |
| 10 | +from classes import storage |
11 | 11 | |
12 | 12 | |
13 | 13 | def test_date(): |
Index: trunk/tools/editor_trends/code-snippets/cassandra.py |
— | — | @@ -0,0 +1,36 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-02-25' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import pycassa |
| 22 | + |
| 23 | +def install_schema(keyspace_name, drop_first=False): |
| 24 | + |
| 25 | + sm = pycassa.system_manager.SystemManager('127.0.0.1:9160') |
| 26 | + if drop_first: |
| 27 | + sm.drop_keyspace(keyspace_name) |
| 28 | + |
| 29 | + sm.create_keyspace(keyspace_name, replication_factor=1) |
| 30 | + |
| 31 | + sm.create_column_family(keyspace_name, 'revisions', |
| 32 | + comparator_type=pycassa.system_manager.UTF8_TYPE, |
| 33 | + default_validation_class=pycassa.system_manager.UTF8_TYPE) |
| 34 | + |
| 35 | + sm.create_index(keyspace_name, 'revisions', 'article', pycassa.system_manager.UTF8_TYPE) |
| 36 | + sm.create_index(keyspace_name, 'revisions', 'username', pycassa.system_manager.UTF8_TYPE) |
| 37 | + sm.create_index(keyspace_name, 'revisions', 'user_id', pycassa.system_manager.LONG_TYPE) |
Property changes on: trunk/tools/editor_trends/code-snippets/cassandra.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 38 | + native |
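This snippet assumes pycassa and a Cassandra node listening on 127.0.0.1:9160. A hypothetical invocation that rebuilds the keyspace from scratch (the keyspace name is an assumption):

    # drop and recreate the keyspace, then add the 'revisions' column family and its indexes
    install_schema('wikilytics', drop_first=True)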
Index: trunk/tools/editor_trends/code-snippets/cohort_confidence_intervals.py |
— | — | @@ -24,7 +24,7 @@ |
25 | 25 | settings = configuration.Settings() |
26 | 26 | from utils import file_utils |
27 | 27 | from utils import messages |
28 | | -from database import db |
| 28 | +from classes import storage |
29 | 29 | |
30 | 30 | |
31 | 31 | #def dataset_edits_by_month(dbname, **kwargs): |
Index: trunk/tools/editor_trends/code-snippets/exporter.py |
— | — | @@ -31,7 +31,7 @@ |
32 | 32 | settings = configuration.Settings() |
33 | 33 | from utils import file_utils |
34 | 34 | from utils import messages |
35 | | -from database import db |
| 35 | +from classes import storage |
36 | 36 | from etl import shaper |
37 | 37 | from analyses import cohort_charts |
38 | 38 | |
Index: trunk/tools/editor_trends/code-snippets/mongo.py |
— | — | @@ -0,0 +1,171 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import sys |
| 23 | +import pymongo |
| 24 | +from bson.code import Code |
| 25 | + |
| 26 | +if '..' not in sys.path: |
| 27 | + sys.path.append('..') |
| 28 | + |
| 29 | +from classes import settings |
| 30 | +settings = settings.Settings() |
| 31 | +import file_utils |
| 32 | + |
| 33 | + |
| 34 | + |
| 35 | + |
| 36 | +def init_mongo_db(dbname): |
| 37 | + connection = pymongo.Connection() |
| 38 | + db = connection[dbname] |
| 39 | + return db |
| 40 | + |
| 41 | + |
| 42 | +def drop_collection(dbname, collection): |
| 43 | + db = init_mongo_db(dbname) |
| 44 | + db.drop_collection(collection) |
| 45 | + |
| 46 | + |
| 47 | +def get_collections(dbname): |
| 48 | + db = init_mongo_db(dbname) |
| 49 | + return db.collection_names() |
| 50 | + |
| 51 | + |
| 52 | +def count_records(dbname, collection): |
| 53 | + db = init_mongo_db(dbname) |
| 54 | + return db[collection].count() |
| 55 | + |
| 56 | + |
| 57 | +def cleanup_database(dbname, logger, endswith=None): |
| 58 | + coll = get_collections(dbname) |
| 59 | + for c in coll: |
| 60 | + if not c.startswith('system'): |
| 61 | + if endswith != None and c.endswith(endswith): |
| 62 | + drop_collection(dbname, c) |
| 63 | + logger.debug('Deleting collection %s from database %s.' % (c, dbname)) |
| 64 | + |
| 65 | + |
| 66 | +def remove_documents_from_mongo_db(collection, ids): |
| 67 | + collection.remove(ids) |
| 68 | + |
| 69 | + |
| 70 | +def add_index_to_collection(db, collection, key): |
| 71 | + ''' |
| 72 | + @db is the name of the mongodb |
| 73 | + @collection is the name of the 'table' in mongodb |
| 74 | + @key name of the field to create the index |
| 75 | + ''' |
| 76 | + |
| 77 | + mongo = init_mongo_db(db) |
| 78 | + collection = mongo[collection] |
| 79 | + mongo.collection.create_index(key) |
| 80 | + mongo.collection.ensure_index(key) |
| 81 | + |
| 82 | + |
| 83 | +def run_query(dbname, collection, var, qualifier=None): |
| 84 | + mongo = init_mongo_db(dbname) |
| 85 | + collection = mongo[collection] |
| 86 | + if qualifier == 'min': |
| 87 | + return collection.find({var : {'$ne' : False}}).sort(var, pymongo.ASCENDING).limit(1)[0] |
| 88 | + elif qualifier == 'max': |
| 89 | + return collection.find({var : {'$ne' : False}}).sort(var, pymongo.DESCENDING).limit(1)[0] |
| 90 | + else: |
| 91 | + return collection.find({var: 1}) |
| 92 | + |
| 93 | + |
| 94 | +def stringify_keys(obj): |
| 95 | + ''' |
| 96 | + @obj should be a dictionary where the keys are not yet strings. this function |
| 97 | + is called just prior any insert / update query in mongo because mongo only |
| 98 | + accepts strings as keys. |
| 99 | + ''' |
| 100 | + d = {} |
| 101 | + for o in obj: |
| 102 | + if isinstance(obj[o], dict): |
| 103 | + #if type(obj[o]) == type({}): |
| 104 | + obj[o] = stringify_keys(obj[o]) |
| 105 | + d[str(o)] = obj[o] |
| 106 | + return d |
| 107 | + |
| 108 | + |
| 109 | +def retrieve_min_value(dbname, collection, var): |
| 110 | + mongo = init_mongo_db(dbname) |
| 111 | + coll = mongo[collection] |
| 112 | + emit = 'function () {emit(this.editor, this.edit_count);}' |
| 113 | + map = Code(emit) |
| 114 | + reduce = Code("function()") |
| 115 | +# reduce = Code("""reduce = function (key, value) { |
| 116 | +# return Math.max(value); |
| 117 | +# } |
| 118 | +# """) |
| 119 | + cursor = coll.map_reduce(map, reduce) |
| 120 | + |
| 121 | + data = [] |
| 122 | + for c in cursor.find(): |
| 123 | + data.append(c) |
| 124 | + return data |
| 125 | + |
| 126 | + |
| 127 | +def retrieve_distinct_keys(dbname, collection, field, force_new=False): |
| 128 | + #mongo = init_mongo_db(dbname) |
| 129 | + #editors = mongo[collection] |
| 130 | + #ids = retrieve_distinct_keys_mapreduce(editors, field) |
| 131 | + ''' |
| 132 | + TODO: figure how big the index is and then take appropriate action, index |
| 133 | + < 4mb just do a distinct query, index > 4mb do a map reduce. |
| 134 | + ''' |
| 135 | + if force_new == False and file_utils.check_file_exists(settings.binary_location, |
| 136 | + '%s_%s_%s.bin' % (dbname, collection, field)): |
| 137 | + ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
| 138 | + else: |
| 139 | + mongo = init_mongo_db(dbname) |
| 140 | + editors = mongo[collection] |
| 141 | + #index_size = mongo.stats() |
| 142 | + if editors.count () < 200000: #TODO this is a bit arbitrary, should check if index > 4Mb. |
| 143 | + ids = editors.distinct(field) |
| 144 | + else: |
| 145 | + #params = {} |
| 146 | + #params['size'] = 'size' |
| 147 | + #size = editors.find_one({'size': 1}) |
| 148 | + ids = retrieve_distinct_keys_mapreduce(editors, field) |
| 149 | + file_utils.store_object(ids, settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
| 150 | + return ids |
| 151 | + |
| 152 | + |
| 153 | +def retrieve_distinct_keys_mapreduce(collection, field): |
| 154 | + emit = 'function () { emit(this.%s, 1)};' % field |
| 155 | + map = Code(emit) |
| 156 | + |
| 157 | + reduce = Code("function()") |
| 158 | + |
| 159 | + ids = [] |
| 160 | + cursor = collection.map_reduce(map, reduce) |
| 161 | + for c in cursor.find(): |
| 162 | + ids.append(c['_id']) |
| 163 | + return ids |
| 164 | + |
| 165 | + |
| 166 | +def debug(): |
| 167 | + #retrieve_distinct_keys('enwiki', 'editors_dataset', 'editor') |
| 168 | + retrieve_min_value('enwiki', 'editors_dataset', 'new_wikipedian') |
| 169 | + |
| 170 | + |
| 171 | +if __name__ == '__main__': |
| 172 | + debug() |
Property changes on: trunk/tools/editor_trends/code-snippets/mongo.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 173 | + native |
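Of the helpers archived in this snippet, stringify_keys captures a MongoDB constraint worth remembering: document keys must be strings, so integer year/month keys are converted recursively before insert. A minimal illustration:

    stringify_keys({2009: {1: 5, 2: 3}})
    # -> {'2009': {'1': 5, '2': 3}}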