r82005 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r82004 | r82005 | r82006 >
Date:01:44, 12 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
A lot of changes:
1) Using one database for all projects instead of a separate database for each project
2) RunTimeSettings inherits from Settings
3) Major code cleanup
Modified paths:
  • /trunk/tools/editor_trends/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py (added) (history)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/community_graph.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/file_size_reduction.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/inventory.py (added) (history)
  • /trunk/tools/editor_trends/analyses/json_encoders.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/match_talkpage_article.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/plugins/edit_patterns.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history)
  • /trunk/tools/editor_trends/bots/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/languages.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/etl/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/utils/compression.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -26,7 +26,7 @@
2727 from argparse import RawTextHelpFormatter
2828 import ConfigParser
2929
30 -import configuration
 30+#import configuration
3131 from utils import file_utils
3232 from utils import ordered_dict
3333 from utils import log
@@ -40,7 +40,7 @@
4141 from etl import store
4242 from etl import sort
4343 from etl import transformer
44 -from analyses import analyzer
 44+from analyses import inventory
4545
4646
4747 def show_choices(settings, attr):
@@ -50,8 +50,7 @@
5151 return choices
5252
5353
54 -
55 -def config_launcher(properties, settings, logger):
 54+def config_launcher(properties, logger):
5655 '''
5756 Config launcher is used to reconfigure editor trends toolkit.
5857 '''
@@ -98,20 +97,20 @@
9998
10099
101100
102 -def downloader_launcher(properties, settings, logger):
 101+def downloader_launcher(properties, logger):
103102 '''
104103 This launcher calls the dump downloader to download a Wikimedia dump file.
105104 '''
106105 print 'Start downloading'
107106 stopwatch = timer.Timer()
108107 log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start')
109 - res = downloader.launcher(properties, settings, logger)
 108+ res = downloader.launcher(properties, logger)
110109 stopwatch.elapsed()
111110 log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='finish')
112111 return res
113112
114113
115 -def extract_launcher(properties, settings, logger):
 114+def extract_launcher(properties, logger):
116115 '''
117116 The extract launcher is used to extract the required variables from a dump
118117 file. If the zip file is a known archive then it will first launch the
@@ -125,34 +124,34 @@
126125 log.log_to_mongo(properties, 'dataset', 'extract', stopwatch, event='finish')
127126
128127
129 -def sort_launcher(properties, settings, logger):
 128+def sort_launcher(rts, logger):
130129 '''
131130 After the extracter has finished then the created output files need to be
132131 sorted. This function takes care of that.
133132 '''
134133 print 'Start sorting data'
135134 stopwatch = timer.Timer()
136 - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='start')
 135+ log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='start')
137136 # write_message_to_log(logger, settings,
138137 # message=None,
139138 # verb=None,
140139 # location=properties.location,
141140 # input=properties.txt,
142141 # output=properties.sorted)
143 - sort.mergesort_launcher(properties.txt, properties.sorted)
 142+ sort.launcher(rts)
144143 stopwatch.elapsed()
145 - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='finish')
 144+ log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='finish')
146145
147146
148 -def store_launcher(properties, settings, logger):
 147+def store_launcher(rts, logger):
149148 '''
150149 The data is ready to be stored once the sorted function has completed. This
151150 function starts storing data in MongoDB.
152151 '''
153152 print 'Start storing data in MongoDB'
154153 stopwatch = timer.Timer()
155 - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start')
156 - db.cleanup_database(properties.dbname, logger)
 154+ log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='start')
 155+ db.cleanup_database(rts.dbname, logger)
157156 # write_message_to_log(logger, settings,
158157 # message=None,
159158 # verb='Storing',
@@ -163,36 +162,34 @@
164163 # collection=properties.collection)
165164 # for key in properties:
166165 # print key, getattr(properties, key)
167 - store.launcher(properties.sorted, properties.dbname, properties.collection)
168 -
 166+ store.launcher(rts)
169167 stopwatch.elapsed()
170 - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish')
 168+ log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='finish')
171169
172170
173 -def transformer_launcher(properties, settings, logger):
 171+def transformer_launcher(rts, logger):
174172 print 'Start transforming dataset'
175173 stopwatch = timer.Timer()
176 - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
177 - db.cleanup_database(properties.dbname, logger, 'dataset')
 174+ log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='start')
 175+ db.cleanup_database(rts.dbname, logger, 'dataset')
178176 # write_message_to_log(logger, settings,
179177 # message=None,
180178 # verb='Transforming',
181179 # project=properties.project,
182180 # collection=properties.collection)
183 - transformer.transform_editors_single_launcher(properties.dbname,
184 - properties.collection)
 181+ transformer.transform_editors_single_launcher(rts)
185182 stopwatch.elapsed()
186 - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
 183+ log.log_to_mongo(rts, 'dataset', 'transform', stopwatch,
187184 event='finish')
188185
189186
190 -def dataset_launcher(properties, settings, logger):
 187+def dataset_launcher(rts, logger):
191188 print 'Start exporting dataset'
192189 stopwatch = timer.Timer()
193 - log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='start')
 190+ log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start')
194191
195 - collection = '%s_%s' % (properties.collection, 'dataset')
196 - for target in properties.targets:
 192+ #collection = '%s_%s' % (rts.collection, 'dataset')
 193+ for target in rts.targets:
197194 # write_message_to_log(logger, settings,
198195 # message=None,
199196 # verb='Exporting',
@@ -200,16 +197,16 @@
201198 # dbname=properties.full_project,
202199 # collection=properties.collection)
203200
204 - analyzer.generate_chart_data(properties.dbname,
205 - collection,
206 - properties.language.code,
 201+ analyzer.generate_chart_data(rts.dbname,
 202+ rts.editors_dataset,
 203+ rts.language.code,
207204 target,
208 - **properties.keywords)
 205+ **rts.keywords)
209206 stopwatch.elapsed()
210207 log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='finish')
211208
212209
213 -def cleanup(properties, settings, logger):
 210+def cleanup(rts, logger):
214211 directories = properties.directories[1:]
215212 for directory in directories:
216213 write_message_to_log(logger, setting,
@@ -232,7 +229,7 @@
233230 file_utils.delete_file(settings.binary_location, filename)
234231
235232
236 -def all_launcher(properties, settings, logger):
 233+def all_launcher(properties, logger):
237234 print 'The entire data processing chain has been called, this will take a \
238235 couple of hours (at least) to complete.'
239236 stopwatch = timer.Timer()
@@ -258,7 +255,7 @@
259256 for function, callname in functions.iteritems():
260257 if callname not in properties.ignore:
261258 print 'Starting %s' % function.func_name
262 - res = function(properties, settings, logger)
 259+ res = function(properties, logger)
263260 if res == False:
264261 sys.exit(False)
265262 elif res == None:
@@ -284,11 +281,11 @@
285282 '''
286283 Entry point for parsing command line and launching the needed function(s).
287284 '''
288 - settings = configuration.Settings()
 285+ #settings = configuration.Settings()
289286 language = languages.init()
290287 project = projects.init()
291288 pjc = projects.ProjectContainer()
292 - rts = runtime_settings.RunTimeSettings(project, language, settings)
 289+ rts = runtime_settings.RunTimeSettings(project, language)
293290
294291 #Init Argument Parser
295292 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
@@ -301,7 +298,7 @@
302299 action='store',
303300 help='Enter the first letter of a language to see which languages are \
304301 available.')
305 - parser_languages.set_defaults(func=language.show_languages, args=[settings, project])
 302+ parser_languages.set_defaults(func=language.show_languages, args=[project])
306303
307304 #CONFIG
308305 parser_config = subparsers.add_parser('config',
@@ -350,7 +347,7 @@
351348 parser_dataset.add_argument('-c', '--charts',
352349 action='store',
353350 help='Should be a valid function name that matches one of the plugin functions',
354 - default=analyzer.available_analyses()['new_editor_count'])
 351+ default=inventory.available_analyses()['new_editor_count'])
355352
356353 parser_dataset.add_argument('-k', '--keywords',
357354 action='store',
@@ -399,12 +396,13 @@
400397 parser.add_argument('-c', '--collection',
401398 action='store',
402399 help='Name of MongoDB collection',
403 - default='editors')
 400+ default='editors_raw')
404401
405402 parser.add_argument('-o', '--location',
406403 action='store',
407404 help='Indicate where you want to store the downloaded file.',
408 - default=settings.input_location)
 405+ #default=settings.input_location)
 406+ default=rts.input_location)
409407
410408 parser.add_argument('-ns', '--namespace',
411409 action='store',
@@ -413,41 +411,41 @@
414412
415413 parser.add_argument('-f', '--file',
416414 action='store',
417 - choices=settings.file_choices,
 415+ choices=rts.file_choices,
418416 help='Indicate which dump you want to download. Valid choices are:\n \
419 - %s' % ''.join([f + ',\n' for f in settings.file_choices]),
 417+ %s' % ''.join([f + ',\n' for f in rts.file_choices]),
420418 default='stub-meta-history.xml.gz')
421419
422420
423 - return project, language, parser, settings
 421+ return project, language, parser
424422
425423 def main():
426 - project, language, parser, settings = init_args_parser()
 424+ project, language, parser, = init_args_parser()
427425 args = parser.parse_args()
428 - properties = runtime_settings.RunTimeSettings(project, language, settings, args)
 426+ rts = runtime_settings.RunTimeSettings(project, language, args)
429427 #initialize logger
430428 logger = logging.getLogger('manager')
431429 logger.setLevel(logging.DEBUG)
432430
433431 # Add the log message handler to the logger
434432 today = datetime.datetime.today()
435 - log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' \
436 - % (properties.language.code, properties.project.name,
 433+ log_filename = os.path.join(rts.log_location, '%s%s_%s-%s-%s.log' \
 434+ % (rts.language.code, rts.project.name,
437435 today.day, today.month, today.year))
438436 handler = logging.handlers.RotatingFileHandler(log_filename,
439437 maxBytes=1024 * 1024,
440438 backupCount=3)
441439
442440 logger.addHandler(handler)
443 - logger.debug('Chosen language: \t%s' % properties.language)
 441+ logger.debug('Chosen language: \t%s' % rts.language)
444442
445443 #start manager
446444 #detect_python_version(logger)
447445 about_statement()
448446 #config.create_configuration(settings, args)
449447
450 - properties.show_settings()
451 - args.func(properties, settings, logger)
 448+ rts.show_settings()
 449+ args.func(rts, logger)
452450
453451
454452 if __name__ == '__main__':
Index: trunk/tools/editor_trends/analyses/community_graph.py
@@ -1,62 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-10'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -sys.path.append('..')
23 -
24 -import configuration
25 -settings = configuration.Settings()
26 -
27 -from database import db
28 -from utils import file_utils
29 -
30 -try:
31 - import psyco
32 - psyco.full()
33 -except ImportError:
34 - pass
35 -
36 -def create_articles_set(edits):
37 - s = set()
38 - years = edits.keys()
39 - for year in years:
40 - for edit in edits[year]:
41 - s.add(edit['article'])
42 - return s
43 -
44 -
45 -def create_edgelist(project, collection):
46 - ids = db.retrieve_distinct_keys(project, collection, 'editor')
47 - conn = db.init_mongo_db(project)
48 - ids.sort()
49 - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding)
50 - for i in ids:
51 - author_i = conn[collection].find_one({'editor': i})
52 - article_i = create_articles_set(author_i['edits'])
53 - for j in ids:
54 - if i > j:
55 - author_j = conn[collection].find_one({'editor': j})
56 - article_j = create_articles_set(author_j['edits'])
57 - common = article_i.intersection(article_j)
58 - if len(common) > 0:
59 - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
60 - fh.close()
61 -
62 -if __name__ == '__main__':
63 - create_edgelist('enwiki', 'editors')
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -1,473 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-14'
19 -__version__ = '0.1'
20 -
21 -import calendar
22 -import datetime
23 -import time
24 -import math
25 -import operator
26 -import sys
27 -from pymongo.son_manipulator import SONManipulator
28 -from multiprocessing import Lock
29 -
30 -
31 -sys.path.append('..')
32 -import configuration
33 -settings = configuration.Settings()
34 -
35 -from utils import file_utils
36 -from utils import data_converter
37 -from database import db
38 -import json_encoders
39 -
40 -class Transform(SONManipulator):
41 - '''
42 - This encoder transforms a Dataset to a MongoDB bson document.
43 - To use this encoder initalize a mongo database instance and then add:
44 - mongo.add_son_manipulator(Transform())
45 - '''
46 - def transform_incoming(self, son, collection):
47 - for (key, ds) in son.items():
48 - son[key] = {}
49 - for x, var in enumerate(ds):
50 - if isinstance(var, Variable):
51 - son[key][var.name] = var.encode()
52 - for prop in ds.props:
53 - son[prop] = getattr(ds, prop)
54 - return son
55 -
56 - def transform_outgoing(self, son, collection):
57 - for (key, value) in son.items():
58 - if isinstance(value, dict):
59 - names = value.keys()
60 - for name in names:
61 - var = Variable(name, None)
62 - var.decode(value)
63 - son['variables'][name] = var
64 - else: # Again, make sure to recurse into sub-docs
65 - son[key] = value
66 - name = son.pop('name', None)
67 - project = son.pop('project', None)
68 - collection = son.pop('collection', None)
69 - language_code = son.pop('language_code', None)
70 - variables = son.pop('variables', [])
71 - ds = Dataset(name, project, collection, language_code, **son)
72 - for var in variables:
73 - var = variables[var]
74 - ds.add_variable(var)
75 - return ds
76 -
77 -
78 -class Data:
79 - '''
80 - Some generic functions that are required by the Observation, Variable, and
81 - Dataset classes.
82 - '''
83 - def __hash__(self, vars):
84 - id = ''.join([str(var) for var in vars])
85 - return hash(id)
86 - #return int(self.convert_date_to_epoch(date))
87 -
88 - def encode_to_bson(self, data=None):
89 - if data:
90 - kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()])
91 - else:
92 - kwargs = dict([(str(key), value) for key, value in self.__dict__.iteritems()])
93 - for key, value in kwargs.iteritems():
94 - if isinstance(value, dict):
95 - d = {}
96 - for k, v in value.iteritems():
97 - if isinstance(v, Observation):
98 - v = self.encode_to_bson(v)
99 - d[str(k)] = v
100 - kwargs[key] = d
101 - return kwargs
102 -
103 - def convert_date_to_epoch(self, date):
104 - assert self.time_unit == 'year' or self.time_unit == 'month' \
105 - or self.time_unit == 'day', 'Time unit should either be year, month or day.'
106 -
107 - if self.time_unit == 'year':
108 - datum = datetime.datetime(date.year, 1, 1)
109 - return int(time.mktime(datum.timetuple()))
110 - elif self.time_unit == 'month':
111 - datum = datetime.datetime(date.year, date.month, 1)
112 - return int(time.mktime(datum.timetuple()))
113 - elif self.time_unit == 'day':
114 - return int(time.mktime(date.timetuple()))
115 - else:
116 - return date
117 -
118 - def set_date_range(self, date):
119 - if self.time_unit == 'year':
120 - return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1)
121 - elif self.time_unit == 'month':
122 - day = calendar.monthrange(date.year, date.month)[1]
123 - return datetime.datetime(date.year, date.month, day), datetime.datetime(date.year, date.month, 1)
124 - else:
125 - return datetime.datetime(date.year, date.month, date.day), datetime.datetime(date.year, date.month, date.day)
126 -
127 -
128 -class Observation(Data):
129 - lock = Lock()
130 - '''
131 - The smallest unit, here the actual data is being stored.
132 - Time_unit should either be 'year', 'month' or 'day'.
133 - '''
134 - def __init__(self, date, time_unit, id, meta):
135 - assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.'
136 - self.date = date
137 - self.data = 0
138 - self.time_unit = time_unit
139 - self.t1, self.t0 = self.set_date_range(date)
140 - self.id = id
141 - self.props = []
142 - for mt in meta:
143 - if isinstance(mt, float):
144 - raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.'
145 - elif not isinstance(mt, list):
146 - setattr(self, mt, meta[mt])
147 - self.props.append(mt)
148 - self._type = 'observation'
149 -
150 - def __repr__(self):
151 - return '%s' % self.date
152 -
153 - def __str__(self):
154 - return 'range: %s:%s' % (self.t0, self.t1)
155 -
156 - def __iter__(self):
157 - for obs in self.data:
158 - yield self.data[obs]
159 -
160 - def __getitem__(self, key):
161 - return getattr(self, key, [])
162 -
163 - def add(self, value):
164 - '''
165 - If update == True then data[i] will be incremented else data[i] will be
166 - created, in that case make sure that i is unique. Update is useful for
167 - tallying a variable.
168 - '''
169 - self.lock.acquire()
170 - try:
171 - self.data += value
172 - finally:
173 - self.lock.release()
174 -
175 - def get_date_range(self):
176 - return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \
177 - self.t1.month, self.t1.day, self.t1.year)
178 -
179 -class Variable(Data):
180 - '''
181 - This class constructs a time-based variable.
182 - '''
183 - lock = Lock()
184 - def __init__(self, name, time_unit, **kwargs):
185 - self.name = name
186 - self.obs = {}
187 - self.time_unit = time_unit
188 - self.groupbys = []
189 - self._type = 'variable'
190 - self.props = ['name', 'time_unit', '_type']
191 - for kw in kwargs:
192 - setattr(self, kw, kwargs[kw])
193 - self.props.append(kw)
194 -
195 - def __str__(self):
196 - return '%s' % self.name
197 -
198 - def __repr__(self):
199 - return '%s' % self.name
200 -
201 - def __getitem__(self, key):
202 - return getattr(self, key, [])
203 -
204 - def __iter__(self):
205 - keys = self.obs.keys()
206 - for key in keys:
207 - yield key
208 -
209 - def __len__(self):
210 - return [x for x in xrange(self.obs())]
211 -
212 - def items(self):
213 - for key in self.__dict__.keys():
214 - yield key, getattr(self, key)
215 -
216 - def itervalues(self):
217 - for key in self:
218 - yield self.obs[key].data
219 -
220 - def iteritems(self):
221 - for key in self:
222 - yield (key, self.obs[key])
223 -
224 -
225 - def get_data(self):
226 - return [o for o in self.itervalues()]
227 -
228 - def get_observation(self, id, date, meta):
229 - self.lock.acquire()
230 - try:
231 - obs = self.obs.get(id, Observation(date, self.time_unit, id, meta))
232 - finally:
233 - self.lock.release()
234 - return obs
235 -
236 - def add(self, date, value, meta={}):
237 - assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
238 - #id = self.convert_date_to_epoch(date)
239 - start, end = self.set_date_range(date)
240 - values = meta.values()
241 - values.insert(0, end)
242 - values.insert(0, start)
243 - id = self.__hash__(values)
244 -
245 - obs = self.get_observation(id, date, meta)
246 - obs.add(value)
247 - self.obs[id] = obs
248 -
249 - def encode(self):
250 - bson = {}
251 - for prop in self.props:
252 - bson[prop] = getattr(self, prop)
253 -
254 - bson['obs'] = {}
255 - for obs in self:
256 - data = self.obs[obs]
257 - obs = str(obs)
258 - bson['obs'][obs] = data.encode_to_bson()
259 - return bson
260 -
261 - def decode(self, values):
262 - for varname in values:
263 - for prop in values[varname]:
264 - if isinstance(values[varname][prop], dict):
265 - data = values[varname][prop]
266 - for d in data:
267 - date = data[d]['date']
268 - obs = data[d]['data']
269 - self.add(date, obs)
270 - else:
271 - setattr(self, prop, values[varname][prop])
272 - self.props.append(prop)
273 -
274 - def get_date_range(self):
275 - dates = [self.obs[key].date for key in self]
276 - first = min(dates)
277 - last = max(dates)
278 - return first, last
279 -
280 -
281 -class Dataset:
282 - '''
283 - This class acts as a container for the Variable class and has some methods
284 - to output the dataset to a csv file, mongodb and display statistics.
285 - '''
286 -
287 - def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs):
288 - encoders = json_encoders.available_json_encoders()
289 - if encoder not in encoders:
290 - raise exception.UnknownJSONEncoderError(encoder)
291 - else:
292 - self.encoder = encoder
293 - self.name = name
294 - self.project = project
295 - self.collection = collection
296 - self.language_code = language_code
297 - self.hash = self.name
298 - self._type = 'dataset'
299 - self.created = datetime.datetime.now()
300 - self.format = 'long'
301 - for kw in kwargs:
302 - setattr(self, kw, kwargs[kw])
303 - self.props = self.__dict__.keys()
304 -
305 - self.variables = []
306 - if vars != None:
307 - for kwargs in vars:
308 - name = kwargs.pop('name')
309 - setattr(self, name, Variable(name, **kwargs))
310 - self.variables.append(name)
311 - #self.filename = self.create_filename()
312 -
313 - def __repr__(self):
314 - return 'Dataset contains %s variables' % (len(self.variables))
315 -
316 - def __iter__(self):
317 - for var in self.variables:
318 - yield getattr(self, var)
319 -
320 -
321 - def create_filename(self):
322 - '''
323 - This function creates a filename for the dataset by searching for shared
324 - properties among the different variables in the dataset. All shared
325 - properties will be used in the filename to make sure that one analysis
326 - that's run with different parameters gets stored in separate files.
327 - '''
328 - common = {}
329 - props = set()
330 - for var in self.variables:
331 - s = set()
332 - var = getattr(self, var)
333 - for prop in var.props:
334 - if prop not in ['name', 'time_unit', '_type']:
335 - s.add(prop)
336 - props.add(prop)
337 - common[var.name] = s
338 -
339 - keys = []
340 - for prop in props:
341 - attrs = []
342 - for s in common.values():
343 - attrs.append(prop)
344 - if len(attrs) == len(common.values()):
345 - keys.append(prop)
346 - keys.sort()
347 - attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys])
348 - filename = '%s%s_%s_%s.csv' % (self.language_code,
349 - self.project,
350 - self.name,
351 - attrs)
352 - self.filename = filename
353 -
354 -
355 - def add_variable(self, var):
356 - if isinstance(var, Variable):
357 - self.variables.append(var.name)
358 - setattr(self, var.name, var)
359 - else:
360 - raise TypeError('You can only instance of Variable to a dataset.')
361 -
362 - def write(self, format='csv'):
363 - self.create_filename()
364 - if format == 'csv':
365 - self.to_csv()
366 - elif format == 'mongo':
367 - self.to_mongo()
368 -
369 - def to_mongo(self):
370 - dbname = '%s%s' % (self.language_code, self.project)
371 - mongo = db.init_mongo_db(dbname)
372 - coll = mongo['%s_%s' % (dbname, 'charts')]
373 - mongo.add_son_manipulator(Transform())
374 - coll.remove({'hash':self.hash, 'project':self.project,
375 - 'language_code':self.language_code})
376 - coll.insert({'variables': self})
377 -
378 - def to_csv(self):
379 - data = data_converter.convert_dataset_to_lists(self, 'manage')
380 - headers = data_converter.add_headers(self)
381 - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
382 - file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
383 - file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
384 - fh.close()
385 -
386 - def encode(self):
387 - props = {}
388 - for prop in self.props:
389 - props[prop] = getattr(self, prop)
390 - return props
391 -
392 - def get_standard_deviation(self, number_list):
393 - mean = self.get_mean(number_list)
394 - std = 0
395 - n = len(number_list)
396 - for i in number_list:
397 - std = std + (i - mean) ** 2
398 - return math.sqrt(std / float(n - 1))
399 -
400 - def get_median(self, number_list):
401 - if number_list == []:
402 - return '.'
403 - data = sorted(number_list)
404 - data = [float(x) for x in data]
405 - if len(data) % 2 == 1:
406 - return data[(len(data) + 1) / 2 - 1]
407 - else:
408 - lower = data[len(data) / 2 - 1]
409 - upper = data[len(data) / 2]
410 - return (lower + upper) / 2
411 -
412 - def get_mean(self, number_list):
413 - if number_list == []:
414 - return '.'
415 - float_nums = [float(x) for x in number_list]
416 - return sum(float_nums) / len(number_list)
417 -
418 - def descriptives(self):
419 - for variable in self:
420 - data = variable.get_data()
421 - variable.mean = self.get_mean(data)
422 - variable.median = self.get_median(data)
423 - variable.sds = self.get_standard_deviation(data)
424 - variable.min = min(data)
425 - variable.max = max(data)
426 - variable.n = len(data)
427 - variable.first_obs, variable.last_obs = variable.get_date_range()
428 -
429 - def summary(self):
430 - self.descriptives()
431 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean',
432 - 'Median', 'SD', 'Minimum', 'Maximum',
433 - 'Num Obs', 'First Obs', 'Final Obs')
434 - for variable in self:
435 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name,
436 - variable.mean, variable.median,
437 - variable.sds, variable.min,
438 - variable.max, variable.n,
439 - variable.first_obs, variable.last_obs)
440 -
441 -
442 -def debug():
443 - mongo = db.init_mongo_db('enwiki')
444 - rawdata = mongo['enwiki_charts']
445 - mongo.add_son_manipulator(Transform())
446 -
447 - d1 = datetime.datetime.today()
448 - d2 = datetime.datetime(2007, 6, 7)
449 - ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [
450 - {'name': 'count', 'time_unit': 'year'},
451 - # {'name': 'testest', 'time_unit': 'year'}
452 - ])
453 - ds.count.add(d1, 10, ['exp', 'window'])
454 - ds.count.add(d1, 135, ['exp', 'window'])
455 - ds.count.add(d2, 1, ['exp', 'window'])
456 - #ds.testest.add(d1, 135)
457 - #ds.testest.add(d2, 535)
458 - ds.summary()
459 - ds.write(format='csv')
460 -# v = Variable('test', 'year')
461 - ds.encode()
462 - print ds
463 -
464 - # mongo.test.insert({'variables': ds})
465 -
466 - # v.add(d2 , 5)
467 - #o = v.get_observation(d2)
468 -# ds = rawdata.find_one({'project': 'wiki',
469 -# 'language_code': 'en',
470 -# 'hash': 'cohort_dataset_backward_bar'})
471 -
472 -
473 -if __name__ == '__main__':
474 - debug()
Index: trunk/tools/editor_trends/analyses/file_size_reduction.py
@@ -1,100 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-15'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -sys.path.append('..')
23 -
24 -import os
25 -import xml.etree.cElementTree as cElementTree
26 -
27 -import configuration
28 -from utils import file_utils
29 -settings = configuration.Settings()
30 -
31 -
32 -class DumpStatistics(object):
33 - ''' Simple class to keep track of XML tags, how often they occur,
34 - and the length of strings they contain. This is used to calculate the
35 - overhead.
36 - '''
37 - def __init__(self):
38 - self.tags = {}
39 -
40 - def add_tag(self, kwargs):
41 - for kw in kwargs:
42 - if kw not in self.tags:
43 - self.tags[kw] = {}
44 - self.tags[kw]['n'] = 0
45 - self.tags[kw]['size'] = 0
46 - self.tags[kw]['n'] += 1
47 - self.tags[kw]['size'] += self.determine_length(kwargs[kw])
48 -
49 - def average_size_text(self):
50 - avg = {}
51 - for kw in self.tags:
52 - avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
53 - return avg
54 -
55 - def total_size_text(self):
56 - return sum([self.tags[kw]['size'] for kw in self.tags])
57 -
58 - def total_size_xml(self):
59 - # the x2 is for the opening and closing tag
60 - # the +5 is for 2x <, 2x > and 1x /
61 - return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags])
62 -
63 - def determine_length(self, text):
64 - if text == None:
65 - return 0
66 - else:
67 - return len(text)
68 -
69 -
70 -def calculate_filesize_overhead(location, filename):
71 - counter = None
72 - ds = DumpStatistics()
73 - filename = os.path.join(location, filename)
74 - context = cElementTree.iterparse(filename, events=('start', 'end'))
75 - context = iter(context)
76 - event, root = context.next() #get the root element of the XML doc
77 -
78 - try:
79 - for event, elem in context:
80 - if event == 'end':
81 - ds.add_tag({elem.tag:elem.text})
82 - root.clear() # when done parsing a section clear the tree to release memory
83 - except SyntaxError:
84 - pass
85 - file_utils.store_object(ds, settings.binary_location, 'ds')
86 - xml_size = ds.total_size_xml()
87 - text_size = ds.total_size_text()
88 - print text_size, xml_size
89 - print ds.tags
90 -
91 -
92 -def output_dumpstatistics():
93 - ds = file_utils.load_object(settings.binary_location, 'ds.bin')
94 -
95 - for key in ds.tags:
96 - print '%s\t%s' % (key, ds.tags[key])
97 -
98 -if __name__ == '__main__':
99 - input = os.path.join(settings.input_location, 'en', 'wiki')
100 - calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml')
101 - output_dumpstatistics()
Index: trunk/tools/editor_trends/analyses/match_talkpage_article.py
@@ -1,72 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-07'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -import os
23 -sys.path.append('..')
24 -
25 -import configuration
26 -settings = configuration.Settings()
27 -
28 -from etl import extracter
29 -from utils import file_utils
30 -import wikitree
31 -
32 -try:
33 - import psyco
34 - psyco.full()
35 -except ImportError:
36 - pass
37 -
38 -class Article:
39 - def __init__(self, title, id, talk_id=None):
40 - self.title = title
41 - self.id = id
42 - self.talk_id = talk_id
43 -
44 -
45 -def parse_dumpfile(project, language_code, namespaces=['0', '1']):
46 - articles = {}
47 - ns = extracter.load_namespace(language_code)
48 - non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
49 -
50 -
51 - location = os.path.join(settings.input_location, language_code, project)
52 - fh = file_utils.create_txt_filehandle(location,
53 - '%s%s-latest-stub-meta-history.xml' % (language_code, project),
54 - 'r', settings.encoding)
55 -
56 - for page, article_size in wikitree.parser.read_input(fh):
57 - title = page.find('title')
58 - if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
59 - article_id = page.find('id').text
60 - title = title.text
61 - if title.startswith(ns['1'].get('canonical')):
62 - namespace = 'Talk'
63 - article = articles.get(article_id, Article(None, None, article_id))
64 - article.talk_id = article_id
65 - else:
66 - namespace = 'Main'
67 - article = articles.get(article_id, Article(title, article_id))
68 - articles[article_id] = article
69 -
70 - file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
71 -
72 -if __name__ == '__main__':
73 - parse_dumpfile('wiki', 'en')
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
@@ -39,10 +39,10 @@
4040 if w >= editor_dt:
4141 datum = datetime.datetime(int(year), 12, 31)
4242 freq = int(editor['edits_by_year'][year])
43 - if datum == datetime.datetime(2003, 12, 31):
 43+ #if datum == datetime.datetime(2003, 12, 31):
4444 # if w == 24:
4545 # if freq == 1.0:
4646 # print 'break'
47 - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}})
48 - break
 47+ var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}})
 48+ break
4949 return var
Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
@@ -27,23 +27,12 @@
2828 if dt.days < 366:
2929 return var
3030
31 - m = 0
32 - obs = {}
3331 for year in xrange(new_wikipedian.year, new_wikipedian.year + 2):
34 - if m == 12:
35 - break
 32+ obs = [False for x in xrange(13)]
3633 for month in xrange(new_wikipedian.month, 13):
3734 n = monthly[str(year)][str(month)]
3835 date = datetime.datetime(year, month, 1)
3936 if n >= var.cutoff:
40 - var.add(date, True, {'month':m})
41 - #obs[m] = True
42 - else:
43 - var.add(date, False, {'month':m})
44 - #obs[m] = False
45 - m += 1
46 - if m == 12:
47 - break
48 -# if m == 12:
49 -# var.add(date, obs)
 37+ obs[month] = True
 38+ var.add(date, obs)
5039 return var
Index: trunk/tools/editor_trends/analyses/json_encoders.py
@@ -17,9 +17,13 @@
1818 __date__ = '2011-01-27'
1919 __version__ = '0.1'
2020
 21+import sys
2122 import types
22 -import analyzer
2323
 24+if '..' not in sys.path:
 25+ sys.path.append('..')
 26+
 27+import inventory
2428 from classes import exceptions
2529 from utils import data_converter
2630
@@ -67,6 +71,7 @@
6872 options['series']['bars']['align'] = 'center'
6973 return options
7074
 75+
7176 def to_bar_json(ds):
7277 data = {}
7378
@@ -95,6 +100,7 @@
96101 print json
97102 return json
98103
 104+
99105 def to_stacked_bar_json(ds):
100106 '''
101107 This function outputs data in a format that is understood by jquery
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -0,0 +1,70 @@
 2+#!/usr/bin/python
 3+# coding=utf-8
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
  13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
  17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-02-11'
 19+__version__ = '0.1'
 20+
 21+
 22+import os
 23+import sys
 24+import types
 25+
 26+def available_analyses(caller='manage'):
 27+ '''
 28+ Generates a dictionary:
 29+ key: name of analysis
 30+ value: function that generates the dataset
 31+ ignore: a list of functions that should never be called from manage.py,
 32+ they are not valid entry points.
 33+ '''
 34+ assert caller == 'django' or caller == 'manage'
 35+ ignore = ['__init__']
 36+ functions = {}
 37+
 38+ fn = os.path.realpath(__file__)
 39+ pos = fn.rfind(os.sep)
 40+ loc = fn[:pos]
 41+ path = os.path.join(loc , 'plugins')
 42+ plugins = import_libs(path)
 43+
 44+ for plugin in plugins:
 45+ if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
 46+ functions[plugin.func_name] = plugin
 47+ if caller == 'manage':
 48+ return functions
 49+ elif caller == 'django':
 50+ django_functions = []
 51+ for function in functions:
 52+ fancy_name = function.replace('_', ' ').title()
 53+ django_functions.append((function, fancy_name))
 54+
 55+ return django_functions
 56+
 57+
 58+def import_libs(path):
 59+ '''
 60+ Dynamically importing functions from the plugins directory.
 61+ '''
 62+ library_list = []
 63+ sys.path.append(path)
 64+ for f in os.listdir(os.path.abspath(path)):
 65+ module_name, ext = os.path.splitext(f)
 66+ if ext == '.py':
 67+ module = __import__(module_name)
 68+ func = getattr(module, module_name)
 69+ library_list.append(func)
 70+
 71+ return library_list
Index: trunk/tools/editor_trends/analyses/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -21,19 +21,20 @@
2222 import sys
2323 import os
2424 import progressbar
25 -import types
2625 import datetime
2726
28 -sys.path.append('..')
 27+if '..' not in sys.path:
 28+ sys.path.append('..')
2929
30 -import configuration
31 -settings = configuration.Settings()
 30+from classes import dataset
 31+from classes import settings
 32+settings = settings.Settings()
3233 from database import db
3334 from utils import timer
3435 from utils import log
35 -import dataset
3636
3737
 38+
3839 def generate_chart_data(project, collection, language_code, func, encoder, **kwargs):
3940 '''
4041 This is the entry function to be called to generate data for creating charts.
@@ -102,54 +103,6 @@
103104 return ds
104105
105106
106 -def available_analyses(caller='manage'):
107 - '''
108 - Generates a dictionary:
109 - key: name of analysis
110 - value: function that generates the dataset
111 - ignore: a list of functions that should never be called from manage.py,
112 - they are not valid entry points.
113 - '''
114 - assert caller == 'django' or caller == 'manage'
115 - ignore = ['__init__']
116 - functions = {}
117 -
118 - fn = os.path.realpath(__file__)
119 - pos = fn.rfind(os.sep)
120 - loc = fn[:pos]
121 - path = os.path.join(loc , 'plugins')
122 - plugins = import_libs(path)
123 -
124 - for plugin in plugins:
125 - if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
126 - functions[plugin.func_name] = plugin
127 - if caller == 'manage':
128 - return functions
129 - elif caller == 'django':
130 - django_functions = []
131 - for function in functions:
132 - fancy_name = function.replace('_', ' ').title()
133 - django_functions.append((function, fancy_name))
134 -
135 - return django_functions
136 -
137 -
138 -def import_libs(path):
139 - '''
140 - Dynamically importing functions from the plugins directory.
141 - '''
142 - library_list = []
143 - sys.path.append(path)
144 - for f in os.listdir(os.path.abspath(path)):
145 - module_name, ext = os.path.splitext(f)
146 - if ext == '.py':
147 - module = __import__(module_name)
148 - func = getattr(module, module_name)
149 - library_list.append(func)
150 -
151 - return library_list
152 -
153 -
154107 def determine_project_year_range(dbname, collection, var):
155108 '''
156109 Determine the first and final year for the observed data
@@ -166,8 +119,8 @@
167120
168121
169122 if __name__ == '__main__':
170 - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
171 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
 123+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
 124+ generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
172125 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year')
173126 #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year')
174127 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year')
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -0,0 +1,62 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-10'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import configuration
 25+settings = configuration.Settings()
 26+
 27+from database import db
 28+from utils import file_utils
 29+
 30+try:
 31+ import psyco
 32+ psyco.full()
 33+except ImportError:
 34+ pass
 35+
 36+def create_articles_set(edits):
 37+ s = set()
 38+ years = edits.keys()
 39+ for year in years:
 40+ for edit in edits[year]:
 41+ s.add(edit['article'])
 42+ return s
 43+
 44+
 45+def create_edgelist(project, collection):
 46+ ids = db.retrieve_distinct_keys(project, collection, 'editor')
 47+ conn = db.init_mongo_db(project)
 48+ ids.sort()
 49+ fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding)
 50+ for i in ids:
 51+ author_i = conn[collection].find_one({'editor': i})
 52+ article_i = create_articles_set(author_i['edits'])
 53+ for j in ids:
 54+ if i > j:
 55+ author_j = conn[collection].find_one({'editor': j})
 56+ article_j = create_articles_set(author_j['edits'])
 57+ common = article_i.intersection(article_j)
 58+ if len(common) > 0:
 59+ file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
 60+ fh.close()
 61+
 62+if __name__ == '__main__':
 63+ create_edgelist('enwiki', 'editors')
Property changes on: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
___________________________________________________________________
Added: svn:eol-style
164 + native
Index: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py
@@ -0,0 +1,100 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-15'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import os
 25+import xml.etree.cElementTree as cElementTree
 26+
 27+import configuration
 28+from utils import file_utils
 29+settings = configuration.Settings()
 30+
 31+
 32+class DumpStatistics(object):
 33+ ''' Simple class to keep track of XML tags, how often they occur,
 34+ and the length of strings they contain. This is used to calculate the
 35+ overhead.
 36+ '''
 37+ def __init__(self):
 38+ self.tags = {}
 39+
 40+ def add_tag(self, kwargs):
 41+ for kw in kwargs:
 42+ if kw not in self.tags:
 43+ self.tags[kw] = {}
 44+ self.tags[kw]['n'] = 0
 45+ self.tags[kw]['size'] = 0
 46+ self.tags[kw]['n'] += 1
 47+ self.tags[kw]['size'] += self.determine_length(kwargs[kw])
 48+
 49+ def average_size_text(self):
 50+ avg = {}
 51+ for kw in self.tags:
 52+ avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
 53+ return avg
 54+
 55+ def total_size_text(self):
 56+ return sum([self.tags[kw]['size'] for kw in self.tags])
 57+
 58+ def total_size_xml(self):
 59+ # the x2 is for the opening and closing tag
 60+ # the +5 is for 2x <, 2x > and 1x /
 61+ return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags])
 62+
 63+ def determine_length(self, text):
 64+ if text == None:
 65+ return 0
 66+ else:
 67+ return len(text)
 68+
 69+
 70+def calculate_filesize_overhead(location, filename):
 71+ counter = None
 72+ ds = DumpStatistics()
 73+ filename = os.path.join(location, filename)
 74+ context = cElementTree.iterparse(filename, events=('start', 'end'))
 75+ context = iter(context)
 76+ event, root = context.next() #get the root element of the XML doc
 77+
 78+ try:
 79+ for event, elem in context:
 80+ if event == 'end':
 81+ ds.add_tag({elem.tag:elem.text})
 82+ root.clear() # when done parsing a section clear the tree to release memory
 83+ except SyntaxError:
 84+ pass
 85+ file_utils.store_object(ds, settings.binary_location, 'ds')
 86+ xml_size = ds.total_size_xml()
 87+ text_size = ds.total_size_text()
 88+ print text_size, xml_size
 89+ print ds.tags
 90+
 91+
 92+def output_dumpstatistics():
 93+ ds = file_utils.load_object(settings.binary_location, 'ds.bin')
 94+
 95+ for key in ds.tags:
 96+ print '%s\t%s' % (key, ds.tags[key])
 97+
 98+if __name__ == '__main__':
 99+ input = os.path.join(settings.input_location, 'en', 'wiki')
 100+ calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml')
 101+ output_dumpstatistics()
Property changes on: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py
___________________________________________________________________
Added: svn:eol-style
1102 + native
Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
@@ -0,0 +1,72 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-07'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+import os
 23+sys.path.append('..')
 24+
 25+import configuration
 26+settings = configuration.Settings()
 27+
 28+from etl import extracter
 29+from utils import file_utils
 30+import wikitree
 31+
 32+try:
 33+ import psyco
 34+ psyco.full()
 35+except ImportError:
 36+ pass
 37+
 38+class Article:
 39+ def __init__(self, title, id, talk_id=None):
 40+ self.title = title
 41+ self.id = id
 42+ self.talk_id = talk_id
 43+
 44+
 45+def parse_dumpfile(project, language_code, namespaces=['0', '1']):
 46+ articles = {}
 47+ ns = extracter.load_namespace(language_code)
 48+ non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
 49+
 50+
 51+ location = os.path.join(settings.input_location, language_code, project)
 52+ fh = file_utils.create_txt_filehandle(location,
 53+ '%s%s-latest-stub-meta-history.xml' % (language_code, project),
 54+ 'r', settings.encoding)
 55+
 56+ for page, article_size in wikitree.parser.read_input(fh):
 57+ title = page.find('title')
 58+ if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
 59+ article_id = page.find('id').text
 60+ title = title.text
 61+ if title.startswith(ns['1'].get('canonical')):
 62+ namespace = 'Talk'
 63+ article = articles.get(article_id, Article(None, None, article_id))
 64+ article.talk_id = article_id
 65+ else:
 66+ namespace = 'Main'
 67+ article = articles.get(article_id, Article(title, article_id))
 68+ articles[article_id] = article
 69+
 70+ file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
 71+
 72+if __name__ == '__main__':
 73+ parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
___________________________________________________________________
Added: svn:eol-style
174 + native
Index: trunk/tools/editor_trends/etl/store.py
@@ -22,27 +22,22 @@
2323 import sys
2424 import os
2525
26 -sys.path.append('..')
27 -import configuration
28 -settings = configuration.Settings()
2926 from utils import file_utils
3027 from utils import text_utils
31 -from utils import messages
3228 from database import cache
 29+from utils import messages
3330 from database import db
3431
3532
36 -def store_articles(project, language_code):
37 - location = os.path.join(settings.input_location, language_code, project)
38 - fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding)
 33+def store_articles(rts):
 34+ location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
 35+ fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
3936 headers = ['id', 'title']
4037 data = file_utils.read_unicode_text(fh)
4138 fh.close()
4239
43 - dbname = '%s%s' % (language_code, project)
44 - collection = '%s_%s' % (dbname, 'articles')
45 - mongo = db.init_mongo_db(dbname)
46 - collection = mongo[collection]
 40+ mongo = db.init_mongo_db(rts.dbname)
 41+ collection = mongo[rts.articles_raw]
4742
4843 articles = {}
4944 for x, d in enumerate(data):
@@ -55,7 +50,7 @@
5651 collection.insert(articles)
5752
5853
59 -def store_editors(tasks, dbname, collection, source):
 54+def store_editors(tasks, rts):
6055 '''
6156 This function is called by multiple consumers who each take a sorted file
6257 and create a cache object. If the number of edits made by an editor is above
@@ -63,8 +58,8 @@
6459 is discarded.
6560 The treshold is currently more than 9 edits and is not yet configurable.
6661 '''
67 - mongo = db.init_mongo_db(dbname)
68 - collection = mongo[collection]
 62+ mongo = db.init_mongo_db(rts.dbname)
 63+ collection = mongo[rts.editors_raw]
6964
7065 editor_cache = cache.EditorCache(collection)
7166 prev_contributor = -1
@@ -80,7 +75,7 @@
8176 break
8277 print '%s files left in the queue.' % messages.show(tasks.qsize)
8378
84 - fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding)
 79+ fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'r', rts.encoding)
8580 for line in file_utils.read_raw_data(fh):
8681 if len(line) > 1:
8782 contributor = line[0]
@@ -89,7 +84,7 @@
9085 editor_cache.add(prev_contributor, 'NEXT')
9186 date = text_utils.convert_timestamp_to_datetime_utc(line[1])
9287 article_id = int(line[2])
93 - username = line[3].encode(settings.encoding)
 88+ username = line[3].encode(rts.encoding)
9489 ns = int(line[4])
9590 value = {'date': date,
9691 'article': article_id,
@@ -101,37 +96,41 @@
10297 #print editor_cache.n
10398
10499
105 -def launcher(source, dbname, collection):
 100+def launcher(rts):
106101 '''
107102 This is the main entry point and creates a number of workers and launches
108103 them.
109104 '''
110 - mongo = db.init_mongo_db(dbname)
111 - coll = mongo[collection]
 105+ #rts.sorted, rts.dbname, rts.collection
 106+ mongo = db.init_mongo_db(rts.dbname)
 107+ coll = mongo[rts.editors_raw]
112108 coll.ensure_index('editor')
113109 coll.create_index('editor')
114110
115 - files = file_utils.retrieve_file_list(source, 'csv')
 111+ files = file_utils.retrieve_file_list(rts.sorted, 'csv')
116112
117 - print 'Input directory is: %s ' % source
 113+ print 'Input directory is: %s ' % rts.sorted
118114 tasks = multiprocessing.JoinableQueue()
119115 consumers = [multiprocessing.Process(target=store_editors,
120 - args=(tasks, dbname, collection, source))
121 - for i in xrange(settings.number_of_processes)]
 116+ args=(tasks, rts))
 117+ for i in xrange(rts.number_of_processes)]
122118
123119 for filename in files:
124120 tasks.put(filename)
125121
126 - for x in xrange(settings.number_of_processes):
 122+ for x in xrange(rts.number_of_processes):
127123 tasks.put(None)
128124
129125 for w in consumers:
130126 w.start()
131127
132128 tasks.join()
 129+ store_articles(rts)
133130
134131
135132 def debug():
136133 store_articles('wiki', 'cs')
 134+
 135+
137136 if __name__ == '__main__':
138137 debug()
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -48,14 +48,14 @@
4949 widgets = log.init_progressbar_widgets(filename)
5050 extension = file_utils.determine_file_extension(filename)
5151 filemode = file_utils.determine_file_mode(extension)
52 - filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
 52+ filesize = http_utils.determine_remote_filesize(properties.wp_dump_location,
5353 properties.dump_relative_path,
5454 filename)
5555
56 - mod_date = http_utils.determine_modified_date(properties.settings.wp_dump_location,
 56+ mod_date = http_utils.determine_modified_date(properties.wp_dump_location,
5757 properties.dump_relative_path,
5858 filename)
59 - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server)
 59+ mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server)
6060 if file_utils.check_file_exists(properties.location, filename):
6161 mod_loc = file_utils.get_modified_date(properties.location, filename)
6262 if mod_loc == mod_date and (properties.force == False or properties.force == None):
@@ -66,7 +66,7 @@
6767 fh = file_utils.create_txt_filehandle(properties.location,
6868 filename,
6969 filemode,
70 - properties.settings.encoding)
 70+ properties.encoding)
7171 else:
7272 fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
7373
@@ -100,18 +100,18 @@
101101
102102
103103
104 -def launcher(properties, settings, logger):
 104+def launcher(properties, logger):
105105 print 'Creating list of files to be downloaded...'
106 - tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
 106+ tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location,
107107 properties.dump_relative_path,
108108 properties.dump_filename)
109109 #print tasks.qsize()
110110 #if tasks.qsize() < properties.settings.number_of_processes:
111 - # properties.settings.number_of_processes = tasks.qsize()
  111+ # properties.number_of_processes = tasks.qsize()
112112 if tasks.qsize() > 2:
113113 consumers = [multiprocessing.Process(target=download_wiki_file,
114114 args=(tasks, properties))
115 - for i in xrange(properties.settings.number_of_processes)]
 115+ for i in xrange(properties.number_of_processes)]
116116 else: consumers = [multiprocessing.Process(target=download_wiki_file,
117117 args=(tasks, properties))
118118 for i in xrange(1)]
Index: trunk/tools/editor_trends/etl/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -24,10 +24,6 @@
2525 import progressbar
2626 from Queue import Empty
2727
28 -sys.path.append('..')
29 -import configuration
30 -settings = configuration.Settings()
31 -
3228 import wikitree.parser
3329 from bots import detector
3430 from utils import file_utils
@@ -44,8 +40,8 @@
4541 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
4642
4743
48 -def remove_numeric_character_references(text):
49 - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(settings.encoding)
 44+def remove_numeric_character_references(rts, text):
 45+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding)
5046
5147
5248 def lenient_deccharref(m):
@@ -75,9 +71,9 @@
7672 return ns
7773
7874
79 -def parse_comments(revisions, function):
 75+def parse_comments(rts, revisions, function):
8076 for revision in revisions:
81 - comment = revision.find('{%s}comment' % settings.xml_namespace)
 77+ comment = revision.find('{%s}comment' % rts.xml_namespace)
8278 if comment != None and comment.text != None:
8379 comment.text = function(comment.text)
8480 return revisions
@@ -101,12 +97,7 @@
10298 else:
10399 return False
104100
105 -# for namespace in namespaces:
106 -# if title.startswith(namespace):
107 -# return False
108 -# return True
109101
110 -
111102 def validate_hostname(address):
112103 '''
113104 This is not a foolproof solution at all. The problem is that it's really
@@ -183,7 +174,7 @@
184175 return None
185176
186177
187 -def output_editor_information(revisions, page, bots):
 178+def output_editor_information(revisions, page, bots, rts):
188179 '''
189180 @elem is an XML element containing 1 revision from a page
190181 @output is where to store the data, a filehandle
@@ -237,6 +228,7 @@
238229 flat.append(f)
239230 return flat
240231
 232+
241233 def add_namespace_to_output(output, namespace):
242234 for x, o in enumerate(output):
243235 o.append(namespace['id'])
@@ -244,13 +236,13 @@
245237 return output
246238
247239
248 -def parse_dumpfile(tasks, project, language_code, filehandles, lock, namespaces=['0']):
249 - bot_ids = detector.retrieve_bots(language_code)
250 - location = os.path.join(settings.input_location, language_code, project)
251 - output = os.path.join(settings.input_location, language_code, project, 'txt')
 240+def parse_dumpfile(tasks, rts, filehandles, lock):
 241+ bot_ids = detector.retrieve_bots(rts.language.code)
 242+ location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
 243+ output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt')
252244 widgets = log.init_progressbar_widgets('Extracting data')
253245 filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
254 - settings.encoding) for fh in xrange(settings.max_filehandles)]
 246+ rts.encoding) for fh in xrange(rts.max_filehandles)]
255247
256248 while True:
257249 total, processed = 0.0, 0.0
@@ -269,11 +261,11 @@
270262 filesize = file_utils.determine_filesize(location, filename)
271263 print 'Opening %s...' % (os.path.join(location, filename))
272264 print 'Filesize: %s' % filesize
273 - fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
274 - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding)
 265+ fh1 = file_utils.create_txt_filehandle(location, filename, 'r', rts.encoding)
 266+ fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding)
275267 ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
276 - ns = build_namespaces_locale(ns, namespaces)
277 - settings.xml_namespace = xml_namespace
 268+ ns = build_namespaces_locale(ns, rts.namespaces)
 269+ rts.xml_namespace = xml_namespace
278270
279271 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
280272 for page, article_size in wikitree.parser.read_input(fh1):
@@ -281,14 +273,13 @@
282274 total += 1
283275 namespace = parse_article(title, ns)
284276 if namespace != False:
285 - #if verify_article_belongs_namespace(title, ns):
286277 article_id = page.find('id').text
287278 title = page.find('title').text
288279 revisions = page.findall('revision')
289 - revisions = parse_comments(revisions, remove_numeric_character_references)
290 - output = output_editor_information(revisions, article_id, bot_ids)
 280+ revisions = parse_comments(rts, revisions, remove_numeric_character_references)
 281+ output = output_editor_information(revisions, article_id, bot_ids, rts)
291282 output = add_namespace_to_output(output, namespace)
292 - write_output(output, filehandles, lock)
 283+ write_output(output, filehandles, lock, rts)
293284 file_utils.write_list_to_csv([article_id, title], fh2)
294285 processed += 1
295286 page.clear()
@@ -317,14 +308,14 @@
318309 return d
319310
320311
321 -def write_output(observations, filehandles, lock):
 312+def write_output(observations, filehandles, lock, rts):
322313 observations = group_observations(observations)
323314 for obs in observations:
324315 lock.acquire() #lock the write around all edits of an editor for a particular page
325316 try:
326317 for i, o in enumerate(observations[obs]):
327318 if i == 0:
328 - fh = filehandles[hash(obs)]
 319+ fh = filehandles[hash(rts, obs)]
329320 file_utils.write_list_to_csv(o, fh)
330321
331322 except Exception, error:
@@ -333,16 +324,16 @@
334325 lock.release()
335326
336327
337 -def hash(id):
 328+def hash(rts, id):
338329 '''
339330 A very simple hash function based on modulo. The except clause has been
340331 added because there are instances where the username is stored in userid
341332 tag and hence that's a string and not an integer.
342333 '''
343334 try:
344 - return int(id) % settings.max_filehandles
 335+ return int(id) % rts.max_filehandles
345336 except ValueError:
346 - return sum([ord(i) for i in id]) % settings.max_filehandles
 337+ return sum([ord(i) for i in id]) % rts.max_filehandles
347338
348339
349340 def prepare(output):
@@ -380,7 +371,8 @@
381372 print tasks.qsize()
382373 return tasks
383374
384 -def launcher(properties):
 375+
 376+def launcher(rts):
385377 '''
386378 This is the main entry point for the extact phase of the data processing
387379 chain. First, it will put a the files that need to be extracted in a queue
@@ -389,10 +381,10 @@
390382 the variables from the different dump files.
391383 '''
392384 result = True
393 - tasks = unzip(properties)
 385+ tasks = unzip(rts)
394386
395 - output = os.path.join(settings.input_location, properties.language.code,
396 - properties.project.name, 'txt')
 387+ output = os.path.join(rts.input_location, rts.language.code,
 388+ rts.project.name, 'txt')
397389 result = prepare(output)
398390 if not result:
399391 return result
@@ -404,14 +396,12 @@
405397 filehandles = []
406398 consumers = [multiprocessing.Process(target=parse_dumpfile,
407399 args=(tasks,
408 - properties.project.name,
409 - properties.language.code,
 400+ rts,
410401 filehandles,
411 - lock,
412 - properties.namespaces))
413 - for x in xrange(settings.number_of_processes)]
 402+ lock))
 403+ for x in xrange(rts.number_of_processes)]
414404
415 - for x in xrange(settings.number_of_processes):
 405+ for x in xrange(rts.number_of_processes):
416406 tasks.put(None)
417407
418408 for w in consumers:
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -23,9 +23,6 @@
2424 import datetime
2525 import sys
2626
27 -sys.path.append('..')
28 -import configuration
29 -settings = configuration.Settings()
3027 from database import db
3128 from utils import file_utils
3229 from utils import messages
@@ -101,6 +98,7 @@
10299 'username': username
103100 }, safe=True)
104101
 102+
105103 def determine_year_range(edits):
106104 years = [year for year in edits if edits[year] != []]
107105 first_year = int(min(years))
@@ -119,8 +117,6 @@
120118 return dc
121119
122120
123 -
124 -
125121 def determine_edits_by_month(edits, first_year, final_year):
126122 dc = shaper.create_datacontainer(first_year, final_year)
127123 dc = shaper.add_months_to_datacontainer(dc, 0.0)
@@ -161,17 +157,17 @@
162158 return sorted(edits, key=itemgetter('date'))
163159
164160
165 -def transform_editors_multi_launcher(dbname, collection):
166 - ids = db.retrieve_distinct_keys(dbname, collection, 'editor')
167 - kwargs = {'definition': 'traditional',
168 - 'pbar': True,
169 - }
 161+def transform_editors_multi_launcher(rts):
 162+ ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
 163+# kwargs = {'definition': 'traditional',
 164+# 'pbar': True,
 165+# }
170166 tasks = multiprocessing.JoinableQueue()
171 - consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
 167+ consumers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)]
172168
173169 for id in ids:
174 - tasks.put(Editor(dbname, collection, id))
175 - for x in xrange(settings.number_of_processes):
 170+ tasks.put(Editor(rts.dbname, rts.editors_raw, id))
 171+ for x in xrange(rts.number_of_processes):
176172 tasks.put(None)
177173
178174 print messages.show(tasks.qsize)
@@ -181,10 +177,10 @@
182178 tasks.join()
183179
184180
185 -def setup_database(dbname, collection):
186 - mongo = db.init_mongo_db(dbname)
187 - input_db = mongo[collection]
188 - output_db = mongo['%s_dataset' % collection]
 181+def setup_database(rts):
 182+ mongo = db.init_mongo_db(rts.dbname)
 183+ input_db = mongo[rts.editors_raw]
 184+ output_db = mongo[rts.editors_dataset]
189185
190186 output_db.ensure_index('editor')
191187 output_db.create_index('editor')
@@ -193,9 +189,9 @@
194190 return input_db, output_db
195191
196192
197 -def transform_editors_single_launcher(dbname, collection):
198 - ids = db.retrieve_distinct_keys(dbname, collection, 'editor')
199 - input_db, output_db = setup_database(dbname, collection)
 193+def transform_editors_single_launcher(rts):
 194+ ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
 195+ input_db, output_db = setup_database(rts)
200196 for x, id in enumerate(ids):
201197 print '%s editors to go...' % (len(ids) - x)
202198 editor = Editor(id, input_db, output_db)
Index: trunk/tools/editor_trends/etl/sort.py
@@ -24,14 +24,10 @@
2525 import multiprocessing
2626 from Queue import Empty
2727
28 -sys.path.append('..')
29 -import configuration
30 -settings = configuration.Settings()
31 -
3228 from utils import file_utils
3329 from utils import messages
34 -#import wikitree.parser
3530
 31+
3632 def quick_sort(obs):
3733 '''
3834 Quicksort is a sorting algorithm developed by C. A. R. Hoare that, on \
@@ -79,12 +75,15 @@
8076
8177
8278
83 -def merge_sorted_files(target, files, iteration):
 79+def merge_sorted_files(target, files, iteration, rts):
8480 '''
85 - Merges smaller sorted files in one big file, no longer used.
 81+ Merges smaller sorted files in one big file, Only used for creating
 82+ data competition file.
8683 '''
87 - fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration,
88 - 'w', settings.encoding)
 84+ fh = file_utils.create_txt_filehandle(target,
 85+ 'merged_%s.txt' % iteration,
 86+ 'w',
 87+ rts.encoding)
8988 lines = 0
9089 for line in heapq.merge(*[readline(filename) for filename in files]):
9190 file_utils.write_list_to_csv(line, fh)
@@ -94,17 +93,19 @@
9594 return fh.name
9695
9796
98 -def write_sorted_file(sorted_data, filename, target):
 97+def write_sorted_file(sorted_data, filename, rts):
9998 '''
10099 Writes the sorted file to target
101100 '''
102 - fh = file_utils.create_txt_filehandle(target, filename, 'w',
103 - settings.encoding)
 101+ fh = file_utils.create_txt_filehandle(rts.sorted,
 102+ filename,
 103+ 'w',
 104+ rts.encoding)
104105 file_utils.write_list_to_csv(sorted_data, fh)
105106 fh.close()
106107
107108
108 -def mergesort_feeder(tasks, source, target):
 109+def mergesort_feeder(tasks, rts):
109110 '''
110111 The feeder function is called by the launcher and gives it a task to
111112 complete.
@@ -118,10 +119,10 @@
119120 print tasks.qsize()
120121 break
121122
122 - fh = file_utils.create_txt_filehandle(source,
123 - filename,
124 - 'r',
125 - settings.encoding)
 123+ fh = file_utils.create_txt_filehandle(rts.txt,
 124+ filename,
 125+ 'r',
 126+ rts.encoding)
126127 #print fh
127128 #data = fh.readlines()
128129 data = file_utils.read_unicode_text(fh)
@@ -129,7 +130,7 @@
130131 data = [d.strip() for d in data]
131132 data = [d.split('\t') for d in data]
132133 sorted_data = mergesort(data)
133 - write_sorted_file(sorted_data, filename, target)
 134+ write_sorted_file(sorted_data, filename, rts)
134135 print filename, messages.show(tasks.qsize)
135136 except UnicodeDecodeError, e:
136137 print e
@@ -137,19 +138,19 @@
138139 pass
139140
140141
141 -def mergesort_launcher(source, target):
142 - settings.verify_environment([source, target])
143 - files = file_utils.retrieve_file_list(source, 'csv')
144 - #print files
145 - print source
 142+def launcher(rts):
 143+ '''
 144+ rts is an instance of RunTimeSettings
 145+ '''
 146+ files = file_utils.retrieve_file_list(rts.txt, 'csv')
146147 tasks = multiprocessing.JoinableQueue()
147148 consumers = [multiprocessing.Process(target=mergesort_feeder,
148 - args=(tasks, source, target))
149 - for x in xrange(settings.number_of_processes)]
 149+ args=(tasks, rts))
 150+ for x in xrange(rts.number_of_processes)]
150151 for filename in files:
151152 tasks.put(filename)
152153
153 - for x in xrange(settings.number_of_processes):
 154+ for x in xrange(rts.number_of_processes):
154155 tasks.put(None)
155156
156157 for w in consumers:
@@ -157,6 +158,7 @@
158159
159160 tasks.join()
160161
 162+
161163 def debug():
162164 '''
163165 Simple test function
Index: trunk/tools/editor_trends/__init__.py
@@ -1,14 +1,30 @@
22 import os
33 import sys
44
5 -WORKING_DIRECTORY = os.getcwd()#[:-9]
6 -IGNORE_DIRS = ['wikistats', 'zips']
 5+from classes import singleton
76
8 -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
9 - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 7+class Path:
 8+ __metaclass__ = singleton.Singleton
109
 10+ def __init__(self):
 11+ self.cwd = self.determine_working_directory()
 12+ self.update_python_path()
1113
12 -for subdirname in dirs:
13 - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
14 - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
15 - #print os.path.join(WORKING_DIRECTORY, subdirname)
 14+ def determine_working_directory(self):
 15+ cwd = os.getcwd()
 16+ if not cwd.endswith('editor_trends%s' % os.sep):
 17+ pos = cwd.find('editor_trends') + 14
 18+ cwd = cwd[:pos]
 19+ return cwd
 20+
 21+ def update_python_path(self):
 22+ IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs',
 23+ 'statistics', 'js_scripts', 'deployment',
 24+ 'documentation', 'data', 'code-snippets']
 25+ dirs = [name for name in os.listdir(self.cwd) if
 26+ os.path.isdir(os.path.join(self.cwd, name))]
 27+ for subdirname in dirs:
 28+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
 29+ sys.path.append(os.path.join(self.cwd, subdirname))
 30+
 31+Path()
Index: trunk/tools/editor_trends/classes/settings.py
@@ -75,9 +75,7 @@
7676
7777 self.architecture = platform.machine()
7878 self.working_directory = self.determine_working_directory()
79 - print sys.path
8079 self.update_python_path()
81 - print sys.path
8280
8381 self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
8482 self.max_filehandles = self.determine_max_filehandles_open()
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -29,14 +29,15 @@
3030 import datetime
3131 import time
3232 import re
33 -sys.path.append('..')
 33+#sys.path.append('..')
3434
 35+from settings import Settings
3536 from utils import text_utils
3637 from utils import ordered_dict as odict
3738 from classes import languages
3839
3940
40 -class RunTimeSettings:
 41+class RunTimeSettings(Settings):
4142 '''
4243 This class keeps track of the commands issued by the user and is used to
4344 feed the different etl functions. Difference with configuration class is
@@ -44,25 +45,26 @@
4546 same for a user while these settings can change depending on the kind of
4647 analysis requested.
4748 '''
48 - def __init__(self, project, language, settings, args=None):
 49+ def __init__(self, project, language, args=None):
 50+ Settings.__init__(self)
4951 self.project = project
5052 self.language = language
51 - self.settings = settings
 53+ self.dbname = 'wikilytics'
5254
5355 if args:
5456 self.args = args
5557 self.hash = self.secs_since_epoch()
56 - print self.settings.input_location
57 - print self.get_value('location')
58 - self.base_location = self.settings.input_location if \
59 - self.settings.input_location != None else self.get_value('location')
 58+ #print self.settings.input_location
 59+ #print self.get_value('location')
 60+ self.input_location = self.input_location if \
 61+ self.input_location != None else self.get_value('location')
6062 self.project = self.update_project_settings()
6163 self.language = self.update_language_settings()
62 - self.dbname = '%s%s' % (self.language.code, self.project.name)
 64+ #self.dbname = '%s%s' % (self.language.code, self.project.name)
6365 self.targets = self.split_keywords(self.get_value('charts'))
6466 self.keywords = self.split_keywords(self.get_value('keywords'))
6567 self.function = self.get_value('func')
66 - self.collection = self.get_value('collection')
 68+
6769 self.ignore = self.get_value('except')
6870 self.clean = self.get_value('new')
6971 self.force = self.get_value('force')
@@ -70,9 +72,9 @@
7173 self.filename = self.generate_wikidump_filename()
7274 self.namespaces = self.get_namespaces()
7375
74 - self.dataset = os.path.join(settings.dataset_location,
 76+ self.dataset = os.path.join(self.dataset_location,
7577 self.project.name)
76 - self.charts = os.path.join(settings.chart_location,
 78+ self.charts = os.path.join(self.chart_location,
7779 self.project.name)
7880
7981 self.txt = os.path.join(self.location, 'txt')
@@ -86,8 +88,11 @@
8789 self.dump_filename = self.generate_wikidump_filename()
8890 self.dump_relative_path = self.set_dump_path()
8991 self.dump_absolute_path = self.set_dump_path(absolute=True)
90 - print self.directories
91 - settings.verify_environment(self.directories)
 92+ self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
 93+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
 94+ self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
 95+ self.analyzer_collection = self.get_value('collection')
 96+ self.verify_environment(self.directories)
9297
9398 def __str__(self):
9499 return 'Runtime Settings for project %s%s' % (self.language.name,
@@ -126,7 +131,7 @@
127132 '''
128133 Construct the full project location
129134 '''
130 - return os.path.join(self.base_location, self.language.code, self.project.name)
 135+ return os.path.join(self.input_location, self.language.code, self.project.name)
131136
132137 def show_settings(self):
133138 '''
@@ -141,7 +146,7 @@
142147 max_length_key = max([len(key) for key in about.keys()])
143148 print 'Final settings after parsing command line arguments:'
144149 for ab in about:
145 - print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.settings.encoding))
 150+ print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.encoding))
146151
147152
148153 def get_value(self, key):
@@ -152,7 +157,7 @@
153158
154159 def set_dump_path(self, absolute=False):
155160 if absolute:
156 - return '%s/%s%s/latest/' % (self.settings.wp_dump_location, self.language.code, self.project.name)
 161+ return '%s/%s%s/latest/' % (self.wp_dump_location, self.language.code, self.project.name)
157162 else:
158163 return '/%s%s/latest/' % (self.language.code, self.project.name)
159164
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -34,7 +34,7 @@
3535 from utils import file_utils
3636 from utils import data_converter
3737 from database import db
38 -import json_encoders
 38+from analyses import json_encoders
3939
4040 class Transform(SONManipulator):
4141 '''
Index: trunk/tools/editor_trends/classes/languages.py
@@ -31,20 +31,20 @@
3232 def __repr__(self):
3333 return u'%s - %s' % (self.code, self.name)
3434
35 - def show_languages(self, settings, project, startswith=None):
 35+ def show_languages(self, project, startswith=None):
3636 if startswith != None:
3737 startswith = startswith.title()
3838 project.valid_languages.sort()
3939 for language in project.valid_languages:
4040 try:
4141 if startswith != None and language.startswith(first):
42 - print '%s' % language.decode(settings.encoding)
 42+ print '%s' % language.decode('utf-8')
4343 elif startswith == None:
44 - print '%s' % language.decode(settings.encoding)
 44+ print '%s' % language.decode('utf-8')
4545 except UnicodeEncodeError:
4646 print '%s' % language
47 -
4847
 48+
4949 class LanguageContainer:
5050 def __init__(self):
5151 self.init_languages = odict.OrderedDict([
Index: trunk/tools/editor_trends/configuration.py
@@ -90,7 +90,9 @@
9191
9292 self.architecture = platform.machine()
9393 self.working_directory = self.determine_working_directory()
 94+ print sys.path
9495 self.update_python_path()
 96+ print sys.path
9597
9698 self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
9799 self.max_filehandles = self.determine_max_filehandles_open()
Index: trunk/tools/editor_trends/utils/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/utils/compression.py
@@ -22,10 +22,12 @@
2323 import os
2424 sys.path.append('..')
2525
26 -import configuration
27 -settings = configuration.Settings()
 26+#import configuration
 27+#settings = configuration.Settings()
 28+from classes import settings
 29+settings = settings.Settings()
 30+from classes import exceptions
2831 import file_utils
29 -from classes import exceptions
3032 import timer
3133 import log
3234
@@ -128,6 +130,7 @@
129131 self.name = p
130132 self.program_installed = path
131133
 134+
132135 def launch_zip_extractor(location, filename, properties):
133136 '''
134137
@@ -141,6 +144,7 @@
142145 log.log_to_mongo(properties, 'dataset', 'unpack', stopwatch, event='finish')
143146 return retcode
144147
 148+
145149 if __name__ == '__main__':
146 - c = Compressor('C:\Users\diederik.vanliere\Documents', 'django.zip')
 150+ c = Compressor('C:\Users\diederik.vanliere\Documents', 'test.zip')
147151 c.extract()
Index: trunk/tools/editor_trends/utils/log.py
@@ -27,10 +27,10 @@
2828
2929 from database import db
3030
31 -def log_to_mongo(properties, jobtype, task, timer, event='start'):
32 - conn = db.init_mongo_db('wikilytics')
 31+def log_to_mongo(rts, jobtype, task, timer, event='start'):
 32+ conn = db.init_mongo_db(rts.dbname)
3333 created = datetime.datetime.now()
34 - hash = '%s_%s' % (properties.project, properties.hash)
 34+ hash = '%s_%s' % (rts.project, rts.hash)
3535 coll = conn['jobs']
3636
3737 job = coll.find_one({'hash': hash})
@@ -38,8 +38,8 @@
3939 if job == None:
4040 if jobtype == 'dataset':
4141 _id = coll.save({'hash': hash, 'created': created, 'finished': False,
42 - 'language_code': properties.language.code,
43 - 'project': properties.project.name,
 42+ 'language_code': rts.language.code,
 43+ 'project': rts.project.name,
4444 'in_progress': True, 'jobtype': jobtype,
4545 'tasks': {}})
4646
@@ -47,8 +47,8 @@
4848 elif jobtype == 'chart':
4949 _id = coll.save({'hash': hash, 'created': created,
5050 'jobtype': jobtype,
51 - 'project': properties.project,
52 - 'language_code': properties.language_code,
 51+ 'project': rts.project,
 52+ 'language_code': rts.language_code,
5353 'tasks': {}})
5454
5555 job = coll.find_one({'_id': _id})
Index: trunk/tools/editor_trends/bots/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/code-snippets/__init__.py
@@ -0,0 +1,8 @@
 2+import os
 3+
 4+cwd = os.getcwd()
 5+pos = cwd.rfind(os.sep)
 6+cwd = cwd[:pos]
 7+
 8+from __init__ import Path
 9+Path()

Status & tagging log