r81075 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r81074‎ | r81075 | r81076 >
Date:00:13, 27 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Three new plugins and some minor fixes.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/total_cumulative_edits.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/total_cumulative_edits.py
@@ -0,0 +1,26 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+
def total_cumulative_edits(var, editor, **kwargs):
    '''
    Plugin: feed one editor's edit count into the supplied variable.

    Reads editor['edit_count'] and passes it, together with the current
    local date/time, to var.add(..., update=True). Any extra keyword
    arguments are accepted but not used. The same var object is returned
    so the caller can keep passing it along from editor to editor.
    '''
    edit_total = editor['edit_count']
    # NOTE(review): the observation is stamped with "now", not with the date
    # of the edits themselves -- presumably var.add buckets it by time_unit.
    var.add(datetime.datetime.today(), edit_total, update=True)
    return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_cumulative_edits.py
___________________________________________________________________
Added: svn:eol-style
128 + native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
@@ -0,0 +1,25 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+
def total_number_of_new_wikipedians(var, editor, **kwargs):
    '''
    Plugin: count one more editor on the supplied variable.

    Calls var.add with the current local date/time and the constant 1
    (update=True). The editor record and any extra keyword arguments are
    accepted but not used. Returns the same var object that was passed in.
    '''
    var.add(datetime.datetime.today(), 1, update=True)
    return var
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
@@ -21,12 +21,12 @@
2222 def cohort_dataset_forward_histogram(var, editor, **kwargs):
2323 # headers = ['year', 'month', 'edits']
2424 new_wikipedian = editor['new_wikipedian']
25 - final_edit = editor['final_edit']
 25+ final_edit = editor['final_edit'].year + 1
2626 yearly_edits = editor['edits_by_year']
2727 n = editor['edit_count']
2828
2929 if n >= var.cum_cutoff:
30 - for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)):
 30+ for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
3131 edits = editor['monthly_edits'].get(str(year), {0:0})
3232 if year == new_wikipedian.year:
3333 start = new_wikipedian.month
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
@@ -0,0 +1,34 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+
def total_number_of_articles(var, editor, **kwargs):
    '''
    Plugin: tally how often each article occurs in one editor's edits.

    editor['edits'] is walked key by key; every entry found under a key
    must provide an 'article' item. The result is a plain dict mapping
    each article value to its number of occurrences, which is handed to
    var.add together with the current local date/time (update=True).
    Extra keyword arguments are accepted but not used. Returns var.
    '''
    stamp = datetime.datetime.today()
    edits_by_year = editor['edits']

    article_counts = {}
    for year in edits_by_year:
        for edit in edits_by_year[year]:
            article_key = edit['article']
            article_counts[article_key] = article_counts.get(article_key, 0) + 1

    var.add(stamp, article_counts, update=True)
    return var
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -17,26 +17,20 @@
1818 __date__ = '2010-12-10'
1919 __version__ = '0.1'
2020
21 -import datetime
22 -import multiprocessing
23 -import calendar
 21+
2422 import sys
2523 import os
2624 import progressbar
2725 import types
28 -from dateutil.relativedelta import relativedelta
 26+import datetime
2927
3028 sys.path.append('..')
3129
3230 import configuration
3331 settings = configuration.Settings()
3432 from database import db
35 -from etl import shaper
36 -from utils import file_utils
3733 from utils import timer
38 -from utils import messages
3934 from utils import log
40 -import analyses.plugins as plugins
4135 import dataset
4236
4337
@@ -56,16 +50,19 @@
5751 print 'Exporting data for chart: %s' % func.func_name
5852 print 'Project: %s' % dbname
5953 print 'Dataset: %s' % collection
 54+
6055 ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
61 - file = '%s_%s.csv' % (dbname, func.func_name)
62 - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
 56+ fn = '%s_%s.csv' % (dbname, func.func_name)
 57+
 58+ print 'Storing dataset: %s' % os.path.join(settings.dataset_location, fn)
6359 ds.write(format='csv')
 60+
6461 print 'Serializing dataset to %s_%s' % (dbname, 'charts')
6562 log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
6663 ds.write(format='mongo')
6764 stopwatch.elapsed()
 65+
6866 log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
69 -
7067 return res
7168
7269
@@ -74,26 +71,32 @@
7572 Generic loop function that loops over all the editors of a Wikipedia project
7673 and then calls the function that does the actual aggregation.
7774 '''
78 -
 75+ mongo = db.init_mongo_db(dbname)
 76+ coll = mongo[collection]
7977 editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
8078
 79+
 80+ min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
8181 pbar = progressbar.ProgressBar(maxval=len(editors)).start()
82 - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
8382 print 'Number of editors: %s' % len(editors)
84 - mongo = db.init_mongo_db(dbname)
85 - coll = mongo[collection]
86 - format = kwargs.pop('format', 'long')
 83+
 84+ fmt = kwargs.pop('format', 'long')
8785 kwargs['min_year'] = min_year
8886 kwargs['max_year'] = max_year
89 - vars = []
90 - ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format)
 87+ variables = []
 88+ ds = dataset.Dataset(func.func_name,
 89+ project,
 90+ coll.name,
 91+ language_code,
 92+ variables,
 93+ format=fmt)
9194 var = dataset.Variable('count', **kwargs)
9295
9396
9497
9598 for editor in editors:
9699 editor = coll.find_one({'editor': editor})
97 - data = func(var, editor, dbname=dbname)
 100+ var = func(var, editor, dbname=dbname)
98101 pbar.update(pbar.currval + 1)
99102
100103 ds.add_variable(var)
@@ -112,8 +115,9 @@
113116 ignore = ['__init__']
114117 functions = {}
115118
116 - pos = __file__.rfind(os.sep)
117 - loc = __file__[:pos]
 119+ fn = os.path.realpath(__file__)
 120+ pos = fn.rfind(os.sep)
 121+ loc = fn[:pos]
118122 path = os.path.join(loc , 'plugins')
119123 plugins = import_libs(path)
120124
@@ -135,7 +139,6 @@
136140 '''
137141 Dynamically importing functions from the plugins directory.
138142 '''
139 -
140143 library_list = []
141144 sys.path.append(path)
142145 for f in os.listdir(os.path.abspath(path)):
@@ -152,10 +155,14 @@
153156 '''
154157 Determine the first and final year for the observed data
155158 '''
156 - max_year = db.run_query(dbname, collection, var, 'max')
157 - max_year = max_year[var].year + 1
158 - min_year = db.run_query(dbname, collection, var, 'min')
159 - min_year = min_year[var].year
 159+ try:
 160+ max_year = db.run_query(dbname, collection, var, 'max')
 161+ max_year = max_year[var].year + 1
 162+ min_year = db.run_query(dbname, collection, var, 'min')
 163+ min_year = min_year[var].year
 164+ except KeyError:
 165+ min_year = 2001
 166+ max_year = datetime.datetime.today().year + 1
160167 return min_year, max_year
161168
162169
@@ -172,8 +179,10 @@
173180
174181
175182 if __name__ == '__main__':
176 -
177 - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50)
 183+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year')
 184+ generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
 185+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year')
 186+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50)
178187 #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
179188 #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
180189 #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -98,10 +98,11 @@
9999 elif self.time_unit == 'month':
100100 datum = datetime.datetime(date.year, date.month, 1)
101101 return time.mktime(datum.timetuple())
 102+ elif self.time_unit == 'day':
 103+ return time.mktime(date.timetuple())
102104 else:
103 - return time.mktime(date.timetuple())
 105+ return date
104106
105 -
106107 class Observation(Data):
107108 '''
108109 The smallest unit, here the actual data is being stored.
@@ -176,8 +177,7 @@
177178
178179 class Variable(Data):
179180 '''
180 - This class constructs a time-based variable and has some associated simple
181 - statistical descriptives
 181+ This class constructs a time-based variable.
182182 '''
183183
184184 def __init__(self, name, time_unit, **kwargs):
@@ -284,7 +284,7 @@
285285 self.variables.append(name)
286286
287287 def __repr__(self):
288 - return 'Dataset contains %s variables' % (len(self.vars))
 288+ return 'Dataset contains %s variables' % (len(self.variables))
289289
290290 def __iter__(self):
291291 for var in self.variables:

Status & tagging log