Index: trunk/tools/editor_trends/analyses/plugins/total_cumulative_edits.py |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-25' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import datetime |
| 22 | + |
def total_cumulative_edits(var, editor, **kwargs):
    '''
    Plugin that records an editor's edit count on the supplied variable.

    Reads editor['edit_count'] and hands it to var.add together with the
    current local date/time and update=True. Extra keyword arguments are
    accepted but not used. Returns the same var object that was passed in.
    '''
    edit_count = editor['edit_count']
    timestamp = datetime.datetime.today()
    var.add(timestamp, edit_count, update=True)
    return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_cumulative_edits.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 28 | + native |
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py |
— | — | @@ -0,0 +1,25 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +import datetime
|
| 22 | +
|
def total_number_of_new_wikipedians(var, editor, **kwargs):
    '''
    Plugin that adds one observation per editor to the supplied variable.

    Calls var.add with the current local date/time, the value 1 and
    update=True. The editor argument and any extra keyword arguments are
    accepted but not inspected. Returns the same var object that was
    passed in.
    '''
    timestamp = datetime.datetime.today()
    var.add(timestamp, 1, update=True)
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py |
— | — | @@ -21,12 +21,12 @@ |
22 | 22 | def cohort_dataset_forward_histogram(var, editor, **kwargs):
|
23 | 23 | # headers = ['year', 'month', 'edits']
|
24 | 24 | new_wikipedian = editor['new_wikipedian']
|
25 | | - final_edit = editor['final_edit']
|
| 25 | + final_edit = editor['final_edit'].year + 1
|
26 | 26 | yearly_edits = editor['edits_by_year']
|
27 | 27 | n = editor['edit_count']
|
28 | 28 |
|
29 | 29 | if n >= var.cum_cutoff:
|
30 | | - for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)):
|
| 30 | + for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
|
31 | 31 | edits = editor['monthly_edits'].get(str(year), {0:0})
|
32 | 32 | if year == new_wikipedian.year:
|
33 | 33 | start = new_wikipedian.month
|
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +import datetime
|
| 22 | +
|
def total_number_of_articles(var, editor, **kwargs):
    '''
    Plugin that tallies, per article, how many of an editor's edits touch it.

    Walks every entry of editor['edits'] (a mapping whose values are
    iterables of edit records; the keys are presumably years), counts how
    often each edit['article'] value occurs, and passes the resulting
    {article: count} dictionary to var.add together with the current local
    date/time and update=True. Extra keyword arguments are accepted but not
    used. Returns the same var object that was passed in.
    '''
    timestamp = datetime.datetime.today()
    edits_by_year = editor['edits']
    article_counts = {}
    for year in edits_by_year:
        for edit in edits_by_year[year]:
            title = edit['article']
            article_counts[title] = article_counts.get(title, 0) + 1

    var.add(timestamp, article_counts, update=True)
    return var
|
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -17,26 +17,20 @@ |
18 | 18 | __date__ = '2010-12-10' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -import datetime |
22 | | -import multiprocessing |
23 | | -import calendar |
| 21 | + |
24 | 22 | import sys |
25 | 23 | import os |
26 | 24 | import progressbar |
27 | 25 | import types |
28 | | -from dateutil.relativedelta import relativedelta |
| 26 | +import datetime |
29 | 27 | |
30 | 28 | sys.path.append('..') |
31 | 29 | |
32 | 30 | import configuration |
33 | 31 | settings = configuration.Settings() |
34 | 32 | from database import db |
35 | | -from etl import shaper |
36 | | -from utils import file_utils |
37 | 33 | from utils import timer |
38 | | -from utils import messages |
39 | 34 | from utils import log |
40 | | -import analyses.plugins as plugins |
41 | 35 | import dataset |
42 | 36 | |
43 | 37 | |
— | — | @@ -56,16 +50,19 @@ |
57 | 51 | print 'Exporting data for chart: %s' % func.func_name |
58 | 52 | print 'Project: %s' % dbname |
59 | 53 | print 'Dataset: %s' % collection |
| 54 | + |
60 | 55 | ds = loop_editors(dbname, project, collection, language_code, func, **kwargs) |
61 | | - file = '%s_%s.csv' % (dbname, func.func_name) |
62 | | - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file) |
| 56 | + fn = '%s_%s.csv' % (dbname, func.func_name) |
| 57 | + |
| 58 | + print 'Storing dataset: %s' % os.path.join(settings.dataset_location, fn) |
63 | 59 | ds.write(format='csv') |
| 60 | + |
64 | 61 | print 'Serializing dataset to %s_%s' % (dbname, 'charts') |
65 | 62 | log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start') |
66 | 63 | ds.write(format='mongo') |
67 | 64 | stopwatch.elapsed() |
| 65 | + |
68 | 66 | log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
69 | | - |
70 | 67 | return res |
71 | 68 | |
72 | 69 | |
— | — | @@ -74,26 +71,32 @@ |
75 | 72 | Generic loop function that loops over all the editors of a Wikipedia project |
76 | 73 | and then calls the function that does the actual aggregation. |
77 | 74 | ''' |
78 | | - |
| 75 | + mongo = db.init_mongo_db(dbname) |
| 76 | + coll = mongo[collection] |
79 | 77 | editors = db.retrieve_distinct_keys(dbname, collection, 'editor') |
80 | 78 | |
| 79 | + |
| 80 | + min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian') |
81 | 81 | pbar = progressbar.ProgressBar(maxval=len(editors)).start() |
82 | | - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian') |
83 | 82 | print 'Number of editors: %s' % len(editors) |
84 | | - mongo = db.init_mongo_db(dbname) |
85 | | - coll = mongo[collection] |
86 | | - format = kwargs.pop('format', 'long') |
| 83 | + |
| 84 | + fmt = kwargs.pop('format', 'long') |
87 | 85 | kwargs['min_year'] = min_year |
88 | 86 | kwargs['max_year'] = max_year |
89 | | - vars = [] |
90 | | - ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format) |
| 87 | + variables = [] |
| 88 | + ds = dataset.Dataset(func.func_name, |
| 89 | + project, |
| 90 | + coll.name, |
| 91 | + language_code, |
| 92 | + variables, |
| 93 | + format=fmt) |
91 | 94 | var = dataset.Variable('count', **kwargs) |
92 | 95 | |
93 | 96 | |
94 | 97 | |
95 | 98 | for editor in editors: |
96 | 99 | editor = coll.find_one({'editor': editor}) |
97 | | - data = func(var, editor, dbname=dbname) |
| 100 | + var = func(var, editor, dbname=dbname) |
98 | 101 | pbar.update(pbar.currval + 1) |
99 | 102 | |
100 | 103 | ds.add_variable(var) |
— | — | @@ -112,8 +115,9 @@ |
113 | 116 | ignore = ['__init__'] |
114 | 117 | functions = {} |
115 | 118 | |
116 | | - pos = __file__.rfind(os.sep) |
117 | | - loc = __file__[:pos] |
| 119 | + fn = os.path.realpath(__file__) |
| 120 | + pos = fn.rfind(os.sep) |
| 121 | + loc = fn[:pos] |
118 | 122 | path = os.path.join(loc , 'plugins') |
119 | 123 | plugins = import_libs(path) |
120 | 124 | |
— | — | @@ -135,7 +139,6 @@ |
136 | 140 | ''' |
137 | 141 | Dynamically importing functions from the plugins directory. |
138 | 142 | ''' |
139 | | - |
140 | 143 | library_list = [] |
141 | 144 | sys.path.append(path) |
142 | 145 | for f in os.listdir(os.path.abspath(path)): |
— | — | @@ -152,10 +155,14 @@ |
153 | 156 | ''' |
154 | 157 | Determine the first and final year for the observed data |
155 | 158 | ''' |
156 | | - max_year = db.run_query(dbname, collection, var, 'max') |
157 | | - max_year = max_year[var].year + 1 |
158 | | - min_year = db.run_query(dbname, collection, var, 'min') |
159 | | - min_year = min_year[var].year |
| 159 | + try: |
| 160 | + max_year = db.run_query(dbname, collection, var, 'max') |
| 161 | + max_year = max_year[var].year + 1 |
| 162 | + min_year = db.run_query(dbname, collection, var, 'min') |
| 163 | + min_year = min_year[var].year |
| 164 | + except KeyError: |
| 165 | + min_year = 2001 |
| 166 | + max_year = datetime.datetime.today().year + 1 |
160 | 167 | return min_year, max_year |
161 | 168 | |
162 | 169 | |
— | — | @@ -172,8 +179,10 @@ |
173 | 180 | |
174 | 181 | |
175 | 182 | if __name__ == '__main__': |
176 | | - |
177 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50) |
| 183 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year') |
| 184 | + generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year') |
| 185 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year') |
| 186 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50) |
178 | 187 | #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide') |
179 | 188 | #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide') |
180 | 189 | #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0) |
Index: trunk/tools/editor_trends/analyses/dataset.py |
— | — | @@ -98,10 +98,11 @@ |
99 | 99 | elif self.time_unit == 'month': |
100 | 100 | datum = datetime.datetime(date.year, date.month, 1) |
101 | 101 | return time.mktime(datum.timetuple()) |
| 102 | + elif self.time_unit == 'day': |
| 103 | + return time.mktime(date.timetuple()) |
102 | 104 | else: |
103 | | - return time.mktime(date.timetuple()) |
| 105 | + return date |
104 | 106 | |
105 | | - |
106 | 107 | class Observation(Data): |
107 | 108 | ''' |
108 | 109 | The smallest unit, here the actual data is being stored. |
— | — | @@ -176,8 +177,7 @@ |
177 | 178 | |
178 | 179 | class Variable(Data): |
179 | 180 | ''' |
180 | | - This class constructs a time-based variable and has some associated simple |
181 | | - statistical descriptives |
| 181 | + This class constructs a time-based variable. |
182 | 182 | ''' |
183 | 183 | |
184 | 184 | def __init__(self, name, time_unit, **kwargs): |
— | — | @@ -284,7 +284,7 @@ |
285 | 285 | self.variables.append(name) |
286 | 286 | |
287 | 287 | def __repr__(self): |
288 | | - return 'Dataset contains %s variables' % (len(self.vars)) |
| 288 | + return 'Dataset contains %s variables' % (len(self.variables)) |
289 | 289 | |
290 | 290 | def __iter__(self): |
291 | 291 | for var in self.variables: |