Index: trunk/tools/editor_trends/analyses/plugins/active_editor_count.py |
— | — | @@ -0,0 +1,28 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-25' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
def active_editor_count(var, editor, **kwargs):
    '''
    Summary: For every month in which the editor made at least var.cutoff
    edits, add one observation (key 0, value 1) dated the first day of that
    month to var.

    var: variable object exposing min_year, max_year, cutoff and add().
    editor: dict with a 'monthly_edits' mapping of str(year) -> str(month)
    -> number of edits.
    Returns var, so the caller can keep accumulating into it.
    '''
    # datetime is not imported at the top of this plugin module.
    import datetime

    monthly_edits = editor['monthly_edits']
    # The year range comes from the variable itself; the previous code read
    # it from an undefined name.
    for year in range(var.min_year, var.max_year):
        edits = monthly_edits.get(str(year), {})
        for month in range(1, 13):
            # Years or months without an entry count as zero edits.
            if edits.get(str(month), 0) >= var.cutoff:
                datum = datetime.date(year, month, 1)
                var.add(datum, {0: 1})
    return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/active_editor_count.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 30 | + native |
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py |
— | — | @@ -0,0 +1,36 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
def cohort_dataset_forward_bar(var, editor, **kwargs):
    '''
    Summary: For editors with at least var.cum_cutoff edits in total, add one
    observation per year in which they were active. The observation is dated
    at the editor's new_wikipedian date and keyed by years of experience
    (1 for the year they became a new wikipedian).

    A year counts as active when its yearly edit total is non-zero and its
    busiest month reaches var.cutoff edits.

    var: variable object exposing cum_cutoff, cutoff, max_year and add().
    editor: dict with 'new_wikipedian', 'monthly_edits', 'edits_by_year' and
    'edit_count'.
    Returns var.
    '''
    new_wikipedian = editor['new_wikipedian']
    monthly_edits = editor['monthly_edits']
    yearly_edits = editor['edits_by_year']

    if editor['edit_count'] < var.cum_cutoff:
        return var

    for year in range(new_wikipedian.year, var.max_year):
        key = str(year)
        # A missing or empty month mapping counts as zero edits, so max()
        # always has a value to work with.
        months = monthly_edits.get(key) or {0: 0}
        if yearly_edits.get(key, 0) == 0 or max(months.values()) < var.cutoff:
            continue
        experience = (year - new_wikipedian.year) + 1
        var.add(new_wikipedian, {experience: 1})
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
def time_to_new_wikipedian(var, editor, **kwargs):
    '''
    Summary: Add the number of whole days between the editor's first edit and
    the moment they became a new wikipedian, dated at the latter.

    var: variable object exposing add(); it is called with update=False
    (NOTE(review): the meaning of that flag lives in the dataset module and
    is not visible here).
    editor: dict with 'new_wikipedian' and 'first_edit' values that can be
    subtracted from each other.
    Returns var.
    '''
    became_new_wikipedian = editor['new_wikipedian']
    elapsed = became_new_wikipedian - editor['first_edit']
    var.add(became_new_wikipedian, {0: elapsed.days}, update=False)
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/__init__.py |
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py |
— | — | @@ -0,0 +1,39 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
def cohort_dataset_forward_histogram(var, editor, **kwargs):
    '''
    Summary: For editors with at least var.cum_cutoff edits in total, add one
    observation for every month, from the month they became a new wikipedian
    up to the end of the year of their final edit, in which they made at
    least var.cutoff edits. Observations are dated at the new_wikipedian
    date and keyed by months of experience (0 for the first month).

    var: variable object exposing cum_cutoff, cutoff and add().
    editor: dict with 'new_wikipedian', 'final_edit', 'monthly_edits'
    (str(year) -> str(month) -> number of edits) and 'edit_count'.
    Returns var.
    '''
    new_wikipedian = editor['new_wikipedian']
    final_edit = editor['final_edit']

    if editor['edit_count'] < var.cum_cutoff:
        return var

    # The year of the final edit is part of the editor's career, so the
    # upper bound has to be inclusive.
    years = range(new_wikipedian.year, final_edit.year + 1)
    for i, year in enumerate(years):
        edits = editor['monthly_edits'].get(str(year), {})
        # In the first year, months before the new_wikipedian month are
        # skipped.
        start = new_wikipedian.month if year == new_wikipedian.year else 1
        for month in range(start, 13):
            if edits.get(str(month), 0) >= var.cutoff:
                experience = i * 12 + (month - new_wikipedian.month)
                var.add(new_wikipedian, {experience: 1})
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py |
— | — | @@ -0,0 +1,40 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
def cohort_dataset_backward_bar(var, editor, **kwargs):
    '''
    Summary: For editors with at least var.cum_cutoff edits in total, look at
    every year from the year they became a new wikipedian up to var.max_year
    (exclusive). If the editor made at least var.cutoff edits in that year,
    compute the whole months between the new_wikipedian date and the last
    edit of that year, and add one observation dated 31 December of that
    year, keyed by the smallest window (12, 24, 36, ... months) that covers
    that age.

    var: variable object exposing cum_cutoff, cutoff, min_year, max_year and
    add().
    editor: dict with 'new_wikipedian', 'edit_count', 'edits_by_year' and
    'last_edit_by_year' (the last two keyed by str(year); a last edit of 0.0
    is treated as "no edit that year").
    Returns var.
    '''
    new_wikipedian = editor['new_wikipedian']
    if editor['edit_count'] < var.cum_cutoff:
        return var

    # Neither name is imported at the top of this plugin module; importing
    # here keeps the plugin self-contained.
    import datetime
    from dateutil.relativedelta import relativedelta

    # Yearly windows in months: the same list analyzer.create_windows builds
    # with break_down_first_year=False. That helper is not importable from
    # this module.
    windows = [y * 12 for y in range(1, var.max_year - var.min_year)]

    edits_by_year = editor['edits_by_year']
    last_edit_by_year = editor['last_edit_by_year']
    for year in range(new_wikipedian.year, var.max_year):
        key = str(year)
        # Years without an entry count as zero edits / no last edit.
        if edits_by_year.get(key, 0) < var.cutoff:
            continue
        last_edit = last_edit_by_year.get(key, 0.0)
        if last_edit == 0.0:
            continue
        delta = relativedelta(last_edit, new_wikipedian)
        months = (delta.years * 12) + delta.months
        for w in windows:
            if w >= months:
                datum = datetime.datetime(year, 12, 31)
                var.add(datum, {w: 1})
                break
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py |
— | — | @@ -0,0 +1,26 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
def histogram_edits(var, editor, **kwargs):
    '''
    Summary: Add the editor's total edit count as a single observation
    (key 0), dated at the moment the editor became a new wikipedian.

    var: variable object exposing add().
    editor: dict with 'edit_count' and 'new_wikipedian'.
    Returns var.
    '''
    total_edits = editor['edit_count']
    var.add(editor['new_wikipedian'], {0: total_edits})
    return var
|
Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py |
— | — | @@ -0,0 +1,31 @@ |
| 2 | +#!/usr/bin/python
|
| 3 | +# -*- coding: utf-8 -*-
|
| 4 | +'''
|
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
| 6 | +This program is free software; you can redistribute it and/or
|
| 7 | +modify it under the terms of the GNU General Public License version 2
|
| 8 | +as published by the Free Software Foundation.
|
| 9 | +This program is distributed in the hope that it will be useful,
|
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 12 | +See the GNU General Public License for more details, at
|
| 13 | +http://www.fsf.org/licenses/gpl.html
|
| 14 | +'''
|
| 15 | +
|
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
| 17 | +__email__ = 'dvanliere at gmail dot com'
|
| 18 | +__date__ = '2011-01-25'
|
| 19 | +__version__ = '0.1'
|
| 20 | +
|
| 21 | +
|
def new_editor_count(var, editor, **kwargs):
    '''
    Summary: Count the editor once (key 0, value 1) at the date on which
    they became a new wikipedian, so that the aggregated variable gives the
    number of new wikipedians per period.
    Purpose: The resulting counts can be compared with Erik Zachte's
    published statistics to check that both use the same numbers.

    var: variable object exposing add().
    editor: dict with 'new_wikipedian'.
    Returns var.
    '''
    var.add(editor['new_wikipedian'], {0: 1})
    return var
|
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -0,0 +1,176 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-12-10' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import datetime |
| 22 | +import multiprocessing |
| 23 | +import calendar |
| 24 | +import sys |
| 25 | +import os |
| 26 | +import inspect |
| 27 | +import progressbar |
| 28 | +import types |
| 29 | +from dateutil.relativedelta import relativedelta |
| 30 | + |
| 31 | +sys.path.append('..') |
| 32 | + |
| 33 | +import configuration |
| 34 | +settings = configuration.Settings() |
| 35 | +from database import db |
| 36 | +from etl import shaper |
| 37 | +from utils import file_utils |
| 38 | +from utils import timer |
| 39 | +from utils import messages |
| 40 | +from utils import log |
| 41 | +import analyses.plugins as plugins |
| 42 | +import dataset |
| 43 | + |
| 44 | + |
def available_analyses(caller='manage'):
    '''
    Discover the analysis plugins that live in the 'plugins' directory next
    to this module.

    caller: either 'manage' or 'django'.
    Returns, for caller == 'manage', a dictionary:
        key: name of analysis
        value: function that generates the dataset
    and for caller == 'django' a list of (name, fancy_name) tuples in which
    fancy_name is the name with underscores replaced by spaces and
    title-cased.
    ignore: a list of functions that should never be called from manage.py,
    they are not valid entry points.
    '''
    assert caller == 'django' or caller == 'manage'
    ignore = ['__init__']

    # Derive the directory from __file__ itself: stripping '<module>.py' from
    # the path silently fails when __file__ points at a compiled '.pyc' file.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    plugin_dir = os.path.join(module_dir, 'plugins')

    functions = {}
    for plugin in import_libs(plugin_dir):
        # Anything that is not a plain function is not an analysis entry
        # point.
        if isinstance(plugin, types.FunctionType) and plugin.__name__ not in ignore:
            functions[plugin.__name__] = plugin

    if caller == 'manage':
        return functions
    return [(name, name.replace('_', ' ').title()) for name in functions]
| 74 | + |
| 75 | + |
def import_libs(path):
    """
    Dynamically import the plugin functions from the directory `path`.

    Every '<name>.py' file in the directory is imported as a top-level module
    and is expected to define an attribute called '<name>'; that attribute is
    collected. Package markers such as '__init__.py' are skipped because they
    are not plugins.

    path: directory that contains the plugin modules.
    Returns a list with one attribute per plugin module, ordered by file name.
    Raises AttributeError when a plugin module does not define an attribute
    named after itself.
    """
    library_list = []
    # Register the directory only once, however often discovery runs.
    if path not in sys.path:
        sys.path.append(path)
    # Sorted so the result does not depend on directory listing order.
    for f in sorted(os.listdir(os.path.abspath(path))):
        module_name, ext = os.path.splitext(f)
        if ext != '.py' or module_name.startswith('__'):
            continue
        module = __import__(module_name)
        library_list.append(getattr(module, module_name))

    return library_list
| 91 | + |
| 92 | + |
def determine_project_year_range(dbname, collection, var):
    '''
    Determine the first and final year for the observed data.

    Runs a 'min' and a 'max' query for the field var and reads the year of
    each result.
    Returns (min_year, max_year), where max_year is the year of the largest
    value plus one.
    '''
    newest = db.run_query(dbname, collection, var, 'max')
    upper = newest[var].year + 1
    oldest = db.run_query(dbname, collection, var, 'min')
    lower = oldest[var].year
    return lower, upper
| 102 | + |
| 103 | + |
def create_windows(var, break_down_first_year=True):
    '''
    Build a list of window sizes expressed in months: 12, 24, ... for every
    whole year up to, but not including, the span between var.min_year and
    var.max_year. With break_down_first_year = True the list is prefixed
    with 3, 6 and 9 months.
    '''
    span = var.max_year - var.min_year
    yearly = [12 * y for y in range(1, span)]
    if not break_down_first_year:
        return yearly
    return [3, 6, 9] + yearly
| 114 | + |
| 115 | + |
def generate_chart_data(project, collection, language_code, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating
    charts.

    project, language_code: combined as '<language_code><project>' to form
    the database name.
    collection: name of the collection that holds the editors.
    func: the analysis to run, given either as its name or as the plugin
    function itself.
    kwargs: passed on to loop_editors.

    The dataset is written as csv and to mongo; the mongo write is logged
    with a 'start' and a 'finish' event.
    '''
    stopwatch = timer.Timer()
    dbname = '%s%s' % (language_code, project)
    # A string has no __name__, so this yields the analysis name for both
    # accepted forms of func.
    func_name = getattr(func, '__name__', func)
    print('Exporting data for chart: %s' % func_name)
    print('Project: %s' % dbname)
    print('Dataset: %s' % collection)
    # loop_editors looks the analysis up by name.
    ds = loop_editors(dbname, project, collection, language_code, func_name, **kwargs)
    filename = '%s_%s.csv' % (dbname, func_name)
    print('Storing dataset: %s' % os.path.join(settings.dataset_location, filename))
    ds.write(format='csv')
    print('Serializing dataset to %s_%s' % (dbname, 'charts'))
    log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
    ds.write(format='mongo')
    stopwatch.elapsed()
    log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
| 134 | + |
| 135 | + |
def loop_editors(dbname, project, collection, language_code, func, **kwargs):
    '''
    Generic loop function that loops over all the editors of a Wikipedia
    project and then calls the function that does the actual aggregation.

    func: the analysis to run, given either as its name (looked up through
    available_analyses) or as the plugin function itself.
    kwargs: 'format' (default 'long') is removed and handed to the Dataset;
    the remaining items, plus the observed min_year and max_year, are passed
    to the Variable that the analysis fills.
    Returns the Dataset holding that single variable.
    Raises KeyError when func is a name that is not an available analysis.
    '''
    editors = db.retrieve_distinct_keys(dbname, collection, 'editor')

    pbar = progressbar.ProgressBar(maxval=len(editors)).start()
    min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
    print('Number of editors: %s' % len(editors))
    mongo = db.init_mongo_db(dbname)
    coll = mongo[collection]
    output_format = kwargs.pop('format', 'long')
    kwargs['min_year'] = min_year
    kwargs['max_year'] = max_year

    # The Dataset is always labelled with the analysis name.
    func_name = func.__name__ if callable(func) else func
    ds = dataset.Dataset(func_name, project, coll.name, language_code, [], format=output_format)
    var = dataset.Variable('count', **kwargs)

    analysis = func if callable(func) else available_analyses()[func_name]

    for editor in editors:
        record = coll.find_one({'editor': editor})
        # The analysis accumulates into var.
        analysis(var, record, dbname=dbname)
        pbar.update(pbar.currval + 1)

    ds.add_variable(var)
    return ds
| 166 | + |
| 167 | + |
if __name__ == '__main__':
    # Default run: the forward cohort histogram for the English Wikipedia.
    options = {'time_unit': 'month', 'cutoff': 1, 'cum_cutoff': 50}
    generate_chart_data('wiki', 'editors_dataset', 'en',
                        'cohort_dataset_forward_histogram', **options)

    # Other analyses that have been run from here:
    #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
    #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
    #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)
    #generate_chart_data('wiki', 'editors_dataset','en', time_to_new_wikipedian, time_unit='year', cutoff=0)
    #generate_chart_data('wiki', 'editors_dataset','en', new_editor_count, time_unit='month', cutoff=0)

    #available_analyses()
Property changes on: trunk/tools/editor_trends/analyses/analyzer.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 178 | + native |