r80980 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80979‎ | r80980 | r80981 >
Date:19:10, 25 January 2011
Author:diederik
Status:deferred (Comments)
Tags:
Comment:
This commit contains a simple, extensible plugin architecture for adding new analyses on the fly. Drop a module into the analyses/plugins/ folder that defines a function with the same name as the module, and it becomes available both as an option in manage.py and as a chart in wikilytics. The signature of a plugin function is:
var: an instance of Variable
editor: an object from Mongo
kwargs: additional keyword arguments

var should be returned.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/__init__.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/active_editor_count.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_edits.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/new_editor_count.py (added) (history)
  • /trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/active_editor_count.py
@@ -0,0 +1,28 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def active_editor_count(var, editor, **kwargs):
    '''
    Summary: For every year/month between var.min_year (inclusive) and
    var.max_year (exclusive) in which the editor made at least var.cutoff
    edits, add an observation {0: 1} dated on the first day of that month.

    var: an instance of Variable; must expose min_year, max_year, cutoff
    and add().
    editor: an object from Mongo; only editor['monthly_edits'] is read, a
    mapping of str(year) -> mapping of str(month) -> number of edits.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # Plugin modules carry no module-level imports, so import locally.
    import datetime

    monthly_edits = editor['monthly_edits']
    # The year range lives on the variable itself; there is no dataset
    # object in scope inside a plugin.
    for year in range(var.min_year, var.max_year):
        # Years or months without recorded edits count as zero edits
        # instead of raising KeyError.
        edits = monthly_edits.get(str(year), {})
        for month in range(1, 13):
            if edits.get(str(month), 0) >= var.cutoff:
                datum = datetime.date(year, month, 1)
                var.add(datum, {0: 1})
    return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/active_editor_count.py
___________________________________________________________________
Added: svn:eol-style
130 + native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
@@ -0,0 +1,36 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def cohort_dataset_forward_bar(var, editor, **kwargs):
    '''
    Summary: For an editor with at least var.cum_cutoff edits in total, walk
    the years from the year of editor['new_wikipedian'] up to (but not
    including) var.max_year. For each year that has edits and whose busiest
    month reaches var.cutoff, add an observation {experience: 1}, where
    experience is the 1-based number of years since becoming a new
    wikipedian. Observations are dated on editor['new_wikipedian'].

    var: an instance of Variable; must expose cum_cutoff, cutoff, max_year
    and add().
    editor: an object from Mongo; reads 'new_wikipedian', 'monthly_edits',
    'edits_by_year' and 'edit_count'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    new_wikipedian = editor['new_wikipedian']
    monthly_edits = editor['monthly_edits']
    yearly_edits = editor['edits_by_year']

    if editor['edit_count'] < var.cum_cutoff:
        return var

    for year in range(new_wikipedian.year, var.max_year):
        key = str(year)
        # `or` covers both a missing year and a year stored as an empty
        # dict; max() over an empty sequence would raise ValueError.
        busiest_month = max((monthly_edits.get(key) or {0: 0}).values())
        if yearly_edits.get(key, 0) == 0 or busiest_month < var.cutoff:
            continue
        experience = (year - new_wikipedian.year) + 1
        var.add(new_wikipedian, {experience: 1})
    return var
Index: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
@@ -0,0 +1,26 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
def time_to_new_wikipedian(var, editor, **kwargs):
    '''
    Summary: Add one observation holding the number of days between the
    editor's first edit and the moment the editor became a new wikipedian.

    var: an instance of Variable; only add() is used.
    editor: an object from Mongo; reads 'new_wikipedian' and 'first_edit'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # headers = ['year', 'time_to_new_wikipedian']
    became_new_wikipedian = editor['new_wikipedian']
    elapsed = became_new_wikipedian - editor['first_edit']
    var.add(became_new_wikipedian, {0: elapsed.days}, update=False)
    return var
Index: trunk/tools/editor_trends/analyses/plugins/__init__.py
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
@@ -0,0 +1,39 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def cohort_dataset_forward_histogram(var, editor, **kwargs):
    '''
    Summary: For an editor with at least var.cum_cutoff edits in total, walk
    every month from the month of editor['new_wikipedian'] through December
    of the year of editor['final_edit']. For each month with at least
    var.cutoff edits, add an observation {experience: 1}, where experience
    is the number of months elapsed since becoming a new wikipedian.
    Observations are dated on editor['new_wikipedian'].

    var: an instance of Variable; must expose cum_cutoff, cutoff and add().
    editor: an object from Mongo; reads 'new_wikipedian', 'final_edit',
    'monthly_edits' and 'edit_count'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # headers = ['year', 'month', 'edits']
    new_wikipedian = editor['new_wikipedian']
    final_edit = editor['final_edit']

    if editor['edit_count'] < var.cum_cutoff:
        return var

    monthly_edits = editor['monthly_edits']
    # range() excludes its upper bound; the + 1 keeps the year of the final
    # edit in the loop (without it that year is dropped, and nothing at all
    # is recorded when both dates fall in the same year).
    years = range(new_wikipedian.year, final_edit.year + 1)
    for i, year in enumerate(years):
        edits = monthly_edits.get(str(year), {0: 0})
        # In the first year, skip the months before becoming a new wikipedian.
        if year == new_wikipedian.year:
            start = new_wikipedian.month
        else:
            start = 1
        for month in range(start, 13):
            if edits.get(str(month), 0) >= var.cutoff:
                experience = i * 12 + (month - new_wikipedian.month)
                var.add(new_wikipedian, {experience: 1})
    return var
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -0,0 +1,40 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def cohort_dataset_backward_bar(var, editor, **kwargs):
    '''
    Summary: For an editor with at least var.cum_cutoff edits in total, walk
    the years from the year of editor['new_wikipedian'] up to (but not
    including) var.max_year. For each year with at least var.cutoff edits
    and a non-zero last edit, compute the number of months between
    becoming a new wikipedian and the last edit of that year, and add an
    observation {w: 1} for the first window w (in months) that is at least
    that large. Observations are dated on December 31st of the year.

    var: an instance of Variable; must expose cum_cutoff, cutoff, max_year
    and add(), plus whatever create_windows() reads from it.
    editor: an object from Mongo; reads 'new_wikipedian', 'edit_count',
    'edits_by_year' and 'last_edit_by_year'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # Plugin modules carry no module-level imports, so every helper used
    # below has to be imported here; otherwise the first call raises
    # NameError.
    import datetime
    from dateutil.relativedelta import relativedelta
    # NOTE(review): assumes the `analyses` package is importable from the
    # plugin (analyzer.py itself does `import analyses.plugins`) - confirm.
    from analyses.analyzer import create_windows

    #first_edit = editor['first_edit']
    new_wikipedian = editor['new_wikipedian']

    if editor['edit_count'] < var.cum_cutoff:
        return var

    windows = create_windows(var, break_down_first_year=False)
    for year in range(new_wikipedian.year, var.max_year):
        # Mongo keys are strings; keep the int around for the date below.
        year_key = str(year)
        if editor['edits_by_year'][year_key] < var.cutoff:
            continue
        last_edit = editor['last_edit_by_year'][year_key]
        # 0.0 appears to mark "no edit in this year" - skip those.
        if last_edit == 0.0:
            continue
        delta = relativedelta(last_edit, new_wikipedian)
        months_active = (delta.years * 12) + delta.months
        for w in windows:
            if w >= months_active:
                datum = datetime.datetime(year, 12, 31)
                var.add(datum, {w: 1})
                break
    return var
Index: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
@@ -0,0 +1,26 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def histogram_edits(var, editor, **kwargs):
    '''
    Summary: Add one observation holding the editor's total edit count,
    dated on the moment the editor became a new wikipedian.

    var: an instance of Variable; only add() is used.
    editor: an object from Mongo; reads 'edit_count' and 'new_wikipedian'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # headers = ['year', 'num_edits', 'frequency']
    total_edits = editor['edit_count']
    var.add(editor['new_wikipedian'], {0: total_edits})
    return var
Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
@@ -0,0 +1,31 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
def new_editor_count(var, editor, **kwargs):
    '''
    Summary: This function generates an overview of the number of
    new_wikipedians for a given year / month combination by adding an
    observation {0: 1} dated on the moment the editor became a new
    wikipedian.
    Purpose: This data can be used to compare with Erik Zachte's
    stats.download.org to make sure that we are using the same numbers.

    var: an instance of Variable; only add() is used.
    editor: an object from Mongo; reads 'new_wikipedian'.
    kwargs: additional keyword arguments (unused).

    Returns var.
    '''
    # headers = ['year', 'month', 'count']
    became_new_wikipedian = editor['new_wikipedian']
    var.add(became_new_wikipedian, {0: 1})
    return var
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -0,0 +1,176 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-12-10'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+import multiprocessing
 23+import calendar
 24+import sys
 25+import os
 26+import inspect
 27+import progressbar
 28+import types
 29+from dateutil.relativedelta import relativedelta
 30+
 31+sys.path.append('..')
 32+
 33+import configuration
 34+settings = configuration.Settings()
 35+from database import db
 36+from etl import shaper
 37+from utils import file_utils
 38+from utils import timer
 39+from utils import messages
 40+from utils import log
 41+import analyses.plugins as plugins
 42+import dataset
 43+
 44+
def available_analyses(caller='manage'):
    '''
    Collect the plugin functions found in the plugins directory that sits
    next to this module.

    caller: either 'manage' or 'django'; anything else fails the assertion.

    Returns, for caller == 'manage', a dictionary:
    key: name of analysis
    value: function that generates the dataset
    and, for caller == 'django', a list of (name, fancy_name) tuples sorted
    by name, where fancy_name is the name with underscores replaced by
    spaces and in title case.

    ignore: a list of functions that should never be called from manage.py,
    they are not valid entry points.
    '''
    assert caller == 'django' or caller == 'manage'
    ignore = ['__init__']

    # Take the directory of this file directly. Stripping '<module>.py' from
    # __file__ by string replacement leaves a stray 'c' behind when the
    # module was loaded from a compiled .pyc file.
    location = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(location, 'plugins')

    functions = {}
    for plugin in import_libs(path):
        # __name__ holds the function name on both Python 2 and Python 3.
        if isinstance(plugin, types.FunctionType) and plugin.__name__ not in ignore:
            functions[plugin.__name__] = plugin

    if caller == 'manage':
        return functions

    # Sorted so the list order does not depend on dict iteration order.
    return [(name, name.replace('_', ' ').title()) for name in sorted(functions)]
 74+
 75+
def import_libs(path):
    """
    Dynamically importing functions from the plugins directory.

    Every '*.py' file in path (except '__init__.py') is imported as a
    top-level module, and the attribute carrying the same name as the module
    is collected.

    path: directory that contains the plugin modules.

    Returns a list with one object per plugin module, ordered by file name.
    Raises AttributeError when a module does not define an attribute named
    after itself.
    """
    path = os.path.abspath(path)
    # This function runs for every analysis; register the directory only
    # once instead of growing sys.path on each call.
    if path not in sys.path:
        sys.path.append(path)

    library_list = []
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for f in sorted(os.listdir(path)):
        module_name, ext = os.path.splitext(f)
        # The package marker is not a plugin and must not be imported as a
        # top-level module called '__init__'.
        if ext != '.py' or module_name == '__init__':
            continue
        module = __import__(module_name)
        func = getattr(module, module_name)
        library_list.append(func)

    return library_list
 91+
 92+
def determine_project_year_range(dbname, collection, var):
    '''
    Determine the first and final year for the observed data.

    Runs a 'max' and a 'min' query through db.run_query() and reads the
    .year of the var field of each result.

    Returns (min_year, max_year); max_year is the largest observed year
    plus one.
    '''
    newest = db.run_query(dbname, collection, var, 'max')
    upper = newest[var].year + 1
    oldest = db.run_query(dbname, collection, var, 'min')
    lower = oldest[var].year
    return lower, upper
 102+
 103+
def create_windows(var, break_down_first_year=True):
    '''
    This function creates a list of window sizes expressed in months: one
    multiple of 12 for every whole year from 1 up to (but not including)
    var.max_year - var.min_year. If break_down_first_year = True then the
    list is prefixed with 3, 6 and 9 months as well.
    '''
    span = var.max_year - var.min_year
    yearly = [12 * n for n in range(1, span)]
    if not break_down_first_year:
        return yearly
    return [3, 6, 9] + yearly
 114+
 115+
def generate_chart_data(project, collection, language_code, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating charts.

    project, language_code: combined as '<language_code><project>' to form
    the database name.
    collection: name of the collection that holds the editors.
    func: name of the analysis plugin (a string); a function object is
    accepted as well, in which case its __name__ is used.
    kwargs: passed on to loop_editors().

    The resulting dataset is written as csv and to mongo; nothing is
    returned.
    '''
    stopwatch = timer.Timer()
    dbname = '%s%s' % (language_code, project)
    # Plugins are looked up by name in loop_editors(), so work with the name
    # throughout. A plain string has no func_name attribute, which made the
    # csv file name below raise AttributeError.
    func_name = getattr(func, '__name__', func)
    print('Exporting data for chart: %s' % func_name)
    print('Project: %s' % dbname)
    print('Dataset: %s' % collection)
    ds = loop_editors(dbname, project, collection, language_code, func_name, **kwargs)
    filename = '%s_%s.csv' % (dbname, func_name)
    print('Storing dataset: %s' % os.path.join(settings.dataset_location, filename))
    ds.write(format='csv')
    print('Serializing dataset to %s_%s' % (dbname, 'charts'))
    log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
    ds.write(format='mongo')
    stopwatch.elapsed()
    log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
 134+
 135+
def loop_editors(dbname, project, collection, language_code, func, **kwargs):
    '''
    Generic loop function that loops over all the editors of a Wikipedia project
    and then calls the function that does the actual aggregation.

    func: name of the analysis plugin; it is looked up in the dictionary
    returned by available_analyses().
    kwargs: 'format' (default 'long') is removed and handed to the Dataset;
    the remaining keyword arguments, extended with min_year and max_year,
    are handed to the Variable.

    Returns the Dataset with the aggregated Variable added to it.
    '''
    editors = db.retrieve_distinct_keys(dbname, collection, 'editor')

    pbar = progressbar.ProgressBar(maxval=len(editors)).start()
    min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
    print('Number of editors: %s' % len(editors))
    coll = db.init_mongo_db(dbname)[collection]

    output_format = kwargs.pop('format', 'long')
    kwargs['min_year'] = min_year
    kwargs['max_year'] = max_year

    # The dataset is created with the plugin *name*; the function itself is
    # only resolved afterwards.
    ds = dataset.Dataset(func, project, coll.name, language_code, [], format=output_format)
    var = dataset.Variable('count', **kwargs)
    plugin = available_analyses()[func]

    for editor_id in editors:
        record = coll.find_one({'editor': editor_id})
        # Plugins update var through var.add(); their return value is not needed.
        plugin(var, record, dbname=dbname)
        pbar.update(pbar.currval + 1)

    ds.add_variable(var)
    return ds
 166+
 167+
if __name__ == '__main__':
    # Ad-hoc run: build the forward cohort histogram for the English wiki.
    generate_chart_data(
        'wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram',
        time_unit='month', cutoff=1, cum_cutoff=50)

    # Alternative invocations, kept for reference:
    #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
    #generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
    #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)
    #generate_chart_data('wiki', 'editors_dataset','en', time_to_new_wikipedian, time_unit='year', cutoff=0)
    #generate_chart_data('wiki', 'editors_dataset','en', new_editor_count, time_unit='month', cutoff=0)

    #available_analyses()
Property changes on: trunk/tools/editor_trends/analyses/analyzer.py
___________________________________________________________________
Added: svn:eol-style
1178 + native

Comments

#Comment by Krinkle (talk | contribs)   19:16, 25 January 2011

time_to_new_wikipedian.py, cohort_dataset_forward_bar.py, cohort_dataset_forward_histogram.py, cohort_dataset_backward_bar.py, histogram_edits.py -- These seem to have wrong svn properties svn:eol-style

  + native

is needed (see the line height/breaks above)

Status & tagging log