r81236 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r81235 | r81236 | r81237 >
Date:06:45, 31 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Sync
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/count_editors.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/plugins/edit_patterns.py (added) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/run.py (deleted) (history)

Diff [purge]

Index: trunk/tools/editor_trends/run.py
@@ -1,82 +0,0 @@
2 -import os
3 -import configuration
4 -settings = configuration.Settings()
5 -#from utils import namespace_downloader as nd
6 -#nd.launch_downloader()
7 -
8 -
9 -#def which(program):
10 -# import os
11 -# def is_exe(fpath):
12 -# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
13 -#
14 -# fpath, fname = os.path.split(program)
15 -# if fpath:
16 -# if is_exe(program):
17 -# return program
18 -# else:
19 -# for path in os.environ["PATH"].split(os.pathsep):
20 -# exe_file = os.path.join(path, program)
21 -# if is_exe(exe_file):
22 -# return exe_file
23 -#
24 -# return None
25 -#
26 -#
27 -#result = which('7z.exe')
28 -#print result
29 -
30 -#from database import launcher
31 -#launcher.launcher()
32 -from utils import sort
33 -input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
34 -output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
35 -dbname = 'enwiki'
36 -#sort.debug_mergesort_feeder(input, output)
37 -#sort.mergesort_launcher(input, output)
38 -#sort.mergesort_external_launcher(dbname, output, output)
39 -
40 -
41 -
42 -
43 -
44 -from analyses import cohort_charts
45 -cohort_charts.prepare_cohort_dataset()
46 -import os
47 -
48 -import configuration
49 -settings = configuration.Settings()
50 -#from utils import namespace_downloader as nd
51 -#nd.launch_downloader()
52 -
53 -
54 -#def which(program):
55 -# import os
56 -# def is_exe(fpath):
57 -# return os.path.exists(fpath) and os.access(fpath, os.X_OK)
58 -#
59 -# fpath, fname = os.path.split(program)
60 -# if fpath:
61 -# if is_exe(program):
62 -# return program
63 -# else:
64 -# for path in os.environ["PATH"].split(os.pathsep):
65 -# exe_file = os.path.join(path, program)
66 -# if is_exe(exe_file):
67 -# return exe_file
68 -#
69 -# return None
70 -#
71 -#
72 -#result = which('7z.exe')
73 -#print result
74 -
75 -#from database import launcher
76 -#launcher.launcher()
77 -from etl import loader
78 -input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
79 -output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
80 -dbname = 'enwiki'
81 -#sort.debug_mergesort_feeder(input, output)
82 -#sort.mergesort_launcher(input, output)
83 -loader.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Index: trunk/tools/editor_trends/analyses/count_editors.py
@@ -1,251 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2010-12-10'
19 -__version__ = '0.1'
20 -
21 -import datetime
22 -import multiprocessing
23 -import calendar
24 -import sys
25 -import os
26 -import progressbar
27 -import types
28 -from dateutil.relativedelta import relativedelta
29 -
30 -sys.path.append('..')
31 -
32 -import configuration
33 -settings = configuration.Settings()
34 -from database import db
35 -from etl import shaper
36 -from utils import file_utils
37 -from utils import timer
38 -from utils import messages
39 -from utils import log
40 -import dataset
41 -
42 -def available_analyses(caller='manage'):
43 - '''
44 - Generates a dictionary:
45 - key: name of analysis
46 - value: function that generates the dataset
47 - ignore: a list of functions that should never be called from manage.py,
48 - they are not valid entry points.
49 - '''
50 - assert caller == 'django' or caller == 'manage'
51 - ignore = ['analyses', 'determine_project_year_range', 'create_windows',
52 - 'generate_chart_data', 'loop_editors']
53 - functions = {}
54 - for func in globals():
55 - func = globals()[func]
56 - if isinstance(func, types.FunctionType) and func.func_name not in ignore:
57 - functions[func.func_name] = func
58 - if caller == 'manage':
59 - return functions
60 - elif caller == 'django':
61 - django_functions = []
62 - for function in functions:
63 - fancy_name = function.replace('_', ' ').title()
64 - django_functions.append((function, fancy_name))
65 -
66 - return django_functions
67 -
68 -
69 -def determine_project_year_range(dbname, collection, var):
70 - '''
71 - Determine the first and final year for the observed data
72 - '''
73 - max_year = db.run_query(dbname, collection, var, 'max')
74 - max_year = max_year[var].year + 1
75 - min_year = db.run_query(dbname, collection, var, 'min')
76 - min_year = min_year[var].year
77 - return min_year, max_year
78 -
79 -
80 -def create_windows(var, break_down_first_year=True):
81 - '''
82 - This function creates a list of months. If break_down_first_year = True then
83 - the first year will be split in 3, 6, 9 months as well.
84 - '''
85 - years = var.max_year - var.min_year
86 - windows = [y * 12 for y in xrange(1, years)]
87 - if break_down_first_year:
88 - windows = [3, 6, 9] + windows
89 - return windows
90 -
91 -
92 -def generate_chart_data(project, collection, language_code, func, **kwargs):
93 - '''
94 - This is the entry function to be called to generate data for creating charts.
95 - '''
96 - stopwatch = timer.Timer()
97 - dbname = '%s%s' % (language_code, project)
98 - print 'Exporting data for chart: %s' % func.func_name
99 - print 'Project: %s' % dbname
100 - print 'Dataset: %s' % collection
101 - ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
102 - file = '%s_%s.csv' % (dbname, func.func_name)
103 - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
104 - ds.write(format='csv')
105 - print 'Serializing dataset to %s_%s' % (dbname, 'charts')
106 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
107 - ds.write(format='mongo')
108 - stopwatch.elapsed()
109 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
110 -
111 -
112 -def loop_editors(dbname, project, collection, language_code, func, **kwargs):
113 - '''
114 - Generic loop function that loops over all the editors of a Wikipedia project
115 - and then calls the function that does the actual aggregation.
116 - '''
117 -
118 - editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
119 - pbar = progressbar.ProgressBar(maxval=len(editors)).start()
120 - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
121 - print 'Number of editors: %s' % len(editors)
122 - mongo = db.init_mongo_db(dbname)
123 - coll = mongo[collection]
124 - format = kwargs.pop('format', 'long')
125 - kwargs['min_year'] = min_year
126 - kwargs['max_year'] = max_year
127 - vars = []
128 - ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format)
129 - var = dataset.Variable('count', **kwargs)
130 -# cutoff=cutoff,
131 -# cum_cutoff=cum_cutoff,
132 -# min_year=min_year,
133 -# max_year=max_year)
134 - for editor in editors:
135 - editor = coll.find_one({'editor': editor})
136 - data = func(var, editor, dbname=dbname)
137 - pbar.update(pbar.currval + 1)
138 -
139 - ds.add_variable(var)
140 - return ds
141 -
142 -
143 -def cohort_dataset_forward_histogram(var, editor, **kwargs):
144 -# headers = ['year', 'month', 'edits']
145 - new_wikipedian = editor['new_wikipedian']
146 - final_edit = editor['final_edit']
147 - yearly_edits = editor['edits_by_year']
148 - n = editor['edit_count']
149 -
150 - if n >= var.cum_cutoff:
151 - for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)):
152 - edits = editor['monthly_edits'].get(str(year), {0:0})
153 - if year == new_wikipedian.year:
154 - start = new_wikipedian.month
155 - else:
156 - start = 1
157 - for month in xrange(start, 13):
158 - if edits.get(str(month), 0) >= var.cutoff:
159 - experience = i * 12 + (month - new_wikipedian.month)
160 - var.add(new_wikipedian, {experience: 1})
161 - return var
162 -
163 -
164 -def cohort_dataset_backward_bar(var, editor, **kwargs):
165 - #first_edit = editor['first_edit']
166 - new_wikipedian = editor['new_wikipedian']
167 - n = editor['edit_count']
168 -
169 - if n >= var.cum_cutoff:
170 - windows = create_windows(var, break_down_first_year=False)
171 - for year in xrange(new_wikipedian.year, var.max_year):
172 - year = str(year)
173 - if editor['edits_by_year'][year] >= var.cutoff:
174 - last_edit = editor['last_edit_by_year'][year]
175 - if last_edit != 0.0:
176 - editor_dt = relativedelta(last_edit, new_wikipedian)
177 - editor_dt = (editor_dt.years * 12) + editor_dt.months
178 - for w in windows:
179 - if w >= editor_dt:
180 - datum = datetime.datetime(int(year), 12, 31)
181 - var.add(datum, {w:1})
182 - break
183 - return var
184 -
185 -
186 -def cohort_dataset_forward_bar(var, editor, **kwargs):
187 - new_wikipedian = editor['new_wikipedian']
188 - last_edit = editor['final_edit']
189 - monthly_edits = editor['monthly_edits']
190 - yearly_edits = editor['edits_by_year']
191 - n = editor['edit_count']
192 -
193 - if n >= var.cum_cutoff:
194 - for year in xrange(new_wikipedian.year, var.max_year):
195 - max_edits = max(monthly_edits.get(str(year), {0:0}).values())
196 - if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:
197 - continue
198 - else:
199 - experience = (year - new_wikipedian.year) + 1
200 - var.add(new_wikipedian, {experience: 1 })
201 - return var
202 -
203 -
204 -def new_editor_count(var, editor, **kwargs):
205 - '''
206 - Summary: This function generates an overview of the number of
207 - new_wikipedians for a given year / month combination.
208 - Purpose: This data can be used to compare with Erik Zachte's
209 - stats.download.org to make sure that we are using the same numbers.
210 - '''
211 -# headers = ['year', 'month', 'count']
212 - new_wikipedian = editor['new_wikipedian']
213 - var.add(new_wikipedian, {0:1})
214 - return var
215 -
216 -
217 -def active_editor_count(var, editor, **kwargs):
218 - monthly_edits = editor['monthly_edits']
219 - for year in xrange(ds.count.min_year, var.max_year):
220 - for month in xrange(1, 13):
221 - if monthly_edits[str(year)][str(month)] >= var.cutoff:
222 - datum = datetime.date(year, month, 1)
223 - var.add(datum, {0:1})
224 - return var
225 -
226 -
227 -def histogram_edits(var, editor, **kwargs):
228 -# headers = ['year', 'num_edits', 'frequency']
229 - cnt = editor['edit_count']
230 - new_wikipedian = editor['new_wikipedian']
231 - var.add(new_wikipedian, {0: cnt})
232 - return var
233 -
234 -
235 -def time_to_new_wikipedian(var, editor, **kwargs):
236 -# headers = ['year', 'time_to_new_wikipedian']
237 - new_wikipedian = editor['new_wikipedian']
238 - first_edit = editor['first_edit']
239 - dt = new_wikipedian - first_edit
240 - var.add(new_wikipedian, {0:dt.days}, update=False)
241 - return var
242 -
243 -
244 -if __name__ == '__main__':
245 - generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50)
246 - generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
247 - generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
248 - #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)
249 - #generate_chart_data('wiki', 'editors_dataset','en', time_to_new_wikipedian, time_unit='year', cutoff=0)
250 - #generate_chart_data('wiki', 'editors_dataset','en', new_editor_count, time_unit='month', cutoff=0)
251 -
252 - #analyses()
Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
@@ -0,0 +1,47 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-28'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+
def edit_patterns(var, editor, **kwargs):
    '''
    Record, for one editor, which of the 12 consecutive calendar months
    starting at the month of editor['new_wikipedian'] reached var.cutoff
    edits.

    var:    object exposing a `cutoff` attribute and an
            `add(date, obs, update=False)` method.
    editor: dictionary with the keys
            'monthly_edits'  - nested mapping indexed by str(year) and then
                               str(month), holding an edit count,
            'new_wikipedian' - date/datetime with .year and .month,
            'final_edit'     - date/datetime of the same kind.
    kwargs: accepted for signature compatibility; not used here.

    Editors whose final edit is less than 366 days after 'new_wikipedian'
    are skipped. For everybody else a dictionary {0: bool, ..., 11: bool}
    is handed to var.add, dated at the first day of the last observed month.
    The function always returns var.
    '''
    monthly = editor['monthly_edits']
    new_wikipedian = editor['new_wikipedian']
    final_edit = editor['final_edit']
    if (final_edit - new_wikipedian).days < 366:
        return var

    obs = {}
    date = None
    first_month = new_wikipedian.month - 1  # zero-based month of the start
    for m in range(12):
        # Walk the calendar month by month and roll over into January of
        # the next year. Restarting every year at new_wikipedian.month (as
        # the loop used to do) looked at the wrong months and, for editors
        # starting in August or later, never collected 12 observations so
        # nothing was stored for them.
        offset = first_month + m
        year = new_wikipedian.year + offset // 12
        month = offset % 12 + 1
        # A month without an entry is counted as zero edits.
        n = monthly.get(str(year), {}).get(str(month), 0)
        obs[m] = n >= var.cutoff
        date = datetime.datetime(year, month, 1)

    var.add(date, obs, update=False)
    return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
___________________________________________________________________
Added: svn:eol-style
149 + native
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -55,7 +55,7 @@
5656 fn = '%s_%s.csv' % (dbname, func.func_name)
5757
5858 print 'Storing dataset: %s' % os.path.join(settings.dataset_location, fn)
59 - ds.write(format='csv')
 59+ #ds.write(format='csv')
6060
6161 print 'Serializing dataset to %s_%s' % (dbname, 'charts')
6262 log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
@@ -168,14 +168,14 @@
169169
170170 if __name__ == '__main__':
171171 generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
172 - #generate_chart_data('wiki', 'editors_dataset','en', 'total_number_of_new_wikipedians', time_unit='year')
173 - #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
174 - #generate_chart_data('wiki', 'editors_dataset','en', 'total_cumulative_edits', time_unit='year')
175 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
176 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
177 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
178 - #generate_chart_data('wiki', 'editors_dataset','en', 'histogram_edits', time_unit='year', cutoff=0)
179 - #generate_chart_data('wiki', 'editors_dataset','en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
180 - #generate_chart_data('wiki', 'editors_dataset','en', 'new_editor_count', time_unit='month', cutoff=0)
 172+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year')
 173+ generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
 174+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year')
 175+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
 176+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
 177+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
 178+ generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0)
 179+ generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
 180+ generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0)
181181
182182 #available_analyses()
Index: trunk/tools/editor_trends/database/cache.py
@@ -76,8 +76,8 @@
7777
7878 def insert(self, editor, values, username):
7979 '''
80 - Adding the safe=True statement slows down the insert process but this assures that all data
81 - will be written.
 80+ Adding the safe=True statement slows down the insert process but this
 81+ assures that all data will be written.
8282 '''
8383 try:
8484 self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)

Status & tagging log