Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -1,171 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-02' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -from multiprocessing import Queue |
22 | | -from Queue import Empty |
23 | | -from operator import itemgetter |
24 | | -import datetime |
25 | | - |
26 | | -import settings |
27 | | -from database import db |
28 | | -from utils import process_constructor as pc |
29 | | -from utils import utils |
30 | | -import construct_datasets |
31 | | - |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -def create_datacontainer(init_value=0): |
41 | | - ''' |
42 | | - This function initializes a dictionary keyed by year (from 2001 through |
43 | | - the current year) with every value set to @init_value. In most cases this |
44 | | - will be zero, so the dictionary acts as a running tally for a variable, |
45 | | - but @init_value can also be a list [], a dictionary {} or a set set(). |
46 | | - ''' |
47 | | - data = {} |
48 | | - year = datetime.datetime.now().year + 1 |
49 | | - for x in xrange(2001, year): |
50 | | - data[str(x)] = init_value |
51 | | - return data |
52 | | - |
53 | | - |
54 | | -def add_months_to_datacontainer(datacontainer): |
55 | | - for dc in datacontainer: |
56 | | - datacontainer[dc] = {} |
57 | | - for x in xrange(1, 13): |
58 | | - datacontainer[dc][str(x)] = 0 |
59 | | - return datacontainer |
60 | | - |
61 | | - |
62 | | -def determine_edits_by_month(edits): |
63 | | - datacontainer = create_datacontainer(init_value=0) |
64 | | - datacontainer = add_months_to_datacontainer(datacontainer) |
65 | | - for year in edits: |
66 | | - months = set() |
67 | | - for edit in edits[year]: |
68 | | - m = str(edit['date'].month) |
69 | | - if m not in months: |
70 | | - datacontainer[year][m] = 1 |
71 | | - months.add(m) |
72 | | - if len(months) == 12: |
73 | | - break |
74 | | - return datacontainer |
75 | | - |
76 | | - |
77 | | -def determine_edits_by_year(dates): |
78 | | - ''' |
79 | | - This function counts the number of edits by year made by a particular editor. |
80 | | - ''' |
81 | | - edits = create_datacontainer() |
82 | | - for date in dates: |
83 | | - year = str(date['date'].year) |
84 | | - edits[year] += 1 |
85 | | - return edits |
86 | | - |
87 | | - |
88 | | -def determine_articles_by_year(dates): |
89 | | - ''' |
90 | | - This function counts the number of unique articles by year edited by a |
91 | | - particular editor. |
92 | | - ''' |
93 | | - articles = create_datacontainer(set()) |
94 | | - for date in dates: |
95 | | - year = str(date['date'].year) |
96 | | - articles[year].add(date['article']) |
97 | | - for article in articles: |
98 | | - articles[article] = len(articles[article]) |
99 | | - return articles |
100 | | - |
101 | | - |
102 | | -def sort_edits(edits): |
103 | | - edits = utils.merge_list(edits) |
104 | | - return sorted(edits, key=itemgetter('date')) |
105 | | - |
106 | | - |
107 | | -def optimize_editors(input_queue, result_queue, pbar, **kwargs): |
108 | | - dbname = kwargs.pop('dbname') |
109 | | - mongo = db.init_mongo_db(dbname) |
110 | | - input = mongo['test'] |
111 | | - output = mongo['dataset'] |
112 | | - output.ensure_index('editor') |
113 | | - output.ensure_index('year_joined') |
114 | | - definition = kwargs.pop('definition') |
115 | | - while True: |
116 | | - try: |
117 | | - id = input_queue.get(block=False) |
118 | | - editor = input.find_one({'editor': id}) |
119 | | - if editor == None: |
120 | | - continue |
121 | | - edits = editor['edits'] |
122 | | - monthly_edits = determine_edits_by_month(edits) |
123 | | - edits = sort_edits(edits) |
124 | | - edit_count = len(edits) |
125 | | - new_wikipedian = edits[9]['date'] |
126 | | - first_edit = edits[0]['date'] |
127 | | - final_edit = edits[-1]['date'] |
128 | | - edits_by_year = determine_edits_by_year(edits) |
129 | | - articles_by_year = determine_articles_by_year(edits) |
130 | | - |
131 | | - edits = edits[:10] |
132 | | - |
133 | | - output.insert({'editor': id, 'edits': edits, |
134 | | - 'edits_by_year': edits_by_year, |
135 | | - 'new_wikipedian': new_wikipedian, |
136 | | - 'edit_count': edit_count, |
137 | | - 'final_edit': final_edit, |
138 | | - 'first_edit': first_edit, |
139 | | - 'articles_by_year': articles_by_year, |
140 | | - 'monthly_edits': monthly_edits}) |
141 | | - print 'Items left: %s' % input_queue.qsize() |
142 | | - except Empty: |
143 | | - break |
144 | | - |
145 | | - |
146 | | -def run_optimize_editors(dbname): |
147 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
148 | | - kwargs = {'definition': 'traditional', |
149 | | - 'pbar': True, |
150 | | - 'dbname': 'enwiki', |
151 | | - 'nr_input_processors': 1, |
152 | | - 'nr_output_processors': 0, |
153 | | - 'poison_pill': False |
154 | | - } |
155 | | - print len(ids) |
156 | | - ids = list(ids) |
157 | | - chunks = {0: ids} |
158 | | - pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) |
159 | | - |
160 | | - |
161 | | -def debug_optimize_editors(dbname): |
162 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
163 | | - q = pc.load_queue(ids) |
164 | | - kwargs = {'definition': 'traditional', |
165 | | - 'dbname': dbname |
166 | | - } |
167 | | - optimize_editors(q, False, True, **kwargs) |
168 | | - |
169 | | - |
170 | | -if __name__ == '__main__': |
171 | | - #debug_optimize_editors('test') |
172 | | - run_optimize_editors('enwiki') |
\ No newline at end of file |
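For reference, the per-editor aggregation performed by optimize_editors() above boils down to a handful of steps: sort the edits by date, take the date of the 10th edit as the 'new wikipedian' cut-off, and tally edits and distinct articles per year. The following standalone Python sketch is not part of the removed module; it assumes a non-empty, flat list of edit dicts with 'date' and 'article' keys rather than the year-keyed structure coming out of the editor cache.

def summarize_editor(edits):
    '''Derive the summary fields that optimize_editors() stores per editor.'''
    edits = sorted(edits, key=lambda e: e['date'])
    edits_by_year, articles_by_year = {}, {}
    for edit in edits:
        year = str(edit['date'].year)
        edits_by_year[year] = edits_by_year.get(year, 0) + 1
        articles_by_year.setdefault(year, set()).add(edit['article'])
    return {'edit_count': len(edits),
            'first_edit': edits[0]['date'],
            'final_edit': edits[-1]['date'],
            # the 10th edit marks the transition to 'new wikipedian'
            'new_wikipedian': edits[9]['date'] if len(edits) >= 10 else None,
            'edits_by_year': edits_by_year,
            'articles_by_year': dict((y, len(a)) for y, a in articles_by_year.items())}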
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -1,341 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -#Default Python libraries (Python => 2.6) |
22 | | -import sys |
23 | | -import os |
24 | | -import time |
25 | | -import datetime |
26 | | -import codecs |
27 | | -import math |
28 | | -import cStringIO |
29 | | -import re |
30 | | -from operator import itemgetter |
31 | | -import xml.etree.cElementTree as cElementTree |
32 | | -from multiprocessing import Queue, JoinableQueue |
33 | | -from Queue import Empty |
34 | | -import pymongo |
35 | | - |
36 | | -# Custom written files |
37 | | -import settings |
38 | | -from utils import utils, models |
39 | | -from database import db_settings |
40 | | -from database import db |
41 | | -from database import cache |
42 | | -from wikitree import xml |
43 | | -from statistics import dataset |
44 | | -from utils import process_constructor as pc |
45 | | - |
46 | | - |
47 | | -try: |
48 | | - import psyco |
49 | | - psyco.full() |
50 | | -except ImportError: |
51 | | - pass |
52 | | - |
53 | | - |
54 | | -def determine_username_is_bot(username, kwargs): |
55 | | - ''' |
56 | | - @username is the xml element containing the id of the user |
57 | | - @kwargs should have a list with all the bot ids |
58 | | - |
59 | | - @Return 1 if the username id is in the list of bot ids, 0 if it is |
60 | | - not. |
61 | | - ''' |
62 | | - ids = kwargs.get('bots', []) |
63 | | - if ids == None: |
64 | | - ids = [] |
65 | | - if username != None and username.text != None: |
66 | | - id = username.text |
67 | | - if id in ids: |
68 | | - return 1 |
69 | | - else: |
70 | | - return 0 |
71 | | - |
72 | | - |
73 | | -def extract_contributor_id(contributor, kwargs): |
74 | | - ''' |
75 | | - @contributor is the xml contributor node containing a number of attributes |
76 | | - |
77 | | - Currently, we are only interested in registered contributors, hence we |
78 | | - ignore anonymous editors. If you are interested in collecting data on |
79 | | - anonymous editors then add the string 'ip' to the tags variable. |
80 | | - ''' |
81 | | - tags = ['id'] |
82 | | - if contributor.get('deleted'): |
83 | | - return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
84 | | - for elem in contributor: |
85 | | - if elem.tag in tags: |
86 | | - if elem.text != None: |
87 | | - return elem.text.decode('utf-8') |
88 | | - else: |
89 | | - return - 1 |
90 | | - |
91 | | - |
92 | | -def output_editor_information(elem, output, **kwargs): |
93 | | - ''' |
94 | | - @elem is an XML element containing 1 revision from a page |
95 | | - @output is where to store the data, either a queue or a filehandle |
96 | | - @**kwargs contains extra information |
97 | | - |
98 | | - The variable tags determines which attributes are being parsed; the values in |
99 | | - this dictionary are the functions used to extract the data. |
100 | | - ''' |
101 | | - tags = {'contributor': {'editor': extract_contributor_id, |
102 | | - 'bot': determine_username_is_bot}, |
103 | | - 'timestamp': {'date': xml.extract_text}, |
104 | | - } |
105 | | - vars = {} |
106 | | - headers = ['editor', 'date', 'article'] |
107 | | - destination = kwargs.pop('destination') |
108 | | - revisions = elem.findall('revision') |
109 | | - for revision in revisions: |
110 | | - vars['article'] = elem.find('id').text.decode(settings.ENCODING) |
111 | | - elements = revision.getchildren() |
112 | | - for tag, functions in tags.iteritems(): |
113 | | - xml_node = xml.retrieve_xml_node(elements, tag) |
114 | | - for var, function in functions.iteritems(): |
115 | | - vars[var] = function(xml_node, kwargs) |
116 | | - |
117 | | - #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
118 | | - if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
119 | | - vars.pop('bot') |
120 | | - if destination == 'queue': |
121 | | - output.put(vars) |
122 | | - vars['date'] = utils.convert_timestamp_to_date(vars['date']) |
123 | | - elif destination == 'file': |
124 | | - data = [] |
125 | | - for head in headers: |
126 | | - data.append(vars[head]) |
127 | | - utils.write_list_to_csv(data, output) |
128 | | - vars = {} |
129 | | - |
130 | | - |
131 | | -def parse_editors(xml_queue, data_queue, **kwargs): |
132 | | - ''' |
133 | | - @xml_queue contains the filenames of the files to be parsed |
134 | | - @data_queue is an instance of Queue where the extracted data is stored for |
135 | | - further processing |
136 | | - @pbar is an instance of progressbar to display the progress |
137 | | - @bots is a list of ids of known Wikipedia bots |
138 | | - @debug is a flag to indicate whether the function is called for debugging. |
139 | | - |
140 | | - Output is the data_queue that will be used by store_editors() |
141 | | - ''' |
142 | | - input = kwargs.get('input', None) |
143 | | - output = kwargs.get('output', None) |
144 | | - debug = kwargs.get('debug', False) |
145 | | - destination = kwargs.get('destination', 'file') |
146 | | - bots = kwargs.get('bots', None) |
147 | | - pbar = kwargs.get('pbar', None) |
148 | | - if settings.DEBUG: |
149 | | - messages = {} |
150 | | - vars = {} |
151 | | - |
152 | | - while True: |
153 | | - try: |
154 | | - if debug: |
155 | | - file = xml_queue |
156 | | - else: |
157 | | - file = xml_queue.get(block=False) |
158 | | - if file == None: |
159 | | - print 'Swallowed a poison pill' |
160 | | - break |
161 | | - |
162 | | - data = xml.read_input(utils.create_txt_filehandle(input, |
163 | | - file, 'r', |
164 | | - encoding=settings.ENCODING)) |
165 | | - if destination == 'file': |
166 | | - name = file[:-4] + '.txt' |
167 | | - fh = utils.create_txt_filehandle(output, name, 'w', settings.ENCODING) |
168 | | - for raw_data in data: |
169 | | - xml_buffer = cStringIO.StringIO() |
170 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
171 | | - |
172 | | - try: |
173 | | - raw_data = ''.join(raw_data) |
174 | | - xml_buffer.write(raw_data) |
175 | | - elem = cElementTree.XML(xml_buffer.getvalue()) |
176 | | - output_editor_information(elem, fh, bots=bots, destination=destination) |
177 | | - except SyntaxError, error: |
178 | | - print error |
179 | | - ''' |
180 | | - There are few cases with invalid tokens, they are fixed |
181 | | - here and then reinserted into the XML DOM |
182 | | - data = convert_html_entities(xml_buffer.getvalue()) |
183 | | - elem = cElementTree.XML(data) |
184 | | - output_editor_information(elem) |
185 | | - ''' |
186 | | - if settings.DEBUG: |
187 | | - utils.track_errors(xml_buffer, error, file, messages) |
188 | | - except UnicodeEncodeError, error: |
189 | | - print error |
190 | | - if settings.DEBUG: |
191 | | - utils.track_errors(xml_buffer, error, file, messages) |
192 | | - except MemoryError, error: |
193 | | - print file, error |
194 | | - print raw_data[:12] |
195 | | - print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
196 | | - if destination == 'queue': |
197 | | - output.put('NEXT') |
198 | | - while True: |
199 | | - if output.qsize() < 100000: |
200 | | - break |
201 | | - else: |
202 | | - time.sleep(10) |
203 | | - print 'Still sleeping, queue is %s items long' % output.qsize() |
204 | | - |
205 | | - else: |
206 | | - fh.close() |
207 | | - |
208 | | - if pbar: |
209 | | - print file, xml_queue.qsize() |
210 | | - #utils.update_progressbar(pbar, xml_queue) |
211 | | - |
212 | | - if debug: |
213 | | - break |
214 | | - |
215 | | - except Empty: |
216 | | - break |
217 | | - |
218 | | - if destination == 'queue': |
219 | | - data_queue.put(None) |
220 | | - |
221 | | - if settings.DEBUG: |
222 | | - utils.report_error_messages(messages, parse_editors) |
223 | | - |
224 | | - |
225 | | -def store_editors(data_queue, **kwargs): |
226 | | - ''' |
227 | | - @data_queue is an instance of Queue containing information extracted by |
228 | | - parse_editors() |
229 | | - @pids is a list of PIDs used to check if other processes are finished |
230 | | - running |
231 | | - @dbname is the name of the MongoDB collection where to store the information. |
232 | | - ''' |
233 | | - dbname = kwargs.get('dbname', None) |
234 | | - mongo = db.init_mongo_db(dbname) |
235 | | - collection = mongo['editors'] |
236 | | - collection.ensure_index('editor') |
237 | | - editor_cache = cache.EditorCache(collection) |
238 | | - |
239 | | - while True: |
240 | | - try: |
241 | | - edit = data_queue.get(block=False) |
242 | | - data_queue.task_done() |
243 | | - if edit == None: |
244 | | - print 'Swallowing poison pill' |
245 | | - break |
246 | | - elif edit == 'NEXT': |
247 | | - editor_cache.add('NEXT', '') |
248 | | - else: |
249 | | - contributor = edit['editor'] |
250 | | - value = {'date': edit['date'], 'article': edit['article']} |
251 | | - editor_cache.add(contributor, value) |
252 | | - #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
253 | | - #'$inc': {'edit_count': 1}, |
254 | | - |
255 | | - except Empty: |
256 | | - ''' |
257 | | - This checks whether the Queue is empty because the preprocessors are |
258 | | - finished or because this function is faster in emptying the Queue |
259 | | - than the preprocessors are able to fill it. If the preprocessors |
260 | | - are finished and this Queue is empty then break, else wait for the |
261 | | - Queue to fill. |
262 | | - ''' |
263 | | - pass |
264 | | - |
265 | | - print 'Emptying entire cache.' |
266 | | - editor_cache.store() |
267 | | - print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
268 | | - |
269 | | - |
270 | | -def load_cache_objects(): |
271 | | - cache = {} |
272 | | - files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin') |
273 | | - for x, file in enumerate(files): |
274 | | - cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file) |
275 | | - return cache |
276 | | - |
277 | | - |
278 | | -def search_cache_for_missed_editors(dbname): |
279 | | - mongo = db.init_mongo_db(dbname) |
280 | | - collection = mongo['editors'] |
281 | | - editor_cache = cache.EditorCache(collection) |
282 | | - cache = load_cache_objects() |
283 | | - for c in cache: |
284 | | - for editor in cache[c]: |
285 | | - editor_cache.add(editor, cache[c][editor]) |
286 | | - cache[c] = {} |
287 | | - editor_cache.add('NEXT', '') |
288 | | - cache = {} |
289 | | - |
290 | | - |
291 | | - |
292 | | -def load_bot_ids(): |
293 | | - ''' |
294 | | - Loader function to retrieve the list of ids of known Wikipedia bots. |
295 | | - ''' |
296 | | - ids = {} |
297 | | - mongo = db.init_mongo_db('bots') |
298 | | - bots = mongo['ids'] |
299 | | - cursor = bots.find() |
300 | | - for bot in cursor: |
301 | | - ids[bot['id']] = bot['name'] |
302 | | - return ids |
303 | | - |
304 | | - |
305 | | -def run_parse_editors(location, language, project): |
306 | | - ids = load_bot_ids() |
307 | | - input = os.path.join(location, language, project) |
308 | | - output = os.path.join(input, 'txt') |
309 | | - |
310 | | - kwargs = {'bots': ids, |
311 | | - 'dbname': language + project, |
312 | | - 'language': language, |
313 | | - 'project': project, |
314 | | - 'pbar': True, |
315 | | - 'destination': 'file', |
316 | | - 'nr_input_processors': settings.NUMBER_OF_PROCESSES, |
317 | | - 'nr_output_processors': settings.NUMBER_OF_PROCESSES, |
318 | | - 'input': input, |
319 | | - 'output': output, |
320 | | - } |
321 | | - source = os.path.join(location, language, project) |
322 | | - files = utils.retrieve_file_list(source, 'xml') |
323 | | - |
324 | | - if not os.path.exists(input): |
325 | | - utils.create_directory(input) |
326 | | - if not os.path.exists(output): |
327 | | - utils.create_directory(output) |
328 | | - |
329 | | - chunks = utils.split_list(files , settings.NUMBER_OF_PROCESSES) |
330 | | - pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
331 | | - |
332 | | - |
333 | | -def debug_parse_editors(dbname): |
334 | | - q = JoinableQueue() |
335 | | - parse_editors('522.xml', q, debug=True, destination='file') |
336 | | - store_editors(q, dbname=dbname) |
337 | | - |
338 | | - |
339 | | -if __name__ == "__main__": |
340 | | - #debug_parse_editors('test2') |
341 | | - run_parse_editors(settings.XML_FILE_LOCATION, 'en', 'wiki') |
342 | | - pass |
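The core of output_editor_information() is a walk over the revisions of one <page> element, keeping only registered, non-bot contributors. Below is a self-contained sketch of that walk; it is simplified and not project code, and the inline PAGE fragment is made up purely for illustration.

import xml.etree.cElementTree as cElementTree

PAGE = ('<page><id>12</id>'
        '<revision><timestamp>2001-01-21T02:12:21Z</timestamp>'
        '<contributor><username>Example</username><id>99</id></contributor>'
        '</revision></page>')

def extract_revisions(fragment, bot_ids):
    '''Yield {article, editor, date} dicts, skipping bots and deleted accounts.'''
    page = cElementTree.XML(fragment)
    article = page.find('id').text
    for revision in page.findall('revision'):
        contributor = revision.find('contributor')
        if contributor is None or contributor.get('deleted'):
            continue
        editor = contributor.find('id')
        if editor is None or editor.text in bot_ids:
            continue
        yield {'article': article,
               'editor': editor.text,
               'date': revision.find('timestamp').text}

for row in extract_revisions(PAGE, bot_ids=set(['42'])):
    print row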
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -1,158 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -''' |
22 | | -This file contains settings that are used for constructing and analyzing |
23 | | -the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
24 | | -''' |
25 | | - |
26 | | -from multiprocessing import cpu_count |
27 | | -import os |
28 | | -import sys |
29 | | -import platform |
30 | | - |
31 | | -try: |
32 | | - from pywin import win32file |
33 | | - '''increase the maximum number of open files on Windows to 1024''' |
34 | | - win32file._setmaxstdio(1024) |
35 | | -except ImportError: |
36 | | - pass |
37 | | - |
38 | | -try: |
39 | | - import resource |
40 | | -except ImportError: |
41 | | - pass |
42 | | - |
43 | | -#Setting up the environment |
44 | | -ops = {platform.win32_ver: 'Windows', |
45 | | - platform.linux_distribution: 'Linux', |
46 | | - platform.mac_ver: 'OSX'} |
47 | | - |
48 | | -for op in ops: |
49 | | - if op() != ('', '', '') and op() != ('', ('', '', ''), ''): |
50 | | - OS = ops[op] |
51 | | - |
52 | | -ARCH = platform.machine() |
53 | | - |
54 | | -WORKING_DIRECTORY = os.getcwd() |
55 | | -IGNORE_DIRS = ['wikistats', 'zips'] |
56 | | -ROOT = '/' if OS != 'Windows' else 'c:\\' |
57 | | - |
58 | | -MINIMUM_PYTHON_VERSION = (2, 6) |
59 | | - |
60 | | -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
61 | | - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
62 | | -for subdirname in dirs: |
63 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
64 | | - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
65 | | - |
66 | | -WINDOWS_ZIP = ['7z.exe'] |
67 | | - |
68 | | -OSX_ZIP = [] |
69 | | - |
70 | | -LINUX_ZIP = [] |
71 | | -#General settings |
72 | | - |
73 | | -# Valid values are 'stand_alone' and 'hadoop' |
74 | | -RUN_MODE = 'stand_alone' |
75 | | - |
76 | | -# If true then some more detailed debug information is collected |
77 | | -DEBUG = True |
78 | | - |
79 | | -#If True then it will display a progress bar on the console. |
80 | | -PROGRESS_BAR = True |
81 | | - |
82 | | -#Date format as used by Erik Zachte |
83 | | -DATE_FORMAT = '%Y-%m-%d' |
84 | | - |
85 | | -# Timestamp format as generated by the MediaWiki dumps |
86 | | -DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ' |
87 | | - |
88 | | -#This section contains configuration variables for the different file locations. |
89 | | - |
90 | | -# Location where to write xml chunks |
91 | | -XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia') |
92 | | - |
93 | | -# Input file |
94 | | -XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml') |
95 | | - |
96 | | -# This is the place where error messages are stored for debugging purposes |
97 | | -ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors') |
98 | | - |
99 | | -DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database') |
100 | | - |
101 | | -BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects') |
102 | | - |
103 | | -DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets') |
104 | | - |
105 | | -TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv') |
106 | | - |
107 | | -NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces') |
108 | | -#This section contains configuration variables for parsing / encoding and |
109 | | -#working with the XML files. |
110 | | - |
111 | | -# ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason |
112 | | -MAX_XML_FILE_SIZE = 67108864 |
113 | | - |
114 | | -if OS == 'Windows' and ARCH == 'i386': |
115 | | - MAX_FILES_OPEN = win32file._getmaxstdio() |
116 | | -elif OS != 'Windows': |
117 | | - MAX_FILES_OPEN = resource.getrlimit(resource.RLIMIT_NOFILE)[0] |
118 | | -else: |
119 | | - MAX_FILES_OPEN = 500 |
120 | | - |
121 | | -ENCODING = 'utf-8' |
122 | | - |
123 | | -# Name space, do not change as this works for Mediawiki wikis |
124 | | -NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
125 | | - |
126 | | -WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip', |
127 | | - } |
128 | | - |
129 | | -COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z'] |
130 | | - |
131 | | - |
132 | | -WIKIMEDIA_PROJECTS = {'commons': 'commonswiki', |
133 | | - 'wikibooks': 'wikibooks', |
134 | | - 'wikinews': 'wikinews', |
135 | | - 'wikiquote': 'wikiquote', |
136 | | - 'wikisource': 'wikisource', |
137 | | - 'wikiversity': 'wikiversity', |
138 | | - 'wiktionary': 'wiktionary', |
139 | | - 'metawiki': 'metawiki', |
140 | | - 'wikispecies': 'specieswiki', |
141 | | - 'incubator': 'incubatorwiki', |
142 | | - 'foundation': 'foundationwiki', |
143 | | - 'mediawiki': 'mediawikiwiki', |
144 | | - 'outreach': 'outreachwiki', |
145 | | - 'strategic planning': 'strategywiki', |
146 | | - 'usability initiative': 'usabilitywiki', |
147 | | - 'multilingual wikisource': None |
148 | | - } |
149 | | - |
150 | | -#Multiprocess settings used to parallelize workload |
151 | | -#Change this to match your computers configuration (RAM / CPU) |
152 | | -NUMBER_OF_PROCESSES = cpu_count() * 1 |
153 | | - |
154 | | -#Extensions of ascii files, this is used to determine the filemode to use |
155 | | -ASCII = ['txt', 'csv', 'xml', 'sql', 'json'] |
156 | | - |
157 | | -WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
158 | | - |
159 | | -MAX_CACHE_SIZE = 1024 * 1024 |
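The OS detection block above infers the platform from whichever platform.*_ver() call returns a non-empty result. A more direct equivalent, shown here only as a sketch and not as a drop-in replacement, maps platform.system() onto the same labels:

import platform

SYSTEMS = {'Windows': 'Windows', 'Linux': 'Linux', 'Darwin': 'OSX'}
OS = SYSTEMS.get(platform.system(), 'Unknown')
print OS, platform.machine()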
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -1,186 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import xml.etree.cElementTree as cElementTree |
22 | | -import codecs |
23 | | -import utils |
24 | | -import re |
25 | | -import json |
26 | | -import os |
27 | | - |
28 | | -import progressbar |
29 | | - |
30 | | -from utils import utils |
31 | | -import settings |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
41 | | - |
42 | | - |
43 | | -def remove_numeric_character_references(text): |
44 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
45 | | - |
46 | | - |
47 | | -def lenient_deccharref(m): |
48 | | - try: |
49 | | - return unichr(int(m.group(1))) |
50 | | - except ValueError: |
51 | | - ''' |
52 | | - There are a few articles that raise a Value Error here, the reason is |
53 | | - that I am using a narrow Python build (UCS2) instead of a wide build |
54 | | - (UCS4). The quick fix is to return an empty string... |
55 | | - Real solution is to rebuild Python with UCS4 support..... |
56 | | - ''' |
57 | | - return '' |
58 | | - |
59 | | - |
60 | | -def remove_namespace(element, namespace): |
61 | | - '''Remove namespace from the XML document.''' |
62 | | - ns = u'{%s}' % namespace |
63 | | - nsl = len(ns) |
64 | | - for elem in element.getiterator(): |
65 | | - if elem.tag.startswith(ns): |
66 | | - elem.tag = elem.tag[nsl:] |
67 | | - return element |
68 | | - |
69 | | - |
70 | | -def load_namespace(language): |
71 | | - file = '%s_ns.json' % language |
72 | | - fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING) |
73 | | - ns = json.load(fh) |
74 | | - fh.close() |
75 | | - ns = ns['query']['namespaces'] |
76 | | - return ns |
77 | | - |
78 | | - |
79 | | -def build_namespaces_locale(namespaces): |
80 | | - ''' |
81 | | - Construct a list of all the non-main namespaces |
82 | | - ''' |
83 | | - ns = [] |
84 | | - for namespace in namespaces: |
85 | | - value = namespaces[namespace].get(u'*', None) |
86 | | - if value != None and value != '': |
87 | | - ns.append(value) |
88 | | - return ns |
89 | | - |
90 | | - |
91 | | -def parse_comments(xml, function): |
92 | | - revisions = xml.findall('revision') |
93 | | - for revision in revisions: |
94 | | - comment = revision.find('comment') |
95 | | - timestamp = revision.find('timestamp').text |
96 | | - if comment != None and comment.text != None: |
97 | | - comment.text = function(comment.text) |
98 | | - return xml |
99 | | - |
100 | | - |
101 | | -def is_article_main_namespace(elem, namespace): |
102 | | - ''' |
103 | | - checks whether the article belongs to the main namespace |
104 | | - ''' |
105 | | - title = elem.find('title').text |
106 | | - for ns in namespace: |
107 | | - if title.startswith(ns): |
108 | | - return False |
109 | | - return True |
110 | | - |
111 | | - |
112 | | -def write_xml_file(element, fh, counter, language): |
113 | | - '''Get file handle and write xml element to file''' |
114 | | - size = len(cElementTree.tostring(element)) |
115 | | - fh, counter = create_xml_file_handle(fh, counter, size, language) |
116 | | - try: |
117 | | - fh.write(cElementTree.tostring(element)) |
118 | | - except MemoryError: |
119 | | - print 'Add error capturing logic' |
120 | | - fh.write('\n') |
121 | | - return fh, counter |
122 | | - |
123 | | - |
124 | | -def create_xml_file_handle(fh, counter, size, language): |
125 | | - '''Create file handle if none is supplied or if file size > max file size.''' |
126 | | - if not counter: |
127 | | - counter = 0 |
128 | | - path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter) |
129 | | - if not fh: |
130 | | - fh = codecs.open(path, 'w', encoding=settings.ENCODING) |
131 | | - return fh, counter |
132 | | - elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
133 | | - print 'Created chunk %s' % counter |
134 | | - fh.close() |
135 | | - counter += 1 |
136 | | - fh = codecs.open(os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter), 'w', encoding=settings.ENCODING) |
137 | | - return fh, counter |
138 | | - else: |
139 | | - return fh, counter |
140 | | - |
141 | | - |
142 | | -def split_xml(location, filename, project, language_code): |
143 | | - '''Reads xml file and splits it in N chunks''' |
144 | | - #location = os.path.join(settings.XML_FILE_LOCATION, language) |
145 | | - result = utils.check_file_exists(location, '') |
146 | | - if result == False: |
147 | | - result = utils.create_directory(location) |
148 | | - if not result: |
149 | | - return |
150 | | - |
151 | | - ns = load_namespace(language_code) |
152 | | - ns = build_namespaces_locale(ns) |
153 | | - |
154 | | - fh = None |
155 | | - counter = None |
156 | | - source = os.path.join(location, filename) |
157 | | - tag = '{%s}page' % settings.NAME_SPACE |
158 | | - |
159 | | - context = cElementTree.iterparse(source, events=('start', 'end')) |
160 | | - context = iter(context) |
161 | | - event, root = context.next() #get the root element of the XML doc |
162 | | - |
163 | | - try: |
164 | | - for event, elem in context: |
165 | | - if event == 'end': |
166 | | - if elem.tag == tag: |
167 | | - elem = remove_namespace(elem, settings.NAME_SPACE) |
168 | | - if is_article_main_namespace(elem, ns): |
169 | | - elem = parse_comments(elem, remove_numeric_character_references) |
170 | | - fh, counter = write_xml_file(elem, fh, counter, language_code) |
171 | | - root.clear() # when done parsing a section clear the tree to safe memory |
172 | | - #elem = parse_comments(elem, convert_html_entities) |
173 | | - #elem = parse_comments(elem, remove_ascii_control_characters) |
174 | | - #print cElementTree.tostring(elem) |
175 | | - except SyntaxError: |
176 | | - fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
177 | | - fh.write(cElementTree.tostring(elem)) |
178 | | - fh.close() |
179 | | - |
180 | | - |
181 | | -if __name__ == "__main__": |
182 | | - kwargs = {'location': 'c:\\Source_files\\', |
183 | | - 'filename': settings.XML_FILE, |
184 | | - 'project':'wiki', |
185 | | - 'language_code':'en' |
186 | | - } |
187 | | - split_xml(**kwargs) |
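split_xml() relies on cElementTree.iterparse to stream over <page> elements and clears the root element after each page so the full stub dump never has to fit in memory. A minimal sketch of that pattern follows; it is not project code, and the namespace default simply mirrors the NAME_SPACE setting above.

import xml.etree.cElementTree as cElementTree

MEDIAWIKI_NS = 'http://www.mediawiki.org/xml/export-0.4/'

def iterate_pages(source, namespace=MEDIAWIKI_NS):
    '''Stream <page> elements from a dump without loading the whole file.'''
    tag = '{%s}page' % namespace
    context = iter(cElementTree.iterparse(source, events=('start', 'end')))
    event, root = context.next()        # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear()                # free memory once a page is handled

# usage: for page in iterate_pages('enwiki-20100916-stub-meta-history.xml'): ...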
Index: trunk/tools/editor_trends/init_bot_db.py |
— | — | @@ -1,196 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | - |
18 | | -import os |
19 | | -import cStringIO |
20 | | -import xml.etree.cElementTree as cElementTree |
21 | | - |
22 | | - |
23 | | -import settings |
24 | | -from wikitree import xml |
25 | | -from database import db |
26 | | -from database import db_settings |
27 | | -from utils import utils |
28 | | -from utils import process_constructor as pc |
29 | | - |
30 | | -try: |
31 | | - import psyco |
32 | | - psyco.full() |
33 | | -except ImportError: |
34 | | - pass |
35 | | - |
36 | | - |
37 | | -def create_bot_ids_db_mongo(): |
38 | | - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING) |
39 | | - mongo = db.init_mongo_db('bots') |
40 | | - collection = mongo['ids'] |
41 | | - |
42 | | - db.remove_documents_from_mongo_db(collection, None) |
43 | | - |
44 | | - for id, name in ids.iteritems(): |
45 | | - collection.insert({'id': id, 'name': name}) |
46 | | - |
47 | | - print collection.count() |
48 | | - |
49 | | - |
50 | | -def create_bots_db(db_name): |
51 | | - ''' |
52 | | - This function reads the csv file provided by Erik Zachte and constructs a |
53 | | - sqlite memory database. The reason for this is that I suspect I will need |
54 | | - some simple querying capabilities in the future, else a dictionary would |
55 | | - suffice. |
56 | | - ''' |
57 | | - connection = db.init_database(db_name) |
58 | | - #connection = db.init_database('data/database/bots.db') |
59 | | - cursor = connection.cursor() |
60 | | - db.create_tables(cursor, db_settings.BOT_TABLE) |
61 | | - values = [] |
62 | | - fields = [field[0] for field in db_settings.BOT_TABLE['bots']] |
63 | | - for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING): |
64 | | - line = line.split(',') |
65 | | - row = [] |
66 | | - for x, (field, value) in enumerate(zip(fields, line)): |
67 | | - if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER': |
68 | | - value = int(value) |
69 | | - elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT': |
70 | | - value = value.replace('/', '-') |
71 | | - #print field, value |
72 | | - row.append(value) |
73 | | - values.append(row) |
74 | | - |
75 | | - cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values) |
76 | | - connection.commit() |
77 | | - if db_name == ':memory': |
78 | | - return cursor |
79 | | - else: |
80 | | - connection.close() |
81 | | - |
82 | | - |
83 | | -def retrieve_botnames_without_id(cursor, language): |
84 | | - return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall() |
85 | | - |
86 | | - |
87 | | -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False): |
88 | | - ''' |
89 | | - This function is used to find the ids belonging to the different bots that |
90 | | - are patrolling the Wikipedia sites. |
91 | | - @input_queue contains a list of xml files to parse |
92 | | - |
93 | | - @result_queue should be set to false as the results are directly written to |
94 | | - a csv file. |
95 | | - |
96 | | - @progressbar depends on settings |
97 | | - |
98 | | - @bots is a dictionary containing the names of the bots to lookup |
99 | | - ''' |
100 | | - |
101 | | - #if len(bots.keys()) == 1: |
102 | | - bots = bots['bots'] |
103 | | - #print bots.keys() |
104 | | - |
105 | | - if settings.DEBUG: |
106 | | - messages = {} |
107 | | - |
108 | | - while True: |
109 | | - if debug: |
110 | | - file = input_queue |
111 | | - else: |
112 | | - file = input_queue.get(block=False) |
113 | | - |
114 | | - if file == None: |
115 | | - break |
116 | | - |
117 | | - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION + |
118 | | - file, 'r', encoding=settings.ENCODING)) |
119 | | - |
120 | | - for raw_data in data: |
121 | | - xml_buffer = cStringIO.StringIO() |
122 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
123 | | - raw_data = ''.join(raw_data) |
124 | | - raw_data = raw_data.encode('utf-8') |
125 | | - xml_buffer.write(raw_data) |
126 | | - |
127 | | - try: |
128 | | - xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
129 | | - revisions = xml_nodes.findall('revision') |
130 | | - for revision in revisions: |
131 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
132 | | - username = contributor.find('username') |
133 | | - if username == None: |
134 | | - continue |
135 | | - username = xml.extract_text(username) |
136 | | - #print username.encode('utf-8') |
137 | | - |
138 | | - if username in bots: |
139 | | - id = contributor.find('id') |
140 | | - id = xml.extract_text(id) |
141 | | - #print username.encode('utf-8'), id |
142 | | - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING) |
143 | | - bots.pop(username) |
144 | | - if bots == {}: |
145 | | - print 'Mission accomplished' |
146 | | - return |
147 | | - except Exception, error: |
148 | | - print error |
149 | | - if settings.DEBUG: |
150 | | - messages = utils.track_errors(xml_buffer, error, file, |
151 | | - messages) |
152 | | - |
153 | | - if settings.DEBUG: |
154 | | - utils.report_error_messages(messages, lookup_username) |
155 | | - |
156 | | - |
157 | | -def add_id_to_botnames(): |
158 | | - ''' |
159 | | - This is the worker function for the multi-process version of |
160 | | - lookup_username.First, the names of the bots are retrieved, then the |
161 | | - multiprocess is launched by makinga call to pc.build_scaffolding. This is a |
162 | | - generic launcher that takes as input the function to load the input_queue, |
163 | | - the function that will do the main work and the objects to be put in the |
164 | | - input_queue. The launcher also accepts optional keyword arguments. |
165 | | - ''' |
166 | | - cursor = create_bots_db(':memory') |
167 | | - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
168 | | - |
169 | | - botnames = retrieve_botnames_without_id(cursor, 'en') |
170 | | - bots = {} |
171 | | - for botname in botnames: |
172 | | - bots[botname[0]] = 1 |
173 | | - pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots) |
174 | | - cursor.close() |
175 | | - |
176 | | - |
177 | | -def debug_lookup_username(): |
178 | | - ''' |
179 | | - This function launches the lookup_username function single-threaded, |
180 | | - which eases debugging. That is also the reason why the queue |
181 | | - parameters are set to None. Note that lookup_username is called with |
182 | | - debug=True here. |
183 | | - ''' |
184 | | - cursor = create_bots_db(':memory') |
185 | | - botnames = retrieve_botnames_without_id(cursor, 'en') |
186 | | - bots = {} |
187 | | - for botname in botnames: |
188 | | - bots[botname[0]] = 1 |
189 | | - |
190 | | - lookup_username('12.xml', None, None, bots, debug=True) |
191 | | - cursor.close() |
192 | | - |
193 | | - |
194 | | -if __name__ == '__main__': |
195 | | - #debug() |
196 | | - #add_id_to_botnames() |
197 | | - create_bot_ids_db_mongo() |
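create_bot_ids_db_mongo() essentially rebuilds the bots/ids collection from an {id: name} mapping. The sketch below shows the same idea written directly against the pymongo API of that era (Connection, remove, insert); it is not project code, and the host/port arguments are assumptions rather than values taken from this changeset.

import pymongo

def store_bot_ids(bot_ids, host='localhost', port=27017):
    '''Replace the bots/ids collection with the given {id: name} mapping.'''
    collection = pymongo.Connection(host, port)['bots']['ids']
    collection.remove({})               # start from an empty collection
    for id, name in bot_ids.iteritems():
        collection.insert({'id': id, 'name': name})
    return collection.count()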
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -1,254 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -from multiprocessing import Queue |
22 | | -from Queue import Empty |
23 | | -import datetime |
24 | | -from dateutil.relativedelta import * |
25 | | - |
26 | | -import progressbar |
27 | | - |
28 | | -import settings |
29 | | -from utils import models, utils |
30 | | -from database import db |
31 | | -from utils import process_constructor as pc |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -def retrieve_editor_ids_mongo(dbname, collection): |
41 | | - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
42 | | - 'editors.bin'): |
43 | | - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
44 | | - 'editors.bin') |
45 | | - else: |
46 | | - mongo = db.init_mongo_db(dbname) |
47 | | - editors = mongo[collection] |
48 | | - ids = editors.distinct('editor') |
49 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
50 | | - return ids |
51 | | - |
52 | | - |
53 | | -def expand_edits(edits): |
54 | | - data = [] |
55 | | - for edit in edits: |
56 | | - data.append(edit['date']) |
57 | | - return data |
58 | | - |
59 | | - |
60 | | -def expand_observations(obs, vars_to_expand): |
61 | | - for var in vars_to_expand: |
62 | | - if var == 'edits': |
63 | | - obs[var] = expand_edits(obs[var]) |
64 | | - elif var == 'edits_by_year': |
65 | | - keys = obs[var].keys() |
66 | | - keys.sort() |
67 | | - edits = [] |
68 | | - for key in keys: |
69 | | - edits.append(str(obs[var][key])) |
70 | | - obs[var] = edits |
71 | | - return obs |
72 | | - |
73 | | -def write_longitudinal_data(id, edits, fh): |
74 | | - years = edits.keys() |
75 | | - years.sort() |
76 | | - for year in years: |
77 | | - months = edits[year].keys() |
78 | | - months = [int(m) for m in months] |
79 | | - months.sort() |
80 | | - for m in months: |
81 | | - date = datetime.date(int(year), int(m), 1) |
82 | | - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
83 | | - |
84 | | - |
85 | | -def expand_headers(headers, vars_to_expand, obs): |
86 | | - for var in vars_to_expand: |
87 | | - l = len(obs[var]) |
88 | | - pos = headers.index(var) |
89 | | - for i in xrange(l): |
90 | | - if var.endswith('year'): |
91 | | - suffix = 2001 + i |
92 | | - elif var.endswith('edits'): |
93 | | - suffix = 1 + i |
94 | | - headers.insert(pos + i, '%s_%s' % (var, suffix)) |
95 | | - headers.remove(var) |
96 | | - return headers |
97 | | - |
98 | | - |
99 | | -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
100 | | - debug = kwargs.pop('debug') |
101 | | - dbname = kwargs.pop('dbname') |
102 | | - mongo = db.init_mongo_db(dbname) |
103 | | - editors = mongo['dataset'] |
104 | | - name = dbname + '_long_editors.csv' |
105 | | - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
106 | | - x = 0 |
107 | | - vars_to_expand = [] |
108 | | - while True: |
109 | | - try: |
110 | | - id = input_queue.get(block=False) |
111 | | - obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
112 | | - if x == 0: |
113 | | - headers = obs.keys() |
114 | | - headers.sort() |
115 | | - headers = expand_headers(headers, vars_to_expand, obs) |
116 | | - utils.write_list_to_csv(headers, fh) |
117 | | - write_longitudinal_data(id, obs['monthly_edits'], fh) |
118 | | - #utils.write_list_to_csv(data, fh) |
119 | | - x += 1 |
120 | | - except Empty: |
121 | | - break |
122 | | - |
123 | | - |
124 | | -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
125 | | - dbname = kwargs.get('dbname') |
126 | | - pbar = kwargs.get('pbar') |
127 | | - mongo = db.init_mongo_db(dbname) |
128 | | - editors = mongo['dataset'] |
129 | | - year = datetime.datetime.now().year + 1 |
130 | | - begin = year - 2001 |
131 | | - p = [3, 6, 9] |
132 | | - periods = [y * 12 for y in xrange(1, begin)] |
133 | | - periods = p + periods |
134 | | - data = {} |
135 | | - while True: |
136 | | - try: |
137 | | - id = input_queue.get(block=False) |
138 | | - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
139 | | - first_edit = obs['first_edit'] |
140 | | - last_edit = obs['final_edit'] |
141 | | - for y in xrange(2001, year): |
142 | | - if y == 2010 and first_edit > datetime.datetime(2010, 1, 1): |
143 | | - print 'debug' |
144 | | - if y not in data: |
145 | | - data[y] = {} |
146 | | - data[y]['n'] = 0 |
147 | | - window_end = datetime.datetime(y, 12, 31) |
148 | | - if window_end > datetime.datetime.now(): |
149 | | - now = datetime.datetime.now() |
150 | | - m = now.month - 1 #Dump files are always lagging at least one month.... |
151 | | - d = now.day |
152 | | - window_end = datetime.datetime(y, m, d) |
153 | | - edits = [] |
154 | | - for period in periods: |
155 | | - if period not in data[y]: |
156 | | - data[y][period] = 0 |
157 | | - window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
158 | | - if window_start < datetime.datetime(2001, 1, 1): |
159 | | - window_start = datetime.datetime(2001, 1, 1) |
160 | | - if date_falls_in_window(window_start, window_end, first_edit, last_edit): |
161 | | - edits.append(period) |
162 | | - if edits != []: |
163 | | - p = min(edits) |
164 | | - data[y]['n'] += 1 |
165 | | - data[y][p] += 1 |
166 | | - #pbar.update(+1) |
167 | | - except Empty: |
168 | | - break |
169 | | - utils.store_object(data, settings.BINARY_OBJECT_FILE_LOCATION, 'cohort_data') |
170 | | - |
171 | | -def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
172 | | - if first_edit >= window_start and first_edit <= window_end: |
173 | | - return True |
174 | | - else: |
175 | | - return False |
176 | | - |
177 | | - |
178 | | -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
179 | | - dbname = kwargs.pop('dbname') |
180 | | - mongo = db.init_mongo_db(dbname) |
181 | | - editors = mongo['dataset'] |
182 | | - name = dbname + '_wide_editors.csv' |
183 | | - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
184 | | - x = 0 |
185 | | - vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
186 | | - while True: |
187 | | - try: |
188 | | - if kwargs.get('debug', False): |
189 | | - id = u'99797' |
190 | | - else: |
191 | | - id = input_queue.get(block=False) |
192 | | - print input_queue.qsize() |
193 | | - obs = editors.find_one({'editor': id}) |
194 | | - obs = expand_observations(obs, vars_to_expand) |
195 | | - if x == 0: |
196 | | - headers = obs.keys() |
197 | | - headers.sort() |
198 | | - headers = expand_headers(headers, vars_to_expand, obs) |
199 | | - utils.write_list_to_csv(headers, fh) |
200 | | - data = [] |
201 | | - keys = obs.keys() |
202 | | - keys.sort() |
203 | | - for key in keys: |
204 | | - data.append(obs[key]) |
205 | | - utils.write_list_to_csv(data, fh) |
206 | | - |
207 | | - x += 1 |
208 | | - except Empty: |
209 | | - break |
210 | | - fh.close() |
211 | | - |
212 | | - |
213 | | -def retrieve_edits_by_contributor_launcher(): |
214 | | - pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors') |
215 | | - |
216 | | - |
217 | | -def debug_retrieve_edits_by_contributor_launcher(dbname): |
218 | | - kwargs = {'debug': False, |
219 | | - 'dbname': dbname, |
220 | | - } |
221 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
222 | | - input_queue = pc.load_queue(ids) |
223 | | - q = Queue() |
224 | | - generate_editor_dataset(input_queue, q, False, kwargs) |
225 | | - |
226 | | - |
227 | | -def generate_editor_dataset_launcher(dbname): |
228 | | - kwargs = {'nr_input_processors': 1, |
229 | | - 'nr_output_processors': 1, |
230 | | - 'debug': False, |
231 | | - 'dbname': dbname, |
232 | | - 'poison_pill':False, |
233 | | - 'pbar': True |
234 | | - } |
235 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
236 | | - ids = list(ids) |
237 | | - chunks = dict({0: ids}) |
238 | | - pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
239 | | - |
240 | | - |
241 | | -def generate_editor_dataset_debug(dbname): |
242 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
243 | | - input_queue = pc.load_queue(ids) |
244 | | - kwargs = {'nr_input_processors': 1, |
245 | | - 'nr_output_processors': 1, |
246 | | - 'debug': True, |
247 | | - 'dbname': dbname, |
248 | | - } |
249 | | - generate_editor_dataset(input_queue, False, False, kwargs) |
250 | | - |
251 | | - |
252 | | -if __name__ == '__main__': |
253 | | - #generate_editor_dataset_debug('test') |
254 | | - generate_editor_dataset_launcher('enwiki') |
255 | | - #debug_retrieve_edits_by_contributor_launcher() |
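The cohort test in generate_cohort_analysis() assigns an editor, for each calendar year, to the smallest experience window (in months) that still contains the editor's first edit. A simplified sketch follows; it is not project code and omits the clamping of window starts to 2001 and the correction for lagging dump files.

import datetime
from dateutil.relativedelta import relativedelta

def cohort_bucket(first_edit, year, periods=(3, 6, 9, 12, 24, 36, 48)):
    '''Return the smallest experience window (in months) containing first_edit.'''
    window_end = datetime.datetime(year, 12, 31)
    for period in sorted(periods):
        window_start = window_end - relativedelta(months=period)
        if window_start <= first_edit <= window_end:
            return period
    return None                         # editor was not yet active in this year

print cohort_bucket(datetime.datetime(2009, 6, 15), 2009)   # prints 9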
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do |
— | — | @@ -0,0 +1,25 @@ |
| 2 | +label var months_3 "3 Months" |
| 3 | +label var months_6 "6 Months" |
| 4 | +label var months_9 "9 Months" |
| 5 | +label var months_12 "1 Year" |
| 6 | +label var months_24 "2 Years" |
| 7 | +label var months_36 "3 Years" |
| 8 | +label var months_48 "4 Years" |
| 9 | +label var months_60 "5 Years" |
| 10 | +label var months_72 "6 Years" |
| 11 | +label var months_84 "7 Years" |
| 12 | +label var months_96 "8 Years" |
| 13 | +label var months_108 "9 Years" |
| 14 | +generate one_year_exp = months_3+ months_6+ months_9+ months_12 |
| 15 | + |
| 16 | +generate fewer_one_year_abs = (one_year_exp/100) * n |
| 17 | +generate more_one_year_abs = n - fewer_one_year_abs |
| 18 | +label var fewer_one_year_abs "Editors with less than one year experience" |
| 19 | +label var more_one_year_abs "Editors with more than one year experience" |
| 20 | + |
| 21 | +graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
| 22 | + |
| 23 | +twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall)) |
| 24 | + |
| 25 | + |
| 26 | +graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1)) |
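cohort_charts.do expects one observation per year with a cohort size n and months_<k> columns holding percentages. The export step that produces that sheet is not part of this changeset; a purely hypothetical converter from the cohort_data object written by generate_cohort_analysis() might look like:

def cohort_counts_to_percentages(data):
    '''Turn {year: {'n': total, period: count, ...}} into rows of percentages.'''
    rows = []
    for year in sorted(data):
        n = data[year]['n']
        row = {'year': year, 'n': n}
        for period, count in data[year].items():
            if period == 'n':
                continue
            row['months_%s' % period] = 100.0 * count / n if n else 0.0
        rows.append(row)
    return rows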
Index: trunk/tools/editor_trends/statistics/stata/wiki.do |
— | — | @@ -1,4 +1,5 @@ |
2 | | -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit" |
| 2 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv" |
| 3 | +local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit" |
3 | 4 | |
4 | 5 | foreach edit of local first_ten { |
5 | 6 | gen date2 = date(`edit', "YMDhms") |
— | — | @@ -8,6 +9,7 @@ |
9 | 10 | } |
10 | 11 | |
11 | 12 | generate year_left = year(final_edit) |
| 13 | +generate year_joined = year(first_edit) |
12 | 14 | sort year_joined |
13 | 15 | by year_joined: gen community_size_t = _N |
14 | 16 | |
— | — | @@ -23,12 +25,6 @@ |
24 | 26 | gen retention200`t' = community_size_200`t1' / community_size_200`t' |
25 | 27 | } |
26 | 28 | |
27 | | - |
28 | | - |
29 | | - |
30 | | - |
31 | | - |
32 | | - |
33 | 29 | generate time_to_new_wp = edits_10 - edits_1 |
34 | 30 | generate active_time_wp = final_edit - edits_10 |
35 | 31 | label time_to_new_wp "Number of days it took to become a new wikipedian" |
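The retention200* series generated above is the ratio of consecutive community_size_200* values, i.e. the size of the year t+1 cohort relative to the year t cohort. In Python terms (a sketch, not project code):

def retention_ratios(cohort_size):
    '''cohort_size maps a join year to the number of editors who joined that year.'''
    years = sorted(cohort_size)
    return dict((years[i], float(cohort_size[years[i + 1]]) / cohort_size[years[i]])
                for i in range(len(years) - 1))

print retention_ratios({2004: 1000, 2005: 1400, 2006: 1330})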
Property changes on: trunk/tools/editor_trends/datasets |
___________________________________________________________________ |
Added: svn:ignore |
36 | 32 | + cohort_data.txt |
cohorts.dta |
difference observations erik vs diederik.ods |
difference observations erik vs diederik.xlsx |
editors.dta |
enwiki_editors.csv |
enwiki_long_editors.csv |
enwiki_wide_editors.csv |
Property changes on: trunk/tools/editor_trends/documentation |
___________________________________________________________________ |
Added: svn:ignore |
37 | 33 | + language_codes.xlsx |
Property changes on: trunk/tools/editor_trends/errors |
___________________________________________________________________ |
Modified: svn:ignore |
38 | 34 | - *.bin |
39 | 35 | + *.bin |
split_xml |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Added: svn:default-eol-style |
40 | 36 | + native |