r76861 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r76860 | r76861 | r76862 >
Date:23:59, 16 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Final commit of the code refactoring; contains some restructuring of directories.
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/construct_datasets.py (deleted) (history)
  • /trunk/tools/editor_trends/datasets (modified) (history)
  • /trunk/tools/editor_trends/documentation (added) (history)
  • /trunk/tools/editor_trends/errors (modified) (history)
  • /trunk/tools/editor_trends/init_bot_db.py (deleted) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (deleted) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (deleted) (history)
  • /trunk/tools/editor_trends/settings.py (deleted) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (deleted) (history)
  • /trunk/tools/editor_trends/statistics/__init__.py (replaced) (history)
  • /trunk/tools/editor_trends/statistics/dataset.py (replaced) (history)
  • /trunk/tools/editor_trends/statistics/median.py (replaced) (history)
  • /trunk/tools/editor_trends/statistics/stata/cohort_charts.do (added) (history)
  • /trunk/tools/editor_trends/statistics/stata/wiki.do (replaced) (history)

Diff

Index: trunk/tools/editor_trends/optimize_editors.py
@@ -1,171 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-02'
19 -__version__ = '0.1'
20 -
21 -from multiprocessing import Queue
22 -from Queue import Empty
23 -from operator import itemgetter
24 -import datetime
25 -
26 -import settings
27 -from database import db
28 -from utils import process_constructor as pc
29 -from utils import utils
30 -import construct_datasets
31 -
32 -
33 -try:
34 - import psyco
35 - psyco.full()
36 -except ImportError:
37 - pass
38 -
39 -
40 -def create_datacontainer(init_value=0):
41 - '''
42 - This function initializes a dictionary keyed by year (from 2001 through
43 - the current year) with @init_value as the value for each key. In most cases
44 - this will be zero, so the dictionary acts as a running tally for a variable,
45 - but @init_value can also be a list, [], a dictionary, {}, or a set, set().
46 - '''
47 - data = {}
48 - year = datetime.datetime.now().year + 1
49 - for x in xrange(2001, year):
50 - data[str(x)] = init_value
51 - return data
52 -
53 -
54 -def add_months_to_datacontainer(datacontainer):
55 - for dc in datacontainer:
56 - datacontainer[dc] = {}
57 - for x in xrange(1, 13):
58 - datacontainer[dc][str(x)] = 0
59 - return datacontainer
60 -
61 -
62 -def determine_edits_by_month(edits):
63 - datacontainer = create_datacontainer(init_value=0)
64 - datacontainer = add_months_to_datacontainer(datacontainer)
65 - for year in edits:
66 - months = set()
67 - for edit in edits[year]:
68 - m = str(edit['date'].month)
69 - if m not in months:
70 - datacontainer[year][m] = 1
71 - months.add(m)
72 - if len(months) == 12:
73 - break
74 - return datacontainer
75 -
76 -
77 -def determine_edits_by_year(dates):
78 - '''
79 - This function counts the number of edits by year made by a particular editor.
80 - '''
81 - edits = create_datacontainer()
82 - for date in dates:
83 - year = str(date['date'].year)
84 - edits[year] += 1
85 - return edits
86 -
87 -
88 -def determine_articles_by_year(dates):
89 - '''
90 - This function counts the number of unique articles by year edited by a
91 - particular editor.
92 - '''
93 - articles = create_datacontainer(set())
94 - for date in dates:
95 - year = str(date['date'].year)
96 - articles[year].add(date['article'])
97 - for article in articles:
98 - articles[article] = len(articles[article])
99 - return articles
100 -
101 -
102 -def sort_edits(edits):
103 - edits = utils.merge_list(edits)
104 - return sorted(edits, key=itemgetter('date'))
105 -
106 -
107 -def optimize_editors(input_queue, result_queue, pbar, **kwargs):
108 - dbname = kwargs.pop('dbname')
109 - mongo = db.init_mongo_db(dbname)
110 - input = mongo['test']
111 - output = mongo['dataset']
112 - output.ensure_index('editor')
113 - output.ensure_index('year_joined')
114 - definition = kwargs.pop('definition')
115 - while True:
116 - try:
117 - id = input_queue.get(block=False)
118 - editor = input.find_one({'editor': id})
119 - if editor == None:
120 - continue
121 - edits = editor['edits']
122 - monthly_edits = determine_edits_by_month(edits)
123 - edits = sort_edits(edits)
124 - edit_count = len(edits)
125 - new_wikipedian = edits[9]['date']
126 - first_edit = edits[0]['date']
127 - final_edit = edits[-1]['date']
128 - edits_by_year = determine_edits_by_year(edits)
129 - articles_by_year = determine_articles_by_year(edits)
130 -
131 - edits = edits[:10]
132 -
133 - output.insert({'editor': id, 'edits': edits,
134 - 'edits_by_year': edits_by_year,
135 - 'new_wikipedian': new_wikipedian,
136 - 'edit_count': edit_count,
137 - 'final_edit': final_edit,
138 - 'first_edit': first_edit,
139 - 'articles_by_year': articles_by_year,
140 - 'monthly_edits': monthly_edits})
141 - print 'Items left: %s' % input_queue.qsize()
142 - except Empty:
143 - break
144 -
145 -
146 -def run_optimize_editors(dbname):
147 - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
148 - kwargs = {'definition': 'traditional',
149 - 'pbar': True,
150 - 'dbname': 'enwiki',
151 - 'nr_input_processors': 1,
152 - 'nr_output_processors': 0,
153 - 'poison_pill': False
154 - }
155 - print len(ids)
156 - ids = list(ids)
157 - chunks = dict({0: ids})
158 - pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
159 -
160 -
161 -def debug_optimize_editors(dbname):
162 - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
163 - q = pc.load_queue(ids)
164 - kwargs = {'definition': 'traditional',
165 - 'dbname': dbname
166 - }
167 - optimize_editors(q, False, True, **kwargs)
168 -
169 -
170 -if __name__ == '__main__':
171 - #debug_optimize_editors('test')
172 - run_optimize_editors('enwiki')
\ No newline at end of file
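
Two details of optimize_editors.py above are worth illustrating: new_wikipedian = edits[9]['date'] assumes every editor has at least ten edits, and create_datacontainer() assigns @init_value directly, so a mutable value such as set() (as passed by determine_articles_by_year()) becomes one object shared by every year key. A minimal sketch of the year-keyed tally with a per-key copy that avoids that sharing; only the copy.deepcopy call differs from the original:

    import copy
    import datetime

    def create_datacontainer(init_value=0):
        # Year-keyed tally from 2001 through the current year.
        data = {}
        for year in range(2001, datetime.datetime.now().year + 1):
            data[str(year)] = copy.deepcopy(init_value)  # fresh object per year
        return data

    articles = create_datacontainer(set())
    articles['2005'].add('Some article')
    print(articles['2006'])  # set() -- with the shared-object version this
                             # would also contain 'Some article'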
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -1,341 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-10-21'
19 -__version__ = '0.1'
20 -
21 -#Default Python libraries (Python => 2.6)
22 -import sys
23 -import os
24 -import time
25 -import datetime
26 -import codecs
27 -import math
28 -import cStringIO
29 -import re
30 -from operator import itemgetter
31 -import xml.etree.cElementTree as cElementTree
32 -from multiprocessing import Queue, JoinableQueue
33 -from Queue import Empty
34 -import pymongo
35 -
36 -# Custom written files
37 -import settings
38 -from utils import utils, models
39 -from database import db_settings
40 -from database import db
41 -from database import cache
42 -from wikitree import xml
43 -from statistics import dataset
44 -from utils import process_constructor as pc
45 -
46 -
47 -try:
48 - import psyco
49 - psyco.full()
50 -except ImportError:
51 - pass
52 -
53 -
54 -def determine_username_is_bot(username, kwargs):
55 - '''
56 - @username is the xml element containing the id of the user
57 - @kwargs should have a list with all the bot ids
58 -
59 - @Return 1 if the username id is in the list of bot ids, 0 if it
60 - is not.
61 - '''
62 - ids = kwargs.get('bots', [])
63 - if ids == None:
64 - ids = []
65 - if username != None and username.text != None:
66 - id = username.text
67 - if id in ids:
68 - return 1
69 - else:
70 - return 0
71 -
72 -
73 -def extract_contributor_id(contributor, kwargs):
74 - '''
75 - @contributor is the xml contributor node containing a number of attributes
76 -
77 - Currently, we are only interested in registered contributors, hence we
78 - ignore anonymous editors. If you are interested in collecting data on
79 - anonymous editors then add the string 'ip' to the tags variable.
80 - '''
81 - tags = ['id']
82 - if contributor.get('deleted'):
83 - return -1 # ASK: Not sure if this is the best way to code deleted contributors.
84 - for elem in contributor:
85 - if elem.tag in tags:
86 - if elem.text != None:
87 - return elem.text.decode('utf-8')
88 - else:
89 - return -1
90 -
91 -
92 -def output_editor_information(elem, output, **kwargs):
93 - '''
94 - @elem is an XML element containing 1 revision from a page
95 - @output is where to store the data, either a queue or a filehandle
96 - @**kwargs contains extra information
97 -
98 - The variable @tags determines which attributes are parsed; the values in
99 - this dictionary are the functions used to extract the data.
100 - '''
101 - tags = {'contributor': {'editor': extract_contributor_id,
102 - 'bot': determine_username_is_bot},
103 - 'timestamp': {'date': xml.extract_text},
104 - }
105 - vars = {}
106 - headers = ['editor', 'date', 'article']
107 - destination = kwargs.pop('destination')
108 - revisions = elem.findall('revision')
109 - for revision in revisions:
110 - vars['article'] = elem.find('id').text.decode(settings.ENCODING)
111 - elements = revision.getchildren()
112 - for tag, functions in tags.iteritems():
113 - xml_node = xml.retrieve_xml_node(elements, tag)
114 - for var, function in functions.iteritems():
115 - vars[var] = function(xml_node, kwargs)
116 -
117 - #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
118 - if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
119 - vars.pop('bot')
120 - if destination == 'queue':
121 - vars['date'] = utils.convert_timestamp_to_date(vars['date'])
122 - output.put(vars)
123 - elif destination == 'file':
124 - data = []
125 - for head in headers:
126 - data.append(vars[head])
127 - utils.write_list_to_csv(data, output)
128 - vars = {}
129 -
130 -
131 -def parse_editors(xml_queue, data_queue, **kwargs):
132 - '''
133 - @xml_queue contains the filenames of the files to be parsed
134 - @data_queue is an instance of Queue where the extracted data is stored for
135 - further processing
136 - @pbar is an instance of progressbar to display the progress
137 - @bots is a list of ids of known Wikipedia bots
138 - @debug is a flag to indicate whether the function is called for debugging.
139 -
140 - Output is the data_queue that will be used by store_editors()
141 - '''
142 - input = kwargs.get('input', None)
143 - output = kwargs.get('output', None)
144 - debug = kwargs.get('debug', False)
145 - destination = kwargs.get('destination', 'file')
146 - bots = kwargs.get('bots', None)
147 - pbar = kwargs.get('pbar', None)
148 - if settings.DEBUG:
149 - messages = {}
150 - vars = {}
151 -
152 - while True:
153 - try:
154 - if debug:
155 - file = xml_queue
156 - else:
157 - file = xml_queue.get(block=False)
158 - if file == None:
159 - print 'Swallowed a poison pill'
160 - break
161 -
162 - data = xml.read_input(utils.create_txt_filehandle(input,
163 - file, 'r',
164 - encoding=settings.ENCODING))
165 - if destination == 'file':
166 - name = file[:-4] + '.txt'
167 - fh = utils.create_txt_filehandle(output, name, 'w', settings.ENCODING)
168 - for raw_data in data:
169 - xml_buffer = cStringIO.StringIO()
170 - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
171 -
172 - try:
173 - raw_data = ''.join(raw_data)
174 - xml_buffer.write(raw_data)
175 - elem = cElementTree.XML(xml_buffer.getvalue())
176 - output_editor_information(elem, fh, bots=bots, destination=destination)
177 - except SyntaxError, error:
178 - print error
179 - '''
180 - There are a few cases with invalid tokens; they are fixed
181 - here and then reinserted into the XML DOM
182 - data = convert_html_entities(xml_buffer.getvalue())
183 - elem = cElementTree.XML(data)
184 - output_editor_information(elem)
185 - '''
186 - if settings.DEBUG:
187 - utils.track_errors(xml_buffer, error, file, messages)
188 - except UnicodeEncodeError, error:
189 - print error
190 - if settings.DEBUG:
191 - utils.track_errors(xml_buffer, error, file, messages)
192 - except MemoryError, error:
193 - print file, error
194 - print raw_data[:12]
195 - print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
196 - if destination == 'queue':
197 - output.put('NEXT')
198 - while True:
199 - if output.qsize() < 100000:
200 - break
201 - else:
202 - time.sleep(10)
203 - print 'Still sleeping, queue is %s items long' % output.qsize()
204 -
205 - else:
206 - fh.close()
207 -
208 - if pbar:
209 - print file, xml_queue.qsize()
210 - #utils.update_progressbar(pbar, xml_queue)
211 -
212 - if debug:
213 - break
214 -
215 - except Empty:
216 - break
217 -
218 - if destination == 'queue':
219 - data_queue.put(None)
220 -
221 - if settings.DEBUG:
222 - utils.report_error_messages(messages, parse_editors)
223 -
224 -
225 -def store_editors(data_queue, **kwargs):
226 - '''
227 - @data_queue is an instance of Queue containing information extracted by
228 - parse_editors()
229 - @pids is a list of PIDs used to check if other processes are finished
230 - running
231 - @dbname is the name of the MongoDB database in which to store the information.
232 - '''
233 - dbname = kwargs.get('dbname', None)
234 - mongo = db.init_mongo_db(dbname)
235 - collection = mongo['editors']
236 - collection.ensure_index('editor')
237 - editor_cache = cache.EditorCache(collection)
238 -
239 - while True:
240 - try:
241 - edit = data_queue.get(block=False)
242 - data_queue.task_done()
243 - if edit == None:
244 - print 'Swallowing poison pill'
245 - break
246 - elif edit == 'NEXT':
247 - editor_cache.add('NEXT', '')
248 - else:
249 - contributor = edit['editor']
250 - value = {'date': edit['date'], 'article': edit['article']}
251 - editor_cache.add(contributor, value)
252 - #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
253 - #'$inc': {'edit_count': 1},
254 -
255 - except Empty:
256 - '''
257 - This checks whether the Queue is empty because the preprocessors are
258 - finished or because this function is faster at emptying the Queue
259 - than the preprocessors are able to fill it. If the preprocessors
260 - are finished and this Queue is empty then break, else wait for the
261 - Queue to fill.
262 - '''
263 - pass
264 -
265 - print 'Emptying entire cache.'
266 - editor_cache.store()
267 - print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
268 -
269 -
270 -def load_cache_objects():
271 - cache = {}
272 - files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin')
273 - for x, file in enumerate(files):
274 - cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file)
275 - return cache
276 -
277 -
278 -def search_cache_for_missed_editors(dbname):
279 - mongo = db.init_mongo_db(dbname)
280 - collection = mongo['editors']
281 - editor_cache = cache.EditorCache(collection)
282 - cache = load_cache_objects()
283 - for c in cache:
284 - for editor in cache[c]:
285 - editor_cache.add(editor, cache[c][editor])
286 - cache[c] = {}
287 - editor_cache.add('NEXT', '')
288 - cache = {}
289 -
290 -
291 -
292 -def load_bot_ids():
293 - '''
294 - Loader function to retrieve the list of ids of known Wikipedia bots.
295 - '''
296 - ids = {}
297 - mongo = db.init_mongo_db('bots')
298 - bots = mongo['ids']
299 - cursor = bots.find()
300 - for bot in cursor:
301 - ids[bot['id']] = bot['name']
302 - return ids
303 -
304 -
305 -def run_parse_editors(location, language, project):
306 - ids = load_bot_ids()
307 - input = os.path.join(location, language, project)
308 - output = os.path.join(input, 'txt')
309 -
310 - kwargs = {'bots': ids,
311 - 'dbname': language + project,
312 - 'language': language,
313 - 'project': project,
314 - 'pbar': True,
315 - 'destination': 'file',
316 - 'nr_input_processors': settings.NUMBER_OF_PROCESSES,
317 - 'nr_output_processors': settings.NUMBER_OF_PROCESSES,
318 - 'input': input,
319 - 'output': output,
320 - }
321 - source = os.path.join(location, language, project)
322 - files = utils.retrieve_file_list(source, 'xml')
323 -
324 - if not os.path.exists(input):
325 - utils.create_directory(input)
326 - if not os.path.exists(output):
327 - utils.create_directory(output)
328 -
329 - chunks = utils.split_list(files , settings.NUMBER_OF_PROCESSES)
330 - pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
331 -
332 -
333 -def debug_parse_editors(dbname):
334 - q = JoinableQueue()
335 - parse_editors('522.xml', q, debug=True, destination='file')
336 - store_editors(q, dbname=dbname)
337 -
338 -
339 -if __name__ == "__main__":
340 - #debug_parse_editors('test2')
341 - run_parse_editors(settings.XML_FILE_LOCATION, 'en', 'wiki')
342 - pass
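
map_wiki_editors.py above wires parse_editors() and store_editors() together through a multiprocessing queue, using None as a poison pill and 'NEXT' as an article separator. A stripped-down, self-contained sketch of that producer/consumer protocol (the record fields follow the headers variable above; everything else is illustrative):

    from multiprocessing import Process, Queue

    def producer(q):
        # parse_editors() role: push one dict per parsed revision.
        for record in [{'editor': '42', 'date': '2010-10-21', 'article': '1'}]:
            q.put(record)
        q.put(None)  # poison pill signals the consumer to stop

    def consumer(q):
        # store_editors() role: drain the queue until the poison pill arrives.
        while True:
            record = q.get()
            if record is None:
                print('Swallowed a poison pill')
                break
            print('storing %s' % record)

    if __name__ == '__main__':
        q = Queue()
        p = Process(target=producer, args=(q,))
        c = Process(target=consumer, args=(q,))
        p.start(); c.start()
        p.join(); c.join()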
Index: trunk/tools/editor_trends/settings.py
@@ -1,158 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-10-21'
19 -__version__ = '0.1'
20 -
21 -'''
22 -This file contains settings that are used for constructing and analyzing
23 -the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
24 -'''
25 -
26 -from multiprocessing import cpu_count
27 -import os
28 -import sys
29 -import platform
30 -
31 -try:
32 - from pywin import win32file
33 - '''increase the maximum number of open files on Windows to 1024'''
34 - win32file._setmaxstdio(1024)
35 -except ImportError:
36 - pass
37 -
38 -try:
39 - import resource
40 -except ImportError:
41 - pass
42 -
43 -#Setting up the environment
44 -ops = {platform.win32_ver: 'Windows',
45 - platform.linux_distribution: 'Linux',
46 - platform.mac_ver: 'OSX'}
47 -
48 -for op in ops:
49 - if op() != ('', '', '') and op() != ('', ('', '', ''), ''):
50 - OS = ops[op]
51 -
52 -ARCH = platform.machine()
53 -
54 -WORKING_DIRECTORY = os.getcwd()
55 -IGNORE_DIRS = ['wikistats', 'zips']
56 -ROOT = '/' if OS != 'Windows' else 'c:\\'
57 -
58 -MINIMUM_PYTHON_VERSION = (2, 6)
59 -
60 -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
61 - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
62 -for subdirname in dirs:
63 - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
64 - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
65 -
66 -WINDOWS_ZIP = ['7z.exe']
67 -
68 -OSX_ZIP = []
69 -
70 -LINUX_ZIP = []
71 -#General settings
72 -
73 -# Valid values are 'stand_alone' and 'hadoop'
74 -RUN_MODE = 'stand_alone'
75 -
76 -# If true then some more detailed debug information is collected
77 -DEBUG = True
78 -
79 -#If True then it will display a progress bar on the console.
80 -PROGRESS_BAR = True
81 -
82 -#Date format as used by Erik Zachte
83 -DATE_FORMAT = '%Y-%m-%d'
84 -
85 -# Timestamp format as generated by the MediaWiki dumps
86 -DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
87 -
88 -#This section contains configuration variables for the different file locations.
89 -
90 -# Location where to write xml chunks
91 -XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia')
92 -
93 -# Input file
94 -XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
95 -
96 -# This is the place where error messages are stored for debugging purposes
97 -ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors')
98 -
99 -DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database')
100 -
101 -BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects')
102 -
103 -DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets')
104 -
105 -TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv')
106 -
107 -NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces')
108 -#This section contains configuration variables for parsing / encoding and
109 -#working with the XML files.
110 -
111 -# == 64MB; see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for the rationale
112 -MAX_XML_FILE_SIZE = 67108864
113 -
114 -if OS == 'Windows' and ARCH == 'i386':
115 - MAX_FILES_OPEN = win32file._getmaxstdio()
116 -elif OS != 'Windows':
117 - MAX_FILES_OPEN = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
118 -else:
119 - MAX_FILES_OPEN = 500
120 -
121 -ENCODING = 'utf-8'
122 -
123 -# Namespace; do not change, as this works for MediaWiki wikis
124 -NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'
125 -
126 -WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip',
127 - }
128 -
129 -COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z']
130 -
131 -
132 -WIKIMEDIA_PROJECTS = {'commons': 'commonswiki',
133 - 'wikibooks': 'wikibooks',
134 - 'wikinews': 'wikinews',
135 - 'wikiquote': 'wikiquote',
136 - 'wikisource': 'wikisource',
137 - 'wikiversity': 'wikiversity',
138 - 'wiktionary': 'wiktionary',
139 - 'metawiki': 'metawiki',
140 - 'wikispecies': 'specieswiki',
141 - 'incubator': 'incubatorwiki',
142 - 'foundation': 'foundationwiki',
143 - 'mediawiki': 'mediawikiwiki',
144 - 'outreach': 'outreachwiki',
145 - 'strategic planning': 'strategywiki',
146 - 'usability initiative': 'usabilitywiki',
147 - 'multilingual wikisource': None
148 - }
149 -
150 -#Multiprocess settings used to parallelize workload
152 -#Change this to match your computer's configuration (RAM / CPU)
152 -NUMBER_OF_PROCESSES = cpu_count() * 1
153 -
155 -#Extensions of ASCII files; this is used to determine the file mode to use
155 -ASCII = ['txt', 'csv', 'xml', 'sql', 'json']
156 -
157 -WP_DUMP_LOCATION = 'http://download.wikimedia.org'
158 -
159 -MAX_CACHE_SIZE = 1024 * 1024
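
The OS-detection loop in settings.py relies on each platform probe returning empty tuples on the systems where it does not apply, so the probe that yields real data names the OS. A compact sketch of the same idea (note that platform.linux_distribution() was removed in Python 3.8, where platform.system() is the simpler route):

    import platform

    # Each *_ver() probe returns empty values on platforms it does not apply to.
    if platform.win32_ver()[0]:
        OS = 'Windows'
    elif platform.mac_ver()[0]:
        OS = 'OSX'
    else:
        OS = 'Linux'
    print(OS)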
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -1,186 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-10-21'
19 -__version__ = '0.1'
20 -
21 -import xml.etree.cElementTree as cElementTree
22 -import codecs
23 -import utils
24 -import re
25 -import json
26 -import os
27 -
28 -import progressbar
29 -
30 -from utils import utils
31 -import settings
32 -
33 -try:
34 - import psyco
35 - psyco.full()
36 -except ImportError:
37 - pass
38 -
39 -
40 -RE_NUMERIC_CHARACTER = re.compile(r'&#(\d+);')
41 -
42 -
43 -def remove_numeric_character_references(text):
44 - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
45 -
46 -
47 -def lenient_deccharref(m):
48 - try:
49 - return unichr(int(m.group(1)))
50 - except ValueError:
51 - '''
52 - There are a few articles that raise a ValueError here; the reason is
53 - that I am using a narrow Python build (UCS2) instead of a wide build
54 - (UCS4). The quick fix is to return an empty string...
55 - Real solution is to rebuild Python with UCS4 support.....
56 - '''
57 - return ''
58 -
59 -
60 -def remove_namespace(element, namespace):
61 - '''Remove namespace from the XML document.'''
62 - ns = u'{%s}' % namespace
63 - nsl = len(ns)
64 - for elem in element.getiterator():
65 - if elem.tag.startswith(ns):
66 - elem.tag = elem.tag[nsl:]
67 - return element
68 -
69 -
70 -def load_namespace(language):
71 - file = '%s_ns.json' % language
72 - fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING)
73 - ns = json.load(fh)
74 - fh.close()
75 - ns = ns['query']['namespaces']
76 - return ns
77 -
78 -
79 -def build_namespaces_locale(namespaces):
80 - '''
81 - Construct a list of all the non-main namespaces
82 - '''
83 - ns = []
84 - for namespace in namespaces:
85 - value = namespaces[namespace].get(u'*', None)
86 - if value != None and value != '':
87 - ns.append(value)
88 - return ns
89 -
90 -
91 -def parse_comments(xml, function):
92 - revisions = xml.findall('revision')
93 - for revision in revisions:
94 - comment = revision.find('comment')
95 - timestamp = revision.find('timestamp').text
96 - if comment != None and comment.text != None:
97 - comment.text = function(comment.text)
98 - return xml
99 -
100 -
101 -def is_article_main_namespace(elem, namespace):
102 - '''
103 - checks whether the article belongs to the main namespace
104 - '''
105 - title = elem.find('title').text
106 - for ns in namespace:
107 - if title.startswith(ns):
108 - return False
109 - return True
110 -
111 -
112 -def write_xml_file(element, fh, counter, language):
113 - '''Get file handle and write xml element to file'''
114 - size = len(cElementTree.tostring(element))
115 - fh, counter = create_xml_file_handle(fh, counter, size, language)
116 - try:
117 - fh.write(cElementTree.tostring(element))
118 - except MemoryError:
119 - print 'Add error capturing logic'
120 - fh.write('\n')
121 - return fh, counter
122 -
123 -
124 -def create_xml_file_handle(fh, counter, size, language):
125 - '''Create file handle if none is supplied or if file size > max file size.'''
126 - if not counter:
127 - counter = 0
128 - path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
129 - if not fh:
130 - fh = codecs.open(path, 'w', encoding=settings.ENCODING)
131 - return fh, counter
132 - elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
133 - print 'Created chunk %s' % counter
134 - fh.close()
135 - counter += 1
136 - fh = codecs.open(os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter), 'w', encoding=settings.ENCODING)
137 - return fh, counter
138 - else:
139 - return fh, counter
140 -
141 -
142 -def split_xml(location, filename, project, language_code):
143 - '''Reads an xml file and splits it into N chunks'''
144 - #location = os.path.join(settings.XML_FILE_LOCATION, language)
145 - result = utils.check_file_exists(location, '')
146 - if result == False:
147 - result = utils.create_directory(location)
148 - if not result:
149 - return
150 -
151 - ns = load_namespace(language_code)
152 - ns = build_namespaces_locale(ns)
153 -
154 - fh = None
155 - counter = None
156 - source = os.path.join(location, filename)
157 - tag = '{%s}page' % settings.NAME_SPACE
158 -
159 - context = cElementTree.iterparse(source, events=('start', 'end'))
160 - context = iter(context)
161 - event, root = context.next() #get the root element of the XML doc
162 -
163 - try:
164 - for event, elem in context:
165 - if event == 'end':
166 - if elem.tag == tag:
167 - elem = remove_namespace(elem, settings.NAME_SPACE)
168 - if is_article_main_namespace(elem, ns):
169 - elem = parse_comments(elem, remove_numeric_character_references)
170 - fh, counter = write_xml_file(elem, fh, counter, language_code)
171 - root.clear() # when done parsing a section, clear the tree to save memory
172 - #elem = parse_comments(elem, convert_html_entities)
173 - #elem = parse_comments(elem, remove_ascii_control_characters)
174 - #print cElementTree.tostring(elem)
175 - except SyntaxError:
176 - fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)
177 - fh.write(cElementTree.tostring(elem))
178 - fh.close()
179 -
180 -
181 -if __name__ == "__main__":
182 - kwargs = {'location': 'c:\\Source_files\\',
183 - 'filename': settings.XML_FILE,
184 - 'project':'wiki',
185 - 'language_code':'en'
186 - }
187 - split_xml(**kwargs)
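
split_xml() above streams the dump with cElementTree.iterparse() and clears the root element after each <page>, so memory use stays flat regardless of dump size. A minimal sketch of that pattern, using the export namespace from settings.NAME_SPACE (on Python 3 the import would be plain xml.etree.ElementTree):

    import xml.etree.cElementTree as cElementTree

    def iter_pages(source, namespace='http://www.mediawiki.org/xml/export-0.4/'):
        tag = '{%s}page' % namespace
        context = iter(cElementTree.iterparse(source, events=('start', 'end')))
        event, root = next(context)  # grab the root element of the XML doc
        for event, elem in context:
            if event == 'end' and elem.tag == tag:
                yield elem
                root.clear()  # discard processed pages to save memory

    # usage: for page in iter_pages('enwiki-stub-meta-history.xml'): ...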
Index: trunk/tools/editor_trends/init_bot_db.py
@@ -1,196 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -
18 -import os
19 -import cStringIO
20 -import xml.etree.cElementTree as cElementTree
21 -
22 -
23 -import settings
24 -from wikitree import xml
25 -from database import db
26 -from database import db_settings
27 -from utils import utils
28 -from utils import process_constructor as pc
29 -
30 -try:
31 - import psyco
32 - psyco.full()
33 -except ImportError:
34 - pass
35 -
36 -
37 -def create_bot_ids_db_mongo():
38 - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING)
39 - mongo = db.init_mongo_db('bots')
40 - collection = mongo['ids']
41 -
42 - db.remove_documents_from_mongo_db(collection, None)
43 -
44 - for id, name in ids.iteritems():
45 - collection.insert({'id': id, 'name': name})
46 -
47 - print collection.count()
48 -
49 -
50 -def create_bots_db(db_name):
51 - '''
52 - This function reads the csv file provided by Erik Zachte and constructs a
53 - sqlite memory database. The reason for this is that I suspect I will need
54 - some simple querying capabilities in the future, else a dictionary would
55 - suffice.
56 - '''
57 - connection = db.init_database(db_name)
58 - #connection = db.init_database('data/database/bots.db')
59 - cursor = connection.cursor()
60 - db.create_tables(cursor, db_settings.BOT_TABLE)
61 - values = []
62 - fields = [field[0] for field in db_settings.BOT_TABLE['bots']]
63 - for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING):
64 - line = line.split(',')
65 - row = []
66 - for x, (field, value) in enumerate(zip(fields, line)):
67 - if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER':
68 - value = int(value)
69 - elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT':
70 - value = value.replace('/', '-')
71 - #print field, value
72 - row.append(value)
73 - values.append(row)
74 -
75 - cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values)
76 - connection.commit()
77 - if db_name == ':memory':
78 - return cursor
79 - else:
80 - connection.close()
81 -
82 -
83 -def retrieve_botnames_without_id(cursor, language):
84 - return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall()
85 -
86 -
87 -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
88 - '''
89 - This function is used to find the ids belonging to the different bots that
90 - are patrolling the Wikipedia sites.
91 - @input_queue contains a list of xml files to parse
92 -
93 - @result_queue should be set to false as the results are directly written to
94 - a csv file.
95 -
96 - @progressbar depends on settings
97 -
98 - @bots is a dictionary containing the names of the bots to lookup
99 - '''
100 -
101 - #if len(bots.keys()) == 1:
102 - bots = bots['bots']
103 - #print bots.keys()
104 -
105 - if settings.DEBUG:
106 - messages = {}
107 -
108 - while True:
109 - if debug:
110 - file = input_queue
111 - else:
112 - file = input_queue.get(block=False)
113 -
114 - if file == None:
115 - break
116 -
117 - data = xml.read_input(utils.open_txt_file(os.path.join(settings.XML_FILE_LOCATION,
118 - file), 'r', encoding=settings.ENCODING))
119 -
120 - for raw_data in data:
121 - xml_buffer = cStringIO.StringIO()
122 - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
123 - raw_data = ''.join(raw_data)
124 - raw_data = raw_data.encode('utf-8')
125 - xml_buffer.write(raw_data)
126 -
127 - try:
128 - xml_nodes = cElementTree.XML(xml_buffer.getvalue())
129 - revisions = xml_nodes.findall('revision')
130 - for revision in revisions:
131 - contributor = xml.retrieve_xml_node(revision, 'contributor')
132 - username = contributor.find('username')
133 - if username == None:
134 - continue
135 - username = xml.extract_text(username)
136 - #print username.encode('utf-8')
137 -
138 - if username in bots:
139 - id = contributor.find('id')
140 - id = xml.extract_text(id)
141 - #print username.encode('utf-8'), id
142 - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING)
143 - bots.pop(username)
144 - if bots == {}:
145 - print 'Mission accomplished'
146 - return
147 - except Exception, error:
148 - print error
149 - if settings.DEBUG:
150 - messages = utils.track_errors(xml_buffer, error, file,
151 - messages)
152 -
153 - if settings.DEBUG:
154 - utils.report_error_messages(messages, lookup_username)
155 -
156 -
157 -def add_id_to_botnames():
158 - '''
159 - This is the worker function for the multi-process version of
160 - lookup_username. First, the names of the bots are retrieved, then the
161 - multiprocess is launched by making a call to pc.build_scaffolding. This is a
162 - generic launcher that takes as input the function to load the input_queue,
163 - the function that will do the main work and the objects to be put in the
164 - input_queue. The launcher also accepts optional keyword arguments.
165 - '''
166 - cursor = create_bots_db(':memory')
167 - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
168 -
169 - botnames = retrieve_botnames_without_id(cursor, 'en')
170 - bots = {}
171 - for botname in botnames:
172 - bots[botname[0]] = 1
173 - pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots)
174 - cursor.close()
175 -
176 -
177 -def debug_lookup_username():
178 - '''
179 - This function launches the lookup_username function but then single
180 - threaded, this eases debugging. That's also the reason why the queue
181 - parameters are set to None. When launching this function make sure that
182 - debug=False when calling lookup_username
183 - '''
184 - cursor = create_bots_db(':memory')
185 - botnames = retrieve_botnames_without_id(cursor, 'en')
186 - bots = {}
187 - for botname in botnames:
188 - bots[botname[0]] = 1
189 -
190 - lookup_username('12.xml', None, None, bots, debug=True)
191 - cursor.close()
192 -
193 -
194 -if __name__ == '__main__':
195 - #debug()
196 - #add_id_to_botnames()
197 - create_bot_ids_db_mongo()
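
create_bot_ids_db_mongo() above writes the id-to-name mapping into the 'ids' collection of a 'bots' database. A sketch of the same writes against pymongo's old (pre-2.x, matching this commit's era) API directly, without the db.init_mongo_db() wrapper; the two bot entries are illustrative:

    import pymongo

    connection = pymongo.Connection()  # pymongo 1.x API; later MongoClient()
    collection = connection['bots']['ids']
    collection.remove({})              # drop documents from earlier runs
    for bot_id, name in {'4936590': 'SmackBot', '9163984': 'ClueBot'}.items():
        collection.insert({'id': bot_id, 'name': name})
    print(collection.count())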
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -1,254 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-10-21'
19 -__version__ = '0.1'
20 -
21 -from multiprocessing import Queue
22 -from Queue import Empty
23 -import datetime
24 -from dateutil.relativedelta import relativedelta
25 -
26 -import progressbar
27 -
28 -import settings
29 -from utils import models, utils
30 -from database import db
31 -from utils import process_constructor as pc
32 -
33 -try:
34 - import psyco
35 - psyco.full()
36 -except ImportError:
37 - pass
38 -
39 -
40 -def retrieve_editor_ids_mongo(dbname, collection):
41 - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
42 - 'editors.bin'):
43 - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
44 - 'editors.bin')
45 - else:
46 - mongo = db.init_mongo_db(dbname)
47 - editors = mongo[collection]
48 - ids = editors.distinct('editor')
49 - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, 'editors.bin')
50 - return ids
51 -
52 -
53 -def expand_edits(edits):
54 - data = []
55 - for edit in edits:
56 - data.append(edit['date'])
57 - return data
58 -
59 -
60 -def expand_observations(obs, vars_to_expand):
61 - for var in vars_to_expand:
62 - if var == 'edits':
63 - obs[var] = expand_edits(obs[var])
64 - elif var == 'edits_by_year':
65 - keys = obs[var].keys()
66 - keys.sort()
67 - edits = []
68 - for key in keys:
69 - edits.append(str(obs[var][key]))
70 - obs[var] = edits
71 - return obs
72 -
73 -def write_longitudinal_data(id, edits, fh):
74 - years = edits.keys()
75 - years.sort()
76 - for year in years:
77 - months = edits[year].keys()
78 - months = [int(m) for m in months]
79 - months.sort()
80 - for m in months:
81 - date = datetime.date(int(year), int(m), 1)
82 - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)]))
83 -
84 -
85 -def expand_headers(headers, vars_to_expand, obs):
86 - for var in vars_to_expand:
87 - l = len(obs[var])
88 - pos = headers.index(var)
89 - for i in xrange(l):
90 - if var.endswith('year'):
91 - suffix = 2001 + i
92 - elif var.endswith('edits'):
93 - suffix = 1 + i
94 - headers.insert(pos + i, '%s_%s' % (var, suffix))
95 - headers.remove(var)
96 - return headers
97 -
98 -
99 -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs):
100 - debug = kwargs.pop('debug')
101 - dbname = kwargs.pop('dbname')
102 - mongo = db.init_mongo_db(dbname)
103 - editors = mongo['dataset']
104 - name = dbname + '_long_editors.csv'
105 - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING)
106 - x = 0
107 - vars_to_expand = []
108 - while True:
109 - try:
110 - id = input_queue.get(block=False)
111 - obs = editors.find_one({'editor': id}, {'monthly_edits': 1})
112 - if x == 0:
113 - headers = obs.keys()
114 - headers.sort()
115 - headers = expand_headers(headers, vars_to_expand, obs)
116 - utils.write_list_to_csv(headers, fh)
117 - write_longitudinal_data(id, obs['monthly_edits'], fh)
118 - #utils.write_list_to_csv(data, fh)
119 - x += 1
120 - except Empty:
121 - break
122 -
123 -
124 -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs):
125 - dbname = kwargs.get('dbname')
126 - pbar = kwargs.get('pbar')
127 - mongo = db.init_mongo_db(dbname)
128 - editors = mongo['dataset']
129 - year = datetime.datetime.now().year + 1
130 - begin = year - 2001
131 - p = [3, 6, 9]
132 - periods = [y * 12 for y in xrange(1, begin)]
133 - periods = p + periods
134 - data = {}
135 - while True:
136 - try:
137 - id = input_queue.get(block=False)
138 - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
139 - first_edit = obs['first_edit']
140 - last_edit = obs['final_edit']
141 - for y in xrange(2001, year):
142 - if y == 2010 and first_edit > datetime.datetime(2010, 1, 1):
143 - print 'debug'
144 - if y not in data:
145 - data[y] = {}
146 - data[y]['n'] = 0
147 - window_end = datetime.datetime(y, 12, 31)
148 - if window_end > datetime.datetime.now():
149 - now = datetime.datetime.now()
150 - m = now.month - 1 #Dump files are always lagging at least one month....
151 - d = now.day
152 - window_end = datetime.datetime(y, m, d)
153 - edits = []
154 - for period in periods:
155 - if period not in data[y]:
156 - data[y][period] = 0
157 - window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period)
158 - if window_start < datetime.datetime(2001, 1, 1):
159 - window_start = datetime.datetime(2001, 1, 1)
160 - if date_falls_in_window(window_start, window_end, first_edit, last_edit):
161 - edits.append(period)
162 - if edits != []:
163 - p = min(edits)
164 - data[y]['n'] += 1
165 - data[y][p] += 1
166 - #pbar.update(+1)
167 - except Empty:
168 - break
169 - utils.store_object(data, settings.BINARY_OBJECT_FILE_LOCATION, 'cohort_data')
170 -
171 -def date_falls_in_window(window_start, window_end, first_edit, last_edit):
172 - if first_edit >= window_start and first_edit <= window_end:
173 - return True
174 - else:
175 - return False
176 -
177 -
178 -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs):
179 - dbname = kwargs.pop('dbname')
180 - mongo = db.init_mongo_db(dbname)
181 - editors = mongo['dataset']
182 - name = dbname + '_wide_editors.csv'
183 - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING)
184 - x = 0
185 - vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year']
186 - while True:
187 - try:
188 - if kwargs.get('debug', False):
189 - id = u'99797'
190 - else:
191 - id = input_queue.get(block=False)
192 - print input_queue.qsize()
193 - obs = editors.find_one({'editor': id})
194 - obs = expand_observations(obs, vars_to_expand)
195 - if x == 0:
196 - headers = obs.keys()
197 - headers.sort()
198 - headers = expand_headers(headers, vars_to_expand, obs)
199 - utils.write_list_to_csv(headers, fh)
200 - data = []
201 - keys = obs.keys()
202 - keys.sort()
203 - for key in keys:
204 - data.append(obs[key])
205 - utils.write_list_to_csv(data, fh)
206 -
207 - x += 1
208 - except Empty:
209 - break
210 - fh.close()
211 -
212 -
213 -def retrieve_edits_by_contributor_launcher():
214 - pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
215 -
216 -
217 -def debug_retrieve_edits_by_contributor_launcher(dbname):
218 - kwargs = {'debug': False,
219 - 'dbname': dbname,
220 - }
221 - ids = retrieve_editor_ids_mongo(dbname, 'editors')
222 - input_queue = pc.load_queue(ids)
223 - q = Queue()
224 - generate_wide_editor_dataset(input_queue, q, False, **kwargs)
225 -
226 -
227 -def generate_editor_dataset_launcher(dbname):
228 - kwargs = {'nr_input_processors': 1,
229 - 'nr_output_processors': 1,
230 - 'debug': False,
231 - 'dbname': dbname,
232 - 'poison_pill':False,
233 - 'pbar': True
234 - }
235 - ids = retrieve_editor_ids_mongo(dbname, 'editors')
236 - ids = list(ids)
237 - chunks = dict({0: ids})
238 - pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs)
239 -
240 -
241 -def generate_editor_dataset_debug(dbname):
242 - ids = retrieve_editor_ids_mongo(dbname, 'editors')
243 - input_queue = pc.load_queue(ids)
244 - kwargs = {'nr_input_processors': 1,
245 - 'nr_output_processors': 1,
246 - 'debug': True,
247 - 'dbname': dbname,
248 - }
249 - generate_wide_editor_dataset(input_queue, False, False, **kwargs)
250 -
251 -
252 -if __name__ == '__main__':
253 - #generate_editor_dataset_debug('test')
254 - generate_editor_dataset_launcher('enwiki')
255 - #debug_retrieve_edits_by_contributor_launcher()
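
A worked sketch of the window arithmetic in generate_cohort_analysis() above: each period of N months defines a window ending 31 December of the cohort year, and an editor counts toward the smallest period whose window contains the first edit. The dates below are illustrative:

    import datetime
    from dateutil.relativedelta import relativedelta

    year = 2008
    window_end = datetime.datetime(year, 12, 31)
    first_edit = datetime.datetime(2008, 6, 15)

    matching = []
    for period in [3, 6, 9, 12, 24]:  # months of experience
        window_start = window_end - relativedelta(months=period)
        if window_start <= first_edit <= window_end:
            matching.append(period)
    print(min(matching))  # 9: the first edit falls outside the 3- and 6-month
                          # windows but inside the 9-month one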
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do
@@ -0,0 +1,25 @@
 2+label var months_3 "3 Months"
 3+label var months_6 "6 Months"
 4+label var months_9 "9 Months"
 5+label var months_12 "1 Year"
 6+label var months_24 "2 Years"
 7+label var months_36 "3 Years"
 8+label var months_48 "4 Years"
 9+label var months_60 "5 Years"
 10+label var months_72 "6 Years"
 11+label var months_84 "7 Years"
 12+label var months_96 "8 Years"
 13+label var months_108 "9 Years"
 14+generate one_year_exp = months_3 + months_6 + months_9 + months_12
 15+
 16+generate fewer_one_year_abs = (one_year_exp/100) * n
 17+generate more_one_year_abs = n - fewer_one_year_abs
 18+label var fewer_one_year_abs "Editors with less than one year experience"
 19+label var more_one_year_abs "Editors with more than one year experience"
 20+
 21+graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall))
 22+
 23+twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall))
 24+
 25+
 26+graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1))
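
The derived variables above reduce the cohort percentages to a one-year-experience share and its absolute split of n. The same arithmetic in Python, on an illustrative row:

    row = {'n': 1000.0, 'months_3': 10.0, 'months_6': 8.0,
           'months_9': 7.0, 'months_12': 15.0}  # illustrative percentages
    one_year_exp = row['months_3'] + row['months_6'] + row['months_9'] + row['months_12']
    fewer_one_year_abs = (one_year_exp / 100) * row['n']
    more_one_year_abs = row['n'] - fewer_one_year_abs
    print(one_year_exp, fewer_one_year_abs, more_one_year_abs)  # 40.0 400.0 600.0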
Index: trunk/tools/editor_trends/statistics/stata/wiki.do
@@ -1,4 +1,5 @@
2 -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit"
 2+insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv"
 3+local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit"
34
45 foreach edit of local first_ten {
56 gen date2 = date(`edit', "YMDhms")
@@ -8,6 +9,7 @@
910 }
1011
1112 generate year_left = year(final_edit)
 13+generate year_joined = year(first_edit)
1214 sort year_joined
1315 by year_joined: gen community_size_t = _N
1416
@@ -23,12 +25,6 @@
2426 gen retention200`t' = community_size_200`t1' / community_size_200`t'
2527 }
2628
27 -
28 -
29 -
30 -
31 -
32 -
3329 generate time_to_new_wp = edits_10 - edits_1
3430 generate active_time_wp = final_edit - edits_10
3531 label time_to_new_wp "Number of days it took to become a new wikipedian"
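
The retention loop in this do-file divides the following year's community size by the current year's. The same ratio in Python, with illustrative sizes:

    community_size = {2004: 1000, 2005: 1800, 2006: 2500}
    retention = dict((t, community_size[t + 1] / float(community_size[t]))
                     for t in (2004, 2005))
    print(retention)  # {2004: 1.8, 2005: 1.3888...}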
Property changes on: trunk/tools/editor_trends/datasets
___________________________________________________________________
Added: svn:ignore
   + cohort_data.txt
cohorts.dta
difference observations erik vs diederik.ods
difference observations erik vs diederik.xlsx
editors.dta
enwiki_editors.csv
enwiki_long_editors.csv
enwiki_wide_editors.csv
Property changes on: trunk/tools/editor_trends/documentation
___________________________________________________________________
Added: svn:ignore
   + language_codes.xlsx
Property changes on: trunk/tools/editor_trends/errors
___________________________________________________________________
Modified: svn:ignore
   - *.bin
   + *.bin
split_xml
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Added: svn:default-eol-style
   + native

Status & tagging log