Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -1,171 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-02' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -from multiprocessing import Queue |
22 | | -from Queue import Empty |
23 | | -from operator import itemgetter |
24 | | -import datetime |
25 | | - |
26 | | -import settings |
27 | | -from database import db |
28 | | -from utils import process_constructor as pc |
29 | | -from utils import utils |
30 | | -import construct_datasets |
31 | | - |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -def create_datacontainer(init_value=0): |
41 | | - ''' |
42 | | - This function initializes a dictionary keyed by year (from 2001 through |
43 | | - the current year) with every value set to @init_value. In most cases this |
44 | | - will be zero, so the dictionary acts as a running tally for a variable, |
45 | | - but @init_value can also be a list [], a dictionary {} or a set set(). |
46 | | - ''' |
47 | | - data = {} |
48 | | - year = datetime.datetime.now().year + 1 |
49 | | - for x in xrange(2001, year): |
50 | | - data[str(x)] = init_value |
51 | | - return data |
52 | | - |
53 | | - |
54 | | -def add_months_to_datacontainer(datacontainer): |
55 | | - for dc in datacontainer: |
56 | | - datacontainer[dc] = {} |
57 | | - for x in xrange(1, 13): |
58 | | - datacontainer[dc][str(x)] = 0 |
59 | | - return datacontainer |
60 | | - |
61 | | - |
62 | | -def determine_edits_by_month(edits): |
63 | | - datacontainer = create_datacontainer(init_value=0) |
64 | | - datacontainer = add_months_to_datacontainer(datacontainer) |
65 | | - for year in edits: |
66 | | - months = set() |
67 | | - for edit in edits[year]: |
68 | | - m = str(edit['date'].month) |
69 | | - if m not in months: |
70 | | - datacontainer[year][m] = 1 |
71 | | - months.add(m) |
72 | | - if len(months) == 12: |
73 | | - break |
74 | | - return datacontainer |
75 | | - |
76 | | - |
77 | | -def determine_edits_by_year(dates): |
78 | | - ''' |
79 | | - This function counts the number of edits by year made by a particular editor. |
80 | | - ''' |
81 | | - edits = create_datacontainer() |
82 | | - for date in dates: |
83 | | - year = str(date['date'].year) |
84 | | - edits[year] += 1 |
85 | | - return edits |
86 | | - |
87 | | - |
88 | | -def determine_articles_by_year(dates): |
89 | | - ''' |
90 | | - This function counts the number of unique articles by year edited by a |
91 | | - particular editor. |
92 | | - ''' |
93 | | - articles = create_datacontainer(set()) |
94 | | - for date in dates: |
95 | | - year = str(date['date'].year) |
96 | | - articles[year].add(date['article']) |
97 | | - for article in articles: |
98 | | - articles[article] = len(articles[article]) |
99 | | - return articles |
100 | | - |
101 | | - |
102 | | -def sort_edits(edits): |
103 | | - edits = utils.merge_list(edits) |
104 | | - return sorted(edits, key=itemgetter('date')) |
105 | | - |
106 | | - |
107 | | -def optimize_editors(input_queue, result_queue, pbar, **kwargs): |
108 | | - dbname = kwargs.pop('dbname') |
109 | | - mongo = db.init_mongo_db(dbname) |
110 | | - input = mongo['test'] |
111 | | - output = mongo['dataset'] |
112 | | - output.ensure_index('editor') |
113 | | - output.ensure_index('year_joined') |
114 | | - definition = kwargs.pop('definition') |
115 | | - while True: |
116 | | - try: |
117 | | - id = input_queue.get(block=False) |
118 | | - editor = input.find_one({'editor': id}) |
119 | | - if editor == None: |
120 | | - continue |
121 | | - edits = editor['edits'] |
122 | | - monthly_edits = determine_edits_by_month(edits) |
123 | | - edits = sort_edits(edits) |
124 | | - edit_count = len(edits) |
125 | | - new_wikipedian = edits[9]['date'] |
126 | | - first_edit = edits[0]['date'] |
127 | | - final_edit = edits[-1]['date'] |
128 | | - edits_by_year = determine_edits_by_year(edits) |
129 | | - articles_by_year = determine_articles_by_year(edits) |
130 | | - |
131 | | - edits = edits[:10] |
132 | | - |
133 | | - output.insert({'editor': id, 'edits': edits, |
134 | | - 'edits_by_year': edits_by_year, |
135 | | - 'new_wikipedian': new_wikipedian, |
136 | | - 'edit_count': edit_count, |
137 | | - 'final_edit': final_edit, |
138 | | - 'first_edit': first_edit, |
139 | | - 'articles_by_year': articles_by_year, |
140 | | - 'monthly_edits': monthly_edits}) |
141 | | - print 'Items left: %s' % input_queue.qsize() |
142 | | - except Empty: |
143 | | - break |
144 | | - |
145 | | - |
146 | | -def run_optimize_editors(dbname): |
147 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
148 | | - kwargs = {'definition': 'traditional', |
149 | | - 'pbar': True, |
150 | | - 'dbname': 'enwiki', |
151 | | - 'nr_input_processors': 1, |
152 | | - 'nr_output_processors': 0, |
153 | | - 'poison_pill': False |
154 | | - } |
155 | | - print len(ids) |
156 | | - ids = list(ids) |
157 | | - chunks = {0: ids} |
158 | | - pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) |
159 | | - |
160 | | - |
161 | | -def debug_optimize_editors(dbname): |
162 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
163 | | - q = pc.load_queue(ids) |
164 | | - kwargs = {'definition': 'traditional', |
165 | | - 'dbname': dbname |
166 | | - } |
167 | | - optimize_editors(q, False, True, **kwargs) |
168 | | - |
169 | | - |
170 | | -if __name__ == '__main__': |
171 | | - #debug_optimize_editors('test') |
172 | | - run_optimize_editors('enwiki') |
\ No newline at end of file |
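For reference, the per-editor aggregation performed by optimize_editors() above boils down to a handful of steps: sort the edits by date, take the date of the 10th edit as the 'new wikipedian' cut-off, and tally edits and distinct articles per year. The following standalone Python sketch is not part of the removed module; it assumes a non-empty, flat list of edit dicts with 'date' and 'article' keys rather than the year-keyed structure coming out of the editor cache.

def summarize_editor(edits):
    '''Derive the summary fields that optimize_editors() stores per editor.'''
    edits = sorted(edits, key=lambda e: e['date'])
    edits_by_year, articles_by_year = {}, {}
    for edit in edits:
        year = str(edit['date'].year)
        edits_by_year[year] = edits_by_year.get(year, 0) + 1
        articles_by_year.setdefault(year, set()).add(edit['article'])
    return {'edit_count': len(edits),
            'first_edit': edits[0]['date'],
            'final_edit': edits[-1]['date'],
            # the 10th edit marks the transition to 'new wikipedian'
            'new_wikipedian': edits[9]['date'] if len(edits) >= 10 else None,
            'edits_by_year': edits_by_year,
            'articles_by_year': dict((y, len(a)) for y, a in articles_by_year.items())}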
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -1,341 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -#Default Python libraries (Python => 2.6) |
22 | | -import sys |
23 | | -import os |
24 | | -import time |
25 | | -import datetime |
26 | | -import codecs |
27 | | -import math |
28 | | -import cStringIO |
29 | | -import re |
30 | | -from operator import itemgetter |
31 | | -import xml.etree.cElementTree as cElementTree |
32 | | -from multiprocessing import Queue, JoinableQueue |
33 | | -from Queue import Empty |
34 | | -import pymongo |
35 | | - |
36 | | -# Custom written files |
37 | | -import settings |
38 | | -from utils import utils, models |
39 | | -from database import db_settings |
40 | | -from database import db |
41 | | -from database import cache |
42 | | -from wikitree import xml |
43 | | -from statistics import dataset |
44 | | -from utils import process_constructor as pc |
45 | | - |
46 | | - |
47 | | -try: |
48 | | - import psyco |
49 | | - psyco.full() |
50 | | -except ImportError: |
51 | | - pass |
52 | | - |
53 | | - |
54 | | -def determine_username_is_bot(username, kwargs): |
55 | | - ''' |
56 | | - @username is the xml element containing the id of the user |
57 | | - @kwargs should have a list with all the bot ids |
58 | | - |
59 | | - @Return 1 if the username id is in the list of bot ids, 0 if it is |
60 | | - not. |
61 | | - ''' |
62 | | - ids = kwargs.get('bots', []) |
63 | | - if ids == None: |
64 | | - ids = [] |
65 | | - if username != None and username.text != None: |
66 | | - id = username.text |
67 | | - if id in ids: |
68 | | - return 1 |
69 | | - else: |
70 | | - return 0 |
71 | | - |
72 | | - |
73 | | -def extract_contributor_id(contributor, kwargs): |
74 | | - ''' |
75 | | - @contributor is the xml contributor node containing a number of attributes |
76 | | - |
77 | | - Currently, we are only interested in registered contributors, hence we |
78 | | - ignore anonymous editors. If you are interested in collecting data on |
79 | | - anonymous editors then add the string 'ip' to the tags variable. |
80 | | - ''' |
81 | | - tags = ['id'] |
82 | | - if contributor.get('deleted'): |
83 | | - return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
84 | | - for elem in contributor: |
85 | | - if elem.tag in tags: |
86 | | - if elem.text != None: |
87 | | - return elem.text.decode('utf-8') |
88 | | - else: |
89 | | - return - 1 |
90 | | - |
91 | | - |
92 | | -def output_editor_information(elem, output, **kwargs): |
93 | | - ''' |
94 | | - @elem is an XML element containing 1 revision from a page |
95 | | - @output is where to store the data, either a queue or a filehandle |
96 | | - @**kwargs contains extra information |
97 | | - |
98 | | - The variable tags determines which attributes are being parsed; the values in |
99 | | - this dictionary are the functions used to extract the data. |
100 | | - ''' |
101 | | - tags = {'contributor': {'editor': extract_contributor_id, |
102 | | - 'bot': determine_username_is_bot}, |
103 | | - 'timestamp': {'date': xml.extract_text}, |
104 | | - } |
105 | | - vars = {} |
106 | | - headers = ['editor', 'date', 'article'] |
107 | | - destination = kwargs.pop('destination') |
108 | | - revisions = elem.findall('revision') |
109 | | - for revision in revisions: |
110 | | - vars['article'] = elem.find('id').text.decode(settings.ENCODING) |
111 | | - elements = revision.getchildren() |
112 | | - for tag, functions in tags.iteritems(): |
113 | | - xml_node = xml.retrieve_xml_node(elements, tag) |
114 | | - for var, function in functions.iteritems(): |
115 | | - vars[var] = function(xml_node, kwargs) |
116 | | - |
117 | | - #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
118 | | - if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
119 | | - vars.pop('bot') |
120 | | - if destination == 'queue': |
121 | | - output.put(vars) |
122 | | - vars['date'] = utils.convert_timestamp_to_date(vars['date']) |
123 | | - elif destination == 'file': |
124 | | - data = [] |
125 | | - for head in headers: |
126 | | - data.append(vars[head]) |
127 | | - utils.write_list_to_csv(data, output) |
128 | | - vars = {} |
129 | | - |
130 | | - |
131 | | -def parse_editors(xml_queue, data_queue, **kwargs): |
132 | | - ''' |
133 | | - @xml_queue contains the filenames of the files to be parsed |
134 | | - @data_queue is an instance of Queue where the extracted data is stored for |
135 | | - further processing |
136 | | - @pbar is an instance of progressbar to display the progress |
137 | | - @bots is a list of ids of known Wikipedia bots |
138 | | - @debug is a flag to indicate whether the function is called for debugging. |
139 | | - |
140 | | - Output is the data_queue that will be used by store_editors() |
141 | | - ''' |
142 | | - input = kwargs.get('input', None) |
143 | | - output = kwargs.get('output', None) |
144 | | - debug = kwargs.get('debug', False) |
145 | | - destination = kwargs.get('destination', 'file') |
146 | | - bots = kwargs.get('bots', None) |
147 | | - pbar = kwargs.get('pbar', None) |
148 | | - if settings.DEBUG: |
149 | | - messages = {} |
150 | | - vars = {} |
151 | | - |
152 | | - while True: |
153 | | - try: |
154 | | - if debug: |
155 | | - file = xml_queue |
156 | | - else: |
157 | | - file = xml_queue.get(block=False) |
158 | | - if file == None: |
159 | | - print 'Swallowed a poison pill' |
160 | | - break |
161 | | - |
162 | | - data = xml.read_input(utils.create_txt_filehandle(input, |
163 | | - file, 'r', |
164 | | - encoding=settings.ENCODING)) |
165 | | - if destination == 'file': |
166 | | - name = file[:-4] + '.txt' |
167 | | - fh = utils.create_txt_filehandle(output, name, 'w', settings.ENCODING) |
168 | | - for raw_data in data: |
169 | | - xml_buffer = cStringIO.StringIO() |
170 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
171 | | - |
172 | | - try: |
173 | | - raw_data = ''.join(raw_data) |
174 | | - xml_buffer.write(raw_data) |
175 | | - elem = cElementTree.XML(xml_buffer.getvalue()) |
176 | | - output_editor_information(elem, fh, bots=bots, destination=destination) |
177 | | - except SyntaxError, error: |
178 | | - print error |
179 | | - ''' |
180 | | - There are few cases with invalid tokens, they are fixed |
181 | | - here and then reinserted into the XML DOM |
182 | | - data = convert_html_entities(xml_buffer.getvalue()) |
183 | | - elem = cElementTree.XML(data) |
184 | | - output_editor_information(elem) |
185 | | - ''' |
186 | | - if settings.DEBUG: |
187 | | - utils.track_errors(xml_buffer, error, file, messages) |
188 | | - except UnicodeEncodeError, error: |
189 | | - print error |
190 | | - if settings.DEBUG: |
191 | | - utils.track_errors(xml_buffer, error, file, messages) |
192 | | - except MemoryError, error: |
193 | | - print file, error |
194 | | - print raw_data[:12] |
195 | | - print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
196 | | - if destination == 'queue': |
197 | | - output.put('NEXT') |
198 | | - while True: |
199 | | - if output.qsize() < 100000: |
200 | | - break |
201 | | - else: |
202 | | - time.sleep(10) |
203 | | - print 'Still sleeping, queue is %s items long' % output.qsize() |
204 | | - |
205 | | - else: |
206 | | - fh.close() |
207 | | - |
208 | | - if pbar: |
209 | | - print file, xml_queue.qsize() |
210 | | - #utils.update_progressbar(pbar, xml_queue) |
211 | | - |
212 | | - if debug: |
213 | | - break |
214 | | - |
215 | | - except Empty: |
216 | | - break |
217 | | - |
218 | | - if destination == 'queue': |
219 | | - data_queue.put(None) |
220 | | - |
221 | | - if settings.DEBUG: |
222 | | - utils.report_error_messages(messages, parse_editors) |
223 | | - |
224 | | - |
225 | | -def store_editors(data_queue, **kwargs): |
226 | | - ''' |
227 | | - @data_queue is an instance of Queue containing information extracted by |
228 | | - parse_editors() |
229 | | - @pids is a list of PIDs used to check if other processes are finished |
230 | | - running |
231 | | - @dbname is the name of the MongoDB collection where to store the information. |
232 | | - ''' |
233 | | - dbname = kwargs.get('dbname', None) |
234 | | - mongo = db.init_mongo_db(dbname) |
235 | | - collection = mongo['editors'] |
236 | | - collection.ensure_index('editor') |
237 | | - editor_cache = cache.EditorCache(collection) |
238 | | - |
239 | | - while True: |
240 | | - try: |
241 | | - edit = data_queue.get(block=False) |
242 | | - data_queue.task_done() |
243 | | - if edit == None: |
244 | | - print 'Swallowing poison pill' |
245 | | - break |
246 | | - elif edit == 'NEXT': |
247 | | - editor_cache.add('NEXT', '') |
248 | | - else: |
249 | | - contributor = edit['editor'] |
250 | | - value = {'date': edit['date'], 'article': edit['article']} |
251 | | - editor_cache.add(contributor, value) |
252 | | - #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
253 | | - #'$inc': {'edit_count': 1}, |
254 | | - |
255 | | - except Empty: |
256 | | - ''' |
257 | | - This checks whether the Queue is empty because the preprocessors are |
258 | | - finished or because this function is faster in emptying the Queue |
259 | | - than the preprocessors are able to fill it. If the preprocessors |
260 | | - are finished and this Queue is empty then break, else wait for the |
261 | | - Queue to fill. |
262 | | - ''' |
263 | | - pass |
264 | | - |
265 | | - print 'Emptying entire cache.' |
266 | | - editor_cache.store() |
267 | | - print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
268 | | - |
269 | | - |
270 | | -def load_cache_objects(): |
271 | | - cache = {} |
272 | | - files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin') |
273 | | - for x, file in enumerate(files): |
274 | | - cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file) |
275 | | - return cache |
276 | | - |
277 | | - |
278 | | -def search_cache_for_missed_editors(dbname): |
279 | | - mongo = db.init_mongo_db(dbname) |
280 | | - collection = mongo['editors'] |
281 | | - editor_cache = cache.EditorCache(collection) |
282 | | - cache = load_cache_objects() |
283 | | - for c in cache: |
284 | | - for editor in cache[c]: |
285 | | - editor_cache.add(editor, cache[c][editor]) |
286 | | - cache[c] = {} |
287 | | - editor_cache.add('NEXT', '') |
288 | | - cache = {} |
289 | | - |
290 | | - |
291 | | - |
292 | | -def load_bot_ids(): |
293 | | - ''' |
294 | | - Loader function to retrieve the list of ids of known Wikipedia bots. |
295 | | - ''' |
296 | | - ids = {} |
297 | | - mongo = db.init_mongo_db('bots') |
298 | | - bots = mongo['ids'] |
299 | | - cursor = bots.find() |
300 | | - for bot in cursor: |
301 | | - ids[bot['id']] = bot['name'] |
302 | | - return ids |
303 | | - |
304 | | - |
305 | | -def run_parse_editors(location, language, project): |
306 | | - ids = load_bot_ids() |
307 | | - input = os.path.join(location, language, project) |
308 | | - output = os.path.join(input, 'txt') |
309 | | - |
310 | | - kwargs = {'bots': ids, |
311 | | - 'dbname': language + project, |
312 | | - 'language': language, |
313 | | - 'project': project, |
314 | | - 'pbar': True, |
315 | | - 'destination': 'file', |
316 | | - 'nr_input_processors': settings.NUMBER_OF_PROCESSES, |
317 | | - 'nr_output_processors': settings.NUMBER_OF_PROCESSES, |
318 | | - 'input': input, |
319 | | - 'output': output, |
320 | | - } |
321 | | - source = os.path.join(location, language, project) |
322 | | - files = utils.retrieve_file_list(source, 'xml') |
323 | | - |
324 | | - if not os.path.exists(input): |
325 | | - utils.create_directory(input) |
326 | | - if not os.path.exists(output): |
327 | | - utils.create_directory(output) |
328 | | - |
329 | | - chunks = utils.split_list(files , settings.NUMBER_OF_PROCESSES) |
330 | | - pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
331 | | - |
332 | | - |
333 | | -def debug_parse_editors(dbname): |
334 | | - q = JoinableQueue() |
335 | | - parse_editors('522.xml', q, debug=True, destination='file') |
336 | | - store_editors(q, dbname=dbname) |
337 | | - |
338 | | - |
339 | | -if __name__ == "__main__": |
340 | | - #debug_parse_editors('test2') |
341 | | - run_parse_editors(settings.XML_FILE_LOCATION, 'en', 'wiki') |
342 | | - pass |
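The core of output_editor_information() is a walk over the revisions of one <page> element, keeping only registered, non-bot contributors. Below is a self-contained sketch of that walk; it is simplified and not project code, and the inline PAGE fragment is made up purely for illustration.

import xml.etree.cElementTree as cElementTree

PAGE = ('<page><id>12</id>'
        '<revision><timestamp>2001-01-21T02:12:21Z</timestamp>'
        '<contributor><username>Example</username><id>99</id></contributor>'
        '</revision></page>')

def extract_revisions(fragment, bot_ids):
    '''Yield {article, editor, date} dicts, skipping bots and deleted accounts.'''
    page = cElementTree.XML(fragment)
    article = page.find('id').text
    for revision in page.findall('revision'):
        contributor = revision.find('contributor')
        if contributor is None or contributor.get('deleted'):
            continue
        editor = contributor.find('id')
        if editor is None or editor.text in bot_ids:
            continue
        yield {'article': article,
               'editor': editor.text,
               'date': revision.find('timestamp').text}

for row in extract_revisions(PAGE, bot_ids=set(['42'])):
    print row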
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -1,158 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -''' |
22 | | -This file contains settings that are used for constructing and analyzing |
23 | | -the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
24 | | -''' |
25 | | - |
26 | | -from multiprocessing import cpu_count |
27 | | -import os |
28 | | -import sys |
29 | | -import platform |
30 | | - |
31 | | -try: |
32 | | - from pywin import win32file |
33 | | - '''increase the maximum number of open files on Windows to 1024''' |
34 | | - win32file._setmaxstdio(1024) |
35 | | -except ImportError: |
36 | | - pass |
37 | | - |
38 | | -try: |
39 | | - import resource |
40 | | -except ImportError: |
41 | | - pass |
42 | | - |
43 | | -#Setting up the environment |
44 | | -ops = {platform.win32_ver: 'Windows', |
45 | | - platform.linux_distribution: 'Linux', |
46 | | - platform.mac_ver: 'OSX'} |
47 | | - |
48 | | -for op in ops: |
49 | | - if op() != ('', '', '') and op() != ('', ('', '', ''), ''): |
50 | | - OS = ops[op] |
51 | | - |
52 | | -ARCH = platform.machine() |
53 | | - |
54 | | -WORKING_DIRECTORY = os.getcwd() |
55 | | -IGNORE_DIRS = ['wikistats', 'zips'] |
56 | | -ROOT = '/' if OS != 'Windows' else 'c:\\' |
57 | | - |
58 | | -MINIMUM_PYTHON_VERSION = (2, 6) |
59 | | - |
60 | | -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
61 | | - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
62 | | -for subdirname in dirs: |
63 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
64 | | - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
65 | | - |
66 | | -WINDOWS_ZIP = ['7z.exe'] |
67 | | - |
68 | | -OSX_ZIP = [] |
69 | | - |
70 | | -LINUX_ZIP = [] |
71 | | -#General settings |
72 | | - |
73 | | -# Valid values are 'stand_alone' and 'hadoop' |
74 | | -RUN_MODE = 'stand_alone' |
75 | | - |
76 | | -# If true then some more detailed debug information is collected |
77 | | -DEBUG = True |
78 | | - |
79 | | -#If True then it will display a progress bar on the console. |
80 | | -PROGRESS_BAR = True |
81 | | - |
82 | | -#Date format as used by Erik Zachte |
83 | | -DATE_FORMAT = '%Y-%m-%d' |
84 | | - |
85 | | -# Timestamp format as generated by the MediaWiki dumps |
86 | | -DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ' |
87 | | - |
88 | | -#This section contains configuration variables for the different file locations. |
89 | | - |
90 | | -# Location where to write xml chunks |
91 | | -XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia') |
92 | | - |
93 | | -# Input file |
94 | | -XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml') |
95 | | - |
96 | | -# This is the place where error messages are stored for debugging purposes |
97 | | -ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors') |
98 | | - |
99 | | -DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database') |
100 | | - |
101 | | -BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects') |
102 | | - |
103 | | -DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets') |
104 | | - |
105 | | -TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv') |
106 | | - |
107 | | -NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces') |
108 | | -#This section contains configuration variables for parsing / encoding and |
109 | | -#working with the XML files. |
110 | | - |
111 | | -# ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason |
112 | | -MAX_XML_FILE_SIZE = 67108864 |
113 | | - |
114 | | -if OS == 'Windows' and ARCH == 'i386': |
115 | | - MAX_FILES_OPEN = win32file._getmaxstdio() |
116 | | -elif OS != 'Windows': |
117 | | - MAX_FILES_OPEN = resource.getrlimit(resource.RLIMIT_NOFILE)[0] |
118 | | -else: |
119 | | - MAX_FILES_OPEN = 500 |
120 | | - |
121 | | -ENCODING = 'utf-8' |
122 | | - |
123 | | -# Name space, do not change as this works for Mediawiki wikis |
124 | | -NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
125 | | - |
126 | | -WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip', |
127 | | - } |
128 | | - |
129 | | -COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z'] |
130 | | - |
131 | | - |
132 | | -WIKIMEDIA_PROJECTS = {'commons': 'commonswiki', |
133 | | - 'wikibooks': 'wikibooks', |
134 | | - 'wikinews': 'wikinews', |
135 | | - 'wikiquote': 'wikiquote', |
136 | | - 'wikisource': 'wikisource', |
137 | | - 'wikiversity': 'wikiversity', |
138 | | - 'wiktionary': 'wiktionary', |
139 | | - 'metawiki': 'metawiki', |
140 | | - 'wikispecies': 'specieswiki', |
141 | | - 'incubator': 'incubatorwiki', |
142 | | - 'foundation': 'foundationwiki', |
143 | | - 'mediawiki': 'mediawikiwiki', |
144 | | - 'outreach': 'outreachwiki', |
145 | | - 'strategic planning': 'strategywiki', |
146 | | - 'usability initiative': 'usabilitywiki', |
147 | | - 'multilingual wikisource': None |
148 | | - } |
149 | | - |
150 | | -#Multiprocess settings used to parallelize workload |
151 | | -#Change this to match your computers configuration (RAM / CPU) |
152 | | -NUMBER_OF_PROCESSES = cpu_count() * 1 |
153 | | - |
154 | | -#Extensions of ascii files, this is used to determine the filemode to use |
155 | | -ASCII = ['txt', 'csv', 'xml', 'sql', 'json'] |
156 | | - |
157 | | -WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
158 | | - |
159 | | -MAX_CACHE_SIZE = 1024 * 1024 |
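The OS detection block above infers the platform from whichever platform.*_ver() call returns a non-empty result. A more direct equivalent, shown here only as a sketch and not as a drop-in replacement, maps platform.system() onto the same labels:

import platform

SYSTEMS = {'Windows': 'Windows', 'Linux': 'Linux', 'Darwin': 'OSX'}
OS = SYSTEMS.get(platform.system(), 'Unknown')
print OS, platform.machine()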
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -1,186 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import xml.etree.cElementTree as cElementTree |
22 | | -import codecs |
23 | | -import utils |
24 | | -import re |
25 | | -import json |
26 | | -import os |
27 | | - |
28 | | -import progressbar |
29 | | - |
30 | | -from utils import utils |
31 | | -import settings |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
41 | | - |
42 | | - |
43 | | -def remove_numeric_character_references(text): |
44 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
45 | | - |
46 | | - |
47 | | -def lenient_deccharref(m): |
48 | | - try: |
49 | | - return unichr(int(m.group(1))) |
50 | | - except ValueError: |
51 | | - ''' |
52 | | - There are a few articles that raise a Value Error here, the reason is |
53 | | - that I am using a narrow Python build (UCS2) instead of a wide build |
54 | | - (UCS4). The quick fix is to return an empty string... |
55 | | - Real solution is to rebuild Python with UCS4 support..... |
56 | | - ''' |
57 | | - return '' |
58 | | - |
59 | | - |
60 | | -def remove_namespace(element, namespace): |
61 | | - '''Remove namespace from the XML document.''' |
62 | | - ns = u'{%s}' % namespace |
63 | | - nsl = len(ns) |
64 | | - for elem in element.getiterator(): |
65 | | - if elem.tag.startswith(ns): |
66 | | - elem.tag = elem.tag[nsl:] |
67 | | - return element |
68 | | - |
69 | | - |
70 | | -def load_namespace(language): |
71 | | - file = '%s_ns.json' % language |
72 | | - fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING) |
73 | | - ns = json.load(fh) |
74 | | - fh.close() |
75 | | - ns = ns['query']['namespaces'] |
76 | | - return ns |
77 | | - |
78 | | - |
79 | | -def build_namespaces_locale(namespaces): |
80 | | - ''' |
81 | | - Construct a list of all the non-main namespaces |
82 | | - ''' |
83 | | - ns = [] |
84 | | - for namespace in namespaces: |
85 | | - value = namespaces[namespace].get(u'*', None) |
86 | | - if value != None and value != '': |
87 | | - ns.append(value) |
88 | | - return ns |
89 | | - |
90 | | - |
91 | | -def parse_comments(xml, function): |
92 | | - revisions = xml.findall('revision') |
93 | | - for revision in revisions: |
94 | | - comment = revision.find('comment') |
95 | | - timestamp = revision.find('timestamp').text |
96 | | - if comment != None and comment.text != None: |
97 | | - comment.text = function(comment.text) |
98 | | - return xml |
99 | | - |
100 | | - |
101 | | -def is_article_main_namespace(elem, namespace): |
102 | | - ''' |
103 | | - checks whether the article belongs to the main namespace |
104 | | - ''' |
105 | | - title = elem.find('title').text |
106 | | - for ns in namespace: |
107 | | - if title.startswith(ns): |
108 | | - return False |
109 | | - return True |
110 | | - |
111 | | - |
112 | | -def write_xml_file(element, fh, counter, language): |
113 | | - '''Get file handle and write xml element to file''' |
114 | | - size = len(cElementTree.tostring(element)) |
115 | | - fh, counter = create_xml_file_handle(fh, counter, size, language) |
116 | | - try: |
117 | | - fh.write(cElementTree.tostring(element)) |
118 | | - except MemoryError: |
119 | | - print 'Add error capturing logic' |
120 | | - fh.write('\n') |
121 | | - return fh, counter |
122 | | - |
123 | | - |
124 | | -def create_xml_file_handle(fh, counter, size, language): |
125 | | - '''Create file handle if none is supplied or if file size > max file size.''' |
126 | | - if not counter: |
127 | | - counter = 0 |
128 | | - path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter) |
129 | | - if not fh: |
130 | | - fh = codecs.open(path, 'w', encoding=settings.ENCODING) |
131 | | - return fh, counter |
132 | | - elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
133 | | - print 'Created chunk %s' % counter |
134 | | - fh.close() |
135 | | - counter += 1 |
136 | | - fh = codecs.open(os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter), 'w', encoding=settings.ENCODING) |
137 | | - return fh, counter |
138 | | - else: |
139 | | - return fh, counter |
140 | | - |
141 | | - |
142 | | -def split_xml(location, filename, project, language_code): |
143 | | - '''Reads xml file and splits it in N chunks''' |
144 | | - #location = os.path.join(settings.XML_FILE_LOCATION, language) |
145 | | - result = utils.check_file_exists(location, '') |
146 | | - if result == False: |
147 | | - result = utils.create_directory(location) |
148 | | - if not result: |
149 | | - return |
150 | | - |
151 | | - ns = load_namespace(language_code) |
152 | | - ns = build_namespaces_locale(ns) |
153 | | - |
154 | | - fh = None |
155 | | - counter = None |
156 | | - source = os.path.join(location, filename) |
157 | | - tag = '{%s}page' % settings.NAME_SPACE |
158 | | - |
159 | | - context = cElementTree.iterparse(source, events=('start', 'end')) |
160 | | - context = iter(context) |
161 | | - event, root = context.next() #get the root element of the XML doc |
162 | | - |
163 | | - try: |
164 | | - for event, elem in context: |
165 | | - if event == 'end': |
166 | | - if elem.tag == tag: |
167 | | - elem = remove_namespace(elem, settings.NAME_SPACE) |
168 | | - if is_article_main_namespace(elem, ns): |
169 | | - elem = parse_comments(elem, remove_numeric_character_references) |
170 | | - fh, counter = write_xml_file(elem, fh, counter, language_code) |
171 | | - root.clear() # when done parsing a section clear the tree to safe memory |
172 | | - #elem = parse_comments(elem, convert_html_entities) |
173 | | - #elem = parse_comments(elem, remove_ascii_control_characters) |
174 | | - #print cElementTree.tostring(elem) |
175 | | - except SyntaxError: |
176 | | - fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
177 | | - fh.write(cElementTree.tostring(elem)) |
178 | | - fh.close() |
179 | | - |
180 | | - |
181 | | -if __name__ == "__main__": |
182 | | - kwargs = {'location': 'c:\\Source_files\\', |
183 | | - 'filename': settings.XML_FILE, |
184 | | - 'project':'wiki', |
185 | | - 'language_code':'en' |
186 | | - } |
187 | | - split_xml(**kwargs) |
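split_xml() relies on cElementTree.iterparse to stream over <page> elements and clears the root element after each page so the full stub dump never has to fit in memory. A minimal sketch of that pattern follows; it is not project code, and the namespace default simply mirrors the NAME_SPACE setting above.

import xml.etree.cElementTree as cElementTree

MEDIAWIKI_NS = 'http://www.mediawiki.org/xml/export-0.4/'

def iterate_pages(source, namespace=MEDIAWIKI_NS):
    '''Stream <page> elements from a dump without loading the whole file.'''
    tag = '{%s}page' % namespace
    context = iter(cElementTree.iterparse(source, events=('start', 'end')))
    event, root = context.next()        # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear()                # free memory once a page is handled

# usage: for page in iterate_pages('enwiki-20100916-stub-meta-history.xml'): ...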
Index: trunk/tools/editor_trends/init_bot_db.py |
— | — | @@ -1,196 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | - |
18 | | -import os |
19 | | -import cStringIO |
20 | | -import xml.etree.cElementTree as cElementTree |
21 | | - |
22 | | - |
23 | | -import settings |
24 | | -from wikitree import xml |
25 | | -from database import db |
26 | | -from database import db_settings |
27 | | -from utils import utils |
28 | | -from utils import process_constructor as pc |
29 | | - |
30 | | -try: |
31 | | - import psyco |
32 | | - psyco.full() |
33 | | -except ImportError: |
34 | | - pass |
35 | | - |
36 | | - |
37 | | -def create_bot_ids_db_mongo(): |
38 | | - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.ENCODING) |
39 | | - mongo = db.init_mongo_db('bots') |
40 | | - collection = mongo['ids'] |
41 | | - |
42 | | - db.remove_documents_from_mongo_db(collection, None) |
43 | | - |
44 | | - for id, name in ids.iteritems(): |
45 | | - collection.insert({'id': id, 'name': name}) |
46 | | - |
47 | | - print collection.count() |
48 | | - |
49 | | - |
50 | | -def create_bots_db(db_name): |
51 | | - ''' |
52 | | - This function reads the csv file provided by Erik Zachte and constructs a |
53 | | - sqlite memory database. The reason for this is that I suspect I will need |
54 | | - some simple querying capabilities in the future, else a dictionary would |
55 | | - suffice. |
56 | | - ''' |
57 | | - connection = db.init_database(db_name) |
58 | | - #connection = db.init_database('data/database/bots.db') |
59 | | - cursor = connection.cursor() |
60 | | - db.create_tables(cursor, db_settings.BOT_TABLE) |
61 | | - values = [] |
62 | | - fields = [field[0] for field in db_settings.BOT_TABLE['bots']] |
63 | | - for line in utils.read_data_from_csv('data/csv/StatisticsBots.csv', settings.ENCODING): |
64 | | - line = line.split(',') |
65 | | - row = [] |
66 | | - for x, (field, value) in enumerate(zip(fields, line)): |
67 | | - if db_settings.BOT_TABLE['bots'][x][1] == 'INTEGER': |
68 | | - value = int(value) |
69 | | - elif db_settings.BOT_TABLE['bots'][x][1] == 'TEXT': |
70 | | - value = value.replace('/', '-') |
71 | | - #print field, value |
72 | | - row.append(value) |
73 | | - values.append(row) |
74 | | - |
75 | | - cursor.executemany('INSERT INTO bots VALUES (?,?,?,?,?,?,?,?,?,?);', values) |
76 | | - connection.commit() |
77 | | - if db_name == ':memory': |
78 | | - return cursor |
79 | | - else: |
80 | | - connection.close() |
81 | | - |
82 | | - |
83 | | -def retrieve_botnames_without_id(cursor, language): |
84 | | - return cursor.execute('SELECT name FROM bots WHERE language=?', (language,)).fetchall() |
85 | | - |
86 | | - |
87 | | -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False): |
88 | | - ''' |
89 | | - This function is used to find the ids belonging to the different bots that |
90 | | - are patrolling the Wikipedia sites. |
91 | | - @input_queue contains a list of xml files to parse |
92 | | - |
93 | | - @result_queue should be set to false as the results are directly written to |
94 | | - a csv file. |
95 | | - |
96 | | - @progressbar depends on settings |
97 | | - |
98 | | - @bots is a dictionary containing the names of the bots to lookup |
99 | | - ''' |
100 | | - |
101 | | - #if len(bots.keys()) == 1: |
102 | | - bots = bots['bots'] |
103 | | - #print bots.keys() |
104 | | - |
105 | | - if settings.DEBUG: |
106 | | - messages = {} |
107 | | - |
108 | | - while True: |
109 | | - if debug: |
110 | | - file = input_queue |
111 | | - else: |
112 | | - file = input_queue.get(block=False) |
113 | | - |
114 | | - if file == None: |
115 | | - break |
116 | | - |
117 | | - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION + |
118 | | - file, 'r', encoding=settings.ENCODING)) |
119 | | - |
120 | | - for raw_data in data: |
121 | | - xml_buffer = cStringIO.StringIO() |
122 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
123 | | - raw_data = ''.join(raw_data) |
124 | | - raw_data = raw_data.encode('utf-8') |
125 | | - xml_buffer.write(raw_data) |
126 | | - |
127 | | - try: |
128 | | - xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
129 | | - revisions = xml_nodes.findall('revision') |
130 | | - for revision in revisions: |
131 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
132 | | - username = contributor.find('username') |
133 | | - if username == None: |
134 | | - continue |
135 | | - username = xml.extract_text(username) |
136 | | - #print username.encode('utf-8') |
137 | | - |
138 | | - if username in bots: |
139 | | - id = contributor.find('id') |
140 | | - id = xml.extract_text(id) |
141 | | - #print username.encode('utf-8'), id |
142 | | - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.ENCODING) |
143 | | - bots.pop(username) |
144 | | - if bots == {}: |
145 | | - print 'Mission accomplished' |
146 | | - return |
147 | | - except Exception, error: |
148 | | - print error |
149 | | - if settings.DEBUG: |
150 | | - messages = utils.track_errors(xml_buffer, error, file, |
151 | | - messages) |
152 | | - |
153 | | - if settings.DEBUG: |
154 | | - utils.report_error_messages(messages, lookup_username) |
155 | | - |
156 | | - |
157 | | -def add_id_to_botnames(): |
158 | | - ''' |
159 | | - This is the worker function for the multi-process version of |
160 | | - lookup_username.First, the names of the bots are retrieved, then the |
161 | | - multiprocess is launched by makinga call to pc.build_scaffolding. This is a |
162 | | - generic launcher that takes as input the function to load the input_queue, |
163 | | - the function that will do the main work and the objects to be put in the |
164 | | - input_queue. The launcher also accepts optional keyword arguments. |
165 | | - ''' |
166 | | - cursor = create_bots_db(':memory') |
167 | | - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
168 | | - |
169 | | - botnames = retrieve_botnames_without_id(cursor, 'en') |
170 | | - bots = {} |
171 | | - for botname in botnames: |
172 | | - bots[botname[0]] = 1 |
173 | | - pc.build_scaffolding(pc.load_queue, lookup_username, files, bots=bots) |
174 | | - cursor.close() |
175 | | - |
176 | | - |
177 | | -def debug_lookup_username(): |
178 | | - ''' |
179 | | - This function launches the lookup_username function single-threaded, |
180 | | - which eases debugging. That is also the reason why the queue |
181 | | - parameters are set to None. Note that lookup_username is called with |
182 | | - debug=True here. |
183 | | - ''' |
184 | | - cursor = create_bots_db(':memory') |
185 | | - botnames = retrieve_botnames_without_id(cursor, 'en') |
186 | | - bots = {} |
187 | | - for botname in botnames: |
188 | | - bots[botname[0]] = 1 |
189 | | - |
190 | | - lookup_username('12.xml', None, None, bots, debug=True) |
191 | | - cursor.close() |
192 | | - |
193 | | - |
194 | | -if __name__ == '__main__': |
195 | | - #debug() |
196 | | - #add_id_to_botnames() |
197 | | - create_bot_ids_db_mongo() |
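create_bot_ids_db_mongo() essentially rebuilds the bots/ids collection from an {id: name} mapping. The sketch below shows the same idea written directly against the pymongo API of that era (Connection, remove, insert); it is not project code, and the host/port arguments are assumptions rather than values taken from this changeset.

import pymongo

def store_bot_ids(bot_ids, host='localhost', port=27017):
    '''Replace the bots/ids collection with the given {id: name} mapping.'''
    collection = pymongo.Connection(host, port)['bots']['ids']
    collection.remove({})               # start from an empty collection
    for id, name in bot_ids.iteritems():
        collection.insert({'id': id, 'name': name})
    return collection.count()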
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -1,254 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -from multiprocessing import Queue |
22 | | -from Queue import Empty |
23 | | -import datetime |
24 | | -from dateutil.relativedelta import * |
25 | | - |
26 | | -import progressbar |
27 | | - |
28 | | -import settings |
29 | | -from utils import models, utils |
30 | | -from database import db |
31 | | -from utils import process_constructor as pc |
32 | | - |
33 | | -try: |
34 | | - import psyco |
35 | | - psyco.full() |
36 | | -except ImportError: |
37 | | - pass |
38 | | - |
39 | | - |
40 | | -def retrieve_editor_ids_mongo(dbname, collection): |
41 | | - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
42 | | - 'editors.bin'): |
43 | | - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
44 | | - 'editors.bin') |
45 | | - else: |
46 | | - mongo = db.init_mongo_db(dbname) |
47 | | - editors = mongo[collection] |
48 | | - ids = editors.distinct('editor') |
49 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
50 | | - return ids |
51 | | - |
52 | | - |
53 | | -def expand_edits(edits): |
54 | | - data = [] |
55 | | - for edit in edits: |
56 | | - data.append(edit['date']) |
57 | | - return data |
58 | | - |
59 | | - |
60 | | -def expand_observations(obs, vars_to_expand): |
61 | | - for var in vars_to_expand: |
62 | | - if var == 'edits': |
63 | | - obs[var] = expand_edits(obs[var]) |
64 | | - elif var == 'edits_by_year': |
65 | | - keys = obs[var].keys() |
66 | | - keys.sort() |
67 | | - edits = [] |
68 | | - for key in keys: |
69 | | - edits.append(str(obs[var][key])) |
70 | | - obs[var] = edits |
71 | | - return obs |
72 | | - |
73 | | -def write_longitudinal_data(id, edits, fh): |
74 | | - years = edits.keys() |
75 | | - years.sort() |
76 | | - for year in years: |
77 | | - months = edits[year].keys() |
78 | | - months = [int(m) for m in months] |
79 | | - months.sort() |
80 | | - for m in months: |
81 | | - date = datetime.date(int(year), int(m), 1) |
82 | | - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
83 | | - |
84 | | - |
85 | | -def expand_headers(headers, vars_to_expand, obs): |
86 | | - for var in vars_to_expand: |
87 | | - l = len(obs[var]) |
88 | | - pos = headers.index(var) |
89 | | - for i in xrange(l): |
90 | | - if var.endswith('year'): |
91 | | - suffix = 2001 + i |
92 | | - elif var.endswith('edits'): |
93 | | - suffix = 1 + i |
94 | | - headers.insert(pos + i, '%s_%s' % (var, suffix)) |
95 | | - headers.remove(var) |
96 | | - return headers |
97 | | - |
98 | | - |
99 | | -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
100 | | - debug = kwargs.pop('debug') |
101 | | - dbname = kwargs.pop('dbname') |
102 | | - mongo = db.init_mongo_db(dbname) |
103 | | - editors = mongo['dataset'] |
104 | | - name = dbname + '_long_editors.csv' |
105 | | - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
106 | | - x = 0 |
107 | | - vars_to_expand = [] |
108 | | - while True: |
109 | | - try: |
110 | | - id = input_queue.get(block=False) |
111 | | - obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
112 | | - if x == 0: |
113 | | - headers = obs.keys() |
114 | | - headers.sort() |
115 | | - headers = expand_headers(headers, vars_to_expand, obs) |
116 | | - utils.write_list_to_csv(headers, fh) |
117 | | - write_longitudinal_data(id, obs['monthly_edits'], fh) |
118 | | - #utils.write_list_to_csv(data, fh) |
119 | | - x += 1 |
120 | | - except Empty: |
121 | | - break |
122 | | - |
123 | | - |
124 | | -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
125 | | - dbname = kwargs.get('dbname') |
126 | | - pbar = kwargs.get('pbar') |
127 | | - mongo = db.init_mongo_db(dbname) |
128 | | - editors = mongo['dataset'] |
129 | | - year = datetime.datetime.now().year + 1 |
130 | | - begin = year - 2001 |
131 | | - p = [3, 6, 9] |
132 | | - periods = [y * 12 for y in xrange(1, begin)] |
133 | | - periods = p + periods |
134 | | - data = {} |
135 | | - while True: |
136 | | - try: |
137 | | - id = input_queue.get(block=False) |
138 | | - obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
139 | | - first_edit = obs['first_edit'] |
140 | | - last_edit = obs['final_edit'] |
141 | | - for y in xrange(2001, year): |
142 | | - if y == 2010 and first_edit > datetime.datetime(2010, 1, 1): |
143 | | - print 'debug' |
144 | | - if y not in data: |
145 | | - data[y] = {} |
146 | | - data[y]['n'] = 0 |
147 | | - window_end = datetime.datetime(y, 12, 31) |
148 | | - if window_end > datetime.datetime.now(): |
149 | | - now = datetime.datetime.now() |
150 | | - m = now.month - 1 #Dump files are always lagging at least one month.... |
151 | | - d = now.day |
152 | | - window_end = datetime.datetime(y, m, d) |
153 | | - edits = [] |
154 | | - for period in periods: |
155 | | - if period not in data[y]: |
156 | | - data[y][period] = 0 |
157 | | - window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
158 | | - if window_start < datetime.datetime(2001, 1, 1): |
159 | | - window_start = datetime.datetime(2001, 1, 1) |
160 | | - if date_falls_in_window(window_start, window_end, first_edit, last_edit): |
161 | | - edits.append(period) |
162 | | - if edits != []: |
163 | | - p = min(edits) |
164 | | - data[y]['n'] += 1 |
165 | | - data[y][p] += 1 |
166 | | - #pbar.update(+1) |
167 | | - except Empty: |
168 | | - break |
169 | | - utils.store_object(data, settings.BINARY_OBJECT_FILE_LOCATION, 'cohort_data') |
170 | | - |
171 | | -def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
172 | | - if first_edit >= window_start and first_edit <= window_end: |
173 | | - return True |
174 | | - else: |
175 | | - return False |
176 | | - |
177 | | - |
178 | | -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
179 | | - dbname = kwargs.pop('dbname') |
180 | | - mongo = db.init_mongo_db(dbname) |
181 | | - editors = mongo['dataset'] |
182 | | - name = dbname + '_wide_editors.csv' |
183 | | - fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
184 | | - x = 0 |
185 | | - vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
186 | | - while True: |
187 | | - try: |
188 | | - if kwargs.get('debug', False): |
189 | | - id = u'99797' |
190 | | - else: |
191 | | - id = input_queue.get(block=False) |
192 | | - print input_queue.qsize() |
193 | | - obs = editors.find_one({'editor': id}) |
194 | | - obs = expand_observations(obs, vars_to_expand) |
195 | | - if x == 0: |
196 | | - headers = obs.keys() |
197 | | - headers.sort() |
198 | | - headers = expand_headers(headers, vars_to_expand, obs) |
199 | | - utils.write_list_to_csv(headers, fh) |
200 | | - data = [] |
201 | | - keys = obs.keys() |
202 | | - keys.sort() |
203 | | - for key in keys: |
204 | | - data.append(obs[key]) |
205 | | - utils.write_list_to_csv(data, fh) |
206 | | - |
207 | | - x += 1 |
208 | | - except Empty: |
209 | | - break |
210 | | - fh.close() |
211 | | - |
212 | | - |
213 | | -def retrieve_edits_by_contributor_launcher(): |
214 | | - pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors') |
215 | | - |
216 | | - |
217 | | -def debug_retrieve_edits_by_contributor_launcher(dbname): |
218 | | - kwargs = {'debug': False, |
219 | | - 'dbname': dbname, |
220 | | - } |
221 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
222 | | - input_queue = pc.load_queue(ids) |
223 | | - q = Queue() |
224 | | - generate_editor_dataset(input_queue, q, False, kwargs) |
225 | | - |
226 | | - |
227 | | -def generate_editor_dataset_launcher(dbname): |
228 | | - kwargs = {'nr_input_processors': 1, |
229 | | - 'nr_output_processors': 1, |
230 | | - 'debug': False, |
231 | | - 'dbname': dbname, |
232 | | - 'poison_pill':False, |
233 | | - 'pbar': True |
234 | | - } |
235 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
236 | | - ids = list(ids) |
237 | | - chunks = dict({0: ids}) |
238 | | - pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
239 | | - |
240 | | - |
241 | | -def generate_editor_dataset_debug(dbname): |
242 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
243 | | - input_queue = pc.load_queue(ids) |
244 | | - kwargs = {'nr_input_processors': 1, |
245 | | - 'nr_output_processors': 1, |
246 | | - 'debug': True, |
247 | | - 'dbname': dbname, |
248 | | - } |
249 | | - generate_editor_dataset(input_queue, False, False, kwargs) |
250 | | - |
251 | | - |
252 | | -if __name__ == '__main__': |
253 | | - #generate_editor_dataset_debug('test') |
254 | | - generate_editor_dataset_launcher('enwiki') |
255 | | - #debug_retrieve_edits_by_contributor_launcher() |
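The cohort test in generate_cohort_analysis() assigns an editor, for each calendar year, to the smallest experience window (in months) that still contains the editor's first edit. A simplified sketch follows; it is not project code and omits the clamping of window starts to 2001 and the correction for lagging dump files.

import datetime
from dateutil.relativedelta import relativedelta

def cohort_bucket(first_edit, year, periods=(3, 6, 9, 12, 24, 36, 48)):
    '''Return the smallest experience window (in months) containing first_edit.'''
    window_end = datetime.datetime(year, 12, 31)
    for period in sorted(periods):
        window_start = window_end - relativedelta(months=period)
        if window_start <= first_edit <= window_end:
            return period
    return None                         # editor was not yet active in this year

print cohort_bucket(datetime.datetime(2009, 6, 15), 2009)   # prints 9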
Index: trunk/tools/editor_trends/statistics/stata/cohort_charts.do |
— | — | @@ -0,0 +1,25 @@ |
| 2 | +label var months_3 "3 Months" |
| 3 | +label var months_6 "6 Months" |
| 4 | +label var months_9 "9 Months" |
| 5 | +label var months_12 "1 Year" |
| 6 | +label var months_24 "2 Years" |
| 7 | +label var months_36 "3 Years" |
| 8 | +label var months_48 "4 Years" |
| 9 | +label var months_60 "5 Years" |
| 10 | +label var months_72 "6 Years" |
| 11 | +label var months_84 "7 Years" |
| 12 | +label var months_96 "8 Years" |
| 13 | +label var months_108 "9 Years" |
| 14 | +generate one_year_exp = months_3+ months_6+ months_9+ months_12 |
| 15 | + |
| 16 | +generate fewer_one_year_abs = (one_year_exp/100) * n |
| 17 | +generate more_one_year_abs = n - fewer_one_year_abs |
| 18 | +label var fewer_one_year_abs "Editors with less than one year experience" |
| 19 | +label var more_one_year_abs "Editors with more than one year experience" |
| 20 | + |
| 21 | +graph bar (asis) months_3 months_6 months_9 months_12 months_24 months_36 months_48 months_60 months_72 months_84 months_96 months_108, over(year, label(labsize(small))) stack ylabel(, labsize(vsmall) format(%9.0g)) title(Wikipedia Age Composition by Year) subtitle(Editors are getting older and influx of new editors has stagnated) note("Based on English Wikipedia, 345.000 editors." "An editor is a person who has made at least 10 edits in the main namespace.", size(tiny)) legend(nocolfirst rowgap(tiny) colgap(tiny) size(vsmall)) |
| 22 | + |
| 23 | +twoway (line one_year_exp year), ytitle(%) ytitle(, size(vsmall)) xtitle() xlabel(2001(1)2010, labsize(vsmall)) title(Percentage of Wikipedia editors with 1 year experience) note("Based on the English Wikipedia, dataset 345.000 editors.", size(vsmall)) |
| 24 | + |
| 25 | + |
| 26 | +graph bar (asis) fewer_one_year_abs more_one_year_abs, over(year, label(labsize(vsmall))) stack blabel(bar, size(tiny) position(inside) format(%9.0f)) ylabel(, labsize(vsmall) format(%9.0g)) title(Editors with one year vs multiple years of experience) legend(colfirst cols(1)) |
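cohort_charts.do expects one observation per year with a cohort size n and months_<k> columns holding percentages. The export step that produces that sheet is not part of this changeset; a purely hypothetical converter from the cohort_data object written by generate_cohort_analysis() might look like:

def cohort_counts_to_percentages(data):
    '''Turn {year: {'n': total, period: count, ...}} into rows of percentages.'''
    rows = []
    for year in sorted(data):
        n = data[year]['n']
        row = {'year': year, 'n': n}
        for period, count in data[year].items():
            if period == 'n':
                continue
            row['months_%s' % period] = 100.0 * count / n if n else 0.0
        rows.append(row)
    return rows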
Index: trunk/tools/editor_trends/statistics/stata/wiki.do |
— | — | @@ -1,4 +1,5 @@ |
2 | | -local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit" |
| 2 | +insheet using "C:\Users\diederik.vanliere\workspace\editor_trends\datasets\enwiki_long_editors.csv" |
| 3 | +local first_ten "edits_1 edits_2 edits_3 edits_4 edits_5 edits_6 edits_7 edits_8 edits_9 edits_10 final_edit first_edit" |
3 | 4 | |
4 | 5 | foreach edit of local first_ten { |
5 | 6 | gen date2 = date(`edit', "YMDhms") |
— | — | @@ -8,6 +9,7 @@ |
9 | 10 | } |
10 | 11 | |
11 | 12 | generate year_left = year(final_edit) |
| 13 | +generate year_joined = year(first_edit) |
12 | 14 | sort year_joined |
13 | 15 | by year_joined: gen community_size_t = _N |
14 | 16 | |
— | — | @@ -23,12 +25,6 @@ |
24 | 26 | gen retention200`t' = community_size_200`t1' / community_size_200`t' |
25 | 27 | } |
26 | 28 | |
27 | | - |
28 | | - |
29 | | - |
30 | | - |
31 | | - |
32 | | - |
33 | 29 | generate time_to_new_wp = edits_10 - edits_1 |
34 | 30 | generate active_time_wp = final_edit - edits_10 |
35 | 31 | label time_to_new_wp "Number of days it took to become a new wikipedian" |
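The retention200* series generated above is the ratio of consecutive community_size_200* values, i.e. the size of the year t+1 cohort relative to the year t cohort. In Python terms (a sketch, not project code):

def retention_ratios(cohort_size):
    '''cohort_size maps a join year to the number of editors who joined that year.'''
    years = sorted(cohort_size)
    return dict((years[i], float(cohort_size[years[i + 1]]) / cohort_size[years[i]])
                for i in range(len(years) - 1))

print retention_ratios({2004: 1000, 2005: 1400, 2006: 1330})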
Property changes on: trunk/tools/editor_trends/datasets |
___________________________________________________________________ |
Added: svn:ignore |
36 | 32 | + cohort_data.txt |
cohorts.dta |
difference observations erik vs diederik.ods |
difference observations erik vs diederik.xlsx |
editors.dta |
enwiki_editors.csv |
enwiki_long_editors.csv |
enwiki_wide_editors.csv |
Property changes on: trunk/tools/editor_trends/documentation |
___________________________________________________________________ |
Added: svn:ignore |
37 | 33 | + language_codes.xlsx |
Property changes on: trunk/tools/editor_trends/errors |
___________________________________________________________________ |
Modified: svn:ignore |
38 | 34 | - *.bin |
39 | 35 | + *.bin |
split_xml |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Added: svn:default-eol-style |
40 | 36 | + native |