Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -0,0 +1,120 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-02' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import datetime |
| 22 | +from operator import itemgetter |
| 23 | +from Queue import Empty |
| 24 | +import settings |
| 25 | +import construct_datasets |
| 26 | +from database import db |
| 27 | +from utils import process_constructor as pc |
| 28 | +def create_datacontainer(init_value=0): |
| 29 | +    ''' |
| 30 | +    This function initializes a dictionary with a key for each year (2001 |
| 31 | +    through the current year) and @init_value as value. In most cases this |
| 32 | +    will be zero, so the dictionary acts as a running tally for a variable, |
| 33 | +    but @init_value can also be a list, [], a dictionary, {}, or a set, set(). |
| 34 | +    ''' |
| 35 | +    data = {} |
| 36 | +    year = datetime.datetime.now().year + 1 |
| 37 | +    for x in xrange(2001, year): |
| 38 | +        data[str(x)] = type(init_value)()  # fresh instance per year, so container values are not shared |
| 39 | +    return data |
| 40 | + |
| 41 | + |
| 42 | +def determine_edits_by_year(dates): |
| 43 | +    ''' |
| 44 | +    This function counts the number of edits by year made by a particular editor. |
| 45 | +    ''' |
| 46 | +    edits = create_datacontainer() |
| 47 | +    for date in dates: |
| 48 | +        year = str(date['date'].year) |
| 49 | +        edits[year] += 1 |
| 50 | +    return edits |
| 51 | + |
| 52 | + |
| 53 | +def determine_articles_by_year(dates): |
| 54 | +    ''' |
| 55 | +    This function counts the number of unique articles by year edited by a |
| 56 | +    particular editor. |
| 57 | +    ''' |
| 58 | +    articles = create_datacontainer(set()) |
| 59 | +    for date in dates: |
| 60 | +        year = str(date['date'].year) |
| 61 | +        articles[year].add(date['article']) |
| 62 | +    for year in articles: |
| 63 | +        articles[year] = len(articles[year])  # number of unique articles, not the length of the key |
| 64 | +    return articles |
| 65 | + |
| 66 | + |
| 67 | +def optimize_editors(input_queue, result_queue, pbar, kwargs): |
| 68 | +    dbname = kwargs.pop('dbname') |
| 69 | +    mongo = db.init_mongo_db(dbname) |
| 70 | +    input = mongo['editors'] |
| 71 | +    output = mongo['dataset'] |
| 72 | +    output.ensure_index('editor') |
| 73 | +    output.ensure_index('year_joined') |
| 74 | +    definition = kwargs.pop('definition')  # not used yet |
| 75 | +    while True: |
| 76 | +        try: |
| 77 | +            id = input_queue.get(block=False) |
| 78 | +            editor = input.find_one({'editor': id}) |
| 79 | +            edits = editor['edits'] |
| 80 | +            edits = sorted(edits, key=itemgetter('date')) |
| 81 | +            edit_count = len(edits) |
| 82 | +            new_wikipedian = edits[9]['date'].year  # year of the 10th edit; assumes at least 10 edits |
| 83 | +            first_edit = edits[0]['date'] |
| 84 | +            final_edit = edits[-1]['date'] |
| 85 | +            edits_by_year = determine_edits_by_year(edits) |
| 86 | +            articles_by_year = determine_articles_by_year(edits) |
| 87 | +            edits = edits[:10] |
| 88 | + |
| 89 | +            output.insert({'editor': id, 'edits': edits, |
| 90 | +                           'edits_by_year': edits_by_year, |
| 91 | +                           'year_joined': new_wikipedian, |
| 92 | +                           'edit_count': edit_count, |
| 93 | +                           'final_edit': final_edit, |
| 94 | +                           'first_edit': first_edit, |
| 95 | +                           'articles_by_year': articles_by_year}) |
| 96 | +            print 'Items left: %s' % input_queue.qsize() |
| 97 | +        except Empty: |
| 98 | +            break |
| 99 | + |
| 100 | +def run_optimize_editors(dbname): |
| 101 | +    ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 102 | +    kwargs = {'definition': 'traditional', |
| 103 | +              'pbar': True, |
| 104 | +              'dbname': dbname, |
| 105 | +              'nr_input_processors': 2, |
| 106 | +              'nr_output_processors': 0, |
| 107 | +              } |
| 108 | +    pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs) |
| 109 | + |
| 110 | + |
| 111 | +def debug_optimize_editors(dbname): |
| 112 | +    ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 113 | +    q = pc.load_queue(ids) |
| 114 | +    kwargs = {'definition': 'traditional', |
| 115 | +              'dbname': dbname |
| 116 | +              } |
| 117 | +    optimize_editors(q, False, True, kwargs) |
| 118 | + |
| 119 | + |
| 120 | +if __name__ == '__main__': |
| 121 | +    run_optimize_editors('enwiki') |
\ No newline at end of file |
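A minimal usage sketch for the two per-year aggregation helpers above, with made-up edit records rather than data from an actual dump; it assumes optimize_editors.py is importable from the working directory.

    import datetime
    import optimize_editors as oe

    edits = [{'date': datetime.datetime(2004, 3, 1), 'article': 'Amsterdam'},
             {'date': datetime.datetime(2004, 3, 5), 'article': 'Amsterdam'},
             {'date': datetime.datetime(2005, 7, 9), 'article': 'Rotterdam'}]

    print oe.determine_edits_by_year(edits)     # {..., '2004': 2, '2005': 1, ...}
    print oe.determine_articles_by_year(edits)  # {..., '2004': 1, '2005': 1, ...}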
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -21,11 +21,14 @@ |
22 | 22 | import sys |
23 | 23 | import os |
24 | 24 | import time |
| 25 | +import datetime |
25 | 26 | import codecs |
| 27 | +import math |
26 | 28 | import cStringIO |
27 | 29 | import re |
| 30 | +from operator import itemgetter |
28 | 31 | import xml.etree.cElementTree as cElementTree |
29 | | -from multiprocessing import Queue |
| 32 | +from multiprocessing import Queue, JoinableQueue |
30 | 33 | from Queue import Empty |
31 | 34 | import pymongo |
32 | 35 | |
— | — | @@ -34,6 +37,7 @@ |
35 | 38 | from utils import utils, models |
36 | 39 | from database import db_settings |
37 | 40 | from database import db |
| 41 | +from database import cache |
38 | 42 | from wikitree import xml |
39 | 43 | from statistics import dataset |
40 | 44 | from utils import process_constructor as pc |
— | — | @@ -45,13 +49,15 @@ |
46 | 50 | except ImportError: |
47 | 51 | pass |
48 | 52 | |
49 | | -#contributors = {} |
50 | 53 | |
51 | | -RE_BOT = re.compile('bot', re.IGNORECASE) |
52 | | -RE_SCRIPT = re.compile('script', re.IGNORECASE) |
| 54 | +def determine_username_is_bot(username, kwargs): |
| 55 | + ''' |
| 56 | + @username is the xml element containing the id of the user |
| 57 | + @kwargs should have a list with all the bot ids |
53 | 58 | |
54 | | - |
55 | | -def determine_username_is_bot(username, kwargs): |
| 59 | + @Return: False if the username id is not in the list of bot ids, True if |
| 60 | + it is a bot id. |
| 61 | + ''' |
56 | 62 | ids = kwargs.get('bots', []) |
57 | 63 | if ids == None: |
58 | 64 | ids = [] |
— | — | @@ -66,14 +72,14 @@ |
67 | 73 | def extract_contributor_id(contributor, kwargs): |
68 | 74 | ''' |
69 | 75 | @contributor is the xml contributor node containing a number of attributes |
70 | | - |
| 76 | + |
71 | 77 | Currently, we are only interested in registered contributors, hence we |
72 | 78 | ignore anonymous editors. If you are interested in collecting data on |
73 | 79 | anonymous editors then add the string 'ip' to the tags variable. |
74 | 80 | ''' |
75 | 81 | tags = ['id'] |
76 | 82 | if contributor.get('deleted'): |
77 | | - return - 1 #Not sure if this is the best way to code deleted contributors. |
| 83 | + return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
78 | 84 | for elem in contributor: |
79 | 85 | if elem.tag in tags: |
80 | 86 | if elem.text != None: |
— | — | @@ -83,6 +89,14 @@ |
84 | 90 | |
85 | 91 | |
86 | 92 | def output_editor_information(elem, data_queue, **kwargs): |
| 93 | + ''' |
| 94 | + @elem is an XML element containing 1 revision from a page |
| 95 | + @data_queue is where to store the data |
| 96 | + @**kwargs contains extra information |
| 97 | + |
| 98 | + The tags variable determines which attributes are parsed; the values in |
| 99 | + this dictionary are the functions used to extract the data. |
| 100 | + ''' |
87 | 101 | tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot}, |
88 | 102 | 'timestamp': {'date': xml.extract_text}, |
89 | 103 | } |
— | — | @@ -104,10 +118,24 @@ |
105 | 119 | data_queue.put(vars) |
106 | 120 | vars = {} |
107 | 121 | |
108 | | -def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
| 122 | + |
| 123 | +def parse_editors(xml_queue, data_queue, pbar, bots, **kwargs): |
| 124 | + ''' |
| 125 | + @xml_queue contains the filenames of the files to be parsed |
| 126 | + @data_queue is an instance of Queue where the extracted data is stored for |
| 127 | + further processing |
| 128 | + @pbar is an instance of progressbar to display the progress |
| 129 | + @bots is a list of id's of known Wikipedia bots |
| 130 | + @**kwargs may contain 'debug' (parse a single file, for debugging) and 'language' (language code of the dump, defaults to 'en'). |
| 131 | + |
| 132 | + Output is the data_queue that will be used by store_editors() |
| 133 | + ''' |
| 134 | + file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en')) |
| 135 | + debug = kwargs.get('debug', None) |
109 | 136 | if settings.DEBUG: |
110 | 137 | messages = {} |
111 | 138 | vars = {} |
| 139 | + |
112 | 140 | while True: |
113 | 141 | try: |
114 | 142 | if debug: |
— | — | @@ -117,12 +145,13 @@ |
118 | 146 | if file == None: |
119 | 147 | print 'Swallowed a poison pill' |
120 | 148 | break |
121 | | - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION, |
| 149 | + data = xml.read_input(utils.create_txt_filehandle(file_location, |
122 | 150 | file, 'r', |
123 | 151 | encoding=settings.ENCODING)) |
124 | 152 | for raw_data in data: |
125 | 153 | xml_buffer = cStringIO.StringIO() |
126 | 154 | raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 155 | + |
127 | 156 | try: |
128 | 157 | raw_data = ''.join(raw_data) |
129 | 158 | xml_buffer.write(raw_data) |
— | — | @@ -144,142 +173,122 @@ |
145 | 174 | if settings.DEBUG: |
146 | 175 | utils.track_errors(xml_buffer, error, file, messages) |
147 | 176 | except MemoryError, error: |
148 | | - ''' |
149 | | - There is one xml file causing an out of memory file, not |
150 | | - sure which one yet. This happens when raw_data = |
151 | | - ''.join(raw_data) is called. 18-22 |
152 | | - ''' |
153 | 177 | print file, error |
154 | 178 | print raw_data[:12] |
155 | 179 | print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
156 | | - if settings.DEBUG: |
157 | | - utils.track_errors(xml_buffer, error, file, messages) |
158 | 180 | |
| 181 | + data_queue.put('NEXT') |
159 | 182 | if pbar: |
160 | | - #print xml_queue.qsize() |
161 | | - utils.update_progressbar(pbar, xml_queue) |
| 183 | + print file, xml_queue.qsize(), data_queue.qsize() |
| 184 | + #utils.update_progressbar(pbar, xml_queue) |
162 | 185 | if debug: |
163 | 186 | break |
164 | 187 | |
| 188 | + while True: |
| 189 | + if data_queue.qsize() < 100000: |
| 190 | + break |
| 191 | + else: |
| 192 | + time.sleep(10) |
| 193 | + print 'Still sleeping, queue is %s items long' % data_queue.qsize() |
| 194 | + |
165 | 195 | except Empty: |
166 | 196 | break |
167 | 197 | |
| 198 | + #for x in xrange(4): |
| 199 | + data_queue.put(None) |
| 200 | + |
168 | 201 | if settings.DEBUG: |
169 | | - utils.report_error_messages(messages, lookup_new_editors) |
| 202 | + utils.report_error_messages(messages, parse_editors) |
170 | 203 | |
171 | 204 | |
172 | 205 | def store_editors(data_queue, pids, dbname): |
| 206 | + ''' |
| 207 | + @data_queue is an instance of Queue containing information extracted by |
| 208 | + parse_editors() |
| 209 | + @pids is a list of PIDs used to check if other processes are finished |
| 210 | + running |
| 211 | + @dbname is the name of the MongoDB collection where to store the information. |
| 212 | + ''' |
173 | 213 | mongo = db.init_mongo_db(dbname) |
174 | 214 | collection = mongo['editors'] |
175 | 215 | mongo.collection.ensure_index('editor') |
| 216 | + editor_cache = cache.EditorCache(collection) |
176 | 217 | while True: |
177 | 218 | try: |
178 | 219 | edit = data_queue.get(block=False) |
179 | | - contributor = edit['editor'] |
180 | | - value = {'date':edit['date'], 'article': edit['article']} |
181 | | - collection.update({'editor': contributor}, {'$inc': {'edit_count': 1}, |
182 | | - '$push': {'edits': value}}, True) |
| 220 | + data_queue.task_done() |
| 221 | + if edit == None: |
| 222 | + print 'Swallowing poison pill' |
| 223 | + break |
| 224 | + elif edit == 'NEXT': |
| 225 | + editor_cache.add('NEXT', '') |
| 226 | + else: |
| 227 | + contributor = edit['editor'] |
| 228 | + value = {'date': edit['date'], 'article': edit['article']} |
| 229 | + editor_cache.add(contributor, value) |
| 230 | + #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
| 231 | + #'$inc': {'edit_count': 1}, |
| 232 | + |
183 | 233 | except Empty: |
184 | 234 | ''' |
185 | 235 | This checks whether the Queue is empty because the preprocessors are |
186 | 236 | finished or because this function is faster in emptying the Queue |
187 | | - then the preprocessors are able to fill it. If this preprocessors |
| 237 | + than the preprocessors are able to fill it. If the preprocessors |
188 | 238 | are finished and this Queue is empty than break, else wait for the |
189 | 239 | Queue to fill. |
190 | 240 | ''' |
191 | | - if all([utils.check_if_process_is_running(pid) for pid in pids]): |
192 | | - pass |
193 | | - #print 'Empty queue or not %s?' % data_queue.qsize() |
194 | | - else: |
195 | | - break |
| 241 | + pass |
196 | 242 | |
| 243 | + print 'Emptying entire cache.' |
| 244 | + editor_cache.store() |
| 245 | + print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
197 | 246 | |
198 | | -def optimize_editors(dbname, input_queue, **kwargs): |
199 | | - mongo = db.init_mongo_db(dbname) |
200 | | - collection = mongo['editors'] |
201 | | - definition = kwargs.pop('definition') |
202 | | - while True: |
203 | | - try: |
204 | | - id = input_queue.get(block=False) |
205 | | - #id = '94033' |
206 | | - editor = collection.find_one({'editor': id}) |
207 | | - edits = editor['edits'] |
208 | | - edits.sort() |
209 | | - year = edits[0]['date'].year |
210 | | - new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, defintion) |
211 | | - collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}}) |
212 | | - |
213 | | - except Empty: |
214 | | - break |
215 | 247 | |
| 248 | +def load_bot_ids(): |
| 249 | + ''' |
| 250 | + Loader function to retrieve list of id's of known Wikipedia bots. |
| 251 | + ''' |
| 252 | + ids = {} |
| 253 | + mongo = db.init_mongo_db('bots') |
| 254 | + bots = mongo['ids'] |
| 255 | + cursor = bots.find() |
| 256 | + for bot in cursor: |
| 257 | + ids[bot['id']] = bot['name'] |
| 258 | + return ids |
216 | 259 | |
217 | | -def store_data_db(data_queue, pids): |
218 | | - connection = db.init_database() |
219 | | - cursor = connection.cursor() |
220 | | - db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE) |
221 | 260 | |
222 | | - empty = 0 |
223 | | - |
224 | | - values = [] |
225 | | - while True: |
226 | | - try: |
227 | | - chunk = data_queue.get(block=False) |
228 | | - contributor = chunk['contributor'].encode(settings.ENCODING) |
229 | | - article = chunk['article'] |
230 | | - timestamp = chunk['timestamp'].encode(settings.ENCODING) |
231 | | - bot = chunk['bot'] |
232 | | - values.append((contributor, article, timestamp, bot)) |
233 | | - |
234 | | - if len(values) == 50000: |
235 | | - cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values) |
236 | | - connection.commit() |
237 | | - #print 'Size of queue: %s' % data_queue.qsize() |
238 | | - values = [] |
239 | | - |
240 | | - except Empty: |
241 | | - if all([utils.check_if_process_is_running(pid) for pid in pids]): |
242 | | - pass |
243 | | - else: |
244 | | - break |
245 | | - connection.close() |
246 | | - |
247 | | - |
248 | | -def run_stand_alone(dbname): |
249 | | - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
250 | | - #files = files[:2] |
| 261 | +def run_parse_editors(dbname, language): |
| 262 | + ids = load_bot_ids() |
251 | 263 | kwargs = {'bots': ids, |
252 | 264 | 'dbname': dbname, |
253 | 265 | 'pbar': True, |
254 | | - 'definition': 'traditional'} |
| 266 | + 'nr_input_processors': 1, |
| 267 | + 'nr_output_processors': 1, |
| 268 | + 'language': language, |
| 269 | + } |
| 270 | + chunks = {} |
| 271 | + file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 272 | + files = utils.retrieve_file_list(file_location, 'xml') |
| 273 | + parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) |
| 274 | + a = 0 |
| 275 | + for x in xrange(settings.NUMBER_OF_PROCESSES): |
| 276 | + b = a + parts |
| 277 | + chunks[x] = files[a:b] |
| 278 | + a = (x + 1) * parts |
255 | 279 | |
256 | | - mongo = db.init_mongo_db('bots') |
257 | | - bots = mongo['ids'] |
258 | | - ids = {} |
259 | | - cursor = bots.find() |
260 | | - for bot in cursor: |
261 | | - ids[bot['id']] = bot['name'] |
262 | | - |
263 | | - pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs) |
264 | | - ids = retrieve_ids_mongo_new(dbname, 'editors') |
265 | | - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs) |
266 | 280 | |
267 | | -def debug_lookup_new_editors(): |
268 | | - q = Queue() |
269 | | - import progressbar |
270 | | - pbar = progressbar.ProgressBar().start() |
| 281 | + for x in xrange(settings.NUMBER_OF_PROCESSES): |
| 282 | + pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs) |
| 283 | + |
| 284 | + |
| 285 | +def debug_parse_editors(dbname): |
| 286 | + q = JoinableQueue() |
271 | 287 | #edits = db.init_mongo_db('editors') |
272 | | - parse_editors('464.xml', q, None, None, True) |
273 | | - store_data_mongo(q, [], 'test') |
274 | | - #keys = ['editor'] |
275 | | - #for key in keys: |
276 | | - # db.add_index_to_collection('editors', 'editors', key) |
| 288 | + parse_editors('en\\522.xml', q, None, None, True) |
| 289 | + store_editors(q, [], dbname) |
277 | 290 | |
| 291 | + |
278 | 292 | if __name__ == "__main__": |
279 | | - #optimize_editors('enwiki') |
280 | | - #debug_lookup_new_editors() |
281 | | - |
282 | | - if settings.RUN_MODE == 'stand_alone': |
283 | | - run_stand_alone() |
284 | | - print 'Finished processing XML files.' |
285 | | - else: |
286 | | - run_hadoop() |
| 293 | + #debug_parse_editors('test') |
| 294 | + run_parse_editors('test', 'en') |
| 295 | + pass |
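parse_editors() and store_editors() talk to each other through the queue with two sentinel values: a 'NEXT' marker after every finished chunk file and a None poison pill when the parser is done. A self-contained toy sketch of that protocol, simplified to a single process and a plain multiprocessing.Queue:

    from multiprocessing import Queue

    def toy_parser(q):
        for edit in [{'editor': '42', 'article': 'Amsterdam'}]:
            q.put(edit)
        q.put('NEXT')   # flush marker, one per finished file
        q.put(None)     # poison pill, tells the storer to stop

    def toy_storer(q):
        while True:
            edit = q.get()
            if edit is None:
                print 'Swallowing poison pill'
                break
            elif edit == 'NEXT':
                print 'flush the editor cache'
            else:
                print 'store', edit

    q = Queue()
    toy_parser(q)
    toy_storer(q)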
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -30,21 +30,29 @@ |
31 | 31 | |
32 | 32 | #Setting up the environment |
33 | 33 | ops = {platform.win32_ver: 'Windows', |
34 | | - platform.linux_distribution: 'Linux', |
35 | | - platform.mac_ver: 'OSX'} |
| 34 | + platform.linux_distribution: 'Linux', |
| 35 | + platform.mac_ver: 'OSX'} |
| 36 | + |
36 | 37 | for op in ops: |
37 | 38 | if op() != ('', '', '') and op() != ('', ('', '', ''), ''): |
38 | 39 | OS = ops[op] |
39 | 40 | |
40 | | -WORKING_DIRECTORY = os.getcwd()#[:-9] |
| 41 | +WORKING_DIRECTORY = os.getcwd() |
41 | 42 | IGNORE_DIRS = ['wikistats', 'zips'] |
| 43 | +ROOT = '/' if OS != 'Windows' else 'c:\\' |
42 | 44 | |
43 | | -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
| 45 | + |
| 46 | +dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
| 47 | + os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
44 | 48 | for subdirname in dirs: |
45 | 49 | if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
46 | 50 | sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
47 | 51 | |
| 52 | +WINDOWS_ZIP = ['7z.exe'] |
48 | 53 | |
| 54 | +OSX_ZIP = [] |
| 55 | + |
| 56 | +LINUX_ZIP = [] |
49 | 57 | #General settings |
50 | 58 | |
51 | 59 | # Valid values are 'stand-alone' and 'hadoop' |
— | — | @@ -65,22 +73,23 @@ |
66 | 74 | #This section contains configuration variables for the different file locations. |
67 | 75 | |
68 | 76 | # Location where to write xml chunks |
69 | | -XML_FILE_LOCATION = 'C:/wikimedia/' |
| 77 | +XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia') |
70 | 78 | |
71 | 79 | # Input file |
72 | | -XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml' |
| 80 | +XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml') |
73 | 81 | |
74 | 82 | # This is the place where error messages are stored for debugging purposes |
75 | | -ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/' |
| 83 | +ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors') |
76 | 84 | |
77 | | -DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/' |
| 85 | +DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database') |
78 | 86 | |
79 | | -BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/' |
| 87 | +BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects') |
80 | 88 | |
81 | | -DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/' |
| 89 | +DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets') |
82 | 90 | |
83 | | -TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/' |
| 91 | +TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv') |
84 | 92 | |
| 93 | +NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces') |
85 | 94 | #This section contains configuration variables for parsing / encoding and |
86 | 95 | #working with the XML files. |
87 | 96 | |
— | — | @@ -92,12 +101,32 @@ |
93 | 102 | # Name space, do not change as this works for Mediawiki wikis |
94 | 103 | NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
95 | 104 | |
| 105 | + |
| 106 | +WIKIMEDIA_PROJECTS = {'commons': 'commonswiki', |
| 107 | + 'wikibooks': 'wikibooks', |
| 108 | + 'wikinews': 'wikinews', |
| 109 | + 'wikiquote': 'wikiquote', |
| 110 | + 'wikisource': 'wikisource', |
| 111 | + 'wikiversity': 'wikiversity', |
| 112 | + 'wiktionary': 'wiktionary', |
| 113 | + 'metawiki': 'metawiki', |
| 114 | + 'wikispecies': 'specieswiki', |
| 115 | + 'incubator': 'incubatorwiki', |
| 116 | + 'foundation': 'foundationwiki', |
| 117 | + 'mediawiki': 'mediawikiwiki', |
| 118 | + 'outreach': 'outreachwiki', |
| 119 | + 'strategic planning': 'strategywiki', |
| 120 | + 'usability initiative': 'usabilitywiki', |
| 121 | + 'multilingual wikisource': None |
| 122 | + } |
| 123 | + |
96 | 124 | #Multiprocess settings used to parallelize workload |
97 | 125 | #Change this to match your computers configuration (RAM / CPU) |
98 | 126 | NUMBER_OF_PROCESSES = cpu_count() * 1 |
99 | 127 | |
100 | | -#Extensions of ascii files, this is used to determine the filemode to use |
| 128 | +#Extensions of ascii files, this is used to determine the filemode to use |
101 | 129 | ASCII = ['txt', 'csv', 'xml', 'sql'] |
102 | 130 | |
103 | 131 | WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
104 | 132 | |
| 133 | +MAX_CACHE_SIZE = 1024 * 1024 |
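The path settings now use os.path.join instead of string concatenation, so the same settings file works on Windows and Linux without hard-coded separators. A small illustration (the Windows ROOT value mirrors the snippet above):

    import os

    ROOT = 'c:\\'                          # on Windows; '/' on Linux / OSX
    print os.path.join(ROOT, 'wikimedia')  # c:\wikimedia on Windows
    print os.path.join('/', 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
    # -> /Source_Files/enwiki-20100916-stub-meta-history.xml on Linux / OSX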
Index: trunk/tools/editor_trends/utils/namespace_downloader.py |
— | — | @@ -0,0 +1,43 @@ |
| 2 | + |
| 3 | + |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = 'Oct 27, 2010' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import languages |
| 22 | +import dump_downloader as dd |
| 23 | +import settings |
| 24 | + |
| 25 | +PATH = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json' |
| 26 | +LOCATION = settings.NAMESPACE_LOCATION |
| 27 | + |
| 28 | +def retrieve_json_namespace(): |
| 29 | +    visited = set() |
| 30 | +    for language in languages.MAPPING: |
| 31 | +        language = languages.MAPPING[language] |
| 32 | +        filename = '%s_ns.json' % language |
| 33 | +        if language not in visited: |
| 34 | +            domain = 'http://%s.wikipedia.org' % language |
| 35 | +            dd.download_wiki_file(domain, PATH, filename, LOCATION, 'w', True) |
| 36 | +            visited.add(language) |
| 37 | + |
| 38 | + |
| 39 | +def launch_downloader(): |
| 40 | +    retrieve_json_namespace() |
| 41 | + |
| 42 | + |
| 43 | +if __name__ == '__main__': |
| 44 | +    launch_downloader() |
\ No newline at end of file |
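split_xml_file.py only reads the 'canonical' names under query/namespaces from the JSON files this downloader stores. A rough, abbreviated illustration of that shape (a made-up subset, not a complete siteinfo response):

    response = {'query': {'namespaces': {
        '0': {'id': 0, '*': ''},                          # main (article) namespace
        '1': {'id': 1, 'canonical': 'Talk', '*': 'Talk'},
        '2': {'id': 2, 'canonical': 'User', '*': 'User'},
    }}}
    namespaces = response['query']['namespaces']
    print [namespaces[key].get('canonical') for key in namespaces]
    # [None, 'Talk', 'User'] (order depends on the dict)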
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -65,7 +65,7 @@ |
66 | 66 | return False |
67 | 67 | else: |
68 | 68 | os.kill(pid, 0) |
69 | | - return Tru |
| 69 | + return True |
70 | 70 | except Exception, error: |
71 | 71 | print error |
72 | 72 | return False |
— | — | @@ -132,7 +132,7 @@ |
133 | 133 | # read / write data related functions |
134 | 134 | def read_data_from_csv(filename, encoding): |
135 | 135 | if hasattr(filename, '__call__'): |
136 | | - filename = construct_filename_from_function(filename) |
| 136 | + filename = construct_filename(filename) |
137 | 137 | |
138 | 138 | fh = open_txt_file(filename, 'r', encoding=encoding) |
139 | 139 | for line in fh: |
— | — | @@ -140,13 +140,15 @@ |
141 | 141 | |
142 | 142 | fh.close() |
143 | 143 | |
144 | | -def create_directory(language): |
| 144 | + |
| 145 | +def create_directory(path): |
145 | 146 | try: |
146 | | - os.mkdir(settings.WORKING_DIRECTORY + '/' + language) |
| 147 | + os.mkdir(path) |
147 | 148 | return True |
148 | | - except IOERROR: |
| 149 | + except IOError: |
149 | 150 | return False |
150 | 151 | |
| 152 | + |
151 | 153 | def determine_file_extension(filename): |
152 | 154 | pos = filename.rfind('.') + 1 |
153 | 155 | return filename[pos:] |
— | — | @@ -158,10 +160,18 @@ |
159 | 161 | else: |
160 | 162 | return 'wb' |
161 | 163 | |
162 | | - |
163 | | -def write_data_to_csv(data, location, function, encoding): |
164 | | - filename = construct_filename_from_function(function, '.csv') |
165 | | - fh = open_txt_file(location, filename, 'a', encoding=encoding) |
| 164 | +def write_list_to_csv(data, fh, recursive=False): |
| 165 | + if recursive: |
| 166 | + recursive = False |
| 167 | + for d in data: |
| 168 | + if type(d) == type([]): |
| 169 | + recursive = write_list_to_csv(d, fh, True) |
| 170 | + else: |
| 171 | + fh.write('%s\t' % d) |
| 172 | + if recursive: |
| 173 | + return True |
| 174 | + |
| 175 | +def write_dict_to_csv(data, fh): |
166 | 176 | keys = data.keys() |
167 | 177 | for key in keys: |
168 | 178 | fh.write('%s' % key) |
— | — | @@ -172,45 +182,68 @@ |
173 | 183 | else: |
174 | 184 | fh.write('\t%s' % (obs)) |
175 | 185 | fh.write('\n') |
176 | | - fh.close() |
177 | 186 | |
178 | 187 | |
179 | | -def open_txt_file(location, filename, mode, encoding): |
180 | | - return codecs.open(location + filename, mode, encoding=encoding) |
| 188 | +def create_txt_filehandle(location, name, mode, encoding): |
| 189 | + filename = construct_filename(name, '.csv') |
| 190 | + path = os.path.join(location, filename) |
| 191 | + return codecs.open(path, mode, encoding=encoding) |
181 | 192 | |
182 | 193 | |
183 | | -def open_binary_file(location, filename, mode): |
184 | | - return open(location + filename, mode) |
| 194 | +def create_binary_filehandle(location, filename, mode): |
| 195 | + path = os.path.join(location, filename) |
| 196 | + return open(path, mode) |
185 | 197 | |
186 | | -def construct_filename_from_function(function, extension): |
187 | | - return function.func_name + extension |
188 | 198 | |
| 199 | +def construct_filename(name, extension): |
| 200 | + if hasattr(name, '__call__'): |
| 201 | + return name.func_name + extension |
| 202 | + else: |
| 203 | + return name |
189 | 204 | |
| 205 | + |
190 | 206 | def check_file_exists(location, filename): |
191 | 207 | if hasattr(filename, '__call__'): |
192 | | - filename = construct_filename_from_function(filename, '.bin') |
193 | | - if os.path.exists(location + filename): |
| 208 | + filename = construct_filename(filename, '.bin') |
| 209 | + if os.path.exists(os.path.join(location, filename)): |
194 | 210 | return True |
195 | 211 | else: |
196 | 212 | return False |
197 | 213 | |
198 | 214 | |
| 215 | +def which(program): |
| 216 | + def is_exe(fpath): |
| 217 | + return os.path.exists(fpath) and os.access(fpath, os.X_OK) |
| 218 | + |
| 219 | + fpath, fname = os.path.split(program) |
| 220 | + if fpath: |
| 221 | + if is_exe(program): |
| 222 | + return program |
| 223 | + else: |
| 224 | + for path in os.environ["PATH"].split(os.pathsep): |
| 225 | + exe_file = os.path.join(path, program) |
| 226 | + if is_exe(exe_file): |
| 227 | + return exe_file |
| 228 | + |
| 229 | + return None |
| 230 | + |
| 231 | + |
199 | 232 | def store_object(object, location, filename): |
200 | 233 | if hasattr(filename, '__call__'): |
201 | | - filename = construct_filename_from_function(filename, '.bin') |
| 234 | + filename = construct_filename(filename, '.bin') |
202 | 235 | if not filename.endswith('.bin'): |
203 | 236 | filename = filename + '.bin' |
204 | | - fh = open(location + filename, 'wb') |
| 237 | + fh = create_binary_filehandle(location, filename, 'wb') |
205 | 238 | cPickle.dump(object, fh) |
206 | 239 | fh.close() |
207 | 240 | |
208 | 241 | |
209 | 242 | def load_object(location, filename): |
210 | 243 | if hasattr(filename, '__call__'): |
211 | | - filename = construct_filename_from_function(filename, '.bin') |
| 244 | + filename = construct_filename(filename, '.bin') |
212 | 245 | if not filename.endswith('.bin'): |
213 | 246 | filename = filename + '.bin' |
214 | | - fh = open(location + filename, 'rb') |
| 247 | + fh = create_binary_filehandle(location, filename, 'rb') |
215 | 248 | obj = cPickle.load(fh) |
216 | 249 | fh.close() |
217 | 250 | return obj |
— | — | @@ -293,8 +326,8 @@ |
294 | 327 | |
295 | 328 | |
296 | 329 | def debug(): |
297 | | - dt = humanize_time_difference(64) |
298 | | - print dt |
299 | | - |
| 330 | + #dt = humanize_time_difference(64) |
| 331 | + #print dt |
| 332 | + check_if_process_is_running(3012) |
300 | 333 | if __name__ == '__main__': |
301 | 334 | debug() |
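A small sketch of the refactored csv and filename helpers, writing into an in-memory buffer instead of a real file so it can be tried in isolation (assumes utils.py is importable as elsewhere in the package):

    import StringIO
    from utils import utils

    fh = StringIO.StringIO()
    utils.write_list_to_csv(['editor', ['2004', '2005'], 'edit_count'], fh)
    print repr(fh.getvalue())   # 'editor\t2004\t2005\tedit_count\t'

    # construct_filename() accepts either a function or a plain name:
    print utils.construct_filename(utils.write_list_to_csv, '.bin')   # write_list_to_csv.bin
    print utils.construct_filename('editors.csv', '.bin')             # editors.csv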
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -from multiprocessing import Process, Queue |
| 21 | +from multiprocessing import Process, Queue, JoinableQueue |
22 | 22 | from Queue import Empty |
23 | 23 | |
24 | 24 | import settings |
— | — | @@ -53,29 +53,32 @@ |
54 | 54 | @kwargs is a dictionary with optional variables. Used to supply to main |
55 | 55 | ''' |
56 | 56 | |
57 | | - input_queue = Queue() |
| 57 | + nr_input_processors = kwargs.pop('nr_input_processors') |
| 58 | + nr_output_processors = kwargs.pop('nr_output_processors') |
| 59 | + |
58 | 60 | if result_queue: |
59 | | - result_queue = Queue() |
| 61 | + result_queue = JoinableQueue() |
60 | 62 | |
61 | | - load_input_queue(input_queue, obj, poison_pill=True) |
| 63 | + input_queue = load_input_queue(obj, poison_pill=True) |
62 | 64 | |
63 | 65 | if settings.PROGRESS_BAR: |
64 | 66 | pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start() |
| 67 | + kwargs['pbar'] = pbar |
65 | 68 | else: |
66 | 69 | pbar = False |
67 | 70 | |
68 | 71 | |
69 | 72 | input_processes = [models.ProcessInputQueue(main, input_queue, result_queue, |
70 | | - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)] |
| 73 | + **kwargs) for i in xrange(nr_input_processors)] |
71 | 74 | |
72 | 75 | for input_process in input_processes: |
73 | 76 | input_process.start() |
74 | 77 | pids = [p.pid for p in input_processes] |
75 | 78 | kwargs['pids'] = pids |
76 | | - |
| 79 | + |
77 | 80 | if result_queue: |
78 | 81 | result_processes = [models.ProcessResultQueue(result_processor, |
79 | | - result_queue, **kwargs) for i in xrange(24)] |
| 82 | + result_queue, **kwargs) for i in xrange(nr_output_processors)] |
80 | 83 | for result_process in result_processes: |
81 | 84 | result_process.start() |
82 | 85 | |
— | — | @@ -95,7 +98,7 @@ |
96 | 99 | print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed)) |
97 | 100 | |
98 | 101 | |
99 | | -def load_queue(input_queue, obj, poison_pill=False): |
| 102 | +def load_queue(obj, poison_pill=False): |
100 | 103 | ''' |
101 | 104 | @input_queue should be an instance of multiprocessing.Queue |
102 | 105 | |
— | — | @@ -103,7 +106,7 @@ |
104 | 107 | |
105 | 108 | @returns: queue with tasks |
106 | 109 | ''' |
107 | | - |
| 110 | + input_queue = Queue() |
108 | 111 | if isinstance(obj, type(list)): |
109 | 112 | data = utils.load_object(obj) |
110 | 113 | else: |
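The result queue is now a JoinableQueue: every item taken off the queue is acknowledged with task_done(), which is what allows a caller to block on join() until all queued work has actually been processed rather than merely dequeued. Minimal illustration:

    from multiprocessing import JoinableQueue

    q = JoinableQueue()
    q.put('some edit')
    item = q.get()
    # ... process the item ...
    q.task_done()   # without this, q.join() would block forever
    q.join()        # returns once every item that was put has been acknowledged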
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -21,6 +21,12 @@ |
22 | 22 | import codecs |
23 | 23 | import utils |
24 | 24 | import re |
| 25 | +import json |
| 26 | +import os |
| 27 | + |
| 28 | +import progressbar |
| 29 | + |
| 30 | +from utils import utils |
25 | 31 | import settings |
26 | 32 | |
27 | 33 | try: |
— | — | @@ -30,6 +36,7 @@ |
31 | 37 | pass |
32 | 38 | |
33 | 39 | |
| 40 | + |
34 | 41 | RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
35 | 42 | |
36 | 43 | |
— | — | @@ -38,7 +45,16 @@ |
39 | 46 | |
40 | 47 | |
41 | 48 | def lenient_deccharref(m): |
42 | | - return unichr(int(m.group(1))) |
| 49 | + try: |
| 50 | + return unichr(int(m.group(1))) |
| 51 | + except ValueError: |
| 52 | + ''' |
| 53 | + A few articles raise a ValueError here because this is a narrow Python |
| 54 | + build (UCS2) instead of a wide build (UCS4). The quick fix is to return |
| 55 | + an empty string; the real solution is to rebuild Python with UCS4 |
| 56 | + support. |
| 57 | + ''' |
| 58 | + return '' |
43 | 59 | |
44 | 60 | |
45 | 61 | def remove_namespace(element, namespace): |
— | — | @@ -50,42 +66,70 @@ |
51 | 67 | elem.tag = elem.tag[nsl:] |
52 | 68 | return element |
53 | 69 | |
| 70 | +def load_namespace(language): |
| 71 | + file = '%s_ns.json' % language |
| 72 | + fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING) |
| 73 | + ns = json.load(fh) |
| 74 | + fh.close() |
| 75 | + ns = ns['query']['namespaces'] |
| 76 | + return ns |
54 | 77 | |
| 78 | + |
| 79 | +def build_namespaces_locale(namespaces): |
| 80 | + ns = [] |
| 81 | + for namespace in namespaces: |
| 82 | + value = namespaces[namespace].get(u'canonical', None) |
| 83 | + if value != None and not value.endswith('talk'): |
| 84 | + ns.append(value) |
| 85 | + return ns |
| 86 | + |
| 87 | + |
55 | 88 | def parse_comments(xml, function): |
56 | 89 | revisions = xml.findall('revision') |
57 | 90 | for revision in revisions: |
58 | 91 | comment = revision.find('comment') |
59 | 92 | timestamp = revision.find('timestamp').text |
60 | | - |
61 | 93 | # text1 = remove_ascii_control_characters(text) |
62 | 94 | # text2 = remove_numeric_character_references(text) |
63 | 95 | # text3 = convert_html_entities(text) |
64 | | - |
65 | 96 | if comment != None and comment.text != None: |
66 | 97 | comment.text = function(comment.text) |
67 | 98 | return xml |
68 | 99 | |
69 | 100 | |
| 101 | +def is_article_main_namespace(elem, namespace): |
| 102 | + title = elem.find('title').text |
| 103 | + for ns in namespace: |
| 104 | + if title.startswith(ns): |
| 105 | + return False |
| 106 | + return True |
| 107 | + |
| 108 | + |
| 109 | + |
70 | 110 | def write_xml_file(element, fh, counter, language): |
71 | 111 | '''Get file handle and write xml element to file''' |
72 | 112 | size = len(cElementTree.tostring(element)) |
73 | | - fh, counter = create_xml_file_handle(fh, counter, size) |
74 | | - fh.write(cElementTree.tostring(element)) |
| 113 | + fh, counter = create_xml_file_handle(fh, counter, size, language) |
| 114 | + try: |
| 115 | + fh.write(cElementTree.tostring(element)) |
| 116 | + except MemoryError: |
| 117 | + print 'Add error capturing logic' |
75 | 118 | fh.write('\n') |
76 | 119 | return fh, counter |
77 | 120 | |
78 | 121 | |
79 | | -def create_xml_file_handle(fh, counter, size): |
| 122 | +def create_xml_file_handle(fh, counter, size, language): |
80 | 123 | '''Create file handle if none is supplied or if file size > max file size.''' |
| 124 | + # Build the chunk path only after counter has been initialized or incremented below. |
81 | 125 | if not fh: |
82 | 126 | counter = 0 |
83 | | - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 127 | + fh = codecs.open(os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter), 'w', encoding=settings.ENCODING) |
84 | 128 | return fh, counter |
85 | 129 | elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
86 | 130 | print 'Created chunk %s' % counter |
87 | 131 | fh.close |
88 | 132 | counter += 1 |
89 | | - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 133 | + fh = codecs.open(os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter), 'w', encoding=settings.ENCODING) |
90 | 134 | return fh, counter |
91 | 135 | else: |
92 | 136 | return fh, counter |
— | — | @@ -93,14 +137,21 @@ |
94 | 138 | |
95 | 139 | def split_xml(language): |
96 | 140 | '''Reads xml file and splits it in N chunks''' |
97 | | - result = utils.create_directory(language) |
| 141 | + location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 142 | + result = utils.check_file_exists(location, '') |
| 143 | + if result == False: |
| 144 | + result = utils.create_directory(location) |
98 | 145 | if not result: |
99 | 146 | return |
100 | 147 | |
| 148 | + ns = load_namespace(language) |
| 149 | + ns = build_namespaces_locale(ns) |
| 150 | + |
| 151 | + |
101 | 152 | fh = None |
102 | 153 | counter = None |
103 | 154 | tag = '{%s}page' % settings.NAME_SPACE |
104 | | - |
| 155 | + |
105 | 156 | context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end')) |
106 | 157 | context = iter(context) |
107 | 158 | event, root = context.next() # get the root element of the XML doc |
— | — | @@ -110,12 +161,16 @@ |
111 | 162 | if elem.tag == tag: |
112 | 163 | elem = remove_namespace(elem, settings.NAME_SPACE) |
113 | 164 | elem = parse_comments(elem, remove_numeric_character_references) |
| 165 | + |
| 166 | + if is_article_main_namespace(elem, ns): |
| 167 | + fh, counter = write_xml_file(elem, fh, counter, language) |
| 168 | + |
| 169 | + root.clear() # when done parsing a section, clear the tree to save memory |
| 170 | + |
114 | 171 | #elem = parse_comments(elem, convert_html_entities) |
115 | 172 | #elem = parse_comments(elem, remove_ascii_control_characters) |
116 | | - fh, counter = write_xml_file(elem, fh, counter, language) |
117 | 173 | #print cElementTree.tostring(elem) |
118 | | - root.clear() # when done parsing a section clear the tree to safe memory |
119 | 174 | |
120 | 175 | |
121 | 176 | if __name__ == "__main__": |
122 | | - split_xml('enwiki') |
| 177 | + split_xml('en') |
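A toy illustration (made-up titles) of the new main-namespace filter: once build_namespaces_locale() has produced the list of canonical prefixes, a page is written to a chunk only when its title does not start with one of them.

    ns = ['User', 'Wikipedia', 'File']      # e.g. output of build_namespaces_locale()

    def keep(title):
        for prefix in ns:
            if title.startswith(prefix):
                return False
        return True

    print keep('Amsterdam')                 # True  -> written to a chunk
    print keep('User:Example/sandbox')      # False -> skipped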
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -19,7 +19,6 @@ |
20 | 20 | |
21 | 21 | from multiprocessing import Queue |
22 | 22 | from Queue import Empty |
23 | | -import sqlite3 |
24 | 23 | |
25 | 24 | import progressbar |
26 | 25 | |
— | — | @@ -35,46 +34,63 @@ |
36 | 35 | pass |
37 | 36 | |
38 | 37 | |
39 | | -def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True): |
40 | | - raise DeprecatedError |
41 | | -# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
42 | | -# retrieve_editor_ids_mongo): |
43 | | -# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
44 | | -# retrieve_editor_ids_mongo) |
45 | | -# else: |
46 | | -# mongo = db.init_mongo_db('editors') |
47 | | -# editors = mongo['editors'] |
48 | | -# contributors = set() |
49 | | -# #ids = editors.find().distinct('editor') |
50 | | -# ids = editors.find() |
51 | | -# for x, id in enumerate(ids): |
52 | | -# contributors.add(id['editor']) |
53 | | -# if len(contributors) == 100000: |
54 | | -# if RANDOM_SAMPLE: |
55 | | -# break |
56 | | -# if contributors != set(): |
57 | | -# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
58 | | -# return contributors |
59 | | - |
60 | | -def retrieve_ids_mongo_new(dbname, collection): |
61 | | - if utils.check_file_exists(settings.TXT_FILE_LOCATION, |
| 38 | +def retrieve_editor_ids_mongo(dbname, collection): |
| 39 | + if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
62 | 40 | retrieve_editor_ids_mongo): |
63 | | - ids = utils.load_object(settings.TXT_FILE_LOCATION, |
| 41 | + ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
64 | 42 | retrieve_editor_ids_mongo) |
65 | 43 | else: |
66 | 44 | mongo = db.init_mongo_db(dbname) |
67 | 45 | editors = mongo[collection] |
68 | | - ids = editors.distinct() |
69 | | - utils.store_object(contributors, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 46 | + ids = editors.distinct('editor') |
| 47 | + utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
70 | 48 | return ids |
71 | 49 | |
| 50 | + |
| 51 | +def expand_edits(edits): |
| 52 | + data = [] |
| 53 | + for edit in edits: |
| 54 | + data.append(edit['date']) |
| 55 | + return data |
| 56 | + |
| 57 | + |
| 58 | +def expand_observations(obs, vars_to_expand): |
| 59 | + for var in vars_to_expand: |
| 60 | + if var == 'edits': |
| 61 | + obs[var] = expand_edits(obs[var]) |
| 62 | + elif var == 'edits_by_year': |
| 63 | + keys = obs[var].keys() |
| 64 | + keys.sort() |
| 65 | + edits = [] |
| 66 | + for key in keys: |
| 67 | + edits.append(str(obs[var][key])) |
| 68 | + obs[var] = edits |
| 69 | + return obs |
| 70 | + |
| 71 | + |
| 72 | +def expand_headers(headers, vars_to_expand, obs): |
| 73 | + for var in vars_to_expand: |
| 74 | + l = len(obs[var]) |
| 75 | + pos = headers.index(var) |
| 76 | + for i in xrange(l): |
| 77 | + if var.endswith('year'): |
| 78 | + suffix = 2001 + i |
| 79 | + elif var.endswith('edits'): |
| 80 | + suffix = 1 + i |
| 81 | + headers.insert(pos+i, '%s_%s' % (var, suffix)) |
| 82 | + headers.remove(var) |
| 83 | + return headers |
| 84 | + |
| 85 | + |
72 | 86 | def generate_editor_dataset(input_queue, data_queue, pbar, kwargs): |
73 | | - definition = kwargs.pop('definition') |
74 | | - limit = kwargs.pop('limit') |
75 | 87 | debug = kwargs.pop('debug') |
76 | | - mongo = db.init_mongo_db('editors') |
77 | | - editors = mongo['editors'] |
78 | | - data = {} |
| 88 | + dbname = kwargs.pop('dbname') |
| 89 | + mongo = db.init_mongo_db(dbname) |
| 90 | + editors = mongo['dataset'] |
| 91 | + name = dbname + '_editors.csv' |
| 92 | + fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
| 93 | + x = 0 |
| 94 | + vars_to_expand = ['edits', 'edits_by_year'] |
79 | 95 | while True: |
80 | 96 | try: |
81 | 97 | if debug: |
— | — | @@ -83,115 +99,68 @@ |
84 | 100 | id = input_queue.get(block=False) |
85 | 101 | |
86 | 102 | print input_queue.qsize() |
87 | | - if definition == 'Traditional': |
88 | 103 | |
89 | | - obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit) |
90 | | - contributors = [] |
91 | | - for ob in obs: |
92 | | - contributors.append(ob['date']) |
93 | | - obs = '' |
94 | | - else: |
95 | | - obs = editors.find({'editor': id}, {'date':1}).sort('date') |
96 | | - contributors = set() |
97 | | - for ob in obs: |
98 | | - if len(contributors) == limit: |
99 | | - break |
100 | | - else: |
101 | | - contributors.add(ob['date']) |
102 | | - obs.close() |
103 | | - if len(contributors) < limit: |
104 | | - new_wikipedian = False |
105 | | - else: |
106 | | - new_wikipedian = True |
107 | | - data[id] = [contributors, new_wikipedian] |
| 104 | + obs = editors.find_one({'editor': id}) |
| 105 | + obs = expand_observations(obs, vars_to_expand) |
| 106 | + if x == 0: |
| 107 | + headers = obs.keys() |
| 108 | + headers.sort() |
| 109 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 110 | + utils.write_list_to_csv(headers, fh) |
| 111 | + fh.write('\n') |
| 112 | + data = [] |
| 113 | + keys = obs.keys() |
| 114 | + keys.sort() |
| 115 | + for key in keys: |
| 116 | + data.append(obs[key]) |
| 117 | + utils.write_list_to_csv(data, fh) |
| 118 | + fh.write('\n') |
108 | 119 | |
109 | | - |
| 120 | + x += 1 |
110 | 121 | except Empty: |
111 | | - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
112 | 122 | break |
| 123 | + fh.close() |
113 | 124 | |
114 | 125 | |
115 | | -def retrieve_editor_ids_db(): |
116 | | - contributors = set() |
117 | | - connection = db.init_database() |
118 | | - cursor = connection.cursor() |
119 | | - if settings.PROGRESS_BAR: |
120 | | - cursor.execute('SELECT MAX(ROWID) FROM contributors') |
121 | | - for id in cursor: |
122 | | - pass |
123 | | - pbar = progressbar.ProgressBar(maxval=id[0]).start() |
124 | | - |
125 | | - cursor.execute('SELECT contributor FROM contributors WHERE bot=0') |
126 | | - |
127 | | - print 'Retrieving contributors...' |
128 | | - for x, contributor in enumerate(cursor): |
129 | | - contributors.add(contributor[0]) |
130 | | - if x % 100000 == 0: |
131 | | - pbar.update(x) |
132 | | - print 'Serializing contributors...' |
133 | | - utils.store_object(contributors, 'contributors') |
134 | | - print 'Finished serializing contributors...' |
135 | | - |
136 | | - if pbar: |
137 | | - pbar.finish() |
138 | | - print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed)) |
139 | | - |
140 | | - connection.close() |
141 | | - |
142 | | - |
143 | | -def retrieve_edits_by_contributor(input_queue, result_queue, pbar): |
144 | | - connection = db.init_database() |
145 | | - cursor = connection.cursor() |
146 | | - |
147 | | - while True: |
148 | | - try: |
149 | | - contributor = input_queue.get(block=False) |
150 | | - if contributor == None: |
151 | | - break |
152 | | - |
153 | | - cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,)) |
154 | | - edits = {} |
155 | | - edits[contributor] = set() |
156 | | - for edit, timestamp, bot in cursor: |
157 | | - date = utils.convert_timestamp_to_date(timestamp) |
158 | | - edits[contributor].add(date) |
159 | | - #print edit, timestamp, bot |
160 | | - |
161 | | - utils.write_data_to_csv(edits, retrieve_edits_by_contributor) |
162 | | - if pbar: |
163 | | - utils.update_progressbar(pbar, input_queue) |
164 | | - |
165 | | - except Empty: |
166 | | - pass |
167 | | - |
168 | | - connection.close() |
169 | | - |
170 | | - |
171 | 126 | def retrieve_edits_by_contributor_launcher(): |
172 | 127 | pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors') |
173 | 128 | |
174 | 129 | |
175 | 130 | def debug_retrieve_edits_by_contributor_launcher(): |
176 | | - q = Queue() |
177 | | - kwargs = {'definition':'Traditional', |
178 | | - 'limit': 10, |
179 | | - 'debug': False |
| 131 | + kwargs = {'debug': False, |
| 132 | + 'dbname': 'enwiki', |
180 | 133 | } |
181 | | - ids = retrieve_editor_ids_mongo() |
182 | | - input_queue = pc.load_queue(q, ids) |
183 | | - generate_editor_dataset(input_queue, False, False, kwargs) |
| 134 | + ids = retrieve_editor_ids_mongo('enwiki', 'editors') |
| 135 | + input_queue = pc.load_queue(ids) |
| 136 | + q = Queue() |
| 137 | + generate_editor_dataset(input_queue, q, False, kwargs) |
184 | 138 | #generate_editor_dataset_launcher() |
185 | 139 | #retrieve_list_contributors() |
186 | 140 | #retrieve_edits_by_contributor() |
187 | 141 | |
188 | 142 | def generate_editor_dataset_launcher(): |
189 | | - kwargs = {'definition':'Traditional', |
190 | | - 'limit': 10, |
191 | | - 'debug': False |
| 143 | + kwargs = {'nr_input_processors': 1, |
| 144 | + 'nr_output_processors': 1, |
| 145 | + 'debug': False, |
| 146 | + 'dbname': 'enwiki', |
192 | 147 | } |
193 | | - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, kwargs) |
| 148 | + ids = retrieve_editor_ids_mongo('enwiki', 'editors') |
| 149 | + pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, **kwargs) |
194 | 150 | |
195 | 151 | |
| 152 | +def generate_editor_dataset_debug(): |
| 153 | + ids = retrieve_editor_ids_mongo('enwiki', 'editors') |
| 154 | + input_queue = pc.load_queue(ids) |
| 155 | + #write_dataset(input_queue, [], 'enwiki') |
| 156 | + kwargs = {'nr_input_processors': 1, |
| 157 | + 'nr_output_processors': 1, |
| 158 | + 'debug': True, |
| 159 | + 'dbname': 'enwiki', |
| 160 | + } |
| 161 | + generate_editor_dataset(input_queue, False, False, kwargs) |
| 162 | + |
| 163 | + |
196 | 164 | if __name__ == '__main__': |
197 | | - #generate_editor_dataset_launcher() |
198 | | - debug_retrieve_edits_by_contributor_launcher() |
| 165 | + #generate_editor_dataset_debug() |
| 166 | + generate_editor_dataset_launcher() |
| 167 | + #debug_retrieve_edits_by_contributor_launcher() |
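A made-up observation showing what expand_headers() and expand_observations() do to a per-year variable before the row is written to the csv file (assumes construct_datasets.py and its dependencies are importable):

    import construct_datasets as cd

    obs = {'editor': '12345',
           'edits_by_year': {'2001': 0, '2002': 3, '2003': 7}}

    headers = cd.expand_headers(['editor', 'edits_by_year'], ['edits_by_year'], obs)
    print headers   # ['editor', 'edits_by_year_2001', 'edits_by_year_2002', 'edits_by_year_2003']

    obs = cd.expand_observations(obs, ['edits_by_year'])
    print obs['edits_by_year']   # ['0', '3', '7']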
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
199 | 168 | - wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
200 | 169 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |