Index: trunk/tools/editor_trends/etl/optimize_editors.py |
— | — | @@ -0,0 +1,172 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-02' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +from multiprocessing import Queue |
| 22 | +from Queue import Empty |
| 23 | +from operator import itemgetter |
| 24 | +import datetime |
| | +import copy |
| 25 | + |
| 26 | +import configuration |
| 27 | +settings = configuration.Settings() |
| 28 | +from database import db |
| 29 | +from utils import process_constructor as pc |
| 30 | +from utils import utils |
| 31 | +import construct_datasets |
| 32 | + |
| 33 | + |
| 34 | +try: |
| 35 | + import psyco |
| 36 | + psyco.full() |
| 37 | +except ImportError: |
| 38 | + pass |
| 39 | + |
| 40 | + |
| 41 | +def create_datacontainer(init_value=0): |
| 42 | +    ''' |
| 43 | +    This function initializes a dictionary with as keys the years from 2001 |
| 44 | +    through the current year and @init_value as the value of each key. In |
| 45 | +    most cases this will be zero, so the dictionary acts as a running tally |
| 46 | +    for a variable, but @init_value can also be a list, [], a dictionary, |
| 47 | +    {}, or a set, set(). |
| 48 | +    ''' |
| 49 | +    data = {} |
| 50 | +    year = datetime.datetime.now().year + 1 |
| 51 | +    for x in xrange(2001, year): |
| | +        # deep copy the initial value; a mutable @init_value such as set() |
| | +        # would otherwise be shared by all years |
| 52 | +        data[str(x)] = copy.deepcopy(init_value) |
| 53 | +    return data |
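| | +# A quick sketch of the result (assuming the current year is 2010): |
| | +#   create_datacontainer(0)  -> {'2001': 0, '2002': 0, ..., '2010': 0} |
| | +#   create_datacontainer([]) -> {'2001': [], ..., '2010': []}, where each |
| | +#   year receives its own list thanks to the deep copy above. |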
| 53 | + |
| 54 | + |
| 55 | +def add_months_to_datacontainer(datacontainer): |
| 56 | + for dc in datacontainer: |
| 57 | + datacontainer[dc] = {} |
| 58 | + for x in xrange(1, 13): |
| 59 | + datacontainer[dc][str(x)] = 0 |
| 60 | + return datacontainer |
| 61 | + |
| 62 | + |
| 63 | +def determine_edits_by_month(edits): |
| 64 | + datacontainer = create_datacontainer(init_value=0) |
| 65 | + datacontainer = add_months_to_datacontainer(datacontainer) |
| 66 | + for year in edits: |
| 67 | + months = set() |
| 68 | + for edit in edits[year]: |
| 69 | + m = str(edit['date'].month) |
| 70 | + if m not in months: |
| 71 | + datacontainer[year][m] = 1 |
| 72 | + months.add(m) |
| 73 | + if len(months) == 12: |
| 74 | + break |
| 75 | + return datacontainer |
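| | +# Note: despite the name, determine_edits_by_month() yields a binary |
| | +# activity indicator per month (1 if the editor edited in that month, |
| | +# else 0), not an edit count; the loop stops once all 12 months are seen. |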
| 76 | + |
| 77 | + |
| 78 | +def determine_edits_by_year(dates): |
| 79 | + ''' |
| 80 | + This function counts the number of edits by year made by a particular editor. |
| 81 | + ''' |
| 82 | + edits = create_datacontainer() |
| 83 | + for date in dates: |
| 84 | + year = str(date['date'].year) |
| 85 | + edits[year] += 1 |
| 86 | + return edits |
| 87 | + |
| 88 | + |
| 89 | +def determine_articles_by_year(dates): |
| 90 | + ''' |
| 91 | + This function counts the number of unique articles by year edited by a |
| 92 | + particular editor. |
| 93 | + ''' |
| 94 | + articles = create_datacontainer(set()) |
| 95 | + for date in dates: |
| 96 | + year = str(date['date'].year) |
| 97 | + articles[year].add(date['article']) |
| 98 | +    for year in articles: |
| 99 | +        articles[year] = len(articles[year]) |
| 100 | + return articles |
| 101 | + |
| 102 | + |
| 103 | +def sort_edits(edits): |
| 104 | + edits = utils.merge_list(edits) |
| 105 | + return sorted(edits, key=itemgetter('date')) |
| 106 | + |
| 107 | + |
| 108 | +def optimize_editors(input_queue, result_queue, pbar, **kwargs): |
| 109 | + dbname = kwargs.pop('dbname') |
| 110 | + mongo = db.init_mongo_db(dbname) |
| 111 | + input = mongo['test'] |
| 112 | + output = mongo['dataset'] |
| 113 | + output.ensure_index('editor') |
| 114 | + output.ensure_index('year_joined') |
| 115 | + definition = kwargs.pop('definition') |
| 116 | + while True: |
| 117 | + try: |
| 118 | + id = input_queue.get(block=False) |
| 119 | + editor = input.find_one({'editor': id}) |
| 120 | +            if editor is None: |
| 121 | + continue |
| 122 | + edits = editor['edits'] |
| 123 | + monthly_edits = determine_edits_by_month(edits) |
| 124 | + edits = sort_edits(edits) |
| 125 | + edit_count = len(edits) |
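| | +            # the 10th edit marks the date an editor becomes a 'new wikipedian' |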
| 126 | + new_wikipedian = edits[9]['date'] |
| 127 | + first_edit = edits[0]['date'] |
| 128 | + final_edit = edits[-1]['date'] |
| 129 | + edits_by_year = determine_edits_by_year(edits) |
| 130 | + articles_by_year = determine_articles_by_year(edits) |
| 131 | + |
| 132 | + edits = edits[:10] |
| 133 | + |
| 134 | + output.insert({'editor': id, 'edits': edits, |
| 135 | + 'edits_by_year': edits_by_year, |
| 136 | + 'new_wikipedian': new_wikipedian, |
| 137 | + 'edit_count': edit_count, |
| 138 | + 'final_edit': final_edit, |
| 139 | + 'first_edit': first_edit, |
| 140 | + 'articles_by_year': articles_by_year, |
| 141 | + 'monthly_edits': monthly_edits}) |
| 142 | + print 'Items left: %s' % input_queue.qsize() |
| 143 | + except Empty: |
| 144 | + break |
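| | +# Each document stored in the 'dataset' collection has this shape (dates |
| | +# are datetime objects): |
| | +# {'editor': id, 'edits': <first 10 edits>, 'edit_count': int, |
| | +#  'first_edit': date, 'final_edit': date, 'new_wikipedian': date, |
| | +#  'edits_by_year': {...}, 'articles_by_year': {...}, 'monthly_edits': {...}} |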
| 145 | + |
| 146 | + |
| 147 | +def run_optimize_editors(dbname): |
| 148 | + ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 149 | + kwargs = {'definition': 'traditional', |
| 150 | + 'pbar': True, |
| 151 | +              'dbname': dbname, |
| 152 | + 'nr_input_processors': 1, |
| 153 | + 'nr_output_processors': 0, |
| 154 | + 'poison_pill': False |
| 155 | + } |
| 156 | + print len(ids) |
| 157 | + ids = list(ids) |
| 158 | +    chunks = {0: ids} |
| 159 | + pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) |
| 160 | + |
| 161 | + |
| 162 | +def debug_optimize_editors(dbname): |
| 163 | + ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 164 | + q = pc.load_queue(ids) |
| 165 | + kwargs = {'definition': 'traditional', |
| 166 | + 'dbname': dbname |
| 167 | + } |
| 168 | +    optimize_editors(q, False, True, **kwargs) |
| 169 | + |
| 170 | + |
| 171 | +if __name__ == '__main__': |
| 172 | + #debug_optimize_editors('test') |
| 173 | + run_optimize_editors('enwiki') |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/etl/optimize_editors.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 174 | + native |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -0,0 +1,338 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +#Default Python libraries (Python => 2.6) |
| 22 | +import sys |
| 23 | +import os |
| 24 | +import time |
| 25 | +import datetime |
| 26 | +import codecs |
| 27 | +import math |
| 28 | +import cStringIO |
| 29 | +import re |
| 30 | +from operator import itemgetter |
| 31 | +import xml.etree.cElementTree as cElementTree |
| 32 | +from multiprocessing import Queue, JoinableQueue |
| 33 | +from Queue import Empty |
| 34 | +import pymongo |
| 35 | + |
| 36 | +# Custom written files |
| 37 | +import configuration |
| 38 | +settings = configuration.Settings() |
| 39 | +from utils import utils, models |
| 40 | +from database import db_settings |
| 41 | +from database import db |
| 42 | +from database import cache |
| 43 | +from wikitree import xml |
| 44 | +from statistics import dataset |
| 45 | +from utils import process_constructor as pc |
| 46 | + |
| 47 | + |
| 48 | +try: |
| 49 | + import psyco |
| 50 | + psyco.full() |
| 51 | +except ImportError: |
| 52 | + pass |
| 53 | + |
| 54 | + |
| 55 | +def determine_username_is_bot(username, kwargs): |
| 56 | +    ''' |
| 57 | +    @username is the xml element containing the id of the user |
| 58 | +    @kwargs should have a list with all the bot ids |
| 59 | + |
| 60 | +    Returns 1 if the username id is a known bot id, 0 otherwise (including |
| 61 | +    when the username element is missing). |
| 62 | +    ''' |
| 63 | +    ids = kwargs.get('bots', []) |
| 64 | +    if ids is None: |
| 65 | +        ids = [] |
| 66 | +    if username is not None and username.text is not None: |
| 67 | +        id = username.text |
| 68 | +        if id in ids: |
| 69 | +            return 1 |
| 70 | +    return 0 |
| 72 | + |
| 73 | + |
| 74 | +def extract_contributor_id(contributor, kwargs): |
| 75 | + ''' |
| 76 | + @contributor is the xml contributor node containing a number of attributes |
| 77 | + |
| 78 | + Currently, we are only interested in registered contributors, hence we |
| 79 | + ignore anonymous editors. If you are interested in collecting data on |
| 80 | + anonymous editors then add the string 'ip' to the tags variable. |
| 81 | + ''' |
| 82 | + tags = ['id'] |
| 83 | +    if contributor.get('deleted'): |
| 84 | +        return -1 # ASK: Not sure if this is the best way to code deleted contributors. |
| 85 | +    for elem in contributor: |
| 86 | +        if elem.tag in tags: |
| 87 | +            if elem.text is not None: |
| 88 | +                return elem.text.decode('utf-8') |
| 89 | +            else: |
| 90 | +                return -1 |
| 91 | + |
| 92 | + |
| 93 | +def output_editor_information(elem, output, **kwargs): |
| 94 | + ''' |
| 95 | + @elem is an XML element containing 1 revision from a page |
| 96 | + @output is where to store the data, either a queue or a filehandle |
| 97 | + @**kwargs contains extra information |
| 98 | + |
| 99 | +    The tags variable determines which attributes are parsed; the values in |
| 100 | +    this dictionary are the functions used to extract the data. |
| 101 | + ''' |
| 102 | + tags = {'contributor': {'editor': extract_contributor_id, |
| 103 | + 'bot': determine_username_is_bot}, |
| 104 | + 'timestamp': {'date': xml.extract_text}, |
| 105 | + } |
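| | +    # tags maps an XML tag name to {output variable: extractor function}; |
| | +    # each revision is walked once and every extractor is applied to its |
| | +    # matching child node |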
| 106 | + vars = {} |
| 107 | + headers = ['editor', 'date', 'article'] |
| 108 | + destination = kwargs.pop('destination') |
| 109 | + revisions = elem.findall('revision') |
| 110 | + for revision in revisions: |
| 111 | + vars['article'] = elem.find('id').text.decode(settings.encoding) |
| 112 | + elements = revision.getchildren() |
| 113 | + for tag, functions in tags.iteritems(): |
| 114 | + xml_node = xml.retrieve_xml_node(elements, tag) |
| 115 | + for var, function in functions.iteritems(): |
| 116 | + vars[var] = function(xml_node, kwargs) |
| 117 | + |
| 118 | + #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
| 119 | +        if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] is not None: |
| 120 | +            vars.pop('bot') |
| 121 | +            if destination == 'queue': |
| 122 | +                # convert the date before putting vars on the queue; the |
| 123 | +                # queue pickles asynchronously, so mutating afterwards races |
| | +                vars['date'] = utils.convert_timestamp_to_date(vars['date']) |
| | +                output.put(vars) |
| 124 | + elif destination == 'file': |
| 125 | + data = [] |
| 126 | + for head in headers: |
| 127 | + data.append(vars[head]) |
| 128 | + utils.write_list_to_csv(data, output) |
| 129 | + vars = {} |
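| | +# With destination='file' each kept revision becomes one tab-separated |
| | +# line in the order of @headers: editor id, timestamp, article id. |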
| 130 | + |
| 131 | + |
| 132 | +def parse_editors(xml_queue, data_queue, **kwargs): |
| 133 | + ''' |
| 134 | + @xml_queue contains the filenames of the files to be parsed |
| 135 | + @data_queue is an instance of Queue where the extracted data is stored for |
| 136 | + further processing |
| 137 | + @pbar is an instance of progressbar to display the progress |
| 138 | +    @bots is a list of ids of known Wikipedia bots |
| 139 | + @debug is a flag to indicate whether the function is called for debugging. |
| 140 | + |
| 141 | + Output is the data_queue that will be used by store_editors() |
| 142 | + ''' |
| 143 | + input = kwargs.get('input', None) |
| 144 | + output = kwargs.get('output', None) |
| 145 | + debug = kwargs.get('debug', False) |
| 146 | + destination = kwargs.get('destination', 'file') |
| 147 | + bots = kwargs.get('bots', None) |
| 148 | + pbar = kwargs.get('pbar', None) |
| 149 | + if settings.debug: |
| 150 | + messages = {} |
| 151 | + vars = {} |
| 152 | + |
| 153 | + while True: |
| 154 | + try: |
| 155 | + if debug: |
| 156 | + file = xml_queue |
| 157 | + else: |
| 158 | + file = xml_queue.get(block=False) |
| 159 | +                if file is None: |
| 160 | + print 'Swallowed a poison pill' |
| 161 | + break |
| 162 | + |
| 163 | + data = xml.read_input(utils.create_txt_filehandle(input, |
| 164 | + file, 'r', |
| 165 | + encoding=settings.encoding)) |
| 166 | + if destination == 'file': |
| 167 | + name = file[:-4] + '.txt' |
| 168 | + fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding) |
| 169 | + for raw_data in data: |
| 170 | + xml_buffer = cStringIO.StringIO() |
| 171 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 172 | + |
| 173 | + try: |
| 174 | + raw_data = ''.join(raw_data) |
| 175 | + xml_buffer.write(raw_data) |
| 176 | + elem = cElementTree.XML(xml_buffer.getvalue()) |
| 177 | + output_editor_information(elem, fh, bots=bots, destination=destination) |
| 178 | + except SyntaxError, error: |
| 179 | + print error |
| 180 | + ''' |
| 181 | + There are few cases with invalid tokens, they are fixed |
| 182 | + here and then reinserted into the XML DOM |
| 183 | + data = convert_html_entities(xml_buffer.getvalue()) |
| 184 | + elem = cElementTree.XML(data) |
| 185 | + output_editor_information(elem) |
| 186 | + ''' |
| 187 | + if settings.debug: |
| 188 | + utils.track_errors(xml_buffer, error, file, messages) |
| 189 | + except UnicodeEncodeError, error: |
| 190 | + print error |
| 191 | + if settings.debug: |
| 192 | + utils.track_errors(xml_buffer, error, file, messages) |
| 193 | + except MemoryError, error: |
| 194 | + print file, error |
| 195 | + print raw_data[:12] |
| 196 | + print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
| 197 | + if destination == 'queue': |
| 198 | + output.put('NEXT') |
| 199 | + while True: |
| 200 | + if output.qsize() < 100000: |
| 201 | + break |
| 202 | + else: |
| 203 | + time.sleep(10) |
| 204 | + print 'Still sleeping, queue is %s items long' % output.qsize() |
| 205 | + |
| 206 | + else: |
| 207 | + fh.close() |
| 208 | + |
| 209 | + if pbar: |
| 210 | + print file, xml_queue.qsize() |
| 211 | + #utils.update_progressbar(pbar, xml_queue) |
| 212 | + |
| 213 | + if debug: |
| 214 | + break |
| 215 | + |
| 216 | + except Empty: |
| 217 | + break |
| 218 | + |
| 219 | + if destination == 'queue': |
| 220 | + data_queue.put(None) |
| 221 | + |
| 222 | + if settings.debug: |
| 223 | + utils.report_error_messages(messages, parse_editors) |
| 224 | + |
| 225 | + |
| 226 | +def store_editors(data_queue, **kwargs): |
| 227 | + ''' |
| 228 | + @data_queue is an instance of Queue containing information extracted by |
| 229 | + parse_editors() |
| 230 | + @pids is a list of PIDs used to check if other processes are finished |
| 231 | + running |
| 232 | + @dbname is the name of the MongoDB collection where to store the information. |
| 233 | + ''' |
| 234 | + dbname = kwargs.get('dbname', None) |
| 235 | + mongo = db.init_mongo_db(dbname) |
| 236 | + collection = mongo['editors'] |
| 237 | +    collection.ensure_index('editor') |
| 238 | + editor_cache = cache.EditorCache(collection) |
| 239 | + |
| 240 | + while True: |
| 241 | + try: |
| 242 | + edit = data_queue.get(block=False) |
| 243 | + data_queue.task_done() |
| 244 | +            if edit is None: |
| 245 | + print 'Swallowing poison pill' |
| 246 | + break |
| 247 | + elif edit == 'NEXT': |
| 248 | + editor_cache.add('NEXT', '') |
| 249 | + else: |
| 250 | + contributor = edit['editor'] |
| 251 | + value = {'date': edit['date'], 'article': edit['article']} |
| 252 | + editor_cache.add(contributor, value) |
| 253 | + #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
| 254 | + #'$inc': {'edit_count': 1}, |
| 255 | + |
| 256 | + except Empty: |
| 257 | +        except Empty: |
| 258 | +            ''' |
| 259 | +            This checks whether the Queue is empty because the preprocessors are |
| 260 | +            finished or because this function is faster in emptying the Queue |
| 261 | +            than the preprocessors are able to fill it. If the preprocessors |
| 262 | +            are finished and this Queue is empty then break, else wait for the |
| 263 | +            Queue to fill. |
| 264 | + pass |
| 265 | + |
| 266 | + print 'Emptying entire cache.' |
| 267 | + editor_cache.store() |
| 268 | + print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
| 269 | + |
| 270 | + |
| 271 | +def load_cache_objects(): |
| 272 | + cache = {} |
| 273 | + files = utils.retrieve_file_list(settings.binary_location, '.bin') |
| 274 | + for x, file in enumerate(files): |
| 275 | + cache[x] = utils.load_object(settings.binary_location, file) |
| 276 | + return cache |
| 277 | + |
| 278 | + |
| 279 | +def search_cache_for_missed_editors(dbname): |
| 280 | +    mongo = db.init_mongo_db(dbname) |
| 281 | +    collection = mongo['editors'] |
| 282 | +    editor_cache = cache.EditorCache(collection) |
| | +    # use a name other than 'cache': rebinding it locally would shadow the |
| | +    # imported cache module and raise an UnboundLocalError above |
| 283 | +    caches = load_cache_objects() |
| 284 | +    for c in caches: |
| 285 | +        for editor in caches[c]: |
| 286 | +            editor_cache.add(editor, caches[c][editor]) |
| 287 | +        caches[c] = {} |
| 288 | +        editor_cache.add('NEXT', '') |
| 289 | +    caches = {} |
| 290 | + |
| 291 | + |
| 292 | + |
| 293 | +def load_bot_ids(): |
| 294 | + ''' |
| 295 | +    Loader function to retrieve the list of ids of known Wikipedia bots. |
| 296 | + ''' |
| 297 | + ids = {} |
| 298 | + mongo = db.init_mongo_db('bots') |
| 299 | + bots = mongo['ids'] |
| 300 | + cursor = bots.find() |
| 301 | + for bot in cursor: |
| 302 | + ids[bot['id']] = bot['name'] |
| 303 | + return ids |
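| | +# Returns a dict mapping bot id to bot name, e.g. (hypothetical values) |
| | +# {'12345': 'ExampleBot'}; these ids feed determine_username_is_bot(). |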
| 304 | + |
| 305 | + |
| 306 | +def run_parse_editors(location, language, project): |
| 307 | + ids = load_bot_ids() |
| 308 | + base = os.path.join(location, language, project) |
| 309 | + input = os.path.join(base, 'chunks') |
| 310 | + output = os.path.join(base, 'txt') |
| 311 | + settings.verify_environment([input, output]) |
| 312 | + files = utils.retrieve_file_list(input, 'xml') |
| 313 | + |
| 314 | + kwargs = {'bots': ids, |
| 315 | + 'dbname': language + project, |
| 316 | + 'language': language, |
| 317 | + 'project': project, |
| 318 | + 'pbar': True, |
| 319 | + 'destination': 'file', |
| 320 | + 'nr_input_processors': settings.number_of_processes, |
| 321 | + 'nr_output_processors': settings.number_of_processes, |
| 322 | + 'input': input, |
| 323 | + 'output': output, |
| 324 | + } |
| 325 | + |
| 326 | + chunks = utils.split_list(files, settings.number_of_processes) |
| 327 | + pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
| 328 | + |
| 329 | + |
| 330 | +def debug_parse_editors(dbname): |
| 331 | + q = JoinableQueue() |
| 332 | +    parse_editors('522.xml', q, debug=True, destination='file') |
| 333 | +    store_editors(q, dbname=dbname) |
| 334 | + |
| 335 | + |
| 336 | +if __name__ == "__main__": |
| 337 | + #debug_parse_editors('test2') |
| 338 | + run_parse_editors(settings.input_location, 'en', 'wiki') |
| 339 | + pass |
Property changes on: trunk/tools/editor_trends/etl/extract.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 340 | + text/plain |
Added: svn:eol-style |
2 | 341 | + native |
Index: trunk/tools/editor_trends/etl/construct_datasets.py |
— | — | @@ -0,0 +1,255 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +from multiprocessing import Queue |
| 22 | +from Queue import Empty |
| 23 | +import datetime |
| 24 | +from dateutil.relativedelta import relativedelta |
| 25 | + |
| 26 | +import progressbar |
| 27 | + |
| 28 | +import configuration |
| 29 | +settings = configuration.Settings() |
| 30 | +from utils import models, utils |
| 31 | +from database import db |
| 32 | +from utils import process_constructor as pc |
| 33 | + |
| 34 | +try: |
| 35 | + import psyco |
| 36 | + psyco.full() |
| 37 | +except ImportError: |
| 38 | + pass |
| 39 | + |
| 40 | + |
| 41 | +def retrieve_editor_ids_mongo(dbname, collection): |
| 42 | + if utils.check_file_exists(settings.binary_location, |
| 43 | + 'editors.bin'): |
| 44 | + ids = utils.load_object(settings.binary_location, |
| 45 | + 'editors.bin') |
| 46 | + else: |
| 47 | + mongo = db.init_mongo_db(dbname) |
| 48 | + editors = mongo[collection] |
| 49 | + ids = editors.distinct('editor') |
| | +        # store under the name 'editors' so the editors.bin check above |
| | +        # finds the object on the next run |
| 50 | +        utils.store_object(ids, settings.binary_location, 'editors') |
| 51 | + return ids |
| 52 | + |
| 53 | + |
| 54 | +def expand_edits(edits): |
| 55 | + data = [] |
| 56 | + for edit in edits: |
| 57 | + data.append(edit['date']) |
| 58 | + return data |
| 59 | + |
| 60 | + |
| 61 | +def expand_observations(obs, vars_to_expand): |
| 62 | + for var in vars_to_expand: |
| 63 | + if var == 'edits': |
| 64 | + obs[var] = expand_edits(obs[var]) |
| 65 | + elif var == 'edits_by_year': |
| 66 | + keys = obs[var].keys() |
| 67 | + keys.sort() |
| 68 | + edits = [] |
| 69 | + for key in keys: |
| 70 | + edits.append(str(obs[var][key])) |
| 71 | + obs[var] = edits |
| 72 | + return obs |
| 73 | + |
| 74 | +def write_longitudinal_data(id, edits, fh): |
| 75 | + years = edits.keys() |
| 76 | + years.sort() |
| 77 | + for year in years: |
| 78 | + months = edits[year].keys() |
| 79 | + months = [int(m) for m in months] |
| 80 | + months.sort() |
| 81 | + for m in months: |
| 82 | + date = datetime.date(int(year), int(m), 1) |
| 83 | + fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
| 84 | + |
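| | +# i.e. write_longitudinal_data() emits one tab-separated row per |
| | +# editor-month: id, first day of the month, edits in that month |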
| 85 | + |
| 86 | +def expand_headers(headers, vars_to_expand, obs): |
| 87 | + for var in vars_to_expand: |
| 88 | + l = len(obs[var]) |
| 89 | + pos = headers.index(var) |
| 90 | + for i in xrange(l): |
| 91 | + if var.endswith('year'): |
| 92 | + suffix = 2001 + i |
| 93 | + elif var.endswith('edits'): |
| 94 | + suffix = 1 + i |
| 95 | + headers.insert(pos + i, '%s_%s' % (var, suffix)) |
| 96 | + headers.remove(var) |
| 97 | + return headers |
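| | +# Example: a header 'edits_by_year' covering ten observed years expands to |
| | +# 'edits_by_year_2001' ... 'edits_by_year_2010', while 'edits' expands to |
| | +# 'edits_1' ... 'edits_n'; the collapsed header itself is removed. |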
| 98 | + |
| 99 | + |
| 100 | +def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 101 | + debug = kwargs.pop('debug') |
| 102 | + dbname = kwargs.pop('dbname') |
| 103 | + mongo = db.init_mongo_db(dbname) |
| 104 | + editors = mongo['dataset'] |
| 105 | + name = dbname + '_long_editors.csv' |
| 106 | + fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
| 107 | + x = 0 |
| 108 | + vars_to_expand = [] |
| 109 | + while True: |
| 110 | + try: |
| 111 | + id = input_queue.get(block=False) |
| 112 | + obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
| 113 | + if x == 0: |
| 114 | + headers = obs.keys() |
| 115 | + headers.sort() |
| 116 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 117 | + utils.write_list_to_csv(headers, fh) |
| 118 | + write_longitudinal_data(id, obs['monthly_edits'], fh) |
| 119 | + #utils.write_list_to_csv(data, fh) |
| 120 | + x += 1 |
| 121 | +        except Empty: |
| 122 | +            break |
| | +    fh.close() |
| 123 | + |
| 124 | + |
| 125 | +def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
| 126 | + dbname = kwargs.get('dbname') |
| 127 | + pbar = kwargs.get('pbar') |
| 128 | + mongo = db.init_mongo_db(dbname) |
| 129 | + editors = mongo['dataset'] |
| 130 | + year = datetime.datetime.now().year + 1 |
| 131 | + begin = year - 2001 |
| 132 | + p = [3, 6, 9] |
| 133 | + periods = [y * 12 for y in xrange(1, begin)] |
| 134 | + periods = p + periods |
| 135 | + data = {} |
| 136 | + while True: |
| 137 | + try: |
| 138 | + id = input_queue.get(block=False) |
| 139 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 140 | + first_edit = obs['first_edit'] |
| 141 | + last_edit = obs['final_edit'] |
| 142 | + for y in xrange(2001, year): |
| 143 | + if y == 2010 and first_edit > datetime.datetime(2010, 1, 1): |
| 144 | + print 'debug' |
| 145 | + if y not in data: |
| 146 | + data[y] = {} |
| 147 | + data[y]['n'] = 0 |
| 148 | + window_end = datetime.datetime(y, 12, 31) |
| 149 | + if window_end > datetime.datetime.now(): |
| 150 | +                    now = datetime.datetime.now() |
| 151 | +                    #Dump files are always lagging at least one month.... |
| 152 | +                    #relativedelta also handles the January wrap-around that |
| 153 | +                    #now.month - 1 would not |
| | +                    window_end = datetime.datetime(y, now.month, now.day) - relativedelta(months=1) |
| 154 | + edits = [] |
| 155 | + for period in periods: |
| 156 | + if period not in data[y]: |
| 157 | + data[y][period] = 0 |
| 158 | + window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
| 159 | + if window_start < datetime.datetime(2001, 1, 1): |
| 160 | + window_start = datetime.datetime(2001, 1, 1) |
| 161 | + if date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 162 | + edits.append(period) |
| 163 | + if edits != []: |
| 164 | + p = min(edits) |
| 165 | + data[y]['n'] += 1 |
| 166 | + data[y][p] += 1 |
| 167 | + #pbar.update(+1) |
| 168 | + except Empty: |
| 169 | + break |
| 170 | + utils.store_object(data, settings.binary_location, 'cohort_data') |
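| | +# The stored cohort_data object maps year -> {'n': editors who had made |
| | +# their first edit by that year's end, period (in months): editors whose |
| | +# first edit falls within that many months before the end of the year}. |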
| 171 | + |
| 172 | +def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 173 | +    # last_edit is currently unused; only the first edit determines |
| 174 | +    # cohort membership |
| 175 | +    return window_start <= first_edit <= window_end |
| 177 | + |
| 178 | + |
| 179 | +def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 180 | +    dbname = kwargs.pop('dbname') |
| | +    debug = kwargs.pop('debug', False) |
| 181 | + mongo = db.init_mongo_db(dbname) |
| 182 | + editors = mongo['dataset'] |
| 183 | + name = dbname + '_wide_editors.csv' |
| 184 | + fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
| 185 | + x = 0 |
| 186 | + vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
| 187 | + while True: |
| 188 | + try: |
| 189 | + if debug: |
| 190 | + id = u'99797' |
| 191 | + else: |
| 192 | + id = input_queue.get(block=False) |
| 193 | + print input_queue.qsize() |
| 194 | + obs = editors.find_one({'editor': id}) |
| 195 | + obs = expand_observations(obs, vars_to_expand) |
| 196 | + if x == 0: |
| 197 | + headers = obs.keys() |
| 198 | + headers.sort() |
| 199 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 200 | + utils.write_list_to_csv(headers, fh) |
| 201 | + data = [] |
| 202 | + keys = obs.keys() |
| 203 | + keys.sort() |
| 204 | + for key in keys: |
| 205 | + data.append(obs[key]) |
| 206 | + utils.write_list_to_csv(data, fh) |
| 207 | + |
| 208 | + x += 1 |
| 209 | + except Empty: |
| 210 | + break |
| 211 | + fh.close() |
| 212 | + |
| 213 | + |
| 214 | +def retrieve_edits_by_contributor_launcher(): |
| 215 | + pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors') |
| 216 | + |
| 217 | + |
| 218 | +def debug_retrieve_edits_by_contributor_launcher(dbname): |
| 219 | + kwargs = {'debug': False, |
| 220 | + 'dbname': dbname, |
| 221 | + } |
| 222 | + ids = retrieve_editor_ids_mongo(dbname, 'editors') |
| 223 | + input_queue = pc.load_queue(ids) |
| 224 | + q = Queue() |
| 225 | +    generate_editor_dataset(input_queue, q, False, **kwargs) |
| 226 | + |
| 227 | + |
| 228 | +def generate_editor_dataset_launcher(dbname): |
| 229 | + kwargs = {'nr_input_processors': 1, |
| 230 | + 'nr_output_processors': 1, |
| 231 | + 'debug': False, |
| 232 | + 'dbname': dbname, |
| 233 | + 'poison_pill':False, |
| 234 | + 'pbar': True |
| 235 | + } |
| 236 | + ids = retrieve_editor_ids_mongo(dbname, 'editors') |
| 237 | + ids = list(ids) |
| 238 | +    chunks = {0: ids} |
| 239 | + pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
| 240 | + |
| 241 | + |
| 242 | +def generate_editor_dataset_debug(dbname): |
| 243 | + ids = retrieve_editor_ids_mongo(dbname, 'editors') |
| 244 | + input_queue = pc.load_queue(ids) |
| 245 | + kwargs = {'nr_input_processors': 1, |
| 246 | + 'nr_output_processors': 1, |
| 247 | + 'debug': True, |
| 248 | + 'dbname': dbname, |
| 249 | + } |
| 250 | +    generate_editor_dataset(input_queue, False, False, **kwargs) |
| 251 | + |
| 252 | + |
| 253 | +if __name__ == '__main__': |
| 254 | + #generate_editor_dataset_debug('test') |
| 255 | + generate_editor_dataset_launcher('enwiki') |
| 256 | + #debug_retrieve_edits_by_contributor_launcher() |
Property changes on: trunk/tools/editor_trends/etl/construct_datasets.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 257 | + text/plain |
Added: svn:eol-style |
2 | 258 | + native |
Index: trunk/tools/editor_trends/etl/__init__.py |
Property changes on: trunk/tools/editor_trends/etl/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
3 | 259 | + native |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -0,0 +1,211 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import xml.etree.cElementTree as cElementTree |
| 22 | +import sys |
| 23 | +import codecs |
| 24 | +import re |
| 25 | +import json |
| 26 | +import os |
| 27 | + |
| 28 | +import progressbar |
| 29 | + |
| 30 | + |
| 31 | +sys.path.append('..') |
| 32 | +import configuration |
| 33 | +from utils import utils |
| 34 | +from wikitree import xml |
| 35 | +settings = configuration.Settings() |
| 36 | + |
| 37 | +try: |
| 38 | + import psyco |
| 39 | + psyco.full() |
| 40 | +except ImportError: |
| 41 | + pass |
| 42 | + |
| 43 | + |
| 44 | +RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
| 45 | + |
| 46 | + |
| 47 | +def remove_numeric_character_references(text): |
| 48 | + return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
| 49 | + |
| 50 | + |
| 51 | +def lenient_deccharref(m): |
| 52 | + try: |
| 53 | + return unichr(int(m.group(1))) |
| 54 | + except ValueError: |
| 55 | + ''' |
| 56 | + There are a few articles that raise a Value Error here, the reason is |
| 57 | + that I am using a narrow Python build (UCS2) instead of a wide build |
| 58 | + (UCS4). The quick fix is to return an empty string... |
| 59 | + Real solution is to rebuild Python with UCS4 support..... |
| 60 | + ''' |
| 61 | + return '' |
| 62 | + |
| 63 | + |
| 64 | +def remove_namespace(element, namespace): |
| 65 | + '''Remove namespace from the XML document.''' |
| 66 | + ns = u'{%s}' % namespace |
| 67 | + nsl = len(ns) |
| 68 | + for elem in element.getiterator(): |
| 69 | + if elem.tag.startswith(ns): |
| 70 | + elem.tag = elem.tag[nsl:] |
| 71 | + return element |
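| | +# Example: a tag such as '{http://www.mediawiki.org/xml/export-0.4/}page' |
| | +# (the exact namespace URI depends on the dump version) is shortened to |
| | +# 'page' for every element in the tree. |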
| 72 | + |
| 73 | + |
| 74 | +def load_namespace(language): |
| 75 | + file = '%s_ns.json' % language |
| 76 | + fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding) |
| 77 | + ns = json.load(fh) |
| 78 | + fh.close() |
| 79 | + ns = ns['query']['namespaces'] |
| 80 | + return ns |
| 81 | + |
| 82 | + |
| 83 | +def build_namespaces_locale(namespaces): |
| 84 | + ''' |
| 85 | + Construct a list of all the non-main namespaces |
| 86 | + ''' |
| 87 | + ns = [] |
| 88 | + for namespace in namespaces: |
| 89 | + value = namespaces[namespace].get(u'*', None) |
| 90 | +        if value is not None and value != '': |
| 91 | + ns.append(value) |
| 92 | + return ns |
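| | +# Example result for an English dump (illustrative): ['Talk', 'User', |
| | +# 'Wikipedia', ...]; the main namespace has an empty '*' value and is |
| | +# therefore left out. |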
| 93 | + |
| 94 | + |
| 95 | +def parse_comments(xml, function): |
| 96 | + revisions = xml.findall('revision') |
| 97 | + for revision in revisions: |
| 98 | + comment = revision.find('comment') |
| 99 | +        timestamp = revision.find('timestamp').text |
| 100 | +        if comment is not None and comment.text is not None: |
| 101 | + comment.text = function(comment.text) |
| 102 | + return xml |
| 103 | + |
| 104 | + |
| 105 | +def is_article_main_namespace(elem, namespace): |
| 106 | +    ''' |
| 107 | +    Checks whether the article belongs to the main namespace. |
| 108 | +    ''' |
| 109 | +    title = elem.find('title').text |
| 110 | +    for ns in namespace: |
| | +        # namespace prefixes in titles end with a colon; without it an |
| | +        # article such as 'User interface' would be wrongly excluded |
| 111 | +        if title.startswith(ns + ':'): |
| 112 | +            return False |
| 113 | +    return True |
| 114 | + |
| 115 | + |
| 116 | +def write_xml_file(element, fh, counter, language): |
| 117 | + '''Get file handle and write xml element to file''' |
| 118 | + size = len(cElementTree.tostring(element)) |
| 119 | + fh, counter = create_file_handle(fh, counter, size, language) |
| 120 | + try: |
| 121 | + fh.write(cElementTree.tostring(element)) |
| 122 | + except MemoryError: |
| 123 | + print 'Add error capturing logic' |
| 124 | + fh.write('\n') |
| 125 | + return fh, counter |
| 126 | + |
| 127 | + |
| 128 | +def create_file_handle(fh, counter, size, language): |
| 129 | +    '''Create file handle if none is supplied or if file size > max file size.''' |
| 130 | +    if not counter: |
| 131 | +        counter = 0 |
| 132 | +    path = os.path.join(settings.input_location, language, '%s.xml' % counter) |
| 133 | +    if not fh: |
| 134 | +        fh = codecs.open(path, 'w', encoding=settings.encoding) |
| 135 | +        return fh, counter |
| 136 | +    elif (fh.tell() + size) > settings.binary_location: |
| | +        # NOTE: this presumably should compare against a maximum chunk-size |
| | +        # setting; settings.binary_location is a path elsewhere in this |
| | +        # codebase, and in Python 2 an int > str comparison is always False |
| | +        # here, so the rollover never triggers as written |
| 137 | +        print 'Created chunk %s' % counter |
| 138 | +        fh.close() |
| 139 | +        counter += 1 |
| | +        # recompute the path, otherwise the chunk that was just closed |
| | +        # would be reopened and overwritten |
| | +        path = os.path.join(settings.input_location, language, '%s.xml' % counter) |
| 140 | +        fh = codecs.open(path, 'w', encoding=settings.encoding) |
| 141 | +        return fh, counter |
| 142 | +    else: |
| 143 | +        return fh, counter |
| 144 | + |
| 145 | + |
| 146 | +def flatten_xml_elements(data, page): |
| 147 | + flat = [] |
| 148 | + for x, elems in enumerate(data): |
| 149 | + flat.append([page]) |
| 150 | + for elem in elems: |
| 151 | + if elem.tag != 'id': |
| 152 | +            if len(elem.getchildren()) > 0: |
| 153 | +                for el in elem.getchildren(): |
| 154 | +                    flat[x].append(xml.extract_text(el, None)) |
| 155 | +            else: |
| 156 | +                flat[x].append(xml.extract_text(elem, None)) |
| 157 | + return flat |
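| | +# Each revision becomes one flat row: [page id, extracted text of every |
| | +# non-id child element, with children of nested elements flattened in]. |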
| 158 | + |
| 159 | + |
| 160 | +def split_file(output, input, project, language_code, language, format='xml'): |
| 161 | + '''Reads xml file and splits it in N chunks''' |
| 162 | + #location = os.path.join(settings.input_location, language) |
| 163 | + output = os.path.join(output, language_code, project) |
| 164 | + settings.verify_environment([output]) |
| 165 | + if format == 'xml': |
| 166 | + fh = None |
| 167 | + else: |
| 168 | +        f = os.path.basename(input).replace('.xml', '') |
| 169 | +        fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
| 170 | + |
| 171 | + ns = load_namespace(language_code) |
| 172 | + ns = build_namespaces_locale(ns) |
| 173 | + |
| 174 | + |
| 175 | + counter = None |
| 176 | + tag = '{%s}page' % settings.xml_namespace |
| 177 | + |
| 178 | + |
| 179 | + context = cElementTree.iterparse(input, events=('start', 'end')) |
| 180 | + context = iter(context) |
| 181 | + event, root = context.next() #get the root element of the XML doc |
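| | +    # standard cElementTree iterparse pattern: keep a reference to the |
| | +    # root and clear it after each <page> so memory stays bounded on |
| | +    # multi-gigabyte dumps |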
| 182 | + |
| 183 | + try: |
| 184 | + for event, elem in context: |
| 185 | + if event == 'end': |
| 186 | + if elem.tag == tag: |
| 187 | + elem = remove_namespace(elem, settings.xml_namespace) |
| 188 | + if is_article_main_namespace(elem, ns): |
| 189 | + page = elem.find('id').text |
| 190 | + elem = parse_comments(elem, remove_numeric_character_references) |
| 191 | + if format == 'xml': |
| 192 | +                        fh, counter = write_xml_file(elem, fh, counter, language_code) |
| 193 | + else: |
| 194 | + data = [el.getchildren() for el in elem if el.tag == 'revision'] |
| 195 | + data = flatten_xml_elements(data, page) |
| 196 | + utils.write_list_to_csv(data, fh, recursive=False, newline=True) |
| 197 | +                    root.clear() # when done parsing a section clear the tree to save memory |
| 198 | + except SyntaxError: |
| 199 | + f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
| 200 | + f.write(cElementTree.tostring(elem)) |
| 201 | + f.close() |
| 202 | + finally: |
| 203 | + fh.close() |
| 204 | + |
| 205 | +if __name__ == "__main__": |
| 206 | + kwargs = {'output': settings.input_location, |
| 207 | + 'input': settings.input_filename, |
| 208 | + 'project':'wiki', |
| 209 | + 'language_code':'en', |
| 210 | + 'format': 'tsv' |
| 211 | + } |
| 212 | + split_file(**kwargs) |
Property changes on: trunk/tools/editor_trends/etl/chunker.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 213 | + text/plain |
Added: svn:eol-style |
2 | 214 | + native |
Index: trunk/tools/editor_trends/etl/xml2pig.py |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-15' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
| 27 | +import configuration |
| 28 | +settings = configuration.Settings() |
| 29 | +import split_xml_file |
| 30 | + |
| 31 | + |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -0,0 +1,140 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-16' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import os |
| | +import sys |
| | +from Queue import Empty |
| 23 | + |
| 24 | +sys.path.append('..') |
| 25 | +import configuration |
| 26 | +settings = configuration.Settings() |
| 27 | +from database import db |
| 28 | +from database import cache |
| 29 | +from utils import utils |
| 30 | +import process_constructor as pc |
| 31 | + |
| 32 | + |
| 33 | +def store_editors(input, filename, dbname): |
| 34 | + fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding) |
| 35 | + mongo = db.init_mongo_db(dbname) |
| 36 | + collection = mongo['test'] |
| 37 | +    collection.ensure_index('editor') |
| 39 | + editor_cache = cache.EditorCache(collection) |
| 40 | + prev_contributor = -1 |
| 41 | + x = 0 |
| 42 | + edits = 0 |
| 43 | + editors = set() |
| | +    # assumption: the readline() generator that yields split lines lives in |
| | +    # utils, like the other file helpers used here |
| 44 | +    for line in utils.readline(fh): |
| 45 | + if len(line) == 0: |
| 46 | + continue |
| 47 | + contributor = int(line[0]) |
| 48 | + if contributor == 5767932: |
| 49 | + print 'debug' |
| 50 | + if prev_contributor != contributor: |
| 51 | + if edits >= 10: |
| 52 | + result = editor_cache.add(prev_contributor, 'NEXT') |
| 53 | + if result: |
| 54 | + editors.add(prev_contributor) |
| 55 | + result = None |
| 56 | + x += 1 |
| 57 | + print 'Stored %s editors' % x |
| 58 | + else: |
| 59 | + editor_cache.clear(prev_contributor) |
| 60 | + edits = 0 |
| 61 | + edits += 1 |
| 62 | + date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1) |
| 63 | + article_id = int(line[2]) |
| 64 | + value = {'date': date, 'article': article_id} |
| 65 | + editor_cache.add(contributor, value) |
| 66 | + prev_contributor = contributor |
| 67 | + fh.close() |
| 68 | + utils.store_object(editors, settings.binary_location, 'editors') |
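| | +# store_editors() expects its input file to hold tab-separated lines of |
| | +# (editor id, timestamp, article id), sorted by editor id; only editors |
| | +# with at least 10 edits are kept, matching the 'new wikipedian' cutoff. |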
| 69 | + |
| 70 | + |
| 71 | +def mergesort_external_launcher(dbname, input, output): |
| 72 | + files = utils.retrieve_file_list(input, 'txt', mask='') |
| 73 | + x = 0 |
| 74 | + maxval = 99999 |
| 75 | + while maxval >= settings.max_filehandles: |
| 76 | + x += 1.0 |
| 77 | + maxval = round(len(files) / x) |
| 78 | + chunks = utils.split_list(files, int(x)) |
| 79 | + '''1st iteration external mergesort''' |
| 80 | + for chunk in chunks: |
| 81 | + filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]] |
| 82 | + filename = merge_sorted_files(output, filehandles, chunk) |
| 83 | + filehandles = [fh.close() for fh in filehandles] |
| 85 | + '''2nd iteration external mergesort, if necessary''' |
| 86 | + if len(chunks) > 1: |
| 87 | + files = utils.retrieve_file_list(output, 'txt', mask='[merged]') |
| 88 | + filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files] |
| 89 | + filename = merge_sorted_files(output, filehandles, 'final') |
| 90 | + filehandles = [fh.close() for fh in filehandles] |
| 91 | + filename = 'merged_final.txt' |
| 92 | + store_editors(output, filename, dbname) |
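| | +# Two-pass external mergesort: first merge each group of at most |
| | +# settings.max_filehandles sorted files, then merge the intermediate |
| | +# merged files into merged_final.txt, which feeds store_editors(). |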
| 93 | + |
| 94 | + |
| 95 | +def mergesort_feeder(input_queue, result_queue, **kwargs): |
| 96 | + input = kwargs.get('input', None) |
| 97 | + output = kwargs.get('output', None) |
| 98 | + while True: |
| 99 | + try: |
| 100 | + file = input_queue.get(block=False) |
| 101 | + fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 102 | + data = fh.readlines() |
| 103 | + fh.close() |
| 104 | + data = [d.replace('\n', '') for d in data] |
| 105 | + data = [d.split('\t') for d in data] |
| 106 | + sorted_data = mergesort(data) |
| 107 | + write_sorted_file(sorted_data, file, output) |
| 108 | + except Empty: |
| 109 | + break |
| 110 | + |
| 111 | + |
| 112 | +def mergesort_launcher(input, output): |
| 113 | + kwargs = {'pbar': True, |
| 114 | + 'nr_input_processors': settings.number_of_processes, |
| 115 | + 'nr_output_processors': settings.number_of_processes, |
| 116 | + 'input': input, |
| 117 | + 'output': output, |
| 118 | + 'poison_pill': False |
| 119 | + } |
| 120 | + files = utils.retrieve_file_list(input, 'txt') |
| 121 | + chunks = utils.split_list(files, settings.number_of_processes) |
| 122 | + pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs) |
| 123 | + |
| 124 | + |
| 125 | +def debug_mergesort_feeder(input, output): |
| 126 | + kwargs = { |
| 127 | + 'input': input, |
| 128 | + 'output': output, |
| 129 | + } |
| 130 | + files = utils.retrieve_file_list(input, 'txt') |
| 131 | + chunks = utils.split_list(files, settings.number_of_processes) |
| 132 | + q = pc.load_queue(chunks[0]) |
| 133 | + mergesort_feeder(q, False, **kwargs) |
| 134 | + |
| 135 | + |
| 136 | +if __name__ == '__main__': |
| 137 | + input = os.path.join(settings.input_location, 'en', 'wiki', 'txt') |
| 138 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted') |
| 139 | + dbname = 'enwiki' |
| 140 | + mergesort_launcher(input, output) |
| 141 | + mergesort_external_launcher(dbname, output, output) |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/etl/loader.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 142 | + native |
Index: trunk/tools/editor_trends/etl/bots.py |
— | — | @@ -0,0 +1,123 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import os |
| 19 | +import cStringIO |
| 20 | +import xml.etree.cElementTree as cElementTree |
| 21 | + |
| 22 | + |
| 23 | +import configuration |
| 24 | +settings = configuration.Settings() |
| 25 | +from wikitree import xml |
| 26 | +from database import db |
| 27 | +from database import db_settings |
| 28 | +from utils import utils |
| 29 | +from utils import process_constructor as pc |
| 30 | + |
| 31 | +try: |
| 32 | + import psyco |
| 33 | + psyco.full() |
| 34 | +except ImportError: |
| 35 | + pass |
| 36 | + |
| 37 | + |
| 38 | +def create_bot_ids_db_mongo(): |
| 39 | + ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding) |
| 40 | + mongo = db.init_mongo_db('bots') |
| 41 | + collection = mongo['ids'] |
| 42 | + |
| 43 | + db.remove_documents_from_mongo_db(collection, None) |
| 44 | + |
| 45 | + for id, name in ids.iteritems(): |
| 46 | + collection.insert({'id': id, 'name': name}) |
| 47 | + |
| 48 | + print collection.count() |
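| | +# Each document in the bots 'ids' collection is simply |
| | +# {'id': <bot id>, 'name': <bot name>}; load_bot_ids() in extract.py |
| | +# reads these back into a dict. |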
| 49 | + |
| 50 | + |
| 51 | +def lookup_username(input_queue, result_queue, progressbar, bots, debug=False): |
| 52 | + ''' |
| 53 | +    This function is used to find the ids belonging to the different bots that |
| 54 | + are patrolling the Wikipedia sites. |
| 55 | + @input_queue contains a list of xml files to parse |
| 56 | + |
| 57 | + @result_queue should be set to false as the results are directly written to |
| 58 | + a csv file. |
| 59 | + |
| 60 | + @progressbar depends on settings |
| 61 | + |
| 62 | + @bots is a dictionary containing the names of the bots to lookup |
| 63 | + ''' |
| 64 | + |
| 65 | + #if len(bots.keys()) == 1: |
| 66 | + bots = bots['bots'] |
| 67 | + #print bots.keys() |
| 68 | + |
| 69 | + if settings.debug: |
| 70 | + messages = {} |
| 71 | + |
| 72 | + while True: |
| 73 | + if debug: |
| 74 | + file = input_queue |
| 75 | + else: |
| 76 | + file = input_queue.get(block=False) |
| 77 | + |
| 78 | +        if file is None: |
| 79 | + break |
| 80 | + |
| 81 | + data = xml.read_input(utils.open_txt_file(settings.input_location + |
| 82 | + file, 'r', encoding=settings.encoding)) |
| 83 | + |
| 84 | + for raw_data in data: |
| 85 | + xml_buffer = cStringIO.StringIO() |
| 86 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 87 | + raw_data = ''.join(raw_data) |
| 88 | + raw_data = raw_data.encode('utf-8') |
| 89 | + xml_buffer.write(raw_data) |
| 90 | + |
| 91 | + try: |
| 92 | + xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
| 93 | + revisions = xml_nodes.findall('revision') |
| 94 | + for revision in revisions: |
| 95 | + contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 96 | + username = contributor.find('username') |
| 97 | +                    if username is None: |
| 98 | + continue |
| 99 | + username = xml.extract_text(username) |
| 100 | + #print username.encode('utf-8') |
| 101 | + |
| 102 | + if username in bots: |
| 103 | + id = contributor.find('id') |
| 104 | + id = xml.extract_text(id) |
| 105 | + #print username.encode('utf-8'), id |
| 106 | + utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding) |
| 107 | + bots.pop(username) |
| 108 | + if bots == {}: |
| 109 | + print 'Mission accomplished' |
| 110 | + return |
| 111 | + except Exception, error: |
| 112 | + print error |
| 113 | + if settings.debug: |
| 114 | + messages = utils.track_errors(xml_buffer, error, file, |
| 115 | + messages) |
| 116 | + |
| 117 | + if settings.debug: |
| 118 | + utils.report_error_messages(messages, lookup_username) |
| 119 | + |
| 120 | + |
| 121 | +if __name__ == '__main__': |
| 122 | + #debug() |
| 123 | + #add_id_to_botnames() |
| 124 | + create_bot_ids_db_mongo() |
Property changes on: trunk/tools/editor_trends/etl/bots.py |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 125 | + text/plain |
Added: svn:eol-style |
2 | 126 | + native |