r76846 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r76845 | r76846 | r76847 >
Date: 23:13, 16 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Major refactoring:
* moved files to the etl (extract-transform-load) directory

This commit still contains some small issues.
Modified paths:
  • /trunk/tools/editor_trends/etl (added) (history)
  • /trunk/tools/editor_trends/etl/__init__.py (added) (history)
  • /trunk/tools/editor_trends/etl/bots.py (added) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (added) (history)
  • /trunk/tools/editor_trends/etl/construct_datasets.py (added) (history)
  • /trunk/tools/editor_trends/etl/extract.py (added) (history)
  • /trunk/tools/editor_trends/etl/loader.py (added) (history)
  • /trunk/tools/editor_trends/etl/optimize_editors.py (added) (history)
  • /trunk/tools/editor_trends/etl/xml2pig.py (added) (history)

Diff

Index: trunk/tools/editor_trends/etl/optimize_editors.py
@@ -0,0 +1,172 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-02'
 19+__version__ = '0.1'
 20+
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+from operator import itemgetter
 24+import datetime
 +import copy
 25+
 26+import configuration
 27+settings = configuration.Settings()
 28+from database import db
 29+from utils import process_constructor as pc
 30+from utils import utils
 31+import construct_datasets
 32+
 33+
 34+try:
 35+ import psyco
 36+ psyco.full()
 37+except ImportError:
 38+ pass
 39+
 40+
 41+def create_datacontainer(init_value=0):
 42+ '''
 43+ This function initializes a dictionary with one key per year (from 2001
 44+ through the current year) and @init_value as the value. Usually this is
 45+ zero, making the dictionary a running tally, but @init_value can also be a
 46+ list [], a dictionary {} or a set set(); each year gets its own deep copy.
 47+ '''
 48+ data = {}
 49+ year = datetime.datetime.now().year + 1
 50+ for x in xrange(2001, year):
 51+ data[str(x)] = copy.deepcopy(init_value)
 52+ return data
 53+
 54+
 55+def add_months_to_datacontainer(datacontainer):
 56+ for dc in datacontainer:
 57+ datacontainer[dc] = {}
 58+ for x in xrange(1, 13):
 59+ datacontainer[dc][str(x)] = 0
 60+ return datacontainer
 61+
 62+
 63+def determine_edits_by_month(edits):
 64+ datacontainer = create_datacontainer(init_value=0)
 65+ datacontainer = add_months_to_datacontainer(datacontainer)
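 + #This records a 0/1 activity flag per month, not an edit count: a month
 + #is marked as soon as one edit is seen, and a year is skipped once all
 + #twelve months are marked.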
 66+ for year in edits:
 67+ months = set()
 68+ for edit in edits[year]:
 69+ m = str(edit['date'].month)
 70+ if m not in months:
 71+ datacontainer[year][m] = 1
 72+ months.add(m)
 73+ if len(months) == 12:
 74+ break
 75+ return datacontainer
 76+
 77+
 78+def determine_edits_by_year(dates):
 79+ '''
 80+ This function counts the number of edits by year made by a particular editor.
 81+ '''
 82+ edits = create_datacontainer()
 83+ for date in dates:
 84+ year = str(date['date'].year)
 85+ edits[year] += 1
 86+ return edits
 87+
 88+
 89+def determine_articles_by_year(dates):
 90+ '''
 91+ This function counts the number of unique articles by year edited by a
 92+ particular editor.
 93+ '''
 94+ articles = create_datacontainer(set())
 95+ for date in dates:
 96+ year = str(date['date'].year)
 97+ articles[year].add(date['article'])
 98+ for year in articles:
 99+ articles[year] = len(articles[year])
 100+ return articles
 101+
 102+
 103+def sort_edits(edits):
 104+ edits = utils.merge_list(edits)
 105+ return sorted(edits, key=itemgetter('date'))
 106+
 107+
 108+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
 109+ dbname = kwargs.pop('dbname')
 110+ mongo = db.init_mongo_db(dbname)
 111+ input = mongo['test']
 112+ output = mongo['dataset']
 113+ output.ensure_index('editor')
 114+ output.ensure_index('year_joined')
 115+ definition = kwargs.pop('definition')
 116+ while True:
 117+ try:
 118+ id = input_queue.get(block=False)
 119+ editor = input.find_one({'editor': id})
 120+ if editor == None:
 121+ continue
 122+ edits = editor['edits']
 123+ monthly_edits = determine_edits_by_month(edits)
 124+ edits = sort_edits(edits)
 125+ edit_count = len(edits)
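 + #the 10th edit marks the date on which an editor counts as a 'new wikipedian'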
 126+ new_wikipedian = edits[9]['date']
 127+ first_edit = edits[0]['date']
 128+ final_edit = edits[-1]['date']
 129+ edits_by_year = determine_edits_by_year(edits)
 130+ articles_by_year = determine_articles_by_year(edits)
 131+
 132+ edits = edits[:10]
 133+
 134+ output.insert({'editor': id, 'edits': edits,
 135+ 'edits_by_year': edits_by_year,
 136+ 'new_wikipedian': new_wikipedian,
 137+ 'edit_count': edit_count,
 138+ 'final_edit': final_edit,
 139+ 'first_edit': first_edit,
 140+ 'articles_by_year': articles_by_year,
 141+ 'monthly_edits': monthly_edits})
 142+ print 'Items left: %s' % input_queue.qsize()
 143+ except Empty:
 144+ break
 145+
 146+
 147+def run_optimize_editors(dbname):
 148+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 149+ kwargs = {'definition': 'traditional',
 150+ 'pbar': True,
 151+ 'dbname': dbname,
 152+ 'nr_input_processors': 1,
 153+ 'nr_output_processors': 0,
 154+ 'poison_pill': False
 155+ }
 156+ print len(ids)
 157+ ids = list(ids)
 158+ chunks = dict({0: ids})
 159+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
 160+
 161+
 162+def debug_optimize_editors(dbname):
 163+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 164+ q = pc.load_queue(ids)
 165+ kwargs = {'definition': 'traditional',
 166+ 'dbname': dbname
 167+ }
 168+ optimize_editors(q, False, True, **kwargs)
 169+
 170+
 171+if __name__ == '__main__':
 172+ #debug_optimize_editors('test')
 173+ run_optimize_editors('enwiki')
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/etl/optimize_editors.py
___________________________________________________________________
Added: svn:eol-style
 + native
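
Note: create_datacontainer() deep-copies @init_value for each year; a single mutable value shared across all years would silently merge the per-year tallies. A minimal interpreter illustration of the pitfall (the values are examples):

    >>> shared = set()
    >>> data = dict((str(y), shared) for y in xrange(2001, 2004))
    >>> data['2001'].add('Main Page')
    >>> data['2003']
    set(['Main Page'])
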
Index: trunk/tools/editor_trends/etl/extract.py
@@ -0,0 +1,338 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+#Default Python libraries (Python => 2.6)
 22+import sys
 23+import os
 24+import time
 25+import datetime
 26+import codecs
 27+import math
 28+import cStringIO
 29+import re
 30+from operator import itemgetter
 31+import xml.etree.cElementTree as cElementTree
 32+from multiprocessing import Queue, JoinableQueue
 33+from Queue import Empty
 34+import pymongo
 35+
 36+# Custom written files
 37+import configuration
 38+settings = configuration.Settings()
 39+from utils import utils, models
 40+from database import db_settings
 41+from database import db
 42+from database import cache
 43+from wikitree import xml
 44+from statistics import dataset
 45+from utils import process_constructor as pc
 46+
 47+
 48+try:
 49+ import psyco
 50+ psyco.full()
 51+except ImportError:
 52+ pass
 53+
 54+
 55+def determine_username_is_bot(username, kwargs):
 56+ '''
 57+ @username is the xml element containing the id of the user
 58+ @kwargs should have a list with all the bot ids
 59+
 60+ @Return 1 if the username id is a bot id, 0 otherwise.
 62+ '''
 63+ ids = kwargs.get('bots', [])
 64+ if ids == None:
 65+ ids = []
 66+ if username != None and username.text != None:
 67+ id = username.text
 68+ if id in ids:
 69+ return 1
 70+ else:
 71+ return 0
 72+
 73+
 74+def extract_contributor_id(contributor, kwargs):
 75+ '''
 76+ @contributor is the xml contributor node containing a number of attributes
 77+
 78+ Currently, we are only interested in registered contributors, hence we
 79+ ignore anonymous editors. If you are interested in collecting data on
 80+ anonymous editors then add the string 'ip' to the tags variable.
 81+ '''
 82+ tags = ['id']
 83+ if contributor.get('deleted'):
 84+ return -1 # ASK: Not sure if this is the best way to code deleted contributors.
 85+ for elem in contributor:
 86+ if elem.tag in tags:
 87+ if elem.text != None:
 88+ return elem.text.decode('utf-8')
 89+ else:
 90+ return -1
 91+
 92+
 93+def output_editor_information(elem, output, **kwargs):
 94+ '''
 95+ @elem is an XML element containing 1 revision from a page
 96+ @output is where to store the data, either a queue or a filehandle
 97+ @**kwargs contains extra information
 98+
 99+ The variable tags determines which attributes are parsed; the values in
 100+ this dictionary are the functions used to extract the data.
 101+ '''
 102+ tags = {'contributor': {'editor': extract_contributor_id,
 103+ 'bot': determine_username_is_bot},
 104+ 'timestamp': {'date': xml.extract_text},
 105+ }
 106+ vars = {}
 107+ headers = ['editor', 'date', 'article']
 108+ destination = kwargs.pop('destination')
 109+ revisions = elem.findall('revision')
 110+ for revision in revisions:
 111+ vars['article'] = elem.find('id').text.decode(settings.encoding)
 112+ elements = revision.getchildren()
 113+ for tag, functions in tags.iteritems():
 114+ xml_node = xml.retrieve_xml_node(elements, tag)
 115+ for var, function in functions.iteritems():
 116+ vars[var] = function(xml_node, kwargs)
 117+
 118+ #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
 119+ if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
 120+ vars.pop('bot')
 121+ if destination == 'queue':
 122+ vars['date'] = utils.convert_timestamp_to_date(vars['date'])
 123+ output.put(vars)
 124+ elif destination == 'file':
 125+ data = []
 126+ for head in headers:
 127+ data.append(vars[head])
 128+ utils.write_list_to_csv(data, output)
 129+ vars = {}
 130+
 131+
 132+def parse_editors(xml_queue, data_queue, **kwargs):
 133+ '''
 134+ @xml_queue contains the filenames of the files to be parsed
 135+ @data_queue is an instance of Queue where the extracted data is stored for
 136+ further processing
 137+ @pbar is an instance of progressbar to display the progress
 138+ @bots is a list of id's of known Wikipedia bots
 139+ @debug is a flag to indicate whether the function is called for debugging.
 140+
 141+ Output is the data_queue that will be used by store_editors()
 142+ '''
 143+ input = kwargs.get('input', None)
 144+ output = kwargs.get('output', None)
 145+ debug = kwargs.get('debug', False)
 146+ destination = kwargs.get('destination', 'file')
 147+ bots = kwargs.get('bots', None)
 148+ pbar = kwargs.get('pbar', None)
 149+ if settings.debug:
 150+ messages = {}
 151+ vars = {}
 152+
 153+ while True:
 154+ try:
 155+ if debug:
 156+ file = xml_queue
 157+ else:
 158+ file = xml_queue.get(block=False)
 159+ if file == None:
 160+ print 'Swallowed a poison pill'
 161+ break
 162+
 163+ data = xml.read_input(utils.create_txt_filehandle(input,
 164+ file, 'r',
 165+ encoding=settings.encoding))
 166+ if destination == 'file':
 167+ name = file[:-4] + '.txt'
 168+ fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding)
 + else:
 + fh = output #in queue mode records are put straight onto the output queue
 169+ for raw_data in data:
 170+ xml_buffer = cStringIO.StringIO()
 171+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 172+
 173+ try:
 174+ raw_data = ''.join(raw_data)
 175+ xml_buffer.write(raw_data)
 176+ elem = cElementTree.XML(xml_buffer.getvalue())
 177+ output_editor_information(elem, fh, bots=bots, destination=destination)
 178+ except SyntaxError, error:
 179+ print error
 180+ '''
 181+ There are few cases with invalid tokens, they are fixed
 182+ here and then reinserted into the XML DOM
 183+ data = convert_html_entities(xml_buffer.getvalue())
 184+ elem = cElementTree.XML(data)
 185+ output_editor_information(elem)
 186+ '''
 187+ if settings.debug:
 188+ utils.track_errors(xml_buffer, error, file, messages)
 189+ except UnicodeEncodeError, error:
 190+ print error
 191+ if settings.debug:
 192+ utils.track_errors(xml_buffer, error, file, messages)
 193+ except MemoryError, error:
 194+ print file, error
 195+ print raw_data[:12]
 196+ print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
 197+ if destination == 'queue':
 198+ output.put('NEXT')
 199+ while True:
 200+ if output.qsize() < 100000:
 201+ break
 202+ else:
 203+ time.sleep(10)
 204+ print 'Still sleeping, queue is %s items long' % output.qsize()
 205+
 206+ else:
 207+ fh.close()
 208+
 209+ if pbar:
 210+ print file, xml_queue.qsize()
 211+ #utils.update_progressbar(pbar, xml_queue)
 212+
 213+ if debug:
 214+ break
 215+
 216+ except Empty:
 217+ break
 218+
 219+ if destination == 'queue':
 220+ data_queue.put(None)
 221+
 222+ if settings.debug:
 223+ utils.report_error_messages(messages, parse_editors)
 224+
 225+
 226+def store_editors(data_queue, **kwargs):
 227+ '''
 228+ @data_queue is an instance of Queue containing information extracted by
 229+ parse_editors()
 230+ @pids is a list of PIDs used to check if other processes are finished
 231+ running
 232+ @dbname is the name of the MongoDB collection where to store the information.
 233+ '''
 234+ dbname = kwargs.get('dbname', None)
 235+ mongo = db.init_mongo_db(dbname)
 236+ collection = mongo['editors']
 237+ collection.ensure_index('editor')
 238+ editor_cache = cache.EditorCache(collection)
 239+
 240+ while True:
 241+ try:
 242+ edit = data_queue.get(block=False)
 243+ data_queue.task_done()
 244+ if edit == None:
 245+ print 'Swallowing poison pill'
 246+ break
 247+ elif edit == 'NEXT':
 248+ editor_cache.add('NEXT', '')
 249+ else:
 250+ contributor = edit['editor']
 251+ value = {'date': edit['date'], 'article': edit['article']}
 252+ editor_cache.add(contributor, value)
 253+ #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
 254+ #'$inc': {'edit_count': 1},
 255+
 256+ except Empty:
 257+ '''
 258+ This checks whether the Queue is empty because the preprocessors are
 259+ finished or because this function is faster in emptying the Queue
 260+ than the preprocessors are able to fill it. If the preprocessors
 261+ are finished and this Queue is empty then break, else wait for the
 262+ Queue to fill.
 263+ '''
 264+ pass
 265+
 266+ print 'Emptying entire cache.'
 267+ editor_cache.store()
 268+ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
 269+
 270+
 271+def load_cache_objects():
 272+ cache = {}
 273+ files = utils.retrieve_file_list(settings.binary_location, '.bin')
 274+ for x, file in enumerate(files):
 275+ cache[x] = utils.load_object(settings.binary_location, file)
 276+ return cache
 277+
 278+
 279+def search_cache_for_missed_editors(dbname):
 280+ mongo = db.init_mongo_db(dbname)
 281+ collection = mongo['editors']
 282+ editor_cache = cache.EditorCache(collection)
 283+ cached = load_cache_objects() #local name must not shadow the cache module
 284+ for c in cached:
 285+ for editor in cached[c]:
 286+ editor_cache.add(editor, cached[c][editor])
 287+ cached[c] = {}
 288+ editor_cache.add('NEXT', '')
 289+ cached = {}
 290+
 291+
 292+
 293+def load_bot_ids():
 294+ '''
 295+ Loader function that retrieves the ids and names of known Wikipedia bots.
 296+ '''
 297+ ids = {}
 298+ mongo = db.init_mongo_db('bots')
 299+ bots = mongo['ids']
 300+ cursor = bots.find()
 301+ for bot in cursor:
 302+ ids[bot['id']] = bot['name']
 303+ return ids
 304+
 305+
 306+def run_parse_editors(location, language, project):
 307+ ids = load_bot_ids()
 308+ base = os.path.join(location, language, project)
 309+ input = os.path.join(base, 'chunks')
 310+ output = os.path.join(base, 'txt')
 311+ settings.verify_environment([input, output])
 312+ files = utils.retrieve_file_list(input, 'xml')
 313+
 314+ kwargs = {'bots': ids,
 315+ 'dbname': language + project,
 316+ 'language': language,
 317+ 'project': project,
 318+ 'pbar': True,
 319+ 'destination': 'file',
 320+ 'nr_input_processors': settings.number_of_processes,
 321+ 'nr_output_processors': settings.number_of_processes,
 322+ 'input': input,
 323+ 'output': output,
 324+ }
 325+
 326+ chunks = utils.split_list(files, settings.number_of_processes)
 327+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
 328+
 329+
 330+def debug_parse_editors(dbname):
 331+ q = JoinableQueue()
 332+ parse_editors('522.xml', q, debug=True, destination='file')
 333+ store_editors(q, dbname=dbname)
 334+
 335+
 336+if __name__ == "__main__":
 337+ #debug_parse_editors('test2')
 338+ run_parse_editors(settings.input_location, 'en', 'wiki')
 339+ pass
Property changes on: trunk/tools/editor_trends/etl/extract.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
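
A quick way to smoke-test the extraction stage on a single chunk is to drive parse_editors() directly in debug mode from within this module; a minimal sketch, assuming a chunk named 0.xml exists under the chunks directory (all paths are examples):

    import os
    base = os.path.join(settings.input_location, 'en', 'wiki')
    kwargs = {'input': os.path.join(base, 'chunks'),
              'output': os.path.join(base, 'txt'),
              'destination': 'file',
              'bots': load_bot_ids(),
              'debug': True}
    parse_editors('0.xml', None, **kwargs)
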
Index: trunk/tools/editor_trends/etl/construct_datasets.py
@@ -0,0 +1,255 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+import datetime
 24+from dateutil.relativedelta import relativedelta
 25+
 26+import progressbar
 27+
 28+import configuration
 29+settings = configuration.Settings()
 30+from utils import models, utils
 31+from database import db
 32+from utils import process_constructor as pc
 33+
 34+try:
 35+ import psyco
 36+ psyco.full()
 37+except ImportError:
 38+ pass
 39+
 40+
 41+def retrieve_editor_ids_mongo(dbname, collection):
 42+ if utils.check_file_exists(settings.binary_location,
 43+ 'editors.bin'):
 44+ ids = utils.load_object(settings.binary_location,
 45+ 'editors.bin')
 46+ else:
 47+ mongo = db.init_mongo_db(dbname)
 48+ editors = mongo[collection]
 49+ ids = editors.distinct('editor')
 50+ utils.store_object(ids, settings.binary_location, 'editors')
 51+ return ids
 52+
 53+
 54+def expand_edits(edits):
 55+ data = []
 56+ for edit in edits:
 57+ data.append(edit['date'])
 58+ return data
 59+
 60+
 61+def expand_observations(obs, vars_to_expand):
 62+ for var in vars_to_expand:
 63+ if var == 'edits':
 64+ obs[var] = expand_edits(obs[var])
 65+ elif var == 'edits_by_year':
 66+ keys = obs[var].keys()
 67+ keys.sort()
 68+ edits = []
 69+ for key in keys:
 70+ edits.append(str(obs[var][key]))
 71+ obs[var] = edits
 72+ return obs
 73+
 74+def write_longitudinal_data(id, edits, fh):
 75+ years = edits.keys()
 76+ years.sort()
 77+ for year in years:
 78+ months = edits[year].keys()
 79+ months = [int(m) for m in months]
 80+ months.sort()
 81+ for m in months:
 82+ date = datetime.date(int(year), int(m), 1)
 83+ fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)]))
 84+
 85+
 86+def expand_headers(headers, vars_to_expand, obs):
 87+ for var in vars_to_expand:
 88+ l = len(obs[var])
 89+ pos = headers.index(var)
 90+ for i in xrange(l):
 91+ if var.endswith('year'):
 92+ suffix = 2001 + i
 93+ elif var.endswith('edits'):
 94+ suffix = 1 + i
 95+ headers.insert(pos + i, '%s_%s' % (var, suffix))
 96+ headers.remove(var)
 97+ return headers
 98+
 99+
 100+def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs):
 101+ debug = kwargs.pop('debug')
 102+ dbname = kwargs.pop('dbname')
 103+ mongo = db.init_mongo_db(dbname)
 104+ editors = mongo['dataset']
 105+ name = dbname + '_long_editors.csv'
 106+ fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
 107+ x = 0
 108+ vars_to_expand = []
 109+ while True:
 110+ try:
 111+ id = input_queue.get(block=False)
 112+ obs = editors.find_one({'editor': id}, {'monthly_edits': 1})
 113+ if x == 0:
 114+ headers = obs.keys()
 115+ headers.sort()
 116+ headers = expand_headers(headers, vars_to_expand, obs)
 117+ utils.write_list_to_csv(headers, fh)
 118+ write_longitudinal_data(id, obs['monthly_edits'], fh)
 119+ #utils.write_list_to_csv(data, fh)
 120+ x += 1
 121+ except Empty:
 122+ break
 123+
 124+
 125+def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs):
 126+ dbname = kwargs.get('dbname')
 127+ pbar = kwargs.get('pbar')
 128+ mongo = db.init_mongo_db(dbname)
 129+ editors = mongo['dataset']
 130+ year = datetime.datetime.now().year + 1
 131+ begin = year - 2001
 132+ p = [3, 6, 9]
 133+ periods = [y * 12 for y in xrange(1, begin)]
 134+ periods = p + periods
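 + #test windows of 3, 6 and 9 months plus every whole year (12, 24, ... months)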
 135+ data = {}
 136+ while True:
 137+ try:
 138+ id = input_queue.get(block=False)
 139+ obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
 140+ first_edit = obs['first_edit']
 141+ last_edit = obs['final_edit']
 142+ for y in xrange(2001, year):
 145+ if y not in data:
 146+ data[y] = {}
 147+ data[y]['n'] = 0
 148+ window_end = datetime.datetime(y, 12, 31)
 149+ if window_end > datetime.datetime.now():
 150+ now = datetime.datetime.now()
 151+ m = now.month - 1 #Dump files are always lagging at least one month....
 152+ d = now.day
 153+ window_end = datetime.datetime(y, m, d)
 154+ edits = []
 155+ for period in periods:
 156+ if period not in data[y]:
 157+ data[y][period] = 0
 158+ window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period)
 159+ if window_start < datetime.datetime(2001, 1, 1):
 160+ window_start = datetime.datetime(2001, 1, 1)
 161+ if date_falls_in_window(window_start, window_end, first_edit, last_edit):
 162+ edits.append(period)
 163+ if edits != []:
 164+ p = min(edits)
 165+ data[y]['n'] += 1
 166+ data[y][p] += 1
 167+ #pbar.update(+1)
 168+ except Empty:
 169+ break
 170+ utils.store_object(data, settings.binary_location, 'cohort_data')
 171+
 172+def date_falls_in_window(window_start, window_end, first_edit, last_edit):
 173+ return window_start <= first_edit <= window_end
 177+
 178+
 179+def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs):
 180+ dbname = kwargs.pop('dbname')
 + debug = kwargs.pop('debug', False)
 181+ mongo = db.init_mongo_db(dbname)
 182+ editors = mongo['dataset']
 183+ name = dbname + '_wide_editors.csv'
 184+ fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
 185+ x = 0
 186+ vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year']
 187+ while True:
 188+ try:
 189+ if debug:
 190+ id = u'99797'
 191+ else:
 192+ id = input_queue.get(block=False)
 193+ print input_queue.qsize()
 194+ obs = editors.find_one({'editor': id})
 195+ obs = expand_observations(obs, vars_to_expand)
 196+ if x == 0:
 197+ headers = obs.keys()
 198+ headers.sort()
 199+ headers = expand_headers(headers, vars_to_expand, obs)
 200+ utils.write_list_to_csv(headers, fh)
 201+ data = []
 202+ keys = obs.keys()
 203+ keys.sort()
 204+ for key in keys:
 205+ data.append(obs[key])
 206+ utils.write_list_to_csv(data, fh)
 207+
 208+ x += 1
 209+ except Empty:
 210+ break
 211+ fh.close()
 212+
 213+
 214+def retrieve_edits_by_contributor_launcher():
 215+ pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
 216+
 217+
 218+def debug_retrieve_edits_by_contributor_launcher(dbname):
 219+ kwargs = {'debug': False,
 220+ 'dbname': dbname,
 221+ }
 222+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 223+ input_queue = pc.load_queue(ids)
 224+ q = Queue()
 225+ generate_wide_editor_dataset(input_queue, q, False, **kwargs)
 226+
 227+
 228+def generate_editor_dataset_launcher(dbname):
 229+ kwargs = {'nr_input_processors': 1,
 230+ 'nr_output_processors': 1,
 231+ 'debug': False,
 232+ 'dbname': dbname,
 233+ 'poison_pill':False,
 234+ 'pbar': True
 235+ }
 236+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 237+ ids = list(ids)
 238+ chunks = dict({0: ids})
 239+ pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs)
 240+
 241+
 242+def generate_editor_dataset_debug(dbname):
 243+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 244+ input_queue = pc.load_queue(ids)
 245+ kwargs = {'nr_input_processors': 1,
 246+ 'nr_output_processors': 1,
 247+ 'debug': True,
 248+ 'dbname': dbname,
 249+ }
 250+ generate_wide_editor_dataset(input_queue, False, False, **kwargs)
 251+
 252+
 253+if __name__ == '__main__':
 254+ #generate_editor_dataset_debug('test')
 255+ generate_editor_dataset_launcher('enwiki')
 256+ #debug_retrieve_edits_by_contributor_launcher()
Property changes on: trunk/tools/editor_trends/etl/construct_datasets.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
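
Worked example of the window arithmetic in generate_cohort_analysis(): for year 2005 and a 24-month period, the window runs from 2003-12-31 to 2005-12-31, and an editor whose first edit falls inside it is counted in the smallest matching bucket for 2005. A short illustration (dates are examples):

    import datetime
    from dateutil.relativedelta import relativedelta
    window_end = datetime.datetime(2005, 12, 31)
    window_start = window_end - relativedelta(months=24) # 2003-12-31
    first_edit = datetime.datetime(2004, 6, 15)
    print window_start <= first_edit <= window_end # True
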
Index: trunk/tools/editor_trends/etl/__init__.py
Property changes on: trunk/tools/editor_trends/etl/__init__.py
___________________________________________________________________
Added: svn:eol-style
 + native
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -0,0 +1,211 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+import xml.etree.cElementTree as cElementTree
 22+import sys
 23+import codecs
 24+import re
 25+import json
 26+import os
 27+
 28+import progressbar
 29+
 30+
 31+sys.path.append('..')
 32+import configuration
 33+from utils import utils
 34+from wikitree import xml
 35+settings = configuration.Settings()
 36+
 37+try:
 38+ import psyco
 39+ psyco.full()
 40+except ImportError:
 41+ pass
 42+
 43+
 44+RE_NUMERIC_CHARACTER = re.compile(r'&#(\d+);')
 45+
 46+
 47+def remove_numeric_character_references(text):
 48+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
 49+
 50+
 51+def lenient_deccharref(m):
 52+ try:
 53+ return unichr(int(m.group(1)))
 54+ except ValueError:
 55+ '''
 56+ A few articles raise a ValueError here because this is a narrow Python
 57+ build (UCS2) instead of a wide build (UCS4). The quick fix is to return
 58+ an empty string; the real solution is to rebuild Python with UCS4 support.
 59+ '''
 61+ return ''
 62+
 63+
 64+def remove_namespace(element, namespace):
 65+ '''Remove namespace from the XML document.'''
 66+ ns = u'{%s}' % namespace
 67+ nsl = len(ns)
 68+ for elem in element.getiterator():
 69+ if elem.tag.startswith(ns):
 70+ elem.tag = elem.tag[nsl:]
 71+ return element
 72+
 73+
 74+def load_namespace(language):
 75+ file = '%s_ns.json' % language
 76+ fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
 77+ ns = json.load(fh)
 78+ fh.close()
 79+ ns = ns['query']['namespaces']
 80+ return ns
 81+
 82+
 83+def build_namespaces_locale(namespaces):
 84+ '''
 85+ Construct a list of all the non-main namespaces
 86+ '''
 87+ ns = []
 88+ for namespace in namespaces:
 89+ value = namespaces[namespace].get(u'*', None)
 90+ if value != None and value != '':
 91+ ns.append(value)
 92+ return ns
 93+
 94+
 95+def parse_comments(element, function):
 96+ revisions = element.findall('revision')
 97+ for revision in revisions:
 98+ comment = revision.find('comment')
 99+ if comment != None and comment.text != None:
 100+ comment.text = function(comment.text)
 101+ return element
 103+
 104+
 105+def is_article_main_namespace(elem, namespace):
 106+ '''
 107+ Checks whether the article belongs to the main namespace, i.e. its title
 108+ carries no namespace prefix such as 'Talk:'.
 109+ '''
 110+ title = elem.find('title').text
 111+ for ns in namespace:
 112+ if title.startswith(ns + ':'):
 113+ return False
 114+ return True
 114+
 115+
 116+def write_xml_file(element, fh, counter, language):
 117+ '''Get file handle and write xml element to file'''
 118+ size = len(cElementTree.tostring(element))
 119+ fh, counter = create_file_handle(fh, counter, size, language)
 120+ try:
 121+ fh.write(cElementTree.tostring(element))
 122+ except MemoryError:
 123+ print 'Add error capturing logic'
 124+ fh.write('\n')
 125+ return fh, counter
 126+
 127+
 128+def create_file_handle(fh, counter, size, language):
 129+ '''Create file handle if none is supplied or if file size > max file size.'''
 130+ if not counter:
 131+ counter = 0
 132+ if not fh:
 133+ path = os.path.join(settings.input_location, language, '%s.xml' % counter)
 134+ fh = codecs.open(path, 'w', encoding=settings.encoding)
 135+ return fh, counter
 136+ elif (fh.tell() + size) > settings.max_xmlfile_size: #assumed name of the size limit setting
 137+ print 'Created chunk %s' % counter
 138+ fh.close()
 139+ counter += 1
 140+ path = os.path.join(settings.input_location, language, '%s.xml' % counter)
 141+ fh = codecs.open(path, 'w', encoding=settings.encoding)
 142+ return fh, counter
 143+ else:
 144+ return fh, counter
 144+
 145+
 146+def flatten_xml_elements(data, page):
 147+ flat = []
 148+ for x, elems in enumerate(data):
 149+ flat.append([page])
 150+ for elem in elems:
 151+ if elem.tag != 'id':
 152+ if len(elem.getchildren()) > 0:
 153+ for el in elem.getchildren():
 154+ flat[x].append(xml.extract_text(el, None))
 155+ else:
 156+ flat[x].append(xml.extract_text(elem, None))
 157+ return flat
 158+
 159+
 160+def split_file(output, input, project, language_code, language, format='xml'):
 161+ '''Reads xml file and splits it in N chunks'''
 162+ #location = os.path.join(settings.input_location, language)
 163+ output = os.path.join(output, language_code, project)
 164+ settings.verify_environment([output])
 165+ if format == 'xml':
 166+ fh = None
 167+ else:
 168+ f = input.replace('.xml', '')
 169+ fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding)
 170+
 171+ ns = load_namespace(language_code)
 172+ ns = build_namespaces_locale(ns)
 173+
 174+
 175+ counter = None
 176+ tag = '{%s}page' % settings.xml_namespace
 177+
 178+
 179+ context = cElementTree.iterparse(input, events=('start', 'end'))
 180+ context = iter(context)
 181+ event, root = context.next() #get the root element of the XML doc
 182+
 183+ try:
 184+ for event, elem in context:
 185+ if event == 'end':
 186+ if elem.tag == tag:
 187+ elem = remove_namespace(elem, settings.xml_namespace)
 188+ if is_article_main_namespace(elem, ns):
 189+ page = elem.find('id').text
 190+ elem = parse_comments(elem, remove_numeric_character_references)
 191+ if format == 'xml':
 192+ fh, counter = write_xml_file(elem, fh, counter, language_code)
 193+ else:
 194+ data = [el.getchildren() for el in elem if el.tag == 'revision']
 195+ data = flatten_xml_elements(data, page)
 196+ utils.write_list_to_csv(data, fh, recursive=False, newline=True)
 197+ root.clear() # when done parsing a section clear the tree to save memory
 198+ except SyntaxError:
 199+ f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
 200+ f.write(cElementTree.tostring(elem))
 201+ f.close()
 202+ finally:
 203+ fh.close()
 204+
 205+if __name__ == "__main__":
 206+ kwargs = {'output': settings.input_location,
 207+ 'input': settings.input_filename,
 208+ 'project':'wiki',
 209+ 'language_code':'en',
 210+ 'format': 'tsv'
 211+ }
 212+ split_file(**kwargs)
Property changes on: trunk/tools/editor_trends/etl/chunker.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
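
The iterparse/root.clear() combination in split_file() is the standard cElementTree pattern for streaming multi-gigabyte dumps with a flat memory footprint. A minimal self-contained sketch of the same pattern (the file name and tag test are examples):

    import xml.etree.cElementTree as cElementTree

    context = cElementTree.iterparse('dump.xml', events=('start', 'end'))
    context = iter(context)
    event, root = context.next() #grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag.endswith('page'):
            #process the completed <page> element here, then free it
            root.clear() #drop finished children to keep memory use flat
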
Index: trunk/tools/editor_trends/etl/xml2pig.py
@@ -0,0 +1,30 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-15'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import os
 25+import xml.etree.cElementTree as cElementTree
 26+
 27+import configuration
 28+settings = configuration.Settings()
 29+import chunker #assumed: chunker.py replaces the old split_xml_file module
 30+
 31+
Index: trunk/tools/editor_trends/etl/loader.py
@@ -0,0 +1,140 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-16'
 19+__version__ = '0.1'
 20+
 21+
 22+import sys
 +import os
 +from Queue import Empty
 23+
 24+sys.path.append('..')
 25+import configuration
 26+settings = configuration.Settings()
 27+from database import db
 28+from database import cache
 29+from utils import utils
 30+import process_constructor as pc
 31+
 32+
 33+def store_editors(input, filename, dbname):
 34+ fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
 35+ mongo = db.init_mongo_db(dbname)
 36+ collection = mongo['test']
 37+ collection.ensure_index('editor')
 39+ editor_cache = cache.EditorCache(collection)
 40+ prev_contributor = -1
 41+ x = 0
 42+ edits = 0
 43+ editors = set()
 44+ for line in utils.readline(fh): #assumes a line-splitting generator in utils
 45+ if len(line) == 0:
 46+ continue
 47+ contributor = int(line[0])
 50+ if prev_contributor != contributor:
 51+ if edits >= 10:
 52+ result = editor_cache.add(prev_contributor, 'NEXT')
 53+ if result:
 54+ editors.add(prev_contributor)
 55+ result = None
 56+ x += 1
 57+ print 'Stored %s editors' % x
 58+ else:
 59+ editor_cache.clear(prev_contributor)
 60+ edits = 0
 61+ edits += 1
 62+ date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1)
 63+ article_id = int(line[2])
 64+ value = {'date': date, 'article': article_id}
 65+ editor_cache.add(contributor, value)
 66+ prev_contributor = contributor
 67+ fh.close()
 68+ utils.store_object(editors, settings.binary_location, 'editors')
 69+
 70+
 71+def mergesort_external_launcher(dbname, input, output):
 72+ files = utils.retrieve_file_list(input, 'txt', mask='')
 73+ x = 0
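 + #increase the number of groups until each group needs fewer open file
 + #handles than the operating system allows (settings.max_filehandles)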
 74+ maxval = 99999
 75+ while maxval >= settings.max_filehandles:
 76+ x += 1.0
 77+ maxval = round(len(files) / x)
 78+ chunks = utils.split_list(files, int(x))
 79+ '''1st iteration external mergesort'''
 80+ for chunk in chunks:
 81+ filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
 82+ filename = merge_sorted_files(output, filehandles, chunk)
 83+ filehandles = [fh.close() for fh in filehandles]
 84+ pass
 85+ '''2nd iteration external mergesort, if necessary'''
 86+ if len(chunks) > 1:
 87+ files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
 88+ filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files]
 89+ filename = merge_sorted_files(output, filehandles, 'final')
 90+ filehandles = [fh.close() for fh in filehandles]
 91+ filename = 'merged_final.txt'
 92+ store_editors(output, filename, dbname)
 93+
 94+
 95+def mergesort_feeder(input_queue, result_queue, **kwargs):
 96+ input = kwargs.get('input', None)
 97+ output = kwargs.get('output', None)
 98+ while True:
 99+ try:
 100+ file = input_queue.get(block=False)
 101+ fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
 102+ data = fh.readlines()
 103+ fh.close()
 104+ data = [d.replace('\n', '') for d in data]
 105+ data = [d.split('\t') for d in data]
 106+ sorted_data = mergesort(data)
 107+ write_sorted_file(sorted_data, file, output)
 108+ except Empty:
 109+ break
 110+
 111+
 112+def mergesort_launcher(input, output):
 113+ kwargs = {'pbar': True,
 114+ 'nr_input_processors': settings.number_of_processes,
 115+ 'nr_output_processors': settings.number_of_processes,
 116+ 'input': input,
 117+ 'output': output,
 118+ 'poison_pill': False
 119+ }
 120+ files = utils.retrieve_file_list(input, 'txt')
 121+ chunks = utils.split_list(files, settings.number_of_processes)
 122+ pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
 123+
 124+
 125+def debug_mergesort_feeder(input, output):
 126+ kwargs = {
 127+ 'input': input,
 128+ 'output': output,
 129+ }
 130+ files = utils.retrieve_file_list(input, 'txt')
 131+ chunks = utils.split_list(files, settings.number_of_processes)
 132+ q = pc.load_queue(chunks[0])
 133+ mergesort_feeder(q, False, **kwargs)
 134+
 135+
 136+if __name__ == '__main__':
 137+ input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
 138+ output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
 139+ dbname = 'enwiki'
 140+ mergesort_launcher(input, output)
 141+ mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/etl/loader.py
___________________________________________________________________
Added: svn:eol-style
 + native
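
Note that mergesort() and write_sorted_file(), called from mergesort_feeder(), are not defined or imported in this revision. A minimal sketch of what the call sites appear to expect, with rows compared lexicographically so ties on editor id fall back to the timestamp column (names and behaviour are assumptions):

    def mergesort(data):
        '''Recursively merge-sort rows of [editor_id, timestamp, article_id].'''
        if len(data) <= 1:
            return data
        mid = len(data) / 2
        left, right = mergesort(data[:mid]), mergesort(data[mid:])
        result = []
        while left and right:
            result.append(left.pop(0) if left[0] <= right[0] else right.pop(0))
        return result + left + right

    def write_sorted_file(sorted_data, file, output):
        '''Write tab-separated rows to the output directory.'''
        fh = utils.create_txt_filehandle(output, file, 'w', settings.encoding)
        for line in sorted_data:
            fh.write('\t'.join(line) + '\n')
        fh.close()
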
Index: trunk/tools/editor_trends/etl/bots.py
@@ -0,0 +1,123 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import os
 19+import cStringIO
 20+import xml.etree.cElementTree as cElementTree
 21+
 22+
 23+import configuration
 24+settings = configuration.Settings()
 25+from wikitree import xml
 26+from database import db
 27+from database import db_settings
 28+from utils import utils
 29+from utils import process_constructor as pc
 30+
 31+try:
 32+ import psyco
 33+ psyco.full()
 34+except ImportError:
 35+ pass
 36+
 37+
 38+def create_bot_ids_db_mongo():
 39+ ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding)
 40+ mongo = db.init_mongo_db('bots')
 41+ collection = mongo['ids']
 42+
 43+ db.remove_documents_from_mongo_db(collection, None)
 44+
 45+ for id, name in ids.iteritems():
 46+ collection.insert({'id': id, 'name': name})
 47+
 48+ print collection.count()
 49+
 50+
 51+def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
 52+ '''
 53+ This function finds the ids belonging to the different bots that patrol
 54+ the Wikipedia sites.
 55+ @input_queue contains a list of xml files to parse
 56+
 57+ @result_queue should be set to False as the results are written directly to
 58+ a csv file.
 59+
 60+ @progressbar depends on settings
 61+
 62+ @bots is a dictionary containing the names of the bots to look up
 63+ '''
 64+
 65+ #if len(bots.keys()) == 1:
 66+ bots = bots['bots']
 67+ #print bots.keys()
 68+
 69+ if settings.debug:
 70+ messages = {}
 71+
 72+ while True:
 73+ if debug:
 74+ file = input_queue
 75+ else:
 76+ file = input_queue.get(block=False)
 77+
 78+ if file == None:
 79+ break
 80+
 81+ data = xml.read_input(utils.open_txt_file(settings.input_location +
 82+ file, 'r', encoding=settings.encoding))
 83+
 84+ for raw_data in data:
 85+ xml_buffer = cStringIO.StringIO()
 86+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 87+ raw_data = ''.join(raw_data)
 88+ raw_data = raw_data.encode('utf-8')
 89+ xml_buffer.write(raw_data)
 90+
 91+ try:
 92+ xml_nodes = cElementTree.XML(xml_buffer.getvalue())
 93+ revisions = xml_nodes.findall('revision')
 94+ for revision in revisions:
 95+ contributor = xml.retrieve_xml_node(revision, 'contributor')
 96+ username = contributor.find('username')
 97+ if username == None:
 98+ continue
 99+ username = xml.extract_text(username)
 100+ #print username.encode('utf-8')
 101+
 102+ if username in bots:
 103+ id = contributor.find('id')
 104+ id = xml.extract_text(id)
 105+ #print username.encode('utf-8'), id
 106+ utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding)
 107+ bots.pop(username)
 108+ if bots == {}:
 109+ print 'Mission accomplished'
 110+ return
 111+ except Exception, error:
 112+ print error
 113+ if settings.debug:
 114+ messages = utils.track_errors(xml_buffer, error, file,
 115+ messages)
 116+
 117+ if settings.debug:
 118+ utils.report_error_messages(messages, lookup_username)
 119+
 120+
 121+if __name__ == '__main__':
 122+ #debug()
 123+ #add_id_to_botnames()
 124+ create_bot_ids_db_mongo()
Property changes on: trunk/tools/editor_trends/etl/bots.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
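
The 'bots' database written by create_bot_ids_db_mongo() is what load_bot_ids() in extract.py reads back at the start of a run. A quick sanity check of the stored documents (assuming a local MongoDB with the database and collection names used above):

    from database import db
    mongo = db.init_mongo_db('bots')
    for bot in mongo['ids'].find().limit(3):
        print bot['id'], bot['name']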

Status & tagging log