r76846 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r76845 | r76846 | r76847 >
Date: 23:13, 16 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Major refactoring:
* moved files to the etl (extract-transform-load) directory

This commit still contains some small issues.
Modified paths:
  • /trunk/tools/editor_trends/etl (added) (history)
  • /trunk/tools/editor_trends/etl/__init__.py (added) (history)
  • /trunk/tools/editor_trends/etl/bots.py (added) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (added) (history)
  • /trunk/tools/editor_trends/etl/construct_datasets.py (added) (history)
  • /trunk/tools/editor_trends/etl/extract.py (added) (history)
  • /trunk/tools/editor_trends/etl/loader.py (added) (history)
  • /trunk/tools/editor_trends/etl/optimize_editors.py (added) (history)
  • /trunk/tools/editor_trends/etl/xml2pig.py (added) (history)

Diff

Index: trunk/tools/editor_trends/etl/optimize_editors.py
@@ -0,0 +1,172 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-02'
 19+__version__ = '0.1'
 20+
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+from operator import itemgetter
 24+import datetime
 +import copy
 25+
 26+import configuration
 27+settings = configuration.Settings()
 28+from database import db
 29+from utils import process_constructor as pc
 30+from utils import utils
 31+import construct_datasets
 32+
 33+
 34+try:
 35+ import psyco
 36+ psyco.full()
 37+except ImportError:
 38+ pass
 39+
 40+
 41+def create_datacontainer(init_value=0):
 42+ '''
 43+ This function initializes a dictionary with one key per year (from 2001
 44+ through the current year) and @init_value as the value. Usually this is
 45+ zero, making the dictionary a running tally, but @init_value can also be a
 46+ list [], a dictionary {} or a set set(); each year gets its own deep copy.
 47+ '''
 48+ data = {}
 49+ year = datetime.datetime.now().year + 1
 50+ for x in xrange(2001, year):
 51+ data[str(x)] = copy.deepcopy(init_value)
 52+ return data
 53+
 54+
 55+def add_months_to_datacontainer(datacontainer):
 56+ for dc in datacontainer:
 57+ datacontainer[dc] = {}
 58+ for x in xrange(1, 13):
 59+ datacontainer[dc][str(x)] = 0
 60+ return datacontainer
 61+
 62+
 63+def determine_edits_by_month(edits):
 64+ datacontainer = create_datacontainer(init_value=0)
 65+ datacontainer = add_months_to_datacontainer(datacontainer)
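 + #This records a 0/1 activity flag per month, not an edit count: a month
 + #is marked as soon as one edit is seen, and a year is skipped once all
 + #twelve months are marked.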
 66+ for year in edits:
 67+ months = set()
 68+ for edit in edits[year]:
 69+ m = str(edit['date'].month)
 70+ if m not in months:
 71+ datacontainer[year][m] = 1
 72+ months.add(m)
 73+ if len(months) == 12:
 74+ break
 75+ return datacontainer
 76+
 77+
 78+def determine_edits_by_year(dates):
 79+ '''
 80+ This function counts the number of edits by year made by a particular editor.
 81+ '''
 82+ edits = create_datacontainer()
 83+ for date in dates:
 84+ year = str(date['date'].year)
 85+ edits[year] += 1
 86+ return edits
 87+
 88+
 89+def determine_articles_by_year(dates):
 90+ '''
 91+ This function counts the number of unique articles by year edited by a
 92+ particular editor.
 93+ '''
 94+ articles = create_datacontainer(set())
 95+ for date in dates:
 96+ year = str(date['date'].year)
 97+ articles[year].add(date['article'])
 98+ for year in articles:
 99+ articles[year] = len(articles[year])
 100+ return articles
 101+
 102+
 103+def sort_edits(edits):
 104+ edits = utils.merge_list(edits)
 105+ return sorted(edits, key=itemgetter('date'))
 106+
 107+
 108+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
 109+ dbname = kwargs.pop('dbname')
 110+ mongo = db.init_mongo_db(dbname)
 111+ input = mongo['test']
 112+ output = mongo['dataset']
 113+ output.ensure_index('editor')
 114+ output.ensure_index('year_joined')
 115+ definition = kwargs.pop('definition')
 116+ while True:
 117+ try:
 118+ id = input_queue.get(block=False)
 119+ editor = input.find_one({'editor': id})
 120+ if editor == None:
 121+ continue
 122+ edits = editor['edits']
 123+ monthly_edits = determine_edits_by_month(edits)
 124+ edits = sort_edits(edits)
 125+ edit_count = len(edits)
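 + #the 10th edit marks the date on which an editor counts as a 'new wikipedian'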
 126+ new_wikipedian = edits[9]['date']
 127+ first_edit = edits[0]['date']
 128+ final_edit = edits[-1]['date']
 129+ edits_by_year = determine_edits_by_year(edits)
 130+ articles_by_year = determine_articles_by_year(edits)
 131+
 132+ edits = edits[:10]
 133+
 134+ output.insert({'editor': id, 'edits': edits,
 135+ 'edits_by_year': edits_by_year,
 136+ 'new_wikipedian': new_wikipedian,
 137+ 'edit_count': edit_count,
 138+ 'final_edit': final_edit,
 139+ 'first_edit': first_edit,
 140+ 'articles_by_year': articles_by_year,
 141+ 'monthly_edits': monthly_edits})
 142+ print 'Items left: %s' % input_queue.qsize()
 143+ except Empty:
 144+ break
 145+
 146+
 147+def run_optimize_editors(dbname):
 148+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 149+ kwargs = {'definition': 'traditional',
 150+ 'pbar': True,
 151+ 'dbname': dbname,
 152+ 'nr_input_processors': 1,
 153+ 'nr_output_processors': 0,
 154+ 'poison_pill': False
 155+ }
 156+ print len(ids)
 157+ ids = list(ids)
 158+ chunks = dict({0: ids})
 159+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
 160+
 161+
 162+def debug_optimize_editors(dbname):
 163+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 164+ q = pc.load_queue(ids)
 165+ kwargs = {'definition': 'traditional',
 166+ 'dbname': dbname
 167+ }
 168+ optimize_editors(q, False, True, **kwargs)
 169+
 170+
 171+if __name__ == '__main__':
 172+ #debug_optimize_editors('test')
 173+ run_optimize_editors('enwiki')
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/etl/optimize_editors.py
___________________________________________________________________
Added: svn:eol-style
 + native
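
Note: create_datacontainer() deep-copies @init_value for each year; a single mutable value shared across all years would silently merge the per-year tallies. A minimal interpreter illustration of the pitfall (the values are examples):

    >>> shared = set()
    >>> data = dict((str(y), shared) for y in xrange(2001, 2004))
    >>> data['2001'].add('Main Page')
    >>> data['2003']
    set(['Main Page'])
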
Index: trunk/tools/editor_trends/etl/extract.py
@@ -0,0 +1,338 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+#Default Python libraries (Python => 2.6)
 22+import sys
 23+import os
 24+import time
 25+import datetime
 26+import codecs
 27+import math
 28+import cStringIO
 29+import re
 30+from operator import itemgetter
 31+import xml.etree.cElementTree as cElementTree
 32+from multiprocessing import Queue, JoinableQueue
 33+from Queue import Empty
 34+import pymongo
 35+
 36+# Custom written files
 37+import configuration
 38+settings = configuration.Settings()
 39+from utils import utils, models
 40+from database import db_settings
 41+from database import db
 42+from database import cache
 43+from wikitree import xml
 44+from statistics import dataset
 45+from utils import process_constructor as pc
 46+
 47+
 48+try:
 49+ import psyco
 50+ psyco.full()
 51+except ImportError:
 52+ pass
 53+
 54+
 55+def determine_username_is_bot(username, kwargs):
 56+ '''
 57+ @username is the xml element containing the id of the user
 58+ @kwargs should have a list with all the bot ids
 59+
 60+ @Return 1 if the username id is a bot id, 0 otherwise.
 62+ '''
 63+ ids = kwargs.get('bots', [])
 64+ if ids == None:
 65+ ids = []
 66+ if username != None and username.text != None:
 67+ id = username.text
 68+ if id in ids:
 69+ return 1
 70+ else:
 71+ return 0
 72+
 73+
 74+def extract_contributor_id(contributor, kwargs):
 75+ '''
 76+ @contributor is the xml contributor node containing a number of attributes
 77+
 78+ Currently, we are only interested in registered contributors, hence we
 79+ ignore anonymous editors. If you are interested in collecting data on
 80+ anonymous editors then add the string 'ip' to the tags variable.
 81+ '''
 82+ tags = ['id']
 83+ if contributor.get('deleted'):
 84+ return -1 # ASK: Not sure if this is the best way to code deleted contributors.
 85+ for elem in contributor:
 86+ if elem.tag in tags:
 87+ if elem.text != None:
 88+ return elem.text.decode('utf-8')
 89+ else:
 90+ return -1
 91+
 92+
 93+def output_editor_information(elem, output, **kwargs):
 94+ '''
 95+ @elem is an XML element containing 1 revision from a page
 96+ @output is where to store the data, either a queue or a filehandle
 97+ @**kwargs contains extra information
 98+
 99+ The variable tags determines which attributes are parsed; the values in
 100+ this dictionary are the functions used to extract the data.
 101+ '''
 102+ tags = {'contributor': {'editor': extract_contributor_id,
 103+ 'bot': determine_username_is_bot},
 104+ 'timestamp': {'date': xml.extract_text},
 105+ }
 106+ vars = {}
 107+ headers = ['editor', 'date', 'article']
 108+ destination = kwargs.pop('destination')
 109+ revisions = elem.findall('revision')
 110+ for revision in revisions:
 111+ vars['article'] = elem.find('id').text.decode(settings.encoding)
 112+ elements = revision.getchildren()
 113+ for tag, functions in tags.iteritems():
 114+ xml_node = xml.retrieve_xml_node(elements, tag)
 115+ for var, function in functions.iteritems():
 116+ vars[var] = function(xml_node, kwargs)
 117+
 118+ #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
 119+ if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
 120+ vars.pop('bot')
 121+ if destination == 'queue':
 122+ vars['date'] = utils.convert_timestamp_to_date(vars['date'])
 123+ output.put(vars)
 124+ elif destination == 'file':
 125+ data = []
 126+ for head in headers:
 127+ data.append(vars[head])
 128+ utils.write_list_to_csv(data, output)
 129+ vars = {}
 130+
 131+
 132+def parse_editors(xml_queue, data_queue, **kwargs):
 133+ '''
 134+ @xml_queue contains the filenames of the files to be parsed
 135+ @data_queue is an instance of Queue where the extracted data is stored for
 136+ further processing
 137+ @pbar is an instance of progressbar to display the progress
 138+ @bots is a list of id's of known Wikipedia bots
 139+ @debug is a flag to indicate whether the function is called for debugging.
 140+
 141+ Output is the data_queue that will be used by store_editors()
 142+ '''
 143+ input = kwargs.get('input', None)
 144+ output = kwargs.get('output', None)
 145+ debug = kwargs.get('debug', False)
 146+ destination = kwargs.get('destination', 'file')
 147+ bots = kwargs.get('bots', None)
 148+ pbar = kwargs.get('pbar', None)
 149+ if settings.debug:
 150+ messages = {}
 151+ vars = {}
 152+
 153+ while True:
 154+ try:
 155+ if debug:
 156+ file = xml_queue
 157+ else:
 158+ file = xml_queue.get(block=False)
 159+ if file == None:
 160+ print 'Swallowed a poison pill'
 161+ break
 162+
 163+ data = xml.read_input(utils.create_txt_filehandle(input,
 164+ file, 'r',
 165+ encoding=settings.encoding))
 166+ if destination == 'file':
 167+ name = file[:-4] + '.txt'
 168+ fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding)
 + else:
 + fh = output #in queue mode records are put straight onto the output queue
 169+ for raw_data in data:
 170+ xml_buffer = cStringIO.StringIO()
 171+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 172+
 173+ try:
 174+ raw_data = ''.join(raw_data)
 175+ xml_buffer.write(raw_data)
 176+ elem = cElementTree.XML(xml_buffer.getvalue())
 177+ output_editor_information(elem, fh, bots=bots, destination=destination)
 178+ except SyntaxError, error:
 179+ print error
 180+ '''
 181+ There are few cases with invalid tokens, they are fixed
 182+ here and then reinserted into the XML DOM
 183+ data = convert_html_entities(xml_buffer.getvalue())
 184+ elem = cElementTree.XML(data)
 185+ output_editor_information(elem)
 186+ '''
 187+ if settings.debug:
 188+ utils.track_errors(xml_buffer, error, file, messages)
 189+ except UnicodeEncodeError, error:
 190+ print error
 191+ if settings.debug:
 192+ utils.track_errors(xml_buffer, error, file, messages)
 193+ except MemoryError, error:
 194+ print file, error
 195+ print raw_data[:12]
 196+ print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
 197+ if destination == 'queue':
 198+ output.put('NEXT')
 199+ while True:
 200+ if output.qsize() < 100000:
 201+ break
 202+ else:
 203+ time.sleep(10)
 204+ print 'Still sleeping, queue is %s items long' % output.qsize()
 205+
 206+ else:
 207+ fh.close()
 208+
 209+ if pbar:
 210+ print file, xml_queue.qsize()
 211+ #utils.update_progressbar(pbar, xml_queue)
 212+
 213+ if debug:
 214+ break
 215+
 216+ except Empty:
 217+ break
 218+
 219+ if destination == 'queue':
 220+ data_queue.put(None)
 221+
 222+ if settings.debug:
 223+ utils.report_error_messages(messages, parse_editors)
 224+
 225+
 226+def store_editors(data_queue, **kwargs):
 227+ '''
 228+ @data_queue is an instance of Queue containing information extracted by
 229+ parse_editors()
 230+ @pids is a list of PIDs used to check if other processes are finished
 231+ running
 232+ @dbname is the name of the MongoDB collection where to store the information.
 233+ '''
 234+ dbname = kwargs.get('dbname', None)
 235+ mongo = db.init_mongo_db(dbname)
 236+ collection = mongo['editors']
 237+ collection.ensure_index('editor')
 238+ editor_cache = cache.EditorCache(collection)
 239+
 240+ while True:
 241+ try:
 242+ edit = data_queue.get(block=False)
 243+ data_queue.task_done()
 244+ if edit == None:
 245+ print 'Swallowing poison pill'
 246+ break
 247+ elif edit == 'NEXT':
 248+ editor_cache.add('NEXT', '')
 249+ else:
 250+ contributor = edit['editor']
 251+ value = {'date': edit['date'], 'article': edit['article']}
 252+ editor_cache.add(contributor, value)
 253+ #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
 254+ #'$inc': {'edit_count': 1},
 255+
 256+ except Empty:
 257+ '''
 258+ This checks whether the Queue is empty because the preprocessors are
 259+ finished or because this function is faster in emptying the Queue
 260+ than the preprocessors are able to fill it. If the preprocessors
 261+ are finished and this Queue is empty then break, else wait for the
 262+ Queue to fill.
 263+ '''
 264+ pass
 265+
 266+ print 'Emptying entire cache.'
 267+ editor_cache.store()
 268+ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
 269+
 270+
 271+def load_cache_objects():
 272+ cache = {}
 273+ files = utils.retrieve_file_list(settings.binary_location, '.bin')
 274+ for x, file in enumerate(files):
 275+ cache[x] = utils.load_object(settings.binary_location, file)
 276+ return cache
 277+
 278+
 279+def search_cache_for_missed_editors(dbname):
 280+ mongo = db.init_mongo_db(dbname)
 281+ collection = mongo['editors']
 282+ editor_cache = cache.EditorCache(collection)
 283+ cached = load_cache_objects() #local name must not shadow the cache module
 284+ for c in cached:
 285+ for editor in cached[c]:
 286+ editor_cache.add(editor, cached[c][editor])
 287+ cached[c] = {}
 288+ editor_cache.add('NEXT', '')
 289+ cached = {}
 290+
 291+
 292+
 293+def load_bot_ids():
 294+ '''
 295+ Loader function that retrieves the ids and names of known Wikipedia bots.
 296+ '''
 297+ ids = {}
 298+ mongo = db.init_mongo_db('bots')
 299+ bots = mongo['ids']
 300+ cursor = bots.find()
 301+ for bot in cursor:
 302+ ids[bot['id']] = bot['name']
 303+ return ids
 304+
 305+
 306+def run_parse_editors(location, language, project):
 307+ ids = load_bot_ids()
 308+ base = os.path.join(location, language, project)
 309+ input = os.path.join(base, 'chunks')
 310+ output = os.path.join(base, 'txt')
 311+ settings.verify_environment([input, output])
 312+ files = utils.retrieve_file_list(input, 'xml')
 313+
 314+ kwargs = {'bots': ids,
 315+ 'dbname': language + project,
 316+ 'language': language,
 317+ 'project': project,
 318+ 'pbar': True,
 319+ 'destination': 'file',
 320+ 'nr_input_processors': settings.number_of_processes,
 321+ 'nr_output_processors': settings.number_of_processes,
 322+ 'input': input,
 323+ 'output': output,
 324+ }
 325+
 326+ chunks = utils.split_list(files, settings.number_of_processes)
 327+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
 328+
 329+
 330+def debug_parse_editors(dbname):
 331+ q = JoinableQueue()
 332+ parse_editors('522.xml', q, debug=True, destination='file')
 333+ store_editors(q, dbname=dbname)
 334+
 335+
 336+if __name__ == "__main__":
 337+ #debug_parse_editors('test2')
 338+ run_parse_editors(settings.input_location, 'en', 'wiki')
 339+ pass
Property changes on: trunk/tools/editor_trends/etl/extract.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
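
A quick way to smoke-test the extraction stage on a single chunk is to drive parse_editors() directly in debug mode from within this module; a minimal sketch, assuming a chunk named 0.xml exists under the chunks directory (all paths are examples):

    import os
    base = os.path.join(settings.input_location, 'en', 'wiki')
    kwargs = {'input': os.path.join(base, 'chunks'),
              'output': os.path.join(base, 'txt'),
              'destination': 'file',
              'bots': load_bot_ids(),
              'debug': True}
    parse_editors('0.xml', None, **kwargs)
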
Index: trunk/tools/editor_trends/etl/construct_datasets.py
@@ -0,0 +1,255 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+import datetime
 24+from dateutil.relativedelta import relativedelta
 25+
 26+import progressbar
 27+
 28+import configuration
 29+settings = configuration.Settings()
 30+from utils import models, utils
 31+from database import db
 32+from utils import process_constructor as pc
 33+
 34+try:
 35+ import psyco
 36+ psyco.full()
 37+except ImportError:
 38+ pass
 39+
 40+
 41+def retrieve_editor_ids_mongo(dbname, collection):
 42+ if utils.check_file_exists(settings.binary_location,
 43+ 'editors.bin'):
 44+ ids = utils.load_object(settings.binary_location,
 45+ 'editors.bin')
 46+ else:
 47+ mongo = db.init_mongo_db(dbname)
 48+ editors = mongo[collection]
 49+ ids = editors.distinct('editor')
 50+ utils.store_object(ids, settings.binary_location, 'editors')
 51+ return ids
 52+
 53+
 54+def expand_edits(edits):
 55+ data = []
 56+ for edit in edits:
 57+ data.append(edit['date'])
 58+ return data
 59+
 60+
 61+def expand_observations(obs, vars_to_expand):
 62+ for var in vars_to_expand:
 63+ if var == 'edits':
 64+ obs[var] = expand_edits(obs[var])
 65+ elif var == 'edits_by_year':
 66+ keys = obs[var].keys()
 67+ keys.sort()
 68+ edits = []
 69+ for key in keys:
 70+ edits.append(str(obs[var][key]))
 71+ obs[var] = edits
 72+ return obs
 73+
 74+def write_longitudinal_data(id, edits, fh):
 75+ years = edits.keys()
 76+ years.sort()
 77+ for year in years:
 78+ months = edits[year].keys()
 79+ months = [int(m) for m in months]
 80+ months.sort()
 81+ for m in months:
 82+ date = datetime.date(int(year), int(m), 1)
 83+ fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)]))
 84+
 85+
 86+def expand_headers(headers, vars_to_expand, obs):
 87+ for var in vars_to_expand:
 88+ l = len(obs[var])
 89+ pos = headers.index(var)
 90+ for i in xrange(l):
 91+ if var.endswith('year'):
 92+ suffix = 2001 + i
 93+ elif var.endswith('edits'):
 94+ suffix = 1 + i
 95+ headers.insert(pos + i, '%s_%s' % (var, suffix))
 96+ headers.remove(var)
 97+ return headers
 98+
 99+
 100+def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs):
 101+ debug = kwargs.pop('debug')
 102+ dbname = kwargs.pop('dbname')
 103+ mongo = db.init_mongo_db(dbname)
 104+ editors = mongo['dataset']
 105+ name = dbname + '_long_editors.csv'
 106+ fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
 107+ x = 0
 108+ vars_to_expand = []
 109+ while True:
 110+ try:
 111+ id = input_queue.get(block=False)
 112+ obs = editors.find_one({'editor': id}, {'monthly_edits': 1})
 113+ if x == 0:
 114+ headers = obs.keys()
 115+ headers.sort()
 116+ headers = expand_headers(headers, vars_to_expand, obs)
 117+ utils.write_list_to_csv(headers, fh)
 118+ write_longitudinal_data(id, obs['monthly_edits'], fh)
 119+ #utils.write_list_to_csv(data, fh)
 120+ x += 1
 121+ except Empty:
 122+ break
 123+
 124+
 125+def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs):
 126+ dbname = kwargs.get('dbname')
 127+ pbar = kwargs.get('pbar')
 128+ mongo = db.init_mongo_db(dbname)
 129+ editors = mongo['dataset']
 130+ year = datetime.datetime.now().year + 1
 131+ begin = year - 2001
 132+ p = [3, 6, 9]
 133+ periods = [y * 12 for y in xrange(1, begin)]
 134+ periods = p + periods
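 + #test windows of 3, 6 and 9 months plus every whole year (12, 24, ... months)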
 135+ data = {}
 136+ while True:
 137+ try:
 138+ id = input_queue.get(block=False)
 139+ obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
 140+ first_edit = obs['first_edit']
 141+ last_edit = obs['final_edit']
 142+ for y in xrange(2001, year):
 145+ if y not in data:
 146+ data[y] = {}
 147+ data[y]['n'] = 0
 148+ window_end = datetime.datetime(y, 12, 31)
 149+ if window_end > datetime.datetime.now():
 150+ now = datetime.datetime.now()
 151+ m = now.month - 1 #Dump files are always lagging at least one month....
 152+ d = now.day
 153+ window_end = datetime.datetime(y, m, d)
 154+ edits = []
 155+ for period in periods:
 156+ if period not in data[y]:
 157+ data[y][period] = 0
 158+ window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period)
 159+ if window_start < datetime.datetime(2001, 1, 1):
 160+ window_start = datetime.datetime(2001, 1, 1)
 161+ if date_falls_in_window(window_start, window_end, first_edit, last_edit):
 162+ edits.append(period)
 163+ if edits != []:
 164+ p = min(edits)
 165+ data[y]['n'] += 1
 166+ data[y][p] += 1
 167+ #pbar.update(+1)
 168+ except Empty:
 169+ break
 170+ utils.store_object(data, settings.binary_location, 'cohort_data')
 171+
 172+def date_falls_in_window(window_start, window_end, first_edit, last_edit):
 173+ return window_start <= first_edit <= window_end
 177+
 178+
 179+def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs):
 180+ dbname = kwargs.pop('dbname')
 + debug = kwargs.pop('debug', False)
 181+ mongo = db.init_mongo_db(dbname)
 182+ editors = mongo['dataset']
 183+ name = dbname + '_wide_editors.csv'
 184+ fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
 185+ x = 0
 186+ vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year']
 187+ while True:
 188+ try:
 189+ if debug:
 190+ id = u'99797'
 191+ else:
 192+ id = input_queue.get(block=False)
 193+ print input_queue.qsize()
 194+ obs = editors.find_one({'editor': id})
 195+ obs = expand_observations(obs, vars_to_expand)
 196+ if x == 0:
 197+ headers = obs.keys()
 198+ headers.sort()
 199+ headers = expand_headers(headers, vars_to_expand, obs)
 200+ utils.write_list_to_csv(headers, fh)
 201+ data = []
 202+ keys = obs.keys()
 203+ keys.sort()
 204+ for key in keys:
 205+ data.append(obs[key])
 206+ utils.write_list_to_csv(data, fh)
 207+
 208+ x += 1
 209+ except Empty:
 210+ break
 211+ fh.close()
 212+
 213+
 214+def retrieve_edits_by_contributor_launcher():
 215+ pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
 216+
 217+
 218+def debug_retrieve_edits_by_contributor_launcher(dbname):
 219+ kwargs = {'debug': False,
 220+ 'dbname': dbname,
 221+ }
 222+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 223+ input_queue = pc.load_queue(ids)
 224+ q = Queue()
 225+ generate_wide_editor_dataset(input_queue, q, False, **kwargs)
 226+
 227+
 228+def generate_editor_dataset_launcher(dbname):
 229+ kwargs = {'nr_input_processors': 1,
 230+ 'nr_output_processors': 1,
 231+ 'debug': False,
 232+ 'dbname': dbname,
 233+ 'poison_pill':False,
 234+ 'pbar': True
 235+ }
 236+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 237+ ids = list(ids)
 238+ chunks = dict({0: ids})
 239+ pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs)
 240+
 241+
 242+def generate_editor_dataset_debug(dbname):
 243+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
 244+ input_queue = pc.load_queue(ids)
 245+ kwargs = {'nr_input_processors': 1,
 246+ 'nr_output_processors': 1,
 247+ 'debug': True,
 248+ 'dbname': dbname,
 249+ }
 250+ generate_wide_editor_dataset(input_queue, False, False, **kwargs)
 251+
 252+
 253+if __name__ == '__main__':
 254+ #generate_editor_dataset_debug('test')
 255+ generate_editor_dataset_launcher('enwiki')
 256+ #debug_retrieve_edits_by_contributor_launcher()
Property changes on: trunk/tools/editor_trends/etl/construct_datasets.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
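
Worked example of the window arithmetic in generate_cohort_analysis(): for year 2005 and a 24-month period, the window runs from 2003-12-31 to 2005-12-31, and an editor whose first edit falls inside it is counted in the smallest matching bucket for 2005. A short illustration (dates are examples):

    import datetime
    from dateutil.relativedelta import relativedelta
    window_end = datetime.datetime(2005, 12, 31)
    window_start = window_end - relativedelta(months=24) # 2003-12-31
    first_edit = datetime.datetime(2004, 6, 15)
    print window_start <= first_edit <= window_end # True
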
Index: trunk/tools/editor_trends/etl/__init__.py
Property changes on: trunk/tools/editor_trends/etl/__init__.py
___________________________________________________________________
Added: svn:eol-style
 + native
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -0,0 +1,211 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+import xml.etree.cElementTree as cElementTree
 22+import sys
 23+import codecs
 24+import re
 25+import json
 26+import os
 27+
 28+import progressbar
 29+
 30+
 31+sys.path.append('..')
 32+import configuration
 33+from utils import utils
 34+from wikitree import xml
 35+settings = configuration.Settings()
 36+
 37+try:
 38+ import psyco
 39+ psyco.full()
 40+except ImportError:
 41+ pass
 42+
 43+
 44+RE_NUMERIC_CHARACTER = re.compile(r'&#(\d+);')
 45+
 46+
 47+def remove_numeric_character_references(text):
 48+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
 49+
 50+
 51+def lenient_deccharref(m):
 52+ try:
 53+ return unichr(int(m.group(1)))
 54+ except ValueError:
 55+ '''
 56+ A few articles raise a ValueError here because this is a narrow Python
 57+ build (UCS2) instead of a wide build (UCS4). The quick fix is to return
 58+ an empty string; the real solution is to rebuild Python with UCS4 support.
 59+ '''
 61+ return ''
 62+
 63+
 64+def remove_namespace(element, namespace):
 65+ '''Remove namespace from the XML document.'''
 66+ ns = u'{%s}' % namespace
 67+ nsl = len(ns)
 68+ for elem in element.getiterator():
 69+ if elem.tag.startswith(ns):
 70+ elem.tag = elem.tag[nsl:]
 71+ return element
 72+
 73+
 74+def load_namespace(language):
 75+ file = '%s_ns.json' % language
 76+ fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
 77+ ns = json.load(fh)
 78+ fh.close()
 79+ ns = ns['query']['namespaces']
 80+ return ns
 81+
 82+
 83+def build_namespaces_locale(namespaces):
 84+ '''
 85+ Construct a list of all the non-main namespaces
 86+ '''
 87+ ns = []
 88+ for namespace in namespaces:
 89+ value = namespaces[namespace].get(u'*', None)
 90+ if value != None and value != '':
 91+ ns.append(value)
 92+ return ns
 93+
 94+
 95+def parse_comments(element, function):
 96+ revisions = element.findall('revision')
 97+ for revision in revisions:
 98+ comment = revision.find('comment')
 99+ if comment != None and comment.text != None:
 100+ comment.text = function(comment.text)
 101+ return element
 103+
 104+
 105+def is_article_main_namespace(elem, namespace):
 106+ '''
 107+ Checks whether the article belongs to the main namespace, i.e. its title
 108+ carries no namespace prefix such as 'Talk:'.
 109+ '''
 110+ title = elem.find('title').text
 111+ for ns in namespace:
 112+ if title.startswith(ns + ':'):
 113+ return False
 114+ return True
 114+
 115+
 116+def write_xml_file(element, fh, counter, language):
 117+ '''Get file handle and write xml element to file'''
 118+ size = len(cElementTree.tostring(element))
 119+ fh, counter = create_file_handle(fh, counter, size, language)
 120+ try:
 121+ fh.write(cElementTree.tostring(element))
 122+ except MemoryError:
 123+ print 'Add error capturing logic'
 124+ fh.write('\n')
 125+ return fh, counter
 126+
 127+
 128+def create_file_handle(fh, counter, size, language):
 129+ '''Create file handle if none is supplied or if file size > max file size.'''
 130+ if not counter:
 131+ counter = 0
 132+ if not fh:
 133+ path = os.path.join(settings.input_location, language, '%s.xml' % counter)
 134+ fh = codecs.open(path, 'w', encoding=settings.encoding)
 135+ return fh, counter
 136+ elif (fh.tell() + size) > settings.max_xmlfile_size: #assumed name of the size limit setting
 137+ print 'Created chunk %s' % counter
 138+ fh.close()
 139+ counter += 1
 140+ path = os.path.join(settings.input_location, language, '%s.xml' % counter)
 141+ fh = codecs.open(path, 'w', encoding=settings.encoding)
 142+ return fh, counter
 143+ else:
 144+ return fh, counter
 144+
 145+
 146+def flatten_xml_elements(data, page):
 147+ flat = []
 148+ for x, elems in enumerate(data):
 149+ flat.append([page])
 150+ for elem in elems:
 151+ if elem.tag != 'id':
 152+ if len(elem.getchildren()) > 0:
 153+ for el in elem.getchildren():
 154+ flat[x].append(xml.extract_text(el, None))
 155+ else:
 156+ flat[x].append(xml.extract_text(elem, None))
 157+ return flat
 158+
 159+
 160+def split_file(output, input, project, language_code, language, format='xml'):
 161+ '''Reads xml file and splits it in N chunks'''
 162+ #location = os.path.join(settings.input_location, language)
 163+ output = os.path.join(output, language_code, project)
 164+ settings.verify_environment([output])
 165+ if format == 'xml':
 166+ fh = None
 167+ else:
 168+ f = input.replace('.xml', '')
 169+ fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding)
 170+
 171+ ns = load_namespace(language_code)
 172+ ns = build_namespaces_locale(ns)
 173+
 174+
 175+ counter = None
 176+ tag = '{%s}page' % settings.xml_namespace
 177+
 178+
 179+ context = cElementTree.iterparse(input, events=('start', 'end'))
 180+ context = iter(context)
 181+ event, root = context.next() #get the root element of the XML doc
 182+
 183+ try:
 184+ for event, elem in context:
 185+ if event == 'end':
 186+ if elem.tag == tag:
 187+ elem = remove_namespace(elem, settings.xml_namespace)
 188+ if is_article_main_namespace(elem, ns):
 189+ page = elem.find('id').text
 190+ elem = parse_comments(elem, remove_numeric_character_references)
 191+ if format == 'xml':
 192+ fh, counter = write_xml_file(elem, fh, counter, language_code)
 193+ else:
 194+ data = [el.getchildren() for el in elem if el.tag == 'revision']
 195+ data = flatten_xml_elements(data, page)
 196+ utils.write_list_to_csv(data, fh, recursive=False, newline=True)
 197+ root.clear() # when done parsing a section clear the tree to save memory
 198+ except SyntaxError:
 199+ f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
 200+ f.write(cElementTree.tostring(elem))
 201+ f.close()
 202+ finally:
 203+ fh.close()
 204+
 205+if __name__ == "__main__":
 206+ kwargs = {'output': settings.input_location,
 207+ 'input': settings.input_filename,
 208+ 'project':'wiki',
 209+ 'language_code':'en',
 210+ 'format': 'tsv'
 211+ }
 212+ split_file(**kwargs)
Property changes on: trunk/tools/editor_trends/etl/chunker.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
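
The iterparse/root.clear() combination in split_file() is the standard cElementTree pattern for streaming multi-gigabyte dumps with a flat memory footprint. A minimal self-contained sketch of the same pattern (the file name and tag test are examples):

    import xml.etree.cElementTree as cElementTree

    context = cElementTree.iterparse('dump.xml', events=('start', 'end'))
    context = iter(context)
    event, root = context.next() #grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag.endswith('page'):
            #process the completed <page> element here, then free it
            root.clear() #drop finished children to keep memory use flat
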
Index: trunk/tools/editor_trends/etl/xml2pig.py
@@ -0,0 +1,30 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-15'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import os
 25+import xml.etree.cElementTree as cElementTree
 26+
 27+import configuration
 28+settings = configuration.Settings()
 29+import chunker #assumed: chunker.py replaces the old split_xml_file module
 30+
 31+
Index: trunk/tools/editor_trends/etl/loader.py
@@ -0,0 +1,140 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-16'
 19+__version__ = '0.1'
 20+
 21+
 22+import sys
 +import os
 +from Queue import Empty
 23+
 24+sys.path.append('..')
 25+import configuration
 26+settings = configuration.Settings()
 27+from database import db
 28+from database import cache
 29+from utils import utils
 30+import process_constructor as pc
 31+
 32+
 33+def store_editors(input, filename, dbname):
 34+ fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
 35+ mongo = db.init_mongo_db(dbname)
 36+ collection = mongo['test']
 37+ collection.ensure_index('editor')
 39+ editor_cache = cache.EditorCache(collection)
 40+ prev_contributor = -1
 41+ x = 0
 42+ edits = 0
 43+ editors = set()
 44+ for line in utils.readline(fh): #assumes a line-splitting generator in utils
 45+ if len(line) == 0:
 46+ continue
 47+ contributor = int(line[0])
 50+ if prev_contributor != contributor:
 51+ if edits >= 10:
 52+ result = editor_cache.add(prev_contributor, 'NEXT')
 53+ if result:
 54+ editors.add(prev_contributor)
 55+ result = None
 56+ x += 1
 57+ print 'Stored %s editors' % x
 58+ else:
 59+ editor_cache.clear(prev_contributor)
 60+ edits = 0
 61+ edits += 1
 62+ date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1)
 63+ article_id = int(line[2])
 64+ value = {'date': date, 'article': article_id}
 65+ editor_cache.add(contributor, value)
 66+ prev_contributor = contributor
 67+ fh.close()
 68+ utils.store_object(editors, settings.binary_location, 'editors')
 69+
 70+
 71+def mergesort_external_launcher(dbname, input, output):
 72+ files = utils.retrieve_file_list(input, 'txt', mask='')
 73+ x = 0
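 + #increase the number of groups until each group needs fewer open file
 + #handles than the operating system allows (settings.max_filehandles)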
 74+ maxval = 99999
 75+ while maxval >= settings.max_filehandles:
 76+ x += 1.0
 77+ maxval = round(len(files) / x)
 78+ chunks = utils.split_list(files, int(x))
 79+ '''1st iteration external mergesort'''
 80+ for chunk in chunks:
 81+ filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
 82+ filename = merge_sorted_files(output, filehandles, chunk)
 83+ filehandles = [fh.close() for fh in filehandles]
 84+ pass
 85+ '''2nd iteration external mergesort, if necessary'''
 86+ if len(chunks) > 1:
 87+ files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
 88+ filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files]
 89+ filename = merge_sorted_files(output, filehandles, 'final')
 90+ filehandles = [fh.close() for fh in filehandles]
 91+ filename = 'merged_final.txt'
 92+ store_editors(output, filename, dbname)
 93+
 94+
 95+def mergesort_feeder(input_queue, result_queue, **kwargs):
 96+ input = kwargs.get('input', None)
 97+ output = kwargs.get('output', None)
 98+ while True:
 99+ try:
 100+ file = input_queue.get(block=False)
 101+ fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
 102+ data = fh.readlines()
 103+ fh.close()
 104+ data = [d.replace('\n', '') for d in data]
 105+ data = [d.split('\t') for d in data]
 106+ sorted_data = mergesort(data)
 107+ write_sorted_file(sorted_data, file, output)
 108+ except Empty:
 109+ break
 110+
 111+
 112+def mergesort_launcher(input, output):
 113+ kwargs = {'pbar': True,
 114+ 'nr_input_processors': settings.number_of_processes,
 115+ 'nr_output_processors': settings.number_of_processes,
 116+ 'input': input,
 117+ 'output': output,
 118+ 'poison_pill': False
 119+ }
 120+ files = utils.retrieve_file_list(input, 'txt')
 121+ chunks = utils.split_list(files, settings.number_of_processes)
 122+ pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
 123+
 124+
 125+def debug_mergesort_feeder(input, output):
 126+ kwargs = {
 127+ 'input': input,
 128+ 'output': output,
 129+ }
 130+ files = utils.retrieve_file_list(input, 'txt')
 131+ chunks = utils.split_list(files, settings.number_of_processes)
 132+ q = pc.load_queue(chunks[0])
 133+ mergesort_feeder(q, False, **kwargs)
 134+
 135+
 136+if __name__ == '__main__':
 137+ input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
 138+ output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
 139+ dbname = 'enwiki'
 140+ mergesort_launcher(input, output)
 141+ mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/etl/loader.py
___________________________________________________________________
Added: svn:eol-style
 + native
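
Note that mergesort() and write_sorted_file(), called from mergesort_feeder(), are not defined or imported in this revision. A minimal sketch of what the call sites appear to expect, with rows compared lexicographically so ties on editor id fall back to the timestamp column (names and behaviour are assumptions):

    def mergesort(data):
        '''Recursively merge-sort rows of [editor_id, timestamp, article_id].'''
        if len(data) <= 1:
            return data
        mid = len(data) / 2
        left, right = mergesort(data[:mid]), mergesort(data[mid:])
        result = []
        while left and right:
            result.append(left.pop(0) if left[0] <= right[0] else right.pop(0))
        return result + left + right

    def write_sorted_file(sorted_data, file, output):
        '''Write tab-separated rows to the output directory.'''
        fh = utils.create_txt_filehandle(output, file, 'w', settings.encoding)
        for line in sorted_data:
            fh.write('\t'.join(line) + '\n')
        fh.close()
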
Index: trunk/tools/editor_trends/etl/bots.py
@@ -0,0 +1,123 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import os
 19+import cStringIO
 20+import xml.etree.cElementTree as cElementTree
 21+
 22+
 23+import configuration
 24+settings = configuration.Settings()
 25+from wikitree import xml
 26+from database import db
 27+from database import db_settings
 28+from utils import utils
 29+from utils import process_constructor as pc
 30+
 31+try:
 32+ import psyco
 33+ psyco.full()
 34+except ImportError:
 35+ pass
 36+
 37+
 38+def create_bot_ids_db_mongo():
 39+ ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding)
 40+ mongo = db.init_mongo_db('bots')
 41+ collection = mongo['ids']
 42+
 43+ db.remove_documents_from_mongo_db(collection, None)
 44+
 45+ for id, name in ids.iteritems():
 46+ collection.insert({'id': id, 'name': name})
 47+
 48+ print collection.count()
 49+
 50+
 51+def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
 52+ '''
 53+ This function finds the ids belonging to the different bots that patrol
 54+ the Wikipedia sites.
 55+ @input_queue contains a list of xml files to parse
 56+
 57+ @result_queue should be set to False as the results are written directly to
 58+ a csv file.
 59+
 60+ @progressbar depends on settings
 61+
 62+ @bots is a dictionary containing the names of the bots to look up
 63+ '''
 64+
 65+ #if len(bots.keys()) == 1:
 66+ bots = bots['bots']
 67+ #print bots.keys()
 68+
 69+ if settings.debug:
 70+ messages = {}
 71+
 72+ while True:
 73+ if debug:
 74+ file = input_queue
 75+ else:
 76+ file = input_queue.get(block=False)
 77+
 78+ if file == None:
 79+ break
 80+
 81+ data = xml.read_input(utils.open_txt_file(settings.input_location +
 82+ file, 'r', encoding=settings.encoding))
 83+
 84+ for raw_data in data:
 85+ xml_buffer = cStringIO.StringIO()
 86+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 87+ raw_data = ''.join(raw_data)
 88+ raw_data = raw_data.encode('utf-8')
 89+ xml_buffer.write(raw_data)
 90+
 91+ try:
 92+ xml_nodes = cElementTree.XML(xml_buffer.getvalue())
 93+ revisions = xml_nodes.findall('revision')
 94+ for revision in revisions:
 95+ contributor = xml.retrieve_xml_node(revision, 'contributor')
 96+ username = contributor.find('username')
 97+ if username == None:
 98+ continue
 99+ username = xml.extract_text(username)
 100+ #print username.encode('utf-8')
 101+
 102+ if username in bots:
 103+ id = contributor.find('id')
 104+ id = xml.extract_text(id)
 105+ #print username.encode('utf-8'), id
 106+ utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding)
 107+ bots.pop(username)
 108+ if bots == {}:
 109+ print 'Mission accomplished'
 110+ return
 111+ except Exception, error:
 112+ print error
 113+ if settings.debug:
 114+ messages = utils.track_errors(xml_buffer, error, file,
 115+ messages)
 116+
 117+ if settings.debug:
 118+ utils.report_error_messages(messages, lookup_username)
 119+
 120+
 121+if __name__ == '__main__':
 122+ #debug()
 123+ #add_id_to_botnames()
 124+ create_bot_ids_db_mongo()
Property changes on: trunk/tools/editor_trends/etl/bots.py
___________________________________________________________________
Added: svn:mime-type
 + text/plain
Added: svn:eol-style
 + native
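
The 'bots' database written by create_bot_ids_db_mongo() is what load_bot_ids() in extract.py reads back at the start of a run. A quick sanity check of the stored documents (assuming a local MongoDB with the database and collection names used above):

    from database import db
    mongo = db.init_mongo_db('bots')
    for bot in mongo['ids'].find().limit(3):
        print bot['id'], bot['name']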

Status & tagging log