Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | |
174 | 174 | def launch_zip_extractor(args, location, file): |
175 | 175 | timer = Timer() |
176 | | - utils.zip_extract(location, file, compression='7z') |
| 176 | + utils.zip_extract(location, file) |
177 | 177 | timer.elapsed() |
178 | 178 | |
179 | 179 | |
— | — | @@ -211,7 +211,8 @@ |
212 | 212 | print 'dataset launcher' |
213 | 213 | timer = Timer() |
214 | 214 | project = kwargs.pop('full_project') |
215 | | - transformer.run_optimize_editors(project) |
| 215 | + collection = kwargs.pop('collection') |
| 216 | + transformer.run_optimize_editors(project, collection) |
216 | 217 | timer.elapsed() |
217 | 218 | |
218 | 219 | |
— | — | @@ -313,6 +314,9 @@ |
314 | 315 | |
315 | 316 | parser_transform = subparsers.add_parser('transform', help='Transform the raw database into an enriched dataset that can be exported.') |
316 | 317 | parser_transform.set_defaults(func=transformer_launcher) |
| 318 | + parser_transform.add_argument('-c', '--collection', action='store', |
| 319 | + help='Name of MongoDB collection', |
| 320 | + default='editors') |
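| | + # Example invocation (assumed wiring; the exact CLI usage may differ): |
| | + # python manage.py transform --collection editors |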
317 | 321 | |
318 | 322 | parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.') |
319 | 323 | parser_dataset.set_defaults(func=exporter_launcher) |
Index: trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-24' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import sys |
| 8 | +sys.path.append('..') |
| 9 | +from Queue import Empty |
| 10 | + |
| 11 | +import configuration |
| 12 | +settings = configuration.Settings() |
| 13 | +from utils import utils |
| 14 | +from database import db |
| 15 | +from etl.exporter import expand_headers, expand_observations # assumption: these helpers live in etl/exporter.py |
| 16 | + |
| 17 | + |
| 18 | +def dataset_edits_by_month(input_queue, **kwargs): |
| 19 | + dbname = kwargs.pop('dbname') |
| 20 | + mongo = db.init_mongo_db(dbname) |
| 21 | + editors = mongo['dataset'] |
| 22 | + name = dbname + '_edits_by_month.csv' |
| 23 | + fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
| 24 | + x = 0 |
| 25 | + vars_to_expand = ['monthly_edits'] |
| 26 | + while True: |
| 27 | + try: |
| 28 | + id = input_queue.get(block=False) |
| 29 | + print input_queue.qsize() |
| 30 | + obs = editors.find_one({'editor': id}) |
| 31 | + obs = expand_observations(obs, vars_to_expand) |
| 32 | + if x == 0: |
| 33 | + headers = obs.keys() |
| 34 | + headers.sort() |
| 35 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 36 | + utils.write_list_to_csv(headers, fh) |
| 37 | + data = [] |
| 38 | + keys = obs.keys() |
| 39 | + keys.sort() |
| 40 | + for key in keys: |
| 41 | + data.append(obs[key]) |
| 42 | + utils.write_list_to_csv(data, fh) |
| 43 | + x += 1 |
| 44 | + except Empty: |
| 45 | + break |
| 46 | + fh.close() |
| 47 | + |
| 48 | + |
| 49 | +if __name__ == '__main__': |
| 50 | + pass |
\ No newline at end of file |
Index: trunk/tools/editor_trends/etl/bots.py |
— | — | @@ -1,123 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | - |
18 | | -import os |
19 | | -import cStringIO |
20 | | -import xml.etree.cElementTree as cElementTree |
21 | | - |
22 | | - |
23 | | -import configuration |
24 | | -settings = configuration.Settings() |
25 | | -from wikitree import xml |
26 | | -from database import db |
27 | | -from database import db_settings |
28 | | -from utils import utils |
29 | | -from utils import process_constructor as pc |
30 | | - |
31 | | -try: |
32 | | - import psyco |
33 | | - psyco.full() |
34 | | -except ImportError: |
35 | | - pass |
36 | | - |
37 | | - |
38 | | -def create_bot_ids_db_mongo(): |
39 | | - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding) |
40 | | - mongo = db.init_mongo_db('bots') |
41 | | - collection = mongo['ids'] |
42 | | - |
43 | | - db.remove_documents_from_mongo_db(collection, None) |
44 | | - |
45 | | - for id, name in ids.iteritems(): |
46 | | - collection.insert({'id': id, 'name': name}) |
47 | | - |
48 | | - print collection.count() |
49 | | - |
50 | | - |
51 | | -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False): |
52 | | - ''' |
53 | | - This function is used to find the id's belonging to the different bots that |
54 | | - are patrolling the Wikipedia sites. |
55 | | - @input_queue contains a list of xml files to parse |
56 | | - |
57 | | - @result_queue should be set to false as the results are directly written to |
58 | | - a csv file. |
59 | | - |
60 | | - @progressbar depends on settings |
61 | | - |
62 | | - @bots is a dictionary containing the names of the bots to lookup |
63 | | - ''' |
64 | | - |
65 | | - #if len(bots.keys()) == 1: |
66 | | - bots = bots['bots'] |
67 | | - #print bots.keys() |
68 | | - |
69 | | - if settings.debug: |
70 | | - messages = {} |
71 | | - |
72 | | - while True: |
73 | | - if debug: |
74 | | - file = input_queue |
75 | | - else: |
76 | | - file = input_queue.get(block=False) |
77 | | - |
78 | | - if file == None: |
79 | | - break |
80 | | - |
81 | | - data = xml.read_input(utils.open_txt_file(settings.input_location + |
82 | | - file, 'r', encoding=settings.encoding)) |
83 | | - |
84 | | - for raw_data in data: |
85 | | - xml_buffer = cStringIO.StringIO() |
86 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
87 | | - raw_data = ''.join(raw_data) |
88 | | - raw_data = raw_data.encode('utf-8') |
89 | | - xml_buffer.write(raw_data) |
90 | | - |
91 | | - try: |
92 | | - xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
93 | | - revisions = xml_nodes.findall('revision') |
94 | | - for revision in revisions: |
95 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
96 | | - username = contributor.find('username') |
97 | | - if username == None: |
98 | | - continue |
99 | | - username = xml.extract_text(username) |
100 | | - #print username.encode('utf-8') |
101 | | - |
102 | | - if username in bots: |
103 | | - id = contributor.find('id') |
104 | | - id = xml.extract_text(id) |
105 | | - #print username.encode('utf-8'), id |
106 | | - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding) |
107 | | - bots.pop(username) |
108 | | - if bots == {}: |
109 | | - print 'Mission accomplished' |
110 | | - return |
111 | | - except Exception, error: |
112 | | - print error |
113 | | - if settings.debug: |
114 | | - messages = utils.track_errors(xml_buffer, error, file, |
115 | | - messages) |
116 | | - |
117 | | - if settings.debug: |
118 | | - utils.report_error_messages(messages, lookup_username) |
119 | | - |
120 | | - |
121 | | -if __name__ == '__main__': |
122 | | - #debug() |
123 | | - #add_id_to_botnames() |
124 | | - create_bot_ids_db_mongo() |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -17,19 +17,23 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import sys |
| 22 | +import datetime |
| 23 | +from dateutil.relativedelta import * |
| 24 | +import calendar |
21 | 25 | from multiprocessing import Queue |
22 | 26 | from Queue import Empty |
23 | | -import datetime |
24 | | -from dateutil.relativedelta import * |
25 | | -import sys |
26 | | -import progressbar |
27 | 27 | |
| 28 | + |
| 29 | + |
28 | 30 | sys.path.append('..') |
29 | 31 | import configuration |
30 | 32 | settings = configuration.Settings() |
31 | 33 | from utils import models, utils |
32 | 34 | from database import db |
| 35 | +from etl import shaper |
33 | 36 | from utils import process_constructor as pc |
| 37 | +import progressbar |
34 | 38 | |
35 | 39 | try: |
36 | 40 | import psyco |
— | — | @@ -38,6 +42,91 @@ |
39 | 43 | pass |
40 | 44 | |
41 | 45 | |
| 46 | +class Variable(object): |
| 47 | + |
| 48 | + def __init__(self, var): |
| 49 | + self.name = var |
| 50 | + self.stats = ['n', 'avg', 'sd', 'min', 'max'] |
| 51 | + self.time = shaper.create_datacontainer() |
| 52 | + self.time = shaper.add_months_to_datacontainer(self.time, datatype='dict') |
| 53 | + |
| 54 | + for stat in self.stats: |
| 55 | + setattr(self, stat, shaper.create_datacontainer()) |
| 56 | + setattr(self, stat, shaper.add_months_to_datacontainer(getattr(self, stat), datatype='list')) |
| 57 | + |
| 58 | + def __repr__(self): |
| 59 | + return self.name |
| 60 | + |
| 61 | + def descriptives(self): |
| 62 | + for year in self.time: |
| 63 | + for month in self.time[year]: |
| 64 | + data = self.time[year][month].values() |
| 65 | + if data == []: |
| 66 | + continue # month without observations; min()/max() would fail |
| 67 | + self.avg[year][month] = shaper.get_mean(data) |
| 68 | + self.sd[year][month] = shaper.get_standard_deviation(data) |
| 69 | + self.min[year][month] = min(data) |
| 70 | + self.max[year][month] = max(data) |
| 71 | + self.n[year][month] = len(data) |
| 71 | + |
| 72 | +class LongDataset(object): |
| 73 | + |
| 74 | + def __init__(self, vars): |
| 75 | + self.name = 'long_dataset.tsv' |
| 76 | + self.vars = [] |
| 77 | + for var in vars: |
| 78 | + setattr(self, var, Variable(var)) |
| 79 | + self.vars.append(var) |
| 80 | + |
| 81 | + def __repr__(self): |
| 82 | + return 'Dataset containing: %s' % (self.vars) |
| 83 | + |
| 84 | + def write_headers(self, fh): |
| 85 | + fh.write('_time\t') |
| 86 | + for var in self.vars: |
| 87 | + var = getattr(self, var) |
| 88 | + for stat in var.stats: |
| 89 | + fh.write('%s_%s\t' % (var.name, stat)) |
| 90 | + fh.write('\n') |
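| | + # Illustration (assuming vars = ['monthly_edits']): the header row written |
| | + # above is tab-separated: |
| | + # _time monthly_edits_n monthly_edits_avg monthly_edits_sd monthly_edits_min monthly_edits_max |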
| 91 | + |
| 92 | + def convert_to_longitudinal_data(self, id, obs, vars): |
| 93 | + for var in vars: |
| 94 | + ds = getattr(self, var) |
| 95 | + years = obs[var].keys() |
| 96 | + for year in years: |
| 97 | + months = obs[var][year].keys() |
| 98 | + for m in months: |
| 99 | + #d = calendar.monthrange(int(year), int(m))[1] #determines the number of days in a given month/year |
| 100 | + #date = datetime.date(int(year), int(m), d) |
| 101 | + ds.time[year][m][id] = obs[var][year][str(m)] |
| 104 | + |
| 105 | + def write_longitudinal_data(self): |
| 106 | + fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding) |
| 107 | + self.write_headers(fh) |
| 108 | + dc = shaper.create_datacontainer() |
| 109 | + dc = shaper.add_months_to_datacontainer(dc) |
| 110 | + |
| 111 | + for var in self.vars: |
| 112 | + var = getattr(self, var) |
| 113 | + var.descriptives() |
| 114 | + years = dc.keys() |
| 115 | + years.sort() |
| 116 | + for year in years: |
| 117 | + months = dc[year].keys() |
| 118 | + months.sort() |
| 119 | + for month in months: |
| 120 | + d = calendar.monthrange(int(year), int(month))[1] #determines the number of days in a given month/year |
| 121 | + date = datetime.date(int(year), int(month), d) |
| 122 | + fh.write('%s\t' % date) |
| 123 | + for var in self.vars: |
| 124 | + var = getattr(self, var) |
| 125 | + #data = ['%s_%s\t' % (var.name, getattr(var, stat)[year][month]) for stat in var.stats] |
| 126 | + fh.write(''.join(['%s\t' % (getattr(var, stat)[year][month],) for stat in var.stats])) |
| 127 | + fh.write('\n') |
| 128 | + fh.close() |
| 129 | + |
| 130 | + |
42 | 131 | def retrieve_editor_ids_mongo(dbname, collection): |
43 | 132 | if utils.check_file_exists(settings.binary_location, |
44 | 133 | 'editors.bin'): |
— | — | @@ -71,16 +160,6 @@ |
72 | 161 | obs[var] = edits |
73 | 162 | return obs |
74 | 163 | |
75 | | -def write_longitudinal_data(id, edits, fh): |
76 | | - years = edits.keys() |
77 | | - years.sort() |
78 | | - for year in years: |
79 | | - months = edits[year].keys() |
80 | | - months = [int(m) for m in months] |
81 | | - months.sort() |
82 | | - for m in months: |
83 | | - date = datetime.date(int(year), int(m), 1) |
84 | | - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
85 | 164 | |
86 | 165 | |
87 | 166 | def expand_headers(headers, vars_to_expand, obs): |
— | — | @@ -97,32 +176,28 @@ |
98 | 177 | return headers |
99 | 178 | |
100 | 179 | |
101 | | -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
102 | | - debug = kwargs.pop('debug') |
| 180 | +def generate_long_editor_dataset(input_queue, vars, **kwargs): |
103 | 181 | dbname = kwargs.pop('dbname') |
104 | 182 | mongo = db.init_mongo_db(dbname) |
105 | 183 | editors = mongo['dataset'] |
106 | 184 | name = dbname + '_long_editors.csv' |
107 | | - fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
108 | | - x = 0 |
| 185 | + #fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
109 | 186 | vars_to_expand = [] |
| 187 | + keys = dict([(var, 1) for var in vars]) |
| 188 | + ld = LongDataset(vars) |
110 | 189 | while True: |
111 | 190 | try: |
112 | 191 | id = input_queue.get(block=False) |
113 | | - obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
114 | | - if x == 0: |
115 | | - headers = obs.keys() |
116 | | - headers.sort() |
117 | | - headers = expand_headers(headers, vars_to_expand, obs) |
118 | | - utils.write_list_to_csv(headers, fh) |
119 | | - write_longitudinal_data(id, obs['monthly_edits'], fh) |
| 192 | + print id |
| 193 | + obs = editors.find_one({'editor': id}, keys) |
| 194 | + ld.convert_to_longitudinal_data(id, obs, vars) |
120 | 195 | #utils.write_list_to_csv(data, fh) |
121 | | - x += 1 |
122 | 196 | except Empty: |
123 | 197 | break |
| 198 | + ld.write_longitudinal_data() |
124 | 199 | |
125 | 200 | |
126 | | -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
| 201 | +def generate_cohort_analysis(input_queue, **kwargs): |
127 | 202 | dbname = kwargs.get('dbname') |
128 | 203 | pbar = kwargs.get('pbar') |
129 | 204 | mongo = db.init_mongo_db(dbname) |
— | — | @@ -169,6 +244,7 @@ |
170 | 245 | break |
171 | 246 | utils.store_object(data, settings.binary_location, 'cohort_data') |
172 | 247 | |
| 248 | + |
173 | 249 | def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
174 | 250 | if first_edit >= window_start and first_edit <= window_end: |
175 | 251 | return True |
— | — | @@ -176,7 +252,7 @@ |
177 | 253 | return False |
178 | 254 | |
179 | 255 | |
180 | | -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 256 | +def generate_wide_editor_dataset(input_queue, **kwargs): |
181 | 257 | dbname = kwargs.pop('dbname') |
182 | 258 | mongo = db.init_mongo_db(dbname) |
183 | 259 | editors = mongo['dataset'] |
— | — | @@ -241,16 +317,19 @@ |
242 | 318 | |
243 | 319 | def generate_editor_dataset_debug(dbname): |
244 | 320 | ids = retrieve_editor_ids_mongo(dbname, 'editors') |
| 321 | + #ids = list(ids)[:1000] |
245 | 322 | input_queue = pc.load_queue(ids) |
246 | 323 | kwargs = {'nr_input_processors': 1, |
247 | 324 | 'nr_output_processors': 1, |
248 | 325 | 'debug': True, |
249 | 326 | 'dbname': dbname, |
250 | 327 | } |
251 | | - generate_editor_dataset(input_queue, False, False, kwargs) |
| 328 | + #generate_editor_dataset(input_queue, False, False, kwargs) |
| 329 | + vars = ['monthly_edits'] |
| 330 | + generate_long_editor_dataset(input_queue, vars, **kwargs) |
252 | 331 | |
253 | | - |
254 | 332 | if __name__ == '__main__': |
255 | 333 | #generate_editor_dataset_debug('test') |
256 | | - generate_editor_dataset_launcher('enwiki') |
| 334 | + #generate_editor_dataset_launcher('enwiki') |
| 335 | + generate_editor_dataset_debug('enwiki') |
257 | 336 | #debug_retrieve_edits_by_contributor_launcher() |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -135,29 +135,25 @@ |
136 | 136 | |
137 | 137 | |
138 | 138 | |
139 | | -def determine_username_is_bot(username, kwargs): |
| 139 | +def determine_username_is_bot(contributor, bots): |
140 | 140 | ''' |
141 | | - @username is the xml element containing the id of the user |
142 | | - @kwargs should have a list with all the bot ids |
143 | | - |
144 | | - @Return False if username id is not in bot list id or True if username id |
| 141 | + @contributor is an xml element containing the id of the contributor |
| 142 | + @bots should be a dict with all the bot ids and bot names |
| 143 | + @Return False if the contributor id is not in the bot dict or True if it |
145 | 144 | is a bot id. |
146 | 145 | ''' |
147 | | - ids = kwargs.get('bots', []) |
148 | | - if ids == None: |
149 | | - ids = [] |
150 | | - if username != None and username.text != None: |
151 | | - id = username.text |
152 | | - if id in ids: |
153 | | - return 1 |
154 | | - else: |
155 | | - return 0 |
| 146 | + for elem in contributor: |
| 147 | + if elem.tag == 'id': |
| 148 | + if elem.text in bots['bots']: |
| 149 | + return 1 |
| 150 | + else: |
| 151 | + return 0 |
| 152 | + return 0 # contributor has no <id> element |
156 | 152 | |
157 | 153 | |
158 | 154 | def extract_username(contributor, kwargs): |
159 | 155 | for elem in contributor: |
160 | 156 | if elem.tag == 'username': |
161 | | - return elem.text #.encode(settings.encoding) |
| 157 | + return elem.text |
162 | 158 | else: |
163 | 159 | return None |
164 | 160 | |
— | — | @@ -167,16 +163,14 @@ |
168 | 164 | @contributor is the xml contributor node containing a number of attributes |
169 | 165 | |
170 | 166 | Currently, we are only interested in registered contributors, hence we |
171 | | - ignore anonymous editors. If you are interested in collecting data on |
172 | | - anonymous editors then add the string 'ip' to the tags variable. |
| 167 | + ignore anonymous editors. |
173 | 168 | ''' |
174 | | - tags = ['id'] |
175 | 169 | if contributor.get('deleted'): |
176 | 170 | return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
177 | 171 | for elem in contributor: |
178 | | - if elem.tag in tags: |
| 172 | + if elem.tag == 'id': |
179 | 173 | if elem.text != None: |
180 | | - return elem.text.encode(settings.encoding) |
| 174 | + return elem.text |
181 | 175 | else: |
182 | 176 | return - 1 |
183 | 177 | |
— | — | @@ -209,6 +203,8 @@ |
210 | 204 | vars[var] = function(xml_node, kwargs) |
211 | 205 | |
212 | 206 | #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
213 | 209 | if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
214 | 210 | vars.pop('bot') |
215 | 211 | if destination == 'queue': |
— | — | @@ -222,100 +218,6 @@ |
223 | 219 | vars = {} |
224 | 220 | |
225 | 221 | |
226 | | -#def parse_editors(xml_queue, data_queue, **kwargs): |
227 | | -# ''' |
228 | | -# @xml_queue contains the filenames of the files to be parsed |
229 | | -# @data_queue is an instance of Queue where the extracted data is stored for |
230 | | -# further processing |
231 | | -# @pbar is an instance of progressbar to display the progress |
232 | | -# @bots is a list of id's of known Wikipedia bots |
233 | | -# @debug is a flag to indicate whether the function is called for debugging. |
234 | | -# |
235 | | -# Output is the data_queue that will be used by store_editors() |
236 | | -# ''' |
237 | | -# input = kwargs.get('input', None) |
238 | | -# output = kwargs.get('output', None) |
239 | | -# debug = kwargs.get('debug', False) |
240 | | -# destination = kwargs.get('destination', 'file') |
241 | | -# bots = kwargs.get('bots', None) |
242 | | -# pbar = kwargs.get('pbar', None) |
243 | | -# if settings.debug: |
244 | | -# messages = {} |
245 | | -# vars = {} |
246 | | -# |
247 | | -# while True: |
248 | | -# try: |
249 | | -# if debug: |
250 | | -# file = xml_queue |
251 | | -# else: |
252 | | -# file = xml_queue.get(block=False) |
253 | | -# if file == None: |
254 | | -# print 'Swallowed a poison pill' |
255 | | -# break |
256 | | -# |
257 | | -# data = xml.read_input(utils.create_txt_filehandle(input, |
258 | | -# file, 'r', |
259 | | -# encoding=settings.encoding)) |
260 | | -# if destination == 'file': |
261 | | -# name = file[:-4] + '.txt' |
262 | | -# fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding) |
263 | | -# for raw_data in data: |
264 | | -# xml_buffer = cStringIO.StringIO() |
265 | | -# raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
266 | | -# |
267 | | -# try: |
268 | | -# raw_data = ''.join(raw_data) |
269 | | -# xml_buffer.write(raw_data) |
270 | | -# elem = cElementTree.XML(xml_buffer.getvalue()) |
271 | | -# output_editor_information(elem, fh, bots=bots, destination=destination) |
272 | | -# except SyntaxError, error: |
273 | | -# print error |
274 | | -# ''' |
275 | | -# There are few cases with invalid tokens, they are fixed |
276 | | -# here and then reinserted into the XML DOM |
277 | | -# data = convert_html_entities(xml_buffer.getvalue()) |
278 | | -# elem = cElementTree.XML(data) |
279 | | -# output_editor_information(elem) |
280 | | -# ''' |
281 | | -# if settings.debug: |
282 | | -# utils.track_errors(xml_buffer, error, file, messages) |
283 | | -# except UnicodeEncodeError, error: |
284 | | -# print error |
285 | | -# if settings.debug: |
286 | | -# utils.track_errors(xml_buffer, error, file, messages) |
287 | | -# except MemoryError, error: |
288 | | -# print file, error |
289 | | -# print raw_data[:12] |
290 | | -# print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
291 | | -# if destination == 'queue': |
292 | | -# output.put('NEXT') |
293 | | -# while True: |
294 | | -# if output.qsize() < 100000: |
295 | | -# break |
296 | | -# else: |
297 | | -# time.sleep(10) |
298 | | -# print 'Still sleeping, queue is %s items long' % output.qsize() |
299 | | -# |
300 | | -# else: |
301 | | -# fh.close() |
302 | | -# |
303 | | -# if pbar: |
304 | | -# print file, xml_queue.qsize() |
305 | | -# #utils.update_progressbar(pbar, xml_queue) |
306 | | -# |
307 | | -# if debug: |
308 | | -# break |
309 | | -# |
310 | | -# except Empty: |
311 | | -# break |
312 | | -# |
313 | | -# if destination == 'queue': |
314 | | -# data_queue.put(None) |
315 | | -# |
316 | | -# if settings.debug: |
317 | | -# utils.report_error_messages(messages, parse_editors) |
318 | | - |
319 | | - |
320 | 222 | def load_bot_ids(): |
321 | 223 | ''' |
322 | 224 | Loader function to retrieve list of id's of known Wikipedia bots. |
— | — | @@ -352,12 +254,14 @@ |
353 | 255 | tasks.join() |
354 | 256 | |
355 | 257 | |
356 | | -def debug_parse_editors(dbname): |
357 | | - q = JoinableQueue() |
358 | | - parse_editors('522.xml', q, None, None, debug=True, destination='file') |
359 | | - store_editors(q, [], dbname) |
| 258 | +def debug_parse_editors(location): |
| 259 | + bots = load_bot_ids() |
| 260 | + input = os.path.join(location, 'chunks') |
| 261 | + output = os.path.join(location, 'txt') |
| 262 | + xml_file = XMLFile(input, output, '1.xml', bots, output_editor_information, destination='file') |
| 263 | + xml_file() |
360 | 264 | |
361 | | - |
362 | | -if __name__ == "__main__": |
363 | | - #debug_parse_editors('test2') |
364 | | - run_parse_editors(os.path.join(settings.input_location, 'en', 'wiki')) |
| 265 | +if __name__ == '__main__': |
| 266 | + location = os.path.join(settings.input_location, 'en', 'wiki') |
| 267 | + debug_parse_editors(location) |
| 268 | + #run_parse_editors(location) |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | + |
| 3 | + |
| 4 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 5 | +__author__email = 'dvanliere at gmail dot com' |
| 6 | +__date__ = '2010-11-24' |
| 7 | +__version__ = '0.1' |
| 8 | + |
| 9 | +import datetime |
| 10 | +import math |
| 11 | + |
| 12 | +def create_datacontainer(init_value=0): |
| 13 | + ''' |
| 14 | + This function initializes a dictionary keyed by year (starting in 2001 and |
| 15 | + running through the current year) with @init_value as the value. In most |
| 16 | + cases this will be zero, so the dictionary acts as a running tally for a |
| 17 | + variable, but @init_value can also be a list, [], a dictionary, {}, or a set, set(). |
| 18 | + ''' |
| 19 | + data = {} |
| 20 | + year = datetime.datetime.now().year + 1 |
| 21 | + for x in xrange(2001, year): |
| 22 | + if init_value == 'set': |
| 23 | + data[str(x)] = set() |
| 24 | + else: |
| 25 | + data[str(x)] = init_value |
| 26 | + return data |
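| | +# Sketch of the resulting structure: create_datacontainer('set') yields |
| | +# {'2001': set(), '2002': set(), ..., '<current year>': set()}. Note that a |
| | +# mutable @init_value such as [] or {} is shared across all years, because the |
| | +# same object is assigned to every key; 'set' is special-cased for that reason. |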
| 27 | + |
| 28 | + |
| 29 | +def add_months_to_datacontainer(datacontainer, datatype=0.0): |
| 30 | + for dc in datacontainer: |
| 31 | + datacontainer[dc] = {} |
| 32 | + for x in xrange(1, 13): |
| 33 | + if datatype == 'dict': |
| 34 | + datacontainer[dc][str(x)] = dict() |
| 35 | + elif datatype == 'list': |
| 36 | + datacontainer[dc][str(x)] = list() |
| 37 | + elif datatype == 'set': |
| 38 | + datacontainer[dc][str(x)] = set() |
| 39 | + else: |
| 40 | + datacontainer[dc][str(x)] = 0.0 |
| 43 | + return datacontainer |
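| | +# Example: add_months_to_datacontainer(create_datacontainer(), datatype='list') |
| | +# nests month keys '1'..'12' under every year, each mapping to a fresh list. |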
| 44 | + |
| 45 | + |
| 46 | +def get_standard_deviation(numberList): |
| 47 | + n = len(numberList) |
| 48 | + if n < 2: |
| 49 | + return '.' # follow the get_mean/get_median convention for degenerate input |
| 50 | + mean = get_mean(numberList) |
| 51 | + std = sum((i - mean) ** 2 for i in numberList) |
| 52 | + return math.sqrt(std / float(n - 1)) |
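| | +# Worked example: get_standard_deviation([1, 2, 3]) -> mean 2.0, squared |
| | +# deviations 1 + 0 + 1 = 2, sqrt(2 / (3 - 1)) = 1.0 (sample standard deviation). |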
| 53 | + |
| 54 | + |
| 55 | +def get_median(numberList): |
| 56 | + if numberList == []: return '.' |
| 58 | + theValues = sorted(numberList) |
| 59 | + theValues = [float(x) for x in theValues] |
| 60 | + if len(theValues) % 2 == 1: |
| 61 | + return theValues[(len(theValues)+1)/2-1] |
| 62 | + else: |
| 63 | + lower = theValues[len(theValues)/2-1] |
| 64 | + upper = theValues[len(theValues)/2] |
| 66 | + return (lower + upper) / 2 |
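| | +# Example: get_median([1, 3, 2]) -> 2.0; get_median([1, 2, 3, 4]) -> 2.5. |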
| 67 | + |
| 68 | + |
| 69 | +def get_mean(numberList): |
| 70 | + if numberList == []: return '.' |
| 72 | + floatNums = [float(x) for x in numberList] |
| 73 | + return sum(floatNums) / len(numberList) |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/etl/shaper.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -31,8 +31,8 @@ |
32 | 32 | from utils import utils |
33 | 33 | from utils import models |
34 | 34 | import construct_datasets |
| 35 | +import shaper |
35 | 36 | |
36 | | - |
37 | 37 | try: |
38 | 38 | import psyco |
39 | 39 | psyco.full() |
— | — | @@ -52,27 +52,25 @@ |
53 | 53 | |
54 | 54 | |
55 | 55 | class Editor(object): |
56 | | - def __init__(self, dbname, id, **kwargs): |
| 56 | + def __init__(self, dbname, collection, id, **kwargs): |
57 | 57 | self.dbname = dbname |
58 | 58 | self.id = id |
| 59 | + self.collection = collection |
59 | 60 | for kw in kwargs: |
60 | 61 | setattr(self, kw, kwargs[kw]) |
61 | 62 | |
62 | 63 | def __str__(self): |
63 | 64 | return '%s' % (self.id) |
64 | | - # mongo = db.init_mongo_db(dbname) |
65 | | - # input = mongo[dbname] |
66 | | - # output = mongo['dataset'] |
67 | | - # output.ensure_index('editor') |
68 | | - # output.ensure_index('year_joined') |
69 | | - |
| 65 | + |
70 | 66 | def __call__(self): |
71 | 67 | self.mongo = db.init_mongo_db(self.dbname) |
72 | | - input_db = self.mongo['editors'] |
73 | | - output_db = self.mongo['dataset'] |
| 68 | + input_db = self.mongo[self.collection] |
| 69 | + output_db = self.mongo[self.collection + '_dataset'] |
74 | 70 | |
75 | 71 | output_db.ensure_index('editor') |
76 | 72 | output_db.create_index('editor') |
| 73 | + output_db.ensure_index('year_joined') |
| 74 | + output_db.create_index('year_joined') |
77 | 75 | |
78 | 76 | editor = input_db.find_one({'editor': self.id}) |
79 | 77 | if editor == None: |
— | — | @@ -100,43 +98,14 @@ |
101 | 99 | 'username': username |
102 | 100 | }) |
103 | 101 | |
104 | | -def create_datacontainer(init_value=0): |
105 | | - ''' |
106 | | - This function initializes an empty dictionary with as key the year (starting |
107 | | - 2001 and running through) and as value @init_value, in most cases this will |
108 | | - be zero so the dictionary will act as a running tally for a variable but |
109 | | - @init_value can also a list, [], or a dictionary, {}, or a set, set(). |
110 | | - ''' |
111 | | - data = {} |
112 | | - year = datetime.datetime.now().year + 1 |
113 | | - for x in xrange(2001, year): |
114 | | - if init_value == 'set': |
115 | | - data[str(x)] = set() |
116 | | - else: |
117 | | - data[str(x)] = init_value |
118 | | - return data |
119 | 102 | |
120 | | - |
121 | | -def add_months_to_datacontainer(datacontainer): |
122 | | - for dc in datacontainer: |
123 | | - datacontainer[dc] = {} |
124 | | - for x in xrange(1, 13): |
125 | | - datacontainer[dc][str(x)] = 0 |
126 | | - return datacontainer |
127 | | - |
128 | | - |
129 | 103 | def determine_edits_by_month(edits): |
130 | | - datacontainer = create_datacontainer(init_value=0) |
131 | | - datacontainer = add_months_to_datacontainer(datacontainer) |
| 104 | + datacontainer = shaper.create_datacontainer(init_value=0) |
| 105 | + datacontainer = shaper.add_months_to_datacontainer(datacontainer) |
132 | 106 | for year in edits: |
133 | | - months = set() |
134 | 107 | for edit in edits[year]: |
135 | 108 | m = str(edit['date'].month) |
136 | | - if m not in months: |
137 | | - datacontainer[year][m] = 1 |
138 | | - months.add(m) |
139 | | - if len(months) == 12: |
140 | | - break |
| 109 | + datacontainer[year][m] += 1 |
141 | 110 | return datacontainer |
142 | 111 | |
143 | 112 | |
— | — | @@ -144,7 +113,7 @@ |
145 | 114 | ''' |
146 | 115 | This function counts the number of edits by year made by a particular editor. |
147 | 116 | ''' |
148 | | - edits = create_datacontainer() |
| 117 | + edits = shaper.create_datacontainer() |
149 | 118 | for date in dates: |
150 | 119 | year = str(date['date'].year) |
151 | 120 | edits[year] += 1 |
— | — | @@ -156,7 +125,7 @@ |
157 | 126 | This function counts the number of unique articles by year edited by a |
158 | 127 | particular editor. |
159 | 128 | ''' |
160 | | - articles = create_datacontainer('set') |
| 129 | + articles = shaper.create_datacontainer('set') |
161 | 130 | for date in dates: |
162 | 131 | year = str(date['date'].year) |
163 | 132 | articles[year].add(date['article']) |
— | — | @@ -179,8 +148,8 @@ |
180 | 149 | # definition = kwargs.pop('definition') |
181 | 150 | |
182 | 151 | |
183 | | -def run_optimize_editors(dbname): |
184 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 152 | +def run_optimize_editors(dbname, collection): |
| 153 | + ids = construct_datasets.retrieve_editor_ids_mongo(dbname, collection) |
185 | 154 | kwargs = {'definition': 'traditional', |
186 | 155 | 'pbar': True, |
187 | 156 | } |
— | — | @@ -190,7 +159,7 @@ |
191 | 160 | consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
192 | 161 | |
193 | 162 | for id in ids: |
194 | | - tasks.put(Editor(dbname, id)) |
| 163 | + tasks.put(Editor(dbname, collection, id)) |
195 | 164 | for x in xrange(settings.number_of_processes): |
196 | 165 | tasks.put(None) |
197 | 166 | |
— | — | @@ -212,4 +181,4 @@ |
213 | 182 | |
214 | 183 | if __name__ == '__main__': |
215 | 184 | #debug_optimize_editors('test') |
216 | | - run_optimize_editors('enwiki') |
| 185 | + run_optimize_editors('enwiki', 'test') |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -71,7 +71,7 @@ |
72 | 72 | utils.store_object(editors, settings.binary_location, 'editors') |
73 | 73 | |
74 | 74 | |
75 | | -def mergesort_external_launcher(dbname, input, intermediate_output, output): |
| 75 | +def mergesort_external_launcher(dbname, input, output): |
76 | 76 | files = utils.retrieve_file_list(input, 'txt', mask='') |
77 | 77 | x = 0 |
78 | 78 | maxval = 99999 |
— | — | @@ -80,11 +80,11 @@ |
81 | 81 | maxval = round(len(files) / x) |
82 | 82 | chunks = utils.split_list(files, int(x)) |
83 | 83 | '''1st iteration external mergesort''' |
84 | | - if len(chunks) < 2: |
85 | | - intermediate_output = output |
| 84 | + to_remove = [] |
86 | 85 | for chunk in chunks: |
87 | 86 | filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]] |
88 | | - filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk) |
| 87 | + filename = sort.merge_sorted_files(output, filehandles, chunk) |
| 88 | + to_remove.append(filename) |
89 | 89 | filehandles = [fh.close() for fh in filehandles] |
90 | 90 | '''2nd iteration external mergesort, if necessary''' |
91 | 91 | if len(chunks) > 1: |
— | — | @@ -93,6 +93,9 @@ |
94 | 94 | filename = sort.merge_sorted_files(output, filehandles, 'final') |
95 | 95 | filehandles = [fh.close() for fh in filehandles] |
96 | 96 | filename = 'merged_final.txt' |
| 97 | + for r in to_remove: |
| 98 | + utils.delete_file(output, r) |
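| | + # the filenames collected in to_remove are first-pass intermediates; |
| | + # they are deleted once the final merge has consumed them |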
| 99 | + |
97 | 100 | |
98 | 101 | |
99 | 102 | |
Index: trunk/tools/editor_trends/experience/map.py |
— | — | @@ -0,0 +1,122 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-22' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import xml.etree.cElementTree as cElementTree |
| 8 | +import cStringIO |
| 9 | +import os |
| 10 | +import sys |
| 11 | +import codecs |
| 12 | +import multiprocessing |
| 13 | +sys.path.append('..') |
| 14 | +import cProfile |
| 15 | + |
| 16 | +import configuration |
| 17 | +settings = configuration.Settings() |
| 18 | + |
| 19 | +from etl import extract |
| 20 | +from utils import models |
| 21 | +from wikitree import xml |
| 22 | +from utils import utils |
| 23 | +from etl import chunker |
| 24 | + |
| 25 | + |
| 26 | +def extract_article_talk_pages(page, output, **kwargs): |
| 27 | + tags = {'title': xml.extract_text, |
| 28 | + 'id': xml.extract_text, |
| 29 | + } |
| 30 | + headers = ['id', 'title'] |
| 31 | + vars = {} |
| 32 | + elements = page.getchildren() |
| 33 | + for tag, function in tags.iteritems(): |
| 34 | + xml_node = xml.retrieve_xml_node(elements, tag) |
| 35 | + vars[tag] = function(xml_node, kwargs) |
| 36 | + |
| 37 | + data = [] |
| 38 | + for head in headers: |
| 39 | + data.append(vars[head]) |
| 40 | + utils.write_list_to_csv(data, output) |
| 41 | + |
| 42 | + |
| 43 | +def map_article_talk_ids(language_code): |
| 44 | + ns = chunker.load_namespace(language_code) |
| 45 | + talk_ns = ns['1'].get(u'*', None) |
| 46 | + input = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk') |
| 47 | + files = utils.retrieve_file_list(input, 'txt') |
| 48 | + articles = {} |
| 49 | + talks = {} |
| 50 | + for file in files: |
| 51 | + fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 52 | + for line in fh: |
| 53 | + line = line.replace('\n', '') |
| 54 | + id, article = line.split('\t') |
| 55 | + if not article.startswith(talk_ns): |
| 56 | + articles[article] = {} |
| 57 | + articles[article]['id'] = id |
| 58 | + else: |
| 59 | + talks[article] = id |
| 60 | + fh.close() |
| 61 | + utils.store_object(articles, settings.binary_location, 'articles.bin') |
| 62 | + utils.store_object(talks, settings.binary_location, 'talks.bin') |
| 63 | + |
| 64 | + for article in articles: |
| 65 | + talk = '%s:%s' % (talk_ns, article) |
| 66 | + if talk in talks: |
| 67 | + articles[article]['talk_id'] = talks[talk] |
| 68 | + |
| 69 | + utils.store_object(articles, settings.binary_location, 'articles_talks.bin') |
| 70 | + |
| 71 | + |
| 72 | +def article_to_talk_launcher(**kwargs): |
| 73 | + file = 'dewiki-latest-stub-meta-current.xml' #'enwiki-20100916-stub-meta-history.xml' |
| 74 | + include = [0, 1] |
| 75 | + language_code = 'en' |
| 76 | + project = 'wiki' |
| 77 | + input = os.path.join(settings.input_location, 'en', 'wiki') |
| 78 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'chunks') |
| 79 | + chunker.split_file(input, file, project, language_code, include, format='xml', zip=True) |
| 80 | + files = utils.retrieve_file_list(output, 'xml') |
| 81 | + |
| 82 | + |
| 83 | + tasks = multiprocessing.JoinableQueue() |
| 84 | + consumers = [extract.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
| 85 | + input = output |
| 86 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk') |
| 87 | + for file in files: |
| 88 | + tasks.put(extract.XMLFile(input, output, file, [], extract_article_talk_pages, destination='file')) |
| 89 | + for x in xrange(settings.number_of_processes): |
| 90 | + tasks.put(None) |
| 91 | + |
| 92 | + print tasks.qsize() |
| 93 | + for w in consumers: |
| 94 | + w.start() |
| 95 | + |
| 96 | + tasks.join() |
| 97 | + |
| 98 | + |
| 99 | +def debug_map_article_talk_ids(): |
| 100 | + map_article_talk_ids('de') |
| 101 | + |
| 102 | + |
| 103 | +def debug_article_to_talk(): |
| 104 | + input = os.path.join(settings.input_location, 'en', 'wiki', 'chunks', '0.xml') |
| 105 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'txt', 'test.txt') |
| 106 | + f = codecs.open(output, 'w', encoding=settings.encoding) |
| 107 | + fh = open(input, 'r') |
| 108 | + data = xml.read_input(fh) |
| 109 | + for raw_data in data: |
| 110 | + xml_buffer = cStringIO.StringIO() |
| 111 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 112 | + raw_data = ''.join(raw_data) |
| 113 | + xml_buffer.write(raw_data) |
| 114 | + elem = cElementTree.XML(xml_buffer.getvalue()) |
| 115 | + extract_article_talk_pages(elem, f) |
| 116 | + f.close() |
| 117 | + |
| 118 | + |
| 119 | +if __name__ == '__main__': |
| 120 | + #cProfile.run('article_to_talk_launcher()') |
| 121 | + #debug_article_to_talk() |
| 122 | + debug_map_article_talk_ids() |
| 123 | + #article_to_talk_launcher() |
Property changes on: trunk/tools/editor_trends/experience/map.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 124 | + native |
Index: trunk/tools/editor_trends/experience/__init__.py |
Property changes on: trunk/tools/editor_trends/experience/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 125 | + native |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | import os |
29 | 29 | import sys |
30 | 30 | import platform |
| 31 | +import subprocess |
31 | 32 | |
32 | 33 | try: |
33 | 34 | from _winreg import * |
— | — | @@ -138,14 +139,17 @@ |
139 | 140 | return QueryValueEx(key, 'Path')[0] |
140 | 141 | except WindowsError: |
141 | 142 | return None |
142 | | - |
143 | | - |
| 143 | + |
| 144 | + def detect_linux_program(self, program): |
| 145 | + path = subprocess.Popen(['which', program], stdout=subprocess.PIPE).communicate()[0] |
| 146 | + return path.replace('\n', '') |
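| | + # e.g. detect_linux_program('7z') returns whatever path `which` prints, |
| | + # such as '/usr/bin/7z', or '' when the program is not installed. |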
| 147 | + |
144 | 148 | def detect_installed_program(self, program): |
145 | 149 | if self.platform == 'Windows': |
146 | 150 | path = self.detect_windows_program(program) |
147 | | - return path |
148 | | - else: |
149 | | - raise NotImplementedError |
| 151 | + elif self.platform == 'Linux': |
| 152 | + path = self.detect_linux_program(program) |
| 153 | + else: |
| 154 | + raise NotImplementedError |
| 155 | + return path |
150 | 154 | |
151 | 155 | def determine_max_filehandles_open(self): |
152 | 156 | if self.platform == 'Windows' and self.architecture == 'i386': |
— | — | @@ -167,7 +171,7 @@ |
168 | 172 | def determine_ziptool(self): |
169 | 173 | tools = {'OSX': None, |
170 | 174 | 'Windows': '7z.exe', |
171 | | - 'Linux': None} |
| 175 | + 'Linux': 'unzip'} |
172 | 176 | return tools[self.platform] |
173 | 177 | |
174 | 178 | def set_file_locations(self): |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -143,16 +143,13 @@ |
144 | 144 | |
145 | 145 | |
146 | 146 | # read / write data related functions |
147 | | -def read_data_from_csv(filename, encoding): |
| 147 | +def read_data_from_csv(location, filename, encoding): |
148 | 148 | ''' |
149 | 149 | @filename is the path (either absolute or relative) including the name of |
150 | 150 | the file |
151 | 151 | @encoding is usually utf-8 |
152 | 152 | ''' |
153 | | - if hasattr(filename, '__call__'): |
154 | | - filename = construct_filename(filename) |
155 | | - |
156 | | - fh = open_txt_file(filename, 'r', encoding=encoding) |
| 153 | + fh = create_txt_filehandle(location, filename, 'r', encoding) |
157 | 154 | for line in fh: |
158 | 155 | yield line |
159 | 156 | |
— | — | @@ -220,6 +217,7 @@ |
221 | 218 | |
222 | 219 | def write_dict_to_csv(data, fh, write_key=True, newline=True): |
223 | 220 | keys = data.keys() |
| 221 | + keys.sort() |
224 | 222 | for key in keys: |
225 | 223 | if write_key: |
226 | 224 | fh.write('%s' % key) |
— | — | @@ -317,12 +315,12 @@ |
318 | 316 | return dict([[v, k] for k, v in dictionary.items()]) |
319 | 317 | |
320 | 318 | |
321 | | -def create_dict_from_csv_file(filename, encoding): |
| 319 | +def create_dict_from_csv_file(location, filename, encoding): |
322 | 320 | ''' |
323 | 321 | Constructs a dictionary from a txtfile |
324 | 322 | ''' |
325 | 323 | d = {} |
326 | | - for line in read_data_from_csv(filename, encoding): |
| 324 | + for line in read_data_from_csv(location, filename, encoding): |
327 | 325 | line = clean_string(line) |
328 | 326 | value, key = line.split('\t') |
329 | 327 | d[key] = value |
— | — | @@ -375,12 +373,13 @@ |
376 | 374 | raise exceptions.PlatformNotSupportedError |
377 | 375 | |
378 | 376 | |
379 | | -def zip_extract(path, location, source): |
| 377 | +def zip_extract(location, source): |
380 | 378 | ''' |
381 | 379 | @path is the absolute path to the zip program |
382 | 380 | @location is the directory where the compressed file is stored |
383 | 381 | @source is the name of the zipfile |
384 | 382 | ''' |
| 383 | + path = settings.path_ziptool |
385 | 384 | if settings.platform == 'Windows': |
386 | 385 | p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
387 | 386 | elif settings.platform == 'Linux': |
Index: trunk/tools/editor_trends/tests/mongodb/store.py |
— | — | @@ -0,0 +1,28 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-09' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import datetime |
| 8 | +import calendar |
| 9 | +import time |
| 10 | +from database import db |
| 11 | + |
| 12 | + |
| 13 | +def test_date(): |
| 14 | + |
| 15 | + mongo = db.init_mongo_db('unit_test') |
| 16 | + collection = mongo['foo'] |
| 17 | + d1 = datetime.datetime(2007, 1, 1) |
| 18 | + d2 = datetime.datetime(2006, 12, 31) |
| 19 | + |
| 20 | + if d1.utcoffset() is not None: |
| 21 | + d1 = d1 - d1.utcoffset() |
| 22 | + millis = int(calendar.timegm(d1.timetuple()) * 1000 + d1.microsecond / 1000) |
| 23 | + millis = millis / 1000 |
| 24 | + d3 = time.gmtime(millis) |
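| | + # e.g. datetime(2007, 1, 1) -> 1167609600000 ms since the Unix epoch; |
| | + # dividing by 1000 gives the 1167609600 s that time.gmtime() expects |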
| 25 | + #d3 = datetime.date(2007, 1, 1) |
| 26 | + collection.insert({'date': d1}) |
| 27 | + collection.insert({'date': d2}) |
| 28 | + #collection.insert({'date': d3}) |
| 29 | + |
Property changes on: trunk/tools/editor_trends/tests/mongodb/store.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 30 | + native |
Index: trunk/tools/editor_trends/tests/mongodb/__init__.py |
Property changes on: trunk/tools/editor_trends/tests/mongodb/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 31 | + native |
Index: trunk/tools/editor_trends/tests/__init__.py |
Property changes on: trunk/tools/editor_trends/tests/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
3 | 32 | + native |
Index: trunk/tools/editor_trends/tests/test.py |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +import configuration |
| 3 | +settings = configuration.Settings() |
| 4 | + |
| 5 | +from tests.mongodb import store |
| 6 | + |
| 7 | +store.test_date() |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/logs |
___________________________________________________________________ |
Added: svn:ignore |
1 | 8 | + *.bin |
split_xml |
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -0,0 +1,141 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import os |
| 19 | +import cStringIO |
| 20 | +import xml.etree.cElementTree as cElementTree |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import configuration |
| 25 | +settings = configuration.Settings() |
| 26 | +from wikitree import xml |
| 27 | +from database import db |
| 28 | +from database import db_settings |
| 29 | +from utils import utils |
| 30 | +from utils import process_constructor as pc |
| 31 | + |
| 32 | +try: |
| 33 | + import psyco |
| 34 | + psyco.full() |
| 35 | +except ImportError: |
| 36 | + pass |
| 37 | + |
| 38 | + |
| 39 | +def read_bots_csv_file(location, filename, encoding): |
| 40 | + ''' |
| 41 | + Constructs a dictionary: |
| 42 | + key is language |
| 43 | + value is a list of bot names |
| 44 | + ''' |
| 45 | + d = {} |
| 46 | + for line in utils.read_data_from_csv(location, filename, encoding): |
| 47 | + line = utils.clean_string(line) |
| 48 | + language, bots = line.split(',') |
| 49 | + bots = bots.split('|') |
| 50 | + for bot in bots: |
| 51 | + if bot not in d: |
| 52 | + d[bot] = {} |
| 53 | + d[bot]['id'] = None |
| 54 | + d[bot]['languages'] = [] |
| 55 | + d[bot]['languages'].append(language) |
| 56 | + return d |
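| | + # Assumed Bots.csv layout, per the split logic above: one line per language |
| | + # with bot names separated by '|', e.g. 'en,ClueBot|SineBot' -> |
| | + # d['ClueBot'] == {'id': None, 'languages': ['en']} |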
| 57 | + |
| 58 | + |
| 59 | +def store_bots(): |
| 60 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding) |
| 61 | + mongo = db.init_mongo_db('bots') |
| 62 | + collection = mongo['ids'] |
| 63 | + db.remove_documents_from_mongo_db(collection, None) |
| 64 | + for name, bot in bots.iteritems(): |
| 65 | + collection.insert({'id': bot['id'], 'name': name, 'languages': bot['languages']}) |
| 66 | + |
| 67 | + print 'Stored %s bots' % collection.count() |
| 68 | + |
| 69 | + |
| 70 | +def lookup_bot_userid(input_queue, language_code, project, bots): |
| 71 | + ''' |
| 72 | + This function is used to find the id's belonging to the different bots that |
| 73 | + are patrolling the Wikipedia sites. |
| 74 | + @input_queue contains a list of xml files to parse |
| 75 | + @bots is a dictionary containing the names of the bots to lookup |
| 76 | + ''' |
| 77 | + if settings.debug: |
| 78 | + messages = {} |
| 79 | + |
| 80 | + location = os.path.join(settings.input_location, language_code, project, 'chunks') |
| 81 | + fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding) |
| 82 | + |
| 83 | + while True: |
| 84 | + file = input_queue.get(block=False) |
| 85 | + if file == None: |
| 86 | + break |
| 87 | + data = xml.read_input(utils.create_txt_filehandle(location, |
| 88 | + file, |
| 89 | + 'r', |
| 90 | + settings.encoding)) |
| 91 | + |
| 92 | + for raw_data in data: |
| 93 | + xml_buffer = cStringIO.StringIO() |
| 94 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 95 | + raw_data = ''.join(raw_data) |
| 96 | + raw_data = raw_data.encode('utf-8') |
| 97 | + xml_buffer.write(raw_data) |
| 98 | + |
| 99 | + try: |
| 100 | + xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
| 101 | + revisions = xml_nodes.findall('revision') |
| 102 | + for revision in revisions: |
| 103 | + contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 104 | + username = contributor.find('username') |
| 105 | + if username == None: |
| 106 | + continue |
| 107 | + username = xml.extract_text(username, None) |
| 108 | + #print username.encode('utf-8') |
| 109 | + if username in bots: |
| 110 | + id = contributor.find('id') |
| 111 | + id = xml.extract_text(id, None) |
| 112 | + #print username.encode('utf-8'), id |
| 113 | + bot = bots[username] |
| 114 | + bot['_username'] = username |
| 115 | + bot['id'] = id |
| 116 | + utils.write_dict_to_csv(bot, fh, write_key=False) |
| 117 | + bots.pop(username) |
| 118 | + if bots == {}: |
| 119 | + print 'Found id numbers for all bots.' |
| 120 | + return |
| 121 | + |
| 122 | + except Exception, error: |
| 123 | + print error |
| 124 | + if settings.debug: |
| 125 | + messages = utils.track_errors(xml_buffer, error, file, |
| 126 | + messages) |
| 127 | + fh.close() |
| 128 | + |
| 129 | + if settings.debug: |
| 130 | + utils.report_error_messages(messages, lookup_bot_userid) |
| 131 | + |
| 132 | +def bot_launcher(language_code, project): |
| 133 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding) |
| 134 | + files = utils.retrieve_file_list(os.path.join(settings.input_location, language_code, project, 'chunks'), 'xml', mask=None) |
| 135 | + input_queue = pc.load_queue(files, poison_pill=True) |
| 136 | + lookup_bot_userid(input_queue, language_code, project, bots) |
| 137 | + |
| 138 | + |
| 139 | +if __name__ == '__main__': |
| 140 | + language_code = 'en' |
| 141 | + project = 'wiki' |
| 142 | + bot_launcher(language_code, project) |
Property changes on: trunk/tools/editor_trends/bots/bots.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 143 | + native |
Added: svn:mime-type |
2 | 144 | + text/plain |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
3 | 145 | - wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
4 | 146 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |