r77300 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r77299 | r77300 | r77301 >
Date: 21:18, 25 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
* Expanded bot detection functionality
* manage.py now has more configurable options
* Added an object-oriented solution for creating datasets
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py (added) (history)
  • /trunk/tools/editor_trends/bots/bots.py (added) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/errors (deleted) (history)
  • /trunk/tools/editor_trends/etl/bots.py (deleted) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extract.py (modified) (history)
  • /trunk/tools/editor_trends/etl/loader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (added) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/experience (added) (history)
  • /trunk/tools/editor_trends/experience/__init__.py (added) (history)
  • /trunk/tools/editor_trends/experience/map.py (added) (history)
  • /trunk/tools/editor_trends/logs (added) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/statistics/r (added) (history)
  • /trunk/tools/editor_trends/tests (added) (history)
  • /trunk/tools/editor_trends/tests/__init__.py (added) (history)
  • /trunk/tools/editor_trends/tests/mongodb (added) (history)
  • /trunk/tools/editor_trends/tests/mongodb/__init__.py (added) (history)
  • /trunk/tools/editor_trends/tests/mongodb/store.py (added) (history)
  • /trunk/tools/editor_trends/tests/test.py (added) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/manage.py
@@ -172,7 +172,7 @@
173173
174174 def launch_zip_extractor(args, location, file):
175175 timer = Timer()
176 - utils.zip_extract(location, file, compression='7z')
 176+ utils.zip_extract(location, file)
177177 timer.elapsed()
178178
179179
@@ -211,7 +211,8 @@
212212 print 'dataset launcher'
213213 timer = Timer()
214214 project = kwargs.pop('full_project')
215 - transformer.run_optimize_editors(project)
 215+ collection = kwargs.pop('collection')
 216+ transformer.run_optimize_editors(project, collection)
216217 timer.elapsed()
217218
218219
@@ -313,6 +314,9 @@
314315
315316 parser_transform = subparsers.add_parser('transform', help='Transform the raw database to an enriched dataset that can be exported.')
316317 parser_transform.set_defaults(func=transformer_launcher)
 318+ parser_transform.add_argument('-c', '--collection', action='store',
 319+ help='Name of MongoDB collection',
 320+ default='editors')
317321
318322 parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.')
319323 parser_dataset.set_defaults(func=exporter_launcher)
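The new transform option uses the standard argparse subcommand pattern. A minimal, self-contained sketch of the wiring added above (the launcher body and the 'editors_test' collection name are invented for illustration):

import argparse

def transformer_launcher(args):
    # args.collection falls back to 'editors' when -c/--collection is omitted
    print 'transforming MongoDB collection: %s' % args.collection

parser = argparse.ArgumentParser(prog='manage.py')
subparsers = parser.add_subparsers()
parser_transform = subparsers.add_parser('transform')
parser_transform.set_defaults(func=transformer_launcher)
parser_transform.add_argument('-c', '--collection', action='store',
                              help='Name of MongoDB collection',
                              default='editors')

args = parser.parse_args(['transform', '-c', 'editors_test'])
args.func(args)   # prints: transforming MongoDB collection: editors_test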
Index: trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py
@@ -0,0 +1,49 @@
 2+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 3+__author__email = 'dvanliere at gmail dot com'
 4+__date__ = '2010-11-24'
 5+__version__ = '0.1'
 6+
 7+import sys
 8+sys.path.append('..')
 9+from Queue import Empty
 10+
 11+import configuration
 12+settings = configuration.Settings()
 13+from utils import utils
 14+from database import db
 15+from etl.exporter import expand_headers, expand_observations
 16+
 17+
 18+def dataset_edits_by_month(input_queue, **kwargs):
 19+ dbname = kwargs.pop('dbname')
 20+ mongo = db.init_mongo_db(dbname)
 21+ editors = mongo['dataset']
 22+ name = dbname + '_edits_by_month.csv'
 23+ fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding)
 24+ x = 0
 25+ vars_to_expand = ['monthly_edits']
 26+ while True:
 27+ try:
 28+ id = input_queue.get(block=False)
 29+ print input_queue.qsize()
 30+ obs = editors.find_one({'editor': id})
 31+ obs = expand_observations(obs, vars_to_expand)
 32+ if x == 0:
 33+ headers = obs.keys()
 34+ headers.sort()
 35+ headers = expand_headers(headers, vars_to_expand, obs)
 36+ utils.write_list_to_csv(headers, fh)
 37+ data = []
 38+ keys = obs.keys()
 39+ keys.sort()
 40+ for key in keys:
 41+ data.append(obs[key])
 42+ utils.write_list_to_csv(data, fh)
 43+ x += 1
 44+ except Empty:
 45+ break
 46+ fh.close()
 47+
 48+
 49+if __name__ == '__main__':
 50+ pass
\ No newline at end of file
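expand_observations() and expand_headers() are assumed to live in etl/exporter.py; conceptually they flatten a nested {year: {month: value}} variable such as monthly_edits into one scalar CSV column per month. A hypothetical stand-alone illustration of that flattening:

def expand_monthly(obs, var):
    # turn obs[var] == {year: {month: value}} into flat per-month columns
    flat = {}
    for year, months in obs.pop(var).iteritems():
        for month, value in months.iteritems():
            flat['%s_%s_%s' % (var, year, month)] = value
    obs.update(flat)
    return obs

obs = {'editor': '42', 'monthly_edits': {'2010': {'1': 3, '2': 0}}}
print sorted(expand_monthly(obs, 'monthly_edits').keys())
# ['editor', 'monthly_edits_2010_1', 'monthly_edits_2010_2']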
Index: trunk/tools/editor_trends/etl/bots.py
@@ -1,123 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -
18 -import os
19 -import cStringIO
20 -import xml.etree.cElementTree as cElementTree
21 -
22 -
23 -import configuration
24 -settings = configuration.Settings()
25 -from wikitree import xml
26 -from database import db
27 -from database import db_settings
28 -from utils import utils
29 -from utils import process_constructor as pc
30 -
31 -try:
32 - import psyco
33 - psyco.full()
34 -except ImportError:
35 - pass
36 -
37 -
38 -def create_bot_ids_db_mongo():
39 - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding)
40 - mongo = db.init_mongo_db('bots')
41 - collection = mongo['ids']
42 -
43 - db.remove_documents_from_mongo_db(collection, None)
44 -
45 - for id, name in ids.iteritems():
46 - collection.insert({'id': id, 'name': name})
47 -
48 - print collection.count()
49 -
50 -
51 -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False):
52 - '''
53 - This function is used to find the id's belonging to the different bots that
54 - are patrolling the Wikipedia sites.
55 - @input_queue contains a list of xml files to parse
56 -
57 - @result_queue should be set to false as the results are directly written to
58 - a csv file.
59 -
60 - @progressbar depends on settings
61 -
62 - @bots is a dictionary containing the names of the bots to lookup
63 - '''
64 -
65 - #if len(bots.keys()) == 1:
66 - bots = bots['bots']
67 - #print bots.keys()
68 -
69 - if settings.debug:
70 - messages = {}
71 -
72 - while True:
73 - if debug:
74 - file = input_queue
75 - else:
76 - file = input_queue.get(block=False)
77 -
78 - if file == None:
79 - break
80 -
81 - data = xml.read_input(utils.open_txt_file(settings.input_location +
82 - file, 'r', encoding=settings.encoding))
83 -
84 - for raw_data in data:
85 - xml_buffer = cStringIO.StringIO()
86 - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
87 - raw_data = ''.join(raw_data)
88 - raw_data = raw_data.encode('utf-8')
89 - xml_buffer.write(raw_data)
90 -
91 - try:
92 - xml_nodes = cElementTree.XML(xml_buffer.getvalue())
93 - revisions = xml_nodes.findall('revision')
94 - for revision in revisions:
95 - contributor = xml.retrieve_xml_node(revision, 'contributor')
96 - username = contributor.find('username')
97 - if username == None:
98 - continue
99 - username = xml.extract_text(username)
100 - #print username.encode('utf-8')
101 -
102 - if username in bots:
103 - id = contributor.find('id')
104 - id = xml.extract_text(id)
105 - #print username.encode('utf-8'), id
106 - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding)
107 - bots.pop(username)
108 - if bots == {}:
109 - print 'Mission accomplished'
110 - return
111 - except Exception, error:
112 - print error
113 - if settings.debug:
114 - messages = utils.track_errors(xml_buffer, error, file,
115 - messages)
116 -
117 - if settings.debug:
118 - utils.report_error_messages(messages, lookup_username)
119 -
120 -
121 -if __name__ == '__main__':
122 - #debug()
123 - #add_id_to_botnames()
124 - create_bot_ids_db_mongo()
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -17,19 +17,23 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
 21+import sys
 22+import datetime
 23+from dateutil.relativedelta import *
 24+import calendar
2125 from multiprocessing import Queue
2226 from Queue import Empty
23 -import datetime
24 -from dateutil.relativedelta import *
25 -import sys
26 -import progressbar
2727
 28+
 29+
2830 sys.path.append('..')
2931 import configuration
3032 settings = configuration.Settings()
3133 from utils import models, utils
3234 from database import db
 35+from etl import shaper
3336 from utils import process_constructor as pc
 37+import progressbar
3438
3539 try:
3640 import psyco
@@ -38,6 +42,91 @@
3943 pass
4044
4145
 46+class Variable(object):
 47+
 48+ def __init__(self, var):
 49+ setattr(self, 'name', var)
 50+ self.stats = ['n', 'avg', 'sd', 'min', 'max']
 51+ setattr(self, 'time', shaper.create_datacontainer())
 52+ setattr(self, 'time', shaper.add_months_to_datacontainer(getattr(self, 'time'), datatype='dict'))
 53+
 54+ for var in self.stats:
 55+ setattr(self, var, shaper.create_datacontainer())
 56+ setattr(self, var, shaper.add_months_to_datacontainer(getattr(self, var), datatype='list'))
 57+
 58+ def __repr__(self):
 59+ return self.name
 60+
 61+ def descriptives(self):
 62+ for year in self.time:
 63+ for month in self.time[year]:
 64+ data = [self.time[year][month][k] for k in self.time[year][month].keys()]
 65+ self.avg[year][month] = shaper.get_mean(data)
 66+ self.sd[year][month] = shaper.get_standard_deviation(data)
 67+ self.min[year][month] = min(data)
 68+ self.max[year][month] = max(data)
 69+ self.n[year][month] = len(data)
 70+
 71+
 72+class LongDataset(object):
 73+
 74+ def __init__(self, vars):
 75+ self.name = 'long_dataset.tsv'
 76+ self.vars = []
 77+ for var in vars:
 78+ setattr(self, var, Variable(var))
 79+ self.vars.append(var)
 80+
 81+ def __repr__(self):
 82+ return 'Dataset containing: %s' % (self.vars)
 83+
 84+ def write_headers(self, fh):
 85+ fh.write('_time\t')
 86+ for var in self.vars:
 87+ var = getattr(self, var)
 88+ for stat in var.stats:
 89+ fh.write('%s_%s\t' % (var.name, stat))
 90+ fh.write('\n')
 91+
 92+ def convert_to_longitudinal_data(self, id, obs, vars):
 93+ for var in vars:
 94+ ds = getattr(self, var)
 95+ years = obs[var].keys()
 96+ for year in years:
 97+ months = obs[var][year].keys()
 98+ for m in months:
 99+ #d = calendar.monthrange(int(year), int(m))[1] #determines the number of days in a given month/year
 100+ #date = datetime.date(int(year), int(m), d)
 101+ if id not in ds.time[year][m]:
 102+ ds.time[year][m][id] = 0
 103+ ds.time[year][m][id] = obs[var][year][str(m)]
 104+
 105+ def write_longitudinal_data(self):
 106+ fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
 107+ self.write_headers(fh)
 108+ dc = shaper.create_datacontainer()
 109+ dc = shaper.add_months_to_datacontainer(dc)
 110+
 111+ for var in self.vars:
 112+ var = getattr(self, var)
 113+ var.descriptives()
 114+ years = dc.keys()
 115+ years.sort()
 116+ for year in years:
 117+ months = dc[year].keys()
 118+ months.sort()
 119+ for month in months:
 120+ d = calendar.monthrange(int(year), int(month))[1] #determines the number of days in a given month/year
 121+ date = datetime.date(int(year), int(month), d)
 122+ fh.write('%s\t' % date)
 123+ for var in self.vars:
 124+ var = getattr(self, var)
 125+ #data = ['%s_%s\t' % (var.name, getattr(var, stat)[year][month]) for stat in var.stats]
 126+ fh.write(''.join(['%s\t' % (getattr(var, stat)[year][month],) for stat in var.stats]))
 127+ fh.write('\n')
 128+ fh.close()
 129+
 130+
42131 def retrieve_editor_ids_mongo(dbname, collection):
43132 if utils.check_file_exists(settings.binary_location,
44133 'editors.bin'):
@@ -71,16 +160,6 @@
72161 obs[var] = edits
73162 return obs
74163
75 -def write_longitudinal_data(id, edits, fh):
76 - years = edits.keys()
77 - years.sort()
78 - for year in years:
79 - months = edits[year].keys()
80 - months = [int(m) for m in months]
81 - months.sort()
82 - for m in months:
83 - date = datetime.date(int(year), int(m), 1)
84 - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)]))
85164
86165
87166 def expand_headers(headers, vars_to_expand, obs):
@@ -97,32 +176,28 @@
98177 return headers
99178
100179
101 -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs):
102 - debug = kwargs.pop('debug')
 180+def generate_long_editor_dataset(input_queue, vars, **kwargs):
103181 dbname = kwargs.pop('dbname')
104182 mongo = db.init_mongo_db(dbname)
105183 editors = mongo['dataset']
106184 name = dbname + '_long_editors.csv'
107 - fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
108 - x = 0
 185+ #fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding)
109186 vars_to_expand = []
 187+ keys = dict([(var, 1) for var in vars])
 188+ ld = LongDataset(vars)
110189 while True:
111190 try:
112191 id = input_queue.get(block=False)
113 - obs = editors.find_one({'editor': id}, {'monthly_edits': 1})
114 - if x == 0:
115 - headers = obs.keys()
116 - headers.sort()
117 - headers = expand_headers(headers, vars_to_expand, obs)
118 - utils.write_list_to_csv(headers, fh)
119 - write_longitudinal_data(id, obs['monthly_edits'], fh)
 192+ print id
 193+ obs = editors.find_one({'editor': id}, keys)
 194+ ld.convert_to_longitudinal_data(id, obs, vars)
120195 #utils.write_list_to_csv(data, fh)
121 - x += 1
122196 except Empty:
123197 break
 198+ ld.write_longitudinal_data()
124199
125200
126 -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs):
 201+def generate_cohort_analysis(input_queue, **kwargs):
127202 dbname = kwargs.get('dbname')
128203 pbar = kwargs.get('pbar')
129204 mongo = db.init_mongo_db(dbname)
@@ -169,6 +244,7 @@
170245 break
171246 utils.store_object(data, settings.binary_location, 'cohort_data')
172247
 248+
173249 def date_falls_in_window(window_start, window_end, first_edit, last_edit):
174250 if first_edit >= window_start and first_edit <= window_end:
175251 return True
@@ -176,7 +252,7 @@
177253 return False
178254
179255
180 -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs):
 256+def generate_wide_editor_dataset(input_queue, **kwargs):
181257 dbname = kwargs.pop('dbname')
182258 mongo = db.init_mongo_db(dbname)
183259 editors = mongo['dataset']
@@ -241,16 +317,19 @@
242318
243319 def generate_editor_dataset_debug(dbname):
244320 ids = retrieve_editor_ids_mongo(dbname, 'editors')
 321+ #ids = list(ids)[:1000]
245322 input_queue = pc.load_queue(ids)
246323 kwargs = {'nr_input_processors': 1,
247324 'nr_output_processors': 1,
248325 'debug': True,
249326 'dbname': dbname,
250327 }
251 - generate_editor_dataset(input_queue, False, False, kwargs)
 328+ #generate_editor_dataset(input_queue, False, False, kwargs)
 329+ vars = ['monthly_edits']
 330+ generate_long_editor_dataset(input_queue, vars, **kwargs)
252331
253 -
254332 if __name__ == '__main__':
255333 #generate_editor_dataset_debug('test')
256 - generate_editor_dataset_launcher('enwiki')
 334+ #generate_editor_dataset_launcher('enwiki')
 335+ generate_editor_dataset_debug('enwiki')
257336 #debug_retrieve_edits_by_contributor_launcher()
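Variable accumulates one value per editor per month in time[year][month][id], and descriptives() then reduces each cell to n/avg/sd/min/max. A stripped-down sketch of that reduction for a single month (the real class walks every year and month created by shaper):

time = {'2010': {'11': {'editor_a': 4.0, 'editor_b': 6.0}}}

for year in time:
    for month in time[year]:
        data = time[year][month].values()
        n = len(data)
        avg = sum(data) / n
        print '%s-%s n=%s avg=%s min=%s max=%s' % (
            year, month, n, avg, min(data), max(data))
# 2010-11 n=2 avg=5.0 min=4.0 max=6.0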
Index: trunk/tools/editor_trends/etl/extract.py
@@ -135,29 +135,25 @@
136136
137137
138138
139 -def determine_username_is_bot(username, kwargs):
 139+def determine_username_is_bot(contributor, bots):
140140 '''
141 - @username is the xml element containing the id of the user
142 - @kwargs should have a list with all the bot ids
143 -
144 - @Return False if username id is not in bot list id or True if username id
 141+ @contributor is an xml element containing the id of the contributor
 142+ @bots should be a dict with all the bot ids and bot names
 143+ @Return 0 if the contributor id is not in the bot dict, or 1 if it
145144 is a bot id.
146145 '''
147 - ids = kwargs.get('bots', [])
148 - if ids == None:
149 - ids = []
150 - if username != None and username.text != None:
151 - id = username.text
152 - if id in ids:
153 - return 1
154 - else:
155 - return 0
 146+ for elem in contributor:
 147+ if elem.tag == 'id':
 148+ if elem.text in bots['bots']:
 149+ return 1
 150+ else:
 151+ return 0
156152
157153
158154 def extract_username(contributor, kwargs):
159155 for elem in contributor:
160156 if elem.tag == 'username':
161 - return elem.text #.encode(settings.encoding)
 157+ return elem.text
162158 else:
163159 return None
164160
@@ -167,16 +163,14 @@
168164 @contributor is the xml contributor node containing a number of attributes
169165
170166 Currently, we are only interested in registered contributors, hence we
171 - ignore anonymous editors. If you are interested in collecting data on
172 - anonymous editors then add the string 'ip' to the tags variable.
 167+ ignore anonymous editors.
173168 '''
174 - tags = ['id']
175169 if contributor.get('deleted'):
176170 return - 1 # ASK: Not sure if this is the best way to code deleted contributors.
177171 for elem in contributor:
178 - if elem.tag in tags:
 172+ if elem.tag == 'id':
179173 if elem.text != None:
180 - return elem.text.encode(settings.encoding)
 174+ return elem.text
181175 else:
182176 return - 1
183177
@@ -209,6 +203,8 @@
210204 vars[var] = function(xml_node, kwargs)
211205
212206 #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
 207+ if vars['username'] == 'ClueBot':
 208+ print 'debug'
213209 if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
214210 vars.pop('bot')
215211 if destination == 'queue':
@@ -222,100 +218,6 @@
223219 vars = {}
224220
225221
226 -#def parse_editors(xml_queue, data_queue, **kwargs):
227 -# '''
228 -# @xml_queue contains the filenames of the files to be parsed
229 -# @data_queue is an instance of Queue where the extracted data is stored for
230 -# further processing
231 -# @pbar is an instance of progressbar to display the progress
232 -# @bots is a list of id's of known Wikipedia bots
233 -# @debug is a flag to indicate whether the function is called for debugging.
234 -#
235 -# Output is the data_queue that will be used by store_editors()
236 -# '''
237 -# input = kwargs.get('input', None)
238 -# output = kwargs.get('output', None)
239 -# debug = kwargs.get('debug', False)
240 -# destination = kwargs.get('destination', 'file')
241 -# bots = kwargs.get('bots', None)
242 -# pbar = kwargs.get('pbar', None)
243 -# if settings.debug:
244 -# messages = {}
245 -# vars = {}
246 -#
247 -# while True:
248 -# try:
249 -# if debug:
250 -# file = xml_queue
251 -# else:
252 -# file = xml_queue.get(block=False)
253 -# if file == None:
254 -# print 'Swallowed a poison pill'
255 -# break
256 -#
257 -# data = xml.read_input(utils.create_txt_filehandle(input,
258 -# file, 'r',
259 -# encoding=settings.encoding))
260 -# if destination == 'file':
261 -# name = file[:-4] + '.txt'
262 -# fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding)
263 -# for raw_data in data:
264 -# xml_buffer = cStringIO.StringIO()
265 -# raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
266 -#
267 -# try:
268 -# raw_data = ''.join(raw_data)
269 -# xml_buffer.write(raw_data)
270 -# elem = cElementTree.XML(xml_buffer.getvalue())
271 -# output_editor_information(elem, fh, bots=bots, destination=destination)
272 -# except SyntaxError, error:
273 -# print error
274 -# '''
275 -# There are few cases with invalid tokens, they are fixed
276 -# here and then reinserted into the XML DOM
277 -# data = convert_html_entities(xml_buffer.getvalue())
278 -# elem = cElementTree.XML(data)
279 -# output_editor_information(elem)
280 -# '''
281 -# if settings.debug:
282 -# utils.track_errors(xml_buffer, error, file, messages)
283 -# except UnicodeEncodeError, error:
284 -# print error
285 -# if settings.debug:
286 -# utils.track_errors(xml_buffer, error, file, messages)
287 -# except MemoryError, error:
288 -# print file, error
289 -# print raw_data[:12]
290 -# print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
291 -# if destination == 'queue':
292 -# output.put('NEXT')
293 -# while True:
294 -# if output.qsize() < 100000:
295 -# break
296 -# else:
297 -# time.sleep(10)
298 -# print 'Still sleeping, queue is %s items long' % output.qsize()
299 -#
300 -# else:
301 -# fh.close()
302 -#
303 -# if pbar:
304 -# print file, xml_queue.qsize()
305 -# #utils.update_progressbar(pbar, xml_queue)
306 -#
307 -# if debug:
308 -# break
309 -#
310 -# except Empty:
311 -# break
312 -#
313 -# if destination == 'queue':
314 -# data_queue.put(None)
315 -#
316 -# if settings.debug:
317 -# utils.report_error_messages(messages, parse_editors)
318 -
319 -
320222 def load_bot_ids():
321223 '''
322224 Loader function to retrieve list of id's of known Wikipedia bots.
@@ -352,12 +254,14 @@
353255 tasks.join()
354256
355257
356 -def debug_parse_editors(dbname):
357 - q = JoinableQueue()
358 - parse_editors('522.xml', q, None, None, debug=True, destination='file')
359 - store_editors(q, [], dbname)
 258+def debug_parse_editors(location):
 259+ bots = load_bot_ids()
 260+ input = os.path.join(location, 'chunks')
 261+ output = os.path.join(location, 'txt')
 262+ xml_file = XMLFile(input, output, '1.xml', bots, output_editor_information, destination='file')
 263+ xml_file()
360264
361 -
362 -if __name__ == "__main__":
363 - #debug_parse_editors('test2')
364 - run_parse_editors(os.path.join(settings.input_location, 'en', 'wiki'))
 265+if __name__ == '__main__':
 266+ location = os.path.join(settings.input_location, 'en', 'wiki')
 267+ debug_parse_editors(location)
 268+ #run_parse_editors(location)
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -0,0 +1,72 @@
 2+
 3+
 4+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 5+__author__email = 'dvanliere at gmail dot com'
 6+__date__ = '2010-11-24'
 7+__version__ = '0.1'
 8+
 9+import datetime
 10+import math
 11+
 12+def create_datacontainer(init_value=0):
 13+ '''
 14+ This function initializes a dictionary keyed by year, starting in 2001 and
 15+ running through the current year, with each value set to @init_value. Usually
 16+ this is zero, so the dictionary acts as a running tally for a variable, but
 17+ @init_value can also be a list, [], a dictionary, {}, or a set, set().
 18+ '''
 19+ data = {}
 20+ year = datetime.datetime.now().year + 1
 21+ for x in xrange(2001, year):
 22+ if init_value == 'set':
 23+ data[str(x)] = set()
 24+ else:
 25+ data[str(x)] = init_value
 26+ return data
 27+
 28+
 29+def add_months_to_datacontainer(datacontainer, datatype=0.0):
 30+ for dc in datacontainer:
 31+ datacontainer[dc] = {}
 32+ for x in xrange(1, 13):
 33+ if datatype == 'dict':
 34+ datacontainer[dc][str(x)] = dict()
 35+ elif datatype == 'list':
 36+ datacontainer[dc][str(x)] = list()
 37+ elif datatype == 'set':
 38+ datacontainer[dc][str(x)] = set()
 39+ else:
 40+ datacontainer[dc][str(x)] = 0.0
 41+ #else:
 42+ # datacontainer[dc][str(x)] = 0.0
 43+ return datacontainer
 44+
 45+
 46+def get_standard_deviation(numberList):
 47+ mean = get_mean(numberList)
 48+ std = 0
 49+ n = len(numberList)
 50+ for i in numberList:
 51+ std = std + (i - mean)**2
 52+ return math.sqrt(std / float(n - 1)) if n > 1 else 0.0
 53+
 54+
 55+def get_median(numberList):
 56+ #print numberList
 57+ if numberList == []: return '.'
 58+ theValues = sorted(numberList)
 59+ theValues = [float(x) for x in theValues]
 60+ if len(theValues) % 2 == 1:
 61+ return theValues[(len(theValues)+1)/2-1]
 62+ else:
 63+ lower = theValues[len(theValues)/2-1]
 64+ upper = theValues[len(theValues)/2]
 65+ #print upper, lower
 66+ return (lower + upper) / 2
 67+
 68+
 69+def get_mean(numberList):
 70+ #print numberList
 71+ if numberList == []: return '.'
 72+ floatNums = [float(x) for x in numberList]
 73+ return sum(floatNums) / len(numberList)
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/etl/shaper.py
___________________________________________________________________
Added: svn:eol-style
174 + native
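Typical use of the new shaper module, assuming the editor_trends root is on sys.path: the container is a {year: {month: value}} dictionary, and the descriptive helpers operate on plain number lists.

from etl import shaper

dc = shaper.create_datacontainer(init_value=0)
dc = shaper.add_months_to_datacontainer(dc, datatype='list')
dc['2010']['11'] = [2.0, 4.0, 6.0]

print shaper.get_mean(dc['2010']['11'])                 # 4.0
print shaper.get_median(dc['2010']['11'])               # 4.0
print shaper.get_standard_deviation(dc['2010']['11'])   # 2.0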
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -31,8 +31,8 @@
3232 from utils import utils
3333 from utils import models
3434 import construct_datasets
 35+import shaper
3536
36 -
3737 try:
3838 import psyco
3939 psyco.full()
@@ -52,27 +52,25 @@
5353
5454
5555 class Editor(object):
56 - def __init__(self, dbname, id, **kwargs):
 56+ def __init__(self, dbname, collection, id, **kwargs):
5757 self.dbname = dbname
5858 self.id = id
 59+ self.collection = collection
5960 for kw in kwargs:
6061 setattr(self, kw, kwargs[kw])
6162
6263 def __str__(self):
6364 return '%s' % (self.id)
64 - # mongo = db.init_mongo_db(dbname)
65 - # input = mongo[dbname]
66 - # output = mongo['dataset']
67 - # output.ensure_index('editor')
68 - # output.ensure_index('year_joined')
69 -
 65+
7066 def __call__(self):
7167 self.mongo = db.init_mongo_db(self.dbname)
72 - input_db = self.mongo['editors']
73 - output_db = self.mongo['dataset']
 68+ input_db = self.mongo[self.collection]
 69+ output_db = self.mongo[self.collection + '_dataset']
7470
7571 output_db.ensure_index('editor')
7672 output_db.create_index('editor')
 73+ output_db.ensure_index('year_joined')
 74+ output_db.create_index('year_joined')
7775
7876 editor = input_db.find_one({'editor': self.id})
7977 if editor == None:
@@ -100,43 +98,14 @@
10199 'username': username
102100 })
103101
104 -def create_datacontainer(init_value=0):
105 - '''
106 - This function initializes an empty dictionary with as key the year (starting
107 - 2001 and running through) and as value @init_value, in most cases this will
108 - be zero so the dictionary will act as a running tally for a variable but
109 - @init_value can also a list, [], or a dictionary, {}, or a set, set().
110 - '''
111 - data = {}
112 - year = datetime.datetime.now().year + 1
113 - for x in xrange(2001, year):
114 - if init_value == 'set':
115 - data[str(x)] = set()
116 - else:
117 - data[str(x)] = init_value
118 - return data
119102
120 -
121 -def add_months_to_datacontainer(datacontainer):
122 - for dc in datacontainer:
123 - datacontainer[dc] = {}
124 - for x in xrange(1, 13):
125 - datacontainer[dc][str(x)] = 0
126 - return datacontainer
127 -
128 -
129103 def determine_edits_by_month(edits):
130 - datacontainer = create_datacontainer(init_value=0)
131 - datacontainer = add_months_to_datacontainer(datacontainer)
 104+ datacontainer = shaper.create_datacontainer(init_value=0)
 105+ datacontainer = shaper.add_months_to_datacontainer(datacontainer)
132106 for year in edits:
133 - months = set()
134107 for edit in edits[year]:
135108 m = str(edit['date'].month)
136 - if m not in months:
137 - datacontainer[year][m] = 1
138 - months.add(m)
139 - if len(months) == 12:
140 - break
 109+ datacontainer[year][m] += 1
141110 return datacontainer
142111
143112
@@ -144,7 +113,7 @@
145114 '''
146115 This function counts the number of edits by year made by a particular editor.
147116 '''
148 - edits = create_datacontainer()
 117+ edits = shaper.create_datacontainer()
149118 for date in dates:
150119 year = str(date['date'].year)
151120 edits[year] += 1
@@ -156,7 +125,7 @@
157126 This function counts the number of unique articles by year edited by a
158127 particular editor.
159128 '''
160 - articles = create_datacontainer('set')
 129+ articles = shaper.create_datacontainer('set')
161130 for date in dates:
162131 year = str(date['date'].year)
163132 articles[year].add(date['article'])
@@ -179,8 +148,8 @@
180149 # definition = kwargs.pop('definition')
181150
182151
183 -def run_optimize_editors(dbname):
184 - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 152+def run_optimize_editors(dbname, collection):
 153+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, collection)
185154 kwargs = {'definition': 'traditional',
186155 'pbar': True,
187156 }
@@ -190,7 +159,7 @@
191160 consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
192161
193162 for id in ids:
194 - tasks.put(Editor(dbname, id))
 163+ tasks.put(Editor(dbname, collection, id))
195164 for x in xrange(settings.number_of_processes):
196165 tasks.put(None)
197166
@@ -212,4 +181,4 @@
213182
214183 if __name__ == '__main__':
215184 #debug_optimize_editors('test')
216 - run_optimize_editors('enwiki')
 185+ run_optimize_editors('enwiki', 'test')
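Note the semantic change in determine_edits_by_month(): the old code only flagged whether a month had any activity (0 or 1), while the new code tallies every edit. The new behaviour in miniature:

import datetime

edits = [{'date': datetime.datetime(2010, 11, 1)},
         {'date': datetime.datetime(2010, 11, 25)}]
counts = dict((str(m), 0) for m in xrange(1, 13))
for edit in edits:
    counts[str(edit['date'].month)] += 1   # previously capped at 1 per month
print counts['11']   # 2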
Index: trunk/tools/editor_trends/etl/loader.py
@@ -71,7 +71,7 @@
7272 utils.store_object(editors, settings.binary_location, 'editors')
7373
7474
75 -def mergesort_external_launcher(dbname, input, intermediate_output, output):
 75+def mergesort_external_launcher(dbname, input, output):
7676 files = utils.retrieve_file_list(input, 'txt', mask='')
7777 x = 0
7878 maxval = 99999
@@ -80,11 +80,11 @@
8181 maxval = round(len(files) / x)
8282 chunks = utils.split_list(files, int(x))
8383 '''1st iteration external mergesort'''
84 - if len(chunks) < 2:
85 - intermediate_output = output
 84+ to_remove = []
8685 for chunk in chunks:
8786 filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
88 - filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk)
 87+ filename = sort.merge_sorted_files(output, filehandles, chunk)
 88+ to_remove.append(filename)
8989 filehandles = [fh.close() for fh in filehandles]
9090 '''2nd iteration external mergesort, if necessary'''
9191 if len(chunks) > 1:
@@ -93,6 +93,9 @@
9494 filename = sort.merge_sorted_files(output, filehandles, 'final')
9595 filehandles = [fh.close() for fh in filehandles]
9696 filename = 'merged_final.txt'
 97+ for r in to_remove:
 98+ utils.delete_file(output, r)
 99+
97100
98101
99102
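The first mergesort pass now writes its intermediate chunk files straight to the output directory and removes them once the final pass has run. The merge step itself is a standard k-way merge of pre-sorted files; a stand-in sketch using heapq (not the actual utils.sort implementation):

import heapq
from StringIO import StringIO

chunks = [StringIO('a\nc\n'), StringIO('b\nd\n')]   # pre-sorted inputs
merged = StringIO()
for line in heapq.merge(*chunks):   # streams the merge in constant memory
    merged.write(line)
print merged.getvalue().split()   # ['a', 'b', 'c', 'd']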
Index: trunk/tools/editor_trends/experience/map.py
@@ -0,0 +1,122 @@
 2+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 3+__author__email = 'dvanliere at gmail dot com'
 4+__date__ = '2010-11-22'
 5+__version__ = '0.1'
 6+
 7+import xml.etree.cElementTree as cElementTree
 8+import cStringIO
 9+import os
 10+import sys
 11+import codecs
 12+import multiprocessing
 13+sys.path.append('..')
 14+import cProfile
 15+
 16+import configuration
 17+settings = configuration.Settings()
 18+
 19+from etl import extract
 20+from utils import models
 21+from wikitree import xml
 22+from utils import utils
 23+from etl import chunker
 24+
 25+
 26+def extract_article_talk_pages(page, output, **kwargs):
 27+ tags = {'title': xml.extract_text,
 28+ 'id': xml.extract_text,
 29+ }
 30+ headers = ['id', 'title']
 31+ vars = {}
 32+ elements = page.getchildren()
 33+ for tag, function in tags.iteritems():
 34+ xml_node = xml.retrieve_xml_node(elements, tag)
 35+ vars[tag] = function(xml_node, kwargs)
 36+
 37+ data = []
 38+ for head in headers:
 39+ data.append(vars[head])
 40+ utils.write_list_to_csv(data, output)
 41+
 42+
 43+def map_article_talk_ids(language_code):
 44+ ns = chunker.load_namespace(language_code)
 45+ talk_ns = ns['1'].get(u'*', None)
 46+ input = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk')
 47+ files = utils.retrieve_file_list(input, 'txt')
 48+ articles = {}
 49+ talks = {}
 50+ for file in files:
 51+ fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
 52+ for line in fh:
 53+ line = line.replace('\n', '')
 54+ id, article = line.split('\t')
 55+ if not article.startswith(talk_ns):
 56+ articles[article] = {}
 57+ articles[article]['id'] = id
 58+ else:
 59+ talks[article] = id
 60+ fh.close()
 61+ utils.store_object(articles, settings.binary_location, 'articles.bin')
 62+ utils.store_object(talks, settings.binary_location, 'talks.bin')
 63+
 64+ for article in articles:
 65+ talk = '%s:%s' % (talk_ns, article)
 66+ if talk in talks:
 67+ articles[article]['talk_id'] = talks[talk]
 68+
 69+ utils.store_object(articles, settings.binary_location, 'articles_talks.bin')
 70+
 71+
 72+def article_to_talk_launcher(**kwargs):
 73+ file = 'dewiki-latest-stub-meta-current.xml'#'enwiki-20100916-stub-meta-history.xml'
 74+ include = [0, 1]
 75+ language_code = 'en'
 76+ project = 'wiki'
 77+ input = os.path.join(settings.input_location, 'en', 'wiki')
 78+ output = os.path.join(settings.input_location, 'en', 'wiki', 'chunks')
 79+ chunker.split_file(input, file, project, language_code, include, format='xml', zip=True)
 80+ files = utils.retrieve_file_list(output, 'xml')
 81+
 82+
 83+ tasks = multiprocessing.JoinableQueue()
 84+ consumers = [extract.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
 85+ input = output
 86+ output = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk')
 87+ for file in files:
 88+ tasks.put(extract.XMLFile(input, output, file, [], extract_article_talk_pages, destination='file'))
 89+ for x in xrange(settings.number_of_processes):
 90+ tasks.put(None)
 91+
 92+ print tasks.qsize()
 93+ for w in consumers:
 94+ w.start()
 95+
 96+ tasks.join()
 97+
 98+
 99+def debug_map_article_talk_ids():
 100+ map_article_talk_ids('de')
 101+
 102+
 103+def debug_article_to_talk():
 104+ input = os.path.join(settings.input_location, 'en', 'wiki', 'chunks', '0.xml')
 105+ output = os.path.join(settings.input_location, 'en', 'wiki', 'txt', 'test.txt')
 106+ f = codecs.open(output, 'w', encoding=settings.encoding)
 107+ fh = open(input, 'r')
 108+ data = xml.read_input(fh)
 109+ for raw_data in data:
 110+ xml_buffer = cStringIO.StringIO()
 111+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 112+ raw_data = ''.join(raw_data)
 113+ xml_buffer.write(raw_data)
 114+ elem = cElementTree.XML(xml_buffer.getvalue())
 115+ extract_article_talk_pages(elem, f)
 116+ f.close()
 117+
 118+
 119+if __name__ == '__main__':
 120+ #cProfile.run('article_to_talk_launcher()')
 121+ #debug_article_to_talk()
 122+ debug_map_article_talk_ids()
 123+ #article_to_talk_launcher()
Property changes on: trunk/tools/editor_trends/experience/map.py
___________________________________________________________________
Added: svn:eol-style
1124 + native
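map_article_talk_ids() joins articles to their talk pages purely by title: a page whose title is '<talk namespace>:<article title>' is that article's talk page. The core of the join, with invented sample data:

talk_ns = 'Talk'   # ns['1'][u'*'] resolves to 'Talk' on English projects
articles = {'Anarchism': {'id': '12'}}
talks = {'Talk:Anarchism': '50'}

for article in articles:
    talk = '%s:%s' % (talk_ns, article)
    if talk in talks:
        articles[article]['talk_id'] = talks[talk]
print articles   # {'Anarchism': {'id': '12', 'talk_id': '50'}}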
Index: trunk/tools/editor_trends/experience/__init__.py
Property changes on: trunk/tools/editor_trends/experience/__init__.py
___________________________________________________________________
Added: svn:eol-style
2125 + native
Index: trunk/tools/editor_trends/configuration.py
@@ -27,6 +27,7 @@
2828 import os
2929 import sys
3030 import platform
 31+import subprocess
3132
3233 try:
3334 from _winreg import *
@@ -138,14 +139,17 @@
139140 return QueryValueEx(key, 'Path')[0]
140141 except WindowsError:
141142 return None
142 -
143 -
 143+
 144+ def detect_linux_program(self, program):
 145+ path = subprocess.Popen(['which', program], stdout=subprocess.PIPE).communicate()[0]
 146+ return path.replace('\n', '')
 147+
144148 def detect_installed_program(self, program):
145149 if self.platform == 'Windows':
146150 path = self.detect_windows_program(program)
147 - return path
148 - else:
149 - raise NotImplementedError
 151+ elif self.platform == 'Linux':
 152+ path = self.detect_linux_program(program)
 153+ return path
150154
151155 def determine_max_filehandles_open(self):
152156 if self.platform == 'Windows' and self.architecture == 'i386':
@@ -167,7 +171,7 @@
168172 def determine_ziptool(self):
169173 tools = {'OSX': None,
170174 'Windows': '7z.exe',
171 - 'Linux': None}
 175+ 'Linux': 'unzip'}
172176 return tools[self.platform]
173177
174178 def set_file_locations(self):
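detect_linux_program() resolves a binary by shelling out to which(1) and stripping the trailing newline; a standalone equivalent of the method added above:

import subprocess

def detect_linux_program(program):
    path = subprocess.Popen(['which', program],
                            stdout=subprocess.PIPE).communicate()[0]
    return path.replace('\n', '')

print detect_linux_program('ls')   # e.g. /bin/ls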
Index: trunk/tools/editor_trends/utils/utils.py
@@ -143,16 +143,13 @@
144144
145145
146146 # read / write data related functions
147 -def read_data_from_csv(filename, encoding):
 147+def read_data_from_csv(location, filename, encoding):
148148 '''
149149 @filename is the path (either absolute or relative) including the name of
150150 of the file
151151 @encoding is usually utf-8
152152 '''
153 - if hasattr(filename, '__call__'):
154 - filename = construct_filename(filename)
155 -
156 - fh = open_txt_file(filename, 'r', encoding=encoding)
 153+ fh = create_txt_filehandle(location, filename, 'r', encoding)
157154 for line in fh:
158155 yield line
159156
@@ -220,6 +217,7 @@
221218
222219 def write_dict_to_csv(data, fh, write_key=True, newline=True):
223220 keys = data.keys()
 221+ keys.sort()
224222 for key in keys:
225223 if write_key:
226224 fh.write('%s' % key)
@@ -317,12 +315,12 @@
318316 return dict([[v, k] for k, v in dictionary.items()])
319317
320318
321 -def create_dict_from_csv_file(filename, encoding):
 319+def create_dict_from_csv_file(location, filename, encoding):
322320 '''
323321 Constructs a dictionary from a txtfile
324322 '''
325323 d = {}
326 - for line in read_data_from_csv(filename, encoding):
 324+ for line in read_data_from_csv(location, filename, encoding):
327325 line = clean_string(line)
328326 value, key = line.split('\t')
329327 d[key] = value
@@ -375,12 +373,13 @@
376374 raise exceptions.PlatformNotSupportedError
377375
378376
379 -def zip_extract(path, location, source):
 377+def zip_extract(location, source):
380378 '''
381379 @path is the absolute path to the zip program
382380 @location is the directory where to store the compressed file
383381 @source is the name of the zipfile
384382 '''
 383+ path = settings.path_ziptool
385384 if settings.platform == 'Windows':
386385 p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait()
387386 elif settings.platform == 'Linux':
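create_dict_from_csv_file() now takes the location and filename separately but still expects tab-separated 'value<TAB>key' lines. A self-contained mirror of its parsing loop (sample data invented):

from StringIO import StringIO

def dict_from_lines(lines):
    d = {}
    for line in lines:
        value, key = line.strip().split('\t')
        d[key] = value
    return d

print dict_from_lines(StringIO('12345\tExampleBot\n'))
# {'ExampleBot': '12345'}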
Index: trunk/tools/editor_trends/tests/mongodb/store.py
@@ -0,0 +1,28 @@
 2+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 3+__author__email = 'dvanliere at gmail dot com'
 4+__date__ = '2010-11-09'
 5+__version__ = '0.1'
 6+
 7+import datetime
 8+import calendar
 9+import time
 10+from database import db
 11+
 12+
 13+def test_date():
 14+
 15+ mongo = db.init_mongo_db('unit_test')
 16+ collection = mongo['foo']
 17+ d1 = datetime.datetime(2007, 1, 1)
 18+ d2 = datetime.datetime(2006, 12, 31)
 19+
 20+ if d1.utcoffset() is not None:
 21+ d1 = d1 - d1.utcoffset()
 22+ millis = int(calendar.timegm(d1.timetuple()) * 1000 + d1.microsecond / 1000)
 23+ millis = millis / 1000
 24+ d3 = time.gmtime(millis)
 25+ #d3 = datetime.date(2007, 1, 1)
 26+ collection.insert({'date': d1})
 27+ collection.insert({'date': d2})
 28+ #collection.insert({'date': d3})
 29+
Property changes on: trunk/tools/editor_trends/tests/mongodb/store.py
___________________________________________________________________
Added: svn:eol-style
130 + native
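test_date() exercises the same conversion BSON applies when a datetime is stored: milliseconds since the Unix epoch. The round trip in isolation:

import datetime
import calendar
import time

d1 = datetime.datetime(2007, 1, 1)
millis = int(calendar.timegm(d1.timetuple()) * 1000 + d1.microsecond / 1000)
print time.gmtime(millis / 1000)[:6]   # (2007, 1, 1, 0, 0, 0)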
Index: trunk/tools/editor_trends/tests/mongodb/__init__.py
Property changes on: trunk/tools/editor_trends/tests/mongodb/__init__.py
___________________________________________________________________
Added: svn:eol-style
231 + native
Index: trunk/tools/editor_trends/tests/__init__.py
Property changes on: trunk/tools/editor_trends/tests/__init__.py
___________________________________________________________________
Added: svn:eol-style
332 + native
Index: trunk/tools/editor_trends/tests/test.py
@@ -0,0 +1,6 @@
 2+import configuration
 3+settings = configuration.Settings()
 4+
 5+from tests.mongodb import store
 6+
 7+store.test_date()
\ No newline at end of file
Property changes on: trunk/tools/editor_trends/logs
___________________________________________________________________
Added: svn:ignore
18 + *.bin
split_xml
Index: trunk/tools/editor_trends/bots/bots.py
@@ -0,0 +1,141 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+
 18+import os
 19+import cStringIO
 20+import xml.etree.cElementTree as cElementTree
 21+import sys
 22+sys.path.append('..')
 23+
 24+import configuration
 25+settings = configuration.Settings()
 26+from wikitree import xml
 27+from database import db
 28+from database import db_settings
 29+from utils import utils
 30+from utils import process_constructor as pc
 31+
 32+try:
 33+ import psyco
 34+ psyco.full()
 35+except ImportError:
 36+ pass
 37+
 38+
 39+def read_bots_csv_file(location, filename, encoding):
 40+ '''
 41+ Constructs a dictionary:
 42+ key is language
 43+ value is a list of bot names
 44+ '''
 45+ d = {}
 46+ for line in utils.read_data_from_csv(location, filename, encoding):
 47+ line = utils.clean_string(line)
 48+ language, bots = line.split(',')
 49+ bots = bots.split('|')
 50+ for bot in bots:
 51+ if bot not in d:
 52+ d[bot] = {}
 53+ d[bot]['id'] = None
 54+ d[bot]['languages'] = []
 55+ d[bot]['languages'].append(language)
 56+ return d
 57+
 58+
 59+def store_bots():
 60+ bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding)
 61+ mongo = db.init_mongo_db('bots')
 62+ collection = mongo['ids']
 63+ db.remove_documents_from_mongo_db(collection, None)
 64+ for name, bot in bots.iteritems():
 65+ collection.insert({'id': bot['id'], 'name': name, 'languages': bot['languages']})
 66+
 67+ print 'Stored %s bots' % collection.count()
 68+
 69+
 70+def lookup_bot_userid(input_queue, language_code, project, bots):
 71+ '''
 72+ This function is used to find the id's belonging to the different bots that
 73+ are patrolling the Wikipedia sites.
 74+ @input_queue contains a list of xml files to parse
 75+ @bots is a dictionary containing the names of the bots to lookup
 76+ '''
 77+ if settings.debug:
 78+ messages = {}
 79+
 80+ location = os.path.join(settings.input_location, language_code, project, 'chunks')
 81+ fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding)
 82+
 83+ while True:
 84+ file = input_queue.get(block=False)
 85+ if file == None:
 86+ break
 87+ data = xml.read_input(utils.create_txt_filehandle(location,
 88+ file,
 89+ 'r',
 90+ settings.encoding))
 91+
 92+ for raw_data in data:
 93+ xml_buffer = cStringIO.StringIO()
 94+ raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 95+ raw_data = ''.join(raw_data)
 96+ raw_data = raw_data.encode('utf-8')
 97+ xml_buffer.write(raw_data)
 98+
 99+ try:
 100+ xml_nodes = cElementTree.XML(xml_buffer.getvalue())
 101+ revisions = xml_nodes.findall('revision')
 102+ for revision in revisions:
 103+ contributor = xml.retrieve_xml_node(revision, 'contributor')
 104+ username = contributor.find('username')
 105+ if username == None:
 106+ continue
 107+ username = xml.extract_text(username, None)
 108+ #print username.encode('utf-8')
 109+ if username in bots:
 110+ id = contributor.find('id')
 111+ id = xml.extract_text(id, None)
 112+ #print username.encode('utf-8'), id
 113+ bot = bots[username]
 114+ bot['_username'] = username
 115+ bot['id'] = id
 116+ utils.write_dict_to_csv(bot, fh, write_key=False)
 117+ bots.pop(username)
 118+ if bots == {}:
 119+ print 'Found id numbers for all bots.'
 120+ return
 121+
 122+ except Exception, error:
 123+ print error
 124+ if settings.debug:
 125+ messages = utils.track_errors(xml_buffer, error, file,
 126+ messages)
 127+ fh.close()
 128+
 129+ if settings.debug:
 130+ utils.report_error_messages(messages, lookup_bot_userid)
 131+
 132+def bot_launcher(language_code, project):
 133+ bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding)
 134+ files = utils.retrieve_file_list(os.path.join(settings.input_location, language_code, project, 'chunks'), 'xml', mask=None)
 135+ input_queue = pc.load_queue(files, poison_pill=True)
 136+ lookup_bot_userid(input_queue, language_code, project, bots)
 137+
 138+
 139+if __name__ == '__main__':
 140+ language_code = 'en'
 141+ project = 'wiki'
 142+ bot_launcher(language_code, project)
Property changes on: trunk/tools/editor_trends/bots/bots.py
___________________________________________________________________
Added: svn:eol-style
1143 + native
Added: svn:mime-type
2144 + text/plain
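read_bots_csv_file() expects Bots.csv rows of the form 'language,name1|name2' and builds one record per bot, accumulating the languages a bot is active in. Its core logic with invented sample rows:

from StringIO import StringIO

d = {}
for line in StringIO('en,RamBot|SmackBot\nde,RamBot\n'):
    language, bots = line.strip().split(',')
    for bot in bots.split('|'):
        if bot not in d:
            d[bot] = {'id': None, 'languages': []}
        d[bot]['languages'].append(language)
print d['RamBot']   # {'id': None, 'languages': ['en', 'de']}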
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
3145 - wikistats
zips
notes.txt
*.pyc
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
4146 + wikistats
zips
notes.txt
*.pyc
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment

Status & tagging log