r77690 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r77689 | r77690 | r77691
Date: 23:38, 3 December 2010
Author: diederik
Status: deferred
Tags:
Comment:
* 0.1 release
Modified paths:
  • /trunk/tools/editor_trends/bots/bots.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extract.py (modified) (history)
  • /trunk/tools/editor_trends/etl/models.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/exceptions.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/xml.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/manage.py
@@ -19,6 +19,7 @@

 import os
 import logging
+import logging.handlers
 import sys
 import datetime
 from argparse import ArgumentParser
@@ -36,6 +37,7 @@
 from utils import dump_downloader
 from utils import compression
 from utils import ordered_dict
+from utils import exceptions
 from database import db
 from etl import chunker
 from etl import extract
@@ -104,11 +106,18 @@
     logger.debug('Starting %s task' % function.func_name)
     if message:
         logger.debug(message)
+
+    max_length = max([len(kw) for kw in kwargs])
+    #max_tab = max_length / 4
     for kw in kwargs:
         if verb:
             logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw]))
         else:
-            logger.debug('Key: %s\tSetting: %s' % (kw, kwargs[kw]))
+            tabs = (max_length - len(kw)) / 4
+            if tabs == 0:
+                tabs = 1
+            tabs = ''.join(['\t' for t in xrange(tabs)])
+            logger.debug('\tKey: %s%sSetting: %s' % (kw, tabs, kwargs[kw]))



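The new padding computes a per-key tab count from the longest keyword so the Setting column lines up in the log. A standalone sketch of the same idea (the kwargs values are invented; str.ljust with spaces avoids depending on the console's tab width):

    kwargs = {'location': '/data/en/wiki', 'project': 'wiki', 'language_code': 'en'}
    max_length = max([len(kw) for kw in kwargs])
    for kw in kwargs:
        # pad every key to the same width, then print the aligned columns
        print 'Key: %s  Setting: %s' % (kw.ljust(max_length), kwargs[kw])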
@@ -128,6 +137,7 @@
     location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location
     project = get_project(args)
     language_code = get_language(args)
+    config['format'] = get_value(args, 'format')
     config['language_code'] = language_code
     config['language'] = get_value(args, 'language')
     config['location'] = os.path.join(location, language_code, project)
@@ -191,7 +201,7 @@
     language = kwargs.pop('language')
     language_code = kwargs.pop('language_code')
     namespaces = kwargs.pop('namespaces')
-
+    format = kwargs.pop('format')
     ext = utils.determine_file_extension(filename)
     file = filename.replace('.' + ext, '')
     result = utils.check_file_exists(location, file)
@@ -202,11 +212,12 @@
     if retcode != 0:
         sys.exit(retcode)

-    chunker.split_file(location, file, project, language_code, namespaces, format='xml', zip=False)
+    chunker.split_file(location, file, project, language_code, namespaces, format=format, zip=False)
     timer.elapsed()


 def launch_zip_extractor(args, logger, location, file):
+    print 'Unzipping zip file'
     timer = Timer()
     write_message_to_log(logger, args, location=location, file=file)
     compressor = compression.Compressor(location, file)
@@ -215,56 +226,60 @@


 def extract_launcher(args, logger, **kwargs):
+    print 'Extracting data from XML'
     timer = Timer()
-    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     language_code = kwargs.pop('language_code')
     project = kwargs.pop('project')
+    write_message_to_log(logger, args, location=location, language_code=language_code, project=project)
     extract.run_parse_editors(location, **kwargs)
     timer.elapsed()


 def sort_launcher(args, logger, **kwargs):
+    print 'Start sorting data'
     timer = Timer()
-    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     input = os.path.join(location, 'txt')
     output = os.path.join(location, 'sorted')
     final_output = os.path.join(location, 'dbready')
+    write_message_to_log(logger, args, location=location, input=input, output=output, final_output=final_output)
     loader.mergesort_launcher(input, output)
     loader.mergesort_external_launcher(output, final_output)
     timer.elapsed()


 def store_launcher(args, logger, **kwargs):
+    print 'Start storing data in MongoDB'
     timer = Timer()
-    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     input = os.path.join(location, 'dbready')
     dbname = kwargs.pop('full_project')
     collection = kwargs.pop('collection')
+    write_message_to_log(logger, args, verb='Storing', location=location, input=input, dbname=dbname, collection=collection)
     loader.store_editors(input, dbname, collection)
     timer.elapsed()


 def transformer_launcher(args, logger, **kwargs):
-    print 'dataset launcher'
+    print 'Start transforming dataset'
     timer = Timer()
-    write_message_to_log(logger, args, **kwargs)
     project = kwargs.pop('full_project')
     collection = kwargs.pop('collection')
+    write_message_to_log(logger, args, verb='Transforming', project=project, collection=collection)
     transformer.transform_editors_single_launcher(project, collection)
     timer.elapsed()


 def exporter_launcher(args, logger, **kwargs):
+    print 'Start exporting dataset'
     timer = Timer()
-    write_message_to_log(logger, args, **kwargs)
     collection = get_value(args, 'collection')
-    dbname = kwargs.pop('full_project')
+    dbname = kwargs.get('full_project')
     targets = get_value(args, 'datasets')
     targets = targets.split(',')
     for target in targets:
+        write_message_to_log(logger, args, verb='Exporting', target=target, dbname=dbname, collection=collection)
         exporter.dataset_launcher(dbname, collection, target)
     timer.elapsed()

@@ -274,16 +289,19 @@
     timer = Timer()
     full_project = kwargs.get('full_project', None)
     message = 'Start of building %s dataset.' % full_project
-    db.cleanup_database(full_project)
-    write_message_to_log(logger, args, message, **kwargs)
+
+    db.cleanup_database(full_project, logger)
     ignore = get_value(args, 'except')
     clean = get_value(args, 'new')
+    format = get_value(args, 'format')
+    write_message_to_log(logger, args, message=message, full_project=full_project, ignore=ignore, clean=clean)
     if clean:
         dirs = kwargs.get('directories')[1:]
         for dir in dirs:
-            write_message_to_log(logger, args, verb='Deleting', **kwargs)
-            utils.delete_file(dir, '')
-
+            write_message_to_log(logger, args, verb='Deleting', dir=dir)
+            utils.delete_file(dir, '', directory=True)
+    if format != 'xml':
+        ignore = ignore + ',extract'
     functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'),
                                           (chunker_launcher, 'split'),
                                           (extract_launcher, 'extract'),
@@ -328,8 +346,9 @@
     version = sys.version_info[0:2]
     logger.debug('Python version: %s' % '.'.join(str(version)))
     if version < settings.minimum_python_version:
-        raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'
+        raise exceptions.OutDatedPythonVersionError

+
 def about():
     print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
     print 'Written by Diederik van Liere (dvanliere@gmail.com).'
@@ -339,9 +358,6 @@


 def main():
-    logger = logging.getLogger('manager')
-    logger.setLevel(logging.DEBUG)
-
     default_language = determine_default_language()

     datasets = {'cohort': 'generate_cohort_dataset',
@@ -397,7 +413,7 @@
                             help='Should be a list of functions that are to be ignored when executing \'all\'.',
                             default=[])

-    parser_all.add_argument('-n', '--new', action='store_false',
+    parser_all.add_argument('-n', '--new', action='store_true',
                             help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.',
                             default=False)

@@ -421,10 +437,13 @@
                         default=settings.input_location
                         )

-    parser.add_argument('-n', '--namespace', action='store',
+    parser.add_argument('-ns', '--namespace', action='store',
                         help='A list of namespaces to include for analysis.',
                         default='0')

+    parser.add_argument('-fo', '--format', action='store',
+                        help='Indicate in which format the chunks should be stored. Valid options are xml and txt.',
+                        default='txt')

    parser.add_argument('-f', '--file', action='store',
                        choices=file_choices,
@@ -444,9 +463,22 @@
     parser.add_argument('-prog', '--progress', action='store_true', default=True,
                         help='Indicate whether you want to have a progressbar.')

+    args = parser.parse_args()
+    #initialize logger
+    logger = logging.getLogger('manager')
+    logger.setLevel(logging.DEBUG)
+
+    # Add the log message handler to the logger
+    today = datetime.datetime.today()
+    log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' % (args.language, args.project, today.day, today.month, today.year))
+    handler = logging.handlers.RotatingFileHandler(log_filename, maxBytes=1024 * 1024, backupCount=3)
+
+    logger.addHandler(handler)
+    logger.debug('Default language: \t%s' % default_language)
+
+    #start manager
     detect_python_version(logger)
     about()
-    args = parser.parse_args()
     config.create_configuration(settings, args)
     locations = determine_file_locations(args, logger)
     settings.verify_environment(locations['directories'])
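The logger setup now has to run after parse_args because the log filename embeds the language and project arguments. A minimal standalone sketch of the same rotating-file configuration (the filename is illustrative):

    import logging
    import logging.handlers

    logger = logging.getLogger('manager')
    logger.setLevel(logging.DEBUG)
    # Roll the log over after ~1 MB, keeping at most three old files.
    handler = logging.handlers.RotatingFileHandler('enwiki_manager.log',
                                                   maxBytes=1024 * 1024,
                                                   backupCount=3)
    logger.addHandler(handler)
    logger.debug('logger initialized')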
Index: trunk/tools/editor_trends/wikitree/xml.py
@@ -26,7 +26,7 @@
     return utils.unescape(text)


-def extract_text(elem, kwargs):
+def extract_text(elem, **kwargs):
     if elem != None and elem.text != None:
         #try:
         return elem.text #.decode(settings.encoding)
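The signature change lets callers pass keyword arguments directly instead of threading a dict through as one positional argument; compare (hypothetical call sites):

    # before: the extra data had to be wrapped in a dict by the caller
    extract_text(elem, {'bots': bot_ids})
    # after: keywords are collected into kwargs automatically
    extract_text(elem, bots=bot_ids)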
Index: trunk/tools/editor_trends/etl/extract.py
@@ -20,13 +20,12 @@
 #Default Python libraries (Python => 2.6)
 import sys
 import os
-import time
-import datetime
-import codecs
-import math
+#import time
+#import datetime
+#import codecs
+#import math

-import re
-from operator import itemgetter
+#from operator import itemgetter

 import multiprocessing
 from Queue import Empty
@@ -43,7 +42,7 @@
 from wikitree import xml
 from bots import bots
 from etl import models
-from utils import process_constructor as pc
+#from utils import process_constructor as pc

 try:
     import psyco
@@ -51,25 +50,49 @@
 except ImportError:
     pass

+def validate_hostname(address):
+    '''
+    This is not a foolproof solution at all. The problem is that it's really hard
+    to determine whether a string is a hostname or not **reliably**. This is a
+    very fast rule of thumb. Will lead to false positives, but that's life :)
+    '''
+    parts = address.split(".")
+    if len(parts) > 2:
+        return True
+    else:
+        return False

+def validate_ip(address):
+    parts = address.split(".")
+    if len(parts) != 4:
+        return False
+    parts = parts[:3]
+    for item in parts:
+        try:
+            if not 0 <= int(item) <= 255:
+                return False
+        except ValueError:
+            return False
+    return True


-def determine_username_is_bot(contributor, bots):
+def determine_username_is_bot(contributor, **kwargs):
     '''
     #contributor is an xml element containing the id of the contributor
     @bots should have a dict with all the bot ids and bot names
     @Return False if username id is not in bot dict id or True if username id
     is a bot id.
     '''
+    bots = kwargs.get('bots')
     for elem in contributor:
         if elem.tag == 'id':
-            if elem.text in bots['bots']:
+            if elem.text in bots:
                 return 1
             else:
                 return 0


-def extract_username(contributor, kwargs):
+def extract_username(contributor, **kwargs):
     for elem in contributor:
         if elem.tag == 'username':
             return elem.text
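Both validators are deliberately cheap heuristics for separating anonymous IP/hostname edits from strings that are really usernames. A few illustrative calls (values invented):

    validate_ip('145.97.39.155')            # True: four dot-separated parts, octets in range
    validate_ip('editor.example.org')       # False: not four parts
    validate_hostname('editor.example.org') # True: more than two dot-separated parts
    validate_hostname('JohnDoe')            # False: no dots, so probably a username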
@@ -77,41 +100,44 @@
     return None


-def extract_contributor_id(contributor, kwargs):
+def extract_contributor_id(contributor, **kwargs):
     '''
     @contributor is the xml contributor node containing a number of attributes

     Currently, we are only interested in registered contributors, hence we
     ignore anonymous editors.
     '''
-    if contributor.get('deleted'):
-        return - 1 # ASK: Not sure if this is the best way to code deleted contributors.
+    #if contributor.get('deleted'):
+    #    return None # ASK: Not sure if this is the best way to code deleted contributors.
     for elem in contributor:
-        if elem.tag == 'id':
-            if elem.text != None:
-                return elem.text
+        if elem.tag == 'id' and elem.text != None:
+            return {'id':elem.text}
+
+        elif elem.tag == 'ip' and elem.text != None:
+            if validate_ip(elem.text) == False and validate_hostname(elem.text) == False:
+                return {'username':elem.text, 'id': elem.text}
             else:
-                return - 1
+                return None
+    return None

-
 def output_editor_information(elem, fh, **kwargs):
     '''
     @elem is an XML element containing 1 revision from a page
-    @output is where to store the data, either a queue or a filehandle
+    @output is where to store the data, a filehandle
     @**kwargs contains extra information

     the variable tags determines which attributes are being parsed, the values in
     this dictionary are the functions used to extract the data.
     '''
-    tags = {'contributor': {'editor': extract_contributor_id,
+    tags = {'contributor': {'id': extract_contributor_id,
                             'bot': determine_username_is_bot,
                             'username': extract_username,
                             },
             'timestamp': {'date': xml.extract_text},
             }
     vars = {}
-    headers = ['editor', 'date', 'article', 'username']
-    #destination = kwargs.pop('destination')
+    #counter = kwargs.pop('counter')
+    headers = ['id', 'date', 'article', 'username']
     revisions = elem.findall('revision')
     for revision in revisions:
         vars['article'] = elem.find('id').text.decode(settings.encoding)
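extract_contributor_id now returns a small dict (or None) instead of an id string or -1, so the caller can merge whichever keys come back. The mapping, with made-up element contents:

    # <contributor><id>12345</id></contributor>         -> {'id': '12345'}
    # <contributor><ip>WikiFan99</ip></contributor>     -> {'username': 'WikiFan99', 'id': 'WikiFan99'}
    #   (text that is neither a valid IP nor a hostname is treated as a username)
    # <contributor><ip>145.97.39.155</ip></contributor> -> None (anonymous editor, ignored)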
@@ -119,18 +145,27 @@
         for tag, functions in tags.iteritems():
             xml_node = xml.retrieve_xml_node(elements, tag)
             for var, function in functions.iteritems():
-                vars[var] = function(xml_node, kwargs)
+                value = function(xml_node, **kwargs)
+                if type(value) == type({}):
+                    for kw in value:
+                        vars[kw] = value[kw]
+                    #if vars['username'] not in counter:
+                    #    counter['username'] = c
+                    #    c += 1
+                    #vars['id'] = counter[vars['username']]
+                else:
+                    vars[var] = value

         #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
-        if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
+        if vars['bot'] != 1 and vars['id'] != None:
             vars.pop('bot')
             data = []
             for head in headers:
                 data.append(vars[head])
             utils.write_list_to_csv(data, fh)
         vars = {}
+    #return counter, c

-
 def run_parse_editors(location, **kwargs):
     bot_ids = bots.retrieve_bots()
     input = os.path.join(location, 'chunks')
@@ -157,7 +192,7 @@
     bot_ids = bots.retrieve_bots()
     input = os.path.join(location, 'chunks')
     output = os.path.join(location, 'txt')
-    xml_file = models.XMLFile(input, output, '1.xml', bot_ids, output_editor_information)
+    xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information)
     xml_file()

 if __name__ == '__main__':
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -30,10 +30,14 @@

 sys.path.append('..')
 import configuration
+settings = configuration.Settings()
+
 from utils import utils
+import extract
 from wikitree import xml
-settings = configuration.Settings()
+from bots import bots

+
 try:
     import psyco
     psyco.full()
@@ -114,12 +118,12 @@
     return True


-def write_xml_file(element, fh, output, counter):
+def write_xml_file(element, fh, output, counter, format):
     '''Get file handle and write xml element to file'''
     try:
         xml_string = cElementTree.tostring(element)
         size = len(xml_string)
-        fh, counter, new_file = create_file_handle(fh, output, counter, size)
+        fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
         fh.write(xml_string)
     except MemoryError:
         print 'Add error capturing logic'
@@ -134,7 +138,7 @@
     return fh, counter, new_file


-def create_file_handle(fh, output, counter, size):
+def create_file_handle(fh, output, counter, size, format):
     '''
     @fh is file handle, if none is supplied or if file size > max file size then
     create a new file handle
@@ -144,56 +148,79 @@
     '''
     if not fh:
         counter = 0
-        path = os.path.join(output, '%s.xml' % counter)
+        path = os.path.join(output, '%s.%s' % (counter, format))
         fh = codecs.open(path, 'w', encoding=settings.encoding)
         return fh, counter, False
     elif (fh.tell() + size) > settings.max_xmlfile_size:
         print 'Created chunk %s' % (counter + 1)
         fh.close
         counter += 1
-        path = os.path.join(output, '%s.xml' % counter)
+        path = os.path.join(output, '%s.%s' % (counter, format))
         fh = codecs.open(path, 'w', encoding=settings.encoding)
         return fh, counter, True
     else:
         return fh, counter, False


-def flatten_xml_elements(data, page):
+def flatten_xml_elements(data, page, bots):
+    headers = ['id', 'date', 'article', 'username']
+    tags = {'contributor': {'id': extract.extract_contributor_id,
+                            'bot': extract.determine_username_is_bot,
+                            'username': extract.extract_username,
+                            },
+            'timestamp': {'date': xml.extract_text},
+            }
+    vars = {}
     flat = []
+
     for x, elems in enumerate(data):
-        flat.append([page])
-        for elem in elems:
-            if elem.tag != 'id':
-                if len(elem.getchildren()) > 0:
-                    for el in elem.getchildren():
-                        flat[x].append(xml.extract_text(elem, None))
+        vars[x] = {}
+        vars[x]['article'] = page
+        for tag in tags:
+            el = xml.retrieve_xml_node(elems, tag)
+            for function in tags[tag].keys():
+                f = tags[tag][function]
+                value = f(el, bots=bots)
+                if type(value) == type({}):
+                    for kw in value:
+                        vars[x][kw] = value[kw]
                 else:
-                    flat[x].append(xml.extract_text(elem, None))
+                    vars[x][function] = value
+
+    for x, var in enumerate(vars):
+        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
+            continue
+        else:
+            f = []
+            for head in headers:
+                f.append(vars[x][head])
+            flat.append(f)
+
     return flat


 def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False):
     '''
     Reads xml file and splits it in N chunks
-
     @namespaces is a list indicating which namespaces should be included, default
     is to include namespace 0 (main namespace)
     @zip indicates whether to compress the chunk or not
     '''
-    #location = os.path.join(settings.input_location, language)
     input = os.path.join(location, file)
-    output = os.path.join(location, 'chunks')
-    settings.verify_environment([output])
     if format == 'xml':
-        fh = None
+        output = os.path.join(location, 'chunks')
     else:
-        f = input.replace('.xml', '')
-        fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding)
+        output = os.path.join(location, 'txt')
+        bot_ids = bots.retrieve_bots()
+    settings.verify_environment([output])

+    fh = None
+    counter = 0
+
     ns = load_namespace(language_code)
     ns = build_namespaces_locale(ns, namespaces)
+    #settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'

-    counter = 0
     tag = '{%s}page' % settings.xml_namespace
     context = cElementTree.iterparse(input, events=('start', 'end'))
     context = iter(context)
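create_file_handle now names chunks 0.txt, 1.txt, ... (or .xml) and rolls over to a new handle once the current file would exceed settings.max_xmlfile_size. A condensed sketch of the intended calling pattern (path and payloads invented):

    fh, counter = None, 0
    for chunk in ['<page>...</page>', '<page>...</page>']:
        fh, counter, new_file = create_file_handle(fh, '/tmp/chunks', counter,
                                                   len(chunk), 'xml')
        fh.write(chunk)
    fh.close()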
@@ -206,16 +233,21 @@
             if is_article_main_namespace(elem, ns):
                 page = elem.find('id').text
                 elem = parse_comments(elem, remove_numeric_character_references)
+
                 if format == 'xml':
-                    fh, counter, new_file = write_xml_file(elem, fh, output, counter)
-                    if zip and new_file:
-                        file = str(counter - 1) + '.xml'
-                        utils.zip_archive(settings.path_ziptool, output, file)
-                        utils.delete_file(output, file)
+                    fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
                 else:
                     data = [el.getchildren() for el in elem if el.tag == 'revision']
-                    data = flatten_xml_elements(data, page)
-                    utils.write_list_to_csv(data, fh, recursive=False, newline=True)
+                    data = flatten_xml_elements(data, page, bot_ids)
+                    if data != None:
+                        size = 64 * len(data)
+                        fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
+                        utils.write_list_to_csv(data, fh, recursive=False, newline=True)
+
+                if zip and new_file:
+                    file = str(counter - 1) + format
+                    utils.zip_archive(settings.path_ziptool, output, file)
+                    utils.delete_file(output, file)
             root.clear()  # when done parsing a section clear the tree to save memory
         except SyntaxError:
             f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
@@ -225,8 +257,8 @@
             fh.close()

 if __name__ == "__main__":
-    kwargs = {'output': settings.input_location,
-              'input': settings.input_filename,
+    kwargs = {'location': settings.input_location,
+              'file': settings.input_filename,
               'project':'wiki',
               'language_code':'en',
               'format': 'tsv'
Index: trunk/tools/editor_trends/etl/models.py
@@ -29,18 +29,49 @@
 from utils import utils
 from wikitree import xml

+class TXTFile(object):
+
+    def __init__(self, file, location, output, output_file, target, **kwargs):
+        self.file = file
+        self.location = location
+        self.target = target
+        self.output = output
+        self.output_file = output_file
+        for kw in kwargs:
+            setattr(self, kw, kwargs[kw])
+
+    def __str__(self):
+        return '%s' % (self.file)
+
+    def __call__(self, bots):
+        self.bots = bots
+        self.fr = utils.create_txt_filehandle(self.location, self.file, 'r', settings.encoding)
+        self.fw = utils.create_txt_filehandle(self.output, self.output_file, 'w', settings.encoding)
+        for line in self.fr:
+            line = line.replace('\n', '')
+            if line == '':
+                continue
+            line = line.split('\t')
+            self.bots = self.target(line, self.fw, self.bots, self.keys)
+            if self.bots == {}:
+                break
+        self.fr.close()
+        self.fw.close()
+        return self.bots
+
 class XMLFileConsumer(models.BaseConsumer):

     def run(self):
         while True:
-            print 'Queue is %s files long...' % (self.task_queue.qsize() - settings.number_of_processes)
             new_xmlfile = self.task_queue.get()
             self.task_queue.task_done()
             if new_xmlfile == None:
                 print 'Swallowed a poison pill'
                 break
+            print 'Queue is %s files long...' % self.task_queue.qsize()
             new_xmlfile()

+
 class XMLFile(object):
     def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs):
         self.file = xml_file
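TXTFile wraps one tab-separated chunk file and feeds every parsed line to the target callable together with the output handle and the shrinking bots dict; note that self.keys is expected to arrive through **kwargs. A hypothetical invocation (paths and keys illustrative):

    job = models.TXTFile('0.txt', '/data/enwiki/txt', '/data/enwiki/csv',
                         'bots_ids.csv', lookup_bot_userid,
                         keys=['id', 'name', 'verified', 'projects'])
    remaining_bots = job(bots)  # bots maps bot username -> Bot object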
Index: trunk/tools/editor_trends/configuration.py
@@ -50,9 +50,9 @@
         self.progressbar = True
         self.encoding = 'utf-8'
         self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte
-        self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps
+        self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestamp format as generated by the MediaWiki dumps

-        self.max_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason
+        self.max_xmlfile_size = 4096 * 1024 #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason
         self.number_of_processes = cpu_count() * process_multiplier
         #Change this to match your computer's configuration (RAM / CPU)
         self.minimum_python_version = (2, 6)
@@ -70,8 +70,8 @@
         self.root = '/' if self.platform != 'Windows' else 'c:\\'
         self.file_locations = self.set_file_locations()
         self.max_filehandles = self.determine_max_filehandles_open()
+        self.tab_width = 4 if self.platform == 'Windows' else 8

-
         self.load_configuration()
         self.set_custom_settings(**kwargs)
         self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/',
Index: trunk/tools/editor_trends/utils/utils.py
@@ -31,8 +31,9 @@
 import os
 import ctypes
 import time
-import subprocess
+#import subprocess
 import sys
+import shutil
 sys.path.append('..')

 import configuration
@@ -248,9 +249,19 @@
     return name


-def delete_file(location, filename):
+def delete_file(location, filename, directory=False):
     if check_file_exists(location, filename):
-        os.remove(os.path.join(location, filename))
+        if not directory:
+            try:
+                path = os.path.join(location, filename)
+                os.remove(path)
+            except WindowsError, error:
+                print error
+        else:
+            try:
+                shutil.rmtree(location)
+            except Exception, error:
+                print error


 def check_file_exists(location, filename):
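With the new directory flag, delete_file either removes a single file or, when directory=True, ignores filename and removes the whole location tree via shutil.rmtree. Hypothetical calls (paths invented; the empty filename must still pass check_file_exists):

    delete_file('/data/enwiki/txt', '0.txt')             # remove one chunk file
    delete_file('/data/enwiki/txt', '', directory=True)  # remove the entire directory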
Index: trunk/tools/editor_trends/utils/exceptions.py
@@ -43,3 +43,10 @@

     def __str__(self):
         print 'You have not installed a program to extract %s archives.' % self.extension
+
+class OutDatedPythonVersionError(Error):
+    def __init__(self, version):
+        self.version = version
+
+    def __str__(self):
+        print 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'
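Two caveats in the class as committed: __str__ prints the message and implicitly returns None, and manage.py raises the class without the version argument that __init__ requires. A sketch of a variant avoiding both pitfalls (a suggestion, not part of this commit):

    class OutDatedPythonVersionError(Error):
        def __init__(self, version=None):
            self.version = version

        def __str__(self):
            # __str__ must return the message rather than print it
            return 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'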
Index: trunk/tools/editor_trends/database/db.py
@@ -17,18 +17,18 @@
 __date__ = '2010-10-21'
 __version__ = '0.1'

-import sqlite3 as sqlite
+#import sqlite3 as sqlite
 from pymongo import Connection


-import configuration
-settings = configuration.Settings()
-from database import db_settings
+#import configuration
+#settings = configuration.Settings()
+#from database import db_settings


-def init_mongo_db(db):
+def init_mongo_db(dbname):
     connection = Connection()
-    db = connection[db]
+    db = connection[dbname]
     return db


@@ -42,11 +42,12 @@
     return db.collection_names()


-def cleanup_database(dbname):
+def cleanup_database(dbname, logger):
     coll = get_collections(dbname)
     for c in coll:
         if not c.startswith('system'):
             drop_collection(dbname, c)
+            logger.debug('Deleting collection %s from database %s.' % (c, dbname))


 def remove_documents_from_mongo_db(collection, ids):
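cleanup_database drops every non-system collection and now logs each drop. For reference, a minimal standalone equivalent against pymongo's old Connection API (database name invented):

    import logging
    from pymongo import Connection

    logger = logging.getLogger('manager')
    dbname = 'enwiki_editors'
    db = Connection()[dbname]
    for c in db.collection_names():
        if not c.startswith('system'):
            db.drop_collection(c)
            logger.debug('Deleting collection %s from database %s.' % (c, dbname))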
Index: trunk/tools/editor_trends/bots/bots.py
@@ -31,6 +31,7 @@
 from wikitree import xml
 from database import db
 from utils import utils
+#from etl import extract
 from utils import process_constructor as pc
 from etl import models
 import models as botmodels
@@ -110,54 +111,51 @@
     keys and values to ease writing to a csv file.
     '''
     d = {}
-    for o in obj:
-        bot = obj[o]
-        d[o] = {}
-        for kw in bot.__dict__.keys():
-            if kw not in exclude:
-                d[o][kw] = getattr(bot, kw)
+    for kw in obj.__dict__.keys():
+        if kw not in exclude:
+            d[kw] = getattr(obj, kw)
     return d


-def write_bot_list_to_csv(bots):
+def write_bot_list_to_csv(bots, keys):
     fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding)
     bot_dict = convert_object_to_dict(bots, exclude=['time', 'written'])
-    keys = ['id', 'name', 'verified', 'projects']
     for bot in bot_dict:
         bot = bot_dict[bot]
         utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True)
     fh.close()


-def lookup_bot_userid(xml_nodes, bots):
+def lookup_bot_userid(data, fh, bots, keys):
     '''
     This function is used to find the id's belonging to the different bots that
     are patrolling the Wikipedia sites.
     @xml_nodes is a list of xml elements that need to be parsed
     @bots is a dictionary containing the names of the bots to lookup
     '''
-    revisions = xml_nodes.findall('revision')
-    for revision in revisions:
-        contributor = xml.retrieve_xml_node(revision, 'contributor')
-        username = contributor.find('username')
-        if username == None or username.text == None:
-            continue
-        else:
-            username = username.text #encode(settings.encoding)
-            name = username.lower()
+    username = data[3]
+    if username in bots:
+        bot = bots.pop(username)
+        setattr(bot, 'id', data[0])
+        setattr(bot, 'verified', True)
+        bot = convert_object_to_dict(bot, exclude=['time'])
+        utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True)
+    return bots

-        #print username.encode('utf-8')
-        if (username in bots and bots[username].verified == True) or name.find('bot') > -1:
-            bot = bots.get(username, botmodels.Bot(username, verified=False))
-            id = contributor.find('id').text
-            bot.id = id
-            bot.name = username
-            timestamp = revision.find('timestamp').text
-            if timestamp != None:
-                timestamp = utils.convert_timestamp_to_datetime_naive(timestamp)
-                bot.time[str(timestamp.year)].append(timestamp)

-            bots[username] = bot
+def create_bot_validation_dataset(data, fh, bots, keys):
+    username = data[3].lower()
+    #print username.encode('utf-8')
+    if username.find('bot') > -1 or username.find('script') > -1:
+        bot = bots.get(username, botmodels.Bot(username, verified=False))
+        setattr(bot, 'id', data[0])
+
+        timestamp = data[1]
+        if timestamp != None:
+            timestamp = utils.convert_timestamp_to_datetime_naive(timestamp)
+            bot.time[str(timestamp.year)].append(timestamp)
+        bots[username] = bot
+
     return bots

 #bot = bots.get('PseudoBot')
@@ -165,26 +163,36 @@
 #bot.hours_active()
 #bot.avg_lag_between_edits()


-def bot_launcher(language_code, project, single=False, manager=False):
+def bot_launcher(language_code, project, target, action, single=False, manager=False):
     '''
     This function sets the stage to launch bot id detection and collecting data
     to discover new bots.
     '''
     utils.delete_file(settings.csv_location, 'bots_ids.csv')
     location = os.path.join(settings.input_location, language_code, project)
-    input = os.path.join(location, 'chunks')
-
-    files = utils.retrieve_file_list(input, 'xml', mask=None)
+    input_xml = os.path.join(location, 'chunks')
+    input_txt = os.path.join(location, 'txt')
+    files = utils.retrieve_file_list(input_txt, 'txt', mask=None)
+    files = files[400:405]
     input_queue = pc.load_queue(files, poison_pill=True)
     tasks = multiprocessing.JoinableQueue()
     mgr = multiprocessing.Manager()
+    keys = ['id', 'name', 'verified', 'projects']
+
+    if action == 'lookup':
+        output_file = 'bots_ids.csv'
+        bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager)
+    else:
+        output_file = 'bots_predictionset.csv'
+        bots = {}
+
     #lock = mgr.Lock()
     if manager:
         manager = mgr
-    bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager)

+
     for file in files:
-        tasks.put(models.XMLFile(input, settings.csv_location, file, None, lookup_bot_userid))
+        tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))

     tracker = {}
     if single:
@@ -199,21 +207,22 @@
         bot_launcher_multi(tasks)

     utils.store_object(bots, settings.binary_location, 'bots.bin')
-    write_bot_list_to_csv(bots)
-    bot_training_dataset(bots)
-    store_bots()
-    if bots != {}:
-        print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots)
-        keys = bots.keys()
-        for key in keys:
-            try:
-                print '%s' % key.encode(settings.encoding)
-            except:
-                pass
+    if action == 'lookup':
+        store_bots()
+        if bots != {}:
+            print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots)
+            keys = bots.keys()
+            for key in keys:
+                try:
+                    print '%s' % key.encode(settings.encoding)
+                except:
+                    pass
+    else:
+        bot_training_dataset(bots)
+    #write_bot_list_to_csv(bots, keys)



-
 def bot_training_dataset(bots):
     fh = utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding)
     keys = bots.keys()
@@ -251,8 +260,9 @@
 if __name__ == '__main__':
     language_code = 'en'
     project = 'wiki'
-    store_bots()
+    #store_bots()
     #bots = debug_bots_dict()
     #write_bot_list_to_csv(bots)
-    #bot_launcher(language_code, project, single=True)
+    #language_code, project, lookup_bot_userid, single = False, manager = False
+    bot_launcher(language_code, project, create_bot_validation_dataset, action='training', single=True, manager=False)
     #cProfile.run(bot_launcher(language_code, project, single=True), 'profile')
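bot_launcher now takes a target callable plus an action string; the two supported combinations, sketched (both assume the txt chunks already exist):

    # verify known bots from Bots.csv and write their ids to bots_ids.csv
    bot_launcher('en', 'wiki', lookup_bot_userid, action='lookup', single=True)
    # harvest suspected bots (usernames containing 'bot' or 'script') into bots_predictionset.csv
    bot_launcher('en', 'wiki', create_bot_validation_dataset, action='training', single=True)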

Follow-up revisions

Revision   Commit summary                                                  Author   Date
r77723     Retagging editor_trends at 0.1/r77690, doing r77691 properly    reedy    15:35, 4 December 2010
