Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import logging |
| 23 | +import logging.handlers |
23 | 24 | import sys |
24 | 25 | import datetime |
25 | 26 | from argparse import ArgumentParser |
— | — | @@ -36,6 +37,7 @@ |
37 | 38 | from utils import dump_downloader |
38 | 39 | from utils import compression |
39 | 40 | from utils import ordered_dict |
| 41 | +from utils import exceptions |
40 | 42 | from database import db |
41 | 43 | from etl import chunker |
42 | 44 | from etl import extract |
— | — | @@ -104,11 +106,18 @@ |
105 | 107 | logger.debug('Starting %s task' % function.func_name) |
106 | 108 | if message: |
107 | 109 | logger.debug(message) |
| 110 | + |
 | 111 | + max_length = max(len(kw) for kw in kwargs) if kwargs else 0 |
108 | 113 | for kw in kwargs: |
109 | 114 | if verb: |
110 | 115 | logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw])) |
111 | 116 | else: |
112 | | - logger.debug('Key: %s\tSetting: %s' % (kw, kwargs[kw])) |
| 117 | + tabs = (max_length - len(kw)) / 4 |
| 118 | + if tabs == 0: |
| 119 | + tabs = 1 |
 | 120 | + tabs = '\t' * tabs |
| 121 | + logger.debug('\tKey: %s%sSetting: %s' % (kw, tabs, kwargs[kw])) |
113 | 122 | |
114 | 123 | |
115 | 124 | |
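Note: the padding logic above lines up the Setting column by inserting tabs proportional to the longest keyword. A standalone sketch of the same idea (Python 2; the function name and tab width are illustrative, not from the source):

    def format_kwargs(kwargs, tab_width=4):
        # Pad each key with tabs so the Setting column aligns.
        if not kwargs:
            return []
        max_length = max(len(kw) for kw in kwargs)
        lines = []
        for kw in kwargs:
            tabs = (max_length - len(kw)) / tab_width
            if tabs == 0:
                tabs = 1
            lines.append('\tKey: %s%sSetting: %s' % (kw, '\t' * tabs, kwargs[kw]))
        return lines

    for line in format_kwargs({'location': '/data', 'language_code': 'en'}):
        print line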
— | — | @@ -128,6 +137,7 @@ |
129 | 138 | location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location |
130 | 139 | project = get_project(args) |
131 | 140 | language_code = get_language(args) |
| 141 | + config['format'] = get_value(args, 'format') |
132 | 142 | config['language_code'] = language_code |
133 | 143 | config['language'] = get_value(args, 'language') |
134 | 144 | config['location'] = os.path.join(location, language_code, project) |
— | — | @@ -191,7 +201,7 @@ |
192 | 202 | language = kwargs.pop('language') |
193 | 203 | language_code = kwargs.pop('language_code') |
194 | 204 | namespaces = kwargs.pop('namespaces') |
195 | | - |
| 205 | + format = kwargs.pop('format') |
196 | 206 | ext = utils.determine_file_extension(filename) |
197 | 207 | file = filename.replace('.' + ext, '') |
198 | 208 | result = utils.check_file_exists(location, file) |
— | — | @@ -202,11 +212,12 @@ |
203 | 213 | if retcode != 0: |
204 | 214 | sys.exit(retcode) |
205 | 215 | |
206 | | - chunker.split_file(location, file, project, language_code, namespaces, format='xml', zip=False) |
| 216 | + chunker.split_file(location, file, project, language_code, namespaces, format=format, zip=False) |
207 | 217 | timer.elapsed() |
208 | 218 | |
209 | 219 | |
210 | 220 | def launch_zip_extractor(args, logger, location, file): |
 | 221 | + print 'Extracting compressed file' |
211 | 222 | timer = Timer() |
212 | 223 | write_message_to_log(logger, args, location=location, file=file) |
213 | 224 | compressor = compression.Compressor(location, file) |
— | — | @@ -215,56 +226,60 @@ |
216 | 227 | |
217 | 228 | |
218 | 229 | def extract_launcher(args, logger, **kwargs): |
| 230 | + print 'Extracting data from XML' |
219 | 231 | timer = Timer() |
220 | | - write_message_to_log(logger, args, **kwargs) |
221 | 232 | location = kwargs.pop('location') |
222 | 233 | language_code = kwargs.pop('language_code') |
223 | 234 | project = kwargs.pop('project') |
| 235 | + write_message_to_log(logger, args, location=location, language_code=language_code, project=project) |
224 | 236 | extract.run_parse_editors(location, **kwargs) |
225 | 237 | timer.elapsed() |
226 | 238 | |
227 | 239 | |
228 | 240 | def sort_launcher(args, logger, **kwargs): |
| 241 | + print 'Start sorting data' |
229 | 242 | timer = Timer() |
230 | | - write_message_to_log(logger, args, **kwargs) |
231 | 243 | location = kwargs.pop('location') |
232 | 244 | input = os.path.join(location, 'txt') |
233 | 245 | output = os.path.join(location, 'sorted') |
234 | 246 | final_output = os.path.join(location, 'dbready') |
| 247 | + write_message_to_log(logger, args, location=location, input=input, output=output, final_output=final_output) |
235 | 248 | loader.mergesort_launcher(input, output) |
236 | 249 | loader.mergesort_external_launcher(output, final_output) |
237 | 250 | timer.elapsed() |
238 | 251 | |
239 | 252 | |
240 | 253 | def store_launcher(args, logger, **kwargs): |
| 254 | + print 'Start storing data in MongoDB' |
241 | 255 | timer = Timer() |
242 | | - write_message_to_log(logger, args, **kwargs) |
243 | 256 | location = kwargs.pop('location') |
244 | 257 | input = os.path.join(location, 'dbready') |
245 | 258 | dbname = kwargs.pop('full_project') |
246 | 259 | collection = kwargs.pop('collection') |
| 260 | + write_message_to_log(logger, args, verb='Storing', location=location, input=input, dbname=dbname, collection=collection) |
247 | 261 | loader.store_editors(input, dbname, collection) |
248 | 262 | timer.elapsed() |
249 | 263 | |
250 | 264 | |
251 | 265 | def transformer_launcher(args, logger, **kwargs): |
252 | | - print 'dataset launcher' |
| 266 | + print 'Start transforming dataset' |
253 | 267 | timer = Timer() |
254 | | - write_message_to_log(logger, args, **kwargs) |
255 | 268 | project = kwargs.pop('full_project') |
256 | 269 | collection = kwargs.pop('collection') |
| 270 | + write_message_to_log(logger, args, verb='Transforming', project=project, collection=collection) |
257 | 271 | transformer.transform_editors_single_launcher(project, collection) |
258 | 272 | timer.elapsed() |
259 | 273 | |
260 | 274 | |
261 | 275 | def exporter_launcher(args, logger, **kwargs): |
| 276 | + print 'Start exporting dataset' |
262 | 277 | timer = Timer() |
263 | | - write_message_to_log(logger, args, **kwargs) |
264 | 278 | collection = get_value(args, 'collection') |
265 | | - dbname = kwargs.pop('full_project') |
| 279 | + dbname = kwargs.get('full_project') |
266 | 280 | targets = get_value(args, 'datasets') |
267 | 281 | targets = targets.split(',') |
268 | 282 | for target in targets: |
| 283 | + write_message_to_log(logger, args, verb='Exporting', target=target, dbname=dbname, collection=collection) |
269 | 284 | exporter.dataset_launcher(dbname, collection, target) |
270 | 285 | timer.elapsed() |
271 | 286 | |
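Note: the launcher hunks above all follow the same pattern: pop the keyword arguments the launcher itself consumes, then log those values explicitly instead of dumping **kwargs. A minimal sketch of the pattern (downstream is a hypothetical stand-in for the next ETL stage):

    def downstream(location, **kwargs):
        pass   # stand-in for the next ETL stage

    def example_launcher(args, logger, **kwargs):
        location = kwargs.pop('location')   # consumed by this launcher
        project = kwargs.pop('project')     # consumed by this launcher
        write_message_to_log(logger, args, location=location, project=project)
        downstream(location, **kwargs)      # remaining kwargs pass through untouched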
— | — | @@ -274,16 +289,19 @@ |
275 | 290 | timer = Timer() |
276 | 291 | full_project = kwargs.get('full_project', None) |
277 | 292 | message = 'Start of building %s dataset.' % full_project |
278 | | - db.cleanup_database(full_project) |
279 | | - write_message_to_log(logger, args, message, **kwargs) |
| 293 | + |
| 294 | + db.cleanup_database(full_project, logger) |
280 | 295 | ignore = get_value(args, 'except') |
281 | 296 | clean = get_value(args, 'new') |
| 297 | + format = get_value(args, 'format') |
| 298 | + write_message_to_log(logger, args, message=message, full_project=full_project, ignore=ignore, clean=clean) |
282 | 299 | if clean: |
283 | 300 | dirs = kwargs.get('directories')[1:] |
284 | 301 | for dir in dirs: |
285 | | - write_message_to_log(logger, args, verb='Deleting', **kwargs) |
286 | | - utils.delete_file(dir, '') |
287 | | - |
| 302 | + write_message_to_log(logger, args, verb='Deleting', dir=dir) |
| 303 | + utils.delete_file(dir, '', directory=True) |
| 304 | + if format != 'xml': |
 | 305 | + ignore = (ignore + ',extract') if ignore else 'extract' |
288 | 306 | functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'), |
289 | 307 | (chunker_launcher, 'split'), |
290 | 308 | (extract_launcher, 'extract'), |
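Note: all_launcher keeps an ordered mapping of step function -> step name and, presumably, skips any step whose name appears in the comma-separated ignore string; appending ',extract' when format != 'xml' therefore disables the separate extract step, since the txt chunker already flattens the revisions. A sketch of that dispatch (assumed behavior, not shown in this hunk):

    def run_pipeline(functions, ignore):
        # functions: ordered mapping of callable -> step name, as built above
        skip = ignore.split(',') if ignore else []
        for func in functions:
            if functions[func] not in skip:
                func()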
— | — | @@ -328,8 +346,9 @@ |
329 | 347 | version = sys.version_info[0:2] |
330 | 348 | logger.debug('Python version: %s' % '.'.join(str(version))) |
331 | 349 | if version < settings.minimum_python_version: |
332 | | - raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
 | 350 | + raise exceptions.OutDatedPythonVersionError(version) |
333 | 351 | |
| 352 | + |
334 | 353 | def about(): |
335 | 354 | print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.' |
336 | 355 | print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
— | — | @@ -339,9 +358,6 @@ |
340 | 359 | |
341 | 360 | |
342 | 361 | def main(): |
343 | | - logger = logging.getLogger('manager') |
344 | | - logger.setLevel(logging.DEBUG) |
345 | | - |
346 | 362 | default_language = determine_default_language() |
347 | 363 | |
348 | 364 | datasets = {'cohort': 'generate_cohort_dataset', |
— | — | @@ -397,7 +413,7 @@ |
398 | 414 | help='Should be a list of functions that are to be ignored when executing \'all\'.', |
399 | 415 | default=[]) |
400 | 416 | |
401 | | - parser_all.add_argument('-n', '--new', action='store_false', |
| 417 | + parser_all.add_argument('-n', '--new', action='store_true', |
402 | 418 | help='This will delete all previous output and start from scratch. Mostly useful for debugging purposes.', |
403 | 419 | default=False) |
404 | 420 | |
— | — | @@ -421,10 +437,13 @@ |
422 | 438 | default=settings.input_location |
423 | 439 | ) |
424 | 440 | |
425 | | - parser.add_argument('-n', '--namespace', action='store', |
| 441 | + parser.add_argument('-ns', '--namespace', action='store', |
426 | 442 | help='A list of namespaces to include for analysis.', |
427 | 443 | default='0') |
428 | 444 | |
| 445 | + parser.add_argument('-fo', '--format', action='store', |
 | 446 | + help='Indicate in which format the chunks should be stored. Valid options are xml and txt.', |
| 447 | + default='txt') |
429 | 448 | |
430 | 449 | parser.add_argument('-f', '--file', action='store', |
431 | 450 | choices=file_choices, |
— | — | @@ -444,9 +463,22 @@ |
445 | 464 | parser.add_argument('-prog', '--progress', action='store_true', default=True, |
446 | 465 | help='Indicate whether you want to have a progressbar.') |
447 | 466 | |
| 467 | + args = parser.parse_args() |
| 468 | + #initialize logger |
| 469 | + logger = logging.getLogger('manager') |
| 470 | + logger.setLevel(logging.DEBUG) |
| 471 | + |
| 472 | + # Add the log message handler to the logger |
| 473 | + today = datetime.datetime.today() |
| 474 | + log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' % (args.language, args.project, today.day, today.month, today.year)) |
| 475 | + handler = logging.handlers.RotatingFileHandler(log_filename, maxBytes=1024 * 1024, backupCount=3) |
| 476 | + |
| 477 | + logger.addHandler(handler) |
| 478 | + logger.debug('Default language: \t%s' % default_language) |
| 479 | + |
| 480 | + #start manager |
448 | 481 | detect_python_version(logger) |
449 | 482 | about() |
450 | | - args = parser.parse_args() |
451 | 483 | config.create_configuration(settings, args) |
452 | 484 | locations = determine_file_locations(args, logger) |
453 | 485 | settings.verify_environment(locations['directories']) |
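Note: the logger can only be configured after parse_args() because the log filename is derived from args.language and args.project. A self-contained sketch of the rotating-file setup introduced above (paths and names are placeholders):

    import os
    import datetime
    import logging
    import logging.handlers

    def init_logger(log_location, language, project):
        logger = logging.getLogger('manager')
        logger.setLevel(logging.DEBUG)
        today = datetime.datetime.today()
        filename = '%s%s_%s-%s-%s.log' % (language, project,
                                          today.day, today.month, today.year)
        handler = logging.handlers.RotatingFileHandler(
            os.path.join(log_location, filename),
            maxBytes=1024 * 1024,  # rotate after ~1MB
            backupCount=3)         # keep three rotated logs
        logger.addHandler(handler)
        return logger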
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | return utils.unescape(text) |
28 | 28 | |
29 | 29 | |
30 | | -def extract_text(elem, kwargs): |
| 30 | +def extract_text(elem, **kwargs): |
31 | 31 | if elem != None and elem.text != None: |
32 | 32 | #try: |
33 | 33 | return elem.text #.decode(settings.encoding) |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -20,13 +20,12 @@ |
21 | 21 | #Default Python libraries (Python => 2.6) |
22 | 22 | import sys |
23 | 23 | import os |
24 | | -import time |
25 | | -import datetime |
26 | | -import codecs |
27 | | -import math |
| 24 | +#import time |
| 25 | +#import datetime |
| 26 | +#import codecs |
| 27 | +#import math |
28 | 28 | |
29 | | -import re |
30 | | -from operator import itemgetter |
| 29 | +#from operator import itemgetter |
31 | 30 | |
32 | 31 | import multiprocessing |
33 | 32 | from Queue import Empty |
— | — | @@ -43,7 +42,7 @@ |
44 | 43 | from wikitree import xml |
45 | 44 | from bots import bots |
46 | 45 | from etl import models |
47 | | -from utils import process_constructor as pc |
| 46 | +#from utils import process_constructor as pc |
48 | 47 | |
49 | 48 | try: |
50 | 49 | import psyco |
— | — | @@ -51,25 +50,49 @@ |
52 | 51 | except ImportError: |
53 | 52 | pass |
54 | 53 | |
| 54 | +def validate_hostname(address): |
| 55 | + ''' |
| 56 | + This is not a foolproof solution at all. The problem is that it's really hard |
| 57 | + to determine whether a string is a hostname or not **reliably**. This is a |
 | 58 | + very fast rule of thumb; it will lead to false positives, but that's life :) |
| 59 | + ''' |
| 60 | + parts = address.split(".") |
 | 61 | + return len(parts) > 2 |
55 | 65 | |
| 66 | +def validate_ip(address): |
| 67 | + parts = address.split(".") |
| 68 | + if len(parts) != 4: |
| 69 | + return False |
| 70 | + parts = parts[:3] |
| 71 | + for item in parts: |
| 72 | + try: |
| 73 | + if not 0 <= int(item) <= 255: |
| 74 | + return False |
| 75 | + except ValueError: |
| 76 | + return False |
| 77 | + return True |
56 | 78 | |
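Note: the two validators classify a contributor string; anything that is neither a plausible IP nor a plausible hostname is treated downstream as a username. Usage sketch (values illustrative); observe that validate_ip only range-checks the first three octets (parts = parts[:3]), so '1.2.3.999' still passes:

    def is_anonymous(address):
        return validate_ip(address) or validate_hostname(address)

    for value in ['145.97.39.155', 'foo.example.org', 'Jimbo Wales']:
        print value, is_anonymous(value)   # True, True, False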
57 | 79 | |
58 | | -def determine_username_is_bot(contributor, bots): |
| 80 | +def determine_username_is_bot(contributor, **kwargs): |
59 | 81 | ''' |
60 | 82 | #contributor is an xml element containing the id of the contributor |
61 | 83 | @bots should have a dict with all the bot ids and bot names |
62 | 84 | @Return 0 if the username id is not in the bot dict, or 1 if it |
63 | 85 | is a bot id. |
64 | 86 | ''' |
| 87 | + bots = kwargs.get('bots') |
65 | 88 | for elem in contributor: |
66 | 89 | if elem.tag == 'id': |
67 | | - if elem.text in bots['bots']: |
| 90 | + if elem.text in bots: |
68 | 91 | return 1 |
69 | 92 | else: |
70 | 93 | return 0 |
71 | 94 | |
72 | 95 | |
73 | | -def extract_username(contributor, kwargs): |
| 96 | +def extract_username(contributor, **kwargs): |
74 | 97 | for elem in contributor: |
75 | 98 | if elem.tag == 'username': |
76 | 99 | return elem.text |
— | — | @@ -77,41 +100,44 @@ |
78 | 101 | return None |
79 | 102 | |
80 | 103 | |
81 | | -def extract_contributor_id(contributor, kwargs): |
| 104 | +def extract_contributor_id(contributor, **kwargs): |
82 | 105 | ''' |
83 | 106 | @contributor is the xml contributor node containing a number of attributes |
84 | 107 | |
85 | 108 | Currently, we are only interested in registered contributors, hence we |
86 | 109 | ignore anonymous editors. |
87 | 110 | ''' |
88 | | - if contributor.get('deleted'): |
89 | | - return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
| 111 | + #if contributor.get('deleted'): |
| 112 | + # return None # ASK: Not sure if this is the best way to code deleted contributors. |
90 | 113 | for elem in contributor: |
91 | | - if elem.tag == 'id': |
92 | | - if elem.text != None: |
93 | | - return elem.text |
| 114 | + if elem.tag == 'id' and elem.text != None: |
| 115 | + return {'id':elem.text} |
| 116 | + |
| 117 | + elif elem.tag == 'ip' and elem.text != None: |
| 118 | + if validate_ip(elem.text) == False and validate_hostname(elem.text) == False: |
| 119 | + return {'username':elem.text, 'id': elem.text} |
94 | 120 | else: |
95 | | - return - 1 |
| 121 | + return None |
| 122 | + return None |
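Note: extract_contributor_id now returns a dict, or None, instead of a bare id: {'id': text} for a registered editor, {'username': text, 'id': text} when the <ip> tag holds something that is really a username, and None for genuine anonymous edits. Callers merge whichever keys come back, as output_editor_information does below; a minimal sketch:

    def merge_result(vars, value):
        # value is the return of extract_contributor_id (dict or None)
        if isinstance(value, dict):
            vars.update(value)
        return vars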
96 | 123 | |
97 | | - |
98 | 124 | def output_editor_information(elem, fh, **kwargs): |
99 | 125 | ''' |
100 | 126 | @elem is an XML element containing 1 revision from a page |
101 | | - @output is where to store the data, either a queue or a filehandle |
| 127 | + @output is where to store the data, a filehandle |
102 | 128 | @**kwargs contains extra information |
103 | 129 | |
104 | 130 | the variable tags determines which attributes are being parsed, the values in |
105 | 131 | this dictionary are the functions used to extract the data. |
106 | 132 | ''' |
107 | | - tags = {'contributor': {'editor': extract_contributor_id, |
| 133 | + tags = {'contributor': {'id': extract_contributor_id, |
108 | 134 | 'bot': determine_username_is_bot, |
109 | 135 | 'username': extract_username, |
110 | 136 | }, |
111 | 137 | 'timestamp': {'date': xml.extract_text}, |
112 | 138 | } |
113 | 139 | vars = {} |
114 | | - headers = ['editor', 'date', 'article', 'username'] |
115 | | - #destination = kwargs.pop('destination') |
| 140 | + #counter = kwargs.pop('counter') |
| 141 | + headers = ['id', 'date', 'article', 'username'] |
116 | 142 | revisions = elem.findall('revision') |
117 | 143 | for revision in revisions: |
118 | 144 | vars['article'] = elem.find('id').text.decode(settings.encoding) |
— | — | @@ -119,18 +145,27 @@ |
120 | 146 | for tag, functions in tags.iteritems(): |
121 | 147 | xml_node = xml.retrieve_xml_node(elements, tag) |
122 | 148 | for var, function in functions.iteritems(): |
123 | | - vars[var] = function(xml_node, kwargs) |
| 149 | + value = function(xml_node, **kwargs) |
 | 150 | + if isinstance(value, dict): |
| 151 | + for kw in value: |
| 152 | + vars[kw] = value[kw] |
| 153 | + #if vars['username'] not in counter: |
| 154 | + # counter['username'] = c |
| 155 | + # c += 1 |
| 156 | + #vars['id'] = counter[vars['username']] |
| 157 | + else: |
| 158 | + vars[var] = value |
124 | 159 | |
125 | 160 | #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
126 | | - if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
| 161 | + if vars['bot'] != 1 and vars['id'] != None: |
127 | 162 | vars.pop('bot') |
128 | 163 | data = [] |
129 | 164 | for head in headers: |
130 | 165 | data.append(vars[head]) |
131 | 166 | utils.write_list_to_csv(data, fh) |
132 | 167 | vars = {} |
| 168 | + #return counter, c |
133 | 169 | |
134 | | - |
135 | 170 | def run_parse_editors(location, **kwargs): |
136 | 171 | bot_ids = bots.retrieve_bots() |
137 | 172 | input = os.path.join(location, 'chunks') |
— | — | @@ -157,7 +192,7 @@ |
158 | 193 | bot_ids = bots.retrieve_bots() |
159 | 194 | input = os.path.join(location, 'chunks') |
160 | 195 | output = os.path.join(location, 'txt') |
161 | | - xml_file = models.XMLFile(input, output, '1.xml', bot_ids, output_editor_information) |
| 196 | + xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information) |
162 | 197 | xml_file() |
163 | 198 | |
164 | 199 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -30,10 +30,14 @@ |
31 | 31 | |
32 | 32 | sys.path.append('..') |
33 | 33 | import configuration |
| 34 | +settings = configuration.Settings() |
| 35 | + |
34 | 36 | from utils import utils |
| 37 | +import extract |
35 | 38 | from wikitree import xml |
36 | | -settings = configuration.Settings() |
| 39 | +from bots import bots |
37 | 40 | |
| 41 | + |
38 | 42 | try: |
39 | 43 | import psyco |
40 | 44 | psyco.full() |
— | — | @@ -114,12 +118,12 @@ |
115 | 119 | return True |
116 | 120 | |
117 | 121 | |
118 | | -def write_xml_file(element, fh, output, counter): |
| 122 | +def write_xml_file(element, fh, output, counter, format): |
119 | 123 | '''Get file handle and write xml element to file''' |
120 | 124 | try: |
121 | 125 | xml_string = cElementTree.tostring(element) |
122 | 126 | size = len(xml_string) |
123 | | - fh, counter, new_file = create_file_handle(fh, output, counter, size) |
| 127 | + fh, counter, new_file = create_file_handle(fh, output, counter, size, format) |
124 | 128 | fh.write(xml_string) |
125 | 129 | except MemoryError: |
126 | 130 | print 'Add error capturing logic' |
— | — | @@ -134,7 +138,7 @@ |
135 | 139 | return fh, counter, new_file |
136 | 140 | |
137 | 141 | |
138 | | -def create_file_handle(fh, output, counter, size): |
| 142 | +def create_file_handle(fh, output, counter, size, format): |
139 | 143 | ''' |
140 | 144 | @fh is file handle, if none is supplied or if file size > max file size then |
141 | 145 | create a new file handle |
— | — | @@ -144,56 +148,79 @@ |
145 | 149 | ''' |
146 | 150 | if not fh: |
147 | 151 | counter = 0 |
148 | | - path = os.path.join(output, '%s.xml' % counter) |
| 152 | + path = os.path.join(output, '%s.%s' % (counter, format)) |
149 | 153 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
150 | 154 | return fh, counter, False |
151 | 155 | elif (fh.tell() + size) > settings.max_xmlfile_size: |
152 | 156 | print 'Created chunk %s' % (counter + 1) |
153 | 157 | fh.close |
154 | 158 | counter += 1 |
155 | | - path = os.path.join(output, '%s.xml' % counter) |
| 159 | + path = os.path.join(output, '%s.%s' % (counter, format)) |
156 | 160 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
157 | 161 | return fh, counter, True |
158 | 162 | else: |
159 | 163 | return fh, counter, False |
160 | 164 | |
161 | 165 | |
162 | | -def flatten_xml_elements(data, page): |
| 166 | +def flatten_xml_elements(data, page, bots): |
| 167 | + headers = ['id', 'date', 'article', 'username'] |
| 168 | + tags = {'contributor': {'id': extract.extract_contributor_id, |
| 169 | + 'bot': extract.determine_username_is_bot, |
| 170 | + 'username': extract.extract_username, |
| 171 | + }, |
| 172 | + 'timestamp': {'date': xml.extract_text}, |
| 173 | + } |
| 174 | + vars = {} |
163 | 175 | flat = [] |
| 176 | + |
164 | 177 | for x, elems in enumerate(data): |
165 | | - flat.append([page]) |
166 | | - for elem in elems: |
167 | | - if elem.tag != 'id': |
168 | | - if len(elem.getchildren()) > 0: |
169 | | - for el in elem.getchildren(): |
170 | | - flat[x].append(xml.extract_text(elem, None)) |
| 178 | + vars[x] = {} |
| 179 | + vars[x]['article'] = page |
| 180 | + for tag in tags: |
| 181 | + el = xml.retrieve_xml_node(elems, tag) |
 | 182 | + for function, f in tags[tag].iteritems(): |
 | 183 | + value = f(el, bots=bots) |
 | 185 | + if isinstance(value, dict): |
| 186 | + for kw in value: |
| 187 | + vars[x][kw] = value[kw] |
171 | 188 | else: |
172 | | - flat[x].append(xml.extract_text(elem, None)) |
| 189 | + vars[x][function] = value |
| 190 | + |
 | 191 | + for x in vars: |
| 192 | + if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None: |
| 193 | + continue |
| 194 | + else: |
| 195 | + f = [] |
| 196 | + for head in headers: |
| 197 | + f.append(vars[x][head]) |
| 198 | + flat.append(f) |
| 199 | + |
173 | 200 | return flat |
174 | 201 | |
175 | 202 | |
176 | 203 | def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False): |
177 | 204 | ''' |
178 | 205 | Reads xml file and splits it in N chunks |
179 | | - |
180 | 206 | @namespaces is a list indicating which namespaces should be included, default |
181 | 207 | is to include namespace 0 (main namespace) |
182 | 208 | @zip indicates whether to compress the chunk or not |
183 | 209 | ''' |
184 | | - #location = os.path.join(settings.input_location, language) |
185 | 210 | input = os.path.join(location, file) |
186 | | - output = os.path.join(location, 'chunks') |
187 | | - settings.verify_environment([output]) |
188 | 211 | if format == 'xml': |
189 | | - fh = None |
| 212 | + output = os.path.join(location, 'chunks') |
190 | 213 | else: |
191 | | - f = input.replace('.xml', '') |
192 | | - fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
| 214 | + output = os.path.join(location, 'txt') |
| 215 | + bot_ids = bots.retrieve_bots() |
| 216 | + settings.verify_environment([output]) |
193 | 217 | |
| 218 | + fh = None |
| 219 | + counter = 0 |
| 220 | + |
194 | 221 | ns = load_namespace(language_code) |
195 | 222 | ns = build_namespaces_locale(ns, namespaces) |
| 223 | + #settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
196 | 224 | |
197 | | - counter = 0 |
198 | 225 | tag = '{%s}page' % settings.xml_namespace |
199 | 226 | context = cElementTree.iterparse(input, events=('start', 'end')) |
200 | 227 | context = iter(context) |
— | — | @@ -206,16 +233,21 @@ |
207 | 234 | if is_article_main_namespace(elem, ns): |
208 | 235 | page = elem.find('id').text |
209 | 236 | elem = parse_comments(elem, remove_numeric_character_references) |
| 237 | + |
210 | 238 | if format == 'xml': |
211 | | - fh, counter, new_file = write_xml_file(elem, fh, output, counter) |
212 | | - if zip and new_file: |
213 | | - file = str(counter - 1) + '.xml' |
214 | | - utils.zip_archive(settings.path_ziptool, output, file) |
215 | | - utils.delete_file(output, file) |
| 239 | + fh, counter, new_file = write_xml_file(elem, fh, output, counter, format) |
216 | 240 | else: |
217 | 241 | data = [el.getchildren() for el in elem if el.tag == 'revision'] |
218 | | - data = flatten_xml_elements(data, page) |
219 | | - utils.write_list_to_csv(data, fh, recursive=False, newline=True) |
| 242 | + data = flatten_xml_elements(data, page, bot_ids) |
| 243 | + if data != None: |
| 244 | + size = 64 * len(data) |
| 245 | + fh, counter, new_file = create_file_handle(fh, output, counter, size, format) |
| 246 | + utils.write_list_to_csv(data, fh, recursive=False, newline=True) |
| 247 | + |
| 248 | + if zip and new_file: |
 | 249 | + file = str(counter - 1) + '.' + format |
| 250 | + utils.zip_archive(settings.path_ziptool, output, file) |
| 251 | + utils.delete_file(output, file) |
220 | 252 | root.clear() # when done parsing a section clear the tree to save memory |
221 | 253 | except SyntaxError: |
222 | 254 | f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
— | — | @@ -225,8 +257,8 @@ |
226 | 258 | fh.close() |
227 | 259 | |
228 | 260 | if __name__ == "__main__": |
229 | | - kwargs = {'output': settings.input_location, |
230 | | - 'input': settings.input_filename, |
| 261 | + kwargs = {'location': settings.input_location, |
| 262 | + 'file': settings.input_filename, |
231 | 263 | 'project':'wiki', |
232 | 264 | 'language_code':'en', |
233 | 265 | 'format': 'tsv' |
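Note: split_file now has two output modes: format='xml' writes numbered chunk files to <location>/chunks, while any other format flattens revisions straight to tab-separated files in <location>/txt (the 64-bytes-per-row size estimate only drives chunk rotation). Usage sketch with hypothetical paths:

    split_file('/data/en/wiki', 'pages_full_en.xml', 'wiki', 'en',
               namespaces=['0'], format='txt', zip=False)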
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -29,18 +29,49 @@ |
30 | 30 | from utils import utils |
31 | 31 | from wikitree import xml |
32 | 32 | |
| 33 | +class TXTFile(object): |
| 34 | + |
| 35 | + def __init__(self, file, location, output, output_file, target, **kwargs): |
| 36 | + self.file = file |
| 37 | + self.location = location |
| 38 | + self.target = target |
| 39 | + self.output = output |
| 40 | + self.output_file = output_file |
| 41 | + for kw in kwargs: |
| 42 | + setattr(self, kw, kwargs[kw]) |
| 43 | + |
| 44 | + def __str__(self): |
| 45 | + return '%s' % (self.file) |
| 46 | + |
| 47 | + def __call__(self, bots): |
| 48 | + self.bots = bots |
| 49 | + self.fr = utils.create_txt_filehandle(self.location, self.file, 'r', settings.encoding) |
| 50 | + self.fw = utils.create_txt_filehandle(self.output, self.output_file, 'w', settings.encoding) |
| 51 | + for line in self.fr: |
| 52 | + line = line.replace('\n', '') |
| 53 | + if line == '': |
| 54 | + continue |
| 55 | + line = line.split('\t') |
| 56 | + self.bots = self.target(line, self.fw, self.bots, self.keys) |
| 57 | + if self.bots == {}: |
| 58 | + break |
| 59 | + self.fr.close() |
| 60 | + self.fw.close() |
| 61 | + return self.bots |
| 62 | + |
33 | 63 | class XMLFileConsumer(models.BaseConsumer): |
34 | 64 | |
35 | 65 | def run(self): |
36 | 66 | while True: |
37 | | - print 'Queue is %s files long...' % (self.task_queue.qsize() - settings.number_of_processes) |
38 | 67 | new_xmlfile = self.task_queue.get() |
39 | 68 | self.task_queue.task_done() |
40 | 69 | if new_xmlfile == None: |
41 | 70 | print 'Swallowed a poison pill' |
42 | 71 | break |
| 72 | + print 'Queue is %s files long...' % self.task_queue.qsize() |
43 | 73 | new_xmlfile() |
44 | 74 | |
| 75 | + |
45 | 76 | class XMLFile(object): |
46 | 77 | def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs): |
47 | 78 | self.file = xml_file |
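Note: the new TXTFile class drives line-oriented processing: target is called once per tab-split line and must return the updated bots dict, and an empty dict short-circuits the read loop. Sketch of how it is driven (mirrors its use in bots.py; names and paths are hypothetical):

    def copy_line(line, fw, bots, keys):   # a trivial target
        fw.write('\t'.join(line) + '\n')
        return bots

    txt = TXTFile('0.txt', '/data/en/wiki/txt', '/data/csv', 'out.csv',
                  copy_line, keys=['id', 'name'])
    remaining_bots = txt(bots={'SomeBot': None})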
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -50,9 +50,9 @@ |
51 | 51 | self.progressbar = True |
52 | 52 | self.encoding = 'utf-8' |
53 | 53 | self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte |
54 | | - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps |
| 54 | + self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestamp format as generated by the MediaWiki dumps |
55 | 55 | |
56 | | - self.max_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
 | 56 | + self.max_xmlfile_size = 4096 * 1024 # was 67108864 == 64MB, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason |
57 | 57 | self.number_of_processes = cpu_count() * process_multiplier |
58 | 58 | #Change this to match your computers configuration (RAM / CPU) |
59 | 59 | self.minimum_python_version = (2, 6) |
— | — | @@ -70,8 +70,8 @@ |
71 | 71 | self.root = '/' if self.platform != 'Windows' else 'c:\\' |
72 | 72 | self.file_locations = self.set_file_locations() |
73 | 73 | self.max_filehandles = self.determine_max_filehandles_open() |
| 74 | + self.tab_width = 4 if self.platform == 'Windows' else 8 |
74 | 75 | |
75 | | - |
76 | 76 | self.load_configuration() |
77 | 77 | self.set_custom_settings(**kwargs) |
78 | 78 | self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/', |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -31,8 +31,9 @@ |
32 | 32 | import os |
33 | 33 | import ctypes |
34 | 34 | import time |
35 | | -import subprocess |
| 35 | +#import subprocess |
36 | 36 | import sys |
| 37 | +import shutil |
37 | 38 | sys.path.append('..') |
38 | 39 | |
39 | 40 | import configuration |
— | — | @@ -248,9 +249,19 @@ |
249 | 250 | return name |
250 | 251 | |
251 | 252 | |
252 | | -def delete_file(location, filename): |
| 253 | +def delete_file(location, filename, directory=False): |
253 | 254 | if check_file_exists(location, filename): |
254 | | - os.remove(os.path.join(location, filename)) |
| 255 | + if not directory: |
| 256 | + try: |
| 257 | + path = os.path.join(location, filename) |
| 258 | + os.remove(path) |
 | 259 | + except OSError, error: |
| 260 | + print error |
| 261 | + else: |
| 262 | + try: |
| 263 | + shutil.rmtree(location) |
| 264 | + except Exception, error: |
| 265 | + print error |
255 | 266 | |
256 | 267 | |
257 | 268 | def check_file_exists(location, filename): |
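Note: with directory=True the function removes the whole location tree via shutil.rmtree and ignores filename; the caller in manage.py passes filename='' for exactly that reason. A stricter variant (a sketch, not the source's API) would join the two first:

    import os
    import shutil

    def delete_path(location, filename='', directory=False):
        path = os.path.join(location, filename)
        if not os.path.exists(path):
            return
        if directory:
            shutil.rmtree(path)   # remove the directory tree at path
        else:
            os.remove(path)       # remove a single file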
Index: trunk/tools/editor_trends/utils/exceptions.py |
— | — | @@ -43,3 +43,10 @@ |
44 | 44 | |
45 | 45 | def __str__(self): |
46 | 46 | print 'You have not installed a program to extract %s archives.' % self.extension |
| 47 | + |
| 48 | +class OutDatedPythonVersionError(Error): |
| 49 | + def __init__(self, version): |
| 50 | + self.version = version |
| 51 | + |
| 52 | + def __str__(self): |
 | 53 | + return 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
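Note: the new error carries the offending version tuple, which is why the call site in manage.py passes version when raising it. Usage sketch:

    try:
        raise OutDatedPythonVersionError((2, 5))
    except OutDatedPythonVersionError, error:
        print error   # Please upgrade to Python 2.6 or higher ...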
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -17,18 +17,18 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -import sqlite3 as sqlite |
| 21 | +#import sqlite3 as sqlite |
22 | 22 | from pymongo import Connection |
23 | 23 | |
24 | 24 | |
25 | | -import configuration |
26 | | -settings = configuration.Settings() |
27 | | -from database import db_settings |
| 25 | +#import configuration |
| 26 | +#settings = configuration.Settings() |
| 27 | +#from database import db_settings |
28 | 28 | |
29 | 29 | |
30 | | -def init_mongo_db(db): |
| 30 | +def init_mongo_db(dbname): |
31 | 31 | connection = Connection() |
32 | | - db = connection[db] |
| 32 | + db = connection[dbname] |
33 | 33 | return db |
34 | 34 | |
35 | 35 | |
— | — | @@ -42,11 +42,12 @@ |
43 | 43 | return db.collection_names() |
44 | 44 | |
45 | 45 | |
46 | | -def cleanup_database(dbname): |
| 46 | +def cleanup_database(dbname, logger): |
47 | 47 | coll = get_collections(dbname) |
48 | 48 | for c in coll: |
49 | 49 | if not c.startswith('system'): |
50 | 50 | drop_collection(dbname, c) |
| 51 | + logger.debug('Deleting collection %s from database %s.' % (c, dbname)) |
51 | 52 | |
52 | 53 | |
53 | 54 | def remove_documents_from_mongo_db(collection, ids): |
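Note: cleanup_database now logs every dropped collection, so a caller must hand it a configured logger. Minimal call sketch (the database name is hypothetical):

    import logging
    logger = logging.getLogger('manager')
    cleanup_database('enwiki_editors', logger)   # drops all non-system collections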
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -31,6 +31,7 @@ |
32 | 32 | from wikitree import xml |
33 | 33 | from database import db |
34 | 34 | from utils import utils |
| 35 | +#from etl import extract |
35 | 36 | from utils import process_constructor as pc |
36 | 37 | from etl import models |
37 | 38 | import models as botmodels |
— | — | @@ -110,54 +111,51 @@ |
111 | 112 | keys and values to ease writing to a csv file. |
112 | 113 | ''' |
113 | 114 | d = {} |
114 | | - for o in obj: |
115 | | - bot = obj[o] |
116 | | - d[o] = {} |
117 | | - for kw in bot.__dict__.keys(): |
118 | | - if kw not in exclude: |
119 | | - d[o][kw] = getattr(bot, kw) |
| 115 | + for kw in obj.__dict__.keys(): |
| 116 | + if kw not in exclude: |
| 117 | + d[kw] = getattr(obj, kw) |
120 | 118 | return d |
121 | 119 | |
122 | 120 | |
123 | | -def write_bot_list_to_csv(bots): |
| 121 | +def write_bot_list_to_csv(bots, keys): |
124 | 122 | fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding) |
125 | 123 | bot_dict = convert_object_to_dict(bots, exclude=['time', 'written']) |
126 | | - keys = ['id', 'name', 'verified', 'projects'] |
127 | 124 | for bot in bot_dict: |
128 | 125 | bot = bot_dict[bot] |
129 | 126 | utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
130 | 127 | fh.close() |
131 | 128 | |
132 | 129 | |
133 | | -def lookup_bot_userid(xml_nodes, bots): |
| 130 | +def lookup_bot_userid(data, fh, bots, keys): |
134 | 131 | ''' |
135 | 132 | This function is used to find the id's belonging to the different bots that |
136 | 133 | are patrolling the Wikipedia sites. |
137 | 134 | @xml_nodes is a list of xml elements that need to be parsed |
138 | 135 | @bots is a dictionary containing the names of the bots to lookup |
139 | 136 | ''' |
140 | | - revisions = xml_nodes.findall('revision') |
141 | | - for revision in revisions: |
142 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
143 | | - username = contributor.find('username') |
144 | | - if username == None or username.text == None: |
145 | | - continue |
146 | | - else: |
147 | | - username = username.text #encode(settings.encoding) |
148 | | - name = username.lower() |
| 137 | + username = data[3] |
| 138 | + if username in bots: |
| 139 | + bot = bots.pop(username) |
| 140 | + setattr(bot, 'id', data[0]) |
| 141 | + setattr(bot, 'verified', True) |
| 142 | + bot = convert_object_to_dict(bot, exclude=['time']) |
| 143 | + utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
| 144 | + return bots |
149 | 145 | |
150 | | - #print username.encode('utf-8') |
151 | | - if (username in bots and bots[username].verified == True) or name.find('bot') > -1: |
152 | | - bot = bots.get(username, botmodels.Bot(username, verified=False)) |
153 | | - id = contributor.find('id').text |
154 | | - bot.id = id |
155 | | - bot.name = username |
156 | | - timestamp = revision.find('timestamp').text |
157 | | - if timestamp != None: |
158 | | - timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
159 | | - bot.time[str(timestamp.year)].append(timestamp) |
160 | 146 | |
161 | | - bots[username] = bot |
| 147 | +def create_bot_validation_dataset(data, fh, bots, keys): |
| 148 | + username = data[3].lower() |
| 149 | + #print username.encode('utf-8') |
| 150 | + if username.find('bot') > -1 or username.find('script') > -1: |
| 151 | + bot = bots.get(username, botmodels.Bot(username, verified=False)) |
| 152 | + setattr(bot, 'id', data[0]) |
| 153 | + |
| 154 | + timestamp = data[1] |
| 155 | + if timestamp != None: |
| 156 | + timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
| 157 | + bot.time[str(timestamp.year)].append(timestamp) |
| 158 | + bots[username] = bot |
| 159 | + |
162 | 160 | return bots |
163 | 161 | |
164 | 162 | #bot = bots.get('PseudoBot') |
— | — | @@ -165,26 +163,36 @@ |
166 | 164 | #bot.avg_lag_between_edits() |
167 | 165 | |
168 | 166 | |
169 | | -def bot_launcher(language_code, project, single=False, manager=False): |
| 167 | +def bot_launcher(language_code, project, target, action, single=False, manager=False): |
170 | 168 | ''' |
171 | 169 | This function sets the stage to launch bot id detection and collecting data |
172 | 170 | to discover new bots. |
173 | 171 | ''' |
174 | 172 | utils.delete_file(settings.csv_location, 'bots_ids.csv') |
175 | 173 | location = os.path.join(settings.input_location, language_code, project) |
176 | | - input = os.path.join(location, 'chunks') |
177 | | - |
178 | | - files = utils.retrieve_file_list(input, 'xml', mask=None) |
| 174 | + input_xml = os.path.join(location, 'chunks') |
| 175 | + input_txt = os.path.join(location, 'txt') |
| 176 | + files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
 | 177 | + #files = files[400:405] # debug slice; disabled so the full file list is processed |
179 | 178 | input_queue = pc.load_queue(files, poison_pill=True) |
180 | 179 | tasks = multiprocessing.JoinableQueue() |
181 | 180 | mgr = multiprocessing.Manager() |
| 181 | + keys = ['id', 'name', 'verified', 'projects'] |
| 182 | + |
| 183 | + if action == 'lookup': |
| 184 | + output_file = 'bots_ids.csv' |
| 185 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
| 186 | + else: |
| 187 | + output_file = 'bots_predictionset.csv' |
| 188 | + bots = {} |
| 189 | + |
182 | 190 | #lock = mgr.Lock() |
183 | 191 | if manager: |
184 | 192 | manager = mgr |
185 | | - bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
186 | 193 | |
| 194 | + |
187 | 195 | for file in files: |
188 | | - tasks.put(models.XMLFile(input, settings.csv_location, file, None, lookup_bot_userid)) |
| 196 | + tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
189 | 197 | |
190 | 198 | tracker = {} |
191 | 199 | if single: |
— | — | @@ -199,21 +207,22 @@ |
200 | 208 | bot_launcher_multi(tasks) |
201 | 209 | |
202 | 210 | utils.store_object(bots, settings.binary_location, 'bots.bin') |
203 | | - write_bot_list_to_csv(bots) |
204 | | - bot_training_dataset(bots) |
205 | | - store_bots() |
206 | | - if bots != {}: |
207 | | - print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots) |
208 | | - keys = bots.keys() |
209 | | - for key in keys: |
210 | | - try: |
211 | | - print '%s' % key.encode(settings.encoding) |
212 | | - except: |
213 | | - pass |
| 211 | + if action == 'lookup': |
| 212 | + store_bots() |
| 213 | + if bots != {}: |
 | 214 | + print 'The script was unable to retrieve the user ids for the following %s bots:\n' % len(bots) |
| 215 | + keys = bots.keys() |
| 216 | + for key in keys: |
| 217 | + try: |
| 218 | + print '%s' % key.encode(settings.encoding) |
| 219 | + except: |
| 220 | + pass |
| 221 | + else: |
| 222 | + bot_training_dataset(bots) |
| 223 | + #write_bot_list_to_csv(bots, keys) |
214 | 224 | |
215 | 225 | |
216 | 226 | |
217 | | - |
218 | 227 | def bot_training_dataset(bots): |
219 | 228 | fh = utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding) |
220 | 229 | keys = bots.keys() |
— | — | @@ -251,8 +260,9 @@ |
252 | 261 | if __name__ == '__main__': |
253 | 262 | language_code = 'en' |
254 | 263 | project = 'wiki' |
255 | | - store_bots() |
| 264 | + #store_bots() |
256 | 265 | #bots = debug_bots_dict() |
257 | 266 | #write_bot_list_to_csv(bots) |
258 | | - #bot_launcher(language_code, project, single=True) |
| 267 | + #language_code, project, lookup_bot_userid, single = False, manager = False |
| 268 | + bot_launcher(language_code, project, create_bot_validation_dataset, action='training', single=True, manager=False) |
259 | 269 | #cProfile.run(bot_launcher(language_code, project, single=True), 'profile') |
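Note: bot_launcher now runs in two modes: action='lookup' resolves ids for the known bots in Bots.csv (writing bots_ids.csv), while any other action builds a prediction set of suspected bots (usernames containing 'bot' or 'script') for the training dataset. Call sketch (single-process):

    bot_launcher('en', 'wiki', lookup_bot_userid, action='lookup', single=True)
    bot_launcher('en', 'wiki', create_bot_validation_dataset,
                 action='training', single=True)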