Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | __date__ = '2010-12-10' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process |
| 21 | +from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process, cpu_count |
22 | 22 | from multiprocessing.managers import BaseManager |
23 | 23 | from Queue import Empty |
24 | 24 | |
— | — | @@ -141,10 +141,10 @@ |
142 | 142 | del editors |
143 | 143 | |
144 | 144 | analyzers = [analytics.Analyzer(rts, tasks, result, var, data, plugin, func) for |
145 | | - x in xrange(rts.number_of_processes)] |
| 145 | + x in xrange(cpu_count())] |
146 | 146 | |
147 | 147 | |
148 | | - for x in xrange(rts.number_of_processes): |
| 148 | + for x in xrange(cpu_count()): |
149 | 149 | tasks.put(None) |
150 | 150 | |
151 | 151 | pbar = progressbar.ProgressBar(maxval=n).start() |
— | — | @@ -152,7 +152,7 @@ |
153 | 153 | analyzer.start() |
154 | 154 | |
155 | 155 | |
156 | | - ppills = rts.number_of_processes |
| 156 | + ppills = cpu_count() |
157 | 157 | while True: |
158 | 158 | while ppills > 0: |
159 | 159 | try: |
— | — | @@ -216,7 +216,7 @@ |
217 | 217 | |
218 | 218 | |
219 | 219 | def launcher(): |
220 | | - project, language, parser = manage.init_args_parser() |
| 220 | + project, language, parser = commandline.init_args_parser() |
221 | 221 | args = parser.parse_args(['django']) |
222 | 222 | rts = runtime_settings.init_environment('wiki', 'en', args) |
223 | 223 | generate_chart_data(rts, 'taxonomy_burnout', time_unit='month') |
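Note: the switch from rts.number_of_processes to cpu_count() leaves the poison-pill shutdown pattern above intact: one None is queued per consumer so every worker exits cleanly once the real tasks are drained. A minimal sketch of that pattern (hypothetical worker, not the actual Analyzer class):

    from multiprocessing import JoinableQueue, Process, cpu_count

    def worker(tasks):
        while True:
            task = tasks.get()
            tasks.task_done()   # must be called for every get(), pills included
            if task is None:    # poison pill: no more work, exit
                break
            # ... process the task here ...

    tasks = JoinableQueue()
    for item in xrange(100):
        tasks.put(item)
    for _ in xrange(cpu_count()):   # one pill per consumer
        tasks.put(None)
    workers = [Process(target=worker, args=(tasks,)) for _ in xrange(cpu_count())]
    for w in workers:
        w.start()
    tasks.join()   # blocks until every task and pill is marked done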
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py |
— | — | @@ -244,8 +244,8 @@ |
245 | 245 | ''' |
246 | 246 | This is the launcher that uses multiple processes. |
247 | 247 | ''' |
248 | | - consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
249 | | - for x in xrange(settings.number_of_processes): |
| 248 | + consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(multiprocessing.cpu_count())] |
| 249 | + for x in xrange(multiprocessing.cpu_count()): |
250 | 250 | tasks.put(None) |
251 | 251 | |
252 | 252 | for w in consumers: |
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
— | — | @@ -159,7 +159,7 @@ |
160 | 160 | min_d = min(data.keys()) |
161 | 161 | max_d = max(data.keys()) |
162 | 162 | match = data[max_d] |
163 | | - matches.append((ppi_editor, match)) |
| 163 | + matches.append((ppi_editor, match, max_d)) |
164 | 164 | #remove match to make sure that every matched pair is unique |
165 | 165 | for editor in distances: |
166 | 166 | try: |
— | — | @@ -177,11 +177,12 @@ |
178 | 178 | fh.write('_a\t'.join(vars)) |
179 | 179 | fh.write('\t%s\t' % ('editor_b')) |
180 | 180 | fh.write('_b\t'.join(vars)) |
181 | | - fh.write('\tdelta registration days\tid\n') |
| 181 | + fh.write('\tdelta registration days\tid\teuclid_dist\n') |
182 | 182 | for i, match in enumerate(matches): |
183 | 183 | line = [] |
184 | 184 | editor_a = match[0] |
185 | 185 | editor_b = match[1] |
| 186 | + dist = match[2] |
186 | 187 | line.append(editor_a) |
187 | 188 | values_a = [str(obs_a[editor_a][v]) for v in vars] |
188 | 189 | values_b = [str(obs_b[editor_b][v]) for v in vars] |
— | — | @@ -191,6 +192,7 @@ |
192 | 193 | dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date'] |
193 | 194 | line.append(str(dt.days)) |
194 | 195 | line.append(str(i)) |
| 196 | + line.append(str(dist)) |
195 | 197 | line.append('\n') |
196 | 198 | print line |
197 | 199 | #line = '\t'.join([str(l).decode('utf-8') for l in line]) |
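Note: the third tuple element added above is the distance key of the selected match, which ends up in the new euclid_dist column. For context, a minimal sketch of the underlying distance computation (hypothetical helper; the real pairing logic lives in the surrounding module):

    import math

    def euclidean_distance(obs_a, obs_b, vars):
        '''Euclidean distance between two editors over the matching variables.'''
        return math.sqrt(sum((obs_a[v] - obs_b[v]) ** 2 for v in vars))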
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | from classes import projects |
32 | 32 | from classes import runtime_settings |
33 | 33 | from utils import file_utils |
| 34 | +from utils import text_utils |
34 | 35 | from utils import ordered_dict |
35 | 36 | from utils import log |
36 | 37 | from utils import timer |
— | — | @@ -43,14 +44,113 @@ |
44 | 45 | from analyses import inventory |
45 | 46 | |
46 | 47 | |
47 | | -def init_args_parser(): |
| 48 | + |
| 49 | +def config_launcher(rts, logger): |
48 | 50 | ''' |
| 51 | + Config launcher is used to (re)configure Wikilytics. |
| 52 | + ''' |
| 53 | + |
| 54 | + pc = projects.ProjectContainer() |
| 55 | + if not os.path.exists('wiki.cfg') or rts.force: |
| 56 | + config = ConfigParser.RawConfigParser() |
| 57 | + project = None |
| 58 | + language = None |
| 59 | + db = None |
| 60 | + valid_hostname = False |
| 61 | + valid_storage = ['mongo', 'cassandra'] |
| 62 | + working_directory = raw_input('''Please indicate where you installed |
| 63 | + Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd()) |
| 64 | + |
| 65 | + input_location = raw_input('''Please indicate where the Wikipedia dump |
| 66 | + files are or will be located.\nDefault is: %s\nPress Enter to |
| 67 | + accept default.\n''' % rts.input_location) |
| 68 | + |
| 69 | + base_location = raw_input('''Please indicate where to store all |
| 70 | + Wikilytics project files.\nDefault is: %s\nPress Enter to accept |
| 71 | + default.\n''' % rts.base_location) |
| 72 | + |
| 73 | + while db not in valid_storage: |
| 74 | + db = raw_input('''Please indicate what database you are using for storage.\nDefault is: Mongo\n''') |
| 75 | + db = 'mongo' if len(db) == 0 else db.lower() |
| 76 | + if db not in valid_storage: |
| 77 | + print 'Valid choices are: %s' % ','.join(valid_storage) |
| 78 | + |
| 79 | + while project not in pc.projects.keys(): |
| 80 | + project = raw_input('''Please indicate which project you would like |
| 81 | + to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name) |
| 82 | + project = project if len(project) > 0 else rts.project.name |
| 83 | + if project not in pc.projects.keys(): |
| 84 | + print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys()) |
| 85 | + |
| 86 | + while language not in rts.project.valid_languages: |
| 87 | + language = raw_input('''Please indicate which language of project |
| 88 | + %s you would like to analyze.\nDefault is: %s\nPress Enter to accept |
| 89 | + default.\n''' % (rts.project.full_name, rts.language)) |
| 90 | + if len(language) == 0: |
| 91 | + language = rts.language.code |
| 92 | + language = language if language in rts.project.valid_languages \ |
| 93 | + else rts.language.default |
| 94 | + |
| 95 | + while valid_hostname == False: |
| 96 | + master = raw_input('''Please indicate the hostname of the master of your database |
| 97 | + cluster.\nDefault is: %s\nPress Enter to accept default.\n''' % ('localhost')) |
| 98 | + master = 'localhost' if len(master) == 0 else master |
| 99 | + valid_hostname = text_utils.validate_hostname(master) |
| 100 | + |
| 101 | + if master != 'localhost': |
| 102 | + valid_hostname = False |
| 103 | + while valid_hostname == False: |
| 104 | + slaves = raw_input('''Please indicate the hostnames of the slaves |
| 105 | + in your database cluster. Separate names using a comma.\n''') |
| 106 | + slaves = slaves.split(',') |
| 107 | + results = [] |
| 108 | + for slave in slaves: |
| 109 | + results.append(text_utils.validate_hostname(slave)) |
| 110 | + valid_hostname = all(results) |
| 111 | + |
| 112 | + slaves = ','.join(slaves) |
| 113 | + input_location = input_location if len(input_location) > 0 else \ |
| 114 | + rts.input_location |
| 115 | + base_location = base_location if len(base_location) > 0 else \ |
| 116 | + rts.base_location |
| 117 | + working_directory = working_directory if len(working_directory) > 0 \ |
| 118 | + else os.getcwd() |
| 119 | + |
| 120 | + config = ConfigParser.RawConfigParser() |
| 121 | + config.add_section('file_locations') |
| 122 | + config.set('file_locations', 'working_directory', working_directory) |
| 123 | + config.set('file_locations', 'input_location', input_location) |
| 124 | + config.set('file_locations', 'base_location', base_location) |
| 125 | + config.add_section('wiki') |
| 126 | + config.set('wiki', 'project', project) |
| 127 | + config.set('wiki', 'language', language) |
| 128 | + config.add_section('storage') |
| 129 | + config.set('storage', 'db', db) |
| 130 | + config.add_section('cluster') |
| 131 | + config.set('cluster', 'master', master) |
| 132 | + config.set('cluster', 'slaves', slaves) |
| 133 | + |
| 134 | + fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
| 135 | + config.write(fh) |
| 136 | + fh.close() |
| 137 | + |
| 138 | + log.to_csv(logger, rts, 'New configuration', 'Creating', |
| 139 | + config_launcher, |
| 140 | + working_directory=working_directory, |
| 141 | + input_location=input_location, |
| 142 | + base_location=base_location, |
| 143 | + project=project, |
| 144 | + language=language,) |
| 145 | + |
| 146 | + |
| 147 | +def init_args_parser(language_code=None, project=None): |
| 148 | + ''' |
49 | 149 | Entry point for parsing command line and launching the needed function(s). |
50 | 150 | ''' |
51 | | - language = languages.init() |
52 | | - project = projects.init() |
| 151 | + language = languages.init(language_code) |
| 152 | + project = projects.init(project) |
53 | 153 | pjc = projects.ProjectContainer() |
54 | | - rts = runtime_settings.RunTimeSettings(project, language) |
| 154 | + #rts = runtime_settings.RunTimeSettings(project, language) |
55 | 155 | |
56 | 156 | file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
57 | 157 | 'meta-current': 'stub-meta-current.xml.gz', |
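For reference, a wiki.cfg written by the config launcher above would look roughly like this (all values illustrative):

    [file_locations]
    working_directory = /home/user/wikilytics
    input_location = /data/dumps
    base_location = /data/wikilytics

    [wiki]
    project = wiki
    language = en

    [storage]
    db = mongo

    [cluster]
    master = localhost
    slaves =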
— | — | @@ -78,7 +178,7 @@ |
79 | 179 | parser_config.set_defaults(func=config_launcher) |
80 | 180 | parser_config.add_argument('-f', '--force', |
81 | 181 | action='store_true', |
82 | | - help='Reconfigure Editor Toolkit (this will replace wiki.cfg') |
| 182 | + help='Reconfigure Wikilytics (this will replace wiki.cfg)') |
83 | 183 | |
84 | 184 | #DOWNLOAD |
85 | 185 | parser_download = subparsers.add_parser('download', |
— | — | @@ -141,7 +241,7 @@ |
142 | 242 | parser_diff = subparsers.add_parser('diff', |
143 | 243 | help='Create a Mongo collection containing the diffs between revisions.') |
144 | 244 | parser_diff.set_defaults(func=diff_launcher) |
145 | | - |
| 245 | + |
146 | 246 | #DJANGO |
147 | 247 | parser_django = subparsers.add_parser('django') |
148 | 248 | parser_django.add_argument('-e', '--except', |
— | — | @@ -192,85 +292,9 @@ |
193 | 293 | %s' % ''.join([f + ',\n' for f in file_choices]), |
194 | 294 | default=file_choices['meta-full']) |
195 | 295 | |
196 | | - return project, language, parser |
| 296 | + return parser |
197 | 297 | |
198 | 298 | |
199 | | -def config_launcher(rts, logger): |
200 | | - ''' |
201 | | - Config launcher is used to reconfigure editor trends toolkit. |
202 | | - ''' |
203 | | - |
204 | | - pc = projects.ProjectContainer() |
205 | | - if not os.path.exists('wiki.cfg') or rts.force: |
206 | | - config = ConfigParser.RawConfigParser() |
207 | | - project = None |
208 | | - language = None |
209 | | - db = None |
210 | | - valid_storage = ['mongo', 'cassandra'] |
211 | | - working_directory = raw_input('''Please indicate where you installed |
212 | | - Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd()) |
213 | | - |
214 | | - input_location = raw_input('''Please indicate where the Wikipedia dump |
215 | | - files are or will be located.\nDefault is: %s\nPress Enter to |
216 | | - accept default.\n''' % rts.input_location) |
217 | | - |
218 | | - base_location = raw_input('''Please indicate where to store all |
219 | | - Wikilytics project files.\nDefault is: %s\nPress Enter to accept |
220 | | - default.\n''' % rts.base_location) |
221 | | - |
222 | | - while db not in valid_storage: |
223 | | - db = raw_input('Please indicate what database you are using for storage. \nDefault is: Mongo\n') |
224 | | - db = 'mongo' if len(db) == 0 else db.lower() |
225 | | - if db not in valid_storage: |
226 | | - print 'Valid choices are: %s' % ','.join(valid_storage) |
227 | | - |
228 | | - while project not in pc.projects.keys(): |
229 | | - project = raw_input('''Please indicate which project you would like |
230 | | - to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name) |
231 | | - project = project if len(project) > 0 else rts.project.name |
232 | | - if project not in pc.projects.keys(): |
233 | | - print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys()) |
234 | | - |
235 | | - while language not in rts.project.valid_languages: |
236 | | - language = raw_input('''Please indicate which language of project |
237 | | - %s you would like to analyze.\nDefault is: %s\nPress Enter to accept |
238 | | - default.\n''' % (rts.project.full_name, rts.language)) |
239 | | - if len(language) == 0: |
240 | | - language = rts.language.code |
241 | | - language = language if language in rts.project.valid_languages \ |
242 | | - else rts.language.default |
243 | | - |
244 | | - input_location = input_location if len(input_location) > 0 else \ |
245 | | - rts.input_location |
246 | | - base_location = base_location if len(base_location) > 0 else \ |
247 | | - rts.base_location |
248 | | - working_directory = working_directory if len(working_directory) > 0 \ |
249 | | - else os.getcwd() |
250 | | - |
251 | | - config = ConfigParser.RawConfigParser() |
252 | | - config.add_section('file_locations') |
253 | | - config.set('file_locations', 'working_directory', working_directory) |
254 | | - config.set('file_locations', 'input_location', input_location) |
255 | | - config.set('file_locations', 'base_location', base_location) |
256 | | - config.add_section('wiki') |
257 | | - config.set('wiki', 'project', project) |
258 | | - config.set('wiki', 'language', language) |
259 | | - config.add_section('storage') |
260 | | - config.set('storage', 'db', db) |
261 | | - |
262 | | - fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
263 | | - config.write(fh) |
264 | | - fh.close() |
265 | | - |
266 | | - log.to_csv(logger, rts, 'New configuration', 'Creating', |
267 | | - config_launcher, |
268 | | - working_directory=working_directory, |
269 | | - input_location=input_location, |
270 | | - base_location=base_location, |
271 | | - project=project, |
272 | | - language=language,) |
273 | | - |
274 | | - |
275 | 299 | def downloader_launcher(rts, logger): |
276 | 300 | ''' |
277 | 301 | This launcher calls the dump downloader to download a Wikimedia dump file. |
— | — | @@ -343,7 +367,8 @@ |
344 | 368 | stopwatch = timer.Timer() |
345 | 369 | log.to_db(rts, 'dataset', 'transform', stopwatch, event='start') |
346 | 370 | log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher) |
347 | | - transformer.transform_editors_multi_launcher(rts) |
| 371 | + #transformer.transform_editors_multi_launcher(rts) |
| 372 | + transformer.transform_editors_single_launcher(rts) |
348 | 373 | stopwatch.elapsed() |
349 | 374 | log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish') |
350 | 375 | log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher) |
— | — | @@ -359,8 +384,8 @@ |
360 | 385 | log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish') |
361 | 386 | log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher) |
362 | 387 | |
363 | | - |
364 | 388 | |
| 389 | + |
365 | 390 | def dataset_launcher(rts, logger): |
366 | 391 | ''' |
367 | 392 | Dataset launcher is the entry point to generate datasets from the command |
— | — | @@ -414,8 +439,11 @@ |
415 | 440 | ''' |
416 | 441 | This function initializes the command line parser. |
417 | 442 | ''' |
418 | | - project, language, parser, = init_args_parser() |
| 443 | + parser = init_args_parser() |
419 | 444 | args = parser.parse_args() |
| 445 | + language = languages.init() |
| 446 | + project = projects.init() |
| 447 | + |
420 | 448 | rts = runtime_settings.RunTimeSettings(project, language, args) |
421 | 449 | #initialize logger |
422 | 450 | logger = logging.getLogger('manager') |
Index: trunk/tools/editor_trends/etl/kaggle.py |
— | — | @@ -1,49 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-04-12' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | - |
23 | | -if '..' not in sys.path: |
24 | | - sys.path.append('..') |
25 | | - |
26 | | -from utils import file_utils |
27 | | - |
28 | | - |
29 | | -def launcher(): |
30 | | - location = '/home/diederik/wikimedia/en/wiki/kaggle_training/' |
31 | | - #location = 'C:\\wikimedia\\en\\wiki\\txt' |
32 | | - files = file_utils.retrieve_file_list(location, extension='csv') |
33 | | - files.sort() |
34 | | - dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w', 'utf-8') |
35 | | - for filename in files: |
36 | | - if not filename.startswith('comments') and \ |
37 | | - not filename.startswith('articles') and not filename.startswith('dataset'): |
38 | | - fh = file_utils.create_txt_filehandle(location, filename, 'r', 'utf-8') |
39 | | - print fh |
40 | | - for line in fh: |
41 | | - data = line.split('\t') |
42 | | - username = data[3].lower() |
43 | | - if username.endswith('bot'): |
44 | | - continue |
45 | | - else: |
46 | | - dataset.write(line) |
47 | | - fh.close() |
48 | | - dataset.close() |
49 | | - |
50 | | -launcher() |
Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -275,20 +275,20 @@ |
276 | 276 | Determine the id of a revision |
277 | 277 | ''' |
278 | 278 | if revision_id != None: |
279 | | - return revision_id.text |
| 279 | + return int(revision_id.text) |
280 | 280 | else: |
281 | 281 | return None |
282 | 282 | |
283 | 283 | |
284 | | -def extract_comment_text(revision_id, revision): |
| 284 | +def extract_comment_text(revision, xml_namespace): |
285 | 285 | ''' |
286 | 286 | Extract the comment associated with an edit. |
287 | 287 | ''' |
288 | | - comment = {} |
289 | | - text = revision.find('comment') |
290 | | - if text != None and text.text != None: |
291 | | - comment[revision_id] = text.text.encode('utf-8') |
292 | | - return comment |
| 288 | + comment_text = revision.find('%s%s' % (xml_namespace, 'comment')) |
| 289 | + if comment_text != None and comment_text.text != None: |
| 290 | + return comment_text.text |
| 291 | + else: |
| 292 | + return None |
293 | 293 | |
294 | 294 | |
295 | 295 | def create_namespace_dict(siteinfo, xml_namespace): |
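Note: cElementTree expects the namespace URI in Clark notation ({uri}tag) when searching, which is why the tag is built as '%s%s' % (xml_namespace, 'comment'). A minimal sketch (namespace version illustrative):

    from xml.etree.cElementTree import fromstring

    xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
    revision = fromstring(
        '<revision xmlns="http://www.mediawiki.org/xml/export-0.4/">'
        '<comment>fixed a typo</comment>'
        '</revision>')
    comment = revision.find('%s%s' % (xml_namespace, 'comment'))
    print comment.text   # prints: fixed a typo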
Index: trunk/tools/editor_trends/etl/differ.py |
— | — | @@ -17,23 +17,35 @@ |
18 | 18 | __date__ = '2011-04-10' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | + |
| 22 | +''' |
| 23 | +This script generates diffs of edits for the Talk, User Talk and Wikipedia Talk |
| 24 | +pages of a Wikipedia project. The diffs are stored in JSON files and then |
| 25 | +imported into MongoDB. |
| 26 | +''' |
| 27 | +import pprint |
21 | 28 | import json |
22 | 29 | import cStringIO |
23 | 30 | import codecs |
24 | 31 | import sys |
25 | 32 | import os |
26 | 33 | import difflib |
| 34 | +import bson |
27 | 35 | from xml.etree.cElementTree import iterparse, dump |
28 | 36 | from multiprocessing import JoinableQueue, Process, cpu_count |
29 | 37 | from datetime import datetime |
| 38 | +from copy import deepcopy |
30 | 39 | |
31 | 40 | |
32 | 41 | if '..' not in sys.path: |
33 | 42 | sys.path.append('../') |
34 | 43 | |
35 | 44 | from utils import file_utils |
| 45 | +from utils import text_utils |
36 | 46 | from etl import variables |
37 | 47 | from classes import exceptions |
| 48 | +from classes import storage |
| 49 | +from classes import runtime_settings |
38 | 50 | |
39 | 51 | |
40 | 52 | def parse_xml(fh, format, process_id, location): |
— | — | @@ -50,13 +62,13 @@ |
51 | 63 | context = iterparse(fh, events=(start, end)) |
52 | 64 | context = iter(context) |
53 | 65 | |
54 | | - article = {} |
| 66 | + |
| 67 | + revisions = [] |
55 | 68 | count_articles = 0 |
56 | 69 | id = False |
57 | 70 | ns = False |
58 | 71 | parse = False |
59 | | - rev1 = None |
60 | | - rev2 = None |
| 72 | + prev_rev_text = None |
61 | 73 | file_id, fh_output = None, None |
62 | 74 | |
63 | 75 | try: |
— | — | @@ -80,11 +92,11 @@ |
81 | 93 | parsing this article, else it will skip this article. |
82 | 94 | ''' |
83 | 95 | title = variables.parse_title(elem) |
84 | | - article['title'] = title |
85 | 96 | current_namespace = variables.determine_namespace(title, namespaces, include_ns) |
86 | 97 | if current_namespace == 1 or current_namespace == 3 or current_namespace == 5: |
87 | 98 | parse = True |
88 | | - article['namespace'] = current_namespace |
| 99 | + #article['namespace'] = current_namespace |
| 100 | + title = title.replace(namespaces[current_namespace], '') |
89 | 101 | count_articles += 1 |
90 | 102 | if count_articles % 10000 == 0: |
91 | 103 | print 'Worker %s parsed %s articles' % (process_id, count_articles) |
— | — | @@ -105,23 +117,32 @@ |
106 | 118 | timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text |
107 | 119 | contributor = elem.find('%s%s' % (xml_namespace, 'contributor')) |
108 | 120 | editor = variables.parse_contributor(contributor, None, xml_namespace) |
| 121 | + text = variables.extract_revision_text(elem, xml_namespace) |
| 122 | + comment = variables.extract_comment_text(elem, xml_namespace) |
109 | 123 | if editor: |
110 | 124 | rev_id = variables.extract_revision_id(rev_id) |
| 125 | + if prev_rev_text == None: |
| 126 | + diff = text |
| 127 | + prev_rev_text = deepcopy(text) |
| 128 | + else: |
| 129 | + #print text[0:20], prev_rev_text[0:20] |
| 130 | + diff = diff_revision(prev_rev_text, text) |
111 | 131 | |
112 | | - if rev1 == None and rev2 == None: |
113 | | - diff = variables.extract_revision_text(elem, xml_namespace) |
114 | | - rev1 = elem |
115 | | - if rev1 != None and rev2 != None: |
116 | | - diff = diff_revision(rev1, rev2, xml_namespace) |
| 132 | + if diff != None: |
| 133 | + timestamp = text_utils.convert_timestamp_to_datetime_utc(timestamp) |
| 134 | + timestamp = timestamp.isoformat() |
| 135 | + revision = dict(rev_id=rev_id, title=title, |
| 136 | + timestamp=timestamp, |
| 137 | + diff=diff, comment=comment, |
| 138 | + id=editor['id'], |
| 139 | + username=editor['username'], |
| 140 | + article_id=article_id, |
| 141 | + ns=current_namespace) |
| 142 | + revisions.append(revision) |
117 | 143 | |
118 | | - article[rev_id] = {} |
119 | | - article[rev_id].update(editor) |
120 | | - article[rev_id]['timestamp'] = timestamp |
121 | | - article[rev_id]['diff'] = diff |
122 | | - |
123 | 144 | clear = True |
124 | 145 | if clear: |
125 | | - rev2 = rev1 |
| 146 | + prev_rev_text = deepcopy(text) |
126 | 147 | elem.clear() |
127 | 148 | else: |
128 | 149 | elem.clear() |
— | — | @@ -130,7 +151,7 @@ |
131 | 152 | ''' |
132 | 153 | Determine id of article |
133 | 154 | ''' |
134 | | - article['article_id'] = elem.text |
| 155 | + article_id = int(elem.text) |
135 | 156 | id = True |
136 | 157 | elem.clear() |
137 | 158 | |
— | — | @@ -140,17 +161,16 @@ |
141 | 162 | memory. |
142 | 163 | ''' |
143 | 164 | elem.clear() |
144 | | - #write diff of text to file |
| 165 | + |
145 | 166 | if parse: |
146 | | - #print article |
147 | | - fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format) |
148 | | - write_diff(fh_output, article, format) |
| 167 | + #write diff of text to file |
| 168 | + if len(revisions) > 0: |
| 169 | + fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format) |
| 170 | + write_diff(fh_output, revisions, format) |
| 171 | + |
149 | 172 | #Reset all variables for next article |
150 | | - article = {} |
151 | | - if rev1 != None: |
152 | | - rev1.clear() |
153 | | - if rev2 != None: |
154 | | - rev2.clear() |
| 173 | + revisions = [] |
| 174 | + prev_rev_text = None |
155 | 175 | id = False |
156 | 176 | parse = False |
157 | 177 | |
— | — | @@ -181,14 +201,47 @@ |
182 | 202 | |
183 | 203 | return fh, file_id |
184 | 204 | |
| 205 | + |
185 | 206 | def write_xml_diff(fh, article): |
186 | 207 | pass |
187 | 208 | |
188 | 209 | |
189 | | -def write_json_diff(fh, article): |
190 | | - json.dump(article, fh) |
| 210 | +def write_json_diff(fh, revisions): |
| 211 | + fh.write('\nStart new JSON object\n') |
| 212 | + json.dump(revisions, fh, indent=4, sort_keys=True) |
191 | 213 | |
192 | 214 | |
| 215 | +def store_json_diffs(rts): |
| 216 | + files = os.listdir(rts.diffs) |
| 217 | + print files, rts.diffs |
| 218 | + db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
| 219 | + buffer = cStringIO.StringIO() |
| 220 | + |
| 221 | + for filename in files: |
| 222 | + fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8') |
| 223 | + for line in fh: |
| 224 | + if line.startswith('\n') or line.startswith('Start'): |
| 225 | + obj = buffer.getvalue() |
| 226 | + if obj != '': |
| 227 | + obj = json.loads(obj) |
| 228 | + obj[0]['article_id'] = int(obj[0]['article_id']) |
| 229 | + for key, value in obj[0].iteritems(): |
| 230 | + if type(value) == type(dict()): |
| 231 | + value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S') |
| 232 | + obj[0][key] = value |
| 233 | + obj = obj[0] |
| 234 | + #print obj |
| 235 | + #print len(obj) |
| 236 | + try: |
| 237 | + db.save(obj) |
| 238 | + except bson.errors.InvalidDocument, error: |
| 239 | + print error |
| 240 | + buffer = cStringIO.StringIO() |
| 241 | + else: |
| 242 | + buffer.write(line) |
| 243 | + fh.close() |
| 244 | + |
| 245 | + |
193 | 246 | def write_diff(fh, article, format): |
194 | 247 | if format == 'xml': |
195 | 248 | write_xml_diff(fh, article) |
— | — | @@ -198,23 +251,47 @@ |
199 | 252 | raise exceptions.OutputNotSupported() |
200 | 253 | |
201 | 254 | |
202 | | -def diff_revision(rev1, rev2, xml_namespace): |
203 | | - buffer = cStringIO.StringIO() |
204 | | - if rev1.text != None and rev2.text != None: |
205 | | - diff = difflib.unified_diff(rev1.text, rev2.text, n=0, lineterm='') |
| 255 | +def diff_revision(rev1, rev2): |
| 256 | + if rev1 == None: |
| 257 | + rev1 = '' |
| 258 | + if rev2 == None: |
| 259 | + rev2 = '' |
| 260 | + if rev1 != rev2: |
| 261 | + buffer = cStringIO.StringIO() |
| 262 | + rev1 = rev1.splitlines(1) |
| 263 | + rev2 = rev2.splitlines(1) |
| 264 | + |
| 265 | + diff = difflib.unified_diff(rev1, rev2, n=0, lineterm='') |
206 | 266 | for line in diff: |
207 | 267 | if len(line) > 3: |
208 | | - print line |
209 | | - buffer.write(line) |
| 268 | + #print line |
| 269 | + buffer.write(line.encode('utf-8')) |
210 | 270 | |
211 | | - return buffer.getvalue() |
| 271 | + diff = buffer.getvalue() |
212 | 272 | |
| 273 | + if diff == '': |
| 274 | + return None |
| 275 | + else: |
| 276 | + return diff |
| 277 | + else: |
| 278 | + return None |
| 279 | + |
| 280 | + |
| 281 | +def store_diffs_debug(rts): |
| 282 | + db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
| 283 | + files = os.listdir(rts.diffs) |
| 284 | + for filename in files: |
| 285 | + fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8') |
| 286 | + diffs = json.load(fh) |
| 287 | + db.insert(diffs) |
| 288 | + fh.close() |
| 289 | + |
| 290 | + |
213 | 291 | def stream_raw_xml(input_queue, process_id, rts, format): |
214 | 292 | ''' |
215 | 293 | This function fetches an XML file from the queue and launches the processor. |
216 | 294 | ''' |
217 | 295 | t0 = datetime.now() |
218 | | - file_id = 0 |
219 | 296 | |
220 | 297 | while True: |
221 | 298 | filename = input_queue.get() |
— | — | @@ -225,7 +302,7 @@ |
226 | 303 | |
227 | 304 | print filename |
228 | 305 | fh = file_utils.create_streaming_buffer(filename) |
229 | | - parse_xml(fh, format, process_id, rts.input_location) |
| 306 | + parse_xml(fh, format, process_id, rts.diffs) |
230 | 307 | fh.close() |
231 | 308 | |
232 | 309 | t1 = datetime.now() |
— | — | @@ -266,7 +343,14 @@ |
267 | 344 | |
268 | 345 | input_queue.join() |
269 | 346 | |
| 347 | + store_json_diffs(rts) |
| 348 | + db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
| 349 | + db.add_index('title') |
| 350 | + db.add_index('timestamp') |
| 351 | + db.add_index('username') |
| 352 | + db.add_index('ns') |
270 | 353 | |
| 354 | + |
271 | 355 | def launcher_simple(): |
272 | 356 | location = 'c:\\wikimedia\\nl\\wiki\\' |
273 | 357 | output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\' |
— | — | @@ -311,5 +395,6 @@ |
312 | 396 | |
313 | 397 | |
314 | 398 | if __name__ == '__main__': |
| 399 | + #read_json_diffs() |
315 | 400 | launcher_simple() |
316 | 401 | #debug() |
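For context, diff_revision above builds on difflib.unified_diff over the splitlines(1) form of two revision texts, with n=0 so unchanged context lines are dropped. A minimal sketch (sample texts illustrative):

    import difflib

    rev1 = 'Hello world.\nThis is a talk page.\n'
    rev2 = 'Hello world!\nThis is a talk page.\nNew reply.\n'
    for line in difflib.unified_diff(rev1.splitlines(1), rev2.splitlines(1),
                                     n=0, lineterm=''):
        print line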
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -22,8 +22,8 @@ |
23 | 23 | parsing the XML on the fly and extracting & constructing the variables that are
|
24 | 24 | need for subsequent analysis. The extract module is initialized using an
|
25 | 25 | instance of RunTimeSettings and the most important parameters are:
|
26 | | -The name of project\n
|
27 | | -The language of the project\n
|
| 26 | +The name of project
|
| 27 | +The language of the project
|
28 | 28 | The location where the dump files are stored
|
29 | 29 | '''
|
30 | 30 |
|
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -338,11 +338,11 @@ |
339 | 339 | db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
340 | 340 | db_dataset.drop_collection() |
341 | 341 | editors = db_raw.retrieve_editors() |
342 | | - return editors |
| 342 | + return editors, db_raw, db_dataset |
343 | 343 | |
344 | 344 | |
345 | 345 | def transform_editors_multi_launcher(rts): |
346 | | - editors = setup_database(rts) |
| 346 | + editors, db_raw, db_dataset = setup_database(rts) |
347 | 347 | n = editors.size() |
348 | 348 | result = queue.JoinableRetryQueue() |
349 | 349 | pbar = progressbar.ProgressBar(maxval=n).start() |
— | — | @@ -372,7 +372,7 @@ |
373 | 373 | |
374 | 374 | def transform_editors_single_launcher(rts): |
375 | 375 | print rts.dbname, rts.editors_raw |
376 | | - editors = setup_database(rts) |
| 376 | + editors, db_raw, db_dataset = setup_database(rts) |
377 | 377 | n = editors.size() |
378 | 378 | pbar = progressbar.ProgressBar(maxval=n).start() |
379 | 379 | |
— | — | @@ -384,7 +384,7 @@ |
385 | 385 | editors.task_done() |
386 | 386 | if editor == None: |
387 | 387 | break |
388 | | - editor = Editor(rts, editor) |
| 388 | + editor = Editor(rts, editor, db_raw, db_dataset) |
389 | 389 | editor() |
390 | 390 | |
391 | 391 | pbar.update(pbar.currval + 1) |
Index: trunk/tools/editor_trends/statistics/stata/ppi.do |
— | — | @@ -1,5 +1,11 @@ |
2 | 2 | clear |
3 | 3 | insheet using "C:\Users\diederik.vanliere\Desktop\ppi_quality.csv" |
| 4 | + |
| 5 | +gen diff_character_count = character_count_a - character_count_b |
| 6 | +gen diff_cum_edit_count_main_ns = cum_edit_count_main_ns_a - cum_edit_count_main_ns_b |
| 7 | +gen diff_cum_edit_count_other_ns = cum_edit_count_other_ns_a - cum_edit_count_other_ns_b |
| 8 | +gen diff_article_count = article_count_a - article_count_b |
| 9 | + |
4 | 10 | label var character_count_a "PPI editor" |
5 | 11 | label var character_count_b "Regular editor" |
6 | 12 | |
Index: trunk/tools/editor_trends/kaggle/training.py |
— | — | @@ -17,23 +17,35 @@ |
18 | 18 | __date__ = '2011-04-12' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import os |
| 22 | +import sys |
| 23 | +import cPickle |
21 | 24 | import codecs |
22 | | -import os |
23 | 25 | from datetime import datetime |
24 | | -import json |
| 26 | +sys.path.append('../') |
25 | 27 | |
26 | | -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction' |
| 28 | +from classes import storage |
| 29 | + |
| 30 | +location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution' |
27 | 31 | files = os.listdir(location) |
28 | 32 | files.reverse() |
29 | | -dataset = codecs.open('training.tsv', 'w', 'utf-8') |
| 33 | + |
| 34 | +max_size = 2147483648 |
| 35 | +max_size_reached = False |
| 36 | + |
30 | 37 | t0 = datetime.now() |
31 | | -max_size = 2147483648 |
32 | 38 | titles = {} |
33 | 39 | ids = set() |
| 40 | +dates = {} |
| 41 | +edits = {} |
| 42 | +ignore_ids = set() |
34 | 43 | size = 0 |
35 | 44 | cnt_obs = 0 |
36 | | -max_size_reached = False |
| 45 | +cutoff_date = datetime(2010, 8, 31) |
37 | 46 | |
| 47 | +print 'Constructing training dataset...' |
| 48 | +db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset') |
| 49 | +dataset = codecs.open('training.tsv', 'w', 'utf-8') |
38 | 50 | for filename in files: |
39 | 51 | if not filename.startswith('comments') and not filename.startswith('articles'): |
40 | 52 | fh = codecs.open(os.path.join(location, filename)) |
— | — | @@ -46,13 +58,25 @@ |
47 | 59 | continue |
48 | 60 | if line[10] == '1': |
49 | 61 | continue |
| 62 | + timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ') |
| 63 | + if timestamp > cutoff_date: |
| 64 | + continue |
50 | 65 | username = line[3].lower() |
51 | | - if username.endswith('bot'): |
| 66 | + if username.endswith('bot') or username.find('script') > -1: |
52 | 67 | #line[10] = '1' |
53 | 68 | continue |
| 69 | + id = line[2] |
| 70 | + if id not in ids and id not in ignore_ids: |
| 71 | + res = db.find_one('editor', id) |
| 72 | + if res == None: |
| 73 | + ignore_ids.add(id) |
| 74 | + continue |
54 | 75 | cnt_obs += 1 |
55 | 76 | title_id = line[1] |
56 | | - ids.add(line[2]) |
| 77 | + ids.add(id) |
| 78 | + simple_date = '%s-%s' % (timestamp.year, timestamp.month) |
| 79 | + dates.setdefault(simple_date, 0) |
| 80 | + dates[simple_date] += 1 |
57 | 81 | title = line.pop(5) |
58 | 82 | titles[title_id] = title |
59 | 83 | line.append('\n') |
— | — | @@ -64,20 +88,54 @@ |
65 | 89 | |
66 | 90 | dataset.close() |
67 | 91 | |
| 92 | +print 'Constructing title dataset...' |
68 | 93 | fh = codecs.open('titles.tsv', 'w', 'utf-8') |
69 | 94 | for id, title in titles.iteritems(): |
70 | 95 | fh.write('%s\t%s\n' % (id, title.decode('utf-8'))) |
71 | 96 | fh.close() |
72 | 97 | |
73 | | -fh = codecs.open('ids.json', 'w', 'utf-8') |
74 | | -json.dump(ids, fh) |
75 | | -#for id in ids: |
76 | | -#fh.write('%s\n' % (id.decode('utf-8'))) |
77 | | -#fh.write('%s\n' % (json.du) |
| 98 | + |
| 99 | +print 'Constructing solution dataset...' |
| 100 | +x = 0 |
| 101 | +fh = codecs.open('solutions.tsv', 'w', 'utf-8') |
| 102 | +for id in ids: |
| 103 | + if id not in ignore_ids: |
| 104 | + obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns') |
| 105 | + if obs != None: |
| 106 | + x += 1 |
| 107 | + n = obs['cum_edit_count_main_ns'] |
| 108 | + fh.write('%s,%s\n' % (id.decode('utf-8'), n)) |
| 109 | + edits.setdefault(n, 0) |
| 110 | + edits[n] += 1 |
| 111 | + else: |
| 112 | + print id |
78 | 113 | fh.close() |
79 | 114 | |
| 115 | +print 'Storing date histogram' |
| 116 | +fh = open('histogram_dates.bin', 'wb') |
| 117 | +cPickle.dump(dates, fh) |
| 118 | +fh.close() |
| 119 | + |
| 120 | + |
| 121 | +fh = open('histogram_dates.tsv', 'w') |
| 122 | +for date, n in dates.iteritems(): |
| 123 | + fh.write('%s\t%s\n' % (date, n)) |
| 124 | +fh.close() |
| 125 | + |
| 126 | + |
| 127 | +print 'Storing edit histogram' |
| 128 | +fh = open('histogram_edits.bin', 'wb') |
| 129 | +cPickle.dump(edits, fh) |
| 130 | +fh.close() |
| 131 | + |
| 132 | +fh = open('histogram_edits.tsv', 'w') |
| 133 | +for edit, n in edits.iteritems(): |
| 134 | + fh.write('%s\t%s\n' % (edit, n)) |
| 135 | +fh.close() |
| 136 | + |
| 137 | + |
80 | 138 | t1 = datetime.now() |
81 | | -print 'Descriptives:\n' |
82 | | -print 'Number of editors: %s' % len(ids) |
| 139 | +print 'Descriptives:' |
| 140 | +print 'Number of editors: %s' % x |
83 | 141 | print 'Number of edits: %s' % cnt_obs |
84 | 142 | print 'It took %s to construct the Kaggle training set' % (t1 - t0) |
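The pickled histograms written above can be reloaded later, for instance to plot the distributions; a minimal sketch:

    import cPickle

    fh = open('histogram_dates.bin', 'rb')
    dates = cPickle.load(fh)
    fh.close()
    for date, n in dates.iteritems():
        print '%s\t%s' % (date, n)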
Index: trunk/tools/editor_trends/classes/projects.py |
— | — | @@ -107,9 +107,12 @@ |
108 | 108 | pc = ProjectContainer() |
109 | 109 | pc.supported_projects() |
110 | 110 | |
111 | | -def init(): |
| 111 | +def init(project=None): |
112 | 112 | pc = ProjectContainer() |
113 | | - return pc.get_project('wiki') |
| 113 | + if project: |
| 114 | + return pc.get_project(project) |
| 115 | + else: |
| 116 | + return pc.get_project('wiki') |
114 | 117 | |
115 | 118 | if __name__ == '__main__': |
116 | 119 | debug() |
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -17,11 +17,6 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -''' |
22 | | -This file contains settings that are used for constructing and analyzing |
23 | | -the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
24 | | -''' |
25 | | - |
26 | 21 | from multiprocessing import cpu_count |
27 | 22 | import ConfigParser |
28 | 23 | import os |
— | — | @@ -73,7 +68,7 @@ |
74 | 69 | #Change this to match your computers configuration (RAM / CPU) |
75 | 70 | # I want to get rid off these two variables. |
76 | 71 | self.number_of_processes = cpu_count() |
77 | | - self.windows_register = {'7z.exe': 'Software\\7-Zip'} |
| 72 | + #self.windows_register = {'7z.exe': 'Software\\7-Zip'} |
78 | 73 | |
79 | 74 | self.wp_dump_location = 'http://dumps.wikimedia.org' |
80 | 75 | |
— | — | @@ -107,6 +102,8 @@ |
108 | 103 | self.default_project = config.get('wiki', 'project') |
109 | 104 | self.default_language = config.get('wiki', 'language') |
110 | 105 | self.storage = config.get('storage', 'db') |
| 106 | + self.master = config.get('cluster', 'master') |
| 107 | + self.slaves = config.get('cluster', 'slaves') |
111 | 108 | return True |
112 | 109 | except Exception, error: |
113 | 110 | #raise exceptions.GenericMessage('corrupted_config') |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -27,6 +27,9 @@ |
28 | 28 | import datetime |
29 | 29 | import time |
30 | 30 | |
| 31 | +if '..' not in sys.path: |
| 32 | + sys.path.append('../') |
| 33 | + |
31 | 34 | from settings import Settings |
32 | 35 | from analyses import inventory |
33 | 36 | from classes import exceptions |
— | — | @@ -48,49 +51,49 @@ |
49 | 52 | self.language = language |
50 | 53 | self.dbname = 'wikilytics' |
51 | 54 | |
52 | | - if args: |
53 | | - self.args = args |
54 | | - self.hash = self.secs_since_epoch() |
55 | | - #print self.settings.input_location |
56 | | - #print self.get_value('location') |
57 | | - self.project = self.update_project_settings() |
58 | | - self.language = self.update_language_settings() |
| 55 | + #if args: |
| 56 | + self.args = args |
| 57 | + self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month') |
| 58 | + #print self.settings.input_location |
| 59 | + #print self.get_value('location') |
| 60 | + self.project = self.update_project_settings() |
| 61 | + self.language = self.update_language_settings() |
59 | 62 | |
60 | | - self.input_location = self.set_input_location() |
61 | | - self.output_location = self.set_output_location() |
| 63 | + self.input_location = self.set_input_location() |
| 64 | + self.output_location = self.set_output_location() |
62 | 65 | |
63 | | - self.plugins = self.set_plugin() |
64 | | - self.keywords = self.split_keywords() |
65 | | - self.namespaces = self.get_namespaces() |
| 66 | + self.plugins = self.set_plugin() |
| 67 | + self.keywords = self.split_keywords() |
| 68 | + self.namespaces = self.get_namespaces() |
66 | 69 | |
67 | | - self.kaggle = self.get_value('kaggle') |
68 | | - self.function = self.get_value('func') |
69 | | - self.ignore = self.get_value('except') |
70 | | - self.force = self.get_value('force') |
71 | | - self.analyzer_collection = self.get_value('collection') |
| 70 | + #self.kaggle = self.get_value('kaggle') |
| 71 | + self.function = self.get_value('func') |
| 72 | + self.ignore = self.get_value('except') |
| 73 | + self.force = self.get_value('force') |
| 74 | + self.analyzer_collection = self.get_value('collection') |
72 | 75 | |
73 | | - self.dataset = os.path.join(self.dataset_location, self.project.name) |
74 | | - self.txt = os.path.join(self.output_location, 'txt') |
75 | | - self.sorted = os.path.join(self.output_location, 'sorted') |
76 | | - self.diffs = os.path.join(self.output_location, 'diffs') |
| 76 | + self.dataset = os.path.join(self.dataset_location, self.project.name) |
| 77 | + self.txt = os.path.join(self.output_location, 'txt') |
| 78 | + self.sorted = os.path.join(self.output_location, 'sorted') |
| 79 | + self.diffs = os.path.join(self.output_location, 'diffs') |
77 | 80 | |
78 | | - self.directories = [self.output_location, |
79 | | - self.txt, |
80 | | - self.sorted, |
81 | | - self.dataset, |
82 | | - self.diffs] |
83 | | - self.verify_environment(self.directories) |
| 81 | + self.directories = [self.output_location, |
| 82 | + self.txt, |
| 83 | + self.sorted, |
| 84 | + self.dataset, |
| 85 | + self.diffs] |
| 86 | + self.verify_environment(self.directories) |
84 | 87 | |
85 | | - #Wikidump file related variables |
86 | | - self.dump_filename = self.generate_wikidump_filename() |
87 | | - self.dump_relative_path = self.set_dump_path() |
88 | | - self.dump_absolute_path = self.set_dump_path(absolute=True) |
| 88 | + #Wikidump file related variables |
| 89 | + self.dump_filename = self.generate_wikidump_filename() |
| 90 | + self.dump_relative_path = self.set_dump_path() |
| 91 | + self.dump_absolute_path = self.set_dump_path(absolute=True) |
89 | 92 | |
90 | | - #Collection names |
91 | | - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
92 | | - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
93 | | - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
94 | | - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
| 93 | + #Collection names |
| 94 | + self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
| 95 | + self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
| 96 | + self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
| 97 | + self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
95 | 98 | |
96 | 99 | |
97 | 100 | |
— | — | @@ -239,7 +242,7 @@ |
240 | 243 | ''' |
241 | 244 | default = self.project |
242 | 245 | proj = self.get_value('project') |
243 | | - if proj != 'wiki': |
| 246 | + if proj != default: |
244 | 247 | pc = projects.ProjectContainer() |
245 | 248 | proj = pc.get_project(proj) |
246 | 249 | return proj |
— | — | @@ -281,7 +284,7 @@ |
282 | 285 | return ['0'] #Assume that the mainspace is of interest |
283 | 286 | |
284 | 287 | |
285 | | -def init_environment(project, language_code, args): |
| 288 | +def init_environment(project, language_code): |
286 | 289 | ''' |
287 | 290 | Initialize an instance of RuntimeSettings. |
288 | 291 | ''' |
— | — | @@ -289,8 +292,9 @@ |
290 | 293 | project = pjc.get_project(project) |
291 | 294 | lnc = languages.LanguageContainer() |
292 | 295 | language = lnc.get_language(language_code) |
293 | | - |
294 | | - args.language = language.name |
295 | | - args.project = project.name |
| 296 | + parser = init_args_parser(language_code, project) |
| 297 | + args = parser.parse_args(['django']) |
| 298 | + #args.language = language.name |
| 299 | + #args.project = project.name |
296 | 300 | rts = RunTimeSettings(project, language, args) |
297 | 301 | return rts |
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -102,9 +102,14 @@ |
103 | 103 | This class provides the functionality to talk to a MongoDB backend including |
104 | 104 | inserting, finding, and updating data. |
105 | 105 | ''' |
106 | | - def __init__(self, dbname, collection): |
| 106 | + def __init__(self, dbname, collection, master=None, slaves=[]): |
| 107 | + if master == None: |
| 108 | + self.master = 'localhost' |
| 109 | + else: |
| 110 | + self.master = master |
| 111 | + self.slaves = slaves |
| 112 | + self.port = 27017 |
107 | 113 | super(Mongo, self).__init__(dbname, collection) |
108 | | - self.port = 27017 |
109 | 114 | |
110 | 115 | @classmethod |
111 | 116 | def is_registrar_for(cls, storage): |
— | — | @@ -114,8 +119,16 @@ |
115 | 120 | return storage == 'mongo' |
116 | 121 | |
117 | 122 | def connect(self): |
118 | | - db = pymongo.Connection() |
119 | | - return db[self.dbname] |
| 123 | + master = pymongo.Connection(host=self.master, port=self.port) |
| 124 | + if self.master == 'localhost': |
| 125 | + return master[self.dbname] |
| 126 | + else: |
| 127 | + slave_connections = [] |
| 128 | + for slave in self.slaves: |
| 129 | + slave = pymongo.Connection(host=slave, port=self.port) |
| 130 | + slave_connections.append(slave) |
| 131 | + master_slave_connection = pymongo.MasterSlaveConnection(master, slave_connections) |
| 132 | + return master_slave_connection[self.dbname] |
120 | 133 | |
121 | 134 | def save(self, data): |
122 | 135 | assert isinstance(data, dict), 'You need to feed me dictionaries.' |
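A hedged usage sketch of the extended constructor (hostnames illustrative; assuming the Mongo class is importable from classes.storage):

    from classes.storage import Mongo

    # single machine: identical to the old behaviour
    db = Mongo('wikilytics', 'enwiki_editors_dataset')

    # cluster: reads can be spread over the slaves via MasterSlaveConnection
    db = Mongo('wikilytics', 'enwiki_editors_dataset',
               master='db1.example.org',
               slaves=['db2.example.org', 'db3.example.org'])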
Index: trunk/tools/editor_trends/classes/languages.py |
— | — | @@ -679,9 +679,12 @@ |
680 | 680 | print abbr |
681 | 681 | print len(abbr) |
682 | 682 | |
683 | | -def init(): |
| 683 | +def init(language_code=None): |
684 | 684 | lnc = LanguageContainer() |
685 | | - return lnc.languages[lnc.default] |
| 685 | + if language_code: |
| 686 | + return lnc.languages[language_code] |
| 687 | + else: |
| 688 | + return lnc.languages[lnc.default] |
686 | 689 | |
687 | 690 | if __name__ == '__main__': |
688 | 691 | init() |
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -173,9 +173,10 @@ |
174 | 174 | '''Create a filehandle for text file with utf-8 encoding''' |
175 | 175 | filename = str(filename) |
176 | 176 | if not filename.endswith('.csv'): |
177 | | - filename = construct_filename(filename, '.csv') |
| 177 | + if filename.find('.') == -1: |
| 178 | + filename = construct_filename(filename, '.csv') |
178 | 179 | path = os.path.join(location, filename) |
179 | | - return codecs.open(path, mode, encoding='utf-8') |
| 180 | + return codecs.open(path, mode, encoding) |
180 | 181 | |
181 | 182 | |
182 | 183 | def create_streaming_buffer(path): |
— | — | @@ -189,7 +190,8 @@ |
190 | 191 | fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True, |
191 | 192 | stdout=subprocess.PIPE, bufsize=65535).stdout |
192 | 193 | elif extension == '.xml': |
193 | | - fh = create_txt_filehandle(path, None, 'r', 'utf-8') |
| 194 | + location, filename = os.path.split(path) |
| 195 | + fh = create_txt_filehandle(location, filename, 'r', 'utf-8') |
194 | 196 | else: |
195 | 197 | raise exceptions.CompressedFileNotSupported(extension) |
196 | 198 | return fh |
— | — | @@ -247,6 +249,7 @@ |
248 | 250 | os.utime(path, (mod_rem, mod_rem)) |
249 | 251 | #sraise exceptions.NotYetImplementedError(set_modified_data) |
250 | 252 | |
| 253 | + |
251 | 254 | def get_modified_date(location, filename): |
252 | 255 | '''determine the date the file was originally created''' |
253 | 256 | path = os.path.join(location, filename) |
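A quick usage sketch of the dispatch above (path illustrative): a plain .xml path is now split into location and filename before being handed to create_txt_filehandle:

    from utils import file_utils

    path = '/data/dumps/enwiki-latest-stub-meta-history.xml'
    fh = file_utils.create_streaming_buffer(path)
    for line in fh:
        pass   # stream the dump line by line without loading it into memory
    fh.close()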
Index: trunk/tools/editor_trends/utils/log.py |
— | — | @@ -31,11 +31,9 @@ |
32 | 32 | def to_db(rts, jobtype, task, timer, event='start'): |
33 | 33 | db = storage.init_database(rts.storage, rts.dbname, 'jobs') |
34 | 34 | created = datetime.datetime.now() |
35 | | - hash = '%s_%s' % (rts.project, rts.hash) |
| 35 | + job = db.find_one('hash', rts.id) |
36 | 36 | |
37 | | - job = db.find_one('hash', hash) |
38 | | - |
39 | | - data = {'hash': hash, |
| 37 | + data = {'hash': rts.id, |
40 | 38 | 'created': created, |
41 | 39 | 'jobtype': jobtype, |
42 | 40 | 'in_progress': True, |
— | — | @@ -60,7 +58,7 @@ |
61 | 59 | t['start'] = timer.t0 |
62 | 60 | t['in_progress'] = True |
63 | 61 | tasks[task] = t |
64 | | - db.update('hash', hash, {'$set': {'tasks': tasks}}) |
| 62 | + db.update('hash', rts.id, {'$set': {'tasks': tasks}}) |
65 | 63 | #coll.update({'hash': hash}, {'$set': {'tasks': tasks}}) |
66 | 64 | elif event == 'finish': |
67 | 65 | t['finish'] = timer.t1 |
— | — | @@ -68,11 +66,11 @@ |
69 | 67 | tasks[task] = t |
70 | 68 | if task == 'transform' or jobtype == 'chart': |
71 | 69 | #final task, set entire task to finished |
72 | | - db.update('hash', hash, {'$set': {'tasks': tasks, |
| 70 | + db.update('hash', rts.id, {'$set': {'tasks': tasks, |
73 | 71 | 'in_progress': False, |
74 | 72 | 'finished': True}}) |
75 | 73 | else: |
76 | | - db.update('hash', hash, {'$set': {'tasks': tasks}}) |
| 74 | + db.update('hash', rts.id, {'$set': {'tasks': tasks}}) |
77 | 75 | |
78 | 76 | |
79 | 77 | def to_csv(logger, settings, message, verb, function, **kwargs): |
Index: trunk/tools/editor_trends/utils/text_utils.py |
— | — | @@ -20,6 +20,7 @@ |
21 | 21 | import datetime |
22 | 22 | import time |
23 | 23 | import sys |
| 24 | +import re |
24 | 25 | |
25 | 26 | if '..' not in sys.path: |
26 | 27 | sys.path.append('..') |
— | — | @@ -52,6 +53,14 @@ |
53 | 54 | return dict([[v, k] for k, v in dictionary.items()]) |
54 | 55 | |
55 | 56 | |
| 57 | +def validate_hostname(hostname): |
| 58 | + regex_hostname = re.compile(r'^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$') |
| 59 | + res = re.match(regex_hostname, hostname) |
| 60 | + if res == None: |
| 61 | + return False |
| 62 | + else: |
| 63 | + return True |
| 64 | + |
56 | 65 | def get_max_width(table, index): |
57 | 66 | ''' |
58 | 67 | Get the maximum width of the given column index |
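A quick check of the new helper (hostnames illustrative):

    from utils import text_utils

    print text_utils.validate_hostname('db1.example.org')   # True
    print text_utils.validate_hostname('not a hostname!')   # False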