r89189 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r89188 | r89189 | r89190
Date: 21:50, 30 May 2011
Author: diederik
Status: deferred
Tags:
Comment:
Preparing for Summer of Research
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/bot_detector.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/classes/languages.py (modified) (history)
  • /trunk/tools/editor_trends/classes/projects.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/storage.py (modified) (history)
  • /trunk/tools/editor_trends/etl/differ.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/kaggle.py (deleted) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/etl/variables.py (modified) (history)
  • /trunk/tools/editor_trends/kaggle/training.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/statistics/stata/ppi.do (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)
  • /trunk/tools/editor_trends/utils/text_utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -17,7 +17,7 @@
1818 __date__ = '2010-12-10'
1919 __version__ = '0.1'
2020
21 -from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process
 21+from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process, cpu_count
2222 from multiprocessing.managers import BaseManager
2323 from Queue import Empty
2424
@@ -141,10 +141,10 @@
142142 del editors
143143
144144 analyzers = [analytics.Analyzer(rts, tasks, result, var, data, plugin, func) for
145 - x in xrange(rts.number_of_processes)]
 145+ x in xrange(cpu_count())]
146146
147147
148 - for x in xrange(rts.number_of_processes):
 148+ for x in xrange(cpu_count()):
149149 tasks.put(None)
150150
151151 pbar = progressbar.ProgressBar(maxval=n).start()
@@ -152,7 +152,7 @@
153153 analyzer.start()
154154
155155
156 - ppills = rts.number_of_processes
 156+ ppills = cpu_count()
157157 while True:
158158 while ppills > 0:
159159 try:
@@ -216,7 +216,7 @@
217217
218218
219219 def launcher():
220 - project, language, parser = manage.init_args_parser()
 220+ project, language, parser = commandline.init_args_parser()
221221 args = parser.parse_args(['django'])
222222 rts = runtime_settings.init_environment('wiki', 'en', args)
223223 generate_chart_data(rts, 'taxonomy_burnout', time_unit='month')
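
These hunks size the worker pool and the number of poison pills with multiprocessing.cpu_count() instead of the configurable rts.number_of_processes. A minimal, self-contained sketch of that pattern, with placeholder task payloads:

    from multiprocessing import JoinableQueue, Process, cpu_count

    def worker(tasks):
        while True:
            task = tasks.get()
            tasks.task_done()
            if task is None:
                break  # poison pill: one per worker, as in the launchers above
            print 'processing %s' % task

    if __name__ == '__main__':
        tasks = JoinableQueue()
        for item in ['page1', 'page2', 'page3']:
            tasks.put(item)
        for x in xrange(cpu_count()):
            tasks.put(None)
        workers = [Process(target=worker, args=(tasks,)) for x in xrange(cpu_count())]
        for w in workers:
            w.start()
        tasks.join()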
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
@@ -244,8 +244,8 @@
245245 '''
246246 This is the launcher that uses multiprocesses.
247247 '''
248 - consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
249 - for x in xrange(settings.number_of_processes):
 248+ consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(multiprocessing.cpu_count())]
 249+ for x in xrange(multiprocessing.cpu_count()):
250250 tasks.put(None)
251251
252252 for w in consumers:
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -159,7 +159,7 @@
160160 min_d = min(data.keys())
161161 max_d = max(data.keys())
162162 match = data[max_d]
163 - matches.append((ppi_editor, match))
 163+ matches.append((ppi_editor, match, max_d))
164164 #remove match to make sure that every matched pair is unique
165165 for editor in distances:
166166 try:
@@ -177,11 +177,12 @@
178178 fh.write('_a\t'.join(vars))
179179 fh.write('\t%s\t' % ('editor_b'))
180180 fh.write('_b\t'.join(vars))
181 - fh.write('\tdelta registration days\tid\n')
 181+ fh.write('\tdelta registration days\tid\teuclid_dist\n')
182182 for i, match in enumerate(matches):
183183 line = []
184184 editor_a = match[0]
185185 editor_b = match[1]
 186+ dist = match[2]
186187 line.append(editor_a)
187188 values_a = [str(obs_a[editor_a][v]) for v in vars]
188189 values_b = [str(obs_b[editor_b][v]) for v in vars]
@@ -191,6 +192,7 @@
192193 dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
193194 line.append(str(dt.days))
194195 line.append(str(i))
 196+ line.append(dist)
195197 line.append('\n')
196198 print line
197199 #line = '\t'.join([str(l).decode('utf-8') for l in line])
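
The matching loop now carries the distance key of each matched pair (max_d, written under the new euclid_dist header) through to the output file. A hedged sketch of the resulting record layout; the editor names and distances here are invented:

    import codecs

    matches = [('alice', 'bob', 0.42), ('carol', 'dave', 1.07)]  # (ppi_editor, match, distance)
    fh = codecs.open('ppi_quality.tsv', 'w', 'utf-8')
    fh.write('editor_a\teditor_b\tid\teuclid_dist\n')
    for i, (editor_a, editor_b, dist) in enumerate(matches):
        fh.write('%s\t%s\t%s\t%s\n' % (editor_a, editor_b, i, dist))
    fh.close()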
Index: trunk/tools/editor_trends/manage.py
@@ -30,6 +30,7 @@
3131 from classes import projects
3232 from classes import runtime_settings
3333 from utils import file_utils
 34+from utils import text_utils
3435 from utils import ordered_dict
3536 from utils import log
3637 from utils import timer
@@ -43,14 +44,113 @@
4445 from analyses import inventory
4546
4647
47 -def init_args_parser():
 48+
 49+def config_launcher(rts, logger):
4850 '''
 51+ Config launcher is used to (re)configure Wikilytics.
 52+ '''
 53+
 54+ pc = projects.ProjectContainer()
 55+ if not os.path.exists('wiki.cfg') or rts.force:
 56+ config = ConfigParser.RawConfigParser()
 57+ project = None
 58+ language = None
 59+ db = None
 60+ valid_hostname = False
 61+ valid_storage = ['mongo', 'cassandra']
 62+ working_directory = raw_input('''Please indicate where you installed
 63+ Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd())
 64+
 65+ input_location = raw_input('''Please indicate where the Wikipedia dump
 66+ files are or will be located.\nDefault is: %s\nPress Enter to
 67+ accept default.\n''' % rts.input_location)
 68+
 69+ base_location = raw_input('''Please indicate where to store all
 70+ Wikilytics project files.\nDefault is: %s\nPress Enter to accept
 71+ default.\n''' % rts.base_location)
 72+
 73+ while db not in valid_storage:
 74+ db = raw_input('''Please indicate what database you are using for storage.\nDefault is: Mongo\n''')
 75+ db = 'mongo' if len(db) == 0 else db.lower()
 76+ if db not in valid_storage:
 77+ print 'Valid choices are: %s' % ','.join(valid_storage)
 78+
 79+ while project not in pc.projects.keys():
 80+ project = raw_input('''Please indicate which project you would like
 81+ to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name)
 82+ project = project if len(project) > 0 else rts.project.name
 83+ if project not in pc.projects.keys():
 84+ print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys())
 85+
 86+ while language not in rts.project.valid_languages:
 87+ language = raw_input('''Please indicate which language of project
 88+ %s you would like to analyze.\nDefault is: %s\nPress Enter to accept
 89+ default.\n''' % (rts.project.full_name, rts.language))
 90+ if len(language) == 0:
 91+ language = rts.language.code
 92+ language = language if language in rts.project.valid_languages \
 93+ else rts.language.default
 94+
 95+ while valid_hostname == False:
 96+ master = raw_input('''Please indicate the hostname master of your database
 97+ cluster.\n Default is: %s\nPress Enter to accept default.\n''' % ('localhost'))
 98+ master = 'localhost' if len(master) == 0 else master
 99+ valid_hostname = text_utils.validate_hostname(master)
 100+
 101+ if master != 'localhost':
 102+ valid_hostname = False
 103+ while valid_hostname == False:
 104+ slaves = raw_input('''Please indicate the hostnames of your slaves
 105+ in your database cluster. Separate names using a comma.\n''')
 106+ slaves = slaves.split(',')
 107+ results = []
 108+ for slave in slaves:
 109+ results.append(text_utils.validate_hostname(slave))
 110+ valid_hostname = True if all(results) else False
 111+
 112+ slaves = ','.join(slaves)
 113+ input_location = input_location if len(input_location) > 0 else \
 114+ rts.input_location
 115+ base_location = base_location if len(base_location) > 0 else \
 116+ rts.base_location
 117+ working_directory = working_directory if len(working_directory) > 0 \
 118+ else os.getcwd()
 119+
 120+ config = ConfigParser.RawConfigParser()
 121+ config.add_section('file_locations')
 122+ config.set('file_locations', 'working_directory', working_directory)
 123+ config.set('file_locations', 'input_location', input_location)
 124+ config.set('file_locations', 'base_location', base_location)
 125+ config.add_section('wiki')
 126+ config.set('wiki', 'project', project)
 127+ config.set('wiki', 'language', language)
 128+ config.add_section('storage')
 129+ config.set('storage', 'db', db)
 130+ config.add_section('cluster')
 131+ config.set('cluster', 'master', master)
 132+ config.set('cluster', 'slaves', slaves)
 133+
 134+ fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
 135+ config.write(fh)
 136+ fh.close()
 137+
 138+ log.to_csv(logger, rts, 'New configuration', 'Creating',
 139+ config_launcher,
 140+ working_directory=working_directory,
 141+ input_location=input_location,
 142+ base_location=base_location,
 143+ project=project,
 144+ language=language,)
 145+
 146+
 147+def init_args_parser(language_code=None, project=None):
 148+ '''
49149 Entry point for parsing command line and launching the needed function(s).
50150 '''
51 - language = languages.init()
52 - project = projects.init()
 151+ language = languages.init(language_code)
 152+ project = projects.init(project)
53153 pjc = projects.ProjectContainer()
54 - rts = runtime_settings.RunTimeSettings(project, language)
 154+ #rts = runtime_settings.RunTimeSettings(project, language)
55155
56156 file_choices = {'meta-full': 'stub-meta-history.xml.gz',
57157 'meta-current': 'stub-meta-current.xml.gz',
@@ -78,7 +178,7 @@
79179 parser_config.set_defaults(func=config_launcher)
80180 parser_config.add_argument('-f', '--force',
81181 action='store_true',
82 - help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
 182+ help='Reconfigure Wikilytics (this will replace wiki.cfg)')
83183
84184 #DOWNLOAD
85185 parser_download = subparsers.add_parser('download',
@@ -141,7 +241,7 @@
142242 parser_diff = subparsers.add_parser('diff',
143243 help='Create a Mongo collection containing the diffs between revisions.')
144244 parser_diff.set_defaults(func=diff_launcher)
145 -
 245+
146246 #DJANGO
147247 parser_django = subparsers.add_parser('django')
148248 parser_django.add_argument('-e', '--except',
@@ -192,85 +292,9 @@
193293 %s' % ''.join([f + ',\n' for f in file_choices]),
194294 default=file_choices['meta-full'])
195295
196 - return project, language, parser
 296+ return parser
197297
198298
199 -def config_launcher(rts, logger):
200 - '''
201 - Config launcher is used to reconfigure editor trends toolkit.
202 - '''
203 -
204 - pc = projects.ProjectContainer()
205 - if not os.path.exists('wiki.cfg') or rts.force:
206 - config = ConfigParser.RawConfigParser()
207 - project = None
208 - language = None
209 - db = None
210 - valid_storage = ['mongo', 'cassandra']
211 - working_directory = raw_input('''Please indicate where you installed
212 - Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd())
213 -
214 - input_location = raw_input('''Please indicate where the Wikipedia dump
215 - files are or will be located.\nDefault is: %s\nPress Enter to
216 - accept default.\n''' % rts.input_location)
217 -
218 - base_location = raw_input('''Please indicate where to store all
219 - Wikilytics project files.\nDefault is: %s\nPress Enter to accept
220 - default.\n''' % rts.base_location)
221 -
222 - while db not in valid_storage:
223 - db = raw_input('Please indicate what database you are using for storage. \nDefault is: Mongo\n')
224 - db = 'mongo' if len(db) == 0 else db.lower()
225 - if db not in valid_storage:
226 - print 'Valid choices are: %s' % ','.join(valid_storage)
227 -
228 - while project not in pc.projects.keys():
229 - project = raw_input('''Please indicate which project you would like
230 - to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name)
231 - project = project if len(project) > 0 else rts.project.name
232 - if project not in pc.projects.keys():
233 - print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys())
234 -
235 - while language not in rts.project.valid_languages:
236 - language = raw_input('''Please indicate which language of project
237 - %s you would like to analyze.\nDefault is: %s\nPress Enter to accept
238 - default.\n''' % (rts.project.full_name, rts.language))
239 - if len(language) == 0:
240 - language = rts.language.code
241 - language = language if language in rts.project.valid_languages \
242 - else rts.language.default
243 -
244 - input_location = input_location if len(input_location) > 0 else \
245 - rts.input_location
246 - base_location = base_location if len(base_location) > 0 else \
247 - rts.base_location
248 - working_directory = working_directory if len(working_directory) > 0 \
249 - else os.getcwd()
250 -
251 - config = ConfigParser.RawConfigParser()
252 - config.add_section('file_locations')
253 - config.set('file_locations', 'working_directory', working_directory)
254 - config.set('file_locations', 'input_location', input_location)
255 - config.set('file_locations', 'base_location', base_location)
256 - config.add_section('wiki')
257 - config.set('wiki', 'project', project)
258 - config.set('wiki', 'language', language)
259 - config.add_section('storage')
260 - config.set('storage', 'db', db)
261 -
262 - fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
263 - config.write(fh)
264 - fh.close()
265 -
266 - log.to_csv(logger, rts, 'New configuration', 'Creating',
267 - config_launcher,
268 - working_directory=working_directory,
269 - input_location=input_location,
270 - base_location=base_location,
271 - project=project,
272 - language=language,)
273 -
274 -
275299 def downloader_launcher(rts, logger):
276300 '''
277301 This launcher calls the dump downloader to download a Wikimedia dump file.
@@ -343,7 +367,8 @@
344368 stopwatch = timer.Timer()
345369 log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
346370 log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
347 - transformer.transform_editors_multi_launcher(rts)
 371+ #transformer.transform_editors_multi_launcher(rts)
 372+ transformer.transform_editors_single_launcher(rts)
348373 stopwatch.elapsed()
349374 log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
350375 log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
@@ -359,8 +384,8 @@
360385 log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
361386 log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
362387
363 -
364388
 389+
365390 def dataset_launcher(rts, logger):
366391 '''
367392 Dataset launcher is the entry point to generate datasets from the command
@@ -414,8 +439,11 @@
415440 '''
416441 This function initializes the command line parser.
417442 '''
418 - project, language, parser, = init_args_parser()
 443+ parser = init_args_parser()
419444 args = parser.parse_args()
 445+ language = languages.init()
 446+ project = projects.init()
 447+
420448 rts = runtime_settings.RunTimeSettings(project, language, args)
421449 #initialize logger
422450 logger = logging.getLogger('manager')
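
config_launcher, now living above init_args_parser, persists the interview answers with ConfigParser and gains a cluster section for the master and slave hostnames. A minimal round-trip sketch; the paths and hostnames are placeholders:

    import ConfigParser

    config = ConfigParser.RawConfigParser()
    config.add_section('file_locations')
    config.set('file_locations', 'working_directory', '/opt/wikilytics')
    config.add_section('wiki')
    config.set('wiki', 'project', 'wiki')
    config.set('wiki', 'language', 'en')
    config.add_section('storage')
    config.set('storage', 'db', 'mongo')
    config.add_section('cluster')
    config.set('cluster', 'master', 'localhost')
    config.set('cluster', 'slaves', 'db1,db2')

    fh = open('wiki.cfg', 'wb')
    config.write(fh)
    fh.close()

    # settings.py reads the new section back:
    config = ConfigParser.RawConfigParser()
    config.read('wiki.cfg')
    print config.get('cluster', 'master')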
Index: trunk/tools/editor_trends/etl/kaggle.py
@@ -1,49 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-04-12'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -
23 -if '..' not in sys.path:
24 - sys.path.append('..')
25 -
26 -from utils import file_utils
27 -
28 -
29 -def launcher():
30 - location = '/home/diederik/wikimedia/en/wiki/kaggle_training/'
31 - #location = 'C:\\wikimedia\\en\\wiki\\txt'
32 - files = file_utils.retrieve_file_list(location, extension='csv')
33 - files.sort()
34 - dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w', 'utf-8')
35 - for filename in files:
36 - if not filename.startswith('comments') and \
37 - not filename.startswith('articles') and not filename.startswith('dataset'):
38 - fh = file_utils.create_txt_filehandle(location, filename, 'r', 'utf-8')
39 - print fh
40 - for line in fh:
41 - data = line.split('\t')
42 - username = data[3].lower()
43 - if username.endswith('bot'):
44 - continue
45 - else:
46 - dataset.write(line)
47 - fh.close()
48 - dataset.close()
49 -
50 -launcher()
Index: trunk/tools/editor_trends/etl/variables.py
@@ -275,20 +275,20 @@
276276 Determine the id of a revision
277277 '''
278278 if revision_id != None:
279 - return revision_id.text
 279+ return int(revision_id.text)
280280 else:
281281 return None
282282
283283
284 -def extract_comment_text(revision_id, revision):
 284+def extract_comment_text(revision, xml_namespace):
285285 '''
286286 Extract the comment associated with an edit.
287287 '''
288 - comment = {}
289 - text = revision.find('comment')
290 - if text != None and text.text != None:
291 - comment[revision_id] = text.text.encode('utf-8')
292 - return comment
 288+ comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
 289+ if comment_text != None and comment_text.text != None:
 290+ return comment_text.text
 291+ else:
 292+ return None
293293
294294
295295 def create_namespace_dict(siteinfo, xml_namespace):
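
extract_comment_text now takes the revision element plus the XML namespace and returns the comment as plain text (or None) instead of a dict keyed by revision id. A self-contained sketch; the export namespace URI is assumed for illustration:

    from xml.etree.cElementTree import fromstring

    xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'  # assumed version
    revision = fromstring(
        '<revision xmlns="http://www.mediawiki.org/xml/export-0.4/">'
        '<comment>fix typo</comment></revision>')

    comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
    if comment_text is not None and comment_text.text is not None:
        print comment_text.text  # fix typo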
Index: trunk/tools/editor_trends/etl/differ.py
@@ -17,23 +17,35 @@
1818 __date__ = '2011-04-10'
1919 __version__ = '0.1'
2020
 21+
 22+'''
 23+This script generates diffs of edits for the Talk, User Talk and Wikipedia Talk
 24+pages of a Wikipedia project. These diffs are stored in JSON files and then
 25+imported into MongoDB.
 26+'''
 27+import pprint
2128 import json
2229 import cStringIO
2330 import codecs
2431 import sys
2532 import os
2633 import difflib
 34+import bson
2735 from xml.etree.cElementTree import iterparse, dump
2836 from multiprocessing import JoinableQueue, Process, cpu_count
2937 from datetime import datetime
 38+from copy import deepcopy
3039
3140
3241 if '..' not in sys.path:
3342 sys.path.append('../')
3443
3544 from utils import file_utils
 45+from utils import text_utils
3646 from etl import variables
3747 from classes import exceptions
 48+from classes import storage
 49+from classes import runtime_settings
3850
3951
4052 def parse_xml(fh, format, process_id, location):
@@ -50,13 +62,13 @@
5163 context = iterparse(fh, events=(start, end))
5264 context = iter(context)
5365
54 - article = {}
 66+
 67+ revisions = []
5568 count_articles = 0
5669 id = False
5770 ns = False
5871 parse = False
59 - rev1 = None
60 - rev2 = None
 72+ prev_rev_text = None
6173 file_id, fh_output = None, None
6274
6375 try:
@@ -80,11 +92,11 @@
8193 parsing this article, else it will skip this article.
8294 '''
8395 title = variables.parse_title(elem)
84 - article['title'] = title
8596 current_namespace = variables.determine_namespace(title, namespaces, include_ns)
8697 if current_namespace == 1 or current_namespace == 3 or current_namespace == 5:
8798 parse = True
88 - article['namespace'] = current_namespace
 99+ #article['namespace'] = current_namespace
 100+ title = title.replace(namespaces[current_namespace], '')
89101 count_articles += 1
90102 if count_articles % 10000 == 0:
91103 print 'Worker %s parsed %s articles' % (process_id, count_articles)
@@ -105,23 +117,32 @@
106118 timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text
107119 contributor = elem.find('%s%s' % (xml_namespace, 'contributor'))
108120 editor = variables.parse_contributor(contributor, None, xml_namespace)
 121+ text = variables.extract_revision_text(elem, xml_namespace)
 122+ comment = variables.extract_comment_text(elem, xml_namespace)
109123 if editor:
110124 rev_id = variables.extract_revision_id(rev_id)
 125+ if prev_rev_text == None:
 126+ diff = text
 127+ prev_rev_text = deepcopy(text)
 128+ if prev_rev_text != None:
 129+ #print text[0:20], prev_rev_text[0:20]
 130+ diff = diff_revision(prev_rev_text, text)
111131
112 - if rev1 == None and rev2 == None:
113 - diff = variables.extract_revision_text(elem, xml_namespace)
114 - rev1 = elem
115 - if rev1 != None and rev2 != None:
116 - diff = diff_revision(rev1, rev2, xml_namespace)
 132+ if diff != None:
 133+ timestamp = text_utils.convert_timestamp_to_datetime_utc(timestamp)
 134+ timestamp = timestamp.isoformat()
 135+ revision = dict(rev_id=rev_id, title=title,
 136+ timestamp=timestamp,
 137+ diff=diff, comment=comment,
 138+ id=editor['id'],
 139+ username=editor['username'],
 140+ article_id=article_id,
 141+ ns=current_namespace)
 142+ revisions.append(revision)
117143
118 - article[rev_id] = {}
119 - article[rev_id].update(editor)
120 - article[rev_id]['timestamp'] = timestamp
121 - article[rev_id]['diff'] = diff
122 -
123144 clear = True
124145 if clear:
125 - rev2 = rev1
 146+ prev_rev_text = deepcopy(text)
126147 elem.clear()
127148 else:
128149 elem.clear()
@@ -130,7 +151,7 @@
131152 '''
132153 Determine id of article
133154 '''
134 - article['article_id'] = elem.text
 155+ article_id = int(elem.text)
135156 id = True
136157 elem.clear()
137158
@@ -140,17 +161,16 @@
141162 memory.
142163 '''
143164 elem.clear()
144 - #write diff of text to file
 165+
145166 if parse:
146 - #print article
147 - fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
148 - write_diff(fh_output, article, format)
 167+ #write diff of text to file
 168+ if len(revisions) > 0:
 169+ fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
 170+ write_diff(fh_output, revisions, format)
 171+
149172 #Reset all variables for next article
150 - article = {}
151 - if rev1 != None:
152 - rev1.clear()
153 - if rev2 != None:
154 - rev2.clear()
 173+ revisions = []
 174+ prev_rev_text = None
155175 id = False
156176 parse = False
157177
@@ -181,14 +201,47 @@
182202
183203 return fh, file_id
184204
 205+
185206 def write_xml_diff(fh, article):
186207 pass
187208
188209
189 -def write_json_diff(fh, article):
190 - json.dump(article, fh)
 210+def write_json_diff(fh, revisions):
 211+ fh.write('\nStart new JSON object\n')
 212+ json.dump(revisions, fh, indent=4, sort_keys=True)
191213
192214
 215+def store_json_diffs(rts):
 216+ files = os.listdir(rts.diffs)
 217+ print files, rts.diffs
 218+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
 219+ buffer = cStringIO.StringIO()
 220+
 221+ for filename in files:
 222+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
 223+ for line in fh:
 224+ if line.startswith('\n') or line.startswith('Start'):
 225+ obj = buffer.getvalue()
 226+ if obj != '':
 227+ obj = json.loads(obj)
 228+ obj[0]['article_id'] = int(obj[0]['article_id'])
 229+ for key, value in obj[0].iteritems():
 230+ if type(value) == type(dict()):
 231+ value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
 232+ obj[0][key] = value
 233+ obj = obj[0]
 234+ #print obj
 235+ #print len(obj)
 236+ try:
 237+ db.save(obj)
 238+ except bson.errors.InvalidDocument, error:
 239+ print error
 240+ buffer = cStringIO.StringIO()
 241+ else:
 242+ buffer.write(line)
 243+ fh.close()
 244+
 245+
193246 def write_diff(fh, article, format):
194247 if format == 'xml':
195248 write_xml_diff(fh, article)
@@ -198,23 +251,47 @@
199252 raise exceptions.OutputNotSupported()
200253
201254
202 -def diff_revision(rev1, rev2, xml_namespace):
203 - buffer = cStringIO.StringIO()
204 - if rev1.text != None and rev2.text != None:
205 - diff = difflib.unified_diff(rev1.text, rev2.text, n=0, lineterm='')
 255+def diff_revision(rev1, rev2):
 256+ if rev1 == None:
 257+ rev1 = ''
 258+ if rev2 == None:
 259+ rev2 = ''
 260+ if len(rev1) != len(rev2):
 261+ buffer = cStringIO.StringIO()
 262+ rev1 = rev1.splitlines(1)
 263+ rev2 = rev2.splitlines(2)
 264+
 265+ diff = difflib.unified_diff(rev1, rev2, n=0, lineterm='')
206266 for line in diff:
207267 if len(line) > 3:
208 - print line
209 - buffer.write(line)
 268+ #print line
 269+ buffer.write(line.encode('utf-8'))
210270
211 - return buffer.getvalue()
 271+ diff = buffer.getvalue()
212272
 273+ if diff == '':
 274+ return None
 275+ else:
 276+ return diff
 277+ else:
 278+ return None
 279+
 280+
 281+def store_diffs_debug(rts):
 282+ db = storage.init_database(rts)
 283+ files = os.listdir(rts.diffs)
 284+ for filename in files:
 285+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
 286+ diffs = json.load(fh)
 287+ db.insert(diffs)
 288+ fh.close()
 289+
 290+
213291 def stream_raw_xml(input_queue, process_id, rts, format):
214292 '''
215293 This function fetches an XML file from the queue and launches the processor.
216294 '''
217295 t0 = datetime.now()
218 - file_id = 0
219296
220297 while True:
221298 filename = input_queue.get()
@@ -225,7 +302,7 @@
226303
227304 print filename
228305 fh = file_utils.create_streaming_buffer(filename)
229 - parse_xml(fh, format, process_id, rts.input_location)
 306+ parse_xml(fh, format, process_id, rts.diffs)
230307 fh.close()
231308
232309 t1 = datetime.now()
@@ -266,7 +343,14 @@
267344
268345 input_queue.join()
269346
 347+ store_json_diffs(rts)
 348+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
 349+ db.add_index('title')
 350+ db.add_index('timestamp')
 351+ db.add_index('username')
 352+ db.add_index('ns')
270353
 354+
271355 def launcher_simple():
272356 location = 'c:\\wikimedia\\nl\\wiki\\'
273357 output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
@@ -311,5 +395,6 @@
312396
313397
314398 if __name__ == '__main__':
 399+ #read_json_diffs()
315400 launcher_simple()
316401 #debug()
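
diff_revision now diffs the raw revision texts rather than the XML elements, buffering only the changed lines. A simplified sketch of the same idea; it drops the unified-diff headers by prefix where the code above filters on line length:

    import difflib
    import cStringIO

    def diff_revision(rev1, rev2):
        rev1 = (rev1 or '').splitlines(True)  # keep line endings
        rev2 = (rev2 or '').splitlines(True)
        buffer = cStringIO.StringIO()
        for line in difflib.unified_diff(rev1, rev2, n=0, lineterm=''):
            if line.startswith(('---', '+++', '@@')):
                continue
            buffer.write(line.encode('utf-8'))
        diff = buffer.getvalue()
        return diff if diff != '' else None

    print diff_revision('one\ntwo\n', 'one\nthree\n')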
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -22,8 +22,8 @@
2323 parsing the XML on the fly and extracting & constructing the variables that are
2424 need for subsequent analysis. The extract module is initialized using an
2525 instance of RunTimeSettings and the most important parameters are:
26 -The name of project\n
27 -The language of the project\n
 26+The name of project
 27+The language of the project
2828 The location where the dump files are stored
2929 '''
3030
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -338,11 +338,11 @@
339339 db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
340340 db_dataset.drop_collection()
341341 editors = db_raw.retrieve_editors()
342 - return editors
 342+ return editors, db_raw, db_dataset
343343
344344
345345 def transform_editors_multi_launcher(rts):
346 - editors = setup_database(rts)
 346+ editors, db_raw, db_dataset = setup_database(rts)
347347 n = editors.size()
348348 result = queue.JoinableRetryQueue()
349349 pbar = progressbar.ProgressBar(maxval=n).start()
@@ -372,7 +372,7 @@
373373
374374 def transform_editors_single_launcher(rts):
375375 print rts.dbname, rts.editors_raw
376 - editors = setup_database(rts)
 376+ editors, db_raw, db_dataset = setup_database(rts)
377377 n = editors.size()
378378 pbar = progressbar.ProgressBar(maxval=n).start()
379379
@@ -384,7 +384,7 @@
385385 editors.task_done()
386386 if editor == None:
387387 break
388 - editor = Editor(rts, editor)
 388+ editor = Editor(rts, editor, db_raw, db_dataset)
389389 editor()
390390
391391 pbar.update(pbar.currval + 1)
Index: trunk/tools/editor_trends/statistics/stata/ppi.do
@@ -1,5 +1,11 @@
22 clear
33 insheet using "C:\Users\diederik.vanliere\Desktop\ppi_quality.csv"
 4+
 5+gen diff_character_count = character_count_a - character_count_b
 6+gen diff_cum_edit_count_main_ns = cum_edit_count_main_ns_a- cum_edit_count_main_ns_b
 7+gen diff_cum_edit_count_other_ns = cum_edit_count_other_ns_a- cum_edit_count_other_ns_b
 8+gen diff_article_count = article_count_a- article_count_b
 9+
410 label var character_count_a "PPI editor"
511 label var character_count_b "Regular editor"
612
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -17,23 +17,35 @@
1818 __date__ = '2011-04-12'
1919 __version__ = '0.1'
2020
 21+import os
 22+import sys
 23+import cPickle
2124 import codecs
22 -import os
2325 from datetime import datetime
24 -import json
 26+sys.path.append('../')
2527
26 -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
 28+from classes import storage
 29+
 30+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
2731 files = os.listdir(location)
2832 files.reverse()
29 -dataset = codecs.open('training.tsv', 'w', 'utf-8')
 33+
 34+max_size = 2147483648
 35+max_size_reached = False
 36+
3037 t0 = datetime.now()
31 -max_size = 2147483648
3238 titles = {}
3339 ids = set()
 40+dates = {}
 41+edits = {}
 42+ignore_ids = set()
3443 size = 0
3544 cnt_obs = 0
36 -max_size_reached = False
 45+cutoff_date = datetime(2010, 8, 31)
3746
 47+print 'Constructing training dataset...'
 48+db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 49+dataset = codecs.open('training.tsv', 'w', 'utf-8')
3850 for filename in files:
3951 if not filename.startswith('comments') and not filename.startswith('articles'):
4052 fh = codecs.open(os.path.join(location, filename))
@@ -46,13 +58,25 @@
4759 continue
4860 if line[10] == '1':
4961 continue
 62+ timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
 63+ if timestamp > cutoff_date:
 64+ continue
5065 username = line[3].lower()
51 - if username.endswith('bot'):
 66+ if username.endswith('bot') or username.find('script') > -1:
5267 #line[10] = '1'
5368 continue
 69+ id = line[2]
 70+ if id not in ids and id not in ignore_ids:
 71+ res = db.find_one('editor', id)
 72+ if res == None:
 73+ ignore_ids.add(id)
 74+ continue
5475 cnt_obs += 1
5576 title_id = line[1]
56 - ids.add(line[2])
 77+ ids.add(id)
 78+ simple_date = '%s-%s' % (timestamp.year, timestamp.month)
 79+ dates.setdefault(simple_date, 0)
 80+ dates[simple_date] += 1
5781 title = line.pop(5)
5882 titles[title_id] = title
5983 line.append('\n')
@@ -64,20 +88,54 @@
6589
6690 dataset.close()
6791
 92+print 'Constructing title dataset...'
6893 fh = codecs.open('titles.tsv', 'w', 'utf-8')
6994 for id, title in titles.iteritems():
7095 fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
7196 fh.close()
7297
73 -fh = codecs.open('ids.json', 'w', 'utf-8')
74 -json.dump(ids, fh)
75 -#for id in ids:
76 -#fh.write('%s\n' % (id.decode('utf-8')))
77 -#fh.write('%s\n' % (json.du)
 98+
 99+print 'Constructing solution dataset...'
 100+x = 0
 101+fh = codecs.open('solutions.tsv', 'w', 'utf-8')
 102+for id in ids:
 103+ if id not in ignore_ids:
 104+ obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
 105+ if obs != None:
 106+ x += 1
 107+ n = obs['cum_edit_count_main_ns']
 108+ fh.write('%s,%s\n' % (id.decode('utf-8'), n))
 109+ edits.setdefault(n, 0)
 110+ edits[n] += 1
 111+ else:
 112+ print id
78113 fh.close()
79114
 115+print 'Storing date histogram'
 116+fh = open('histogram_dates.bin', 'wb')
 117+cPickle.dump(dates, fh)
 118+fh.close()
 119+
 120+
 121+fh = open('histogram_dates.tsv', 'w')
 122+for date, n in dates.iteritems():
 123+ fh.write('%s\t%s\n' % (date, n))
 124+fh.close()
 125+
 126+
 127+print 'Storing edit histogram'
 128+fh = open('histogram_edits.bin', 'wb')
 129+cPickle.dump(edits, fh)
 130+fh.close()
 131+
 132+fh = open('histogram_edits.tsv', 'w')
 133+for edit, n in edits.iteritems():
 134+ fh.write('%s\t%s\n' % (edit, n))
 135+fh.close()
 136+
 137+
80138 t1 = datetime.now()
81 -print 'Descriptives:\n'
82 -print 'Number of editors: %s' % len(ids)
 139+print 'Descriptives:'
 140+print 'Number of editors: %s' % x
83141 print 'Number of edits: %s' % cnt_obs
84142 print 'It took %s to construct the Kaggle training set' % (t1 - t0)
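
training.py now skips edits after a cutoff date, checks each editor id against the enwiki_editors_dataset collection, and pickles per-month and per-edit-count histograms. A compact sketch of the date filter and histogram bookkeeping, with invented timestamps:

    import cPickle
    from datetime import datetime

    cutoff_date = datetime(2010, 8, 31)
    dates = {}
    for ts in ['2010-07-01T12:00:00Z', '2010-07-15T08:00:00Z', '2010-09-02T09:30:00Z']:
        timestamp = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ')
        if timestamp > cutoff_date:
            continue  # edits after the cutoff stay out of the training set
        simple_date = '%s-%s' % (timestamp.year, timestamp.month)
        dates.setdefault(simple_date, 0)
        dates[simple_date] += 1

    fh = open('histogram_dates.bin', 'wb')
    cPickle.dump(dates, fh)
    fh.close()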
Index: trunk/tools/editor_trends/classes/projects.py
@@ -107,9 +107,12 @@
108108 pc = ProjectContainer()
109109 pc.supported_projects()
110110
111 -def init():
 111+def init(project=None):
112112 pc = ProjectContainer()
113 - return pc.get_project('wiki')
 113+ if project:
 114+ return pc.get_project(project)
 115+ else:
 116+ return pc.get_project('wiki')
114117
115118 if __name__ == '__main__':
116119 debug()
Index: trunk/tools/editor_trends/classes/settings.py
@@ -17,11 +17,6 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
21 -'''
22 -This file contains settings that are used for constructing and analyzing
23 -the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
24 -'''
25 -
2621 from multiprocessing import cpu_count
2722 import ConfigParser
2823 import os
@@ -73,7 +68,7 @@
7469 #Change this to match your computers configuration (RAM / CPU)
7570 # I want to get rid off these two variables.
7671 self.number_of_processes = cpu_count()
77 - self.windows_register = {'7z.exe': 'Software\\7-Zip'}
 72+ #self.windows_register = {'7z.exe': 'Software\\7-Zip'}
7873
7974 self.wp_dump_location = 'http://dumps.wikimedia.org'
8075
@@ -107,6 +102,8 @@
108103 self.default_project = config.get('wiki', 'project')
109104 self.default_language = config.get('wiki', 'language')
110105 self.storage = config.get('storage', 'db')
 106+ self.master = config.get('cluster', 'master')
 107+ self.slaves = config.get('cluster', 'slaves')
111108 return True
112109 except Exception, error:
113110 #raise exceptions.GenericMessage('corrupted_config')
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -27,6 +27,9 @@
2828 import datetime
2929 import time
3030
 31+if '..' not in sys.path:
 32+ sys.path.append('../')
 33+
3134 from settings import Settings
3235 from analyses import inventory
3336 from classes import exceptions
@@ -48,49 +51,49 @@
4952 self.language = language
5053 self.dbname = 'wikilytics'
5154
52 - if args:
53 - self.args = args
54 - self.hash = self.secs_since_epoch()
55 - #print self.settings.input_location
56 - #print self.get_value('location')
57 - self.project = self.update_project_settings()
58 - self.language = self.update_language_settings()
 55+ #if args:
 56+ self.args = args
 57+ self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month')
 58+ #print self.settings.input_location
 59+ #print self.get_value('location')
 60+ self.project = self.update_project_settings()
 61+ self.language = self.update_language_settings()
5962
60 - self.input_location = self.set_input_location()
61 - self.output_location = self.set_output_location()
 63+ self.input_location = self.set_input_location()
 64+ self.output_location = self.set_output_location()
6265
63 - self.plugins = self.set_plugin()
64 - self.keywords = self.split_keywords()
65 - self.namespaces = self.get_namespaces()
 66+ self.plugins = self.set_plugin()
 67+ self.keywords = self.split_keywords()
 68+ self.namespaces = self.get_namespaces()
6669
67 - self.kaggle = self.get_value('kaggle')
68 - self.function = self.get_value('func')
69 - self.ignore = self.get_value('except')
70 - self.force = self.get_value('force')
71 - self.analyzer_collection = self.get_value('collection')
 70+ #self.kaggle = self.get_value('kaggle')
 71+ self.function = self.get_value('func')
 72+ self.ignore = self.get_value('except')
 73+ self.force = self.get_value('force')
 74+ self.analyzer_collection = self.get_value('collection')
7275
73 - self.dataset = os.path.join(self.dataset_location, self.project.name)
74 - self.txt = os.path.join(self.output_location, 'txt')
75 - self.sorted = os.path.join(self.output_location, 'sorted')
76 - self.diffs = os.path.join(self.output_location, 'diffs')
 76+ self.dataset = os.path.join(self.dataset_location, self.project.name)
 77+ self.txt = os.path.join(self.output_location, 'txt')
 78+ self.sorted = os.path.join(self.output_location, 'sorted')
 79+ self.diffs = os.path.join(self.output_location, 'diffs')
7780
78 - self.directories = [self.output_location,
79 - self.txt,
80 - self.sorted,
81 - self.dataset,
82 - self.diffs]
83 - self.verify_environment(self.directories)
 81+ self.directories = [self.output_location,
 82+ self.txt,
 83+ self.sorted,
 84+ self.dataset,
 85+ self.diffs]
 86+ self.verify_environment(self.directories)
8487
85 - #Wikidump file related variables
86 - self.dump_filename = self.generate_wikidump_filename()
87 - self.dump_relative_path = self.set_dump_path()
88 - self.dump_absolute_path = self.set_dump_path(absolute=True)
 88+ #Wikidump file related variables
 89+ self.dump_filename = self.generate_wikidump_filename()
 90+ self.dump_relative_path = self.set_dump_path()
 91+ self.dump_absolute_path = self.set_dump_path(absolute=True)
8992
90 - #Collection names
91 - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
92 - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
93 - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
94 - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
 93+ #Collection names
 94+ self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
 95+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
 96+ self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
 97+ self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
9598
9699
97100
@@ -239,7 +242,7 @@
240243 '''
241244 default = self.project
242245 proj = self.get_value('project')
243 - if proj != 'wiki':
 246+ if proj != default:
244247 pc = projects.ProjectContainer()
245248 proj = pc.get_project(proj)
246249 return proj
@@ -281,7 +284,7 @@
282285 return ['0'] #Assume that the mainspace is of interest
283286
284287
285 -def init_environment(project, language_code, args):
 288+def init_environment(project, language_code):
286289 '''
287290 Initialize an instance of RuntimeSettings.
288291 '''
@@ -289,8 +292,9 @@
290293 project = pjc.get_project(project)
291294 lnc = languages.LanguageContainer()
292295 language = lnc.get_language(language_code)
293 -
294 - args.language = language.name
295 - args.project = project.name
 296+ parser = init_args_parser(language_code, project)
 297+ args = parser.parse_args(['django'])
 298+ #args.language = language.name
 299+ #args.project = project.name
296300 rts = RunTimeSettings(project, language, args)
297301 return rts
Index: trunk/tools/editor_trends/classes/storage.py
@@ -102,9 +102,14 @@
103103 This class provides the functionality to talk to a MongoDB backend including
104104 inserting, finding, and updating data.
105105 '''
106 - def __init__(self, dbname, collection):
 106+ def __init__(self, dbname, collection, master=None, slaves=[]):
 107+ if master == None:
 108+ self.master = 'localhost'
 109+ else:
 110+ self.master = master
 111+ self.slaves = slaves
 112+ self.port = 27017
107113 super(Mongo, self).__init__(dbname, collection)
108 - self.port = 27017
109114
110115 @classmethod
111116 def is_registrar_for(cls, storage):
@@ -114,8 +119,16 @@
115120 return storage == 'mongo'
116121
117122 def connect(self):
118 - db = pymongo.Connection()
119 - return db[self.dbname]
 123+ master = pymongo.Connection(host=self.master, port=self.port)
 124+ if self.master == 'localhost':
 125+ return master[self.dbname]
 126+ else:
 127+ slave_connections = []
 128+ for slave in self.slaves:
 129+ slave = pymongo.Connection(host=slave, port=self.port)
 130+ slave_connections.append(slave)
 131+ master_slave_connection = pymongo.MasterSlaveConnection(master, slave_connections)
 132+ return master_slave_connection[self.dbname]
120133
121134 def save(self, data):
122135 assert isinstance(data, dict), 'You need to feed me dictionaries.'
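
Mongo.connect now builds a master/slave connection when the configured master is not localhost. A hedged sketch; depending on the pymongo version, MasterSlaveConnection may need to be imported from pymongo.master_slave_connection rather than referenced off the package as the code above does, and the hostnames are placeholders:

    import pymongo
    from pymongo.master_slave_connection import MasterSlaveConnection

    master = pymongo.Connection(host='localhost', port=27017)
    slaves = [pymongo.Connection(host=h, port=27017) for h in ('db1', 'db2')]
    conn = MasterSlaveConnection(master, slaves)
    db = conn['wikilytics']  # reads may be spread over the slaves; writes go to the master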
Index: trunk/tools/editor_trends/classes/languages.py
@@ -679,9 +679,12 @@
680680 print abbr
681681 print len(abbr)
682682
683 -def init():
 683+def init(language_code=None):
684684 lnc = LanguageContainer()
685 - return lnc.languages[lnc.default]
 685+ if language_code:
 686+ return lnc.languages[language_code]
 687+ else:
 688+ return lnc.languages[lnc.default]
686689
687690 if __name__ == '__main__':
688691 init()
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -173,9 +173,10 @@
174174 '''Create a filehandle for text file with utf-8 encoding'''
175175 filename = str(filename)
176176 if not filename.endswith('.csv'):
177 - filename = construct_filename(filename, '.csv')
 177+ if filename.find('.') == -1:
 178+ filename = construct_filename(filename, '.csv')
178179 path = os.path.join(location, filename)
179 - return codecs.open(path, mode, encoding='utf-8')
 180+ return codecs.open(path, mode, encoding)
180181
181182
182183 def create_streaming_buffer(path):
@@ -189,7 +190,8 @@
190191 fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True,
191192 stdout=subprocess.PIPE, bufsize=65535).stdout
192193 elif extension == '.xml':
193 - fh = create_txt_filehandle(path, None, 'r', 'utf-8')
 194+ location, filename = os.path.split(path)
 195+ fh = create_txt_filehandle(location, filename, 'r', 'utf-8')
194196 else:
195197 raise exceptions.CompressedFileNotSupported(extension)
196198 return fh
@@ -247,6 +249,7 @@
248250 os.utime(path, (mod_rem, mod_rem))
249251 #sraise exceptions.NotYetImplementedError(set_modified_data)
250252
 253+
251254 def get_modified_date(location, filename):
252255 '''determine the date the file was originally created'''
253256 path = os.path.join(location, filename)
Index: trunk/tools/editor_trends/utils/log.py
@@ -31,11 +31,9 @@
3232 def to_db(rts, jobtype, task, timer, event='start'):
3333 db = storage.init_database(rts.storage, rts.dbname, 'jobs')
3434 created = datetime.datetime.now()
35 - hash = '%s_%s' % (rts.project, rts.hash)
 35+ job = db.find_one('hash', rts.id)
3636
37 - job = db.find_one('hash', hash)
38 -
39 - data = {'hash': hash,
 37+ data = {'hash': rts.id,
4038 'created': created,
4139 'jobtype': jobtype,
4240 'in_progress': True,
@@ -60,7 +58,7 @@
6159 t['start'] = timer.t0
6260 t['in_progress'] = True
6361 tasks[task] = t
64 - db.update('hash', hash, {'$set': {'tasks': tasks}})
 62+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
6563 #coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
6664 elif event == 'finish':
6765 t['finish'] = timer.t1
@@ -68,11 +66,11 @@
6967 tasks[task] = t
7068 if task == 'transform' or jobtype == 'chart':
7169 #final task, set entire task to finished
72 - db.update('hash', hash, {'$set': {'tasks': tasks,
 70+ db.update('hash', rts.id, {'$set': {'tasks': tasks,
7371 'in_progress': False,
7472 'finished': True}})
7573 else:
76 - db.update('hash', hash, {'$set': {'tasks': tasks}})
 74+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
7775
7876
7977 def to_csv(logger, settings, message, verb, function, **kwargs):
Index: trunk/tools/editor_trends/utils/text_utils.py
@@ -20,6 +20,7 @@
2121 import datetime
2222 import time
2323 import sys
 24+import re
2425
2526 if '..' not in sys.path:
2627 sys.path.append('..')
@@ -52,6 +53,14 @@
5354 return dict([[v, k] for k, v in dictionary.items()])
5455
5556
 57+def validate_hostname(hostname):
 58+ regex_hostname = re.compile('^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')
 59+ res = re.match(regex_hostname, hostname)
 60+ if res == None:
 61+ return False
 62+ else:
 63+ return True
 64+
5665 def get_max_width(table, index):
5766 '''
5867 Get the maximum width of the given column index
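
The validate_hostname helper added above backs the new cluster prompts in manage.py. A standalone sketch of the same check with a few illustrative inputs:

    import re

    regex_hostname = re.compile(
        r'^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?'
        r'(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')

    def validate_hostname(hostname):
        return re.match(regex_hostname, hostname) is not None

    print validate_hostname('localhost')         # True
    print validate_hostname('db-1.example.org')  # True
    print validate_hostname('-bad.example')      # False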