r75176 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r75175 | r75176 | r75177 >
Date:22:00, 21 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
Minor improvements and bug fixes.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/utils/dump_downloader.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/xml.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 #Default Python libraries (Python => 2.6)
1922 import sys
@@ -189,7 +192,7 @@
190193 values = []
191194 #print data_queue.qsize()
192195
193 -
 196+
194197 except Empty:
195198 # The queue is empty but store the remaining values if present
196199 if values != []:
@@ -204,7 +207,7 @@
205208 are finished and this Queue is empty than break, else wait for the
206209 Queue to fill.
207210 '''
208 -
 211+
209212 if all([utils.check_if_process_is_running(pid) for pid in pids]):
210213 pass
211214 #print 'Empty queue or not %s?' % data_queue.qsize()
Index: trunk/tools/editor_trends/wikitree/xml.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 from utils import utils
1922 import settings
Index: trunk/tools/editor_trends/settings.py
@@ -13,8 +13,10 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
18 -
1921 '''
2022 This file contains settings that are used for constructing and analyzing
2123 the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
@@ -91,3 +93,14 @@
9294 #Multiprocess settings used to parallelize workload
9395 #Change this to match your computers configuration (RAM / CPU)
9496 NUMBER_OF_PROCESSES = cpu_count() * 1
 97+
 98+#Extensions of ascii files, this is used to determine the filemode to use
 99+ASCII = ['txt', 'csv', 'xml', 'sql']
 100+
 101+WP_DUMP_LOCATION = 'http://download.wikimedia.org'
 102+
 103+LANGUAGE_MAPPING = {
 104+'English': '/enwiki/latest/',
 105+'Russian': '/ruwiki/latest/',
 106+'German': '/dewiki/latest',
 107+}
Index: trunk/tools/editor_trends/utils/utils.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 '''
1922 The utils module contains helper functions that will be needed throughout.
@@ -138,6 +141,18 @@
139142 fh.close()
140143
141144
 145+def determine_file_extension(filename):
 146+ pos = filename.rfind('.') + 1
 147+ return filename[pos:]
 148+
 149+
 150+def determine_file_mode(extension):
 151+ if extension in settings.ASCII:
 152+ return 'w'
 153+ else:
 154+ return 'wb'
 155+
 156+
142157 def write_data_to_csv(data, location, function, encoding):
143158 filename = construct_filename_from_function(function, '.csv')
144159 fh = open_txt_file(location, filename, 'a', encoding=encoding)
@@ -155,14 +170,19 @@
156171
157172
158173 def open_txt_file(location, filename, mode, encoding):
159 - return codecs.open(location+filename, mode, encoding=encoding)
 174+ return codecs.open(location + filename, mode, encoding=encoding)
160175
 176+
 177+def open_binary_file(location, filename, mode):
 178+ return open(location + filename, mode)
 179+
161180 def construct_filename_from_function(function, extension):
162181 return function.func_name + extension
163182
 183+
164184 def check_file_exists(location, filename):
165185 if hasattr(filename, '__call__'):
166 - filename = construct_filename_from_function(filename, '.bin')
 186+ filename = construct_filename_from_function(filename, '.bin')
167187 if os.path.exists(location + filename):
168188 return True
169189 else:
@@ -181,7 +201,7 @@
182202
183203 def load_object(location, filename):
184204 if hasattr(filename, '__call__'):
185 - filename = construct_filename_from_function(filename, '.bin')
 205+ filename = construct_filename_from_function(filename, '.bin')
186206 if not filename.endswith('.bin'):
187207 filename = filename + '.bin'
188208 fh = open(location + filename, 'rb')
Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -13,8 +13,10 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
18 -
1921 import os
2022 import sys
2123 import urllib2
@@ -22,56 +24,78 @@
2325
2426 import progressbar
2527
 28+import settings
2629 import utils
27 -import settings
2830
2931
 32+
3033 def determine_remote_filesize(url, filename):
3134 '''
3235 @url is the full path of the file to be downloaded
3336 @filename is the name of the file to be downloaded
3437 '''
35 - conn = httplib.HTTPConnection(url)
 38+ if url.startswith('http://'):
 39+ url = url[7:]
 40+ conn = httplib.HTTPConnection(url, 80)
3641 conn.request('HEAD', filename)
3742 res = conn.getresponse()
 43+ conn.close()
3844 if res.status == 200:
39 - return res.getheader('content-length', -1)
 45+ return int(res.getheader('content-length', -1))
4046 else:
4147 return - 1
4248
4349
44 -def download_wp_dump(url, filename, location, pbar):
 50+def download_wp_dump(domain, path, filename, location, filemode, pbar):
4551 '''
4652 This is a very simple replacement for wget and curl because Windows does
4753 support these tools.
4854 @url location of the file to be downloaded
4955 @filename name of the file to be downloaded
5056 @location indicates where to store the file locally
 57+ @filemode indicates whether we are downloading a binary or ascii file.
5158 @pbar is an instance of progressbar.ProgressBar()
5259 '''
5360 chunk = 4096
54 - fh = utils.open_txt_file(location, filename, 'w', settings.ENCODING)
55 - req = urllib2.Request(url + filename)
56 - filesize = determine_remote_filesize(url, filename)
57 - if filesize != -1:
58 - pbar(maxval=filesize).start()
 61+ if filemode == 'w':
 62+ fh = utils.open_txt_file(location, filename, filemode, settings.ENCODING)
 63+ else:
 64+ fh = utils.open_binary_file(location, filename, filemode)
 65+
 66+ filesize = determine_remote_filesize(domain, path + filename)
 67+
 68+
 69+ if filesize != -1 and pbar:
 70+ widgets = ['%s: ' % filename, progressbar.Percentage(), ' ',
 71+ progressbar.Bar(marker=progressbar.RotatingMarker()),' ',
 72+ progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
 73+
 74+ pbar = progressbar.ProgressBar(widgets=widgets,maxval=filesize).start()
 75+ else:
 76+ pbar = False
 77+
 78+ req = urllib2.Request(domain + path + filename)
5979 try:
6080 response = urllib2.urlopen(req)
61 - i = 0
6281 while True:
6382 data = response.read(chunk)
6483 if not data:
65 - print 'Finished downloading %s%s.' % (url, filename)
 84+ print 'Finished downloading %s%s%s.' % (domain, path, filename)
6685 break
67 - f.write(data)
 86+ fh.write(data)
6887
6988 if pbar:
70 - pbar.update(i * chunk)
71 - i += 1
72 - except URLError, error:
 89+ filesize -= chunk
 90+ if filesize < 0:
 91+ chunk = chunk + filesize
 92+ pbar.update(pbar.currval + chunk)
 93+
 94+ except urllib2.URLError, error:
7395 print 'Reason: %s' % error.reason
74 - except HTTPError, error:
 96+ except urllib2.HTTPError, error:
7597 print 'Error: %s' % error.code
 98+ finally:
 99+ fh.close()
76100
77101
78102 if __name__ == '__main__':
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 from multiprocessing import Process, Queue
1922 from Queue import Empty
@@ -106,7 +109,7 @@
107110 data = obj
108111 for d in data:
109112 input_queue.put(d)
110 -
 113+
111114 if poison_pill:
112115 for p in xrange(settings.NUMBER_OF_PROCESSES):
113116 input_queue.put(None)
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 from multiprocessing import Queue
1922 from Queue import Empty
@@ -32,10 +35,10 @@
3336 pass
3437
3538
36 -def retrieve_editor_ids_mongo():
 39+def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True):
3740 if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
3841 retrieve_editor_ids_mongo):
39 - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
 42+ contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
4043 retrieve_editor_ids_mongo)
4144 else:
4245 mongo = db.init_mongo_db('editors')
@@ -43,13 +46,14 @@
4447 contributors = set()
4548 #ids = editors.find().distinct('editor')
4649 ids = editors.find()
47 - for x,id in enumerate(ids):
 50+ for x, id in enumerate(ids):
4851 contributors.add(id['editor'])
49 - if len(contributors) % 25000 == 0:
50 - print x, len(contributors)
51 - if ids != set():
52 - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
53 - return ids
 52+ if len(contributors) == 100000:
 53+ if RANDOM_SAMPLE:
 54+ break
 55+ if contributors != set():
 56+ utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
 57+ return contributors
5458
5559
5660 def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
@@ -58,6 +62,7 @@
5963 debug = kwargs.pop('debug')
6064 mongo = db.init_mongo_db('editors')
6165 editors = mongo['editors']
 66+ data = {}
6267 while True:
6368 try:
6469 if debug:
@@ -65,29 +70,32 @@
6671 else:
6772 id = input_queue.get(block=False)
6873
69 -
 74+ print input_queue.qsize()
7075 if definition == 'Traditional':
71 - obs = editors.find({'editor': id}).sort('date').limit(limit)
 76+
 77+ obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit)
7278 contributors = []
7379 for ob in obs:
7480 contributors.append(ob['date'])
 81+ obs = ''
7582 else:
76 - obs = editors.find({'editor': id}).sort('date')
 83+ obs = editors.find({'editor': id}, {'date':1}).sort('date')
7784 contributors = set()
7885 for ob in obs:
7986 if len(contributors) == limit:
8087 break
8188 else:
8289 contributors.add(ob['date'])
83 -
 90+ obs.close()
8491 if len(contributors) < limit:
8592 new_wikipedian = False
8693 else:
8794 new_wikipedian = True
88 - data = {id: [contributors, new_wikipedian]}
89 - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING)
 95+ data[id] = [contributors, new_wikipedian]
9096
 97+
9198 except Empty:
 99+ utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING)
92100 break
93101
94102
@@ -152,21 +160,26 @@
153161
154162
155163 def debug_retrieve_edits_by_contributor_launcher():
156 - input_queue = Queue()
 164+ q = Queue()
157165 kwargs = {'definition':'Traditional',
158166 'limit': 10,
159 - 'debug': True
 167+ 'debug': False
160168 }
 169+ ids = retrieve_editor_ids_mongo()
 170+ input_queue = pc.load_queue(q, ids)
161171 generate_editor_dataset(input_queue, False, False, kwargs)
162172 #generate_editor_dataset_launcher()
163173 #retrieve_list_contributors()
164174 #retrieve_edits_by_contributor()
165175
166176 def generate_editor_dataset_launcher():
167 - ids = retrieve_editor_ids_mongo()
168 - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, definition='Traditional', limit=10)
 177+ kwargs = {'definition':'Traditional',
 178+ 'limit': 10,
 179+ 'debug': False
 180+ }
 181+ pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, kwargs)
169182
170183
171184 if __name__ == '__main__':
172 - generate_editor_dataset_launcher()
173 - #debug_retrieve_edits_by_contributor_launcher()
 185+ #generate_editor_dataset_launcher()
 186+ debug_retrieve_edits_by_contributor_launcher()
Index: trunk/tools/editor_trends/database/db.py
@@ -13,6 +13,9 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
1821 import sqlite3 as sqlite
1922 from pymongo import Connection
@@ -38,7 +41,7 @@
3942 @collection is the name of the 'table' in mongodb
4043 @key name of the field to create the index
4144 '''
42 -
 45+
4346 mongo = init_mongo_db(db)
4447 collection = mongo[collection]
4548 mongo.collection.create_index(key)

Status & tagging log