Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | #Default Python libraries (Python => 2.6) |
19 | 22 | import sys |
— | — | @@ -189,7 +192,7 @@ |
190 | 193 | values = [] |
191 | 194 | #print data_queue.qsize() |
192 | 195 | |
193 | | - |
| 196 | + |
194 | 197 | except Empty: |
195 | 198 | # The queue is empty but store the remaining values if present |
196 | 199 | if values != []: |
— | — | @@ -204,7 +207,7 @@ |
205 | 208 | are finished and this Queue is empty than break, else wait for the |
206 | 209 | Queue to fill. |
207 | 210 | ''' |
208 | | - |
| 211 | + |
209 | 212 | if all([utils.check_if_process_is_running(pid) for pid in pids]): |
210 | 213 | pass |
211 | 214 | #print 'Empty queue or not %s?' % data_queue.qsize() |
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from utils import utils |
19 | 22 | import settings |
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | ''' |
20 | 22 | This file contains settings that are used for constructing and analyzing |
21 | 23 | the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
— | — | @@ -91,3 +93,14 @@ |
92 | 94 | #Multiprocess settings used to parallelize workload |
93 | 95 | #Change this to match your computers configuration (RAM / CPU) |
94 | 96 | NUMBER_OF_PROCESSES = cpu_count() * 1 |
| 97 | + |
| 98 | +#Extensions of ascii files, this is used to determine the filemode to use |
| 99 | +ASCII = ['txt', 'csv', 'xml', 'sql'] |
| 100 | + |
| 101 | +WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
| 102 | + |
| 103 | +LANGUAGE_MAPPING = { |
| 104 | +'English': '/enwiki/latest/', |
| 105 | +'Russian': '/ruwiki/latest/', |
| 106 | +'German': '/dewiki/latest/',
| 107 | +} |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | ''' |
19 | 22 | The utils module contains helper functions that will be needed throughout. |
— | — | @@ -138,6 +141,18 @@ |
139 | 142 | fh.close() |
140 | 143 | |
141 | 144 | |
| 145 | +def determine_file_extension(filename): |
| 146 | + pos = filename.rfind('.') + 1 |
| 147 | + return filename[pos:] |
| 148 | + |
| 149 | + |
| 150 | +def determine_file_mode(extension): |
| 151 | + if extension in settings.ASCII: |
| 152 | + return 'w' |
| 153 | + else: |
| 154 | + return 'wb' |
| 155 | + |
| 156 | + |
142 | 157 | def write_data_to_csv(data, location, function, encoding): |
143 | 158 | filename = construct_filename_from_function(function, '.csv') |
144 | 159 | fh = open_txt_file(location, filename, 'a', encoding=encoding) |
— | — | @@ -155,14 +170,19 @@ |
156 | 171 | |
157 | 172 | |
158 | 173 | def open_txt_file(location, filename, mode, encoding): |
159 | | - return codecs.open(location+filename, mode, encoding=encoding) |
| 174 | + return codecs.open(location + filename, mode, encoding=encoding) |
160 | 175 | |
| 176 | + |
| 177 | +def open_binary_file(location, filename, mode): |
| 178 | + return open(location + filename, mode) |
| 179 | + |
161 | 180 | def construct_filename_from_function(function, extension): |
162 | 181 | return function.func_name + extension |
163 | 182 | |
| 183 | + |
164 | 184 | def check_file_exists(location, filename): |
165 | 185 | if hasattr(filename, '__call__'): |
166 | | - filename = construct_filename_from_function(filename, '.bin') |
| 186 | + filename = construct_filename_from_function(filename, '.bin') |
167 | 187 | if os.path.exists(location + filename): |
168 | 188 | return True |
169 | 189 | else: |
— | — | @@ -181,7 +201,7 @@ |
182 | 202 | |
183 | 203 | def load_object(location, filename): |
184 | 204 | if hasattr(filename, '__call__'): |
185 | | - filename = construct_filename_from_function(filename, '.bin') |
| 205 | + filename = construct_filename_from_function(filename, '.bin') |
186 | 206 | if not filename.endswith('.bin'): |
187 | 207 | filename = filename + '.bin' |
188 | 208 | fh = open(location + filename, 'rb') |
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | import os |
20 | 22 | import sys |
21 | 23 | import urllib2 |
— | — | @@ -22,56 +24,78 @@ |
23 | 25 | |
24 | 26 | import progressbar |
25 | 27 | |
| 28 | +import settings |
26 | 29 | import utils |
27 | | -import settings |
28 | 30 | |
29 | 31 | |
| 32 | + |
30 | 33 | def determine_remote_filesize(url, filename): |
31 | 34 | ''' |
32 | 35 | @url is the full path of the file to be downloaded |
33 | 36 | @filename is the name of the file to be downloaded |
34 | 37 | ''' |
35 | | - conn = httplib.HTTPConnection(url) |
| 38 | + if url.startswith('http://'): |
| 39 | + url = url[7:] |
| 40 | + conn = httplib.HTTPConnection(url, 80) |
36 | 41 | conn.request('HEAD', filename) |
37 | 42 | res = conn.getresponse() |
| 43 | + conn.close() |
38 | 44 | if res.status == 200: |
39 | | - return res.getheader('content-length', -1) |
| 45 | + return int(res.getheader('content-length', -1)) |
40 | 46 | else: |
41 | 47 | return - 1 |
42 | 48 | |
43 | 49 | |
44 | | -def download_wp_dump(url, filename, location, pbar): |
| 50 | +def download_wp_dump(domain, path, filename, location, filemode, pbar): |
45 | 51 | ''' |
46 | 52 | This is a very simple replacement for wget and curl because Windows does |
47 | 53 | support these tools. |
48 | 54 | @url location of the file to be downloaded |
49 | 55 | @filename name of the file to be downloaded |
50 | 56 | @location indicates where to store the file locally |
| 57 | + @filemode indicates whether we are downloading a binary or ascii file. |
51 | 58 | @pbar is an instance of progressbar.ProgressBar() |
52 | 59 | ''' |
53 | 60 | chunk = 4096 |
54 | | - fh = utils.open_txt_file(location, filename, 'w', settings.ENCODING) |
55 | | - req = urllib2.Request(url + filename) |
56 | | - filesize = determine_remote_filesize(url, filename) |
57 | | - if filesize != -1: |
58 | | - pbar(maxval=filesize).start() |
| 61 | + if filemode == 'w': |
| 62 | + fh = utils.open_txt_file(location, filename, filemode, settings.ENCODING) |
| 63 | + else: |
| 64 | + fh = utils.open_binary_file(location, filename, filemode) |
| 65 | + |
| 66 | + filesize = determine_remote_filesize(domain, path + filename) |
| 67 | + |
| 68 | + |
| 69 | + if filesize != -1 and pbar: |
| 70 | + widgets = ['%s: ' % filename, progressbar.Percentage(), ' ', |
| 71 | + progressbar.Bar(marker=progressbar.RotatingMarker()),' ', |
| 72 | + progressbar.ETA(), ' ', progressbar.FileTransferSpeed()] |
| 73 | + |
| 74 | + pbar = progressbar.ProgressBar(widgets=widgets,maxval=filesize).start() |
| 75 | + else: |
| 76 | + pbar = False |
| 77 | + |
| 78 | + req = urllib2.Request(domain + path + filename) |
59 | 79 | try: |
60 | 80 | response = urllib2.urlopen(req) |
61 | | - i = 0 |
62 | 81 | while True: |
63 | 82 | data = response.read(chunk) |
64 | 83 | if not data: |
65 | | - print 'Finished downloading %s%s.' % (url, filename) |
| 84 | + print 'Finished downloading %s%s%s.' % (domain, path, filename) |
66 | 85 | break |
67 | | - f.write(data) |
| 86 | + fh.write(data) |
68 | 87 | |
69 | 88 | if pbar: |
70 | | - pbar.update(i * chunk) |
71 | | - i += 1 |
72 | | - except URLError, error: |
| 89 | + filesize -= chunk |
| 90 | + if filesize < 0: |
| 91 | + chunk = chunk + filesize |
| 92 | + pbar.update(pbar.currval + chunk) |
| 93 | + |
| 94 | + except urllib2.URLError, error: |
73 | 95 | print 'Reason: %s' % error.reason |
74 | | - except HTTPError, error: |
| 96 | + except urllib2.HTTPError, error: |
75 | 97 | print 'Error: %s' % error.code |
| 98 | + finally: |
| 99 | + fh.close() |
76 | 100 | |
77 | 101 | |
78 | 102 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from multiprocessing import Process, Queue |
19 | 22 | from Queue import Empty |
— | — | @@ -106,7 +109,7 @@ |
107 | 110 | data = obj |
108 | 111 | for d in data: |
109 | 112 | input_queue.put(d) |
110 | | - |
| 113 | + |
111 | 114 | if poison_pill: |
112 | 115 | for p in xrange(settings.NUMBER_OF_PROCESSES): |
113 | 116 | input_queue.put(None) |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from multiprocessing import Queue |
19 | 22 | from Queue import Empty |
— | — | @@ -32,10 +35,10 @@ |
33 | 36 | pass |
34 | 37 | |
35 | 38 | |
36 | | -def retrieve_editor_ids_mongo(): |
| 39 | +def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True): |
37 | 40 | if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
38 | 41 | retrieve_editor_ids_mongo): |
39 | | - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 42 | + contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
40 | 43 | retrieve_editor_ids_mongo) |
41 | 44 | else: |
42 | 45 | mongo = db.init_mongo_db('editors') |
— | — | @@ -43,13 +46,14 @@ |
44 | 47 | contributors = set() |
45 | 48 | #ids = editors.find().distinct('editor') |
46 | 49 | ids = editors.find() |
47 | | - for x,id in enumerate(ids): |
| 50 | + for x, id in enumerate(ids): |
48 | 51 | contributors.add(id['editor']) |
49 | | - if len(contributors) % 25000 == 0: |
50 | | - print x, len(contributors) |
51 | | - if ids != set(): |
52 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
53 | | - return ids |
| 52 | + if len(contributors) == 100000: |
| 53 | + if RANDOM_SAMPLE: |
| 54 | + break |
| 55 | + if contributors != set(): |
| 56 | + utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 57 | + return contributors |
54 | 58 | |
55 | 59 | |
56 | 60 | def generate_editor_dataset(input_queue, data_queue, pbar, kwargs): |
— | — | @@ -58,6 +62,7 @@ |
59 | 63 | debug = kwargs.pop('debug') |
60 | 64 | mongo = db.init_mongo_db('editors') |
61 | 65 | editors = mongo['editors'] |
| 66 | + data = {} |
62 | 67 | while True: |
63 | 68 | try: |
64 | 69 | if debug: |
— | — | @@ -65,29 +70,32 @@ |
66 | 71 | else: |
67 | 72 | id = input_queue.get(block=False) |
68 | 73 | |
69 | | - |
| 74 | + print input_queue.qsize() |
70 | 75 | if definition == 'Traditional': |
71 | | - obs = editors.find({'editor': id}).sort('date').limit(limit) |
| 76 | + |
| 77 | + obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit) |
72 | 78 | contributors = [] |
73 | 79 | for ob in obs: |
74 | 80 | contributors.append(ob['date']) |
| 81 | + obs = '' |
75 | 82 | else: |
76 | | - obs = editors.find({'editor': id}).sort('date') |
| 83 | + obs = editors.find({'editor': id}, {'date':1}).sort('date') |
77 | 84 | contributors = set() |
78 | 85 | for ob in obs: |
79 | 86 | if len(contributors) == limit: |
80 | 87 | break |
81 | 88 | else: |
82 | 89 | contributors.add(ob['date']) |
83 | | - |
| 90 | + obs.close() |
84 | 91 | if len(contributors) < limit: |
85 | 92 | new_wikipedian = False |
86 | 93 | else: |
87 | 94 | new_wikipedian = True |
88 | | - data = {id: [contributors, new_wikipedian]} |
89 | | - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
| 95 | + data[id] = [contributors, new_wikipedian] |
90 | 96 | |
| 97 | + |
91 | 98 | except Empty: |
| 99 | + utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
92 | 100 | break |
93 | 101 | |
94 | 102 | |
— | — | @@ -152,21 +160,26 @@ |
153 | 161 | |
154 | 162 | |
155 | 163 | def debug_retrieve_edits_by_contributor_launcher(): |
156 | | - input_queue = Queue() |
| 164 | + q = Queue() |
157 | 165 | kwargs = {'definition':'Traditional', |
158 | 166 | 'limit': 10, |
159 | | - 'debug': True |
| 167 | + 'debug': False |
160 | 168 | } |
| 169 | + ids = retrieve_editor_ids_mongo() |
| 170 | + input_queue = pc.load_queue(q, ids) |
161 | 171 | generate_editor_dataset(input_queue, False, False, kwargs) |
162 | 172 | #generate_editor_dataset_launcher() |
163 | 173 | #retrieve_list_contributors() |
164 | 174 | #retrieve_edits_by_contributor() |
165 | 175 | |
166 | 176 | def generate_editor_dataset_launcher(): |
167 | | - ids = retrieve_editor_ids_mongo() |
168 | | - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, definition='Traditional', limit=10) |
| 177 | + kwargs = {'definition':'Traditional', |
| 178 | + 'limit': 10, |
| 179 | + 'debug': False |
| 180 | + } |
| 181 | +    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, retrieve_editor_ids_mongo(), False, False, kwargs)
169 | 182 | |
170 | 183 | |
171 | 184 | if __name__ == '__main__': |
172 | | - generate_editor_dataset_launcher() |
173 | | - #debug_retrieve_edits_by_contributor_launcher() |
| 185 | + #generate_editor_dataset_launcher() |
| 186 | + debug_retrieve_edits_by_contributor_launcher() |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | import sqlite3 as sqlite |
19 | 22 | from pymongo import Connection |
— | — | @@ -38,7 +41,7 @@ |
39 | 42 | @collection is the name of the 'table' in mongodb |
40 | 43 | @key name of the field to create the index |
41 | 44 | ''' |
42 | | - |
| 45 | + |
43 | 46 | mongo = init_mongo_db(db) |
44 | 47 | collection = mongo[collection] |
45 | 48 | mongo.collection.create_index(key) |