Index: trunk/tools/editor_trends/optimize_editors.py
@@ -25,6 +25,7 @@
 import settings
 from database import db
 from utils import process_constructor as pc
+from utils import utils
 import construct_datasets


@@ -67,20 +68,25 @@
     return articles


-def optimize_editors(input_queue, result_queue, pbar, kwargs):
+def sort_edits(edits):
+    edits = utils.merge_list(edits)
+    return sorted(edits, key=itemgetter('date'))
+
+
+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
     dbname = kwargs.pop('dbname')
     mongo = db.init_mongo_db(dbname)
     input = mongo['editors']
     output = mongo['dataset']
-    mongo.output.ensure_index('editor')
-    mongo.output.ensure_index('year_joined')
+    output.ensure_index('editor')
+    output.ensure_index('year_joined')
     definition = kwargs.pop('definition')
     while True:
         try:
             id = input_queue.get(block=False)
             editor = input.find_one({'editor': id})
             edits = editor['edits']
-            edits = sorted(edits, key=itemgetter('date'))
+            edits = sort_edits(edits)
             edit_count = len(edits)
             new_wikipedian = edits[9]['date']
             first_edit = edits[0]['date']
@@ -100,6 +106,7 @@
         except Empty:
             break

+
 def run_optimize_editors(dbname):
     ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
     kwargs = {'definition': 'traditional',
@@ -108,15 +115,16 @@
               'nr_input_processors': 1,
               'nr_output_processors': 0,
               }
-    chunks = {}
-    parts = int(round(float(len(ids)) / 1, 0))
-    a = 0
-    for x in xrange(settings.NUMBER_OF_PROCESSES):
-        b = a + parts
-        chunks[x] = ids[a:b]
-        a = (x + 1) * parts
-        if a >= len(ids):
-            break
+    chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
+#    chunks = {}
+#    parts = int(round(float(len(ids)) / 1, 0))
+#    a = 0
+#    for x in xrange(settings.NUMBER_OF_PROCESSES):
+#        b = a + parts
+#        chunks[x] = ids[a:b]
+#        a = (x + 1) * parts
+#        if a >= len(ids):
+#            break

     pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)

@@ -131,5 +139,5 @@


 if __name__ == '__main__':
-    debug_optimize_editors('test')
-    #run_optimize_editors('test')
+    #debug_optimize_editors('test')
+    run_optimize_editors('enwiki')
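The new sort_edits helper assumes an editor's edits now come out of MongoDB as a dict keyed by year (the layout built by EditorCache.add_years in database/cache.py below) rather than a flat list, so it flattens them with utils.merge_list before sorting by date. A minimal sketch with illustrative data only:

from datetime import datetime
from operator import itemgetter

def merge_list(datalist):
    # flatten a dict of lists into one list (same behaviour as utils.merge_list)
    merged = []
    for d in datalist:
        for x in datalist[d]:
            merged.append(x)
    return merged

def sort_edits(edits):
    edits = merge_list(edits)
    return sorted(edits, key=itemgetter('date'))

edits = {'2009': [{'date': datetime(2009, 5, 1), 'article': 42}],
         '2010': [{'date': datetime(2010, 1, 3), 'article': 7}],
         '2011': []}
print sort_edits(edits)  # chronological order; empty year buckets simply drop out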
Index: trunk/tools/editor_trends/settings.py
@@ -22,19 +22,19 @@
 the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
 '''

-
 from multiprocessing import cpu_count
 import os
 import sys
 import platform
-#try:
-#    from pywin import win32file
-#    '''increase the maximum number of open files on Windows to 1024'''
-#    win32file._setmaxstdio(1024)
-#except ImportError:
-#    pass

 try:
+    from pywin import win32file
+    '''increase the maximum number of open files on Windows to 1024'''
+    win32file._setmaxstdio(1024)
+except ImportError:
+    pass
+
+try:
     import resource
 except ImportError:
     pass
@@ -151,7 +151,7 @@
 NUMBER_OF_PROCESSES = cpu_count() * 1

 #Extensions of ascii files, this is used to determine the filemode to use
-ASCII = ['txt', 'csv', 'xml', 'sql']
+ASCII = ['txt', 'csv', 'xml', 'sql', 'json']

 WP_DUMP_LOCATION = 'http://download.wikimedia.org'

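The Windows file-handle tweak moves from a commented-out block to live code. A small verification sketch; the _getmaxstdio call is an assumption about the pywin32 API and is not part of the commit:

try:
    from pywin import win32file              # import form mirrors the committed code
    win32file._setmaxstdio(1024)             # raise the C runtime limit on open files
    assert win32file._getmaxstdio() == 1024  # assumed pywin32 call, verification only
except ImportError:
    pass                                      # non-Windows systems fall through silently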
Index: trunk/tools/editor_trends/run.py
@@ -33,5 +33,5 @@
 output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
 dbname = 'enwiki'
 #sort.debug_mergesort_feeder(input, output)
-sort.mergesort_launcher(input, output)
-#sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
+#sort.mergesort_launcher(input, output)
+sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Index: trunk/tools/editor_trends/utils/utils.py
@@ -326,6 +326,13 @@
         files.append('.'.join(file))
     return files

+def merge_list(datalist):
+    merged = []
+    for d in datalist:
+        for x in datalist[d]:
+            merged.append(x)
+    return merged
+
 def split_list(datalist, maxval):
     chunks = {}
     a = 0
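merge_list is effectively the inverse of the existing split_list helper: split_list partitions a list into a dict of chunks keyed by chunk number, and merge_list flattens such a dict back into one list. A hedged round-trip sketch (chunk sizes are assumed, not taken from the commit):

# assuming: from utils.utils import split_list, merge_list
ids = ['editor_%d' % i for i in xrange(10)]
chunks = split_list(ids, 3)   # e.g. {0: ids[0:4], 1: ids[4:8], 2: ids[8:10]}
flat = merge_list(chunks)     # flattened again; dict key order is not guaranteed
assert sorted(flat) == sorted(ids)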
Index: trunk/tools/editor_trends/utils/sort.py
@@ -32,8 +32,8 @@
 import utils
 import process_constructor as pc
 from database import cache
+from database import db

-
 def quick_sort(obs):
     if obs == []:
         return []
@@ -106,14 +106,28 @@
     mongo.collection.ensure_index('editor')
     editor_cache = cache.EditorCache(collection)
     prev_contributor = ''
-    for line in readline(file):
+    x = 0
+    edits = 0
+    editors = set()
+    for line in readline(fh):
         contributor = line[0]
+
         if prev_contributor != contributor:
-            editor_cache.add('NEXT', '')
-        value = {'date': line[1], 'article': line[2]}
+            result = editor_cache.add(prev_contributor, 'NEXT')
+            print 'Stored %s editors' % x
+            edits = 0
+            x += 1
+        else:
+            edits += 1
+        if edits == 10:
+            editors.add(contributor)
+        date = utils.convert_timestamp_to_date(line[1])
+        article_id = int(line[2])
+        value = {'date': date, 'article': article_id}
         editor_cache.add(contributor, value)
         prev_contributor = contributor
     fh.close()
+    utils.store_object(editors, settings.BINARY_OBJECT_FILE_LOCATION, 'editors')


 def mergesort_external_launcher(dbname, input, output):
@@ -126,15 +140,17 @@
     chunks = utils.split_list(files, int(x))
     '''1st iteration external mergesort'''
     for chunk in chunks:
-        filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]
-        filename = merge_sorted_files(output, filehandles, chunk)
-        filehandles = [fh.close() for fh in filehandles]
+#        filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]
+#        filename = merge_sorted_files(output, filehandles, chunk)
+#        filehandles = [fh.close() for fh in filehandles]
+        pass
     '''2nd iteration external mergesort, if necessary'''
     if len(chunks) > 1:
-        files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
-        filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]
-        filename = merge_sorted_files(output, filehandles, 'final')
-        filehandles = [fh.close() for fh in filehandles]
+#        files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
+#        filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]
+#        filename = merge_sorted_files(output, filehandles, 'final')
+#        filehandles = [fh.close() for fh in filehandles]
+        filename = 'merged_final.txt'
     store_editors(output, filename, dbname)

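store_editors now relies on the merged input being sorted by contributor: a change in the contributor column flushes the previous editor via the 'NEXT' sentinel, and an editor is only remembered once a run of ten edits has been seen. A self-contained sketch of that counting pattern with illustrative rows:

# rows of (contributor, timestamp, article_id), already sorted by contributor
rows = [('Alice', '2010-01-01 10:00:00', '1')] * 12 + [('Bob', '2010-02-01 09:00:00', '3')] * 3

prev_contributor, edits, editors = '', 0, set()
for contributor, stamp, article in rows:
    if prev_contributor != contributor:
        edits = 0            # a new editor starts a fresh edit count
    else:
        edits += 1
    if edits == 10:
        editors.add(contributor)
    prev_contributor = contributor

print editors                # set(['Alice']): only Alice reaches the 10-edit threshold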
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -82,7 +82,7 @@
     return headers


-def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
+def generate_editor_dataset(input_queue, data_queue, pbar, **kwargs):
     debug = kwargs.pop('debug')
     dbname = kwargs.pop('dbname')
     mongo = db.init_mongo_db(dbname)
@@ -143,16 +143,17 @@
               'dbname': dbname,
               }
     ids = retrieve_editor_ids_mongo(dbname, 'editors')
-    chunks = {}
-    parts = int(round(float(len(ids)) / 1, 0))
-    a = 0
-    for x in xrange(settings.NUMBER_OF_PROCESSES):
-        b = a + parts
-        chunks[x] = ids[a:b]
-        a = (x + 1) * parts
-        if a >= len(ids):
-            break
-
+    chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
+#    chunks = {}
+#    parts = int(round(float(len(ids)) / 1, 0))
+#    a = 0
+#    for x in xrange(settings.NUMBER_OF_PROCESSES):
+#        b = a + parts
+#        chunks[x] = ids[a:b]
+#        a = (x + 1) * parts
+#        if a >= len(ids):
+#            break
+#
     pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs)


@@ -169,5 +170,5 @@

 if __name__ == '__main__':
     #generate_editor_dataset_debug('test')
-    generate_editor_dataset_launcher('test')
+    generate_editor_dataset_launcher('enwiki')
     #debug_retrieve_edits_by_contributor_launcher()
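As in optimize_editors.py, the worker signature changes from a positional kwargs parameter to **kwargs. The difference only matters because of how the worker is invoked; a minimal sketch assuming pc.build_scaffolding ultimately calls the worker with keyword arguments (the worker names here are hypothetical):

def worker_positional(input_queue, data_queue, pbar, kwargs):
    return kwargs

def worker_unpacked(input_queue, data_queue, pbar, **kwargs):
    return kwargs.pop('dbname')

args = (None, None, None)
try:
    worker_positional(*args, dbname='enwiki', debug=False)
except TypeError as e:
    print e                                                    # unexpected keyword argument 'dbname'
print worker_unpacked(*args, dbname='enwiki', debug=False)     # 'enwiki'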
Index: trunk/tools/editor_trends/database/cache.py
@@ -64,12 +64,14 @@
         return sum([self.editors[k].get('obs', 0) for k in self.editors])

     def add(self, key, value):
-        if key == 'NEXT':
+        if value == 'NEXT':
             for editor in self.treshold_editors:
-                self.update(editor, self.editors[editor]['edits'])
+                self.insert(editor, self.editors[editor]['edits'])
                 self.n -= self.editors[editor]['obs']
                 self.number_editors -= 1
                 del self.editors[editor]
+            if key in self.editors:
+                del self.editors[key]
             self.treshold_editors = set()
         else:
             self.cumulative_n += 1
@@ -77,19 +79,33 @@
             if key not in self.editors:
                 self.editors[key] = {}
                 self.editors[key]['obs'] = 0
-                self.editors[key]['edits'] = []
+                self.editors[key]['edits'] = {}
+                self.add_years(key)
                 self.number_editors += 1
-
+
             id = str(self.editors[key]['obs'])
-            self.editors[key]['edits'].append(value)
+            year = str(value['date'].year)
+            self.editors[key]['edits'][year].append(value)
             self.editors[key]['obs'] += 1

             if self.editors[key]['obs'] == self.treshold:
                 self.treshold_editors.add(key)

+    def add_years(self, key):
+        now = datetime.datetime.now().year + 1
+        for year in xrange(2001, now):
+            self.editors[key]['edits'][str(year)] = []
+
+
     def update(self, editor, values):
         self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)

+    def insert(self, editor, values):
+        try:
+            self.collection.insert({'editor': editor, 'edits': values})
+        except:
+            pass
+
     def store(self):
         utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())

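The cache now buckets an editor's edits per year: add_years pre-creates one empty list per year from 2001 up to the current year, and add appends each edit value to the bucket matching its 'date' field, which is assumed to be a datetime. A small illustrative sketch of that layout:

import datetime

# year buckets as add_years builds them: '2001' .. current year, each an empty list
edits = dict((str(year), []) for year in xrange(2001, datetime.datetime.now().year + 1))

value = {'date': datetime.datetime(2005, 7, 14), 'article': 12345}
edits[str(value['date'].year)].append(value)   # lands in the '2005' bucket

print len(edits['2005'])                       # 1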