Index: trunk/tools/editor_trends/optimize_editors.py
@@ -115,6 +115,8 @@
         'nr_input_processors': 1,
         'nr_output_processors': 0,
     }
+    print len(ids)
+    ids = list(ids)
     chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
     # chunks = {}
     # parts = int(round(float(len(ids)) / 1, 0))
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -55,6 +55,7 @@
 
     nr_input_processors = kwargs.pop('nr_input_processors')
     nr_output_processors = kwargs.pop('nr_output_processors')
+    poison_pill = kwargs.get('poison_pill', True)
     input_queues = {}
     result_queues = {}
 
@@ -63,7 +64,7 @@
     # assert len(obj)== nr_output_processors
 
     for i, o in enumerate(obj):
-        input_queues[i] = load_input_queue(obj[o], poison_pill=True)
+        input_queues[i] = load_input_queue(obj[o], poison_pill=poison_pill)
         if result_queue:
             result_queues[i] = JoinableQueue()
         else:
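This change stops hardcoding `poison_pill=True` and instead reads the flag from `**kwargs` with a default, so callers such as the mergesort launcher in sort.py can opt out. A poison pill is simply a sentinel placed on a queue so that consumer processes know when to stop. A generic sketch of the pattern (the project's actual load_input_queue is not shown in this diff, so the names below are illustrative):

    from multiprocessing import JoinableQueue

    # Illustrative loader: fill a queue with work items and, optionally, append
    # one poison pill (None) per consumer so workers can shut themselves down.
    def load_input_queue(items, poison_pill=True, consumers=1):
        queue = JoinableQueue()
        for item in items:
            queue.put(item)
        if poison_pill:
            for _ in range(consumers):
                queue.put(None)          # sentinel marking the end of the work
        return queue

    def worker(queue):
        while True:
            task = queue.get()
            if task is None:             # poison pill received: stop consuming
                queue.task_done()
                break
            # ... process task ...
            queue.task_done()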
Index: trunk/tools/editor_trends/utils/sort.py
@@ -27,6 +27,7 @@
 import heapq
 from multiprocessing import Queue
 from Queue import Empty
+import datetime
 
 import settings
 import utils
@@ -71,10 +72,10 @@
 
 def readline(file):
     for line in file:
+        line = line.replace('\n', '')
         if line == '':
             continue
         else:
-            line = line.replace('\n', '')
             line = line.split('\t')
             yield line
 
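Moving the newline strip above the empty-line test matters: a line containing only '\n' previously compared unequal to '' and fell through to split('\t'), yielding a bogus single-field record. A self-contained version of the corrected generator, fed with a list standing in for a filehandle:

    # Minimal standalone version of the corrected readline generator; the real
    # one in utils/sort.py reads from a text filehandle created by utils.
    def readline(file):
        for line in file:
            line = line.replace('\n', '')   # strip the newline *before* the test
            if line == '':                  # a line that was just '\n' is skipped
                continue
            yield line.split('\t')

    fake_file = ['12\t2005-01-01T00:00:00Z\t42\n', '\n', '7\t2005-01-02T00:00:00Z\t9\n']
    print(list(readline(fake_file)))
    # [['12', '2005-01-01T00:00:00Z', '42'], ['7', '2005-01-02T00:00:00Z', '9']]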
@@ -102,26 +103,29 @@
 def store_editors(input, filename, dbname):
     fh = utils.create_txt_filehandle(input, filename, 'r', settings.ENCODING)
     mongo = db.init_mongo_db(dbname)
-    collection = mongo['editors']
+    collection = mongo['test']
     mongo.collection.ensure_index('editor')
     editor_cache = cache.EditorCache(collection)
-    prev_contributor = ''
+    prev_contributor = -1
     x = 0
     edits = 0
     editors = set()
     for line in readline(fh):
-        contributor = line[0]
-
+        if len(line) == 0:
+            continue
+        contributor = int(line[0])
         if prev_contributor != contributor:
-            result = editor_cache.add(prev_contributor, 'NEXT')
-            print 'Stored %s editors' % x
+            if edits >= 10:
+                result = editor_cache.add(prev_contributor, 'NEXT')
+                if result:
+                    editors.add(contributor)
+                    x += 1
+                    print 'Stored %s editors' % x
+            else:
+                editor_cache.clear(prev_contributor)
             edits = 0
-            x += 1
-        else:
-            edits += 1
-            if edits == 10:
-                editors.add(contributor)
-        date = utils.convert_timestamp_to_date(line[1])
+        edits += 1
+        date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1)
         article_id = int(line[2])
         value = {'date': date, 'article': article_id}
         editor_cache.add(contributor, value)
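The rewritten loop now flushes an editor to the cache only once at least 10 edits have been seen for the previous contributor, and clears contributors below that threshold instead of storing them. Stripped of the Mongo-backed EditorCache, the grouping logic over a contributor-sorted stream looks roughly like this (a simplified sketch, not the project's exact bookkeeping):

    # Sketch of the thresholding idea: keep an editor only if the
    # contributor-sorted stream contains at least `threshold` of their edits.
    def filter_active_editors(rows, threshold=10):
        stored = {}
        prev_contributor, edits, buffered = -1, 0, []
        for contributor, date, article_id in rows:
            if prev_contributor != contributor:
                if edits >= threshold:
                    stored[prev_contributor] = buffered   # flush previous editor
                buffered, edits = [], 0                    # otherwise discard
            edits += 1
            buffered.append({'date': date, 'article': article_id})
            prev_contributor = contributor
        if edits >= threshold:                             # flush the final editor
            stored[prev_contributor] = buffered
        return stored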
@@ -177,11 +181,13 @@
         'nr_output_processors': settings.NUMBER_OF_PROCESSES,
         'input': input,
         'output': output,
+        'poison_pill': False
     }
     files = utils.retrieve_file_list(input, 'txt')
     chunks = utils.split_list(files, settings.NUMBER_OF_PROCESSES)
     pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
 
+
 def debug_mergesort_feeder(input, output):
     kwargs = {
         'input': input,
@@ -192,6 +198,7 @@
     q = pc.load_queue(chunks[0])
     mergesort_feeder(q, False, **kwargs)
 
+
 if __name__ == '__main__':
     input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
     output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
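sort.py already imports heapq, and the launcher above splits the sorted text chunks over multiple processes. For reference, merging several already-sorted files back into one stream can be done with the standard library's heapq.merge; this is a generic illustration of that merge step, not necessarily how mergesort_feeder itself works:

    import heapq

    # Generic k-way merge of already-sorted text files; file objects iterate
    # line by line, and heapq.merge interleaves the lines in sorted order.
    def merge_sorted_files(paths):
        handles = [open(path) for path in paths]
        try:
            for line in heapq.merge(*handles):
                yield line
        finally:
            for fh in handles:
                fh.close()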
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -36,9 +36,9 @@
 
 def retrieve_editor_ids_mongo(dbname, collection):
     if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
-                               retrieve_editor_ids_mongo):
+                               'editors.bin'):
         ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
-                                retrieve_editor_ids_mongo)
+                                'editors.bin')
     else:
         mongo = db.init_mongo_db(dbname)
         editors = mongo[collection]
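The fix passes the literal filename 'editors.bin' where the function object retrieve_editor_ids_mongo was previously being used as the cache filename by mistake. The cache-on-disk pattern itself, under the assumption that utils.load_object and its counterpart wrap pickle (the real helpers live in utils.py and may differ), looks roughly like:

    import os
    try:
        import cPickle as pickle   # Python 2
    except ImportError:
        import pickle              # Python 3

    # Sketch of the load-or-build cache pattern, assuming pickle-backed helpers.
    def load_or_build_ids(location, filename='editors.bin', build=set):
        path = os.path.join(location, filename)
        if os.path.exists(path):                 # cached binary object found
            with open(path, 'rb') as fh:
                return pickle.load(fh)
        ids = build()                            # e.g. query MongoDB for editor ids
        with open(path, 'wb') as fh:
            pickle.dump(ids, fh)
        return ids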