r76417 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r76416‎ | r76417 | r76418 >
Date:22:43, 9 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Changed MongoDB schema to handle cases where editors have more than 4 MB of edit observations.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (modified) (history)
  • /trunk/tools/editor_trends/run.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/optimize_editors.py
@@ -25,6 +25,7 @@
2626 import settings
2727 from database import db
2828 from utils import process_constructor as pc
 29+from utils import utils
2930 import construct_datasets
3031
3132
@@ -67,20 +68,25 @@
6869 return articles
6970
7071
71 -def optimize_editors(input_queue, result_queue, pbar, kwargs):
 72+def sort_edits(edits):
 73+ edits = utils.merge_list(edits)
 74+ return sorted(edits, key=itemgetter('date'))
 75+
 76+
 77+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
7278 dbname = kwargs.pop('dbname')
7379 mongo = db.init_mongo_db(dbname)
7480 input = mongo['editors']
7581 output = mongo['dataset']
76 - mongo.output.ensure_index('editor')
77 - mongo.output.ensure_index('year_joined')
 82+ output.ensure_index('editor')
 83+ output.ensure_index('year_joined')
7884 definition = kwargs.pop('definition')
7985 while True:
8086 try:
8187 id = input_queue.get(block=False)
8288 editor = input.find_one({'editor': id})
8389 edits = editor['edits']
84 - edits = sorted(edits, key=itemgetter('date'))
 90+ edits = sort_edits(edits)
8591 edit_count = len(edits)
8692 new_wikipedian = edits[9]['date']
8793 first_edit = edits[0]['date']
@@ -100,6 +106,7 @@
101107 except Empty:
102108 break
103109
 110+
104111 def run_optimize_editors(dbname):
105112 ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
106113 kwargs = {'definition': 'traditional',
@@ -108,15 +115,16 @@
109116 'nr_input_processors': 1,
110117 'nr_output_processors': 0,
111118 }
112 - chunks = {}
113 - parts = int(round(float(len(ids)) / 1, 0))
114 - a = 0
115 - for x in xrange(settings.NUMBER_OF_PROCESSES):
116 - b = a + parts
117 - chunks[x] = ids[a:b]
118 - a = (x + 1) * parts
119 - if a >= len(ids):
120 - break
 119+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
 120+# chunks = {}
 121+# parts = int(round(float(len(ids)) / 1, 0))
 122+# a = 0
 123+# for x in xrange(settings.NUMBER_OF_PROCESSES):
 124+# b = a + parts
 125+# chunks[x] = ids[a:b]
 126+# a = (x + 1) * parts
 127+# if a >= len(ids):
 128+# break
121129
122130 pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
123131
@@ -131,5 +139,5 @@
132140
133141
134142 if __name__ == '__main__':
135 - debug_optimize_editors('test')
136 - #run_optimize_editors('test')
 143+ #debug_optimize_editors('test')
 144+ run_optimize_editors('enwiki')
Index: trunk/tools/editor_trends/settings.py
@@ -22,19 +22,19 @@
2323 the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
2424 '''
2525
26 -
2726 from multiprocessing import cpu_count
2827 import os
2928 import sys
3029 import platform
31 -#try:
32 -# from pywin import win32file
33 -# '''increase the maximum number of open files on Windows to 1024'''
34 -# win32file._setmaxstdio(1024)
35 -#except ImportError:
36 -# pass
3730
3831 try:
 32+ from pywin import win32file
 33+ '''increase the maximum number of open files on Windows to 1024'''
 34+ win32file._setmaxstdio(1024)
 35+except ImportError:
 36+ pass
 37+
 38+try:
3939 import resource
4040 except ImportError:
4141 pass
@@ -151,7 +151,7 @@
152152 NUMBER_OF_PROCESSES = cpu_count() * 1
153153
154154 #Extensions of ascii files, this is used to determine the filemode to use
155 -ASCII = ['txt', 'csv', 'xml', 'sql']
 155+ASCII = ['txt', 'csv', 'xml', 'sql', 'json']
156156
157157 WP_DUMP_LOCATION = 'http://download.wikimedia.org'
158158
Index: trunk/tools/editor_trends/run.py
@@ -33,5 +33,5 @@
3434 output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
3535 dbname = 'enwiki'
3636 #sort.debug_mergesort_feeder(input, output)
37 -sort.mergesort_launcher(input, output)
38 -#sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
 37+#sort.mergesort_launcher(input, output)
 38+sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Index: trunk/tools/editor_trends/utils/utils.py
@@ -326,6 +326,13 @@
327327 files.append('.'.join(file))
328328 return files
329329
 330+def merge_list(datalist):
 331+ merged = []
 332+ for d in datalist:
 333+ for x in datalist[d]:
 334+ merged.append(x)
 335+ return merged
 336+
330337 def split_list(datalist, maxval):
331338 chunks = {}
332339 a = 0
Index: trunk/tools/editor_trends/utils/sort.py
@@ -32,8 +32,8 @@
3333 import utils
3434 import process_constructor as pc
3535 from database import cache
 36+from database import db
3637
37 -
3838 def quick_sort(obs):
3939 if obs == []:
4040 return []
@@ -106,14 +106,28 @@
107107 mongo.collection.ensure_index('editor')
108108 editor_cache = cache.EditorCache(collection)
109109 prev_contributor = ''
110 - for line in readline(file):
 110+ x = 0
 111+ edits = 0
 112+ editors = set()
 113+ for line in readline(fh):
111114 contributor = line[0]
 115+
112116 if prev_contributor != contributor:
113 - editor_cache.add('NEXT', '')
114 - value = {'date': line[1], 'article': line[2]}
 117+ result = editor_cache.add(prev_contributor, 'NEXT')
 118+ print 'Stored %s editors' % x
 119+ edits = 0
 120+ x += 1
 121+ else:
 122+ edits += 1
 123+ if edits == 10:
 124+ editors.add(contributor)
 125+ date = utils.convert_timestamp_to_date(line[1])
 126+ article_id = int(line[2])
 127+ value = {'date': date, 'article': article_id}
115128 editor_cache.add(contributor, value)
116129 prev_contributor = contributor
117130 fh.close()
 131+ utils.store_object(editors, settings.BINARY_OBJECT_FILE_LOCATION, 'editors')
118132
119133
120134 def mergesort_external_launcher(dbname, input, output):
@@ -126,15 +140,17 @@
127141 chunks = utils.split_list(files, int(x))
128142 '''1st iteration external mergesort'''
129143 for chunk in chunks:
130 - filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]
131 - filename = merge_sorted_files(output, filehandles, chunk)
132 - filehandles = [fh.close() for fh in filehandles]
 144+# filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]
 145+# filename = merge_sorted_files(output, filehandles, chunk)
 146+# filehandles = [fh.close() for fh in filehandles]
 147+ pass
133148 '''2nd iteration external mergesort, if necessary'''
134149 if len(chunks) > 1:
135 - files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
136 - filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]
137 - filename = merge_sorted_files(output, filehandles, 'final')
138 - filehandles = [fh.close() for fh in filehandles]
 150+# files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
 151+# filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]
 152+# filename = merge_sorted_files(output, filehandles, 'final')
 153+# filehandles = [fh.close() for fh in filehandles]
 154+ filename = 'merged_final.txt'
139155 store_editors(output, filename, dbname)
140156
141157
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -82,7 +82,7 @@
8383 return headers
8484
8585
86 -def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
 86+def generate_editor_dataset(input_queue, data_queue, pbar, **kwargs):
8787 debug = kwargs.pop('debug')
8888 dbname = kwargs.pop('dbname')
8989 mongo = db.init_mongo_db(dbname)
@@ -143,16 +143,17 @@
144144 'dbname': dbname,
145145 }
146146 ids = retrieve_editor_ids_mongo(dbname, 'editors')
147 - chunks = {}
148 - parts = int(round(float(len(ids)) / 1, 0))
149 - a = 0
150 - for x in xrange(settings.NUMBER_OF_PROCESSES):
151 - b = a + parts
152 - chunks[x] = ids[a:b]
153 - a = (x + 1) * parts
154 - if a >= len(ids):
155 - break
156 -
 147+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
 148+# chunks = {}
 149+# parts = int(round(float(len(ids)) / 1, 0))
 150+# a = 0
 151+# for x in xrange(settings.NUMBER_OF_PROCESSES):
 152+# b = a + parts
 153+# chunks[x] = ids[a:b]
 154+# a = (x + 1) * parts
 155+# if a >= len(ids):
 156+# break
 157+#
157158 pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs)
158159
159160
@@ -169,5 +170,5 @@
170171
171172 if __name__ == '__main__':
172173 #generate_editor_dataset_debug('test')
173 - generate_editor_dataset_launcher('test')
 174+ generate_editor_dataset_launcher('enwiki')
174175 #debug_retrieve_edits_by_contributor_launcher()
Index: trunk/tools/editor_trends/database/cache.py
@@ -64,12 +64,14 @@
6565 return sum([self.editors[k].get('obs', 0) for k in self.editors])
6666
6767 def add(self, key, value):
68 - if key == 'NEXT':
 68+ if value == 'NEXT':
6969 for editor in self.treshold_editors:
70 - self.update(editor, self.editors[editor]['edits'])
 70+ self.insert (editor, self.editors[editor]['edits'])
7171 self.n -= self.editors[editor]['obs']
7272 self.number_editors -= 1
7373 del self.editors[editor]
 74+ if key in self.editors:
 75+ del self.editors[key]
7476 self.treshold_editors = set()
7577 else:
7678 self.cumulative_n += 1
@@ -77,19 +79,33 @@
7880 if key not in self.editors:
7981 self.editors[key] = {}
8082 self.editors[key]['obs'] = 0
81 - self.editors[key]['edits'] = []
 83+ self.editors[key]['edits'] = {}
 84+ self.add_years(key)
8285 self.number_editors += 1
83 -
 86+
8487 id = str(self.editors[key]['obs'])
85 - self.editors[key]['edits'].append(value)
 88+ year = str(value['date'].year)
 89+ self.editors[key]['edits'][year].append(value)
8690 self.editors[key]['obs'] += 1
8791
8892 if self.editors[key]['obs'] == self.treshold:
8993 self.treshold_editors.add(key)
9094
 95+ def add_years(self, key):
 96+ now = datetime.datetime.now().year + 1
 97+ for year in xrange(2001, now):
 98+ self.editors[key]['edits'][str(year)] = []
 99+
 100+
91101 def update(self, editor, values):
92102 self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
93103
 104+ def insert(self, editor, values):
 105+ try:
 106+ self.collection.insert({'editor': editor, 'edits': values})
 107+ except:
 108+ pass
 109+
94110 def store(self):
95111 utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())
96112

Status & tagging log