r75089 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75088‎ | r75089 | r75090 >
Date:18:08, 20 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
1) Added functionality to generate datasets
2) Fixed creating indexes in MongoDB
3) Updated README.1ST with processing time information
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/README.1ST (modified) (history)
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -143,18 +143,25 @@
144144 elem = cElementTree.XML(data)
145145 output_editor_information(elem)
146146 '''
 147+ if settings.DEBUG:
 148+ utils.track_errors(xml_buffer, error, file, messages)
147149 except UnicodeEncodeError, error:
148150 print error
 151+ if settings.DEBUG:
 152+ utils.track_errors(xml_buffer, error, file, messages)
149153 except MemoryError, error:
150154 '''
151155 There is one xml file causing an out of memory error, not
152 - sure which one yet.
 156+ sure which one yet. This happens when raw_data =
 157+ ''.join(raw_data) is called. 18-22
153158 '''
154159 print error
155 - finally:
 160+ print raw_data[:12]
 161+ print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
156162 if settings.DEBUG:
157163 utils.track_errors(xml_buffer, error, file, messages)
158164
 165+
159166 if pbar:
160167 print xml_queue.qsize()
161168 #utils.update_progressbar(pbar, xml_queue)
@@ -248,17 +255,19 @@
249256 for bot in cursor:
250257 ids[bot['id']] = bot['name']
251258 pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids)
252 - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)]
253 - db.add_index_to_collection('editors', 'editors', keys)
 259+ keys = ['editor']
 260+ for key in keys:
 261+ db.add_index_to_collection('editors', 'editors', key)
254262
255263 def debug_lookup_new_editors():
256264 q = Queue()
257265 import progressbar
258266 pbar = progressbar.ProgressBar().start()
259 - edits = db.init_mongo_db('editors')
260 - lookup_new_editors('1.xml', q, None, None, True)
261 - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)]
262 - db.add_index_to_collection('editors', 'editors', keys)
 267+ #edits = db.init_mongo_db('editors')
 268+ #lookup_new_editors('1.xml', q, None, None, True)
 269+ keys = ['editor']
 270+ for key in keys:
 271+ db.add_index_to_collection('editors', 'editors', key)
263272
264273
265274
@@ -267,10 +276,10 @@
268277
269278
270279 if __name__ == "__main__":
271 - #debug_lookup_new_editors()
 280+ debug_lookup_new_editors()
272281
273 - if settings.RUN_MODE == 'stand_alone':
274 - run_stand_alone()
275 - print 'Finished processing XML files.'
276 - else:
277 - run_hadoop()
 282+# if settings.RUN_MODE == 'stand_alone':
 283+# run_stand_alone()
 284+# print 'Finished processing XML files.'
 285+# else:
 286+# run_hadoop()
Index: trunk/tools/editor_trends/settings.py
@@ -75,6 +75,8 @@
7676
7777 BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/'
7878
 79+DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/'
 80+
7981 #This section contains configuration variables for parsing / encoding and
8082 #working with the XML files.
8183
Index: trunk/tools/editor_trends/utils/utils.py
@@ -138,18 +138,24 @@
139139 fh.close()
140140
141141
142 -def write_data_to_csv(data, function, encoding):
 142+def write_data_to_csv(data, location, function, encoding):
143143 filename = construct_filename_from_function(function, '.csv')
144 - fh = open_txt_file(filename, 'a', encoding=encoding)
 144+ fh = open_txt_file(location, filename, 'a', encoding=encoding)
145145 keys = data.keys()
146146 for key in keys:
147 - for value in data[key]:
148 - fh.write('%s\t%s\n' % (key, value))
 147+ fh.write('%s' % key)
 148+ for obs in data[key]:
 149+ if getattr(obs, '__iter__', False):
 150+ for o in obs:
 151+ fh.write('\t%s' % o)
 152+ else:
 153+ fh.write('\t%s' % (obs))
 154+ fh.write('\n')
149155 fh.close()
150156
151157
152 -def open_txt_file(filename, mode, encoding):
153 - return codecs.open(filename, mode, encoding=encoding)
 158+def open_txt_file(location, filename, mode, encoding):
 159+ return codecs.open(location+filename, mode, encoding=encoding)
154160
155161 def construct_filename_from_function(function, extension):
156162 return function.func_name + extension
Index: trunk/tools/editor_trends/README.1ST
@@ -58,6 +58,30 @@
5959 settings are self-explanatory but in cases of any questions please drop me a
6060 line.
6161
 62+PROCESSING TIMES:
 63+
 64+CONFIG NAMESPACE FILENAME CHUNKING STORING INDEXING RETRIEVING TOTAL
 65+1 0 stub-meta-history 7 3 1 ? 11
 66+
 67+
 68+*CHUNKING == splitting XML file in smaller pieces
 69+*STORING == parsing xml files and storing it in MongoDB
 70+*INDEXING == creating an index in MongoDB
 71+*RETRIEVING == generating a dataset
 72+*TOTAL == sum of all parts
 73+
 74+MACHINE CONFIGURATIONS
 75+
 76+ID OS VERSION MEMORY PROCESSOR SPEED
 77+1 Windows 7 64-bit 4GB Core 2 Duo 2.8GHz
 78+Please add your processing times plus configuration to help improve performance.
 79+
 80+HARDDISK REQUIREMENTS
 81+You will need at least 3x the size of the XML dump file in free space on your
 82+hard disk if you want to create the databases and datasets to run your own
 83+analyses. The English stub-meta-history.xml is about 15GB so you need about
 84+45GB of free disk space.
 85+
6286 CODE:
6387 The Python code adheres to PEP8. Function names are deliberately expressive to
6488 ease understanding what's going on. If you find a bug please email me at dvanliere
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -40,9 +40,14 @@
4141 else:
4242 mongo = db.init_mongo_db('editors')
4343 editors = mongo['editors']
44 - ids = editors.find().distinct('editor')
45 - print ids
46 - if ids != []:
 44+ contributors = set()
 45+ #ids = editors.find().distinct('editor')
 46+ ids = editors.find()
 47+ for x,id in enumerate(ids):
 48+ contributors.add(id['editor'])
 49+ if len(contributors) % 25000 == 0:
 50+ print x, len(contributors)
 51+ if ids != set():
4752 utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
4853 return ids
4954
@@ -60,21 +65,28 @@
6166 else:
6267 id = input_queue.get(block=False)
6368
64 - contributors = set()
 69+
6570 if definition == 'Traditional':
66 - obs = editors.find({'editor': id}).limit(limit) #.sort({'date': 1}).limit(limit)
 71+ obs = editors.find({'editor': id}).sort('date').limit(limit)
 72+ contributors = []
6773 for ob in obs:
68 - contributors.add(ob)
 74+ contributors.append(ob['date'])
6975 else:
70 - obs = editors.find({'editor': id}).sort({'date': 1})
 76+ obs = editors.find({'editor': id}).sort('date')
 77+ contributors = set()
7178 for ob in obs:
72 - if len(dates) > limit:
 79+ if len(contributors) == limit:
7380 break
7481 else:
75 - if edit.date not in dates:
76 - set.add(edit)
77 - utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING)
 82+ contributors.add(ob['date'])
7883
 84+ if len(contributors) < limit:
 85+ new_wikipedian = False
 86+ else:
 87+ new_wikipedian = True
 88+ data = {id: [contributors, new_wikipedian]}
 89+ utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING)
 90+
7991 except Empty:
8092 break
8193
@@ -146,7 +158,7 @@
147159 'debug': True
148160 }
149161 generate_editor_dataset(input_queue, False, False, kwargs)
150 - generate_editor_dataset_launcher()
 162+ #generate_editor_dataset_launcher()
151163 #retrieve_list_contributors()
152164 #retrieve_edits_by_contributor()
153165
@@ -156,4 +168,5 @@
157169
158170
159171 if __name__ == '__main__':
160 - debug_retrieve_edits_by_contributor_launcher()
 172+ generate_editor_dataset_launcher()
 173+ #debug_retrieve_edits_by_contributor_launcher()
Index: trunk/tools/editor_trends/database/db.py
@@ -32,17 +32,17 @@
3333 collection.remove(ids)
3434
3535
36 -def add_index_to_collection(db, collection, keys):
 36+def add_index_to_collection(db, collection, key):
3737 '''
3838 @db is the name of the mongodb
3939 @collection is the name of the 'table' in mongodb
40 - @keys should be a list of keys used to create the index
 40+ @key name of the field to create the index
4141 '''
4242
4343 mongo = init_mongo_db(db)
4444 collection = mongo[collection]
45 - mongo.collection.create_index(keys)
46 - mongo.collection.ensure_index(keys)
 45+ mongo.collection.create_index(key)
 46+ mongo.collection.ensure_index(key)
4747
4848
4949 def init_database(db=None):
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
5050 - wikistats
zips
notes.txt
*.pyc
datasets
errors
5151 + wikistats
zips
notes.txt
*.pyc
datasets
errors
.settings
.project
.pydevproject

Status & tagging log