Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -143,18 +143,25 @@ |
144 | 144 | elem = cElementTree.XML(data) |
145 | 145 | output_editor_information(elem) |
146 | 146 | ''' |
| 147 | + if settings.DEBUG: |
| 148 | + utils.track_errors(xml_buffer, error, file, messages) |
147 | 149 | except UnicodeEncodeError, error: |
148 | 150 | print error |
| 151 | + if settings.DEBUG: |
| 152 | + utils.track_errors(xml_buffer, error, file, messages) |
149 | 153 | except MemoryError, error: |
150 | 154 | ''' |
151 | 155 | There is one xml file causing an out of memory error, not |
152 | | - sure which one yet. |
| 156 | + sure which one yet. This happens when raw_data = |
| 157 | + ''.join(raw_data) is called. 18-22 |
153 | 158 | ''' |
154 | 159 | print error |
155 | | - finally: |
| 160 | + print raw_data[:12] |
| 161 | + print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
156 | 162 | if settings.DEBUG: |
157 | 163 | utils.track_errors(xml_buffer, error, file, messages) |
158 | 164 | |
| 165 | + |
159 | 166 | if pbar: |
160 | 167 | print xml_queue.qsize() |
161 | 168 | #utils.update_progressbar(pbar, xml_queue) |
— | — | @@ -248,17 +255,19 @@ |
249 | 256 | for bot in cursor: |
250 | 257 | ids[bot['id']] = bot['name'] |
251 | 258 | pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids) |
252 | | - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)] |
253 | | - db.add_index_to_collection('editors', 'editors', keys) |
| 259 | + keys = ['editor'] |
| 260 | + for key in keys: |
| 261 | + db.add_index_to_collection('editors', 'editors', key) |
254 | 262 | |
255 | 263 | def debug_lookup_new_editors(): |
256 | 264 | q = Queue() |
257 | 265 | import progressbar |
258 | 266 | pbar = progressbar.ProgressBar().start() |
259 | | - edits = db.init_mongo_db('editors') |
260 | | - lookup_new_editors('1.xml', q, None, None, True) |
261 | | - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)] |
262 | | - db.add_index_to_collection('editors', 'editors', keys) |
| 267 | + #edits = db.init_mongo_db('editors') |
| 268 | + #lookup_new_editors('1.xml', q, None, None, True) |
| 269 | + keys = ['editor'] |
| 270 | + for key in keys: |
| 271 | + db.add_index_to_collection('editors', 'editors', key) |
263 | 272 | |
264 | 273 | |
265 | 274 | |
— | — | @@ -267,10 +276,10 @@ |
268 | 277 | |
269 | 278 | |
270 | 279 | if __name__ == "__main__": |
271 | | - #debug_lookup_new_editors() |
| 280 | + debug_lookup_new_editors() |
272 | 281 | |
273 | | - if settings.RUN_MODE == 'stand_alone': |
274 | | - run_stand_alone() |
275 | | - print 'Finished processing XML files.' |
276 | | - else: |
277 | | - run_hadoop() |
| 282 | +# if settings.RUN_MODE == 'stand_alone': |
| 283 | +# run_stand_alone() |
| 284 | +# print 'Finished processing XML files.' |
| 285 | +# else: |
| 286 | +# run_hadoop() |
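
The hunks above move utils.track_errors() out of the removed finally clause and into each except branch; in the finally version, `error` could be unbound whenever the try block succeeded, raising a NameError. A minimal sketch of the resulting pattern (surrounding names taken from the hunk; this is not the full function):

    try:
        elem = cElementTree.XML(data)
        output_editor_information(elem)
    except UnicodeEncodeError, error:
        print error
        if settings.DEBUG:
            utils.track_errors(xml_buffer, error, file, messages)
    except MemoryError, error:
        print error
        if settings.DEBUG:
            utils.track_errors(xml_buffer, error, file, messages)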
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -75,6 +75,8 @@ |
76 | 76 | |
77 | 77 | BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/' |
78 | 78 | |
| 79 | +DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/' |
| 80 | + |
79 | 81 | #This section contains configuration variables for parsing / encoding and |
80 | 82 | #working with the XML files. |
81 | 83 | |
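
The new DATASETS_FILE_LOCATION follows the same convention as BINARY_OBJECT_FILE_LOCATION above and is consumed by construct_datasets.py later in this commit:

    utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION,
                            generate_editor_dataset, settings.ENCODING)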
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -138,18 +138,24 @@ |
139 | 139 | fh.close() |
140 | 140 | |
141 | 141 | |
142 | | -def write_data_to_csv(data, function, encoding): |
| 142 | +def write_data_to_csv(data, location, function, encoding): |
143 | 143 | filename = construct_filename_from_function(function, '.csv') |
144 | | - fh = open_txt_file(filename, 'a', encoding=encoding) |
| 144 | + fh = open_txt_file(location, filename, 'a', encoding=encoding) |
145 | 145 | keys = data.keys() |
146 | 146 | for key in keys: |
147 | | - for value in data[key]: |
148 | | - fh.write('%s\t%s\n' % (key, value)) |
| 147 | + fh.write('%s' % key) |
| 148 | + for obs in data[key]: |
| 149 | + if getattr(obs, '__iter__', False): |
| 150 | + for o in obs: |
| 151 | + fh.write('\t%s' % o) |
| 152 | + else: |
| 153 | + fh.write('\t%s' % (obs)) |
| 154 | + fh.write('\n') |
149 | 155 | fh.close() |
150 | 156 | |
151 | 157 | |
152 | | -def open_txt_file(filename, mode, encoding): |
153 | | - return codecs.open(filename, mode, encoding=encoding) |
| 158 | +def open_txt_file(location, filename, mode, encoding): |
| 159 | + return codecs.open(location+filename, mode, encoding=encoding) |
154 | 160 | |
155 | 161 | def construct_filename_from_function(function, extension): |
156 | 162 | return function.func_name + extension |
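
The reworked write_data_to_csv() flattens one level of nesting, so a value list may mix iterables and scalars. A hypothetical input and the row it produces (editor name and dates invented for illustration):

    data = {'SomeEditor': [['2004-01-01', '2004-02-01'], True]}
    # writes one tab-separated line:
    # SomeEditor\t2004-01-01\t2004-02-01\tTrue

Note that getattr(obs, '__iter__', False) is used rather than iter(obs) because Python 2 strings have no __iter__ attribute, so string values are written whole instead of being split into characters.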
Index: trunk/tools/editor_trends/README.1ST |
— | — | @@ -58,6 +58,30 @@ |
59 | 59 | settings are self-explanatory but in case of any questions please drop me a |
60 | 60 | line. |
61 | 61 | |
| 62 | +PROCESSING TIMES: |
| 63 | + |
| 64 | +CONFIG NAMESPACE FILENAME CHUNKING STORING INDEXING RETRIEVING TOTAL |
| 65 | +1 0 stub-meta-history 7 3 1 ? 11 |
| 66 | + |
| 67 | + |
| 68 | +*CHUNKING == splitting the XML file into smaller pieces |
| 69 | +*STORING == parsing the XML files and storing them in MongoDB |
| 70 | +*INDEXING == creating an index in MongoDB |
| 71 | +*RETRIEVING == generating a dataset |
| 72 | +*TOTAL == sum of all parts |
| 73 | + |
| 74 | +MACHINE CONFIGURATIONS |
| 75 | + |
| 76 | +ID OS VERSION MEMORY PROCESSOR SPEED |
| 77 | +1 Windows 7 64-bit 4GB Duo Core 2.8GHZ |
| 78 | +Please add your processing times plus configuration to help improve performance. |
| 79 | + |
| 80 | +HARD DISK REQUIREMENTS |
| 81 | +You will need at least 3x the size of the XML dump file in free space on your |
| 82 | +hard disk if you want to create the databases and datasets to run your own |
| 83 | +analyses. The English stub-meta-history.xml is about 15GB, so you need about |
| 84 | +45GB of free disk space. |
| 85 | + |
62 | 86 | CODE: |
63 | 87 | The Python code adheres to PEP8. Function names are deliberately expressive to |
64 | 88 | ease understanding what's going on. If you find a bug please email me at dvanliere |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -40,9 +40,14 @@ |
41 | 41 | else: |
42 | 42 | mongo = db.init_mongo_db('editors') |
43 | 43 | editors = mongo['editors'] |
44 | | - ids = editors.find().distinct('editor') |
45 | | - print ids |
46 | | - if ids != []: |
| 44 | + contributors = set() |
| 45 | + #ids = editors.find().distinct('editor') |
| 46 | + ids = editors.find() |
| 47 | + for x, id in enumerate(ids): |
| 48 | + contributors.add(id['editor']) |
| 49 | + if len(contributors) % 25000 == 0: |
| 50 | + print x, len(contributors) |
| 51 | + if contributors != set(): |
47 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 52 | + utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
48 | | - return ids |
| 53 | + return contributors |
49 | 54 | |
— | — | @@ -60,21 +65,28 @@ |
61 | 66 | else: |
62 | 67 | id = input_queue.get(block=False) |
63 | 68 | |
64 | | - contributors = set() |
| 69 | + |
65 | 70 | if definition == 'Traditional': |
66 | | - obs = editors.find({'editor': id}).limit(limit) #.sort({'date': 1}).limit(limit) |
| 71 | + obs = editors.find({'editor': id}).sort('date').limit(limit) |
| 72 | + contributors = [] |
67 | 73 | for ob in obs: |
68 | | - contributors.add(ob) |
| 74 | + contributors.append(ob['date']) |
69 | 75 | else: |
70 | | - obs = editors.find({'editor': id}).sort({'date': 1}) |
| 76 | + obs = editors.find({'editor': id}).sort('date') |
| 77 | + contributors = set() |
71 | 78 | for ob in obs: |
72 | | - if len(dates) > limit: |
| 79 | + if len(contributors) == limit: |
73 | 80 | break |
74 | 81 | else: |
75 | | - if edit.date not in dates: |
76 | | - set.add(edit) |
77 | | - utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING) |
| 82 | + contributors.add(ob['date']) |
78 | 83 | |
| 84 | + if len(contributors) < limit: |
| 85 | + new_wikipedian = False |
| 86 | + else: |
| 87 | + new_wikipedian = True |
| 88 | + data = {id: [contributors, new_wikipedian]} |
| 89 | + utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
| 90 | + |
79 | 91 | except Empty: |
80 | 92 | break |
81 | 93 | |
— | — | @@ -146,7 +158,7 @@ |
147 | 159 | 'debug': True |
148 | 160 | } |
149 | 161 | generate_editor_dataset(input_queue, False, False, kwargs) |
150 | | - generate_editor_dataset_launcher() |
| 162 | + #generate_editor_dataset_launcher() |
151 | 163 | #retrieve_list_contributors() |
152 | 164 | #retrieve_edits_by_contributor() |
153 | 165 | |
— | — | @@ -156,4 +168,5 @@ |
157 | 169 | |
158 | 170 | |
159 | 171 | if __name__ == '__main__': |
160 | | - debug_retrieve_edits_by_contributor_launcher() |
| 172 | + generate_editor_dataset_launcher() |
| 173 | + #debug_retrieve_edits_by_contributor_launcher() |
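
In generate_editor_dataset() above, an editor is flagged new_wikipedian only once the number of collected edit dates reaches limit; since both branches cap the collection at limit, the four-line if/else is equivalent to a single boolean assignment:

    new_wikipedian = len(contributors) >= limit

The flag is written alongside the dates via the data = {id: [contributors, new_wikipedian]} shape that the reworked write_data_to_csv() in utils.py understands.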
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -32,17 +32,17 @@ |
33 | 33 | collection.remove(ids) |
34 | 34 | |
35 | 35 | |
36 | | -def add_index_to_collection(db, collection, keys): |
| 36 | +def add_index_to_collection(db, collection, key): |
37 | 37 | ''' |
38 | 38 | @db is the name of the mongodb |
39 | 39 | @collection is the name of the 'table' in mongodb |
40 | | - @keys should be a list of keys used to create the index |
| 40 | + @key is the name of the field on which to create the index |
41 | 41 | ''' |
42 | 42 | |
43 | 43 | mongo = init_mongo_db(db) |
44 | 44 | collection = mongo[collection] |
45 | | - mongo.collection.create_index(keys) |
46 | | - mongo.collection.ensure_index(keys) |
| 45 | + collection.create_index(key) |
| 46 | + collection.ensure_index(key) |
47 | 47 | |
48 | 48 | |
49 | 49 | def init_database(db=None): |
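
With the single-key signature, callers loop over a list of field names, as both call sites in map_wiki_editors.py now do:

    keys = ['editor']
    for key in keys:
        db.add_index_to_collection('editors', 'editors', key)

Note that pymongo's ensure_index() only skips work when the index name is already in the driver's client-side cache, so calling it immediately after create_index() appears redundant; either call on its own would build the index.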
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
50 | 50 | - wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
51 | 51 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |