Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | from database import db_settings |
37 | 37 | from database import db |
38 | 38 | from wikitree import xml |
| 39 | +from statistics import dataset |
39 | 40 | from utils import process_constructor as pc |
40 | 41 | |
41 | 42 | |
— | — | @@ -44,15 +45,10 @@ |
45 | 46 | except ImportError: |
46 | 47 | pass |
47 | 48 | |
48 | | -contributors = {} |
| 49 | +#contributors = {} |
49 | 50 | |
50 | 51 | RE_BOT = re.compile('bot', re.IGNORECASE) |
51 | 52 | RE_SCRIPT = re.compile('script', re.IGNORECASE) |
52 | | -#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;') |
53 | | -# |
54 | | -#def remove_numeric_character_references(text): |
55 | | -# return re.sub(RE_NUMERIC_CHARACTER, '', text) |
56 | | -# |
57 | 53 | |
58 | 54 | |
59 | 55 | def determine_username_is_bot(username, kwargs): |
— | — | @@ -108,7 +104,7 @@ |
109 | 105 | data_queue.put(vars) |
110 | 106 | vars = {} |
111 | 107 | |
112 | | -def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
| 108 | +def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
113 | 109 | if settings.DEBUG: |
114 | 110 | messages = {} |
115 | 111 | vars = {} |
— | — | @@ -118,14 +114,12 @@ |
119 | 115 | file = xml_queue |
120 | 116 | else: |
121 | 117 | file = xml_queue.get(block=False) |
122 | | - #print 'parsing %s' % file |
123 | 118 | if file == None: |
124 | 119 | print 'Swallowed a poison pill' |
125 | 120 | break |
126 | 121 | data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION, |
127 | 122 | file, 'r', |
128 | 123 | encoding=settings.ENCODING)) |
129 | | - #data = read_input(sys.stdin) |
130 | 124 | for raw_data in data: |
131 | 125 | xml_buffer = cStringIO.StringIO() |
132 | 126 | raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
— | — | @@ -155,16 +149,15 @@ |
156 | 150 | sure which one yet. This happens when raw_data = |
157 | 151 | ''.join(raw_data) is called. 18-22 |
158 | 152 | ''' |
159 | | - print error |
| 153 | + print file, error |
160 | 154 | print raw_data[:12] |
161 | 155 | print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
162 | 156 | if settings.DEBUG: |
163 | 157 | utils.track_errors(xml_buffer, error, file, messages) |
164 | 158 | |
165 | | - |
166 | 159 | if pbar: |
167 | | - print xml_queue.qsize() |
168 | | - #utils.update_progressbar(pbar, xml_queue) |
| 160 | + #print xml_queue.qsize() |
| 161 | + utils.update_progressbar(pbar, xml_queue) |
169 | 162 | if debug: |
170 | 163 | break |
171 | 164 | |
— | — | @@ -175,25 +168,17 @@ |
176 | 169 | utils.report_error_messages(messages, lookup_new_editors) |
177 | 170 | |
178 | 171 | |
179 | | -def store_data_mongo(data_queue, pids, dbname): |
| 172 | +def store_editors(data_queue, pids, dbname): |
180 | 173 | mongo = db.init_mongo_db(dbname) |
181 | 174 | collection = mongo['editors'] |
182 | 175 | mongo.collection.ensure_index('editor') |
183 | | - contributors = {} |
184 | 176 | while True: |
185 | 177 | try: |
186 | 178 | edit = data_queue.get(block=False) |
187 | 179 | contributor = edit['editor'] |
188 | | - if contributor not in contributors: |
189 | | - collection.insert({'editor': contributor, 'edit_count': 0, }) |
190 | | - contributors[contributor] = 1 |
191 | | - |
192 | | - key = str(contributors[contributor]) |
193 | 180 | value = {'date':edit['date'], 'article': edit['article']} |
194 | 181 | collection.update({'editor': contributor}, {'$inc': {'edit_count': 1}, |
195 | | - '$push': {'edits': value}}) |
196 | | - contributors[contributor] += 1 |
197 | | - |
| 182 | + '$push': {'edits': value}}, True) |
198 | 183 | except Empty: |
199 | 184 | ''' |
200 | 185 | This checks whether the Queue is empty because the preprocessors are |
— | — | @@ -202,16 +187,32 @@ |
203 | 188 | are finished and this Queue is empty than break, else wait for the |
204 | 189 | Queue to fill. |
205 | 190 | ''' |
206 | | - |
207 | 191 | if all([utils.check_if_process_is_running(pid) for pid in pids]): |
208 | 192 | pass |
209 | 193 | #print 'Empty queue or not %s?' % data_queue.qsize() |
210 | 194 | else: |
211 | 195 | break |
212 | | - except Exception, error: |
213 | | - print error |
214 | 196 | |
215 | 197 | |
| 198 | +def optimize_editors(dbname, input_queue, **kwargs): |
| 199 | + mongo = db.init_mongo_db(dbname) |
| 200 | + collection = mongo['editors'] |
| 201 | + definition = kwargs.pop('definition') |
| 202 | + while True: |
| 203 | + try: |
| 204 | + id = input_queue.get(block=False) |
| 205 | + #id = '94033' |
| 206 | + editor = collection.find_one({'editor': id}) |
| 207 | + edits = editor['edits'] |
 | 208 | +        edits.sort(key=lambda edit: edit['date']) |
| 209 | + year = edits[0]['date'].year |
 | 210 | +            new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, definition) |
| 211 | + collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}}) |
| 212 | + |
| 213 | + except Empty: |
| 214 | + break |
| 215 | + |
| 216 | + |
216 | 217 | def store_data_db(data_queue, pids): |
217 | 218 | connection = db.init_database() |
218 | 219 | cursor = connection.cursor() |
— | — | @@ -243,43 +244,38 @@ |
244 | 245 | connection.close() |
245 | 246 | |
246 | 247 | |
247 | | -def run_stand_alone(): |
| 248 | +def run_stand_alone(dbname): |
248 | 249 | files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
249 | 250 | #files = files[:2] |
250 | 251 | mongo = db.init_mongo_db('bots') |
251 | 252 | bots = mongo['ids'] |
252 | 253 | ids = {} |
253 | 254 | cursor = bots.find() |
254 | | - |
255 | | -    kwargs = {'bots': ids, |
256 | | -              'dbname': 'enwiki', |
257 | | -              'pbar': True} |
 | 255 | +    kwargs = {'bots': ids, |
 | 256 | +              'dbname': dbname, |
 | 257 | +              'pbar': True, |
 | 258 | +              'definition': 'traditional'} |
 | 259 | + |
258 | | - |
259 | 260 | for bot in cursor: |
260 | 261 | ids[bot['id']] = bot['name'] |
261 | | - pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, **kwargs) |
262 | | - keys = ['editor'] |
263 | | - for key in keys: |
264 | | - db.add_index_to_collection('enwiki', 'editors', key) |
| 262 | + |
| 263 | + pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs) |
| 264 | + ids = retrieve_ids_mongo_new(dbname, 'editors') |
| 265 | + pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs) |
265 | 266 | |
266 | 267 | def debug_lookup_new_editors(): |
267 | 268 | q = Queue() |
268 | 269 | import progressbar |
269 | 270 | pbar = progressbar.ProgressBar().start() |
270 | 271 | #edits = db.init_mongo_db('editors') |
271 | | - lookup_new_editors('464.xml', q, None, None, True) |
| 272 | + parse_editors('464.xml', q, None, None, True) |
272 | 273 | store_data_mongo(q, [], 'test') |
273 | 274 | #keys = ['editor'] |
274 | 275 | #for key in keys: |
275 | 276 | # db.add_index_to_collection('editors', 'editors', key) |
276 | 277 | |
277 | | - |
278 | | - |
279 | | -def run_hadoop(): |
280 | | - pass |
281 | | - |
282 | | - |
283 | 278 | if __name__ == "__main__": |
| 279 | + #optimize_editors('enwiki') |
284 | 280 | #debug_lookup_new_editors() |
285 | 281 | |
286 | 282 | if settings.RUN_MODE == 'stand_alone': |
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -79,6 +79,8 @@ |
80 | 80 | |
81 | 81 | DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/' |
82 | 82 | |
| 83 | +TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/' |
| 84 | + |
83 | 85 | #This section contains configuration variables for parsing / encoding and |
84 | 86 | #working with the XML files. |
85 | 87 | |
— | — | @@ -99,8 +101,3 @@ |
100 | 102 | |
101 | 103 | WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
102 | 104 | |
103 | | -LANGUAGE_MAPPING = { |
104 | | -'English': '/enwiki/latest/', |
105 | | -'Russian': '/ruwiki/latest/', |
106 | | -'German': '/dewiki/latest', |
107 | | -} |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -256,7 +256,7 @@ |
257 | 257 | ValueError: I/O operation on closed file |
258 | 258 | Not sure how to fix this, that's why the line is commented. |
259 | 259 | ''' |
260 | | - #pbar.update(x) |
| 260 | + pbar.update(pbar.currval + x) |
261 | 261 | |
262 | 262 | |
263 | 263 | def humanize_time_difference(seconds_elapsed): |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -66,18 +66,16 @@ |
67 | 67 | |
68 | 68 | |
69 | 69 | input_processes = [models.ProcessInputQueue(main, input_queue, result_queue, |
70 | | - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES)] |
| 70 | + **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)] |
71 | 71 | |
72 | 72 | for input_process in input_processes: |
73 | 73 | input_process.start() |
74 | 74 | pids = [p.pid for p in input_processes] |
75 | 75 | kwargs['pids'] = pids |
76 | 76 | |
77 | | - |
78 | | - |
79 | 77 | if result_queue: |
80 | 78 | result_processes = [models.ProcessResultQueue(result_processor, |
81 | | - result_queue, **kwargs) for i in xrange(1)] |
| 79 | + result_queue, **kwargs) for i in xrange(24)] |
82 | 80 | for result_process in result_processes: |
83 | 81 | result_process.start() |
84 | 82 | |
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
 | 17 | +__author_email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | import xml.etree.cElementTree as cElementTree |
20 | 22 | import codecs |
21 | 23 | import utils |
— | — | @@ -30,10 +32,7 @@ |
31 | 33 | |
32 | 34 | RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
33 | 35 | |
34 | | -#def convert_html_entities(text): |
35 | | -# return utils.unescape(text) |
36 | 36 | |
37 | | - |
38 | 37 | def remove_numeric_character_references(text): |
39 | 38 | return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
40 | 39 | |
— | — | @@ -57,26 +56,17 @@ |
58 | 57 | for revision in revisions: |
59 | 58 | comment = revision.find('comment') |
60 | 59 | timestamp = revision.find('timestamp').text |
61 | | - #if timestamp == '2007-11-25T09:21:11Z': |
62 | | - # print 'debug' |
63 | | - # text = comment.text |
64 | | - #test2 = text.encode('utf-8') |
65 | | - #test = text.decode('utf-8') |
66 | 60 | |
67 | 61 | # text1 = remove_ascii_control_characters(text) |
68 | 62 | # text2 = remove_numeric_character_references(text) |
69 | 63 | # text3 = convert_html_entities(text) |
70 | 64 | |
71 | 65 | if comment != None and comment.text != None: |
72 | | - #print comment.text.encode('utf-8') |
73 | | - |
74 | 66 | comment.text = function(comment.text) |
75 | | - #text = comment.text |
76 | | - #print text |
77 | 67 | return xml |
78 | 68 | |
79 | 69 | |
80 | | -def write_xml_file(element, fh, counter): |
| 70 | +def write_xml_file(element, fh, counter, language): |
81 | 71 | '''Get file handle and write xml element to file''' |
82 | 72 | size = len(cElementTree.tostring(element)) |
83 | 73 | fh, counter = create_xml_file_handle(fh, counter, size) |
— | — | @@ -89,20 +79,24 @@ |
90 | 80 | '''Create file handle if none is supplied or if file size > max file size.''' |
91 | 81 | if not fh: |
92 | 82 | counter = 0 |
93 | | - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 83 | + fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
94 | 84 | return fh, counter |
95 | 85 | elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
96 | 86 | print 'Created chunk %s' % counter |
97 | 87 | fh.close |
98 | 88 | counter += 1 |
99 | | - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 89 | + fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
100 | 90 | return fh, counter |
101 | 91 | else: |
102 | 92 | return fh, counter |
103 | 93 | |
104 | 94 | |
105 | | -def split_xml(): |
| 95 | +def split_xml(language): |
106 | 96 | '''Reads xml file and splits it in N chunks''' |
| 97 | + result = utils.create_directory(language) |
| 98 | + if not result: |
| 99 | + return |
| 100 | + |
107 | 101 | fh = None |
108 | 102 | counter = None |
109 | 103 | tag = '{%s}page' % settings.NAME_SPACE |
— | — | @@ -118,10 +112,10 @@ |
119 | 113 | elem = parse_comments(elem, remove_numeric_character_references) |
120 | 114 | #elem = parse_comments(elem, convert_html_entities) |
121 | 115 | #elem = parse_comments(elem, remove_ascii_control_characters) |
122 | | - fh, counter = write_xml_file(elem, fh, counter) |
| 116 | + fh, counter = write_xml_file(elem, fh, counter, language) |
123 | 117 | #print cElementTree.tostring(elem) |
124 | 118 | root.clear() # when done parsing a section clear the tree to safe memory |
125 | 119 | |
126 | 120 | |
127 | 121 | if __name__ == "__main__": |
128 | | - split_xml() |
| 122 | + split_xml('enwiki') |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -36,26 +36,38 @@ |
37 | 37 | |
38 | 38 | |
39 | 39 | def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True): |
40 | | - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
 | 40 | +    raise DeprecationWarning |
| 41 | +# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
| 42 | +# retrieve_editor_ids_mongo): |
| 43 | +# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 44 | +# retrieve_editor_ids_mongo) |
| 45 | +# else: |
| 46 | +# mongo = db.init_mongo_db('editors') |
| 47 | +# editors = mongo['editors'] |
| 48 | +# contributors = set() |
| 49 | +# #ids = editors.find().distinct('editor') |
| 50 | +# ids = editors.find() |
| 51 | +# for x, id in enumerate(ids): |
| 52 | +# contributors.add(id['editor']) |
| 53 | +# if len(contributors) == 100000: |
| 54 | +# if RANDOM_SAMPLE: |
| 55 | +# break |
| 56 | +# if contributors != set(): |
| 57 | +# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 58 | +# return contributors |
| 59 | + |
| 60 | +def retrieve_ids_mongo_new(dbname, collection): |
| 61 | + if utils.check_file_exists(settings.TXT_FILE_LOCATION, |
41 | 62 | retrieve_editor_ids_mongo): |
42 | | - contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 63 | + ids = utils.load_object(settings.TXT_FILE_LOCATION, |
43 | 64 | retrieve_editor_ids_mongo) |
44 | 65 | else: |
45 | | - mongo = db.init_mongo_db('editors') |
46 | | - editors = mongo['editors'] |
47 | | - contributors = set() |
48 | | - #ids = editors.find().distinct('editor') |
49 | | - ids = editors.find() |
50 | | - for x, id in enumerate(ids): |
51 | | - contributors.add(id['editor']) |
52 | | - if len(contributors) == 100000: |
53 | | - if RANDOM_SAMPLE: |
54 | | - break |
55 | | - if contributors != set(): |
56 | | - utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
57 | | - return contributors |
| 66 | + mongo = db.init_mongo_db(dbname) |
| 67 | + editors = mongo[collection] |
 | 68 | +        ids = editors.distinct('editor') |
 | 69 | +        utils.store_object(ids, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 70 | + return ids |
58 | 71 | |
59 | | - |
60 | 72 | def generate_editor_dataset(input_queue, data_queue, pbar, kwargs): |
61 | 73 | definition = kwargs.pop('definition') |
62 | 74 | limit = kwargs.pop('limit') |
— | — | @@ -72,7 +84,7 @@ |
73 | 85 | |
74 | 86 | print input_queue.qsize() |
75 | 87 | if definition == 'Traditional': |
76 | | - |
| 88 | + |
77 | 89 | obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit) |
78 | 90 | contributors = [] |
79 | 91 | for ob in obs: |