Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | |
174 | 174 | def launch_zip_extractor(args, location, file): |
175 | 175 | timer = Timer() |
176 | | - utils.zip_extract(location, file, compression='7z') |
| 176 | + utils.zip_extract(location, file) |
177 | 177 | timer.elapsed() |
178 | 178 | |
179 | 179 | |
— | — | @@ -211,7 +211,8 @@ |
212 | 212 | print 'dataset launcher' |
213 | 213 | timer = Timer() |
214 | 214 | project = kwargs.pop('full_project') |
215 | | - transformer.run_optimize_editors(project) |
| 215 | + collection = kwargs.pop('collection') |
| 216 | + transformer.run_optimize_editors(project, collection) |
216 | 217 | timer.elapsed() |
217 | 218 | |
218 | 219 | |
— | — | @@ -313,6 +314,9 @@ |
314 | 315 | |
315 | 316 | parser_transform = subparsers.add_parser('transform', help='Transform the raw database into an enriched dataset that can be exported.') |
316 | 317 | parser_transform.set_defaults(func=transformer_launcher) |
| 318 | + parser_transform.add_argument('-c', '--collection', action='store', |
| 319 | + help='Name of MongoDB collection', |
| 320 | + default='editors') |
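| | + # Example invocation (assumed wiring; the exact CLI usage may differ): |
| | + # python manage.py transform --collection editors |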
317 | 321 | |
318 | 322 | parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.') |
319 | 323 | parser_dataset.set_defaults(func=exporter_launcher) |
Index: trunk/tools/editor_trends/analyses/cohort_confidence_intervals.py |
— | — | @@ -0,0 +1,49 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-24' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import sys |
| 8 | +sys.path.append('..') |
| 9 | +from Queue import Empty |
| 10 | + |
| 11 | +import configuration |
| 12 | +settings = configuration.Settings() |
| 13 | +from utils import utils |
| 14 | +from database import db |
| 15 | +from etl.exporter import expand_headers, expand_observations # assumption: these helpers live in etl/exporter.py |
| 16 | + |
| 17 | + |
| 18 | +def dataset_edits_by_month(input_queue, **kwargs): |
| 19 | + dbname = kwargs.pop('dbname') |
| 20 | + mongo = db.init_mongo_db(dbname) |
| 21 | + editors = mongo['dataset'] |
| 22 | + name = dbname + '_edits_by_month.csv' |
| 23 | + fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
| 24 | + x = 0 |
| 25 | + vars_to_expand = ['monthly_edits'] |
| 26 | + while True: |
| 27 | + try: |
| 28 | + id = input_queue.get(block=False) |
| 29 | + print input_queue.qsize() |
| 30 | + obs = editors.find_one({'editor': id}) |
| 31 | + obs = expand_observations(obs, vars_to_expand) |
| 32 | + if x == 0: |
| 33 | + headers = obs.keys() |
| 34 | + headers.sort() |
| 35 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 36 | + utils.write_list_to_csv(headers, fh) |
| 37 | + data = [] |
| 38 | + keys = obs.keys() |
| 39 | + keys.sort() |
| 40 | + for key in keys: |
| 41 | + data.append(obs[key]) |
| 42 | + utils.write_list_to_csv(data, fh) |
| 43 | + x += 1 |
| 44 | + except Empty: |
| 45 | + break |
| 46 | + fh.close() |
| 47 | + |
| 48 | + |
| 49 | +if __name__ == '__main__': |
| 50 | + pass |
\ No newline at end of file |
Index: trunk/tools/editor_trends/etl/bots.py |
— | — | @@ -1,123 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | - |
18 | | -import os |
19 | | -import cStringIO |
20 | | -import xml.etree.cElementTree as cElementTree |
21 | | - |
22 | | - |
23 | | -import configuration |
24 | | -settings = configuration.Settings() |
25 | | -from wikitree import xml |
26 | | -from database import db |
27 | | -from database import db_settings |
28 | | -from utils import utils |
29 | | -from utils import process_constructor as pc |
30 | | - |
31 | | -try: |
32 | | - import psyco |
33 | | - psyco.full() |
34 | | -except ImportError: |
35 | | - pass |
36 | | - |
37 | | - |
38 | | -def create_bot_ids_db_mongo(): |
39 | | - ids = utils.create_dict_from_csv_file(add_id_to_botnames, settings.encoding) |
40 | | - mongo = db.init_mongo_db('bots') |
41 | | - collection = mongo['ids'] |
42 | | - |
43 | | - db.remove_documents_from_mongo_db(collection, None) |
44 | | - |
45 | | - for id, name in ids.iteritems(): |
46 | | - collection.insert({'id': id, 'name': name}) |
47 | | - |
48 | | - print collection.count() |
49 | | - |
50 | | - |
51 | | -def lookup_username(input_queue, result_queue, progressbar, bots, debug=False): |
52 | | - ''' |
53 | | - This function is used to find the id's belonging to the different bots that |
54 | | - are patrolling the Wikipedia sites. |
55 | | - @input_queue contains a list of xml files to parse |
56 | | - |
57 | | - @result_queue should be set to false as the results are directly written to |
58 | | - a csv file. |
59 | | - |
60 | | - @progressbar depends on settings |
61 | | - |
62 | | - @bots is a dictionary containing the names of the bots to lookup |
63 | | - ''' |
64 | | - |
65 | | - #if len(bots.keys()) == 1: |
66 | | - bots = bots['bots'] |
67 | | - #print bots.keys() |
68 | | - |
69 | | - if settings.debug: |
70 | | - messages = {} |
71 | | - |
72 | | - while True: |
73 | | - if debug: |
74 | | - file = input_queue |
75 | | - else: |
76 | | - file = input_queue.get(block=False) |
77 | | - |
78 | | - if file == None: |
79 | | - break |
80 | | - |
81 | | - data = xml.read_input(utils.open_txt_file(settings.input_location + |
82 | | - file, 'r', encoding=settings.encoding)) |
83 | | - |
84 | | - for raw_data in data: |
85 | | - xml_buffer = cStringIO.StringIO() |
86 | | - raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
87 | | - raw_data = ''.join(raw_data) |
88 | | - raw_data = raw_data.encode('utf-8') |
89 | | - xml_buffer.write(raw_data) |
90 | | - |
91 | | - try: |
92 | | - xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
93 | | - revisions = xml_nodes.findall('revision') |
94 | | - for revision in revisions: |
95 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
96 | | - username = contributor.find('username') |
97 | | - if username == None: |
98 | | - continue |
99 | | - username = xml.extract_text(username) |
100 | | - #print username.encode('utf-8') |
101 | | - |
102 | | - if username in bots: |
103 | | - id = contributor.find('id') |
104 | | - id = xml.extract_text(id) |
105 | | - #print username.encode('utf-8'), id |
106 | | - utils.write_data_to_csv({username: [id]}, add_id_to_botnames, settings.encoding) |
107 | | - bots.pop(username) |
108 | | - if bots == {}: |
109 | | - print 'Mission accomplished' |
110 | | - return |
111 | | - except Exception, error: |
112 | | - print error |
113 | | - if settings.debug: |
114 | | - messages = utils.track_errors(xml_buffer, error, file, |
115 | | - messages) |
116 | | - |
117 | | - if settings.debug: |
118 | | - utils.report_error_messages(messages, lookup_username) |
119 | | - |
120 | | - |
121 | | -if __name__ == '__main__': |
122 | | - #debug() |
123 | | - #add_id_to_botnames() |
124 | | - create_bot_ids_db_mongo() |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -17,19 +17,23 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import sys |
| 22 | +import datetime |
| 23 | +from dateutil.relativedelta import * |
| 24 | +import calendar |
21 | 25 | from multiprocessing import Queue |
22 | 26 | from Queue import Empty |
23 | | -import datetime |
24 | | -from dateutil.relativedelta import * |
25 | | -import sys |
26 | | -import progressbar |
27 | 27 | |
| 28 | + |
| 29 | + |
28 | 30 | sys.path.append('..') |
29 | 31 | import configuration |
30 | 32 | settings = configuration.Settings() |
31 | 33 | from utils import models, utils |
32 | 34 | from database import db |
| 35 | +from etl import shaper |
33 | 36 | from utils import process_constructor as pc |
| 37 | +import progressbar |
34 | 38 | |
35 | 39 | try: |
36 | 40 | import psyco |
— | — | @@ -38,6 +42,91 @@ |
39 | 43 | pass |
40 | 44 | |
41 | 45 | |
| 46 | +class Variable(object): |
| 47 | + |
| 48 | + def __init__(self, var): |
| 49 | + self.name = var |
| 50 | + self.stats = ['n', 'avg', 'sd', 'min', 'max'] |
| 51 | + self.time = shaper.create_datacontainer() |
| 52 | + self.time = shaper.add_months_to_datacontainer(self.time, datatype='dict') |
| 53 | + |
| 54 | + for stat in self.stats: |
| 55 | + setattr(self, stat, shaper.create_datacontainer()) |
| 56 | + setattr(self, stat, shaper.add_months_to_datacontainer(getattr(self, stat), datatype='list')) |
| 57 | + |
| 58 | + def __repr__(self): |
| 59 | + return self.name |
| 60 | + |
| 61 | + def descriptives(self): |
| 62 | + for year in self.time: |
| 63 | + for month in self.time[year]: |
| 64 | + data = self.time[year][month].values() |
| 65 | + if data == []: |
| 66 | + continue # month without observations; min()/max() would fail |
| 67 | + self.avg[year][month] = shaper.get_mean(data) |
| 68 | + self.sd[year][month] = shaper.get_standard_deviation(data) |
| 69 | + self.min[year][month] = min(data) |
| 70 | + self.max[year][month] = max(data) |
| 71 | + self.n[year][month] = len(data) |
| 71 | + |
| 72 | +class LongDataset(object): |
| 73 | + |
| 74 | + def __init__(self, vars): |
| 75 | + self.name = 'long_dataset.tsv' |
| 76 | + self.vars = [] |
| 77 | + for var in vars: |
| 78 | + setattr(self, var, Variable(var)) |
| 79 | + self.vars.append(var) |
| 80 | + |
| 81 | + def __repr__(self): |
| 82 | + return 'Dataset containing: %s' % (self.vars) |
| 83 | + |
| 84 | + def write_headers(self, fh): |
| 85 | + fh.write('_time\t') |
| 86 | + for var in self.vars: |
| 87 | + var = getattr(self, var) |
| 88 | + for stat in var.stats: |
| 89 | + fh.write('%s_%s\t' % (var.name, stat)) |
| 90 | + fh.write('\n') |
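| | + # Illustration (assuming vars = ['monthly_edits']): the header row written |
| | + # above is tab-separated: |
| | + # _time monthly_edits_n monthly_edits_avg monthly_edits_sd monthly_edits_min monthly_edits_max |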
| 91 | + |
| 92 | + def convert_to_longitudinal_data(self, id, obs, vars): |
| 93 | + for var in vars: |
| 94 | + ds = getattr(self, var) |
| 95 | + years = obs[var].keys() |
| 96 | + for year in years: |
| 97 | + months = obs[var][year].keys() |
| 98 | + for m in months: |
| 99 | + #d = calendar.monthrange(int(year), int(m))[1] #determines the number of days in a given month/year |
| 100 | + #date = datetime.date(int(year), int(m), d) |
| 101 | + ds.time[year][m][id] = obs[var][year][str(m)] |
| 104 | + |
| 105 | + def write_longitudinal_data(self): |
| 106 | + fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding) |
| 107 | + self.write_headers(fh) |
| 108 | + dc = shaper.create_datacontainer() |
| 109 | + dc = shaper.add_months_to_datacontainer(dc) |
| 110 | + |
| 111 | + for var in self.vars: |
| 112 | + var = getattr(self, var) |
| 113 | + var.descriptives() |
| 114 | + years = dc.keys() |
| 115 | + years.sort() |
| 116 | + for year in years: |
| 117 | + months = dc[year].keys() |
| 118 | + months.sort() |
| 119 | + for month in months: |
| 120 | + d = calendar.monthrange(int(year), int(month))[1] #determines the number of days in a given month/year |
| 121 | + date = datetime.date(int(year), int(month), d) |
| 122 | + fh.write('%s\t' % date) |
| 123 | + for var in self.vars: |
| 124 | + var = getattr(self, var) |
| 125 | + #data = ['%s_%s\t' % (var.name, getattr(var, stat)[year][month]) for stat in var.stats] |
| 126 | + fh.write(''.join(['%s\t' % (getattr(var, stat)[year][month],) for stat in var.stats])) |
| 127 | + fh.write('\n') |
| 128 | + fh.close() |
| 129 | + |
| 130 | + |
42 | 131 | def retrieve_editor_ids_mongo(dbname, collection): |
43 | 132 | if utils.check_file_exists(settings.binary_location, |
44 | 133 | 'editors.bin'): |
— | — | @@ -71,16 +160,6 @@ |
72 | 161 | obs[var] = edits |
73 | 162 | return obs |
74 | 163 | |
75 | | -def write_longitudinal_data(id, edits, fh): |
76 | | - years = edits.keys() |
77 | | - years.sort() |
78 | | - for year in years: |
79 | | - months = edits[year].keys() |
80 | | - months = [int(m) for m in months] |
81 | | - months.sort() |
82 | | - for m in months: |
83 | | - date = datetime.date(int(year), int(m), 1) |
84 | | - fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
85 | 164 | |
86 | 165 | |
87 | 166 | def expand_headers(headers, vars_to_expand, obs): |
— | — | @@ -97,32 +176,28 @@ |
98 | 177 | return headers |
99 | 178 | |
100 | 179 | |
101 | | -def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
102 | | - debug = kwargs.pop('debug') |
| 180 | +def generate_long_editor_dataset(input_queue, vars, **kwargs): |
103 | 181 | dbname = kwargs.pop('dbname') |
104 | 182 | mongo = db.init_mongo_db(dbname) |
105 | 183 | editors = mongo['dataset'] |
106 | 184 | name = dbname + '_long_editors.csv' |
107 | | - fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
108 | | - x = 0 |
| 185 | + #fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
109 | 186 | vars_to_expand = [] |
| 187 | + keys = dict([(var, 1) for var in vars]) |
| 188 | + ld = LongDataset(vars) |
110 | 189 | while True: |
111 | 190 | try: |
112 | 191 | id = input_queue.get(block=False) |
113 | | - obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
114 | | - if x == 0: |
115 | | - headers = obs.keys() |
116 | | - headers.sort() |
117 | | - headers = expand_headers(headers, vars_to_expand, obs) |
118 | | - utils.write_list_to_csv(headers, fh) |
119 | | - write_longitudinal_data(id, obs['monthly_edits'], fh) |
| 192 | + print id |
| 193 | + obs = editors.find_one({'editor': id}, keys) |
| 194 | + ld.convert_to_longitudinal_data(id, obs, vars) |
120 | 195 | #utils.write_list_to_csv(data, fh) |
121 | | - x += 1 |
122 | 196 | except Empty: |
123 | 197 | break |
| 198 | + ld.write_longitudinal_data() |
124 | 199 | |
125 | 200 | |
126 | | -def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
| 201 | +def generate_cohort_analysis(input_queue, **kwargs): |
127 | 202 | dbname = kwargs.get('dbname') |
128 | 203 | pbar = kwargs.get('pbar') |
129 | 204 | mongo = db.init_mongo_db(dbname) |
— | — | @@ -169,6 +244,7 @@ |
170 | 245 | break |
171 | 246 | utils.store_object(data, settings.binary_location, 'cohort_data') |
172 | 247 | |
| 248 | + |
173 | 249 | def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
174 | 250 | if first_edit >= window_start and first_edit <= window_end: |
175 | 251 | return True |
— | — | @@ -176,7 +252,7 @@ |
177 | 253 | return False |
178 | 254 | |
179 | 255 | |
180 | | -def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 256 | +def generate_wide_editor_dataset(input_queue, **kwargs): |
181 | 257 | dbname = kwargs.pop('dbname') |
182 | 258 | mongo = db.init_mongo_db(dbname) |
183 | 259 | editors = mongo['dataset'] |
— | — | @@ -241,16 +317,19 @@ |
242 | 318 | |
243 | 319 | def generate_editor_dataset_debug(dbname): |
244 | 320 | ids = retrieve_editor_ids_mongo(dbname, 'editors') |
| 321 | + #ids = list(ids)[:1000] |
245 | 322 | input_queue = pc.load_queue(ids) |
246 | 323 | kwargs = {'nr_input_processors': 1, |
247 | 324 | 'nr_output_processors': 1, |
248 | 325 | 'debug': True, |
249 | 326 | 'dbname': dbname, |
250 | 327 | } |
251 | | - generate_editor_dataset(input_queue, False, False, kwargs) |
| 328 | + #generate_editor_dataset(input_queue, False, False, kwargs) |
| 329 | + vars = ['monthly_edits'] |
| 330 | + generate_long_editor_dataset(input_queue, vars, **kwargs) |
252 | 331 | |
253 | | - |
254 | 332 | if __name__ == '__main__': |
255 | 333 | #generate_editor_dataset_debug('test') |
256 | | - generate_editor_dataset_launcher('enwiki') |
| 334 | + #generate_editor_dataset_launcher('enwiki') |
| 335 | + generate_editor_dataset_debug('enwiki') |
257 | 336 | #debug_retrieve_edits_by_contributor_launcher() |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -135,29 +135,25 @@ |
136 | 136 | |
137 | 137 | |
138 | 138 | |
139 | | -def determine_username_is_bot(username, kwargs): |
| 139 | +def determine_username_is_bot(contributor, bots): |
140 | 140 | ''' |
141 | | - @username is the xml element containing the id of the user |
142 | | - @kwargs should have a list with all the bot ids |
143 | | - |
144 | | - @Return False if username id is not in bot list id or True if username id |
| 141 | + @contributor is an xml element containing the id of the contributor |
| 142 | + @bots should be a dict with all the bot ids and bot names |
| 143 | + @Return False if the contributor id is not in the bot dict or True if it |
145 | 144 | is a bot id. |
146 | 145 | ''' |
147 | | - ids = kwargs.get('bots', []) |
148 | | - if ids == None: |
149 | | - ids = [] |
150 | | - if username != None and username.text != None: |
151 | | - id = username.text |
152 | | - if id in ids: |
153 | | - return 1 |
154 | | - else: |
155 | | - return 0 |
| 146 | + for elem in contributor: |
| 147 | + if elem.tag == 'id': |
| 148 | + if elem.text in bots['bots']: |
| 149 | + return 1 |
| 150 | + else: |
| 151 | + return 0 |
| 152 | + return 0 # contributor has no <id> element |
156 | 152 | |
157 | 153 | |
158 | 154 | def extract_username(contributor, kwargs): |
159 | 155 | for elem in contributor: |
160 | 156 | if elem.tag == 'username': |
161 | | - return elem.text #.encode(settings.encoding) |
| 157 | + return elem.text |
162 | 158 | else: |
163 | 159 | return None |
164 | 160 | |
— | — | @@ -167,16 +163,14 @@ |
168 | 164 | @contributor is the xml contributor node containing a number of attributes |
169 | 165 | |
170 | 166 | Currently, we are only interested in registered contributors, hence we |
171 | | - ignore anonymous editors. If you are interested in collecting data on |
172 | | - anonymous editors then add the string 'ip' to the tags variable. |
| 167 | + ignore anonymous editors. |
173 | 168 | ''' |
174 | | - tags = ['id'] |
175 | 169 | if contributor.get('deleted'): |
176 | 170 | return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
177 | 171 | for elem in contributor: |
178 | | - if elem.tag in tags: |
| 172 | + if elem.tag == 'id': |
179 | 173 | if elem.text != None: |
180 | | - return elem.text.encode(settings.encoding) |
| 174 | + return elem.text |
181 | 175 | else: |
182 | 176 | return - 1 |
183 | 177 | |
— | — | @@ -209,6 +203,8 @@ |
210 | 204 | vars[var] = function(xml_node, kwargs) |
211 | 205 | |
212 | 206 | #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
213 | 209 | if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
214 | 210 | vars.pop('bot') |
215 | 211 | if destination == 'queue': |
— | — | @@ -222,100 +218,6 @@ |
223 | 219 | vars = {} |
224 | 220 | |
225 | 221 | |
226 | | -#def parse_editors(xml_queue, data_queue, **kwargs): |
227 | | -# ''' |
228 | | -# @xml_queue contains the filenames of the files to be parsed |
229 | | -# @data_queue is an instance of Queue where the extracted data is stored for |
230 | | -# further processing |
231 | | -# @pbar is an instance of progressbar to display the progress |
232 | | -# @bots is a list of id's of known Wikipedia bots |
233 | | -# @debug is a flag to indicate whether the function is called for debugging. |
234 | | -# |
235 | | -# Output is the data_queue that will be used by store_editors() |
236 | | -# ''' |
237 | | -# input = kwargs.get('input', None) |
238 | | -# output = kwargs.get('output', None) |
239 | | -# debug = kwargs.get('debug', False) |
240 | | -# destination = kwargs.get('destination', 'file') |
241 | | -# bots = kwargs.get('bots', None) |
242 | | -# pbar = kwargs.get('pbar', None) |
243 | | -# if settings.debug: |
244 | | -# messages = {} |
245 | | -# vars = {} |
246 | | -# |
247 | | -# while True: |
248 | | -# try: |
249 | | -# if debug: |
250 | | -# file = xml_queue |
251 | | -# else: |
252 | | -# file = xml_queue.get(block=False) |
253 | | -# if file == None: |
254 | | -# print 'Swallowed a poison pill' |
255 | | -# break |
256 | | -# |
257 | | -# data = xml.read_input(utils.create_txt_filehandle(input, |
258 | | -# file, 'r', |
259 | | -# encoding=settings.encoding)) |
260 | | -# if destination == 'file': |
261 | | -# name = file[:-4] + '.txt' |
262 | | -# fh = utils.create_txt_filehandle(output, name, 'w', settings.encoding) |
263 | | -# for raw_data in data: |
264 | | -# xml_buffer = cStringIO.StringIO() |
265 | | -# raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
266 | | -# |
267 | | -# try: |
268 | | -# raw_data = ''.join(raw_data) |
269 | | -# xml_buffer.write(raw_data) |
270 | | -# elem = cElementTree.XML(xml_buffer.getvalue()) |
271 | | -# output_editor_information(elem, fh, bots=bots, destination=destination) |
272 | | -# except SyntaxError, error: |
273 | | -# print error |
274 | | -# ''' |
275 | | -# There are few cases with invalid tokens, they are fixed |
276 | | -# here and then reinserted into the XML DOM |
277 | | -# data = convert_html_entities(xml_buffer.getvalue()) |
278 | | -# elem = cElementTree.XML(data) |
279 | | -# output_editor_information(elem) |
280 | | -# ''' |
281 | | -# if settings.debug: |
282 | | -# utils.track_errors(xml_buffer, error, file, messages) |
283 | | -# except UnicodeEncodeError, error: |
284 | | -# print error |
285 | | -# if settings.debug: |
286 | | -# utils.track_errors(xml_buffer, error, file, messages) |
287 | | -# except MemoryError, error: |
288 | | -# print file, error |
289 | | -# print raw_data[:12] |
290 | | -# print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
291 | | -# if destination == 'queue': |
292 | | -# output.put('NEXT') |
293 | | -# while True: |
294 | | -# if output.qsize() < 100000: |
295 | | -# break |
296 | | -# else: |
297 | | -# time.sleep(10) |
298 | | -# print 'Still sleeping, queue is %s items long' % output.qsize() |
299 | | -# |
300 | | -# else: |
301 | | -# fh.close() |
302 | | -# |
303 | | -# if pbar: |
304 | | -# print file, xml_queue.qsize() |
305 | | -# #utils.update_progressbar(pbar, xml_queue) |
306 | | -# |
307 | | -# if debug: |
308 | | -# break |
309 | | -# |
310 | | -# except Empty: |
311 | | -# break |
312 | | -# |
313 | | -# if destination == 'queue': |
314 | | -# data_queue.put(None) |
315 | | -# |
316 | | -# if settings.debug: |
317 | | -# utils.report_error_messages(messages, parse_editors) |
318 | | - |
319 | | - |
320 | 222 | def load_bot_ids(): |
321 | 223 | ''' |
322 | 224 | Loader function to retrieve list of id's of known Wikipedia bots. |
— | — | @@ -352,12 +254,14 @@ |
353 | 255 | tasks.join() |
354 | 256 | |
355 | 257 | |
356 | | -def debug_parse_editors(dbname): |
357 | | - q = JoinableQueue() |
358 | | - parse_editors('522.xml', q, None, None, debug=True, destination='file') |
359 | | - store_editors(q, [], dbname) |
| 258 | +def debug_parse_editors(location): |
| 259 | + bots = load_bot_ids() |
| 260 | + input = os.path.join(location, 'chunks') |
| 261 | + output = os.path.join(location, 'txt') |
| 262 | + xml_file = XMLFile(input, output, '1.xml', bots, output_editor_information, destination='file') |
| 263 | + xml_file() |
360 | 264 | |
361 | | - |
362 | | -if __name__ == "__main__": |
363 | | - #debug_parse_editors('test2') |
364 | | - run_parse_editors(os.path.join(settings.input_location, 'en', 'wiki')) |
| 265 | +if __name__ == '__main__': |
| 266 | + location = os.path.join(settings.input_location, 'en', 'wiki') |
| 267 | + debug_parse_editors(location) |
| 268 | + #run_parse_editors(location) |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | + |
| 3 | + |
| 4 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 5 | +__author__email = 'dvanliere at gmail dot com' |
| 6 | +__date__ = '2010-11-24' |
| 7 | +__version__ = '0.1' |
| 8 | + |
| 9 | +import datetime |
| 10 | +import math |
| 11 | + |
| 12 | +def create_datacontainer(init_value=0): |
| 13 | + ''' |
| 14 | + This function initializes a dictionary keyed by year (starting in 2001 and |
| 15 | + running through the current year) with @init_value as the value. In most |
| 16 | + cases this will be zero, so the dictionary acts as a running tally for a |
| 17 | + variable, but @init_value can also be a list, [], a dictionary, {}, or a set, set(). |
| 18 | + ''' |
| 19 | + data = {} |
| 20 | + year = datetime.datetime.now().year + 1 |
| 21 | + for x in xrange(2001, year): |
| 22 | + if init_value == 'set': |
| 23 | + data[str(x)] = set() |
| 24 | + else: |
| 25 | + data[str(x)] = init_value |
| 26 | + return data |
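| | +# Sketch of the resulting structure: create_datacontainer('set') yields |
| | +# {'2001': set(), '2002': set(), ..., '<current year>': set()}. Note that a |
| | +# mutable @init_value such as [] or {} is shared across all years, because the |
| | +# same object is assigned to every key; 'set' is special-cased for that reason. |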
| 27 | + |
| 28 | + |
| 29 | +def add_months_to_datacontainer(datacontainer, datatype=0.0): |
| 30 | + for dc in datacontainer: |
| 31 | + datacontainer[dc] = {} |
| 32 | + for x in xrange(1, 13): |
| 33 | + if datatype == 'dict': |
| 34 | + datacontainer[dc][str(x)] = dict() |
| 35 | + elif datatype == 'list': |
| 36 | + datacontainer[dc][str(x)] = list() |
| 37 | + elif datatype == 'set': |
| 38 | + datacontainer[dc][str(x)] = set() |
| 39 | + else: |
| 40 | + datacontainer[dc][str(x)] = 0.0 |
| 43 | + return datacontainer |
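| | +# Example: add_months_to_datacontainer(create_datacontainer(), datatype='list') |
| | +# nests month keys '1'..'12' under every year, each mapping to a fresh list. |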
| 44 | + |
| 45 | + |
| 46 | +def get_standard_deviation(numberList): |
| 47 | + n = len(numberList) |
| 48 | + if n < 2: |
| 49 | + return '.' # follow the get_mean/get_median convention for degenerate input |
| 50 | + mean = get_mean(numberList) |
| 51 | + std = sum((i - mean) ** 2 for i in numberList) |
| 52 | + return math.sqrt(std / float(n - 1)) |
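| | +# Worked example: get_standard_deviation([1, 2, 3]) -> mean 2.0, squared |
| | +# deviations 1 + 0 + 1 = 2, sqrt(2 / (3 - 1)) = 1.0 (sample standard deviation). |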
| 53 | + |
| 54 | + |
| 55 | +def get_median(numberList): |
| 56 | + if numberList == []: return '.' |
| 58 | + theValues = sorted(numberList) |
| 59 | + theValues = [float(x) for x in theValues] |
| 60 | + if len(theValues) % 2 == 1: |
| 61 | + return theValues[(len(theValues)+1)/2-1] |
| 62 | + else: |
| 63 | + lower = theValues[len(theValues)/2-1] |
| 64 | + upper = theValues[len(theValues)/2] |
| 66 | + return (lower + upper) / 2 |
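| | +# Example: get_median([1, 3, 2]) -> 2.0; get_median([1, 2, 3, 4]) -> 2.5. |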
| 67 | + |
| 68 | + |
| 69 | +def get_mean(numberList): |
| 70 | + if numberList == []: return '.' |
| 72 | + floatNums = [float(x) for x in numberList] |
| 73 | + return sum(floatNums) / len(numberList) |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/etl/shaper.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -31,8 +31,8 @@ |
32 | 32 | from utils import utils |
33 | 33 | from utils import models |
34 | 34 | import construct_datasets |
| 35 | +import shaper |
35 | 36 | |
36 | | - |
37 | 37 | try: |
38 | 38 | import psyco |
39 | 39 | psyco.full() |
— | — | @@ -52,27 +52,25 @@ |
53 | 53 | |
54 | 54 | |
55 | 55 | class Editor(object): |
56 | | - def __init__(self, dbname, id, **kwargs): |
| 56 | + def __init__(self, dbname, collection, id, **kwargs): |
57 | 57 | self.dbname = dbname |
58 | 58 | self.id = id |
| 59 | + self.collection = collection |
59 | 60 | for kw in kwargs: |
60 | 61 | setattr(self, kw, kwargs[kw]) |
61 | 62 | |
62 | 63 | def __str__(self): |
63 | 64 | return '%s' % (self.id) |
64 | | - # mongo = db.init_mongo_db(dbname) |
65 | | - # input = mongo[dbname] |
66 | | - # output = mongo['dataset'] |
67 | | - # output.ensure_index('editor') |
68 | | - # output.ensure_index('year_joined') |
69 | | - |
| 65 | + |
70 | 66 | def __call__(self): |
71 | 67 | self.mongo = db.init_mongo_db(self.dbname) |
72 | | - input_db = self.mongo['editors'] |
73 | | - output_db = self.mongo['dataset'] |
| 68 | + input_db = self.mongo[self.collection] |
| 69 | + output_db = self.mongo[self.collection + '_dataset'] |
74 | 70 | |
75 | 71 | output_db.ensure_index('editor') |
76 | 72 | output_db.create_index('editor') |
| 73 | + output_db.ensure_index('year_joined') |
| 74 | + output_db.create_index('year_joined') |
77 | 75 | |
78 | 76 | editor = input_db.find_one({'editor': self.id}) |
79 | 77 | if editor == None: |
— | — | @@ -100,43 +98,14 @@ |
101 | 99 | 'username': username |
102 | 100 | }) |
103 | 101 | |
104 | | -def create_datacontainer(init_value=0): |
105 | | - ''' |
106 | | - This function initializes an empty dictionary with as key the year (starting |
107 | | - 2001 and running through) and as value @init_value, in most cases this will |
108 | | - be zero so the dictionary will act as a running tally for a variable but |
109 | | - @init_value can also a list, [], or a dictionary, {}, or a set, set(). |
110 | | - ''' |
111 | | - data = {} |
112 | | - year = datetime.datetime.now().year + 1 |
113 | | - for x in xrange(2001, year): |
114 | | - if init_value == 'set': |
115 | | - data[str(x)] = set() |
116 | | - else: |
117 | | - data[str(x)] = init_value |
118 | | - return data |
119 | 102 | |
120 | | - |
121 | | -def add_months_to_datacontainer(datacontainer): |
122 | | - for dc in datacontainer: |
123 | | - datacontainer[dc] = {} |
124 | | - for x in xrange(1, 13): |
125 | | - datacontainer[dc][str(x)] = 0 |
126 | | - return datacontainer |
127 | | - |
128 | | - |
129 | 103 | def determine_edits_by_month(edits): |
130 | | - datacontainer = create_datacontainer(init_value=0) |
131 | | - datacontainer = add_months_to_datacontainer(datacontainer) |
| 104 | + datacontainer = shaper.create_datacontainer(init_value=0) |
| 105 | + datacontainer = shaper.add_months_to_datacontainer(datacontainer) |
132 | 106 | for year in edits: |
133 | | - months = set() |
134 | 107 | for edit in edits[year]: |
135 | 108 | m = str(edit['date'].month) |
136 | | - if m not in months: |
137 | | - datacontainer[year][m] = 1 |
138 | | - months.add(m) |
139 | | - if len(months) == 12: |
140 | | - break |
| 109 | + datacontainer[year][m] += 1 |
141 | 110 | return datacontainer |
142 | 111 | |
143 | 112 | |
— | — | @@ -144,7 +113,7 @@ |
145 | 114 | ''' |
146 | 115 | This function counts the number of edits by year made by a particular editor. |
147 | 116 | ''' |
148 | | - edits = create_datacontainer() |
| 117 | + edits = shaper.create_datacontainer() |
149 | 118 | for date in dates: |
150 | 119 | year = str(date['date'].year) |
151 | 120 | edits[year] += 1 |
— | — | @@ -156,7 +125,7 @@ |
157 | 126 | This function counts the number of unique articles by year edited by a |
158 | 127 | particular editor. |
159 | 128 | ''' |
160 | | - articles = create_datacontainer('set') |
| 129 | + articles = shaper.create_datacontainer('set') |
161 | 130 | for date in dates: |
162 | 131 | year = str(date['date'].year) |
163 | 132 | articles[year].add(date['article']) |
— | — | @@ -179,8 +148,8 @@ |
180 | 149 | # definition = kwargs.pop('definition') |
181 | 150 | |
182 | 151 | |
183 | | -def run_optimize_editors(dbname): |
184 | | - ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors') |
| 152 | +def run_optimize_editors(dbname, collection): |
| 153 | + ids = construct_datasets.retrieve_editor_ids_mongo(dbname, collection) |
185 | 154 | kwargs = {'definition': 'traditional', |
186 | 155 | 'pbar': True, |
187 | 156 | } |
— | — | @@ -190,7 +159,7 @@ |
191 | 160 | consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
192 | 161 | |
193 | 162 | for id in ids: |
194 | | - tasks.put(Editor(dbname, id)) |
| 163 | + tasks.put(Editor(dbname, collection, id)) |
195 | 164 | for x in xrange(settings.number_of_processes): |
196 | 165 | tasks.put(None) |
197 | 166 | |
— | — | @@ -212,4 +181,4 @@ |
213 | 182 | |
214 | 183 | if __name__ == '__main__': |
215 | 184 | #debug_optimize_editors('test') |
216 | | - run_optimize_editors('enwiki') |
| 185 | + run_optimize_editors('enwiki', 'test') |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -71,7 +71,7 @@ |
72 | 72 | utils.store_object(editors, settings.binary_location, 'editors') |
73 | 73 | |
74 | 74 | |
75 | | -def mergesort_external_launcher(dbname, input, intermediate_output, output): |
| 75 | +def mergesort_external_launcher(dbname, input, output): |
76 | 76 | files = utils.retrieve_file_list(input, 'txt', mask='') |
77 | 77 | x = 0 |
78 | 78 | maxval = 99999 |
— | — | @@ -80,11 +80,11 @@ |
81 | 81 | maxval = round(len(files) / x) |
82 | 82 | chunks = utils.split_list(files, int(x)) |
83 | 83 | '''1st iteration external mergesort''' |
84 | | - if len(chunks) < 2: |
85 | | - intermediate_output = output |
| 84 | + to_remove = [] |
86 | 85 | for chunk in chunks: |
87 | 86 | filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]] |
88 | | - filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk) |
| 87 | + filename = sort.merge_sorted_files(output, filehandles, chunk) |
| 88 | + to_remove.append(filename) |
89 | 89 | filehandles = [fh.close() for fh in filehandles] |
90 | 90 | '''2nd iteration external mergesort, if necessary''' |
91 | 91 | if len(chunks) > 1: |
— | — | @@ -93,6 +93,9 @@ |
94 | 94 | filename = sort.merge_sorted_files(output, filehandles, 'final') |
95 | 95 | filehandles = [fh.close() for fh in filehandles] |
96 | 96 | filename = 'merged_final.txt' |
| 97 | + for r in to_remove: |
| 98 | + utils.delete_file(output, r) |
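| | + # the filenames collected in to_remove are first-pass intermediates; |
| | + # they are deleted once the final merge has consumed them |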
| 99 | + |
97 | 100 | |
98 | 101 | |
99 | 102 | |
Index: trunk/tools/editor_trends/experience/map.py |
— | — | @@ -0,0 +1,122 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-22' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import xml.etree.cElementTree as cElementTree |
| 8 | +import cStringIO |
| 9 | +import os |
| 10 | +import sys |
| 11 | +import codecs |
| 12 | +import multiprocessing |
| 13 | +sys.path.append('..') |
| 14 | +import cProfile |
| 15 | + |
| 16 | +import configuration |
| 17 | +settings = configuration.Settings() |
| 18 | + |
| 19 | +from etl import extract |
| 20 | +from utils import models |
| 21 | +from wikitree import xml |
| 22 | +from utils import utils |
| 23 | +from etl import chunker |
| 24 | + |
| 25 | + |
| 26 | +def extract_article_talk_pages(page, output, **kwargs): |
| 27 | + tags = {'title': xml.extract_text, |
| 28 | + 'id': xml.extract_text, |
| 29 | + } |
| 30 | + headers = ['id', 'title'] |
| 31 | + vars = {} |
| 32 | + elements = page.getchildren() |
| 33 | + for tag, function in tags.iteritems(): |
| 34 | + xml_node = xml.retrieve_xml_node(elements, tag) |
| 35 | + vars[tag] = function(xml_node, kwargs) |
| 36 | + |
| 37 | + data = [] |
| 38 | + for head in headers: |
| 39 | + data.append(vars[head]) |
| 40 | + utils.write_list_to_csv(data, output) |
| 41 | + |
| 42 | + |
| 43 | +def map_article_talk_ids(language_code): |
| 44 | + ns = chunker.load_namespace(language_code) |
| 45 | + talk_ns = ns['1'].get(u'*', None) |
| 46 | + input = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk') |
| 47 | + files = utils.retrieve_file_list(input, 'txt') |
| 48 | + articles = {} |
| 49 | + talks = {} |
| 50 | + for file in files: |
| 51 | + fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 52 | + for line in fh: |
| 53 | + line = line.replace('\n', '') |
| 54 | + id, article = line.split('\t') |
| 55 | + if not article.startswith(talk_ns): |
| 56 | + articles[article] = {} |
| 57 | + articles[article]['id'] = id |
| 58 | + else: |
| 59 | + talks[article] = id |
| 60 | + fh.close() |
| 61 | + utils.store_object(articles, settings.binary_location, 'articles.bin') |
| 62 | + utils.store_object(talks, settings.binary_location, 'talks.bin') |
| 63 | + |
| 64 | + for article in articles: |
| 65 | + talk = '%s:%s' % (talk_ns, article) |
| 66 | + if talk in talks: |
| 67 | + articles[article]['talk_id'] = talks[talk] |
| 68 | + |
| 69 | + utils.store_object(articles, settings.binary_location, 'articles_talks.bin') |
| 70 | + |
| 71 | + |
| 72 | +def article_to_talk_launcher(**kwargs): |
| 73 | + file = 'dewiki-latest-stub-meta-current.xml' #'enwiki-20100916-stub-meta-history.xml' |
| 74 | + include = [0, 1] |
| 75 | + language_code = 'en' |
| 76 | + project = 'wiki' |
| 77 | + input = os.path.join(settings.input_location, 'en', 'wiki') |
| 78 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'chunks') |
| 79 | + chunker.split_file(input, file, project, language_code, include, format='xml', zip=True) |
| 80 | + files = utils.retrieve_file_list(output, 'xml') |
| 81 | + |
| 82 | + |
| 83 | + tasks = multiprocessing.JoinableQueue() |
| 84 | + consumers = [extract.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
| 85 | + input = output |
| 86 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'article_talk') |
| 87 | + for file in files: |
| 88 | + tasks.put(extract.XMLFile(input, output, file, [], extract_article_talk_pages, destination='file')) |
| 89 | + for x in xrange(settings.number_of_processes): |
| 90 | + tasks.put(None) |
| 91 | + |
| 92 | + print tasks.qsize() |
| 93 | + for w in consumers: |
| 94 | + w.start() |
| 95 | + |
| 96 | + tasks.join() |
| 97 | + |
| 98 | + |
| 99 | +def debug_map_article_talk_ids(): |
| 100 | + map_article_talk_ids('de') |
| 101 | + |
| 102 | + |
| 103 | +def debug_article_to_talk(): |
| 104 | + input = os.path.join(settings.input_location, 'en', 'wiki', 'chunks', '0.xml') |
| 105 | + output = os.path.join(settings.input_location, 'en', 'wiki', 'txt', 'test.txt') |
| 106 | + f = codecs.open(output, 'w', encoding=settings.encoding) |
| 107 | + fh = open(input, 'r') |
| 108 | + data = xml.read_input(fh) |
| 109 | + for raw_data in data: |
| 110 | + xml_buffer = cStringIO.StringIO() |
| 111 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 112 | + raw_data = ''.join(raw_data) |
| 113 | + xml_buffer.write(raw_data) |
| 114 | + elem = cElementTree.XML(xml_buffer.getvalue()) |
| 115 | + extract_article_talk_pages(elem, f) |
| 116 | + f.close() |
| 117 | + |
| 118 | + |
| 119 | +if __name__ == '__main__': |
| 120 | + #cProfile.run('article_to_talk_launcher()') |
| 121 | + #debug_article_to_talk() |
| 122 | + debug_map_article_talk_ids() |
| 123 | + #article_to_talk_launcher() |
Property changes on: trunk/tools/editor_trends/experience/map.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 124 | + native |
Index: trunk/tools/editor_trends/experience/__init__.py |
Property changes on: trunk/tools/editor_trends/experience/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 125 | + native |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -27,6 +27,7 @@ |
28 | 28 | import os |
29 | 29 | import sys |
30 | 30 | import platform |
| 31 | +import subprocess |
31 | 32 | |
32 | 33 | try: |
33 | 34 | from _winreg import * |
— | — | @@ -138,14 +139,17 @@ |
139 | 140 | return QueryValueEx(key, 'Path')[0] |
140 | 141 | except WindowsError: |
141 | 142 | return None |
142 | | - |
143 | | - |
| 143 | + |
| 144 | + def detect_linux_program(self, program): |
| 145 | + path = subprocess.Popen(['which', program], stdout=subprocess.PIPE).communicate()[0] |
| 146 | + return path.replace('\n', '') |
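| | + # e.g. detect_linux_program('7z') returns whatever path `which` prints, |
| | + # such as '/usr/bin/7z', or '' when the program is not installed. |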
| 147 | + |
144 | 148 | def detect_installed_program(self, program): |
145 | 149 | if self.platform == 'Windows': |
146 | 150 | path = self.detect_windows_program(program) |
147 | | - return path |
148 | | - else: |
149 | | - raise NotImplementedError |
| 151 | + elif self.platform == 'Linux': |
| 152 | + path = self.detect_linux_program(program) |
| 153 | + else: |
| 154 | + raise NotImplementedError |
| 155 | + return path |
150 | 154 | |
151 | 155 | def determine_max_filehandles_open(self): |
152 | 156 | if self.platform == 'Windows' and self.architecture == 'i386': |
— | — | @@ -167,7 +171,7 @@ |
168 | 172 | def determine_ziptool(self): |
169 | 173 | tools = {'OSX': None, |
170 | 174 | 'Windows': '7z.exe', |
171 | | - 'Linux': None} |
| 175 | + 'Linux': 'unzip'} |
172 | 176 | return tools[self.platform] |
173 | 177 | |
174 | 178 | def set_file_locations(self): |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -143,16 +143,13 @@ |
144 | 144 | |
145 | 145 | |
146 | 146 | # read / write data related functions |
147 | | -def read_data_from_csv(filename, encoding): |
| 147 | +def read_data_from_csv(location, filename, encoding): |
148 | 148 | ''' |
149 | 149 | @filename is the path (either absolute or relative) including the name of |
150 | 150 | the file |
151 | 151 | @encoding is usually utf-8 |
152 | 152 | ''' |
153 | | - if hasattr(filename, '__call__'): |
154 | | - filename = construct_filename(filename) |
155 | | - |
156 | | - fh = open_txt_file(filename, 'r', encoding=encoding) |
| 153 | + fh = create_txt_filehandle(location, filename, 'r', encoding) |
157 | 154 | for line in fh: |
158 | 155 | yield line |
159 | 156 | |
— | — | @@ -220,6 +217,7 @@ |
221 | 218 | |
222 | 219 | def write_dict_to_csv(data, fh, write_key=True, newline=True): |
223 | 220 | keys = data.keys() |
| 221 | + keys.sort() |
224 | 222 | for key in keys: |
225 | 223 | if write_key: |
226 | 224 | fh.write('%s' % key) |
— | — | @@ -317,12 +315,12 @@ |
318 | 316 | return dict([[v, k] for k, v in dictionary.items()]) |
319 | 317 | |
320 | 318 | |
321 | | -def create_dict_from_csv_file(filename, encoding): |
| 319 | +def create_dict_from_csv_file(location, filename, encoding): |
322 | 320 | ''' |
323 | 321 | Constructs a dictionary from a txtfile |
324 | 322 | ''' |
325 | 323 | d = {} |
326 | | - for line in read_data_from_csv(filename, encoding): |
| 324 | + for line in read_data_from_csv(location, filename, encoding): |
327 | 325 | line = clean_string(line) |
328 | 326 | value, key = line.split('\t') |
329 | 327 | d[key] = value |
— | — | @@ -375,12 +373,13 @@ |
376 | 374 | raise exceptions.PlatformNotSupportedError |
377 | 375 | |
378 | 376 | |
379 | | -def zip_extract(path, location, source): |
| 377 | +def zip_extract(location, source): |
380 | 378 | ''' |
381 | 379 | @path is the absolute path to the zip program |
382 | 380 | @location is the directory where the compressed file is stored |
383 | 381 | @source is the name of the zipfile |
384 | 382 | ''' |
| 383 | + path = settings.path_ziptool |
385 | 384 | if settings.platform == 'Windows': |
386 | 385 | p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
387 | 386 | elif settings.platform == 'Linux': |
Index: trunk/tools/editor_trends/tests/mongodb/store.py |
— | — | @@ -0,0 +1,28 @@ |
| 2 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 3 | +__author__email = 'dvanliere at gmail dot com' |
| 4 | +__date__ = '2010-11-09' |
| 5 | +__version__ = '0.1' |
| 6 | + |
| 7 | +import datetime |
| 8 | +import calendar |
| 9 | +import time |
| 10 | +from database import db |
| 11 | + |
| 12 | + |
| 13 | +def test_date(): |
| 14 | + |
| 15 | + mongo = db.init_mongo_db('unit_test') |
| 16 | + collection = mongo['foo'] |
| 17 | + d1 = datetime.datetime(2007, 1, 1) |
| 18 | + d2 = datetime.datetime(2006, 12, 31) |
| 19 | + |
| 20 | + if d1.utcoffset() is not None: |
| 21 | + d1 = d1 - d1.utcoffset() |
| 22 | + millis = int(calendar.timegm(d1.timetuple()) * 1000 + d1.microsecond / 1000) |
| 23 | + millis = millis / 1000 |
| 24 | + d3 = time.gmtime(millis) |
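| | + # e.g. datetime(2007, 1, 1) -> 1167609600000 ms since the Unix epoch; |
| | + # dividing by 1000 gives the 1167609600 s that time.gmtime() expects |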
| 25 | + #d3 = datetime.date(2007, 1, 1) |
| 26 | + collection.insert({'date': d1}) |
| 27 | + collection.insert({'date': d2}) |
| 28 | + #collection.insert({'date': d3}) |
| 29 | + |
Property changes on: trunk/tools/editor_trends/tests/mongodb/store.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 30 | + native |
Index: trunk/tools/editor_trends/tests/mongodb/__init__.py |
Property changes on: trunk/tools/editor_trends/tests/mongodb/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
2 | 31 | + native |
Index: trunk/tools/editor_trends/tests/__init__.py |
Property changes on: trunk/tools/editor_trends/tests/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
3 | 32 | + native |
Index: trunk/tools/editor_trends/tests/test.py |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +import configuration |
| 3 | +settings = configuration.Settings() |
| 4 | + |
| 5 | +from tests.mongodb import store |
| 6 | + |
| 7 | +store.test_date() |
\ No newline at end of file |
Property changes on: trunk/tools/editor_trends/logs |
___________________________________________________________________ |
Added: svn:ignore |
1 | 8 | + *.bin |
split_xml |
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -0,0 +1,141 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | + |
| 18 | +import os |
| 19 | +import cStringIO |
| 20 | +import xml.etree.cElementTree as cElementTree |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import configuration |
| 25 | +settings = configuration.Settings() |
| 26 | +from wikitree import xml |
| 27 | +from database import db |
| 28 | +from database import db_settings |
| 29 | +from utils import utils |
| 30 | +from utils import process_constructor as pc |
| 31 | + |
| 32 | +try: |
| 33 | + import psyco |
| 34 | + psyco.full() |
| 35 | +except ImportError: |
| 36 | + pass |
| 37 | + |
| 38 | + |
| 39 | +def read_bots_csv_file(location, filename, encoding): |
| 40 | + ''' |
| 41 | + Constructs a dictionary: |
| 42 | + key is language |
| 43 | + value is a list of bot names |
| 44 | + ''' |
| 45 | + d = {} |
| 46 | + for line in utils.read_data_from_csv(location, filename, encoding): |
| 47 | + line = utils.clean_string(line) |
| 48 | + language, bots = line.split(',') |
| 49 | + bots = bots.split('|') |
| 50 | + for bot in bots: |
| 51 | + if bot not in d: |
| 52 | + d[bot] = {} |
| 53 | + d[bot]['id'] = None |
| 54 | + d[bot]['languages'] = [] |
| 55 | + d[bot]['languages'].append(language) |
| 56 | + return d |
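| | + # Assumed Bots.csv layout, per the split logic above: one line per language |
| | + # with bot names separated by '|', e.g. 'en,ClueBot|SineBot' -> |
| | + # d['ClueBot'] == {'id': None, 'languages': ['en']} |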
| 57 | + |
| 58 | + |
| 59 | +def store_bots(): |
| 60 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding) |
| 61 | + mongo = db.init_mongo_db('bots') |
| 62 | + collection = mongo['ids'] |
| 63 | + db.remove_documents_from_mongo_db(collection, None) |
| 64 | + for name, bot in bots.iteritems(): |
| 65 | + collection.insert({'id': bot['id'], 'name': name, 'languages': bot['languages']}) |
| 66 | + |
| 67 | + print 'Stored %s bots' % collection.count() |
| 68 | + |
| 69 | + |
| 70 | +def lookup_bot_userid(input_queue, language_code, project, bots): |
| 71 | + ''' |
| 72 | + This function is used to find the id's belonging to the different bots that |
| 73 | + are patrolling the Wikipedia sites. |
| 74 | + @input_queue contains a list of xml files to parse |
| 75 | + @bots is a dictionary containing the names of the bots to lookup |
| 76 | + ''' |
| 77 | + if settings.debug: |
| 78 | + messages = {} |
| 79 | + |
| 80 | + location = os.path.join(settings.input_location, language_code, project, 'chunks') |
| 81 | + fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding) |
| 82 | + |
| 83 | + while True: |
| 84 | + file = input_queue.get(block=False) |
| 85 | + if file == None: |
| 86 | + break |
| 87 | + data = xml.read_input(utils.create_txt_filehandle(location, |
| 88 | + file, |
| 89 | + 'r', |
| 90 | + settings.encoding)) |
| 91 | + |
| 92 | + for raw_data in data: |
| 93 | + xml_buffer = cStringIO.StringIO() |
| 94 | + raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
| 95 | + raw_data = ''.join(raw_data) |
| 96 | + raw_data = raw_data.encode('utf-8') |
| 97 | + xml_buffer.write(raw_data) |
| 98 | + |
| 99 | + try: |
| 100 | + xml_nodes = cElementTree.XML(xml_buffer.getvalue()) |
| 101 | + revisions = xml_nodes.findall('revision') |
| 102 | + for revision in revisions: |
| 103 | + contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 104 | + username = contributor.find('username') |
| 105 | + if username == None: |
| 106 | + continue |
| 107 | + username = xml.extract_text(username, None) |
| 108 | + #print username.encode('utf-8') |
| 109 | + if username in bots: |
| 110 | + id = contributor.find('id') |
| 111 | + id = xml.extract_text(id, None) |
| 112 | + #print username.encode('utf-8'), id |
| 113 | + bot = bots[username] |
| 114 | + bot['_username'] = username |
| 115 | + bot['id'] = id |
| 116 | + utils.write_dict_to_csv(bot, fh, write_key=False) |
| 117 | + bots.pop(username) |
| 118 | + if bots == {}: |
| 119 | + print 'Found id numbers for all bots.' |
| 120 | + return |
| 121 | + |
| 122 | + except Exception, error: |
| 123 | + print error |
| 124 | + if settings.debug: |
| 125 | + messages = utils.track_errors(xml_buffer, error, file, |
| 126 | + messages) |
| 127 | + fh.close() |
| 128 | + |
| 129 | + if settings.debug: |
| 130 | + utils.report_error_messages(messages, lookup_bot_userid) |
| 131 | + |
| 132 | +def bot_launcher(language_code, project): |
| 133 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding) |
| 134 | + files = utils.retrieve_file_list(os.path.join(settings.input_location, language_code, project, 'chunks'), 'xml', mask=None) |
| 135 | + input_queue = pc.load_queue(files, poison_pill=True) |
| 136 | + lookup_bot_userid(input_queue, language_code, project, bots) |
| 137 | + |
| 138 | + |
| 139 | +if __name__ == '__main__': |
| 140 | + language_code = 'en' |
| 141 | + project = 'wiki' |
| 142 | + bot_launcher(language_code, project) |
Property changes on: trunk/tools/editor_trends/bots/bots.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 143 | + native |
Added: svn:mime-type |
2 | 144 | + text/plain |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
3 | 145 | - wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
4 | 146 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |