Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -41,15 +41,15 @@ |
42 | 42 | return getattr(args, key, None) |
43 | 43 | |
44 | 44 | |
45 | | -def config_launcher(args, location, filename, project, language_code): |
| 45 | +def config_launcher(args, location, filename, project, full_project, language_code): |
46 | 46 | config.load_configuration(args) |
47 | 47 | |
48 | 48 | |
49 | 49 | def determine_default_language(): |
50 | 50 | language_code = locale.getdefaultlocale()[0] |
51 | 51 | return language_code.split('_')[0] |
52 | | - |
53 | | - |
| 52 | + |
| 53 | + |
54 | 54 | def retrieve_projectname(args): |
55 | 55 | language_code = retrieve_language(args) |
56 | 56 | if language_code == None: |
— | — | @@ -90,33 +90,39 @@ |
91 | 91 | language_code = retrieve_language(args) |
92 | 92 | locations['language_code'] = language_code |
93 | 93 | locations['location'] = os.path.join(location, language_code, project) |
94 | | - locations['project'] = retrieve_projectname(args) |
| 94 | + locations['project'] = project |
| 95 | + locations['full_project'] = retrieve_projectname(args) |
95 | 96 | locations['filename'] = generate_wikidump_filename(project, args) |
96 | 97 | return locations |
97 | 98 | |
98 | 99 | |
99 | | -def show_settings(args, location, filename, project, language_code): |
| 100 | +def prepare_file_locations(location): |
| 101 | + result = utils.check_file_exists(location, '') |
| 102 | + if not result: |
| 103 | + utils.create_directory(location) |
| 104 | + |
| 105 | + |
| 106 | +def show_settings(args, location, filename, project, full_project, language_code): |
100 | 107 | project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki') |
101 | 108 | project = project.title() |
102 | 109 | language_map = utils.invert_dict(languages.MAPPING) |
103 | 110 | print 'Project: %s' % (project) |
104 | 111 | print 'Language: %s' % language_map[language_code].decode('utf-8') |
105 | 112 | print 'Input directory: %s' % location |
106 | | - print 'Output directory: TODO' |
107 | | - |
| 113 | + print 'Output directory: %s and subdirectories' % location |
108 | 114 | |
109 | | -def dump_downloader_launcher(args, location, filename, project, language_code): |
| 115 | + |
| 116 | +def dump_downloader_launcher(args, location, filename, project, full_project, language_code): |
110 | 117 | print 'dump downloader' |
111 | 118 | pbar = get_value(args, 'progress') |
112 | 119 | domain = settings.WP_DUMP_LOCATION |
113 | 120 | path = '/%s/latest/' % project |
114 | 121 | extension = utils.determine_file_extension(filename) |
115 | 122 | filemode = utils.determine_file_mode(extension) |
116 | | - |
117 | 123 | dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
118 | 124 | |
119 | 125 | |
120 | | -def split_xml_file_launcher(args, location, filename, project, language_code): |
| 126 | +def split_xml_file_launcher(args, location, filename, project, full_project, language_code): |
121 | 127 | print 'split_xml_file_launcher' |
122 | 128 | ext = utils.determine_file_extension(filename) |
123 | 129 | if ext in settings.COMPRESSION_EXTENSIONS: |
— | — | @@ -136,7 +142,7 @@ |
137 | 143 | path = config.detect_installed_program('7zip') |
138 | 144 | source = os.path.join(location, file) |
139 | 145 | p = None |
140 | | - |
| 146 | + |
141 | 147 | if settings.OS == 'Windows': |
142 | 148 | p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
143 | 149 | elif settings.OS == 'Linux': |
— | — | @@ -148,18 +154,22 @@ |
149 | 155 | return p |
150 | 156 | |
151 | 157 | |
152 | | -def mongodb_script_launcher(args, location, filename, project, language_code): |
| 158 | +def mongodb_script_launcher(args, location, filename, project, full_project, language_code): |
153 | 159 | print 'mongodb_script_launcher' |
154 | 160 | map_wiki_editors.run_parse_editors(project, language_code, location) |
155 | 161 | |
156 | 162 | |
157 | | -def dataset_launcher(args, project): |
| 163 | +def sort_launcher(args, location, filename, project, full_project, language_code): |
| 164 | + raise NotImplementedError |
| 165 | + |
| 166 | + |
| 167 | +def dataset_launcher(args, location, filename, project, full_project, language_code): |
158 | 168 | print 'dataset launcher' |
159 | 169 | optimize_editors.run_optimize_editors(full_project) |
160 | 170 | construct_datasets.generate_editor_dataset_launcher(full_project) |
161 | 171 | |
162 | 172 | |
163 | | -def all_launcher(args, location, filename, project, language_code): |
| 173 | +def all_launcher(args, location, filename, project, full_project, language_code): |
164 | 174 | print 'all_launcher' |
165 | 175 | dump_downloader_launcher(args, location, filename, project, full_project, language_code) |
166 | 176 | split_xml_file_launcher(args, location, filename, project, full_project, language_code) |
— | — | @@ -173,7 +183,7 @@ |
174 | 184 | return tuple(choices) |
175 | 185 | |
176 | 186 | |
177 | | -def show_languages(args, location, filename, project, language_code): |
| 187 | +def show_languages(args, location, filename, project, full_project, language_code): |
178 | 188 | first = get_value(args, 'startswith') |
179 | 189 | if first != None: |
180 | 190 | first = first.title() |
— | — | @@ -193,18 +203,18 @@ |
194 | 204 | |
195 | 205 | |
196 | 206 | def detect_python_version(): |
197 | | - version = ''.join(sys.version_info[0:2]) |
| 207 | + version = sys.version_info[0:2] |
198 | 208 | if version < settings.MINIMUM_PYTHON_VERSION: |
199 | | - raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
| 209 | + raise Exception('Please upgrade to Python 2.6 or higher (but not Python 3.x).') |
200 | 210 | |
201 | 211 | def about(): |
202 | 212 | print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.' |
203 | 213 | print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
204 | 214 | print 'This software comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to distribute it under certain conditions.' |
205 | 215 | print 'See the README.1ST file for more information.' |
206 | | - print '' |
207 | | - |
| 216 | + print '\n' |
208 | 217 | |
| 218 | + |
209 | 219 | def main(): |
210 | 220 | default_language = determine_default_language() |
211 | 221 | file_choices = ('stub-meta-history.xml.gz', |
— | — | @@ -216,8 +226,8 @@ |
217 | 227 | subparsers = parser.add_subparsers(help='sub-command help') |
218 | 228 | |
219 | 229 | parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') |
220 | | - parser_languages.add_argument('-s', '--startswith', |
221 | | - action='store', |
| 230 | + parser_languages.add_argument('-s', '--startswith', |
| 231 | + action='store', |
222 | 232 | help='Enter the first letter of a language to see which languages are available.') |
223 | 233 | parser_languages.set_defaults(func=show_languages) |
224 | 234 | |
— | — | @@ -230,12 +240,15 @@ |
231 | 241 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file into smaller chunks to parallelize information extraction.') |
232 | 242 | parser_split.set_defaults(func=split_xml_file_launcher) |
233 | 243 | |
| 244 | + parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant reductions in processing time are achieved.') |
| 245 | + parser_sort.set_defaults(func=sort_launcher) |
| 246 | + |
234 | 247 | parser_create = subparsers.add_parser('store', help='The store sub command parses the XML chunk files, extracts the information and stores it in MongoDB.') |
235 | 248 | parser_create.set_defaults(func=mongodb_script_launcher) |
236 | 249 | |
237 | 250 | parser_dataset = subparsers.add_parser('dataset', help='Create a dataset from the MongoDB and write it to a csv file.') |
238 | 251 | parser_dataset.set_defaults(func=dataset_launcher) |
239 | | - |
| 252 | + |
240 | 253 | parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
241 | 254 | parser_all.set_defaults(func=all_launcher) |
242 | 255 | |
— | — | @@ -265,6 +278,7 @@ |
266 | 279 | args = parser.parse_args() |
267 | 280 | config.load_configuration(args) |
268 | 281 | locations = determine_file_locations(args) |
| 282 | + prepare_file_locations(locations['location']) |
269 | 283 | about() |
270 | 284 | show_settings(args, **locations) |
271 | 285 | args.func(args, **locations) |
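Note on the detect_python_version change above: Python compares tuples element-wise, which is what makes the new (2, 6) constant in settings.py reliable where the old float was not. A minimal sketch of the corrected check:

    import sys

    MINIMUM_PYTHON_VERSION = (2, 6)  # mirrors settings.MINIMUM_PYTHON_VERSION

    def detect_python_version():
        # sys.version_info[0:2] is e.g. (2, 6); tuple comparison is
        # element-wise, so (2, 5) < (2, 6) and (2, 10) > (2, 6), whereas
        # the old float constant got 2.10 wrong (2.10 < 2.6 as floats).
        if sys.version_info[0:2] < MINIMUM_PYTHON_VERSION:
            raise Exception('Please upgrade to Python 2.6 or higher (but not Python 3.x).')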
Index: trunk/tools/editor_trends/optimize_editors.py |
— | — | @@ -29,6 +29,13 @@ |
30 | 30 | import construct_datasets |
31 | 31 | |
32 | 32 | |
| 33 | +try: |
| 34 | + import psyco |
| 35 | + psyco.full() |
| 36 | +except ImportError: |
| 37 | + pass |
| 38 | + |
| 39 | + |
33 | 40 | def create_datacontainer(init_value=0): |
34 | 41 | ''' |
35 | 42 | This function initializes an empty dictionary with as key the year (starting |
— | — | @@ -43,6 +50,29 @@ |
44 | 51 | return data |
45 | 52 | |
46 | 53 | |
| 54 | +def add_months_to_datacontainer(datacontainer): |
| 55 | + for dc in datacontainer: |
| 56 | + datacontainer[dc] = {} |
| 57 | + for x in xrange(1, 13): |
| 58 | + datacontainer[dc][str(x)] = 0 |
| 59 | + return datacontainer |
| 60 | + |
| 61 | + |
| 62 | +def determine_edits_by_month(edits): |
| 63 | + datacontainer = create_datacontainer(init_value=0) |
| 64 | + datacontainer = add_months_to_datacontainer(datacontainer) |
| 65 | + for year in edits: |
| 66 | + months = set() |
| 67 | + for edit in edits[year]: |
| 68 | + m = str(edit['date'].month) |
| 69 | + if m not in months: |
| 70 | + datacontainer[year][m] = 1 |
| 71 | + months.add(m) |
| 72 | + if len(months) == 12: |
| 73 | + break |
| 74 | + return datacontainer |
| 75 | + |
| 76 | + |
47 | 77 | def determine_edits_by_year(dates): |
48 | 78 | ''' |
49 | 79 | This function counts the number of edits by year made by a particular editor. |
— | — | @@ -64,19 +94,19 @@ |
65 | 95 | year = str(date['date'].year) |
66 | 96 | articles[year].add(date['article']) |
67 | 97 | for article in articles: |
68 | | - articles[article] = len(article) |
| 98 | + articles[article] = len(articles[article]) |
69 | 99 | return articles |
70 | 100 | |
71 | 101 | |
72 | 102 | def sort_edits(edits): |
73 | 103 | edits = utils.merge_list(edits) |
74 | 104 | return sorted(edits, key=itemgetter('date')) |
75 | | - |
76 | 105 | |
| 106 | + |
77 | 107 | def optimize_editors(input_queue, result_queue, pbar, **kwargs): |
78 | 108 | dbname = kwargs.pop('dbname') |
79 | 109 | mongo = db.init_mongo_db(dbname) |
80 | | - input = mongo['editors'] |
| 110 | + input = mongo['test'] |
81 | 111 | output = mongo['dataset'] |
82 | 112 | output.ensure_index('editor') |
83 | 113 | output.ensure_index('year_joined') |
— | — | @@ -85,7 +115,10 @@ |
86 | 116 | try: |
87 | 117 | id = input_queue.get(block=False) |
88 | 118 | editor = input.find_one({'editor': id}) |
| 119 | + if editor is None: |
| 120 | + continue |
89 | 121 | edits = editor['edits'] |
| 122 | + monthly_edits = determine_edits_by_month(edits) |
90 | 123 | edits = sort_edits(edits) |
91 | 124 | edit_count = len(edits) |
92 | 125 | new_wikipedian = edits[9]['date'] |
— | — | @@ -93,6 +126,7 @@ |
94 | 127 | final_edit = edits[-1]['date'] |
95 | 128 | edits_by_year = determine_edits_by_year(edits) |
96 | 129 | articles_by_year = determine_articles_by_year(edits) |
| 130 | + |
97 | 131 | edits = edits[:10] |
98 | 132 | |
99 | 133 | output.insert({'editor': id, 'edits': edits, |
— | — | @@ -101,7 +135,8 @@ |
102 | 136 | 'edit_count': edit_count, |
103 | 137 | 'final_edit': final_edit, |
104 | 138 | 'first_edit': first_edit, |
105 | | - 'articles_by_year': articles_by_year}) |
| 139 | + 'articles_by_year': articles_by_year, |
| 140 | + 'monthly_edits': monthly_edits}) |
106 | 141 | print 'Items left: %s' % input_queue.qsize() |
107 | 142 | except Empty: |
108 | 143 | break |
— | — | @@ -114,20 +149,11 @@ |
115 | 150 | 'dbname': 'enwiki', |
116 | 151 | 'nr_input_processors': 1, |
117 | 152 | 'nr_output_processors': 0, |
| 153 | + 'poison_pill': False |
118 | 154 | } |
119 | 155 | print len(ids) |
120 | 156 | ids = list(ids) |
121 | | - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES) |
122 | | -# chunks = {} |
123 | | -# parts = int(round(float(len(ids)) / 1, 0)) |
124 | | -# a = 0 |
125 | | -# for x in xrange(settings.NUMBER_OF_PROCESSES): |
126 | | -# b = a + parts |
127 | | -# chunks[x] = ids[a:b] |
128 | | -# a = (x + 1) * parts |
129 | | -# if a >= len(ids): |
130 | | -# break |
131 | | - |
| 157 | + chunks = {0: ids} |
132 | 158 | pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs) |
133 | 159 | |
134 | 160 | |
— | — | @@ -142,4 +168,4 @@ |
143 | 169 | |
144 | 170 | if __name__ == '__main__': |
145 | 171 | #debug_optimize_editors('test') |
146 | | - run_optimize_editors('enwiki') |
| 172 | + run_optimize_editors('enwiki') |
\ No newline at end of file |
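determine_edits_by_month above collapses an editor's history into per-year month flags: a month becomes 1 as soon as one edit is seen in it, and the inner loop stops early once all twelve months of a year are covered. A usage sketch with a hypothetical edits structure shaped like the MongoDB 'edits' field:

    import datetime

    edits = {'2009': [{'date': datetime.datetime(2009, 1, 5)},
                      {'date': datetime.datetime(2009, 1, 9)},  # same month, flagged once
                      {'date': datetime.datetime(2009, 4, 2)}]}
    flags = determine_edits_by_month(edits)
    # flags['2009'] -> {'1': 1, '2': 0, '3': 0, '4': 1, ..., '12': 0}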
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -54,7 +54,7 @@ |
55 | 55 | IGNORE_DIRS = ['wikistats', 'zips'] |
56 | 56 | ROOT = '/' if OS != 'Windows' else 'c:\\' |
57 | 57 | |
58 | | -MINIMUM_PYTHON_VERSION = 2.6 |
| 58 | +MINIMUM_PYTHON_VERSION = (2, 6) |
59 | 59 | |
60 | 60 | dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
61 | 61 | os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
— | — | @@ -111,7 +111,7 @@ |
112 | 112 | MAX_XML_FILE_SIZE = 67108864 |
113 | 113 | |
114 | 114 | if OS == 'Windows' and ARCH == 'i386': |
115 | | - MAX_FILES_OPEN = win32file._getmaxstdio() |
| 115 | + MAX_FILES_OPEN = win32file._getmaxstdio() |
116 | 116 | elif OS != 'Windows': |
117 | 117 | MAX_FILES_OPEN = resource.getrlimit(resource.RLIMIT_NOFILE) |
118 | 118 | else: |
Index: trunk/tools/editor_trends/run.py |
— | — | @@ -1,5 +1,48 @@ |
2 | 2 | import os |
| 3 | +import settings |
| 4 | +#from utils import namespace_downloader as nd |
| 5 | +#nd.launch_downloader() |
3 | 6 | |
| 7 | + |
| 8 | +#def which(program): |
| 9 | +# import os |
| 10 | +# def is_exe(fpath): |
| 11 | +# return os.path.exists(fpath) and os.access(fpath, os.X_OK) |
| 12 | +# |
| 13 | +# fpath, fname = os.path.split(program) |
| 14 | +# if fpath: |
| 15 | +# if is_exe(program): |
| 16 | +# return program |
| 17 | +# else: |
| 18 | +# for path in os.environ["PATH"].split(os.pathsep): |
| 19 | +# exe_file = os.path.join(path, program) |
| 20 | +# if is_exe(exe_file): |
| 21 | +# return exe_file |
| 22 | +# |
| 23 | +# return None |
| 24 | +# |
| 25 | +# |
| 26 | +#result = which('7z.exe') |
| 27 | +#print result |
| 28 | + |
| 29 | +#from database import launcher |
| 30 | +#launcher.launcher() |
| 31 | +from utils import sort |
| 32 | +input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt') |
| 33 | +output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted') |
| 34 | +dbname = 'enwiki' |
| 35 | +#sort.debug_mergesort_feeder(input, output) |
| 36 | +#sort.mergesort_launcher(input, output) |
| 37 | +#sort.mergesort_external_launcher(dbname, output, output) |
| 38 | + |
| 39 | + |
| 40 | + |
| 41 | + |
| 42 | + |
| 43 | +from analyses import cohort_charts |
| 44 | +cohort_charts.prepare_cohort_dataset() |
| 45 | +import os |
| 46 | + |
4 | 47 | import settings |
5 | 48 | #from utils import namespace_downloader as nd |
6 | 49 | #nd.launch_downloader() |
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -24,10 +24,13 @@ |
25 | 25 | |
26 | 26 | import settings |
27 | 27 | from utils import utils |
| 28 | +try: |
| 29 | + from _winreg import * |
| 30 | +except ImportError: |
| 31 | + pass |
28 | 32 | |
29 | | - |
30 | 33 | def detect_windows_program(program): |
31 | | - from _winreg import * |
| 34 | + |
32 | 35 | entry = settings.WINDOWS_REGISTER[program] |
33 | 36 | try: |
34 | 37 | key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -171,7 +171,7 @@ |
172 | 172 | return 'wb' |
173 | 173 | |
174 | 174 | |
175 | | -def write_list_to_csv(data, fh, recursive=False): |
| 175 | +def write_list_to_csv(data, fh, recursive=False, newline=True): |
176 | 176 | ''' |
177 | 177 | @data is a list which can contain other lists that will be written as a |
178 | 178 | single line to a textfile |
— | — | @@ -188,27 +188,32 @@ |
189 | 189 | if tab: |
190 | 190 | fh.write('\t') |
191 | 191 | if type(d) == type([]): |
192 | | - recursive = write_list_to_csv(d, fh, True) |
| 192 | + recursive = write_list_to_csv(d, fh, recursive=True, newline=False) |
| 193 | + elif type(d) == type({}): |
| 194 | + tab = write_dict_to_csv(d, fh, write_key=False, newline=newline) |
193 | 195 | else: |
194 | 196 | fh.write('%s' % d) |
195 | 197 | tab = True |
196 | 198 | if recursive: |
197 | 199 | tab = False |
198 | 200 | return True |
199 | | - fh.write('\n') |
| 201 | + if newline: |
| 202 | + fh.write('\n') |
200 | 203 | |
201 | 204 | |
202 | | -def write_dict_to_csv(data, fh): |
| 205 | +def write_dict_to_csv(data, fh, write_key=True, newline=True): |
203 | 206 | keys = data.keys() |
204 | 207 | for key in keys: |
205 | | - fh.write('%s' % key) |
206 | | - for obs in data[key]: |
207 | | - if getattr(obs, '__iter__', False): |
208 | | - for o in obs: |
209 | | - fh.write('\t%s' % o) |
210 | | - else: |
211 | | - fh.write('\t%s' % (obs)) |
| 208 | + if write_key: |
| 209 | + fh.write('%s' % key) |
| 210 | + if getattr(data[key], '__iter__', False): |
| 211 | + for d in data[key]: |
| 212 | + fh.write('\t%s' % d) |
| 213 | + else: |
| 214 | + fh.write('%s\t' % (data[key])) |
| 215 | + if newline: |
212 | 216 | fh.write('\n') |
| 217 | + return False  # this prevents the calling function from writing \t |
213 | 218 | |
214 | 219 | |
215 | 220 | def create_txt_filehandle(location, name, mode, encoding): |
— | — | @@ -326,6 +331,7 @@ |
327 | 332 | files.append('.'.join(file)) |
328 | 333 | return files |
329 | 334 | |
| 335 | + |
330 | 336 | def merge_list(datalist): |
331 | 337 | merged = [] |
332 | 338 | for d in datalist: |
— | — | @@ -333,6 +339,7 @@ |
334 | 340 | merged.append(x) |
335 | 341 | return merged |
336 | 342 | |
| 343 | + |
337 | 344 | def split_list(datalist, maxval): |
338 | 345 | chunks = {} |
339 | 346 | a = 0 |
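The reworked write_list_to_csv/write_dict_to_csv pair flattens nested lists and dicts onto a single tab-separated line per top-level call, using the newline and write_key flags to suppress the line break and the keys for nested values. A sketch of the intended behaviour, assuming the caller owns the file handle:

    fh = open('example.csv', 'w')
    # nested list: written inline (recursive=True, newline=False)
    write_list_to_csv(['editor_a', [1, 2, 3]], fh)
    # nested dict: values only, since write_key=False is passed down
    write_list_to_csv(['editor_b', {'2009': 10, '2010': 4}], fh)
    fh.close()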
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -29,21 +29,25 @@ |
30 | 30 | |
31 | 31 | |
32 | 32 | |
33 | | -def determine_remote_filesize(url, filename): |
| 33 | +def determine_remote_filesize(domain, filename): |
34 | 34 | ''' |
35 | | - @url is the full path of the file to be downloaded |
| 35 | + @domain is the domain of the server hosting the file |
36 | 36 | @filename is the name of the file to be downloaded |
37 | 37 | ''' |
38 | | - if url.startswith('http://'): |
39 | | - url = url[7:] |
40 | | - conn = httplib.HTTPConnection(url, 80) |
41 | | - conn.request('HEAD', filename) |
42 | | - res = conn.getresponse() |
43 | | - conn.close() |
44 | | - if res.status == 200: |
45 | | - return int(res.getheader('content-length', -1)) |
46 | | - else: |
47 | | - return - 1 |
| 38 | + try: |
| 39 | + if domain.startswith('http://'): |
| 40 | + domain = domain[7:] |
| 41 | + conn = httplib.HTTPConnection(domain) |
| 42 | + conn.request('HEAD', filename) |
| 43 | + res = conn.getresponse() |
| 44 | + conn.close() |
| 45 | + if res.status == 200: |
| 46 | + return int(res.getheader('content-length', -1)) |
| 47 | + else: |
| 48 | + return - 1 |
| 49 | + except httplib.socket.error: |
| 50 | + raise httplib.NotConnected('It seems that %s is temporarily unavailable, please try again later.' % domain) |
48 | 52 | |
49 | 53 | |
50 | 54 | def download_wiki_file(domain, path, filename, location, filemode, pbar): |
— | — | @@ -57,17 +61,12 @@ |
58 | 62 | @pbar is an instance of progressbar.ProgressBar() |
59 | 63 | ''' |
60 | 64 | chunk = 4096 |
61 | | - result = utils.check_file_exists(location, '') |
62 | | - if result == False: |
63 | | - utils.create_directory(os.path.join(location)) |
| 65 | + filesize = determine_remote_filesize(domain, path + filename) |
64 | 66 | if filemode == 'w': |
65 | 67 | fh = utils.create_txt_filehandle(location, filename, filemode, settings.ENCODING) |
66 | 68 | else: |
67 | 69 | fh = utils.create_binary_filehandle(location, filename, 'wb') |
68 | 70 | |
69 | | - filesize = determine_remote_filesize(domain, path + filename) |
70 | | - |
71 | | - |
72 | 71 | if filesize != -1 and pbar: |
73 | 72 | widgets = ['%s: ' % filename, progressbar.Percentage(), ' ', |
74 | 73 | progressbar.Bar(marker=progressbar.RotatingMarker()),' ', |
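determine_remote_filesize now expects the bare domain plus a request path; it returns -1 when the size cannot be determined and raises httplib.NotConnected when the host is unreachable. A usage sketch with hypothetical dump coordinates:

    domain = 'http://download.wikimedia.org'
    path = '/enwiki/latest/'
    filename = 'enwiki-latest-stub-meta-history.xml.gz'
    filesize = determine_remote_filesize(domain, path + filename)
    if filesize != -1:
        print 'Remote file is %s bytes' % filesize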
Index: trunk/tools/editor_trends/utils/models.py |
— | — | @@ -30,13 +30,14 @@ |
31 | 31 | for kw in kwargs: |
32 | 32 | setattr(self, kw, kwargs[kw]) |
33 | 33 | |
34 | | - def run(self): |
| 34 | + def start(self): |
35 | 35 | proc_name = self.name |
36 | 36 | kwargs = {} |
37 | 37 | IGNORE = ['input_queue', 'result_queue', 'target'] |
38 | 38 | for kw in self.__dict__: |
39 | 39 | if kw not in IGNORE and not kw.startswith('_'): |
40 | 40 | kwargs[kw] = getattr(self, kw) |
| 41 | + self._popen = True |
41 | 42 | self.target(self.input_queue, self.result_queue, **kwargs) |
42 | 43 | |
43 | 44 | |
— | — | @@ -50,11 +51,12 @@ |
51 | 52 | setattr(self, kw, kwargs[kw]) |
52 | 53 | |
53 | 54 | |
54 | | - def run(self): |
| 55 | + def start(self): |
55 | 56 | proc_name = self.name |
56 | 57 | kwargs = {} |
57 | 58 | IGNORE = ['result_queue', 'target'] |
58 | 59 | for kw in self.__dict__: |
59 | 60 | if kw not in IGNORE and not kw.startswith('_'): |
60 | 61 | kwargs[kw] = getattr(self, kw) |
| 62 | + self._popen = True |
61 | 63 | self.target(self.result_queue, **kwargs) |
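Renaming run to start makes these Process subclasses run their target synchronously: start() executes the target in the calling process instead of forking a child, and self._popen = True keeps multiprocessing's bookkeeping from treating the process as never started. A sketch of the resulting call pattern (the class and worker names here are assumed for illustration):

    p = ProcessInputQueue(target=worker, input_queue=tasks,
                          result_queue=results, name='worker-0')
    p.start()  # blocks until worker(tasks, results, **kwargs) returns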
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -81,7 +81,7 @@ |
82 | 82 | **kwargs) for i in xrange(nr_input_processors)] |
83 | 83 | |
84 | 84 | for input_process in input_processes: |
85 | | - input_process.run() |
| 85 | + input_process.start() |
86 | 86 | pids = [p.pid for p in input_processes] |
87 | 87 | kwargs['pids'] = pids |
88 | 88 | |
Index: trunk/tools/editor_trends/utils/sort.py |
— | — | @@ -105,6 +105,7 @@ |
106 | 106 | mongo = db.init_mongo_db(dbname) |
107 | 107 | collection = mongo['test'] |
108 | 108 | mongo.collection.ensure_index('editor') |
| 109 | + mongo.collection.create_index('editor') |
109 | 110 | editor_cache = cache.EditorCache(collection) |
110 | 111 | prev_contributor = -1 |
111 | 112 | x = 0 |
— | — | @@ -113,12 +114,15 @@ |
114 | 115 | for line in readline(fh): |
115 | 116 | if len(line) == 0: |
116 | 117 | continue |
117 | | - contributor = int(line[0]) |
| 118 | + contributor = int(line[0]) |
| 119 | + if contributor == 5767932: |
| 120 | + print 'debug' |
118 | 121 | if prev_contributor != contributor: |
119 | 122 | if edits >= 10: |
120 | 123 | result = editor_cache.add(prev_contributor, 'NEXT') |
121 | 124 | if result: |
122 | | - editors.add(contributor) |
| 125 | + editors.add(prev_contributor) |
| 126 | + result = None |
123 | 127 | x += 1 |
124 | 128 | print 'Stored %s editors' % x |
125 | 129 | else: |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -19,6 +19,8 @@ |
20 | 20 | |
21 | 21 | from multiprocessing import Queue |
22 | 22 | from Queue import Empty |
| 23 | +import datetime |
| 24 | +from dateutil.relativedelta import relativedelta |
23 | 25 | |
24 | 26 | import progressbar |
25 | 27 | |
— | — | @@ -67,7 +69,18 @@ |
68 | 70 | obs[var] = edits |
69 | 71 | return obs |
70 | 72 | |
| 73 | +def write_longitudinal_data(id, edits, fh): |
| 74 | + years = edits.keys() |
| 75 | + years.sort() |
| 76 | + for year in years: |
| 77 | + months = edits[year].keys() |
| 78 | + months = [int(m) for m in months] |
| 79 | + months.sort() |
| 80 | + for m in months: |
| 81 | + date = datetime.date(int(year), int(m), 1) |
| 82 | + fh.write('%s\t%s\t%s\n' % (id, date, edits[year][str(m)])) |
71 | 83 | |
| 84 | + |
72 | 85 | def expand_headers(headers, vars_to_expand, obs): |
73 | 86 | for var in vars_to_expand: |
74 | 87 | l = len(obs[var]) |
— | — | @@ -77,29 +90,105 @@ |
78 | 91 | suffix = 2001 + i |
79 | 92 | elif var.endswith('edits'): |
80 | 93 | suffix = 1 + i |
81 | | - headers.insert(pos+i, '%s_%s' % (var, suffix)) |
| 94 | + headers.insert(pos + i, '%s_%s' % (var, suffix)) |
82 | 95 | headers.remove(var) |
83 | 96 | return headers |
84 | | - |
85 | | - |
86 | | -def generate_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 97 | + |
| 98 | + |
| 99 | +def generate_long_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
87 | 100 | debug = kwargs.pop('debug') |
88 | 101 | dbname = kwargs.pop('dbname') |
89 | 102 | mongo = db.init_mongo_db(dbname) |
90 | 103 | editors = mongo['dataset'] |
91 | | - name = dbname + '_editors.csv' |
| 104 | + name = dbname + '_long_editors.csv' |
92 | 105 | fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
93 | 106 | x = 0 |
94 | | - vars_to_expand = ['edits', 'edits_by_year'] |
| 107 | + vars_to_expand = [] |
95 | 108 | while True: |
96 | 109 | try: |
| 110 | + id = input_queue.get(block=False) |
| 111 | + obs = editors.find_one({'editor': id}, {'monthly_edits': 1}) |
| 112 | + if x == 0: |
| 113 | + headers = obs.keys() |
| 114 | + headers.sort() |
| 115 | + headers = expand_headers(headers, vars_to_expand, obs) |
| 116 | + utils.write_list_to_csv(headers, fh) |
| 117 | + write_longitudinal_data(id, obs['monthly_edits'], fh) |
| 118 | + #utils.write_list_to_csv(data, fh) |
| 119 | + x += 1 |
| 120 | + except Empty: |
| 121 | + break |
| 122 | + |
| 123 | + |
| 124 | +def generate_cohort_analysis(input_queue, data_queue, pbar, **kwargs): |
| 125 | + dbname = kwargs.get('dbname') |
| 126 | + pbar = kwargs.get('pbar') |
| 127 | + mongo = db.init_mongo_db(dbname) |
| 128 | + editors = mongo['dataset'] |
| 129 | + year = datetime.datetime.now().year + 1 |
| 130 | + begin = year - 2001 |
| 131 | + p = [3, 6, 9] |
| 132 | + periods = [y * 12 for y in xrange(1, begin)] |
| 133 | + periods = p + periods |
| 134 | + data = {} |
| 135 | + while True: |
| 136 | + try: |
| 137 | + id = input_queue.get(block=False) |
| 138 | + obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 139 | + first_edit = obs['first_edit'] |
| 140 | + last_edit = obs['final_edit'] |
| 141 | + for y in xrange(2001, year): |
| 144 | + if y not in data: |
| 145 | + data[y] = {} |
| 146 | + data[y]['n'] = 0 |
| 147 | + window_end = datetime.datetime(y, 12, 31) |
| 148 | + if window_end > datetime.datetime.now(): |
| 149 | + now = datetime.datetime.now() |
| 150 | + m = now.month - 1 #Dump files are always lagging at least one month.... |
| 151 | + d = now.day |
| 152 | + window_end = datetime.datetime(y, m, d) |
| 153 | + edits = [] |
| 154 | + for period in periods: |
| 155 | + if period not in data[y]: |
| 156 | + data[y][period] = 0 |
| 157 | + window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
| 158 | + if window_start < datetime.datetime(2001, 1, 1): |
| 159 | + window_start = datetime.datetime(2001, 1, 1) |
| 160 | + if date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 161 | + edits.append(period) |
| 162 | + if edits != []: |
| 163 | + p = min(edits) |
| 164 | + data[y]['n'] += 1 |
| 165 | + data[y][p] += 1 |
| 166 | + #pbar.update(+1) |
| 167 | + except Empty: |
| 168 | + break |
| 169 | + utils.store_object(data, settings.BINARY_OBJECT_FILE_LOCATION, 'cohort_data') |
| 170 | + |
| 171 | +def date_falls_in_window(window_start, window_end, first_edit, last_edit): |
| 172 | + return window_start <= first_edit <= window_end |
| 176 | + |
| 177 | + |
| 178 | +def generate_wide_editor_dataset(input_queue, data_queue, pbar, **kwargs): |
| 179 | + dbname = kwargs.pop('dbname') |
| 180 | + mongo = db.init_mongo_db(dbname) |
| 181 | + editors = mongo['dataset'] |
| 182 | + name = dbname + '_wide_editors.csv' |
| 183 | + fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING) |
| 184 | + x = 0 |
| 185 | + vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
| 186 | + while True: |
| 187 | + try: |
97 | 188 | if debug: |
98 | 189 | id = u'99797' |
99 | 190 | else: |
100 | 191 | id = input_queue.get(block=False) |
101 | | - |
102 | 192 | print input_queue.qsize() |
103 | | - |
104 | 193 | obs = editors.find_one({'editor': id}) |
105 | 194 | obs = expand_observations(obs, vars_to_expand) |
106 | 195 | if x == 0: |
— | — | @@ -107,14 +196,12 @@ |
108 | 197 | headers.sort() |
109 | 198 | headers = expand_headers(headers, vars_to_expand, obs) |
110 | 199 | utils.write_list_to_csv(headers, fh) |
111 | | - fh.write('\n') |
112 | 200 | data = [] |
113 | 201 | keys = obs.keys() |
114 | 202 | keys.sort() |
115 | 203 | for key in keys: |
116 | 204 | data.append(obs[key]) |
117 | 205 | utils.write_list_to_csv(data, fh) |
118 | | - fh.write('\n') |
119 | 206 | |
120 | 207 | x += 1 |
121 | 208 | except Empty: |
— | — | @@ -141,20 +228,13 @@ |
142 | 229 | 'nr_output_processors': 1, |
143 | 230 | 'debug': False, |
144 | 231 | 'dbname': dbname, |
| 232 | + 'poison_pill': False, |
| 233 | + 'pbar': True |
145 | 234 | } |
146 | 235 | ids = retrieve_editor_ids_mongo(dbname, 'editors') |
147 | | - chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES) |
148 | | -# chunks = {} |
149 | | -# parts = int(round(float(len(ids)) / 1, 0)) |
150 | | -# a = 0 |
151 | | -# for x in xrange(settings.NUMBER_OF_PROCESSES): |
152 | | -# b = a + parts |
153 | | -# chunks[x] = ids[a:b] |
154 | | -# a = (x + 1) * parts |
155 | | -# if a >= len(ids): |
156 | | -# break |
157 | | -# |
158 | | - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs) |
| 236 | + ids = list(ids) |
| 237 | + chunks = {0: ids} |
| 238 | + pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
159 | 239 | |
160 | 240 | |
161 | 241 | def generate_editor_dataset_debug(dbname): |
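Each cohort window in generate_cohort_analysis ends on 31 December of year y and opens period months earlier; date_falls_in_window then decides whether the editor's first edit lands inside it. A worked example with hypothetical dates:

    import datetime
    from dateutil.relativedelta import relativedelta

    window_end = datetime.datetime(2009, 12, 31)
    window_start = window_end - relativedelta(months=6)  # 2009-06-30
    first_edit = datetime.datetime(2009, 8, 15)
    last_edit = datetime.datetime(2010, 2, 1)
    date_falls_in_window(window_start, window_end, first_edit, last_edit)
    # True: this editor counts towards the 6-month bucket of the 2009 cohort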
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -63,16 +63,17 @@ |
64 | 64 | def current_cache_size(self): |
65 | 65 | return sum([self.editors[k].get('obs', 0) for k in self.editors]) |
66 | 66 | |
| 67 | + def clear(self, key): |
| 68 | + if key in self.editors: |
| 69 | + del self.editors[key] |
| 70 | + |
67 | 71 | def add(self, key, value): |
68 | 72 | if value == 'NEXT': |
69 | | - for editor in self.treshold_editors: |
70 | | - self.insert (editor, self.editors[editor]['edits']) |
71 | | - self.n -= self.editors[editor]['obs'] |
72 | | - self.number_editors -= 1 |
73 | | - del self.editors[editor] |
74 | | - if key in self.editors: |
75 | | - del self.editors[key] |
76 | | - self.treshold_editors = set() |
| 73 | + result = self.insert(key, self.editors[key]['edits']) |
| 74 | + self.n -= self.editors[key]['obs'] |
| 75 | + self.number_editors -= 1 |
| 76 | + del self.editors[key] |
| 77 | + return result |
77 | 78 | else: |
78 | 79 | self.cumulative_n += 1 |
79 | 80 | self.n += 1 |
— | — | @@ -88,8 +89,8 @@ |
89 | 90 | self.editors[key]['edits'][year].append(value) |
90 | 91 | self.editors[key]['obs'] += 1 |
91 | 92 | |
92 | | - if self.editors[key]['obs'] == self.treshold: |
93 | | - self.treshold_editors.add(key) |
| 93 | + #if self.editors[key]['obs'] == self.treshold: |
| 94 | + # self.treshold_editors.add(key) |
94 | 95 | |
95 | 96 | def add_years(self, key): |
96 | 97 | now = datetime.datetime.now().year + 1 |
— | — | @@ -103,8 +104,9 @@ |
104 | 105 | def insert(self, editor, values): |
105 | 106 | try: |
106 | 107 | self.collection.insert({'editor': editor, 'edits': values}) |
| 108 | + return True |
107 | 109 | except: |
108 | | - pass |
| 110 | + return False |
109 | 111 | |
110 | 112 | def store(self): |
111 | 113 | utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__()) |
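With this change, 'NEXT' flushes exactly one editor: add(key, 'NEXT') inserts the cached edits for key into MongoDB, adjusts the counters, evicts the entry, and returns the True/False result of insert(), which the loop in sort.py checks before adding prev_contributor to its editors set. A usage sketch (collection follows the surrounding code; the edit value shape is assumed):

    import datetime

    cache = EditorCache(collection)
    cache.add('12345', {'date': datetime.datetime(2009, 1, 5), 'article': 42})
    result = cache.add('12345', 'NEXT')  # flush editor 12345 to MongoDB
    if result:
        editors.add('12345')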