Index: trunk/tools/editor_trends/manage.py
@@ -33,9 +33,9 @@
 from utils import utils
 from utils import dump_downloader
 from etl import chunker
-import map_wiki_editors
-import optimize_editors
-import construct_datasets
+from etl import extract
+from etl import optimize_editors
+from etl import construct_datasets
 import config
 
 
@@ -155,7 +155,7 @@
 
 def mongodb_script_launcher(args, location, filename, project, full_project, language_code, language):
     print 'mongodb_script_launcher'
-    map_wiki_editors.run_parse_editors(project, language_code, location)
+    extract.run_parse_editors(project, language_code, location)
 
 
 def sort_launcher(args, location, filename, project, full_project, language_code):
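
The manage.py hunks only reroute imports: map_wiki_editors.py has become etl/extract.py, and optimize_editors and construct_datasets now live in the etl package as well. Because manage.py sits at the project root it can import the moved modules directly, while the modules inside etl/ need the sys.path.append('..') shim added in the hunks below before import configuration resolves. A minimal sketch of the new call path, using only names that appear in this diff (Python 2, like the rest of the tree):

    # manage.py sits at trunk/tools/editor_trends/, so the etl package is
    # importable without any path manipulation.
    from etl import extract

    def mongodb_script_launcher(args, location, filename, project,
                                full_project, language_code, language):
        print 'mongodb_script_launcher'
        # Same entry point as before the rename, reached via its new module path.
        extract.run_parse_editors(project, language_code, location)
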
Index: trunk/tools/editor_trends/etl/optimize_editors.py
@@ -21,7 +21,9 @@
 from Queue import Empty
 from operator import itemgetter
 import datetime
+import sys
 
+sys.path.append('..')
 import configuration
 settings = configuration.Settings()
 from database import db
@@ -154,7 +156,7 @@
     }
     print len(ids)
     ids = list(ids)
-    chunks = dict(0, ids)
+    chunks = {0: ids}
     pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
 
 
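
Two separate things happen in optimize_editors.py. The first hunk adds the sys.path.append('..') shim so that import configuration still resolves now that the module lives one level below the project root. The second hunk fixes a real bug: dict(0, ids) is not a valid constructor call, because dict() accepts a mapping, an iterable of key/value pairs, or keyword arguments, never a key and a value as two positional arguments, so the old line raised a TypeError before build_scaffolding was ever reached. The dict literal builds the intended one-entry mapping from chunk number to the list of editor ids. A quick illustration (Python 2, as elsewhere in the tree):

    ids = ['editor_1', 'editor_2', 'editor_3']

    # Broken: raises "TypeError: dict expected at most 1 arguments, got 2".
    # chunks = dict(0, ids)

    # Fixed: a dict literal builds the single bucket, keyed by chunk number,
    # that pc.build_scaffolding consumes.
    chunks = {0: ids}
    print chunks[0]    # ['editor_1', 'editor_2', 'editor_3']
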
Index: trunk/tools/editor_trends/etl/construct_datasets.py
@@ -21,9 +21,10 @@
 from Queue import Empty
 import datetime
 from dateutil.relativedelta import *
-
+import sys
 import progressbar
 
+sys.path.append('..')
 import configuration
 settings = configuration.Settings()
 from utils import models, utils
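
construct_datasets.py gets the same import shim. Appending '..' to sys.path makes import configuration work only when the interpreter is started from inside the etl/ directory, because a relative sys.path entry is resolved against the current working directory, not against the file. A sketch of the shim together with a path-independent variant; the __file__-based line is an illustration of an alternative, not something this commit adds:

    import os
    import sys

    # What the commit does: rely on the process running inside etl/ so that
    # '..' points at the project root where configuration.py lives.
    sys.path.append('..')

    # Path-independent variant (illustration only): derive the parent
    # directory from this file's location instead of the working directory.
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

    import configuration
    settings = configuration.Settings()
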
Index: trunk/tools/editor_trends/etl/loader.py
@@ -17,8 +17,9 @@
 __date__ = '2010-11-16'
 __version__ = '0.1'
 
-
+import os
 import sys
+from Queue import Empty
 
 sys.path.append('..')
 import configuration
@@ -26,9 +27,11 @@
 from database import db
 from database import cache
 from utils import utils
+from utils import sort
 import process_constructor as pc
 
 
+
 def store_editors(input, filename, dbname):
     fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
     mongo = db.init_mongo_db(dbname)
@@ -77,17 +80,17 @@
     chunks = utils.split_list(files, int(x))
     '''1st iteration external mergesort'''
     for chunk in chunks:
-        filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
-        filename = merge_sorted_files(output, filehandles, chunk)
-        filehandles = [fh.close() for fh in filehandles]
+        #filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
+        #filename = sort.merge_sorted_files(output, filehandles, chunk)
+        #filehandles = [fh.close() for fh in filehandles]
        pass
     '''2nd iteration external mergesort, if necessary'''
     if len(chunks) > 1:
         files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
         filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files]
-        filename = merge_sorted_files(output, filehandles, 'final')
+        filename = sort.merge_sorted_files(output, filehandles, 'final')
         filehandles = [fh.close() for fh in filehandles]
-    filename = 'merged_final.txt'
+    filename = 'merged_final.txt'
     store_editors(output, filename, dbname)
 
 
@@ -102,8 +105,8 @@
             fh.close()
             data = [d.replace('\n', '') for d in data]
             data = [d.split('\t') for d in data]
-            sorted_data = mergesort(data)
-            write_sorted_file(sorted_data, file, output)
+            sorted_data = sort.mergesort(data)
+            sort.write_sorted_file(sorted_data, file, output)
         except Empty:
             break
 
@@ -136,5 +139,5 @@
     input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
     output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
     dbname = 'enwiki'
-    mergesort_launcher(input, output)
+    #mergesort_launcher(input, output)
     mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
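
The loader.py changes wire the new utils/sort module into a two-pass external mergesort: each chunk file is sorted in memory by the worker (sort.mergesort plus sort.write_sorted_file), the first merge pass over the chunk files is currently commented out, and a second pass via sort.merge_sorted_files runs only when more than one merged file exists. The sort helpers' internals are not part of this diff, so the following is a generic sketch of the same two-pass idea built on the standard library's heapq.merge, meant to illustrate the technique rather than the repository's implementation; all names and signatures below are assumptions:

    import heapq
    import os

    def merge_sorted_files(output_dir, paths, suffix):
        # k-way merge of already line-sorted text files into merged_<suffix>.txt.
        # heapq.merge streams its inputs, so memory use stays flat no matter
        # how large the chunk files are.
        target = os.path.join(output_dir, 'merged_%s.txt' % suffix)
        handles = [open(p) for p in paths]
        with open(target, 'w') as out:
            for line in heapq.merge(*handles):
                out.write(line)
        for handle in handles:
            handle.close()
        return target

    def external_mergesort(sorted_chunks, output_dir, group_size=10):
        '''1st pass merges groups of chunk files; a 2nd pass runs only if
        the 1st pass produced more than one file.'''
        groups = [sorted_chunks[i:i + group_size]
                  for i in range(0, len(sorted_chunks), group_size)]
        merged = [merge_sorted_files(output_dir, group, n)
                  for n, group in enumerate(groups)]
        if len(merged) > 1:
            return merge_sorted_files(output_dir, merged, 'final')
        return merged[0]
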
Index: trunk/tools/editor_trends/utils/utils.py
@@ -331,6 +331,8 @@
     all_files = os.listdir(location)
     files = []
     for file in all_files:
+        if file == 'merged_1.txt':
+            print 'debug'
         file = file.split('.')
         if len(file) == 1:
             continue
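
The only change to utils.retrieve_file_list is a temporary trace: when the directory listing reaches 'merged_1.txt' the function prints 'debug', presumably as an anchor while chasing the merge problem above. The visible lines show only the start of the extension filter (split on '.', skip names without an extension); the completion below, including how the mask argument is applied, is an assumed reconstruction rather than the repository's code:

    import os

    def retrieve_file_list(location, extension, mask=None):
        files = []
        for name in os.listdir(location):
            parts = name.split('.')
            if len(parts) == 1:
                # No extension at all, skip (this check is visible in the hunk).
                continue
            if parts[-1] != extension:
                continue
            if mask is not None and mask not in name:
                # Assumption: treat mask as a plain substring filter.
                continue
            files.append(name)
        return files
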
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -74,7 +74,7 @@
         else:
             result_queues[i] = False
 
-    if settings.progress_bar:
+    if settings.progressbar:
         size = sum([input_queues[q].qsize() for q in input_queues])
         pbar = progressbar.ProgressBar(maxval=size).start()
         kwargs['pbar'] = pbar
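
The Settings object evidently exposes the progress-bar switch as progressbar, so the old settings.progress_bar lookup raised an AttributeError whenever build_scaffolding ran with the bar enabled; the hunk simply corrects the attribute name. The surrounding lines size the bar from the summed backlog of the per-process input queues using the classic progressbar package API (ProgressBar(maxval=...), start(), update(), finish()). A small, self-contained sketch of that usage; the fixed total stands in for the qsize() sum, which is approximate and not implemented on every platform:

    import progressbar

    size = 250    # stand-in for sum([input_queues[q].qsize() for q in input_queues])
    pbar = progressbar.ProgressBar(maxval=size).start()
    for done in range(size):
        # update() expects the absolute amount of completed work, not a delta.
        pbar.update(done + 1)
    pbar.finish()
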
Index: trunk/tools/editor_trends/utils/sort.py
@@ -28,9 +28,17 @@
 from multiprocessing import Queue
 from Queue import Empty
 import datetime
+import sys
 
+sys.path.append('..')
+import configuration
+settings = configuration.Settings()
 
 
+import utils
+
+
+
 def quick_sort(obs):
     if obs == []:
         return []
@@ -68,6 +76,7 @@
 
 def readline(file):
     for line in file:
+        print file.stream.name
         line = line.replace('\n', '')
         if line == '':
             continue
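
sort.py picks up the same configuration boilerplate as the etl modules, imports its sibling utils module, and gains one temporary trace: readline() now prints file.stream.name for every line it reads, which suggests the handles passed in are codecs wrappers (codecs.open() returns a StreamReaderWriter whose underlying file object is exposed as .stream). Only the top of readline() appears in the hunk; the sketch below completes it as a generator that strips newlines and skips blank lines, with the debug print left out, so the yield and the usage comment are assumptions:

    import codecs

    def readline(fh):
        # Walk a text file handle, dropping the trailing newline and skipping
        # empty lines; works with plain files and codecs-wrapped handles alike.
        for line in fh:
            line = line.replace('\n', '')
            if line == '':
                continue
            yield line

    # Assumed usage with a codecs handle such as utils.create_txt_filehandle
    # appears to return:
    #   fh = codecs.open('merged_final.txt', 'r', 'utf-8')
    #   for line in readline(fh):
    #       ...
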