Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -13,7 +13,7 @@
 '''
 
 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
+__email__ = 'dvanliere at gmail dot com'
 __date__ = '2010-10-21'
 __version__ = '0.1'
 
@@ -23,33 +23,91 @@
 import httplib
 import multiprocessing
 import progressbar
+from HTMLParser import HTMLParser
 
+sys.path.append('..')
+#print sys.path
 import configuration
 settings = configuration.Settings()
 import utils
+import log
 
 
-def launcher(config):
-    tasks = create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename)
-    consumers = [multiprocessing.Process(target=download_wiki_file, args=(tasks, config)) for i in xrange(settings.number_of_processes)]
+class AnchorParser(HTMLParser):
+    '''
+    A simple HTML parser that takes an HTML directory listing and extracts the
+    directories.
+    '''
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.directories = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'a':
+            for key, value in attrs:
+                if key == 'href':
+                    self.directories.append(value)
+                    #print value
+
+
+def launcher(properties, settings, logger):
+    print 'Creating list of files to be downloaded...'
+    tasks = create_list_dumpfiles(settings.wp_dump_location,
+                                  properties.path,
+                                  properties.filename)
+    consumers = [multiprocessing.Process(target=download_wiki_file,
+                                         args=(tasks, properties))
+                 for i in xrange(settings.number_of_processes)]
+
+    print 'Starting consumers to download files...'
     for w in consumers:
         w.start()
 
     tasks.join()
 
 
+def read_data_from_http_connection(domain, path):
+    if not domain.startswith('http://'):
+        domain = 'http://%s' % domain
+    url = '%s/%s' % (domain, path)
+    data = None
+    try:
+        req = urllib2.Request(url)
+        response = urllib2.urlopen(req)
+        data = response.read()
+    except urllib2.HTTPError, error:
+        print 'Error: %s' % error
+    except urllib2.URLError, error:
+        print 'Reason: %s' % error
+
+    return data
+
+
+def read_directory_contents(domain, path):
+    parser = AnchorParser()
+    data = read_data_from_http_connection(domain, path)
+    parser.feed(data)
+    return parser.directories
+
+
+def retrieve_md5_hashes(domain, project, date):
+    path = '%s/%s/%s-%s-md5sums.txt' % (project, date, project, date)
+    data = read_data_from_http_connection(domain, path)
+    return data
+
+
 def create_list_dumpfiles(domain, path, filename):
     '''
-    Wikipedia offers the option to download one dump file in separate batches.
+    Wikipedia offers the option to download one dump file in separate pieces.
     This function determines how many files there are for a giving dump and puts
     them in a queue.
     '''
     task_queue = multiprocessing.JoinableQueue()
-    ext = utils.determine_file_extension(filename)
-    canonical_filename = utils.determine_canonical_name(filename)
+    ext = file_utils.determine_file_extension(filename)
+    canonical_filename = file_utils.determine_canonical_name(filename)
     for x in xrange(1, 100):
         f = '%s%s.xml.%s' % (canonical_filename, x, ext)
-        res = check_remote_file_exists(domain, path, f)
+        res = check_remote_path_exists(domain, path, f)
         if res == None or res.status != 200:
             if x == 1:
                 task_queue.put(filename)
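
For reference, a rough sketch of how the new AnchorParser / read_directory_contents helpers could be driven once this module is importable; the module name dump_downloader, the mirror host and the project path are illustrative assumptions, and the code targets the same Python 2 environment as the patch:

import dump_downloader

# Collect the href targets from the HTML directory listing of a dump mirror.
directories = dump_downloader.read_directory_contents('download.wikimedia.org',
                                                      'enwikinews')
for directory in directories:
    print directory    # e.g. '20101021/', 'latest/', ...

AnchorParser records every href it sees, so a caller that only wants dump dates still has to filter out entries such as '../' or 'latest/'.
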
@@ -62,7 +120,8 @@
     return task_queue
 
 
-def check_remote_file_exists(domain, path, filename):
+
+def check_remote_path_exists(domain, path, filename):
     '''
     @path is the full path of the file to be downloaded
     @filename is the name of the file to be downloaded
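
The queue that create_list_dumpfiles returns is drained by the worker processes started in launcher; below is a self-contained sketch of that JoinableQueue / poison-pill pattern, with placeholder filenames and a trivial worker standing in for download_wiki_file:

import multiprocessing

def worker(task_queue):
    # Pull filenames until the None sentinel (the 'poison pill') arrives.
    while True:
        filename = task_queue.get()
        task_queue.task_done()
        if filename is None:
            break
        print 'would download %s here' % filename

if __name__ == '__main__':
    number_of_processes = 2
    queue = multiprocessing.JoinableQueue()
    for piece in ['dummy-pages1.xml.gz', 'dummy-pages2.xml.gz']:
        queue.put(piece)
    for i in xrange(number_of_processes):
        queue.put(None)                     # one pill per consumer
    consumers = [multiprocessing.Process(target=worker, args=(queue,))
                 for i in xrange(number_of_processes)]
    for w in consumers:
        w.start()
    queue.join()                            # returns once every task_done() has been called

Because download_wiki_file calls task_queue.get(block=False), all tasks and pills must already be on the queue before the consumers start; launcher guarantees that by building the complete queue first.
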
@@ -71,18 +130,22 @@
         if domain.startswith('http://'):
             domain = domain[7:]
         conn = httplib.HTTPConnection(domain)
-        url = '%s%s' % (path, filename)
+        if filename != None:
+            url = '%s%s' % (path, filename)
+        else:
+            url = '%s' % path
         conn.request('HEAD', url)
         res = conn.getresponse()
         conn.close()
         return res
 
     except httplib.socket.error:
-        raise httplib.NotConnected('It seems that %s is temporarily unavailable, please try again later.' % url)
+        raise httplib.NotConnected('It seems that %s is temporarily '
+                                   'unavailable, please try again later.' % url)
 
 
 def determine_remote_filesize(domain, path, filename):
-    res = check_remote_file_exists(domain, path, filename)
+    res = check_remote_path_exists(domain, path, filename)
     if res != None and res.status == 200:
         return int(res.getheader('content-length', -1))
     else:
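
check_remote_path_exists and determine_remote_filesize boil down to a single HEAD request; a minimal stand-alone illustration of that probe, with an illustrative host and file path:

import httplib

conn = httplib.HTTPConnection('download.wikimedia.org')
conn.request('HEAD', '/enwiki/latest/enwiki-latest-page_props.sql.gz')
res = conn.getresponse()
conn.close()

print res.status                                  # 200 when the path exists
print int(res.getheader('content-length', -1))    # remote size in bytes, -1 if the header is missing
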
@@ -94,27 +157,24 @@
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
     '''
-    chunk = 4096
+    success = True
+    chunk = 1024 * 4
     while True:
         filename = task_queue.get(block=False)
         task_queue.task_done()
         if filename == None:
             print 'Swallowed a poison pill'
             break
-        filename = 'zhwiki-latest-page_props.sql.gz'
-        extension = utils.determine_file_extension(filename)
-        filemode = utils.determine_file_mode(extension)
+        extension = file_utils.determine_file_extension(filename)
+        filemode = file_utils.determine_file_mode(extension)
         filesize = determine_remote_filesize(settings.wp_dump_location, config.path, filename)
         if filemode == 'w':
-            fh = utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
+            fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
         else:
-            fh = utils.create_binary_filehandle(config.location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')
 
         if filesize != -1:
-            widgets = ['%s: ' % filename, progressbar.Percentage(), ' ',
-                       progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
-                       progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
-
+            widgets = log.init_progressbar_widgets(filename)
             pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
 
         try:
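
The try block that follows is unchanged by this patch and therefore not shown in full; it presumably streams the file chunk by chunk while updating the progress bar. A self-contained sketch of such a loop, with inline widgets instead of log.init_progressbar_widgets; the URL, the output filename and the assumption that the server reports Content-Length are illustrative:

import urllib2
import progressbar

url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-page_props.sql.gz'
chunk = 1024 * 4

response = urllib2.urlopen(url)
# Assumes the server reports Content-Length; otherwise maxval would be -1.
filesize = int(response.info().getheader('Content-Length', -1))

widgets = ['download: ', progressbar.Percentage(), ' ',
           progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
           progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()

fh = open('enwiki-latest-page_props.sql.gz', 'wb')
bytes_read = 0
while True:
    data = response.read(chunk)
    if not data:
        break
    fh.write(data)
    bytes_read += len(data)
    pbar.update(bytes_read)
fh.close()
pbar.finish()
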
@@ -137,11 +197,22 @@
 
         except urllib2.URLError, error:
             print 'Reason: %s' % error
+            success = False
         except urllib2.HTTPError, error:
             print 'Error: %s' % error
+            success = False
         finally:
             fh.close()
 
+    return success
 
+
 if __name__ == '__main__':
-    download_wp_dump('http://download.wikimedia.org/enwiki/latest', 'enwiki-latest-page_props.sql.gz', settings.input_location)
+    domain = 'download.wikimedia.org'
+    path = 'enwikinews'
+    filename = None
+    #check_remote_path_exists(domain, path, filename)
+    #read_directory_contents(domain, path)
+#    download_wp_dump('http://download.wikimedia.org/enwiki/latest',
+#                     'enwiki-latest-page_props.sql.gz',
+#                     settings.input_location)
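
retrieve_md5_hashes fetches the per-dump md5sums.txt listing, but the patch does not yet parse or apply it. One possible follow-up, sketched here with hashlib and entirely hypothetical helper names, is to map filenames to checksums and verify each downloaded piece:

import hashlib

def parse_md5_hashes(data):
    # Each line of an md5sums file reads '<md5 hex digest>  <filename>'.
    hashes = {}
    for line in data.splitlines():
        parts = line.split()
        if len(parts) == 2:
            hashes[parts[1]] = parts[0]
    return hashes

def verify_file(path, expected_md5, chunk=1024 * 4):
    # Hash the downloaded file in chunks and compare against the published digest.
    md5 = hashlib.md5()
    fh = open(path, 'rb')
    while True:
        data = fh.read(chunk)
        if not data:
            break
        md5.update(data)
    fh.close()
    return md5.hexdigest() == expected_md5
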