r80714 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r80713 | r80714 | r80715 >
Date: 22:09, 21 January 2011
Author: diederik
Status: deferred
Tags:
Comment:
Commit before refactoring code
Modified paths:
  • /trunk/tools/editor_trends/utils/dump_downloader.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -13,7 +13,7 @@
 '''

 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
+__email__ = 'dvanliere at gmail dot com'
 __date__ = '2010-10-21'
 __version__ = '0.1'

@@ -23,33 +23,91 @@
 import httplib
 import multiprocessing
 import progressbar
+from HTMLParser import HTMLParser

+sys.path.append('..')
+#print sys.path
 import configuration
 settings = configuration.Settings()
 import utils
+import log


-def launcher(config):
-    tasks = create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename)
-    consumers = [multiprocessing.Process(target=download_wiki_file, args=(tasks, config)) for i in xrange(settings.number_of_processes)]
+class AnchorParser(HTMLParser):
+    '''
+    A simple HTML parser that takes an HTML directory listing and extracts the
+    directories.
+    '''
+    def __init__(self,):
+        HTMLParser.__init__(self)
+        self.directories = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'a':
+            for key, value in attrs:
+                if key == 'href':
+                    self.directories.append(value)
+                    #print value
+
+
+def launcher(properties, settings, logger):
+    print 'Creating list of files to be downloaded...'
+    tasks = create_list_dumpfiles(settings.wp_dump_location,
+                                  properties.path,
+                                  properties.filename)
+    consumers = [multiprocessing.Process(target=download_wiki_file,
+                                         args=(tasks, properties))
+                 for i in xrange(settings.number_of_processes)]
+
+    print 'Starting consumers to download files...'
     for w in consumers:
         w.start()

     tasks.join()


+def read_data_from_http_connection(domain, path):
+    if not domain.startswith('http://'):
+        domain = 'http://%s' % domain
+    url = '%s/%s' % (domain, path)
+
+    try:
+        req = urllib2.Request(url)
+        response = urllib2.urlopen(req)
+        data = response.read()
+    except urllib2.URLError, error:
+        print 'Reason: %s' % error
+    except urllib2.HTTPError, error:
+        print 'Error: %s' % error
+
+    return data
+
+
+def read_directory_contents(domain, path):
+    parser = AnchorParser()
+    data = read_data_from_http_connection(domain, path)
+    parser.feed(data)
+    return parser.directories
+
+
+def retrieve_md5_hashes(domain, project, date):
+    path = '%s/%s/%s-%s-md5sums.txt' % (project, date, project, date)
+    data = read_data_from_http_connection(domain, path)
+
+
+
 def create_list_dumpfiles(domain, path, filename):
     '''
-    Wikipedia offers the option to download one dump file in separate batches.
+    Wikipedia offers the option to download one dump file in separate pieces.
     This function determines how many files there are for a given dump and puts
     them in a queue.
     '''
     task_queue = multiprocessing.JoinableQueue()
-    ext = utils.determine_file_extension(filename)
-    canonical_filename = utils.determine_canonical_name(filename)
+    ext = file_utils.determine_file_extension(filename)
+    canonical_filename = file_utils.determine_canonical_name(filename)
     for x in xrange(1, 100):
         f = '%s%s.xml.%s' % (canonical_filename, x, ext)
-        res = check_remote_file_exists(domain, path, f)
+        res = check_remote_path_exists(domain, path, f)
         if res == None or res.status != 200:
             if x == 1:
                 task_queue.put(filename)
@@ -62,7 +120,8 @@
     return task_queue


-def check_remote_file_exists(domain, path, filename):
+
+def check_remote_path_exists(domain, path, filename):
     '''
     @path is the full path of the file to be downloaded
     @filename is the name of the file to be downloaded
@@ -71,18 +130,22 @@
         if domain.startswith('http://'):
             domain = domain[7:]
         conn = httplib.HTTPConnection(domain)
-        url = '%s%s' % (path, filename)
+        if filename != None:
+            url = '%s%s' % (path, filename)
+        else:
+            url = '%s' % path
         conn.request('HEAD', url)
         res = conn.getresponse()
         conn.close()
         return res

     except httplib.socket.error:
-        raise httplib.NotConnected('It seems that %s is temporarily unavailable, please try again later.' % url)
+        raise httplib.NotConnected('It seems that %s is temporarily \
+            unavailable, please try again later.' % url)


 def determine_remote_filesize(domain, path, filename):
-    res = check_remote_file_exists(domain, path, filename)
+    res = check_remote_path_exists(domain, path, filename)
     if res != None and res.status == 200:
         return int(res.getheader('content-length', -1))
     else:
@@ -94,27 +157,24 @@
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
     '''
-    chunk = 4096
+    success = True
+    chunk = 1024 * 4
     while True:
         filename = task_queue.get(block=False)
         task_queue.task_done()
         if filename == None:
             print 'Swallowed a poison pill'
             break
-        filename = 'zhwiki-latest-page_props.sql.gz'
-        extension = utils.determine_file_extension(filename)
-        filemode = utils.determine_file_mode(extension)
+        extension = file_utils.determine_file_extension(filename)
+        filemode = file_utils.determine_file_mode(extension)
         filesize = determine_remote_filesize(settings.wp_dump_location, config.path, filename)
         if filemode == 'w':
-            fh = utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
+            fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
         else:
-            fh = utils.create_binary_filehandle(config.location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')

         if filesize != -1:
-            widgets = ['%s: ' % filename, progressbar.Percentage(), ' ',
-                       progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
-                       progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
-
+            widgets = log.init_progressbar_widgets(filename)
             pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()

         try:
@@ -137,11 +197,22 @@

         except urllib2.URLError, error:
             print 'Reason: %s' % error
+            success = False
         except urllib2.HTTPError, error:
             print 'Error: %s' % error
+            success = False
         finally:
             fh.close()

+    return success


 if __name__ == '__main__':
-    download_wp_dump('http://download.wikimedia.org/enwiki/latest', 'enwiki-latest-page_props.sql.gz', settings.input_location)
+    domain = 'download.wikimedia.org'
+    path = 'enwikinews'
+    filename = None
+    #check_remote_path_exists(domain, path, filename)
+    #read_directory_contents(domain, path)
+#    download_wp_dump('http://download.wikimedia.org/enwiki/latest',
+#                     'enwiki-latest-page_props.sql.gz',
+#                     settings.input_location)
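
Note on the new directory-listing code: AnchorParser plus read_directory_contents() let the downloader discover which files a dump mirror actually serves by scraping the anchors out of the server's HTML directory index, instead of guessing filenames. A minimal self-contained sketch of that flow (not part of the commit; Python 2, stdlib only, with the host and path borrowed from the commit's __main__ block):

    # Sketch: scrape a dump directory listing with an AnchorParser.
    from HTMLParser import HTMLParser
    import urllib2

    class AnchorParser(HTMLParser):
        '''Collect the href target of every <a> tag in the page.'''
        def __init__(self):
            HTMLParser.__init__(self)
            self.directories = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for key, value in attrs:
                    if key == 'href':
                        self.directories.append(value)

    def read_directory_contents(domain, path):
        '''Return all anchors found in http://<domain>/<path>.'''
        parser = AnchorParser()
        parser.feed(urllib2.urlopen('http://%s/%s' % (domain, path)).read())
        return parser.directories

    if __name__ == '__main__':
        # Output depends entirely on what the server returns.
        print read_directory_contents('download.wikimedia.org', 'enwikinews')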

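Note on piece enumeration: create_list_dumpfiles() exploits the fact that large dumps are published as numbered pieces (canonical_name1.xml.gz, canonical_name2.xml.gz, ...). It issues one HEAD request per candidate name and stops at the first non-200 answer; if even piece 1 is missing, it falls back to enqueuing the unnumbered filename. A sketch of that probe under the same assumptions (not part of the commit; Python 2 httplib, helper names and the example path are illustrative):

    # Sketch: enumerate numbered dump pieces via HEAD requests.
    import httplib

    def remote_status(domain, url):
        '''Return the HTTP status code of a HEAD request for url on domain.'''
        conn = httplib.HTTPConnection(domain)
        conn.request('HEAD', url)
        res = conn.getresponse()
        conn.close()
        return res.status

    def list_pieces(domain, path, canonical_name, ext, maximum=100):
        '''Collect piece filenames until the server stops answering 200.

        path should include leading and trailing slashes, e.g. '/enwiki/latest/'.
        '''
        pieces = []
        for x in xrange(1, maximum):
            f = '%s%s.xml.%s' % (canonical_name, x, ext)
            if remote_status(domain, path + f) != 200:
                break
            pieces.append(f)
        return pieces
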
Status & tagging log