Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -13,7 +13,7 @@
 '''
 
 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
-__author__email = 'dvanliere at gmail dot com'
+__email__ = 'dvanliere at gmail dot com'
 __date__ = '2010-10-21'
 __version__ = '0.1'
 
@@ -23,33 +23,91 @@
 import httplib
 import multiprocessing
 import progressbar
+from HTMLParser import HTMLParser
 
+sys.path.append('..')
+#print sys.path
 import configuration
 settings = configuration.Settings()
 import utils
+import log
 
 
-def launcher(config):
-    tasks = create_list_dumpfiles(settings.wp_dump_location, config.path, config.filename)
-    consumers = [multiprocessing.Process(target=download_wiki_file, args=(tasks, config)) for i in xrange(settings.number_of_processes)]
+class AnchorParser(HTMLParser):
+    '''
+    A simple HTML parser that takes an HTML directory listing and extracts the
+    directories.
+    '''
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.directories = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'a':
+            for key, value in attrs:
+                if key == 'href':
+                    self.directories.append(value)
+                    #print value
+
+
+def launcher(properties, settings, logger):
+    print 'Creating list of files to be downloaded...'
+    tasks = create_list_dumpfiles(settings.wp_dump_location,
+                                  properties.path,
+                                  properties.filename)
+    consumers = [multiprocessing.Process(target=download_wiki_file,
+                                         args=(tasks, properties))
+                 for i in xrange(settings.number_of_processes)]
+
+    print 'Starting consumers to download files...'
     for w in consumers:
         w.start()
 
     tasks.join()
 
 
+def read_data_from_http_connection(domain, path):
+    if not domain.startswith('http://'):
+        domain = 'http://%s' % domain
+    url = '%s/%s' % (domain, path)
+    data = None
+    try:
+        req = urllib2.Request(url)
+        response = urllib2.urlopen(req)
+        data = response.read()
+    except urllib2.HTTPError, error:
+        print 'Error: %s' % error
+    except urllib2.URLError, error:
+        print 'Reason: %s' % error
+
+    return data
+
+
+def read_directory_contents(domain, path):
+    parser = AnchorParser()
+    data = read_data_from_http_connection(domain, path)
+    parser.feed(data)
+    return parser.directories
+
+
+def retrieve_md5_hashes(domain, project, date):
+    path = '%s/%s/%s-%s-md5sums.txt' % (project, date, project, date)
+    data = read_data_from_http_connection(domain, path)
+    return data
+
+
 def create_list_dumpfiles(domain, path, filename):
     '''
-    Wikipedia offers the option to download one dump file in separate batches.
+    Wikipedia offers the option to download one dump file in separate pieces.
     This function determines how many files there are for a giving dump and puts
     them in a queue.
     '''
     task_queue = multiprocessing.JoinableQueue()
-    ext = utils.determine_file_extension(filename)
-    canonical_filename = utils.determine_canonical_name(filename)
+    ext = file_utils.determine_file_extension(filename)
+    canonical_filename = file_utils.determine_canonical_name(filename)
     for x in xrange(1, 100):
         f = '%s%s.xml.%s' % (canonical_filename, x, ext)
-        res = check_remote_file_exists(domain, path, f)
+        res = check_remote_path_exists(domain, path, f)
         if res == None or res.status != 200:
             if x == 1:
                 task_queue.put(filename)
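
For reference, a rough sketch of how the new AnchorParser / read_directory_contents helpers could be driven once this module is importable; the module name dump_downloader, the mirror host and the project path are illustrative assumptions, and the code targets the same Python 2 environment as the patch:

import dump_downloader

# Collect the href targets from the HTML directory listing of a dump mirror.
directories = dump_downloader.read_directory_contents('download.wikimedia.org',
                                                      'enwikinews')
for directory in directories:
    print directory    # e.g. '20101021/', 'latest/', ...

AnchorParser records every href it sees, so a caller that only wants dump dates still has to filter out entries such as '../' or 'latest/'.
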
@@ -62,7 +120,8 @@
     return task_queue
 
 
-def check_remote_file_exists(domain, path, filename):
+
+def check_remote_path_exists(domain, path, filename):
     '''
     @path is the full path of the file to be downloaded
     @filename is the name of the file to be downloaded
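
The queue that create_list_dumpfiles returns is drained by the worker processes started in launcher; below is a self-contained sketch of that JoinableQueue / poison-pill pattern, with placeholder filenames and a trivial worker standing in for download_wiki_file:

import multiprocessing

def worker(task_queue):
    # Pull filenames until the None sentinel (the 'poison pill') arrives.
    while True:
        filename = task_queue.get()
        task_queue.task_done()
        if filename is None:
            break
        print 'would download %s here' % filename

if __name__ == '__main__':
    number_of_processes = 2
    queue = multiprocessing.JoinableQueue()
    for piece in ['dummy-pages1.xml.gz', 'dummy-pages2.xml.gz']:
        queue.put(piece)
    for i in xrange(number_of_processes):
        queue.put(None)                     # one pill per consumer
    consumers = [multiprocessing.Process(target=worker, args=(queue,))
                 for i in xrange(number_of_processes)]
    for w in consumers:
        w.start()
    queue.join()                            # returns once every task_done() has been called

Because download_wiki_file calls task_queue.get(block=False), all tasks and pills must already be on the queue before the consumers start; launcher guarantees that by building the complete queue first.
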
@@ -71,18 +130,22 @@
         if domain.startswith('http://'):
             domain = domain[7:]
         conn = httplib.HTTPConnection(domain)
-        url = '%s%s' % (path, filename)
+        if filename != None:
+            url = '%s%s' % (path, filename)
+        else:
+            url = '%s' % path
         conn.request('HEAD', url)
         res = conn.getresponse()
         conn.close()
         return res
 
     except httplib.socket.error:
-        raise httplib.NotConnected('It seems that %s is temporarily unavailable, please try again later.' % url)
+        raise httplib.NotConnected('It seems that %s is temporarily '
+                                   'unavailable, please try again later.' % url)
 
 
 def determine_remote_filesize(domain, path, filename):
-    res = check_remote_file_exists(domain, path, filename)
+    res = check_remote_path_exists(domain, path, filename)
     if res != None and res.status == 200:
         return int(res.getheader('content-length', -1))
     else:
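
check_remote_path_exists and determine_remote_filesize boil down to a single HEAD request; a minimal stand-alone illustration of that probe, with an illustrative host and file path:

import httplib

conn = httplib.HTTPConnection('download.wikimedia.org')
conn.request('HEAD', '/enwiki/latest/enwiki-latest-page_props.sql.gz')
res = conn.getresponse()
conn.close()

print res.status                                  # 200 when the path exists
print int(res.getheader('content-length', -1))    # remote size in bytes, -1 if the header is missing
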
@@ -94,27 +157,24 @@
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
     '''
-    chunk = 4096
+    success = True
+    chunk = 1024 * 4
     while True:
         filename = task_queue.get(block=False)
         task_queue.task_done()
         if filename == None:
             print 'Swallowed a poison pill'
             break
-        filename = 'zhwiki-latest-page_props.sql.gz'
-        extension = utils.determine_file_extension(filename)
-        filemode = utils.determine_file_mode(extension)
+        extension = file_utils.determine_file_extension(filename)
+        filemode = file_utils.determine_file_mode(extension)
         filesize = determine_remote_filesize(settings.wp_dump_location, config.path, filename)
         if filemode == 'w':
-            fh = utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
+            fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
         else:
-            fh = utils.create_binary_filehandle(config.location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')
 
         if filesize != -1:
-            widgets = ['%s: ' % filename, progressbar.Percentage(), ' ',
-                       progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
-                       progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
-
+            widgets = log.init_progressbar_widgets(filename)
             pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
 
         try:
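
The try block that follows is unchanged by this patch and therefore not shown in full; it presumably streams the file chunk by chunk while updating the progress bar. A self-contained sketch of such a loop, with inline widgets instead of log.init_progressbar_widgets; the URL, the output filename and the assumption that the server reports Content-Length are illustrative:

import urllib2
import progressbar

url = 'http://download.wikimedia.org/enwiki/latest/enwiki-latest-page_props.sql.gz'
chunk = 1024 * 4

response = urllib2.urlopen(url)
# Assumes the server reports Content-Length; otherwise maxval would be -1.
filesize = int(response.info().getheader('Content-Length', -1))

widgets = ['download: ', progressbar.Percentage(), ' ',
           progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
           progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()

fh = open('enwiki-latest-page_props.sql.gz', 'wb')
bytes_read = 0
while True:
    data = response.read(chunk)
    if not data:
        break
    fh.write(data)
    bytes_read += len(data)
    pbar.update(bytes_read)
fh.close()
pbar.finish()
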
@@ -137,11 +197,22 @@
 
         except urllib2.URLError, error:
             print 'Reason: %s' % error
+            success = False
         except urllib2.HTTPError, error:
             print 'Error: %s' % error
+            success = False
         finally:
             fh.close()
 
+    return success
 
+
 if __name__ == '__main__':
-    download_wp_dump('http://download.wikimedia.org/enwiki/latest', 'enwiki-latest-page_props.sql.gz', settings.input_location)
+    domain = 'download.wikimedia.org'
+    path = 'enwikinews'
+    filename = None
+    #check_remote_path_exists(domain, path, filename)
+    #read_directory_contents(domain, path)
+#    download_wp_dump('http://download.wikimedia.org/enwiki/latest',
+#                     'enwiki-latest-page_props.sql.gz',
+#                     settings.input_location)
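
retrieve_md5_hashes fetches the per-dump md5sums.txt listing, but the patch does not yet parse or apply it. One possible follow-up, sketched here with hashlib and entirely hypothetical helper names, is to map filenames to checksums and verify each downloaded piece:

import hashlib

def parse_md5_hashes(data):
    # Each line of an md5sums file reads '<md5 hex digest>  <filename>'.
    hashes = {}
    for line in data.splitlines():
        parts = line.split()
        if len(parts) == 2:
            hashes[parts[1]] = parts[0]
    return hashes

def verify_file(path, expected_md5, chunk=1024 * 4):
    # Hash the downloaded file in chunks and compare against the published digest.
    md5 = hashlib.md5()
    fh = open(path, 'rb')
    while True:
        data = fh.read(chunk)
        if not data:
            break
        md5.update(data)
    fh.close()
    return md5.hexdigest() == expected_md5
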