r80734 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r80733 | r80734 | r80735 >
Date:23:59, 21 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
A new helper file for http connections.
Modified paths:
  • /trunk/tools/editor_trends/utils/http_utils.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/utils/http_utils.py
@@ -0,0 +1,128 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+import os
 22+import sys
 23+import urllib2
 24+import httplib
 25+import multiprocessing
 26+
 27+
 28+sys.path.append('..')
 29+import configuration
 30+settings = configuration.Settings()
 31+import file_utils
 32+import log
 33+
 34+
 35+
 36+
 37+def read_data_from_http_connection(domain, path):
 38+ if not domain.startswith('http://'):
 39+ domain = 'http://%s' % domain
 40+ url = '%s/%s' % (domain, path)
 41+
 42+ try:
 43+ req = urllib2.Request(url)
 44+ response = urllib2.urlopen(req)
 45+ data = response.read()
 46+ except urllib2.URLError, error:
 47+ print 'Reason: %s' % error
 48+ except urllib2.HTTPError, error:
 49+ print 'Error: %s' % error
 50+
 51+ return data
 52+
 53+
 54+
def retrieve_md5_hashes(domain, project, date):
    '''
    Download the md5sums manifest for a dump of @project taken on @date.

    NOTE: implementation not yet finished -- the fetched data is currently
    discarded.
    '''
    path = '%(p)s/%(d)s/%(p)s-%(d)s-md5sums.txt' % {'p': project, 'd': date}
    data = read_data_from_http_connection(domain, path)
 59+
 60+
 61+def create_list_dumpfiles(domain, path, filename):
 62+ '''
 63+ Wikipedia offers the option to download one dump file in separate pieces.
 64+ This function determines how many files there are for a giving dump and puts
 65+ them in a queue.
 66+ '''
 67+ task_queue = multiprocessing.JoinableQueue()
 68+ ext = file_utils.determine_file_extension(filename)
 69+ canonical_filename = file_utils.determine_canonical_name(filename)
 70+ for x in xrange(1, 100):
 71+ f = '%s%s.xml.%s' % (canonical_filename, x, ext)
 72+ res = check_remote_path_exists(domain, path, f)
 73+ if res == None or res.status != 200:
 74+ if x == 1:
 75+ task_queue.put(filename)
 76+ break
 77+ else:
 78+ print 'Added chunk to download: %s' % f
 79+ task_queue.put(f)
 80+ for x in xrange(settings.number_of_processes):
 81+ task_queue.put(None)
 82+ return task_queue
 83+
 84+
 85+
def check_remote_path_exists(domain, path, filename):
    '''
    Issue an HTTP HEAD request and return the httplib response object,
    or propagate httplib.NotConnected when the socket cannot be opened.

    @path is the full path of the file to be downloaded
    @filename is the name of the file to be downloaded; when None only
    @path is requested.
    '''
    try:
        # httplib.HTTPConnection wants a bare host name, not a URL.
        if domain.startswith('http://'):
            domain = domain[7:]
        conn = httplib.HTTPConnection(domain)
        # NOTE(review): path and filename are joined with no separator --
        # callers apparently pass @path with a trailing slash; confirm.
        if filename != None:
            url = '%s%s' % (path, filename)
        else:
            url = '%s' % path
        # HEAD: headers only, no body transfer.
        conn.request('HEAD', url)
        res = conn.getresponse()
        conn.close()
        return res

    except httplib.socket.error:
        # Translate a low-level socket failure into an httplib-level error.
        raise httplib.NotConnected('It seems that %s is temporarily \
 unavailable, please try again later.' % url)
 107+
 108+
def determine_remote_filesize(domain, path, filename):
    '''
    Return the Content-Length (in bytes) reported by a HEAD request for
    the remote file, or -1 when the file is unreachable or the header is
    absent.
    '''
    res = check_remote_path_exists(domain, path, filename)
    if res is None or res.status != 200:
        return -1
    return int(res.getheader('content-length', -1))
 115+
 116+
def debug():
    '''Ad-hoc manual test entry point; the real calls are commented out.'''
    domain = 'download.wikimedia.org'
    path = 'enwikinews'
    filename = None
    # check_remote_path_exists(domain, path, filename)
    # read_directory_contents(domain, path)
    # download_wp_dump('http://download.wikimedia.org/enwiki/latest',
    #                  'enwiki-latest-page_props.sql.gz',
    #                  settings.input_location)


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/utils/http_utils.py
___________________________________________________________________
Added: svn:eol-style
1130 + native
Added: svn:mime-type
2131 + text/plain

Status & tagging log