r9901 pywikipedia - Code Review archive

Repository: pywikipedia
Revision: r9901 (previous: r9900 | next: r9902)
Date: 22:44, 16 February 2012
Author: drtrigon
Status: old
Tags:
Comment:
updated analogously to the rewrite branch: 'getUrl' moved/split into 'pywikibot.comms.http.request';
the generic function does not attempt to (re)login on the target if it is a wiki
Modified paths:
  • /trunk/pywikipedia/pywikibot/comms (added) (history)
  • /trunk/pywikipedia/pywikibot/comms/__init__.py (added) (history)
  • /trunk/pywikipedia/pywikibot/comms/http.py (added) (history)
  • /trunk/pywikipedia/wikipedia.py (modified) (history)
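
For callers, the practical effect of this split is that the transport code can now be reached either through the old Site method or directly through the new comms module. A minimal sketch under this revision; the site object and paths below are made-up examples, not part of the commit:

    # -*- coding: utf-8 -*-
    # Sketch only: 'mysite' and the request paths are hypothetical examples.
    import wikipedia as pywikibot
    from pywikibot.comms import http

    mysite = pywikibot.getSite('en', 'wikipedia')

    # Old entry point: kept as a thin wrapper; per its docstring it still
    # tries to log in if the target is another wiki.
    text = mysite.getUrl('/w/index.php?title=Sandbox&action=raw')

    # New generic entry point: same parameters plus an explicit site
    # argument; does not attempt to (re)login on the target.
    text = http.request(mysite, '/w/index.php?title=Sandbox&action=raw')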

Diff

Index: trunk/pywikipedia/wikipedia.py
@@ -5554,10 +5554,12 @@
 
         return f, text
 
+    #@deprecated("pywikibot.comms.http.request") # in 'trunk' not yet...
     def getUrl(self, path, retry = None, sysop = False, data = None, compress = True,
                no_hostname = False, cookie_only=False, refer=None, back_response=False):
         """
-        Low-level routine to get a URL from the wiki.
+        Low-level routine to get a URL from the wiki. Tries to log in if it
+        is another wiki.
 
         Parameters:
           path - The absolute path, without the hostname.
@@ -5569,150 +5571,11 @@
 
         Returns the HTML text of the page converted to unicode.
         """
+        from pywikibot.comms import http
 
-        if retry is None:
-            retry = config.retry_on_fail
+        f, text = http.request(self, path, retry, sysop, data, compress,
+                               no_hostname, cookie_only, refer, back_response = True)
 
-        headers = {
-            'User-agent': useragent,
-            #'Accept-Language': config.mylang,
-            #'Accept-Charset': config.textfile_encoding,
-            #'Keep-Alive': '115',
-            #'Connection': 'keep-alive',
-            #'Cache-Control': 'max-age=0',
-            #'': '',
-        }
-
-        if not no_hostname and self.cookies(sysop = sysop):
-            headers['Cookie'] = self.cookies(sysop = sysop)
-        if compress:
-            headers['Accept-encoding'] = 'gzip'
-
-        if refer:
-            headers['Refer'] = refer
-
-        if no_hostname: # This allow users to parse also toolserver's script
-            url = path  # and other useful pages without using some other functions.
-        else:
-            url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
-            data = self.urlEncode(data)
-
-        # Try to retrieve the page until it was successfully loaded (just in
-        # case the server is down or overloaded).
-        # Wait for retry_idle_time minutes (growing!) between retries.
-        retry_idle_time = 1
-        retry_attempt = 0
-        while True:
-            try:
-                request = urllib2.Request(url, data, headers)
-                f = MyURLopener.open(request)
-
-                # read & info can raise socket.error
-                text = f.read()
-                headers = f.info()
-                break
-            except KeyboardInterrupt:
-                raise
-            except urllib2.HTTPError, e:
-                if e.code in [401, 404]:
-                    raise PageNotFound(
-u'Page %s could not be retrieved. Check your family file.'
-                        % url)
-                elif e.code in [403]:
-                    raise PageNotFound(
-u'Page %s could not be retrieved. Check your virus wall.'
-                        % url)
-                elif e.code == 504:
-                    output(u'HTTPError: %s %s' % (e.code, e.msg))
-                    if retry:
-                        retry_attempt += 1
-                        if retry_attempt > config.maxretries:
-                            raise MaxTriesExceededError()
-                        output(
-u"WARNING: Could not open '%s'.Maybe the server or\n your connection is down. Retrying in %i minutes..."
-                            % (url, retry_idle_time))
-                        time.sleep(retry_idle_time * 60)
-                        # Next time wait longer,
-                        # but not longer than half an hour
-                        retry_idle_time *= 2
-                        if retry_idle_time > 30:
-                            retry_idle_time = 30
-                        continue
-                    raise
-                else:
-                    output(u"Result: %s %s" % (e.code, e.msg))
-                    raise
-            except Exception, e:
-                output(u'%s' %e)
-                if retry:
-                    retry_attempt += 1
-                    if retry_attempt > config.maxretries:
-                        raise MaxTriesExceededError()
-                    output(
-u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
-                        % (url, retry_idle_time))
-                    time.sleep(retry_idle_time * 60)
-                    retry_idle_time *= 2
-                    if retry_idle_time > 30:
-                        retry_idle_time = 30
-                    continue
-
-                raise
-        # check cookies return or not, if return, send its to update.
-        if hasattr(f, 'sheaders'):
-            ck = f.sheaders
-        else:
-            ck = f.info().getallmatchingheaders('set-cookie')
-        if not no_hostname and ck:
-            Reat=re.compile(': (.*?)=(.*?);')
-            tmpc = {}
-            for d in ck:
-                m = Reat.search(d)
-                if m: tmpc[m.group(1)] = m.group(2)
-            self.updateCookies(tmpc, sysop)
-
-        if cookie_only:
-            return headers.get('set-cookie', '')
-        contentType = headers.get('content-type', '')
-        contentEncoding = headers.get('content-encoding', '')
-
-        # Ensure that all sent data is received
-        # In rare cases we found a douple Content-Length in the header.
-        # We need to split it to get a value
-        content_length = int(headers.get('content-length', '0').split(',')[0])
-        if content_length != len(text) and 'content-length' in headers:
-            output(
-                u'Warning! len(text) does not match content-length: %s != %s'
-                % (len(text), content_length))
-            return self.getUrl(path, retry, sysop, data, compress, no_hostname,
-                               cookie_only, back_response)
-
-        if compress and contentEncoding == 'gzip':
-            text = decompress_gzip(text)
-
-        R = re.compile('charset=([^\'\";]+)')
-        m = R.search(contentType)
-        if m:
-            charset = m.group(1)
-        else:
-            if verbose:
-                output(u"WARNING: No character set found.")
-            # UTF-8 as default
-            charset = 'utf-8'
-        # Check if this is the charset we expected
-        self.checkCharset(charset)
-        # Convert HTML to Unicode
-        try:
-            text = unicode(text, charset, errors = 'strict')
-        except UnicodeDecodeError, e:
-            print e
-            if no_hostname:
-                output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % path)
-            else:
-                output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (self.protocol(), self.hostname(), path))
-            # We use error='replace' in case of bad encoding.
-            text = unicode(text, charset, errors = 'replace')
-
         # If a wiki page, get user data
         self._getUserDataOld(text, sysop = sysop)
 
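
The commented-out @deprecated line in the hunk above refers to a decorator that exists in the rewrite branch but, as the comment says, not yet in trunk. A minimal sketch of what such a decorator could look like; the name and behaviour here are assumptions, not part of this commit:

    import functools
    import warnings

    # Hypothetical sketch of a @deprecated decorator as hinted at above;
    # NOT part of this commit ("in 'trunk' not yet...").
    def deprecated(instead=None):
        """Mark a function as deprecated, optionally naming a replacement."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                msg = '%s is deprecated' % func.__name__
                if instead:
                    msg += '; use %s instead' % instead
                warnings.warn(msg, DeprecationWarning, stacklevel=2)
                return func(*args, **kwargs)
            return wrapper
        return decorator
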
Index: trunk/pywikipedia/pywikibot/comms/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+#
+# (C) Pywikipedia bot team, 2012
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
Property changes on: trunk/pywikipedia/pywikibot/comms/__init__.py
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native
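
Note the import direction between the two files in this commit: http.py imports wikipedia at module level, while wikipedia.py imports pywikibot.comms.http lazily inside getUrl. Deferring one side of a mutual dependency to call time is the usual way to keep both modules importable; a minimal two-file sketch of the pattern (module names a/b are hypothetical stand-ins for wikipedia/http):

    # a.py (plays the role of wikipedia.py)
    def caller():
        from b import worker   # deferred: resolved on first call, after
        return worker()        # a.py itself has finished importing

    # b.py (plays the role of pywikibot/comms/http.py)
    import a                   # safe: by the time b loads, a is complete

    def worker():
        return 'ok'
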
Index: trunk/pywikipedia/pywikibot/comms/http.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+"""
+Basic HTTP access interface.
+
+This module handles communication between the bot and the HTTP threads.
+
+This module is responsible for:
+ - providing a (blocking) interface for HTTP requests
+ - URL-encoding all data
+ - basic HTTP error handling
+"""
+
+#
+# (C) Pywikipedia bot team, 2012
+#
+# Distributed under the terms of the MIT license.
+#
+
+__version__ = '$Id$'
+
+import urllib2
+
+import config
+from pywikibot import *
+import wikipedia as pywikibot
+
+
+# global variables
+
+# import useragent and MyURLopener from the global namespace
+useragent = pywikibot.useragent
+MyURLopener = pywikibot.MyURLopener
+
+def request(site, uri, retry = None, sysop = False, data = None, compress = True,
+            no_hostname = False, cookie_only=False, refer=None, back_response=False):
+    """
+    Low-level routine to get a URL from any source (which may be the wiki).
+
+    Parameters:
+    @param site  - The Site to connect to.
+    @param uri   - The absolute uri, without the hostname.
+    @param retry - If True, retries loading the page when a network error
+                    occurs.
+    @param sysop - If True, the sysop account's cookie will be used.
+    @param data  - An optional dict providing extra POST request
+                    parameters.
+    @param cookie_only - Only return the cookie the server sent us back.
+
+    @return: Returns the HTML text of the page converted to unicode.
+    """
+
+    if retry is None:
+        retry = config.retry_on_fail
+
+    headers = {
+        'User-agent': useragent,
+        #'Accept-Language': config.mylang,
+        #'Accept-Charset': config.textfile_encoding,
+        #'Keep-Alive': '115',
+        #'Connection': 'keep-alive',
+        #'Cache-Control': 'max-age=0',
+        #'': '',
+    }
+
+    if not no_hostname and site.cookies(sysop = sysop):
+        headers['Cookie'] = site.cookies(sysop = sysop)
+    if compress:
+        headers['Accept-encoding'] = 'gzip'
+
+    if refer:
+        headers['Refer'] = refer
+
+    if no_hostname:  # This allows users to also parse toolserver scripts
+        url = uri    # and other useful pages without using other functions.
+    else:
+        url = '%s://%s%s' % (site.protocol(), site.hostname(), uri)
+        data = site.urlEncode(data)
+
+    # Try to retrieve the page until it was successfully loaded (just in
+    # case the server is down or overloaded).
+    # Wait for retry_idle_time minutes (growing!) between retries.
+    retry_idle_time = 1
+    retry_attempt = 0
+    while True:
+        try:
+            req = urllib2.Request(url, data, headers)
+            f = MyURLopener.open(req)
+
+            # read & info can raise socket.error
+            text = f.read()
+            headers = f.info()
+            break
+        except KeyboardInterrupt:
+            raise
+        except urllib2.HTTPError, e:
+            if e.code in [401, 404]:
+                raise PageNotFound(
+u'Page %s could not be retrieved. Check your family file.'
+                    % url)
+            elif e.code in [403]:
+                raise PageNotFound(
+u'Page %s could not be retrieved. Check your virus wall.'
+                    % url)
+            elif e.code == 504:
+                output(u'HTTPError: %s %s' % (e.code, e.msg))
+                if retry:
+                    retry_attempt += 1
+                    if retry_attempt > config.maxretries:
+                        raise MaxTriesExceededError()
+                    output(
+u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
+                        % (url, retry_idle_time))
+                    time.sleep(retry_idle_time * 60)
+                    # Next time wait longer,
+                    # but not longer than half an hour
+                    retry_idle_time *= 2
+                    if retry_idle_time > 30:
+                        retry_idle_time = 30
+                    continue
+                raise
+            else:
+                output(u"Result: %s %s" % (e.code, e.msg))
+                raise
+        except Exception, e:
+            output(u'%s' % e)
+            if retry:
+                retry_attempt += 1
+                if retry_attempt > config.maxretries:
+                    raise MaxTriesExceededError()
+                output(
+u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
+                    % (url, retry_idle_time))
+                time.sleep(retry_idle_time * 60)
+                retry_idle_time *= 2
+                if retry_idle_time > 30:
+                    retry_idle_time = 30
+                continue
+
+            raise
+    # Check whether cookies were returned; if so, pass them on for update.
+    if hasattr(f, 'sheaders'):
+        ck = f.sheaders
+    else:
+        ck = f.info().getallmatchingheaders('set-cookie')
+    if not no_hostname and ck:
+        Reat = re.compile(': (.*?)=(.*?);')
+        tmpc = {}
+        for d in ck:
+            m = Reat.search(d)
+            if m: tmpc[m.group(1)] = m.group(2)
+        site.updateCookies(tmpc, sysop)
+
+    if cookie_only:
+        return headers.get('set-cookie', '')
+    contentType = headers.get('content-type', '')
+    contentEncoding = headers.get('content-encoding', '')
+
+    # Ensure that all sent data is received.
+    # In rare cases we found a double Content-Length in the header;
+    # we need to split it to get a single value.
+    content_length = int(headers.get('content-length', '0').split(',')[0])
+    if content_length != len(text) and 'content-length' in headers:
+        output(
+            u'Warning! len(text) does not match content-length: %s != %s'
+            % (len(text), content_length))
+        return request(site, uri, retry, sysop, data, compress, no_hostname,
+                       cookie_only, back_response)
+
+    if compress and contentEncoding == 'gzip':
+        text = pywikibot.decompress_gzip(text)
+
+    R = re.compile('charset=([^\'\";]+)')
+    m = R.search(contentType)
+    if m:
+        charset = m.group(1)
+    else:
+        if verbose:
+            output(u"WARNING: No character set found.")
+        # UTF-8 as default
+        charset = 'utf-8'
+    # Check if this is the charset we expected
+    site.checkCharset(charset)
+    # Convert HTML to Unicode
+    try:
+        text = unicode(text, charset, errors = 'strict')
+    except UnicodeDecodeError, e:
+        print e
+        if no_hostname:
+            output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % uri)
+        else:
+            output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (site.protocol(), site.hostname(), uri))
+        # We use errors='replace' in case of bad encoding.
+        text = unicode(text, charset, errors = 'replace')
+
+    if back_response:
+        return f, text
+
+    return text
Property changes on: trunk/pywikipedia/pywikibot/comms/http.py
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native
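
A short usage sketch for the new entry point, covering the return-shape switches visible in the code above; the site and URLs are made-up examples, not part of this commit:

    # -*- coding: utf-8 -*-
    import wikipedia as pywikibot
    from pywikibot.comms import http

    site = pywikibot.getSite('en', 'wikipedia')  # hypothetical example site

    # Default: returns the page body decoded to unicode.
    text = http.request(site, '/w/index.php?title=Sandbox&action=raw')

    # back_response=True also returns the response object, which is how
    # the new getUrl() wrapper keeps its old (f, text) return value.
    f, text = http.request(site, '/wiki/Special:Version', back_response=True)

    # no_hostname=True treats the first argument as a complete URL, e.g.
    # for toolserver scripts outside the wiki itself.
    text = http.request(site, 'http://toolserver.org/~example/foo.py',
                        no_hostname=True)

    # cookie_only=True returns just the Set-Cookie header the server sent.
    cookie = http.request(site, '/w/index.php', cookie_only=True)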

Follow-up revisions

  • r9910: bug fix; scope has changed, follow-up to r9901 and r9904 (drtrigon, 12:49, 19 February 2012)
