Index: trunk/pywikipedia/wikipedia.py |
— | — | @@ -5554,10 +5554,12 @@ |
5555 | 5555 | |
5556 | 5556 | return f, text |
5557 | 5557 | |
| 5558 | + #@deprecated("pywikibot.comms.http.request") # in 'trunk' not yet... |
5558 | 5559 | def getUrl(self, path, retry = None, sysop = False, data = None, compress = True, |
5559 | 5560 | no_hostname = False, cookie_only=False, refer=None, back_response=False): |
5560 | 5561 | """ |
5561 | | - Low-level routine to get a URL from the wiki. |
| 5562 | + Low-level routine to get a URL from the wiki. Tries to log in if it |
| 5563 | + is another wiki. |
5562 | 5564 | |
5563 | 5565 | Parameters: |
5564 | 5566 | path - The absolute path, without the hostname. |
— | — | @@ -5569,150 +5571,11 @@ |
5570 | 5572 | |
5571 | 5573 | Returns the HTML text of the page converted to unicode. |
5572 | 5574 | """ |
| 5575 | + from pywikibot.comms import http |
5573 | 5576 | |
5574 | | - if retry is None: |
5575 | | - retry = config.retry_on_fail |
| 5577 | + f, text = http.request(self, path, retry, sysop, data, compress, |
| 5578 | + no_hostname, cookie_only, refer, back_response = True) |
5576 | 5579 | |
5577 | | - headers = { |
5578 | | - 'User-agent': useragent, |
5579 | | - #'Accept-Language': config.mylang, |
5580 | | - #'Accept-Charset': config.textfile_encoding, |
5581 | | - #'Keep-Alive': '115', |
5582 | | - #'Connection': 'keep-alive', |
5583 | | - #'Cache-Control': 'max-age=0', |
5584 | | - #'': '', |
5585 | | - } |
5586 | | - |
5587 | | - if not no_hostname and self.cookies(sysop = sysop): |
5588 | | - headers['Cookie'] = self.cookies(sysop = sysop) |
5589 | | - if compress: |
5590 | | - headers['Accept-encoding'] = 'gzip' |
5591 | | - |
5592 | | - if refer: |
5593 | | - headers['Refer'] = refer |
5594 | | - |
5595 | | - if no_hostname: # This allow users to parse also toolserver's script |
5596 | | - url = path # and other useful pages without using some other functions. |
5597 | | - else: |
5598 | | - url = '%s://%s%s' % (self.protocol(), self.hostname(), path) |
5599 | | - data = self.urlEncode(data) |
5600 | | - |
5601 | | - # Try to retrieve the page until it was successfully loaded (just in |
5602 | | - # case the server is down or overloaded). |
5603 | | - # Wait for retry_idle_time minutes (growing!) between retries. |
5604 | | - retry_idle_time = 1 |
5605 | | - retry_attempt = 0 |
5606 | | - while True: |
5607 | | - try: |
5608 | | - request = urllib2.Request(url, data, headers) |
5609 | | - f = MyURLopener.open(request) |
5610 | | - |
5611 | | - # read & info can raise socket.error |
5612 | | - text = f.read() |
5613 | | - headers = f.info() |
5614 | | - break |
5615 | | - except KeyboardInterrupt: |
5616 | | - raise |
5617 | | - except urllib2.HTTPError, e: |
5618 | | - if e.code in [401, 404]: |
5619 | | - raise PageNotFound( |
5620 | | -u'Page %s could not be retrieved. Check your family file.' |
5621 | | - % url) |
5622 | | - elif e.code in [403]: |
5623 | | - raise PageNotFound( |
5624 | | -u'Page %s could not be retrieved. Check your virus wall.' |
5625 | | - % url) |
5626 | | - elif e.code == 504: |
5627 | | - output(u'HTTPError: %s %s' % (e.code, e.msg)) |
5628 | | - if retry: |
5629 | | - retry_attempt += 1 |
5630 | | - if retry_attempt > config.maxretries: |
5631 | | - raise MaxTriesExceededError() |
5632 | | - output( |
5633 | | -u"WARNING: Could not open '%s'.Maybe the server or\n your connection is down. Retrying in %i minutes..." |
5634 | | - % (url, retry_idle_time)) |
5635 | | - time.sleep(retry_idle_time * 60) |
5636 | | - # Next time wait longer, |
5637 | | - # but not longer than half an hour |
5638 | | - retry_idle_time *= 2 |
5639 | | - if retry_idle_time > 30: |
5640 | | - retry_idle_time = 30 |
5641 | | - continue |
5642 | | - raise |
5643 | | - else: |
5644 | | - output(u"Result: %s %s" % (e.code, e.msg)) |
5645 | | - raise |
5646 | | - except Exception, e: |
5647 | | - output(u'%s' %e) |
5648 | | - if retry: |
5649 | | - retry_attempt += 1 |
5650 | | - if retry_attempt > config.maxretries: |
5651 | | - raise MaxTriesExceededError() |
5652 | | - output( |
5653 | | -u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..." |
5654 | | - % (url, retry_idle_time)) |
5655 | | - time.sleep(retry_idle_time * 60) |
5656 | | - retry_idle_time *= 2 |
5657 | | - if retry_idle_time > 30: |
5658 | | - retry_idle_time = 30 |
5659 | | - continue |
5660 | | - |
5661 | | - raise |
5662 | | - # check cookies return or not, if return, send its to update. |
5663 | | - if hasattr(f, 'sheaders'): |
5664 | | - ck = f.sheaders |
5665 | | - else: |
5666 | | - ck = f.info().getallmatchingheaders('set-cookie') |
5667 | | - if not no_hostname and ck: |
5668 | | - Reat=re.compile(': (.*?)=(.*?);') |
5669 | | - tmpc = {} |
5670 | | - for d in ck: |
5671 | | - m = Reat.search(d) |
5672 | | - if m: tmpc[m.group(1)] = m.group(2) |
5673 | | - self.updateCookies(tmpc, sysop) |
5674 | | - |
5675 | | - if cookie_only: |
5676 | | - return headers.get('set-cookie', '') |
5677 | | - contentType = headers.get('content-type', '') |
5678 | | - contentEncoding = headers.get('content-encoding', '') |
5679 | | - |
5680 | | - # Ensure that all sent data is received |
5681 | | - # In rare cases we found a douple Content-Length in the header. |
5682 | | - # We need to split it to get a value |
5683 | | - content_length = int(headers.get('content-length', '0').split(',')[0]) |
5684 | | - if content_length != len(text) and 'content-length' in headers: |
5685 | | - output( |
5686 | | - u'Warning! len(text) does not match content-length: %s != %s' |
5687 | | - % (len(text), content_length)) |
5688 | | - return self.getUrl(path, retry, sysop, data, compress, no_hostname, |
5689 | | - cookie_only, back_response) |
5690 | | - |
5691 | | - if compress and contentEncoding == 'gzip': |
5692 | | - text = decompress_gzip(text) |
5693 | | - |
5694 | | - R = re.compile('charset=([^\'\";]+)') |
5695 | | - m = R.search(contentType) |
5696 | | - if m: |
5697 | | - charset = m.group(1) |
5698 | | - else: |
5699 | | - if verbose: |
5700 | | - output(u"WARNING: No character set found.") |
5701 | | - # UTF-8 as default |
5702 | | - charset = 'utf-8' |
5703 | | - # Check if this is the charset we expected |
5704 | | - self.checkCharset(charset) |
5705 | | - # Convert HTML to Unicode |
5706 | | - try: |
5707 | | - text = unicode(text, charset, errors = 'strict') |
5708 | | - except UnicodeDecodeError, e: |
5709 | | - print e |
5710 | | - if no_hostname: |
5711 | | - output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % path) |
5712 | | - else: |
5713 | | - output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (self.protocol(), self.hostname(), path)) |
5714 | | - # We use error='replace' in case of bad encoding. |
5715 | | - text = unicode(text, charset, errors = 'replace') |
5716 | | - |
5717 | 5580 | # If a wiki page, get user data |
5718 | 5581 | self._getUserDataOld(text, sysop = sysop) |
5719 | 5582 | |
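(For orientation: after this hunk, Site.getUrl becomes a thin wrapper around the new
comms layer. A minimal sketch of the resulting method, using only names that appear
in the hunk above; the final return handling is not visible in the diff context and
is only approximated here:)

    from pywikibot.comms import http

    def getUrl(self, path, retry=None, sysop=False, data=None, compress=True,
               no_hostname=False, cookie_only=False, refer=None,
               back_response=False):
        # All transport work (cookies, gzip, retries, charset conversion)
        # is now delegated to pywikibot.comms.http.request().
        f, text = http.request(self, path, retry, sysop, data, compress,
                               no_hostname, cookie_only, refer,
                               back_response=True)
        # wikipedia.py still extracts user data from the returned HTML.
        self._getUserDataOld(text, sysop=sysop)
        # Approximation: return the raw response only if the caller asked for it.
        if back_response:
            return f, text
        return text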
Index: trunk/pywikipedia/pywikibot/comms/__init__.py |
— | — | @@ -0,0 +1,7 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +# |
| 4 | +# (C) Pywikipedia bot team, 2012 |
| 5 | +# |
| 6 | +# Distributed under the terms of the MIT license. |
| 7 | +# |
| 8 | +__version__ = '$Id$' |
Property changes on: trunk/pywikipedia/pywikibot/comms/__init__.py |
___________________________________________________________________ |
Added: svn:keywords |
1 | 9 | + Id |
Added: svn:eol-style |
2 | 10 | + native |
Index: trunk/pywikipedia/pywikibot/comms/http.py |
— | — | @@ -0,0 +1,198 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +Basic HTTP access interface. |
| 5 | + |
| 6 | +This module handles communication between the bot and the HTTP threads. |
| 7 | + |
| 8 | +This module is responsible for |
| 9 | + - Providing a (blocking) interface for HTTP requests |
| 10 | + - Urlencoding all data |
| 11 | + - Basic HTTP error handling |
| 12 | +""" |
| 13 | + |
| 14 | +# |
| 15 | +# (C) Pywikipedia bot team, 2012 |
| 16 | +# |
| 17 | +# Distributed under the terms of the MIT license. |
| 18 | +# |
| 19 | + |
| 20 | +__version__ = '$Id$' |
| 21 | + |
| 22 | +import urllib2 |
| 23 | + |
| 24 | +import config |
| 25 | +from pywikibot import * |
| 26 | +import wikipedia as pywikibot |
| 27 | + |
| 28 | + |
| 29 | +# global variables |
| 30 | + |
| 31 | +# re-use the useragent string and the MyURLopener opener defined in wikipedia.py |
| 32 | +useragent = pywikibot.useragent |
| 33 | +MyURLopener = pywikibot.MyURLopener |
| 34 | + |
| 35 | +def request(site, uri, retry = None, sysop = False, data = None, compress = True, |
| 36 | + no_hostname = False, cookie_only=False, refer=None, back_response=False): |
| 37 | + """ |
| 38 | + Low-level routine to get a URL from any source (may be the wiki). |
| 39 | + |
| 40 | + Parameters: |
| 41 | + @param site - The Site to connect to. |
| 42 | + @param uri - The absolute URI, without the hostname. |
| 43 | + @param retry - If True, retries loading the page when a network error |
| 44 | + occurs. |
| 45 | + @param sysop - If True, the sysop account's cookie will be used. |
| 46 | + @param data - An optional dict providing extra post request |
| 47 | + parameters. |
| 48 | + @param cookie_only - Only return the cookie the server sent us back |
| 49 | + |
| 50 | + @return: The HTML text of the page converted to unicode; with back_response=True the raw response object is returned as well. |
| 51 | + """ |
| 52 | + |
| 53 | + if retry is None: |
| 54 | + retry = config.retry_on_fail |
| 55 | + |
| 56 | + headers = { |
| 57 | + 'User-agent': useragent, |
| 58 | + #'Accept-Language': config.mylang, |
| 59 | + #'Accept-Charset': config.textfile_encoding, |
| 60 | + #'Keep-Alive': '115', |
| 61 | + #'Connection': 'keep-alive', |
| 62 | + #'Cache-Control': 'max-age=0', |
| 63 | + #'': '', |
| 64 | + } |
| 65 | + |
| 66 | + if not no_hostname and site.cookies(sysop = sysop): |
| 67 | + headers['Cookie'] = site.cookies(sysop = sysop) |
| 68 | + if compress: |
| 69 | + headers['Accept-encoding'] = 'gzip' |
| 70 | + |
| 71 | + if refer: |
| 72 | + headers['Referer'] = refer |
| 73 | + |
| 74 | + if no_hostname: # This allows users to also fetch toolserver scripts |
| 75 | + url = uri # and other useful pages without any extra helpers. |
| 76 | + else: |
| 77 | + url = '%s://%s%s' % (site.protocol(), site.hostname(), uri) |
| 78 | + data = site.urlEncode(data) |
| 79 | + |
| 80 | + # Try to retrieve the page until it was successfully loaded (just in |
| 81 | + # case the server is down or overloaded). |
| 82 | + # Wait for retry_idle_time minutes (growing!) between retries. |
| 83 | + retry_idle_time = 1 |
| 84 | + retry_attempt = 0 |
| 85 | + while True: |
| 86 | + try: |
| 87 | + req = urllib2.Request(url, data, headers) |
| 88 | + f = MyURLopener.open(req) |
| 89 | + |
| 90 | + # read & info can raise socket.error |
| 91 | + text = f.read() |
| 92 | + headers = f.info() |
| 93 | + break |
| 94 | + except KeyboardInterrupt: |
| 95 | + raise |
| 96 | + except urllib2.HTTPError, e: |
| 97 | + if e.code in [401, 404]: |
| 98 | + raise PageNotFound( |
| 99 | +u'Page %s could not be retrieved. Check your family file.' |
| 100 | + % url) |
| 101 | + elif e.code in [403]: |
| 102 | + raise PageNotFound( |
| 103 | +u'Page %s could not be retrieved. Check your virus wall.' |
| 104 | + % url) |
| 105 | + elif e.code == 504: |
| 106 | + output(u'HTTPError: %s %s' % (e.code, e.msg)) |
| 107 | + if retry: |
| 108 | + retry_attempt += 1 |
| 109 | + if retry_attempt > config.maxretries: |
| 110 | + raise MaxTriesExceededError() |
| 111 | + output( |
| 112 | +u"WARNING: Could not open '%s'.Maybe the server or\n your connection is down. Retrying in %i minutes..." |
| 113 | + % (url, retry_idle_time)) |
| 114 | + time.sleep(retry_idle_time * 60) |
| 115 | + # Next time wait longer, |
| 116 | + # but not longer than half an hour |
| 117 | + retry_idle_time *= 2 |
| 118 | + if retry_idle_time > 30: |
| 119 | + retry_idle_time = 30 |
| 120 | + continue |
| 121 | + raise |
| 122 | + else: |
| 123 | + output(u"Result: %s %s" % (e.code, e.msg)) |
| 124 | + raise |
| 125 | + except Exception, e: |
| 126 | + output(u'%s' %e) |
| 127 | + if retry: |
| 128 | + retry_attempt += 1 |
| 129 | + if retry_attempt > config.maxretries: |
| 130 | + raise MaxTriesExceededError() |
| 131 | + output( |
| 132 | +u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..." |
| 133 | + % (url, retry_idle_time)) |
| 134 | + time.sleep(retry_idle_time * 60) |
| 135 | + retry_idle_time *= 2 |
| 136 | + if retry_idle_time > 30: |
| 137 | + retry_idle_time = 30 |
| 138 | + continue |
| 139 | + |
| 140 | + raise |
| 141 | + # If the server sent back any cookies, pass them on to updateCookies(). |
| 142 | + if hasattr(f, 'sheaders'): |
| 143 | + ck = f.sheaders |
| 144 | + else: |
| 145 | + ck = f.info().getallmatchingheaders('set-cookie') |
| 146 | + if not no_hostname and ck: |
| 147 | + Reat=re.compile(': (.*?)=(.*?);') |
| 148 | + tmpc = {} |
| 149 | + for d in ck: |
| 150 | + m = Reat.search(d) |
| 151 | + if m: tmpc[m.group(1)] = m.group(2) |
| 152 | + site.updateCookies(tmpc, sysop) |
| 153 | + |
| 154 | + if cookie_only: |
| 155 | + return headers.get('set-cookie', '') |
| 156 | + contentType = headers.get('content-type', '') |
| 157 | + contentEncoding = headers.get('content-encoding', '') |
| 158 | + |
| 159 | + # Ensure that all sent data is received |
| 160 | + # In rare cases we found a double Content-Length header. |
| 161 | + # We need to split it to get a single value. |
| 162 | + content_length = int(headers.get('content-length', '0').split(',')[0]) |
| 163 | + if content_length != len(text) and 'content-length' in headers: |
| 164 | + output( |
| 165 | + u'Warning! len(text) does not match content-length: %s != %s' |
| 166 | + % (len(text), content_length)) |
| 167 | + return request(site, uri, retry, sysop, data, compress, no_hostname, |
| 168 | + cookie_only, refer, back_response) |
| 169 | + |
| 170 | + if compress and contentEncoding == 'gzip': |
| 171 | + text = pywikibot.decompress_gzip(text) |
| 172 | + |
| 173 | + R = re.compile('charset=([^\'\";]+)') |
| 174 | + m = R.search(contentType) |
| 175 | + if m: |
| 176 | + charset = m.group(1) |
| 177 | + else: |
| 178 | + if verbose: |
| 179 | + output(u"WARNING: No character set found.") |
| 180 | + # UTF-8 as default |
| 181 | + charset = 'utf-8' |
| 182 | + # Check if this is the charset we expected |
| 183 | + site.checkCharset(charset) |
| 184 | + # Convert HTML to Unicode |
| 185 | + try: |
| 186 | + text = unicode(text, charset, errors = 'strict') |
| 187 | + except UnicodeDecodeError, e: |
| 188 | + print e |
| 189 | + if no_hostname: |
| 190 | + output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % uri) |
| 191 | + else: |
| 192 | + output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (site.protocol(), site.hostname(), uri)) |
| 193 | + # We use errors='replace' in case of bad encoding. |
| 194 | + text = unicode(text, charset, errors = 'replace') |
| 195 | + |
| 196 | + if back_response: |
| 197 | + return f, text |
| 198 | + |
| 199 | + return text |
Property changes on: trunk/pywikipedia/pywikibot/comms/http.py |
___________________________________________________________________ |
Added: svn:keywords |
1 | 200 | + Id |
Added: svn:eol-style |
2 | 201 | + native |
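(A short usage sketch of the new module, illustrative only: the Site object comes
from the existing wikipedia.py layer via getSite(), and the path shown is just an
example of an absolute path without hostname, as the request() docstring asks for:)

    import wikipedia as pywikibot
    from pywikibot.comms import http

    site = pywikibot.getSite('en', 'wikipedia')

    # Plain GET of an absolute path on the wiki; cookie handling, gzip
    # decompression, retries and charset conversion all happen inside request().
    text = http.request(site, '/wiki/Special:Version')

    # With back_response=True the raw urllib2 response object is returned too.
    f, text = http.request(site, '/wiki/Special:Version',
                           back_response=True)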