r9901 pywikipedia - Code Review archive

Repository: pywikipedia
Revision: r9901 (previous: r9900 | next: r9902)
Date: 22:44, 16 February 2012
Author: drtrigon
Status: old
Tags:
Comment:
updated analogously to the rewrite branch: 'getUrl' moved/split into 'pywikibot.comms.http.request';
the generic function does not attempt to (re)login on the target if it is a wiki
Modified paths:
  • /trunk/pywikipedia/pywikibot/comms (added) (history)
  • /trunk/pywikipedia/pywikibot/comms/__init__.py (added) (history)
  • /trunk/pywikipedia/pywikibot/comms/http.py (added) (history)
  • /trunk/pywikipedia/wikipedia.py (modified) (history)
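
For callers, the practical effect of this split is that the transport code can now be reached either through the old Site method or directly through the new comms module. A minimal sketch under this revision; the site object and paths below are made-up examples, not part of the commit:

    # -*- coding: utf-8 -*-
    # Sketch only: 'mysite' and the request paths are hypothetical examples.
    import wikipedia as pywikibot
    from pywikibot.comms import http

    mysite = pywikibot.getSite('en', 'wikipedia')

    # Old entry point: kept as a thin wrapper; per its docstring it still
    # tries to log in if the target is another wiki.
    text = mysite.getUrl('/w/index.php?title=Sandbox&action=raw')

    # New generic entry point: same parameters plus an explicit site
    # argument; does not attempt to (re)login on the target.
    text = http.request(mysite, '/w/index.php?title=Sandbox&action=raw')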

Diff

Index: trunk/pywikipedia/wikipedia.py
@@ -5554,10 +5554,12 @@
 
         return f, text
 
+    #@deprecated("pywikibot.comms.http.request") # in 'trunk' not yet...
     def getUrl(self, path, retry = None, sysop = False, data = None, compress = True,
                no_hostname = False, cookie_only=False, refer=None, back_response=False):
         """
-        Low-level routine to get a URL from the wiki.
+        Low-level routine to get a URL from the wiki. Tries to log in if it
+        is another wiki.
 
         Parameters:
           path - The absolute path, without the hostname.
@@ -5569,150 +5571,11 @@
 
         Returns the HTML text of the page converted to unicode.
         """
+        from pywikibot.comms import http
 
-        if retry is None:
-            retry = config.retry_on_fail
+        f, text = http.request(self, path, retry, sysop, data, compress,
+                               no_hostname, cookie_only, refer, back_response = True)
 
-        headers = {
-            'User-agent': useragent,
-            #'Accept-Language': config.mylang,
-            #'Accept-Charset': config.textfile_encoding,
-            #'Keep-Alive': '115',
-            #'Connection': 'keep-alive',
-            #'Cache-Control': 'max-age=0',
-            #'': '',
-        }
-
-        if not no_hostname and self.cookies(sysop = sysop):
-            headers['Cookie'] = self.cookies(sysop = sysop)
-        if compress:
-            headers['Accept-encoding'] = 'gzip'
-
-        if refer:
-            headers['Refer'] = refer
-
-        if no_hostname: # This allow users to parse also toolserver's script
-            url = path  # and other useful pages without using some other functions.
-        else:
-            url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
-            data = self.urlEncode(data)
-
-        # Try to retrieve the page until it was successfully loaded (just in
-        # case the server is down or overloaded).
-        # Wait for retry_idle_time minutes (growing!) between retries.
-        retry_idle_time = 1
-        retry_attempt = 0
-        while True:
-            try:
-                request = urllib2.Request(url, data, headers)
-                f = MyURLopener.open(request)
-
-                # read & info can raise socket.error
-                text = f.read()
-                headers = f.info()
-                break
-            except KeyboardInterrupt:
-                raise
-            except urllib2.HTTPError, e:
-                if e.code in [401, 404]:
-                    raise PageNotFound(
-u'Page %s could not be retrieved. Check your family file.'
-                        % url)
-                elif e.code in [403]:
-                    raise PageNotFound(
-u'Page %s could not be retrieved. Check your virus wall.'
-                        % url)
-                elif e.code == 504:
-                    output(u'HTTPError: %s %s' % (e.code, e.msg))
-                    if retry:
-                        retry_attempt += 1
-                        if retry_attempt > config.maxretries:
-                            raise MaxTriesExceededError()
-                        output(
-u"WARNING: Could not open '%s'.Maybe the server or\n your connection is down. Retrying in %i minutes..."
-                            % (url, retry_idle_time))
-                        time.sleep(retry_idle_time * 60)
-                        # Next time wait longer,
-                        # but not longer than half an hour
-                        retry_idle_time *= 2
-                        if retry_idle_time > 30:
-                            retry_idle_time = 30
-                        continue
-                    raise
-                else:
-                    output(u"Result: %s %s" % (e.code, e.msg))
-                    raise
-            except Exception, e:
-                output(u'%s' %e)
-                if retry:
-                    retry_attempt += 1
-                    if retry_attempt > config.maxretries:
-                        raise MaxTriesExceededError()
-                    output(
-u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
-                        % (url, retry_idle_time))
-                    time.sleep(retry_idle_time * 60)
-                    retry_idle_time *= 2
-                    if retry_idle_time > 30:
-                        retry_idle_time = 30
-                    continue
-
-                raise
-        # check cookies return or not, if return, send its to update.
-        if hasattr(f, 'sheaders'):
-            ck = f.sheaders
-        else:
-            ck = f.info().getallmatchingheaders('set-cookie')
-        if not no_hostname and ck:
-            Reat=re.compile(': (.*?)=(.*?);')
-            tmpc = {}
-            for d in ck:
-                m = Reat.search(d)
-                if m: tmpc[m.group(1)] = m.group(2)
-            self.updateCookies(tmpc, sysop)
-
-        if cookie_only:
-            return headers.get('set-cookie', '')
-        contentType = headers.get('content-type', '')
-        contentEncoding = headers.get('content-encoding', '')
-
-        # Ensure that all sent data is received
-        # In rare cases we found a douple Content-Length in the header.
-        # We need to split it to get a value
-        content_length = int(headers.get('content-length', '0').split(',')[0])
-        if content_length != len(text) and 'content-length' in headers:
-            output(
-                u'Warning! len(text) does not match content-length: %s != %s'
-                % (len(text), content_length))
-            return self.getUrl(path, retry, sysop, data, compress, no_hostname,
-                               cookie_only, back_response)
-
-        if compress and contentEncoding == 'gzip':
-            text = decompress_gzip(text)
-
-        R = re.compile('charset=([^\'\";]+)')
-        m = R.search(contentType)
-        if m:
-            charset = m.group(1)
-        else:
-            if verbose:
-                output(u"WARNING: No character set found.")
-            # UTF-8 as default
-            charset = 'utf-8'
-        # Check if this is the charset we expected
-        self.checkCharset(charset)
-        # Convert HTML to Unicode
-        try:
-            text = unicode(text, charset, errors = 'strict')
-        except UnicodeDecodeError, e:
-            print e
-            if no_hostname:
-                output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % path)
-            else:
-                output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (self.protocol(), self.hostname(), path))
-            # We use error='replace' in case of bad encoding.
-            text = unicode(text, charset, errors = 'replace')
-
         # If a wiki page, get user data
         self._getUserDataOld(text, sysop = sysop)
 
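
The commented-out @deprecated line in the hunk above refers to a decorator that exists in the rewrite branch but, as the comment says, not yet in trunk. A minimal sketch of what such a decorator could look like; the name and behaviour here are assumptions, not part of this commit:

    import functools
    import warnings

    # Hypothetical sketch of a @deprecated decorator as hinted at above;
    # NOT part of this commit ("in 'trunk' not yet...").
    def deprecated(instead=None):
        """Mark a function as deprecated, optionally naming a replacement."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                msg = '%s is deprecated' % func.__name__
                if instead:
                    msg += '; use %s instead' % instead
                warnings.warn(msg, DeprecationWarning, stacklevel=2)
                return func(*args, **kwargs)
            return wrapper
        return decorator
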
Index: trunk/pywikipedia/pywikibot/comms/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+#
+# (C) Pywikipedia bot team, 2012
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
Property changes on: trunk/pywikipedia/pywikibot/comms/__init__.py
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native
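
Note the import direction between the two files in this commit: http.py imports wikipedia at module level, while wikipedia.py imports pywikibot.comms.http lazily inside getUrl. Deferring one side of a mutual dependency to call time is the usual way to keep both modules importable; a minimal two-file sketch of the pattern (module names a/b are hypothetical stand-ins for wikipedia/http):

    # a.py (plays the role of wikipedia.py)
    def caller():
        from b import worker   # deferred: resolved on first call, after
        return worker()        # a.py itself has finished importing

    # b.py (plays the role of pywikibot/comms/http.py)
    import a                   # safe: by the time b loads, a is complete

    def worker():
        return 'ok'
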
Index: trunk/pywikipedia/pywikibot/comms/http.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+"""
+Basic HTTP access interface.
+
+This module handles communication between the bot and the HTTP threads.
+
+This module is responsible for:
+ - providing a (blocking) interface for HTTP requests
+ - URL-encoding all data
+ - basic HTTP error handling
+"""
+
+#
+# (C) Pywikipedia bot team, 2012
+#
+# Distributed under the terms of the MIT license.
+#
+
+__version__ = '$Id$'
+
+import urllib2
+
+import config
+from pywikibot import *
+import wikipedia as pywikibot
+
+
+# global variables
+
+# import useragent and MyURLopener from the global namespace
+useragent = pywikibot.useragent
+MyURLopener = pywikibot.MyURLopener
+
+def request(site, uri, retry = None, sysop = False, data = None, compress = True,
+            no_hostname = False, cookie_only=False, refer=None, back_response=False):
+    """
+    Low-level routine to get a URL from any source (which may be the wiki).
+
+    Parameters:
+    @param site  - The Site to connect to.
+    @param uri   - The absolute uri, without the hostname.
+    @param retry - If True, retries loading the page when a network error
+                    occurs.
+    @param sysop - If True, the sysop account's cookie will be used.
+    @param data  - An optional dict providing extra POST request
+                    parameters.
+    @param cookie_only - Only return the cookie the server sent us back.
+
+    @return: Returns the HTML text of the page converted to unicode.
+    """
+
+    if retry is None:
+        retry = config.retry_on_fail
+
+    headers = {
+        'User-agent': useragent,
+        #'Accept-Language': config.mylang,
+        #'Accept-Charset': config.textfile_encoding,
+        #'Keep-Alive': '115',
+        #'Connection': 'keep-alive',
+        #'Cache-Control': 'max-age=0',
+        #'': '',
+    }
+
+    if not no_hostname and site.cookies(sysop = sysop):
+        headers['Cookie'] = site.cookies(sysop = sysop)
+    if compress:
+        headers['Accept-encoding'] = 'gzip'
+
+    if refer:
+        headers['Refer'] = refer
+
+    if no_hostname:  # This allows users to also parse toolserver scripts
+        url = uri    # and other useful pages without using other functions.
+    else:
+        url = '%s://%s%s' % (site.protocol(), site.hostname(), uri)
+        data = site.urlEncode(data)
+
+    # Try to retrieve the page until it was successfully loaded (just in
+    # case the server is down or overloaded).
+    # Wait for retry_idle_time minutes (growing!) between retries.
+    retry_idle_time = 1
+    retry_attempt = 0
+    while True:
+        try:
+            req = urllib2.Request(url, data, headers)
+            f = MyURLopener.open(req)
+
+            # read & info can raise socket.error
+            text = f.read()
+            headers = f.info()
+            break
+        except KeyboardInterrupt:
+            raise
+        except urllib2.HTTPError, e:
+            if e.code in [401, 404]:
+                raise PageNotFound(
+u'Page %s could not be retrieved. Check your family file.'
+                    % url)
+            elif e.code in [403]:
+                raise PageNotFound(
+u'Page %s could not be retrieved. Check your virus wall.'
+                    % url)
+            elif e.code == 504:
+                output(u'HTTPError: %s %s' % (e.code, e.msg))
+                if retry:
+                    retry_attempt += 1
+                    if retry_attempt > config.maxretries:
+                        raise MaxTriesExceededError()
+                    output(
+u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
+                        % (url, retry_idle_time))
+                    time.sleep(retry_idle_time * 60)
+                    # Next time wait longer,
+                    # but not longer than half an hour
+                    retry_idle_time *= 2
+                    if retry_idle_time > 30:
+                        retry_idle_time = 30
+                    continue
+                raise
+            else:
+                output(u"Result: %s %s" % (e.code, e.msg))
+                raise
+        except Exception, e:
+            output(u'%s' % e)
+            if retry:
+                retry_attempt += 1
+                if retry_attempt > config.maxretries:
+                    raise MaxTriesExceededError()
+                output(
+u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."
+                    % (url, retry_idle_time))
+                time.sleep(retry_idle_time * 60)
+                retry_idle_time *= 2
+                if retry_idle_time > 30:
+                    retry_idle_time = 30
+                continue
+
+            raise
+    # Check whether cookies were returned; if so, pass them on for update.
+    if hasattr(f, 'sheaders'):
+        ck = f.sheaders
+    else:
+        ck = f.info().getallmatchingheaders('set-cookie')
+    if not no_hostname and ck:
+        Reat = re.compile(': (.*?)=(.*?);')
+        tmpc = {}
+        for d in ck:
+            m = Reat.search(d)
+            if m: tmpc[m.group(1)] = m.group(2)
+        site.updateCookies(tmpc, sysop)
+
+    if cookie_only:
+        return headers.get('set-cookie', '')
+    contentType = headers.get('content-type', '')
+    contentEncoding = headers.get('content-encoding', '')
+
+    # Ensure that all sent data is received.
+    # In rare cases we found a double Content-Length in the header;
+    # we need to split it to get a single value.
+    content_length = int(headers.get('content-length', '0').split(',')[0])
+    if content_length != len(text) and 'content-length' in headers:
+        output(
+            u'Warning! len(text) does not match content-length: %s != %s'
+            % (len(text), content_length))
+        return request(site, uri, retry, sysop, data, compress, no_hostname,
+                       cookie_only, back_response)
+
+    if compress and contentEncoding == 'gzip':
+        text = pywikibot.decompress_gzip(text)
+
+    R = re.compile('charset=([^\'\";]+)')
+    m = R.search(contentType)
+    if m:
+        charset = m.group(1)
+    else:
+        if verbose:
+            output(u"WARNING: No character set found.")
+        # UTF-8 as default
+        charset = 'utf-8'
+    # Check if this is the charset we expected
+    site.checkCharset(charset)
+    # Convert HTML to Unicode
+    try:
+        text = unicode(text, charset, errors = 'strict')
+    except UnicodeDecodeError, e:
+        print e
+        if no_hostname:
+            output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % uri)
+        else:
+            output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (site.protocol(), site.hostname(), uri))
+        # We use errors='replace' in case of bad encoding.
+        text = unicode(text, charset, errors = 'replace')
+
+    if back_response:
+        return f, text
+
+    return text
Property changes on: trunk/pywikipedia/pywikibot/comms/http.py
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native
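
A short usage sketch for the new entry point, covering the return-shape switches visible in the code above; the site and URLs are made-up examples, not part of this commit:

    # -*- coding: utf-8 -*-
    import wikipedia as pywikibot
    from pywikibot.comms import http

    site = pywikibot.getSite('en', 'wikipedia')  # hypothetical example site

    # Default: returns the page body decoded to unicode.
    text = http.request(site, '/w/index.php?title=Sandbox&action=raw')

    # back_response=True also returns the response object, which is how
    # the new getUrl() wrapper keeps its old (f, text) return value.
    f, text = http.request(site, '/wiki/Special:Version', back_response=True)

    # no_hostname=True treats the first argument as a complete URL, e.g.
    # for toolserver scripts outside the wiki itself.
    text = http.request(site, 'http://toolserver.org/~example/foo.py',
                        no_hostname=True)

    # cookie_only=True returns just the Set-Cookie header the server sent.
    cookie = http.request(site, '/w/index.php', cookie_only=True)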

Follow-up revisions

  • r9910: bug fix; scope has changed, follow-up to r9901 and r9904 (drtrigon, 12:49, 19 February 2012)
