1"""URL opener. 2 3Copyright 2004-2006 John J Lee <jjl@pobox.com> 4 5This code is free software; you can redistribute it and/or modify it 6under the terms of the BSD or ZPL 2.1 licenses (see the file 7LICENSE included with the distribution). 8 9""" 10 11from __future__ import absolute_import 12 13import bisect 14import os 15import tempfile 16import threading 17 18from . import _response 19from . import _rfc3986 20from . import _sockettimeout 21from . import _urllib2_fork 22from ._request import Request 23from ._util import isstringlike 24from .polyglot import HTTPError, URLError, iteritems, is_class 25 26 27open_file = open 28 29 30class ContentTooShortError(URLError): 31 32 def __init__(self, reason, result): 33 URLError.__init__(self, reason) 34 self.result = result 35 36 37def set_request_attr(req, name, value, default): 38 try: 39 getattr(req, name) 40 except AttributeError: 41 setattr(req, name, default) 42 if value is not default: 43 setattr(req, name, value) 44 45 46class OpenerDirector(_urllib2_fork.OpenerDirector): 47 48 def __init__(self): 49 _urllib2_fork.OpenerDirector.__init__(self) 50 # really none of these are (sanely) public -- the lack of initial 51 # underscore on some is just due to following urllib2 52 self.process_response = {} 53 self.process_request = {} 54 self._any_request = {} 55 self._any_response = {} 56 self._handler_index_valid = True 57 self._tempfiles = [] 58 59 def add_handler(self, handler): 60 if not hasattr(handler, "add_parent"): 61 raise TypeError("expected BaseHandler instance, got %r" % 62 type(handler)) 63 64 if handler in self.handlers: 65 return 66 # XXX why does self.handlers need to be sorted? 67 bisect.insort(self.handlers, handler) 68 handler.add_parent(self) 69 self._handler_index_valid = False 70 71 def _maybe_reindex_handlers(self): 72 if self._handler_index_valid: 73 return 74 75 handle_error = {} 76 handle_open = {} 77 process_request = {} 78 process_response = {} 79 any_request = set() 80 any_response = set() 81 unwanted = [] 82 83 for handler in self.handlers: 84 added = False 85 for meth in dir(handler): 86 if meth in ["redirect_request", "do_open", "proxy_open"]: 87 # oops, coincidental match 88 continue 89 90 if meth == "any_request": 91 any_request.add(handler) 92 added = True 93 continue 94 elif meth == "any_response": 95 any_response.add(handler) 96 added = True 97 continue 98 99 ii = meth.find("_") 100 scheme = meth[:ii] 101 condition = meth[ii + 1:] 102 103 if condition.startswith("error"): 104 jj = meth[ii + 1:].find("_") + ii + 1 105 kind = meth[jj + 1:] 106 try: 107 kind = int(kind) 108 except ValueError: 109 pass 110 lookup = handle_error.setdefault(scheme, {}) 111 elif condition == "open": 112 kind = scheme 113 lookup = handle_open 114 elif condition == "request": 115 kind = scheme 116 lookup = process_request 117 elif condition == "response": 118 kind = scheme 119 lookup = process_response 120 else: 121 continue 122 123 lookup.setdefault(kind, set()).add(handler) 124 added = True 125 126 if not added: 127 unwanted.append(handler) 128 129 for handler in unwanted: 130 self.handlers.remove(handler) 131 132 # sort indexed methods 133 # XXX could be cleaned up 134 for lookup in [process_request, process_response]: 135 for scheme, handlers in iteritems(lookup): 136 lookup[scheme] = handlers 137 for scheme, lookup in iteritems(handle_error): 138 for code, handlers in iteritems(lookup): 139 handlers = list(handlers) 140 handlers.sort() 141 lookup[code] = handlers 142 for scheme, handlers in iteritems(handle_open): 143 handlers = list(handlers) 144 handlers.sort() 145 handle_open[scheme] = handlers 146 147 # cache the indexes 148 self.handle_error = handle_error 149 self.handle_open = handle_open 150 self.process_request = process_request 151 self.process_response = process_response 152 self._any_request = any_request 153 self._any_response = any_response 154 155 def _request(self, url_or_req, data, visit, 156 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 157 if isstringlike(url_or_req): 158 req = Request(url_or_req, data, visit=visit, timeout=timeout) 159 else: 160 # already a mechanize.Request instance 161 req = url_or_req 162 if data is not None: 163 req.add_data(data) 164 # XXX yuck 165 set_request_attr(req, "visit", visit, None) 166 set_request_attr(req, "timeout", timeout, 167 _sockettimeout._GLOBAL_DEFAULT_TIMEOUT) 168 return req 169 170 def open(self, fullurl, data=None, 171 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 172 req = self._request(fullurl, data, None, timeout) 173 req_scheme = req.get_type() 174 175 self._maybe_reindex_handlers() 176 177 # pre-process request 178 # XXX should we allow a Processor to change the URL scheme 179 # of the request? 180 request_processors = set(self.process_request.get(req_scheme, [])) 181 request_processors.update(self._any_request) 182 request_processors = list(request_processors) 183 request_processors.sort() 184 for processor in request_processors: 185 for meth_name in ["any_request", req_scheme + "_request"]: 186 meth = getattr(processor, meth_name, None) 187 if meth: 188 req = meth(req) 189 190 # In Python >= 2.4, .open() supports processors already, so we must 191 # call ._open() instead. 192 urlopen = _urllib2_fork.OpenerDirector._open 193 response = urlopen(self, req, data) 194 195 # post-process response 196 response_processors = set(self.process_response.get(req_scheme, [])) 197 response_processors.update(self._any_response) 198 response_processors = list(response_processors) 199 response_processors.sort() 200 for processor in response_processors: 201 for meth_name in ["any_response", req_scheme + "_response"]: 202 meth = getattr(processor, meth_name, None) 203 if meth: 204 response = meth(req, response) 205 206 return response 207 208 def error(self, proto, *args): 209 if proto in ['http', 'https']: 210 # XXX http[s] protocols are special-cased 211 # https is not different than http 212 dict = self.handle_error['http'] 213 proto = args[2] # YUCK! 214 meth_name = 'http_error_%s' % proto 215 http_err = 1 216 orig_args = args 217 else: 218 dict = self.handle_error 219 meth_name = proto + '_error' 220 http_err = 0 221 args = (dict, proto, meth_name) + args 222 result = self._call_chain(*args) 223 if result: 224 return result 225 226 if http_err: 227 args = (dict, 'default', 'http_error_default') + orig_args 228 return self._call_chain(*args) 229 230 BLOCK_SIZE = 1024 * 8 231 232 def retrieve(self, fullurl, filename=None, reporthook=None, data=None, 233 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT, 234 open=open_file): 235 """Returns (filename, headers). 236 237 For remote objects, the default filename will refer to a temporary 238 file. Temporary files are removed when the OpenerDirector.close() 239 method is called. 240 241 For file: URLs, at present the returned filename is None. This may 242 change in future. 243 244 If the actual number of bytes read is less than indicated by the 245 Content-Length header, raises ContentTooShortError (a URLError 246 subclass). The exception's .result attribute contains the (filename, 247 headers) that would have been returned. 248 249 """ 250 req = self._request(fullurl, data, False, timeout) 251 scheme = req.get_type() 252 fp = self.open(req) 253 try: 254 headers = fp.info() 255 if filename is None and scheme == 'file': 256 # XXX req.get_selector() seems broken here, return None, 257 # pending sanity :-/ 258 return None, headers 259 # return urllib.url2pathname(req.get_selector()), headers 260 if filename: 261 tfp = open(filename, 'wb') 262 else: 263 path = _rfc3986.urlsplit(req.get_full_url())[2] 264 suffix = os.path.splitext(path)[1] 265 fd, filename = tempfile.mkstemp(suffix) 266 self._tempfiles.append(filename) 267 tfp = os.fdopen(fd, 'wb') 268 try: 269 result = filename, headers 270 bs = self.BLOCK_SIZE 271 size = -1 272 read = 0 273 blocknum = 0 274 if reporthook: 275 if "content-length" in headers: 276 size = int(headers["content-length"]) 277 reporthook(blocknum, bs, size) 278 while 1: 279 block = fp.read(bs) 280 if not block: 281 break 282 read += len(block) 283 tfp.write(block) 284 blocknum += 1 285 if reporthook: 286 reporthook(blocknum, bs, size) 287 finally: 288 tfp.close() 289 finally: 290 fp.close() 291 292 # raise exception if actual size does not match content-length header 293 if size >= 0 and read < size: 294 raise ContentTooShortError( 295 "retrieval incomplete: " 296 "got only %i out of %i bytes" % (read, size), 297 result 298 ) 299 300 return result 301 302 def close(self): 303 _urllib2_fork.OpenerDirector.close(self) 304 305 # make it very obvious this object is no longer supposed to be used 306 self.open = self.error = self.retrieve = self.add_handler = None 307 308 if self._tempfiles: 309 for filename in self._tempfiles: 310 try: 311 os.unlink(filename) 312 except OSError: 313 pass 314 del self._tempfiles[:] 315 316 317def wrapped_open(urlopen, process_response_object, fullurl, data=None, 318 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 319 success = True 320 try: 321 response = urlopen(fullurl, data, timeout) 322 except HTTPError as error: 323 success = False 324 if error.fp is None: # not a response 325 raise 326 response = error 327 328 if response is not None: 329 response = process_response_object(response) 330 331 if not success: 332 raise response 333 return response 334 335 336class ResponseProcessingOpener(OpenerDirector): 337 338 def open(self, fullurl, data=None, 339 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 340 def bound_open(fullurl, data=None, 341 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 342 return OpenerDirector.open(self, fullurl, data, timeout) 343 return wrapped_open( 344 bound_open, self.process_response_object, fullurl, data, timeout) 345 346 def process_response_object(self, response): 347 return response 348 349 350class SeekableResponseOpener(ResponseProcessingOpener): 351 352 def process_response_object(self, response): 353 return _response.seek_wrapped_response(response) 354 355 356class OpenerFactory: 357 """This class's interface is quite likely to change.""" 358 359 default_classes = [ 360 # handlers 361 _urllib2_fork.ProxyHandler, 362 _urllib2_fork.UnknownHandler, 363 _urllib2_fork.HTTPHandler, 364 _urllib2_fork.HTTPDefaultErrorHandler, 365 _urllib2_fork.HTTPRedirectHandler, 366 _urllib2_fork.FTPHandler, 367 _urllib2_fork.FileHandler, 368 # processors 369 _urllib2_fork.HTTPCookieProcessor, 370 _urllib2_fork.HTTPErrorProcessor, 371 ] 372 default_classes.append(_urllib2_fork.HTTPSHandler) 373 handlers = [] 374 replacement_handlers = [] 375 376 def __init__(self, klass=OpenerDirector): 377 self.klass = klass 378 379 def build_opener(self, *handlers): 380 """Create an opener object from a list of handlers and processors. 381 382 The opener will use several default handlers and processors, including 383 support for HTTP and FTP. 384 385 If any of the handlers passed as arguments are subclasses of the 386 default handlers, the default handlers will not be used. 387 388 """ 389 opener = self.klass() 390 default_classes = list(self.default_classes) 391 skip = set() 392 for klass in default_classes: 393 for check in handlers: 394 if is_class(check): 395 if issubclass(check, klass): 396 skip.add(klass) 397 elif isinstance(check, klass): 398 skip.add(klass) 399 for klass in skip: 400 default_classes.remove(klass) 401 402 for klass in default_classes: 403 opener.add_handler(klass()) 404 for h in handlers: 405 if is_class(h): 406 h = h() 407 opener.add_handler(h) 408 409 return opener 410 411 412build_opener = OpenerFactory().build_opener 413 414thread_local = threading.local() 415thread_local.opener = None 416 417 418def get_thread_local_opener(): 419 try: 420 ans = thread_local.opener 421 except AttributeError: 422 # threading module is broken, use a single global instance 423 ans = getattr(get_thread_local_opener, 'ans', None) 424 if ans is None: 425 ans = get_thread_local_opener.ans = build_opener() 426 if ans is None: 427 ans = thread_local.opener = build_opener() 428 return ans 429 430 431def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 432 return get_thread_local_opener().open(url, data, timeout) 433 434 435def urlretrieve(url, filename=None, reporthook=None, data=None, 436 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): 437 return get_thread_local_opener().retrieve( 438 url, filename, reporthook, data, timeout) 439 440 441def install_opener(opener): 442 get_thread_local_opener.ans = opener 443 try: 444 thread_local.opener = opener 445 except AttributeError: 446 pass 447