1""" 2Ported using Python-Future from the Python 3.3 standard library. 3 4An extensible library for opening URLs using a variety of protocols 5 6The simplest way to use this module is to call the urlopen function, 7which accepts a string containing a URL or a Request object (described 8below). It opens the URL and returns the results as file-like 9object; the returned object has some extra methods described below. 10 11The OpenerDirector manages a collection of Handler objects that do 12all the actual work. Each Handler implements a particular protocol or 13option. The OpenerDirector is a composite object that invokes the 14Handlers needed to open the requested URL. For example, the 15HTTPHandler performs HTTP GET and POST requests and deals with 16non-error returns. The HTTPRedirectHandler automatically deals with 17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 18deals with digest authentication. 19 20urlopen(url, data=None) -- Basic usage is the same as original 21urllib. pass the url and optionally data to post to an HTTP URL, and 22get a file-like object back. One difference is that you can also pass 23a Request instance instead of URL. Raises a URLError (subclass of 24IOError); for HTTP errors, raises an HTTPError, which can also be 25treated as a valid response. 26 27build_opener -- Function that creates a new OpenerDirector instance. 28Will install the default handlers. Accepts one or more Handlers as 29arguments, either instances or Handler classes that it will 30instantiate. If one of the argument is a subclass of the default 31handler, the argument will be installed instead of the default. 32 33install_opener -- Installs a new opener as the default opener. 34 35objects of interest: 36 37OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 38the Handler classes, while dealing with requests and responses. 39 40Request -- An object that encapsulates the state of a request. 
The 41state can be as simple as the URL. It can also include extra HTTP 42headers, e.g. a User-Agent. 43 44BaseHandler -- 45 46internals: 47BaseHandler and parent 48_call_chain conventions 49 50Example usage: 51 52import urllib.request 53 54# set up authentication info 55authinfo = urllib.request.HTTPBasicAuthHandler() 56authinfo.add_password(realm='PDQ Application', 57 uri='https://mahler:8092/site-updates.py', 58 user='klem', 59 passwd='geheim$parole') 60 61proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) 62 63# build a new opener that adds authentication and caching FTP handlers 64opener = urllib.request.build_opener(proxy_support, authinfo, 65 urllib.request.CacheFTPHandler) 66 67# install it 68urllib.request.install_opener(opener) 69 70f = urllib.request.urlopen('http://www.python.org/') 71""" 72 73# XXX issues: 74# If an authentication error handler that tries to perform 75# authentication for some reason but fails, how should the error be 76# signalled? The client needs to know the HTTP error code. But if 77# the handler knows that the problem was, e.g., that it didn't know 78# that hash algo that requested in the challenge, it would be good to 79# pass that information along to the client, too. 80# ftp errors aren't handled cleanly 81# check digest against correct (i.e. 
non-apache) implementation 82 83# Possible extensions: 84# complex proxies XXX not sure what exactly was meant by this 85# abstract factory for opener 86 87from __future__ import absolute_import, division, print_function, unicode_literals 88from future.builtins import bytes, dict, filter, input, int, map, open, str 89from future.utils import PY2, PY3, raise_with_traceback 90 91import base64 92import bisect 93import hashlib 94import array 95 96from future.backports import email 97from future.backports.http import client as http_client 98from .error import URLError, HTTPError, ContentTooShortError 99from .parse import ( 100 urlparse, urlsplit, urljoin, unwrap, quote, unquote, 101 splittype, splithost, splitport, splituser, splitpasswd, 102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse) 103from .response import addinfourl, addclosehook 104 105import io 106import os 107import posixpath 108import re 109import socket 110import sys 111import time 112import collections 113import tempfile 114import contextlib 115import warnings 116 117# check for SSL 118try: 119 import ssl 120 # Not available in the SSL module in Py2: 121 from ssl import SSLContext 122except ImportError: 123 _have_ssl = False 124else: 125 _have_ssl = True 126 127__all__ = [ 128 # Classes 129 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', 130 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', 131 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 132 'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 133 'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 134 'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 135 'UnknownHandler', 'HTTPErrorProcessor', 136 # Functions 137 'urlopen', 'install_opener', 'build_opener', 138 'pathname2url', 'url2pathname', 'getproxies', 139 # Legacy interface 140 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener', 141] 142 143# used in User-Agent 
# Version string advertised in the default User-Agent header.
__version__ = sys.version[:3]

_opener = None


def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, **_3to2kwargs):
    """Open *url* (a string or a Request) and return a file-like response.

    The SSL-related options ``cafile``, ``capath`` and ``cadefault`` are
    pulled out of the catch-all kwargs dict; this emulates Python 3
    keyword-only parameters for the Python 2 backport.
    """
    cadefault = _3to2kwargs.pop('cadefault', False)
    capath = _3to2kwargs.pop('capath', None)
    cafile = _3to2kwargs.pop('cafile', None)
    global _opener
    if cafile or capath or cadefault:
        # A certificate source was requested: build a one-off opener with
        # an HTTPS handler that verifies the server certificate.
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        context.verify_mode = ssl.CERT_REQUIRED
        if cafile or capath:
            context.load_verify_locations(cafile, capath)
        else:
            context.set_default_verify_paths()
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        # First plain call: build the default opener and cache it globally.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)


def install_opener(opener):
    """Make *opener* the global default used by urlopen()."""
    global _opener
    _opener = opener


_url_tempfiles = []


def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Write either to the caller-supplied filename or to a fresh
        # NamedTemporaryFile that urlcleanup() can delete later.
        if filename:
            out_fp = open(filename, 'wb')
        else:
            out_fp = tempfile.NamedTemporaryFile(delete=False)
            filename = out_fp.name
            _url_tempfiles.append(filename)

        with out_fp:
            result = filename, headers
            block_size = 1024 * 8
            size = -1
            bytes_copied = 0
            block_num = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                reporthook(block_num, block_size, size)

            while True:
                chunk = fp.read(block_size)
                if not chunk:
                    break
                bytes_copied += len(chunk)
                out_fp.write(chunk)
                block_num += 1
                if reporthook:
                    reporthook(block_num, block_size, size)

    # A short read against a declared Content-Length is an error.
    if size >= 0 and bytes_copied < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (bytes_copied, size), result)

    return result


def urlcleanup():
    """Delete files created by urlretrieve and reset the cached opener."""
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except EnvironmentError:
            # Best effort: the file may already be gone.
            pass

    del _url_tempfiles[:]
    global _opener
    if _opener:
        _opener = None


# re.ASCII exists only on Python 3; the Py2 pattern behaves the same for
# the digits-only port suffix being stripped.
if PY3:
    _cut_port_re = re.compile(r":\d+$", re.ASCII)
else:
    _cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    netloc = urlparse(request.full_url)[1]
    if netloc == "":
        # Relative or schemeless URL: fall back to the Host header.
        netloc = request.get_header("Host", "")

    # Strip a trailing :port, if present.
    netloc = _cut_port_re.sub("", netloc, 1)
    return netloc.lower()


class Request(object):
    """Encapsulate the state of a single URL request.

    Holds the URL, optional request body, headers (normal and
    unredirected), origin host and HTTP method.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        # Split the URL into scheme, host and selector; reject URLs with
        # no scheme at all.
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        return "POST" if self.data is not None else "GET"

    def get_full_url(self):
        """Return the URL, re-attaching any fragment that was split off."""
        if not self.fragment:
            return self.full_url
        return '%s#%s' % (self.full_url, self.fragment)

    # Begin deprecated methods

    def add_data(self, data):
        msg = "Request.add_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        self.data = data

    def has_data(self):
        msg = "Request.has_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data is not None

    def get_data(self):
        msg = "Request.get_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data

    def get_type(self):
        msg = "Request.get_type method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.type

    def get_host(self):
        msg = "Request.get_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.host

    def get_selector(self):
        msg = "Request.get_selector method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.selector

    def is_unverifiable(self):
        msg = "Request.is_unverifiable method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.unverifiable

    def get_origin_req_host(self):
        msg = "Request.get_origin_req_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through a proxy at *host* of scheme *type*."""
        if self.type == 'https' and not self._tunnel_host:
            # CONNECT tunnelling: remember the real host, keep the selector.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """True once set_proxy() has rewritten the selector."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """True if the header is set, redirected or not."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Look up a header, falling back to the unredirected set."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """All headers as a list of (name, value); normal headers win."""
        merged = self.unredirected_hdrs.copy()
        merged.update(self.headers)
        return list(merged.items())
class OpenerDirector(object):
    """Manage a chain of Handler objects and drive request processing."""

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # Per-protocol registries the handlers are sorted into.
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            sep = meth.find("_")
            protocol = meth[:sep]
            condition = meth[sep + 1:]

            if condition.startswith("error"):
                # e.g. http_error_404 -> protocol 'http', kind 404
                j = condition.find("_") + sep + 1
                kind = meth[j + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.setdefault(protocol, {})
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            bucket = lookup.setdefault(kind, [])
            if bucket:
                # Keep handlers sorted by handler_order.
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could. Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            method = getattr(handler, meth_name)
            outcome = method(*args)
            if outcome is not None:
                return outcome

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """
        Accept a URL or a Request object

        Python-Future: if the URL is passed as a byte-string, decode it first.
        """
        if isinstance(fullurl, bytes):
            fullurl = fullurl.decode()
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # Run the protocol's request pre-processors.
        meth_name = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, meth_name)(req)

        response = self._open(req, data)

        # Run the protocol's response post-processors.
        meth_name = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, meth_name)(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get the first shot...
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        # ...then the protocol-specific ones...
        protocol = req.type
        result = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if result:
            return result

        # ...and finally the catch-all unknown_open handlers.
        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            handler_map = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            is_http = True
            orig_args = args
        else:
            handler_map = self.handle_error
            meth_name = proto + '_error'
            is_http = False
        args = (handler_map, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if is_http:
            # Nothing handled the specific code: try the default handlers.
            args = (handler_map, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)


# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # Covers new-style classes and (on Py2) old-style classes.
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http_client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A user-supplied handler (class or instance) displaces the matching
    # default class.
    skip = set()
    for default_class in default_classes:
        for candidate in handlers:
            if isclass(candidate):
                if issubclass(candidate, default_class):
                    skip.add(default_class)
            elif isinstance(candidate, default_class):
                skip.add(default_class)
    for default_class in skip:
        default_classes.remove(default_class)

    for default_class in default_classes:
        opener.add_handler(default_class())

    for handler in handlers:
        if isclass(handler):
            handler = handler()
        opener.add_handler(handler)
    return opener
class BaseHandler(object):
    """Common base for all protocol/error handlers."""

    # Handlers are invoked in ascending handler_order.
    handler_order = 500

    def add_parent(self, parent):
        """Record the owning OpenerDirector."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000      # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response


class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort error handler: turn any HTTP error into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)


class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        redirectable = (
            (code in (301, 302, 303, 307) and m in ("GET", "HEAD")) or
            (code in (301, 302, 303) and m == "POST"))
        if not redirectable:
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen. Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, rest = splittype(proxy)
    if not rest.startswith("/"):
        # Bare authority, no scheme.
        scheme = None
        authority = proxy
    else:
        # Full URL form.
        if not rest.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.
        # and 3.3.), path is empty or starts with '/'
        end = rest.find("/", 2)
        if end == -1:
            end = None
        authority = rest[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport


class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Install one <scheme>_open method per configured proxy; default
        # arguments freeze the current loop values into each lambda.
        for scheme, url in proxies.items():
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy*, or defer to other handlers."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)


class HTTPPasswordMgr(object):
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Index under both the defaulted-port and verbatim forms.
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to realm None as a wildcard."""

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        # No exact realm match: try credentials registered for any realm.
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler(object):

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if challenge:
            scheme = challenge.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(challenge)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"', "'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        if response and response.code != 401:
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with Basic credentials for *realm*, if known."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
        if req.headers.get(self.auth_header, None) == auth:
            # We already sent exactly these credentials: give up.
            return None
        req.add_unredirected_header(self.auth_header, auth)
        return self.parent.open(req, timeout=req.timeout)


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        self.reset_retry_count()
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority. Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
1004 authority = req.host 1005 response = self.http_error_auth_reqed('proxy-authenticate', 1006 authority, req, headers) 1007 self.reset_retry_count() 1008 return response 1009 1010 1011# Return n random bytes. 1012_randombytes = os.urandom 1013 1014 1015class AbstractDigestAuthHandler(object): 1016 # Digest authentication is specified in RFC 2617. 1017 1018 # XXX The client does not inspect the Authentication-Info header 1019 # in a successful response. 1020 1021 # XXX It should be possible to test this implementation against 1022 # a mock server that just generates a static set of challenges. 1023 1024 # XXX qop="auth-int" supports is shaky 1025 1026 def __init__(self, passwd=None): 1027 if passwd is None: 1028 passwd = HTTPPasswordMgr() 1029 self.passwd = passwd 1030 self.add_password = self.passwd.add_password 1031 self.retried = 0 1032 self.nonce_count = 0 1033 self.last_nonce = None 1034 1035 def reset_retry_count(self): 1036 self.retried = 0 1037 1038 def http_error_auth_reqed(self, auth_header, host, req, headers): 1039 authreq = headers.get(auth_header, None) 1040 if self.retried > 5: 1041 # Don't fail endlessly - if we failed once, we'll probably 1042 # fail a second time. Hm. Unless the Password Manager is 1043 # prompting for the information. Crap. 
This isn't great 1044 # but it's better than the current 'repeat until recursion 1045 # depth exceeded' approach <wink> 1046 raise HTTPError(req.full_url, 401, "digest auth failed", 1047 headers, None) 1048 else: 1049 self.retried += 1 1050 if authreq: 1051 scheme = authreq.split()[0] 1052 if scheme.lower() == 'digest': 1053 return self.retry_http_digest_auth(req, authreq) 1054 elif scheme.lower() != 'basic': 1055 raise ValueError("AbstractDigestAuthHandler does not support" 1056 " the following scheme: '%s'" % scheme) 1057 1058 def retry_http_digest_auth(self, req, auth): 1059 token, challenge = auth.split(' ', 1) 1060 chal = parse_keqv_list(filter(None, parse_http_list(challenge))) 1061 auth = self.get_authorization(req, chal) 1062 if auth: 1063 auth_val = 'Digest %s' % auth 1064 if req.headers.get(self.auth_header, None) == auth_val: 1065 return None 1066 req.add_unredirected_header(self.auth_header, auth_val) 1067 resp = self.parent.open(req, timeout=req.timeout) 1068 return resp 1069 1070 def get_cnonce(self, nonce): 1071 # The cnonce-value is an opaque 1072 # quoted string value provided by the client and used by both client 1073 # and server to avoid chosen plaintext attacks, to provide mutual 1074 # authentication, and to provide some message integrity protection. 1075 # This isn't a fabulous effort, but it's probably Good Enough. 
1076 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) 1077 b = s.encode("ascii") + _randombytes(8) 1078 dig = hashlib.sha1(b).hexdigest() 1079 return dig[:16] 1080 1081 def get_authorization(self, req, chal): 1082 try: 1083 realm = chal['realm'] 1084 nonce = chal['nonce'] 1085 qop = chal.get('qop') 1086 algorithm = chal.get('algorithm', 'MD5') 1087 # mod_digest doesn't send an opaque, even though it isn't 1088 # supposed to be optional 1089 opaque = chal.get('opaque', None) 1090 except KeyError: 1091 return None 1092 1093 H, KD = self.get_algorithm_impls(algorithm) 1094 if H is None: 1095 return None 1096 1097 user, pw = self.passwd.find_user_password(realm, req.full_url) 1098 if user is None: 1099 return None 1100 1101 # XXX not implemented yet 1102 if req.data is not None: 1103 entdig = self.get_entity_digest(req.data, chal) 1104 else: 1105 entdig = None 1106 1107 A1 = "%s:%s:%s" % (user, realm, pw) 1108 A2 = "%s:%s" % (req.get_method(), 1109 # XXX selector: what about proxies and full urls 1110 req.selector) 1111 if qop == 'auth': 1112 if nonce == self.last_nonce: 1113 self.nonce_count += 1 1114 else: 1115 self.nonce_count = 1 1116 self.last_nonce = nonce 1117 ncvalue = '%08x' % self.nonce_count 1118 cnonce = self.get_cnonce(nonce) 1119 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) 1120 respdig = KD(H(A1), noncebit) 1121 elif qop is None: 1122 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1123 else: 1124 # XXX handle auth-int. 1125 raise URLError("qop '%s' is not supported." % qop) 1126 1127 # XXX should the partial digests be encoded too? 
1128 1129 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1130 'response="%s"' % (user, realm, nonce, req.selector, 1131 respdig) 1132 if opaque: 1133 base += ', opaque="%s"' % opaque 1134 if entdig: 1135 base += ', digest="%s"' % entdig 1136 base += ', algorithm="%s"' % algorithm 1137 if qop: 1138 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1139 return base 1140 1141 def get_algorithm_impls(self, algorithm): 1142 # lambdas assume digest modules are imported at the top level 1143 if algorithm == 'MD5': 1144 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() 1145 elif algorithm == 'SHA': 1146 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() 1147 # XXX MD5-sess 1148 KD = lambda s, d: H("%s:%s" % (s, d)) 1149 return H, KD 1150 1151 def get_entity_digest(self, data, chal): 1152 # XXX not implemented yet 1153 return None 1154 1155 1156class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1157 """An authentication protocol defined by RFC 2069 1158 1159 Digest authentication improves on basic authentication because it 1160 does not transmit passwords in the clear. 
1161 """ 1162 1163 auth_header = 'Authorization' 1164 handler_order = 490 # before Basic auth 1165 1166 def http_error_401(self, req, fp, code, msg, headers): 1167 host = urlparse(req.full_url)[1] 1168 retry = self.http_error_auth_reqed('www-authenticate', 1169 host, req, headers) 1170 self.reset_retry_count() 1171 return retry 1172 1173 1174class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1175 1176 auth_header = 'Proxy-Authorization' 1177 handler_order = 490 # before Basic auth 1178 1179 def http_error_407(self, req, fp, code, msg, headers): 1180 host = req.host 1181 retry = self.http_error_auth_reqed('proxy-authenticate', 1182 host, req, headers) 1183 self.reset_retry_count() 1184 return retry 1185 1186class AbstractHTTPHandler(BaseHandler): 1187 1188 def __init__(self, debuglevel=0): 1189 self._debuglevel = debuglevel 1190 1191 def set_http_debuglevel(self, level): 1192 self._debuglevel = level 1193 1194 def do_request_(self, request): 1195 host = request.host 1196 if not host: 1197 raise URLError('no host given') 1198 1199 if request.data is not None: # POST 1200 data = request.data 1201 if isinstance(data, str): 1202 msg = "POST data should be bytes or an iterable of bytes. " \ 1203 "It cannot be of type str." 1204 raise TypeError(msg) 1205 if not request.has_header('Content-type'): 1206 request.add_unredirected_header( 1207 'Content-type', 1208 'application/x-www-form-urlencoded') 1209 if not request.has_header('Content-length'): 1210 size = None 1211 try: 1212 ### For Python-Future: 1213 if PY2 and isinstance(data, array.array): 1214 # memoryviews of arrays aren't supported 1215 # in Py2.7. (e.g. memoryview(array.array('I', 1216 # [1, 2, 3, 4])) raises a TypeError.) 
1217 # So we calculate the size manually instead: 1218 size = len(data) * data.itemsize 1219 ### 1220 else: 1221 mv = memoryview(data) 1222 size = len(mv) * mv.itemsize 1223 except TypeError: 1224 if isinstance(data, collections.Iterable): 1225 raise ValueError("Content-Length should be specified " 1226 "for iterable data of type %r %r" % (type(data), 1227 data)) 1228 else: 1229 request.add_unredirected_header( 1230 'Content-length', '%d' % size) 1231 1232 sel_host = host 1233 if request.has_proxy(): 1234 scheme, sel = splittype(request.selector) 1235 sel_host, sel_path = splithost(sel) 1236 if not request.has_header('Host'): 1237 request.add_unredirected_header('Host', sel_host) 1238 for name, value in self.parent.addheaders: 1239 name = name.capitalize() 1240 if not request.has_header(name): 1241 request.add_unredirected_header(name, value) 1242 1243 return request 1244 1245 def do_open(self, http_class, req, **http_conn_args): 1246 """Return an HTTPResponse object for the request, using http_class. 1247 1248 http_class must implement the HTTPConnection API from http.client. 1249 """ 1250 host = req.host 1251 if not host: 1252 raise URLError('no host given') 1253 1254 # will parse host:port 1255 h = http_class(host, timeout=req.timeout, **http_conn_args) 1256 1257 headers = dict(req.unredirected_hdrs) 1258 headers.update(dict((k, v) for k, v in req.headers.items() 1259 if k not in headers)) 1260 1261 # TODO(jhylton): Should this be redesigned to handle 1262 # persistent connections? 1263 1264 # We want to make an HTTP/1.1 request, but the addinfourl 1265 # class isn't prepared to deal with a persistent connection. 1266 # It will try to read all remaining data from the socket, 1267 # which will block while the server waits for the next request. 1268 # So make sure the connection gets closed after the (only) 1269 # request. 
1270 headers["Connection"] = "close" 1271 headers = dict((name.title(), val) for name, val in headers.items()) 1272 1273 if req._tunnel_host: 1274 tunnel_headers = {} 1275 proxy_auth_hdr = "Proxy-Authorization" 1276 if proxy_auth_hdr in headers: 1277 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1278 # Proxy-Authorization should not be sent to origin 1279 # server. 1280 del headers[proxy_auth_hdr] 1281 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1282 1283 try: 1284 h.request(req.get_method(), req.selector, req.data, headers) 1285 except socket.error as err: # timeout error 1286 h.close() 1287 raise URLError(err) 1288 else: 1289 r = h.getresponse() 1290 # If the server does not send us a 'Connection: close' header, 1291 # HTTPConnection assumes the socket should be left open. Manually 1292 # mark the socket to be closed when this response object goes away. 1293 if h.sock: 1294 h.sock.close() 1295 h.sock = None 1296 1297 1298 r.url = req.get_full_url() 1299 # This line replaces the .msg attribute of the HTTPResponse 1300 # with .headers, because urllib clients expect the response to 1301 # have the reason in .msg. It would be good to mark this 1302 # attribute is deprecated and get then to use info() or 1303 # .headers. 
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http_client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

# HTTPS support is only available when http.client was built with SSL.
if hasattr(http_client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # context/check_hostname are forwarded to HTTPSConnection.
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http_client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import future.backports.http.cookiejar as http_cookiejar
        if cookiejar is None:
            cookiejar = http_cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Attach stored cookies to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Store any cookies set by the response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip one level of surrounding double quotes, if present.
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
A quoted-string could 1370 contain a comma. A non-quoted string could have quotes in the 1371 middle. Neither commas nor quotes count if they are escaped. 1372 Only double-quotes count, not single-quotes. 1373 """ 1374 res = [] 1375 part = '' 1376 1377 escape = quote = False 1378 for cur in s: 1379 if escape: 1380 part += cur 1381 escape = False 1382 continue 1383 if quote: 1384 if cur == '\\': 1385 escape = True 1386 continue 1387 elif cur == '"': 1388 quote = False 1389 part += cur 1390 continue 1391 1392 if cur == ',': 1393 res.append(part) 1394 part = '' 1395 continue 1396 1397 if cur == '"': 1398 quote = True 1399 1400 part += cur 1401 1402 # append last part 1403 if part: 1404 res.append(part) 1405 1406 return [part.strip() for part in res] 1407 1408class FileHandler(BaseHandler): 1409 # Use local file or FTP depending on form of URL 1410 def file_open(self, req): 1411 url = req.selector 1412 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1413 req.host != 'localhost'): 1414 if not req.host is self.get_names(): 1415 raise URLError("file:// scheme is supported only on localhost") 1416 else: 1417 return self.open_local_file(req) 1418 1419 # names for the localhost 1420 names = None 1421 def get_names(self): 1422 if FileHandler.names is None: 1423 try: 1424 FileHandler.names = tuple( 1425 socket.gethostbyname_ex('localhost')[2] + 1426 socket.gethostbyname_ex(socket.gethostname())[2]) 1427 except socket.gaierror: 1428 FileHandler.names = (socket.gethostbyname('localhost'),) 1429 return FileHandler.names 1430 1431 # not entirely sure what the rules are here 1432 def open_local_file(self, req): 1433 import future.backports.email.utils as email_utils 1434 import mimetypes 1435 host = req.host 1436 filename = req.selector 1437 localfile = url2pathname(filename) 1438 try: 1439 stats = os.stat(localfile) 1440 size = stats.st_size 1441 modified = email_utils.formatdate(stats.st_mtime, usegmt=True) 1442 mtype = mimetypes.guess_type(filename)[0] 1443 headers = 
            if host:
                host, port = splitport(host)
            # Serve the file only when the URL names no host at all, or
            # names this machine (without an explicit port).
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    # Like socket.gethostbyname() but returns None instead of raising
    # on resolution failure.
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp:// request and return an addinfourl for the
        retrieved file or directory listing."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary) for files, 'D' (directory listing) otherwise,
            # unless a ';type=' attribute in the URL overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise_with_traceback(exc)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot connection; CacheFTPHandler overrides this to reuse
        # connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> live ftpwrapper connection
        self.timeout = {}     # key -> absolute expiry time
        self.soonest = 0      # earliest expiry among cached connections
        self.delay = 60       # idle lifetime of a cached connection, seconds
        self.max_conns = 16   # hard cap on simultaneously cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating (and
        caching) one if needed; refreshes the expiry either way."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Evict the connection that would expire first.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

    def clear_cache(self):
        # Close and drop every cached connection.
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()


# Code move from the old urllib module

MAXFTPCACHE = 10    # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-level FTP connection cache shared by URLopener instances.
ftpcache = {}
class URLopener(object):
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # Deprecated API: warn callers to migrate to urlopen/build_opener.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
" \ 1619 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1620 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1621 if proxies is None: 1622 proxies = getproxies() 1623 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1624 self.proxies = proxies 1625 self.key_file = x509.get('key_file') 1626 self.cert_file = x509.get('cert_file') 1627 self.addheaders = [('User-Agent', self.version)] 1628 self.__tempfiles = [] 1629 self.__unlink = os.unlink # See cleanup() 1630 self.tempcache = None 1631 # Undocumented feature: if you assign {} to tempcache, 1632 # it is used to cache files retrieved with 1633 # self.retrieve(). This is not enabled by default 1634 # since it does not work for changing documents (and I 1635 # haven't got the logic to check expiration headers 1636 # yet). 1637 self.ftpcache = ftpcache 1638 # Undocumented feature: you can use a different 1639 # ftp cache by assigning to the .ftpcache member; 1640 # in case you want logically independent URL openers 1641 # XXX This is not threadsafe. Bah. 1642 1643 def __del__(self): 1644 self.close() 1645 1646 def close(self): 1647 self.cleanup() 1648 1649 def cleanup(self): 1650 # This code sometimes runs when the rest of this module 1651 # has already been deleted, so it can't use any globals 1652 # or import anything. 1653 if self.__tempfiles: 1654 for file in self.__tempfiles: 1655 try: 1656 self.__unlink(file) 1657 except OSError: 1658 pass 1659 del self.__tempfiles[:] 1660 if self.tempcache: 1661 self.tempcache.clear() 1662 1663 def addheader(self, *args): 1664 """Add a header to be used by the HTTP interface only 1665 e.g. 
u.addheader('Accept', 'sound/basic')""" 1666 self.addheaders.append(args) 1667 1668 # External interface 1669 def open(self, fullurl, data=None): 1670 """Use URLopener().open(file) instead of open(file, 'r').""" 1671 fullurl = unwrap(to_bytes(fullurl)) 1672 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1673 if self.tempcache and fullurl in self.tempcache: 1674 filename, headers = self.tempcache[fullurl] 1675 fp = open(filename, 'rb') 1676 return addinfourl(fp, headers, fullurl) 1677 urltype, url = splittype(fullurl) 1678 if not urltype: 1679 urltype = 'file' 1680 if urltype in self.proxies: 1681 proxy = self.proxies[urltype] 1682 urltype, proxyhost = splittype(proxy) 1683 host, selector = splithost(proxyhost) 1684 url = (host, fullurl) # Signal special case to open_*() 1685 else: 1686 proxy = None 1687 name = 'open_' + urltype 1688 self.type = urltype 1689 name = name.replace('-', '_') 1690 if not hasattr(self, name): 1691 if proxy: 1692 return self.open_unknown_proxy(proxy, fullurl, data) 1693 else: 1694 return self.open_unknown(fullurl, data) 1695 try: 1696 if data is None: 1697 return getattr(self, name)(url) 1698 else: 1699 return getattr(self, name)(url, data) 1700 except HTTPError: 1701 raise 1702 except socket.error as msg: 1703 raise_with_traceback(IOError('socket error', msg)) 1704 1705 def open_unknown(self, fullurl, data=None): 1706 """Overridable interface to open unknown URL type.""" 1707 type, url = splittype(fullurl) 1708 raise IOError('url error', 'unknown url type', type) 1709 1710 def open_unknown_proxy(self, proxy, fullurl, data=None): 1711 """Overridable interface to open unknown URL type.""" 1712 type, url = splittype(fullurl) 1713 raise IOError('url error', 'invalid proxy for %s' % type, proxy) 1714 1715 # External interface 1716 def retrieve(self, url, filename=None, reporthook=None, data=None): 1717 """retrieve(url) returns (filename, headers) for a local object 1718 or (tempfilename, headers) for a remote object.""" 1719 url = 
unwrap(to_bytes(url)) 1720 if self.tempcache and url in self.tempcache: 1721 return self.tempcache[url] 1722 type, url1 = splittype(url) 1723 if filename is None and (not type or type == 'file'): 1724 try: 1725 fp = self.open_local_file(url1) 1726 hdrs = fp.info() 1727 fp.close() 1728 return url2pathname(splithost(url1)[1]), hdrs 1729 except IOError as msg: 1730 pass 1731 fp = self.open(url, data) 1732 try: 1733 headers = fp.info() 1734 if filename: 1735 tfp = open(filename, 'wb') 1736 else: 1737 import tempfile 1738 garbage, path = splittype(url) 1739 garbage, path = splithost(path or "") 1740 path, garbage = splitquery(path or "") 1741 path, garbage = splitattr(path or "") 1742 suffix = os.path.splitext(path)[1] 1743 (fd, filename) = tempfile.mkstemp(suffix) 1744 self.__tempfiles.append(filename) 1745 tfp = os.fdopen(fd, 'wb') 1746 try: 1747 result = filename, headers 1748 if self.tempcache is not None: 1749 self.tempcache[url] = result 1750 bs = 1024*8 1751 size = -1 1752 read = 0 1753 blocknum = 0 1754 if "content-length" in headers: 1755 size = int(headers["Content-Length"]) 1756 if reporthook: 1757 reporthook(blocknum, bs, size) 1758 while 1: 1759 block = fp.read(bs) 1760 if not block: 1761 break 1762 read += len(block) 1763 tfp.write(block) 1764 blocknum += 1 1765 if reporthook: 1766 reporthook(blocknum, bs, size) 1767 finally: 1768 tfp.close() 1769 finally: 1770 fp.close() 1771 1772 # raise exception if actual size does not match content-length header 1773 if size >= 0 and read < size: 1774 raise ContentTooShortError( 1775 "retrieval incomplete: got only %i out of %i bytes" 1776 % (read, size), result) 1777 1778 return result 1779 1780 # Each method named open_<type> knows how to open that type of URL 1781 1782 def _open_generic_http(self, connection_factory, url, data): 1783 """Make an HTTP connection using connection_class. 1784 1785 This is an internal method that should be called from 1786 open_http() or open_https(). 

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: url is 'host/path' (scheme already stripped).
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full-url) pair.
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise IOError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http_client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http_client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Factory passing the opener's client-certificate data through.
            return http_client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import future.backports.email.utils as email_utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize response headers from the file's metadata.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
= email.message_from_string( 1932 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 1933 (mtype or 'text/plain', size, modified)) 1934 if not host: 1935 urlfile = file 1936 if file[:1] == '/': 1937 urlfile = 'file://' + file 1938 return addinfourl(open(localname, 'rb'), headers, urlfile) 1939 host, port = splitport(host) 1940 if (not port 1941 and socket.gethostbyname(host) in ((localhost(),) + thishost())): 1942 urlfile = file 1943 if file[:1] == '/': 1944 urlfile = 'file://' + file 1945 elif file[:2] == './': 1946 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url) 1947 return addinfourl(open(localname, 'rb'), headers, urlfile) 1948 raise URLError('local file error: not on local host') 1949 1950 def open_ftp(self, url): 1951 """Use FTP protocol.""" 1952 if not isinstance(url, str): 1953 raise URLError('ftp error: proxy support for ftp protocol currently not implemented') 1954 import mimetypes 1955 host, path = splithost(url) 1956 if not host: raise URLError('ftp error: no host given') 1957 host, port = splitport(host) 1958 user, host = splituser(host) 1959 if user: user, passwd = splitpasswd(user) 1960 else: passwd = None 1961 host = unquote(host) 1962 user = unquote(user or '') 1963 passwd = unquote(passwd or '') 1964 host = socket.gethostbyname(host) 1965 if not port: 1966 import ftplib 1967 port = ftplib.FTP_PORT 1968 else: 1969 port = int(port) 1970 path, attrs = splitattr(path) 1971 path = unquote(path) 1972 dirs = path.split('/') 1973 dirs, file = dirs[:-1], dirs[-1] 1974 if dirs and not dirs[0]: dirs = dirs[1:] 1975 if dirs and not dirs[0]: dirs[0] = '/' 1976 key = user, host, port, '/'.join(dirs) 1977 # XXX thread unsafe! 
1978 if len(self.ftpcache) > MAXFTPCACHE: 1979 # Prune the cache, rather arbitrarily 1980 for k in self.ftpcache.keys(): 1981 if k != key: 1982 v = self.ftpcache[k] 1983 del self.ftpcache[k] 1984 v.close() 1985 try: 1986 if key not in self.ftpcache: 1987 self.ftpcache[key] = \ 1988 ftpwrapper(user, passwd, host, port, dirs) 1989 if not file: type = 'D' 1990 else: type = 'I' 1991 for attr in attrs: 1992 attr, value = splitvalue(attr) 1993 if attr.lower() == 'type' and \ 1994 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1995 type = value.upper() 1996 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 1997 mtype = mimetypes.guess_type("ftp:" + url)[0] 1998 headers = "" 1999 if mtype: 2000 headers += "Content-Type: %s\n" % mtype 2001 if retrlen is not None and retrlen >= 0: 2002 headers += "Content-Length: %d\n" % retrlen 2003 headers = email.message_from_string(headers) 2004 return addinfourl(fp, headers, "ftp:" + url) 2005 except ftperrors() as exp: 2006 raise_with_traceback(URLError('ftp error %r' % exp)) 2007 2008 def open_data(self, url, data=None): 2009 """Use "data" URL.""" 2010 if not isinstance(url, str): 2011 raise URLError('data error: proxy support for data protocol currently not implemented') 2012 # ignore POSTed data 2013 # 2014 # syntax of data URLs: 2015 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 2016 # mediatype := [ type "/" subtype ] *( ";" parameter ) 2017 # data := *urlchar 2018 # parameter := attribute "=" value 2019 try: 2020 [type, data] = url.split(',', 1) 2021 except ValueError: 2022 raise IOError('data error', 'bad data URL') 2023 if not type: 2024 type = 'text/plain;charset=US-ASCII' 2025 semi = type.rfind(';') 2026 if semi >= 0 and '=' not in type[semi:]: 2027 encoding = type[semi+1:] 2028 type = type[:semi] 2029 else: 2030 encoding = '' 2031 msg = [] 2032 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 2033 time.gmtime(time.time()))) 2034 msg.append('Content-type: %s' % type) 2035 if encoding == 'base64': 
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        # The whole synthesized message (headers + body) is parsed once for
        # the header object and also served verbatim as the file payload.
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # auth_cache maps "realm@host" -> (user, passwd); tries/maxtries
        # bound redirect recursion in http_error_302().
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception.

        Unlike URLopener, hand the error page back to the caller as a
        response object (with the error code attached).
        """
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        # Guard against redirect loops: after maxtries hops, report a
        # synthetic 500 instead of recursing forever.
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect: prefer Location, fall back to URI; give up
        # (return None) if neither header is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each URLopener.http_error_default call below raises HTTPError,
        # so these act as early exits for unsupported challenges.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_<scheme>_basic_auth for the current URL type.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same shape as http_error_401, but keyed on Proxy-Authenticate and
        # dispatching to retry_proxy_<scheme>_basic_auth.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:pass credentials into
        # the http proxy URL (not into the target URL itself).
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Https twin of retry_proxy_http_basic_auth.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:pass embedded in the URL authority.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # Https twin of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        # Cached credential lookup; a truthy clear_cache (e.g. when stale
        # credentials were found embedded in the URL) forces a re-prompt.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    # Resolved once and memoized in the module-level _localhost.
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP addresses of the current host."""
    # Memoized; falls back to 'localhost' if our own name doesn't resolve.
    global _thishost
    if _thishost is None:
        try:
            _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            _thishost = tuple(socket.gethostbyname_ex('localhost')[2])
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper(object):
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # refcount tracks outstanding retrieved file objects; keepalive
        # keeps the control connection open between retrievals.
        self.refcount = 0
        self.keepalive = persistent
        self.init()

    def init(self):
        # (Re)establish the FTP control connection and cd into self.dirs.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving `file` (or a directory listing) and return
        (file-like object, length-or-None)."""
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection may have timed out; reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" -- fall through and try a
                # directory listing instead; anything else is a real error.
                if str(reason)[:3] != '550':
                    raise_with_traceback(URLError('ftp error: %r' % reason))
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        ### Was:
                        # raise URLError('ftp error: %r' % reason) from reason
                        exc = URLError('ftp error: %r' % reason)
                        exc.__cause__ = reason
                        raise exc
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() will be invoked when the returned object is closed,
        # decrementing refcount (see below).
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        # Stop keeping the connection alive; tear it down now only if no
        # retrieved file objects are still open.
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
    for name in no_proxy_list:
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0


# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly partial) dotted-quad into a 32-bit integer,
        # zero-filling missing trailing octets.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry: compare our resolved IP against the network,
            # deriving the mask from the prefix length (or the number of
            # dotted components when no /NN suffix is given).
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False


if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        # Environment variables, when present, take precedence over the
        # system configuration.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the registry glob into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment