1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below).  It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, that argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('https://www.python.org/')
68"""
69
70# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't recognize the hash algorithm requested in
# the challenge, it would be good to pass that information along to
# the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
110# check for SSL
111try:
112    import ssl
113except ImportError:
114    _have_ssl = False
115else:
116    _have_ssl = True
117
118__all__ = [
119    # Classes
120    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
121    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
122    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
123    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
124    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
125    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
126    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
127    'UnknownHandler', 'HTTPErrorProcessor',
128    # Functions
129    'urlopen', 'install_opener', 'build_opener',
130    'pathname2url', 'url2pathname', 'getproxies',
131    # Legacy interface
132    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
133]
134
135# used in User-Agent header sent
136__version__ = '%d.%d' % sys.version_info[:2]
137
138_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.


    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # warnings is imported at module level; the former function-local
        # "import warnings" here was redundant and has been removed.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a default verifying client context from the given CA info.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily create and cache the process-wide default opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
215
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen()."""
    global _opener
    _opener = opener
219
220_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # A file:// URL with no explicit destination needs no copy at all;
        # hand back the local path and the pseudo-headers directly.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Open the destination: the caller's file, or a kept temp file
        # that urlcleanup() can remove later.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            block_size = 1024*8
            total_size = (int(headers["Content-Length"])
                          if "content-length" in headers else -1)
            bytes_read = 0
            block_count = 0

            # Initial callback before any data arrives.
            if reporthook:
                reporthook(block_count, block_size, total_size)

            while chunk := fp.read(block_size):
                bytes_read += len(chunk)
                tfp.write(chunk)
                block_count += 1
                if reporthook:
                    reporthook(block_count, block_size, total_size)

    # A short read against a declared Content-Length is an error.
    if total_size >= 0 and bytes_read < total_size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (bytes_read, total_size), result)

    return result
283
def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    global _opener
    for path in _url_tempfiles:
        # Best-effort removal; a missing or locked file is not an error.
        with contextlib.suppress(OSError):
            os.unlink(path)
    _url_tempfiles.clear()
    if _opener:
        _opener = None
296
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    # Prefer the host from the URL; fall back to the Host header.
    host = urlparse(request.full_url)[1] or request.get_header("Host", "")
    # Strip a trailing :port, then normalize case.
    return _cut_port_re.sub("", host, 1).lower()
314
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL (full_url), optional request body (data), normal and
    unredirected headers, and the bookkeeping the handler chain needs
    (origin_req_host, unverifiable, proxy/tunnel state).
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Create a request for *url*.

        data: request body or None; a non-None body makes the default
            method POST instead of GET.
        headers: optional mapping of initial headers.
        origin_req_host: RFC 2965 request-host of the origin
            transaction; computed from the URL when omitted.
        unverifiable: RFC 2965 unverifiability flag.
        method: explicit HTTP method name, if any.
        """
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        # Idiom fix: the default used to be a mutable {}; use None and
        # guard instead (behaviorally identical for all callers).
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the URL into type (scheme), host and selector."""
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        """Return the full URL, including any fragment."""
        return self.full_url

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host*."""
        if self.type == 'https' and not self._tunnel_host:
            # https is tunnelled through CONNECT; remember the real host.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if set_proxy() has rewritten this request."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set in either header store."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header value, preferring normal over unredirected."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        """Remove *header_name* from both header stores, if present."""
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        """Return all headers as a list of (name, value) tuples."""
        # Normal headers take precedence over unredirected ones.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
423
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers registered via add_handler() are indexed by the protocol
    methods they define; open() then routes a request through the
    matching <protocol>_request / <protocol>_open / <protocol>_response
    methods, and error() dispatches to the error handlers.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # Default headers added to every outgoing request.
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}       # protocol -> [handlers with <protocol>_open]
        self.handle_error = {}      # protocol -> {code -> [handlers]}
        self.process_response = {}  # protocol -> [handlers with <protocol>_response]
        self.process_request = {}   # protocol -> [handlers with <protocol>_request]

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> are indexed so
        open() and error() can dispatch to them.  Raises TypeError for
        objects that are not BaseHandler-like (no add_parent).
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split at the first underscore: "http_error_404" gives
            # protocol "http" and condition "error_404".
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # kind is the part after "<protocol>_error_": an HTTP
                # status code when it parses as int, else a string.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted by handler_order (BaseHandler.__lt__).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name on each handler registered for *kind*.

        Returns the first non-None result, or None if no handler in the
        chain produced one.
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request) and return a response.

        The request passes through the <protocol>_request pre-processors,
        is opened via _open(), and the response passes through the
        <protocol>_response post-processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <protocol>_error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Fall back to the catch-all http_error_default handlers.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
562
563# XXX probably also want an abstract factory that knows when it makes
564# sense to skip a superclass in favor of a subclass and when it might
565# make sense to include both
566
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _overridden(default):
        # True when the caller supplied a handler (class or instance)
        # that subclasses *default*, replacing it.
        for candidate in handlers:
            if isinstance(candidate, type):
                if issubclass(candidate, default):
                    return True
            elif isinstance(candidate, default):
                return True
        return False

    # Install the default handlers that were not overridden, in order.
    for klass in default_classes:
        if not _overridden(klass):
            opener.add_handler(klass())

    # Then install the caller's handlers, instantiating bare classes.
    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
602
class BaseHandler:
    """Common base class for the protocol handlers an OpenerDirector manages."""

    # Handlers are tried in ascending handler_order.
    handler_order = 500

    def add_parent(self, parent):
        """Record the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """Do nothing; retained only for backwards compatibility."""
        pass

    def __lt__(self, other):
        """Order handlers by handler_order for bisect.insort."""
        try:
            return self.handler_order < other.handler_order
        except AttributeError:
            # Preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
620
621
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass 2xx responses through; hand everything else to error()."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
638
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error into an exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No more specific handler accepted this status; raise it.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
642
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 responses by re-issuing the request."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only GET/HEAD may follow any of the four codes; POST may
        # follow 301/302/303 (not 307).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request is a GET
        # and must not carry the original request body's metadata.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect response, guarding against loops.

        Also handles 301, 303 and 307 via the aliases below.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # An authority with an empty path gets the canonical "/" path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        # Resolve a relative Location against the request URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
754
755
756def _parse_proxy(proxy):
757    """Return (scheme, user, password, host/port) given a URL or an authority.
758
759    If a URL is supplied, it must have an authority (host:port) component.
760    According to RFC 3986, having an authority component means the URL must
761    have two slashes after the scheme.
762    """
763    scheme, r_scheme = _splittype(proxy)
764    if not r_scheme.startswith("/"):
765        # authority
766        scheme = None
767        authority = proxy
768    else:
769        # URL
770        if not r_scheme.startswith("//"):
771            raise ValueError("proxy URL with no authority: %r" % proxy)
772        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
773        # and 3.3.), path is empty or starts with '/'
774        if '@' in r_scheme:
775            host_separator = r_scheme.find('@')
776            end = r_scheme.find("/", host_separator)
777        else:
778            end = r_scheme.find("/", 2)
779        if end == -1:
780            end = None
781        authority = r_scheme[2:end]
782    userinfo, hostport = _splituser(authority)
783    if userinfo is not None:
784        user, password = _splitpasswd(userinfo)
785    else:
786        user = password = None
787    return scheme, user, password, hostport
788
class ProxyHandler(BaseHandler):
    """Rewrite requests to go through the configured proxies."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """proxies: mapping of protocol name -> proxy URL; when omitted,
        the environment's proxy settings (getproxies()) are used."""
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            type = type.lower()
            # Create a <protocol>_open method per proxy entry.  The lambda
            # binds proxy/type/meth as default arguments so each generated
            # method keeps its own values (avoids late-binding closures).
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; may re-open when the scheme changes."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Hosts on the bypass list are fetched directly, not proxied.
        if req.host and proxy_bypass(req.host):
            return None

        # Credentials embedded in the proxy URL become a Basic auth header.
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
831
class HTTPPasswordMgr:
    """Store and look up credentials keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple-of-reduced-uris: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at the given URI (or URIs)."""
        # uri may be a single URI or a sequence of URIs
        uris = [uri] if isinstance(uri, str) else uri
        realm_map = self.passwd.setdefault(realm, {})
        # Index under both the default-port form and the as-given form.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, passwd) for realm/authuri, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri_group, credentials in candidates.items():
                if any(self.is_suburi(u, target) for u in uri_group):
                    return credentials
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # full URI
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # bare host or host:port
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit default port for http/https.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
894
895
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the default realm (None)."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; fall back to credentials stored under None."""
        creds = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if creds[0] is not None:
            return creds
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
904
905
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs accept prior auth."""

    def __init__(self, *args, **kwargs):
        # reduced-uri -> bool: send credentials pre-emptively for this URI?
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and record the URI's prior-auth state."""
        self.update_authenticated(uri, is_authenticated)
        if realm is not None:
            # Also register under the default realm for prior-auth requests.
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Mark uri (a URI or sequence of URIs) as (not) authenticated."""
        uris = [uri] if isinstance(uri, str) else uri
        for default_port in (True, False):
            for u in uris:
                key = self.reduce_uri(u, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the recorded prior-auth state covering authuri, if any."""
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri, state in self.authenticated.items():
                if self.is_suburi(uri, target):
                    return state
935
936
class AbstractBasicAuthHandler:
    """Shared Basic-auth machinery for HTTP (401) and proxy (407) handlers."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'   # start of the string or ','
                    '[ \t]*'    # optional whitespaces
                    '([^ \t,]+)' # scheme like "Basic"
                    '[ \t]+'    # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        """Yield (scheme, realm) for each challenge in one header value.

        realm is None for challenges carrying no realm; if nothing in the
        header matches at all, a single (first-word-or-empty, None) pair
        is yielded so the caller still sees the offered scheme.
        """
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """React to a 401/407 challenge by retrying with Basic credentials.

        Returns the retried response, None when no usable challenge is
        present, and raises ValueError when only non-Basic schemes are
        offered.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            # Report the scheme we actually rejected, not whichever scheme
            # happened to be parsed last (which may well be 'Basic').
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (unsupported,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue req with an Authorization header for realm, if known."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # These exact credentials already failed once; don't loop.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively attach credentials when prior auth is recorded."""
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the URL accepted our credentials (2xx => yes)."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1045
1046
1047
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry using the WWW-Authenticate challenge from the server."""
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1057
1058
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry using the Proxy-Authenticate challenge.

        http_error_auth_reqed requires that there is no userinfo component
        in authority.  Assume there isn't one, since urllib.request does
        not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        containing userinfo.
        """
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1072
1073
# Return n random bytes.  os.urandom draws from the OS CSPRNG, so the
# bytes are suitable for the client-nonce material used below.
_randombytes = os.urandom
1076
1077
class AbstractDigestAuthHandler:
    """Shared Digest-auth machinery for HTTP and proxy handlers."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd: an HTTPPasswordMgr-style object; a fresh one is created
        # when none is supplied.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0        # failed attempts for the current request
        self.nonce_count = 0    # "nc" counter (RFC 2617 s. 3.2.2)
        self.last_nonce = None  # server nonce seen on the previous attempt

    def reset_retry_count(self):
        """Forget previous failures; called once a response is obtained."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Dispatch a 401/407 challenge to the digest retry logic.

        Raises HTTPError after too many failed attempts, ValueError for
        challenge schemes other than Digest or Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with an Authorization header built from the challenge."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials already failed once; give up.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-char client nonce derived from the server nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for req.

        Returns None when the challenge is incomplete, the algorithm is
        unusable, or no credentials are known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 per RFC 2617 s. 3.2.2.2 and s. 3.2.2.3.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per  RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # Reuse of the server nonce increments nc; a new nonce resets it.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # ncvalue/cnonce are always bound here: any qop other than
            # 'auth' raised URLError above.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash callables for the challenge's algorithm."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1222
1223
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 with Digest credentials, then reset the counter."""
        authority = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           authority, req, headers)
        self.reset_retry_count()
        return retry
1240
1241
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 proxy challenges with Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry through the proxy, then reset the failure counter."""
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
1253
class AbstractHTTPHandler(BaseHandler):
    """Shared request preparation and connection handling for HTTP(S)."""

    def __init__(self, debuglevel=0):
        # debuglevel is forwarded to each http.client connection.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Set the debug level passed to new connections."""
        self._debuglevel = level

    def _get_content_length(self, request):
        # Reuse http.client's logic; it returns None when the body length
        # cannot be determined, which triggers chunked transfer below.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in the headers every outgoing request needs.

        Adds Content-type and Content-length (or Transfer-encoding) for
        requests carrying a body, the Host header, and the opener-wide
        default headers.  Raises URLError when the request has no host;
        raises TypeError for str bodies.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body length: stream it chunked instead.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxied, Host must name the origin server taken from
            # the selector, not the proxy itself.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnel (e.g. https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1370
1371
class HTTPHandler(AbstractHTTPHandler):
    """Open 'http' URLs over a plain http.client connection."""

    def http_open(self, req):
        """Send req over a new HTTPConnection and return the response."""
        conn_class = http.client.HTTPConnection
        return self.do_open(conn_class, req)

    http_request = AbstractHTTPHandler.do_request_
1378
# HTTPSConnection only exists when http.client was built with SSL support.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open 'https' URLs via http.client.HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # context/check_hostname are forwarded to HTTPSConnection
            # (context is presumably an ssl.SSLContext — confirm against
            # http.client's API).
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1395
class HTTPCookieProcessor(BaseHandler):
    """Add cookies to outgoing requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Attach the jar's matching cookies to the request."""
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Store any cookies the response sets."""
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1413
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        """Always fail: the URL scheme has no registered handler."""
        raise URLError('unknown url type: %s' % req.type)
1418
def parse_keqv_list(l):
    """Parse a list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  A value may be
    empty (e.g. ``"key="``); such entries map to the empty string instead
    of raising IndexError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Only strip quotes when the value is a complete quoted string;
        # guard against empty and single-character values.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1428
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: keep this one
            # literally (the backslash itself is consumed).
            current.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            # Element separator outside of quotes.
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part, if any
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1471
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL naming a local path.

        Non-local hosts raise URLError.  NOTE(review): a host that IS in
        get_names() falls out of the if with no return (None), which lets
        another handler try the URL — confirm that is intended rather
        than opening the file locally.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (and cache class-wide) the IP addresses local to this host."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Name lookup failed; fall back to localhost's address only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Stat and open the local file, returning an addinfourl response.

        Synthesizes Content-type/Content-length/Last-modified headers from
        filesystem metadata; raises URLError when the file cannot be
        stat'ed/opened or the host is not local.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1522
1523def _safe_gethostbyname(host):
1524    try:
1525        return socket.gethostbyname(host)
1526    except socket.gaierror:
1527        return None
1528
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Fetch a file or directory listing over FTP.

        Credentials may be embedded in the URL's authority.  Transfer
        type defaults to binary ('I') for files and directory listing
        ('D') for paths ending in '/', overridable via the ';type=' URL
        attribute.  Raises URLError for missing host, resolution
        failures, and any ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        # Last path component is the file; empty means a directory listing.
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a fresh, non-persistent connection (caching hook)."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1586
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses cached ftpwrapper connections."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # seconds an idle connection stays cached
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set how long (seconds) an idle connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one
        (and refreshing its expiry) as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # default=0 guards against an empty cache: min() of an empty
        # sequence raises ValueError.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection instead of leaking it.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1639
class DataHandler(BaseHandler):
    def data_open(self, req):
        """Decode a data: URL (RFC 2397) and return a file-like response.

        Any POSTed data on the request is ignored.

        Syntax handled:
        dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        mediatype := [ type "/" subtype ] *( ";" parameter )
        data      := *urlchar
        parameter := attribute "=" value
        """
        url = req.full_url

        # Split off the "data:" scheme, then separate the metadata
        # section from the payload at the first comma.
        scheme, rest = url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-7]  # drop the ";base64" suffix

        # RFC 2397 default media type when none is given.
        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" %
            (mediatype, len(payload)))

        return addinfourl(io.BytesIO(payload), headers, url)
1669
1670
1671# Code move from the old urllib module
1672
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    # Windows needs real path <-> URL translation (drive letters, backslashes).
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)


# Shared FTP connection cache used as the default by URLopener instances;
# open_ftp() keys it on (user, host, port, '/'.join(dirs)) tuples.
ftpcache = {}
1691
1692
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup() stays safe even when __init__
    # never ran (e.g. construction failed before the list was created).
    __tempfiles = None

    # User-Agent value sent with every request.
    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for HTTPS client auth
        # (consumed by _https_connection()).
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            # Serve a previously retrieve()d copy from the temp cache.
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to open_<scheme>(); dashes in scheme names map to
        # underscores in the method name.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # open_local_file must only be reached via open_file(), which
        # performs the file:// host checks first.
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # Local files need no copying: return the path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # Derive a temp-file suffix from the URL's path extension
                # by stripping scheme, host, query and attributes in turn.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                # reporthook receives (block number, block size, total size);
                # total size is -1 when the server sent no Content-Length.
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct connection: url is "//host/path".
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied connection: url is a (proxyhost, full-url) pair
            # produced by open() -- see the "Signal special case" comment.
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Target is exempt from proxying: connect directly.
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            """Build an HTTPSConnection using this opener's client key/cert."""
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject file://<remote-host>/... forms; only a local/localhost
        # authority (or none at all) is allowed.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers from the file's stat info.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # A host is acceptable only if it resolves to this machine
        # (and no explicit port was given).
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # 'D' = directory listing, 'I' = binary file transfer; an
            # explicit ";type=..." URL attribute overrides the default.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";<token>" with no '=' is the encoding (e.g. base64);
        # anything with '=' is an ordinary mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2136
2137
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}   # "realm@host" -> (user, passwd), see get_user_passwd()
        self.tries = 0         # redirects followed for the current request
        self.maxtries = 10     # redirect limit before giving up (0 = unlimited)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Treat a redirect loop as a synthetic 500 once the limit is hit.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            # Reset so the next top-level request starts counting afresh.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Location takes precedence over the older URI header; with
        # neither present the redirect cannot be followed.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each bare URLopener.http_error_default call below raises
        # HTTPError, so execution does not continue past it.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but against the proxy's
        # Proxy-Authenticate challenge; base-class calls raise HTTPError.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Rewrite the http proxy setting to embed user:passwd, then retry.
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # HTTPS variant of retry_proxy_http_basic_auth.
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-request the URL with user:passwd embedded in the netloc.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # HTTPS variant of retry_http_basic_auth.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm@host, caching the answer.

        A truthy clear_cache drops any cached entry first so the user
        is re-prompted (used after an auth failure)."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2348
2349
2350# Utility functions
2351
_localhost = None

def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup's result is memoized in the module-level
    ``_localhost`` so later calls skip the resolver.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2359
_thishost = None

def thishost():
    """Return the IP addresses of the current host.

    Memoized in the module-level ``_thishost``; falls back to the
    'localhost' addresses when the machine's own hostname does not
    resolve.
    """
    global _thishost
    if _thishost is None:
        try:
            addrs = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            addrs = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addrs)
    return _thishost
2370
_ftperrors = None

def ftperrors():
    """Return the set of errors raised by the FTP class.

    The ftplib import is deferred until first use, and the result is
    cached in the module-level ``_ftperrors``.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2379
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    # Parse the empty string once; the resulting Message object is shared.
    _noheaders = email.message_from_string("")
    return _noheaders
2387
2388
2389# Utility classes
2390
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Open an FTP control connection and log in.

        *dirs* is the sequence of path components to change into.  If
        *persistent* is true, the control connection stays open after the
        last data stream is closed (see file_close() and close()).
        """
        # Connection parameters are kept so init() can reconnect after
        # the server drops an idle control connection.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of data streams handed out and not yet closed.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Tear down any half-open connection, then re-raise.
            self.close()
            raise

    def init(self):
        """(Re)connect and log in; also used by retrfile() to recover a
        dropped control connection."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Begin a transfer and return (file-like object, length or None).

        *type* 'd'/'D' requests a directory listing; any other value is
        sent as an FTP TYPE code (e.g. 'a', 'i').  If *file* is empty or
        names a directory, a LIST is performed instead of RETR.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale; reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file"; fall through and try LIST.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object,
        # decrementing refcount (see below).
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        # makefile() keeps the underlying socket alive; drop our reference.
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the connection as no longer busy with a transfer."""
        self.busy = 0

    def close(self):
        """Disable keepalive and close once no data streams remain open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Hook invoked when a returned data stream is closed."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Unconditionally close the FTP control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2484
2485# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    suffix = '_proxy'
    proxies = {}
    # Pass 1: collect <scheme>_proxy variables of any capitalization.
    for var, url in os.environ.items():
        lowered = var.lower()
        if url and lowered.endswith(suffix):
            proxies[lowered[:-len(suffix)]] = url
    # CVE-2016-1000110: when running as a CGI script, HTTP_PROXY may have
    # been injected by the web server from a client-sent "Proxy:" header,
    # so drop the 'http' entry gathered above.  A genuinely lowercase
    # http_proxy variable is reinstated by the second pass below.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Pass 2: variables whose name ends in lowercase '_proxy' take
    # precedence; an empty value removes any entry from the first pass.
    for var, url in os.environ.items():
        if var.endswith(suffix):
            scheme = var.lower()[:-len(suffix)]
            if url:
                proxies[scheme] = url
            else:
                proxies.pop(scheme, None)
    return proxies
2516
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.
    """
    if proxies is None:
        proxies = getproxies_environment()
    # Nothing to bypass when no_proxy isn't specified at all.
    if 'no' not in proxies:
        return False
    no_proxy = proxies['no']
    # A bare '*' means: bypass the proxy for every host.
    if no_proxy == '*':
        return True
    host = host.lower()
    # Compare both with and without any :port suffix.
    hostonly, port = _splitport(host)
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        suffix = entry.lstrip('.').lower()  # ignore leading dots
        # Exact match on the bare host or on host:port ...
        if hostonly == suffix or host == suffix:
            return True
        # ... or a domain-suffix match.
        dotted = '.' + suffix
        if hostonly.endswith(dotted) or host.endswith(dotted):
            return True
    return False
2550
2551
2552# This code tests an OSX specific data structure but is testable on all
2553# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Convert a dotted-quad string (possibly truncated, e.g. '10.1')
        # to a 32-bit integer; missing trailing octets count as zero.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Numeric address of *host*, resolved lazily the first time an
    # IP-style exception is encountered.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        # An entry starting with digits is treated as an IP (optionally
        # with a /prefix-length); anything else is a glob pattern.
        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    # An unresolvable host cannot match any IP rule.
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: infer one from the number of dotted
                # components given (e.g. '10.1' -> /16).
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths
                continue

            # Shift both addresses so only the network part is compared.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
2617
2618
# Platform-specific proxy discovery: macOS consults SystemConfiguration,
# Windows the registry; everything else relies on environment variables.
if sys.platform == 'darwin':
    # _scproxy is a macOS-only C extension wrapping SystemConfiguration.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if host should bypass the proxy according to the
        macOS SystemConfiguration settings."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return proxy mappings from the environment or, failing that,
        from the macOS SystemConfiguration framework."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            # No scheme given: assume the protocol's own.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        # A bare host:port - apply it to the common schemes.
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if *host* matches the registry's ProxyOverride list,
        else 0."""
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for unqualified (dotless) names.
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style override entry into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            # NOTE(review): other regex metacharacters in the override
            # value pass through unescaped - confirm this matches the
            # intended Windows semantics.
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2775