1"""An extensible library for opening URLs using a variety of protocols
2
The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
urlopen(url, data=None) -- Basic usage is the same as the original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
23
build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
BaseHandler -- Base class that the handler classes in this module derive from.
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('https://www.python.org/')
68"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails for some reason, how should the error be signalled?  The
# client needs to know the HTTP error code.  But if the handler knows
# that the problem was, e.g., that it didn't know the hash algorithm
# requested in the challenge, it would be good to pass that
# information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
# check for SSL
# The ssl module is optional: record whether it could be imported so that
# code needing it (e.g. urlopen's cafile/capath handling) can test
# _have_ssl before touching the ssl name.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True
117
# Names exported by "from urllib.request import *".
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
# Formatted as "<major>.<minor>" of the running interpreter.
__version__ = '%d.%d' % sys.version_info[:2]
137
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    """Open *url*, which may be a string or a Request object.

    *data* is the additional payload to send to the server, or None when
    there is none.  See Request for details.

    The urllib.request module uses HTTP/1.1 and includes a
    "Connection:close" header in its HTTP requests.

    *timeout* is an optional number of seconds applied to blocking
    operations such as the connection attempt; when omitted the global
    default timeout is used.  It only has an effect for HTTP, HTTPS and
    FTP connections.

    *context*, when given, must be an ssl.SSLContext instance describing
    the SSL options.  See HTTPSConnection for more details.

    *cafile* and *capath* are deprecated ways of naming trusted CA
    certificates for HTTPS requests: cafile points at a single bundle
    file, capath at a directory of hashed certificate files (see
    ssl.SSLContext.load_verify_locations()).  They cannot be combined
    with *context*.  The value of *cadefault* is ignored, although a
    true value still triggers the deprecation warning.

    The returned object always works as a context manager and has the
    properties url, headers, and status; see urllib.response.addinfourl
    for more detail on them.

    For HTTP and HTTPS URLs the result is a slightly modified
    http.client.HTTPResponse: in addition to the three properties above,
    its msg attribute holds the same information as the reason attribute
    (the reason phrase returned by the server) rather than the response
    headers described in the HTTPResponse documentation.

    For FTP, file, and data URLs, and for requests explicitly handled by
    the legacy URLopener and FancyURLopener classes, the result is a
    urllib.response.addinfourl object.

    None may be returned if no handler handles the request, though the
    default global OpenerDirector installs UnknownHandler so that this
    never happens.

    When proxy settings are detected (for example a *_proxy environment
    variable such as http_proxy), ProxyHandler is installed by default
    and routes the requests through the proxy.

    Raises ValueError when *context* is combined with the deprecated CA
    arguments, or when those arguments are used without SSL support.
    """
    global _opener

    if cafile or capath or cadefault:
        # Deprecated path: build a one-off context from the CA arguments.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        # send ALPN extension to indicate HTTP/1.1 protocol
        context.set_alpn_protocols(['http/1.1'])
        one_shot = build_opener(HTTPSHandler(context=context))
        return one_shot.open(url, data, timeout)

    if context:
        # A caller-supplied context gets its own opener; the shared
        # default opener is left untouched.
        one_shot = build_opener(HTTPSHandler(context=context))
        return one_shot.open(url, data, timeout)

    # Common case: use the module-wide opener, creating it on first use.
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data, timeout)
217
def install_opener(opener):
    """Make *opener* the module-wide default opener.

    The object is stored in the module global _opener, which urlopen()
    uses whenever it is called without SSL context arguments.
    """
    global _opener
    _opener = opener
221
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Download *url* to a file on disk and return ``(path, headers)``.

    When *filename* is given the content is written there; otherwise a
    temporary file is created and remembered in _url_tempfiles so that
    urlcleanup() can delete it later.  For a ``file:`` URL with no
    *filename*, nothing is copied and the normalised local path is
    returned directly.

    *reporthook*, if given, is called once before the first read and
    once after every block with ``(block number, block size, total
    size)``; the total size is -1 when the response carries no
    Content-Length header.  *data* is passed through to urlopen().

    Raises ContentTooShortError when fewer bytes arrive than the
    Content-Length header announced.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # A local file needs no copy unless a destination was requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Pick the destination: the caller's file or a tracked temp file.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = (filename, headers)
            block_size = 8 * 1024
            expected = -1
            received = 0
            blocks_done = 0
            if "content-length" in headers:
                expected = int(headers["Content-Length"])

            if reporthook:
                reporthook(blocks_done, block_size, expected)

            # An empty read marks the end of the stream.
            while chunk := fp.read(block_size):
                received += len(chunk)
                tfp.write(chunk)
                blocks_done += 1
                if reporthook:
                    reporthook(blocks_done, block_size, expected)

    if expected >= 0 and received < expected:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (received, expected), result)

    return result
285
def urlcleanup():
    """Delete urlretrieve's temporary files and drop the default opener.

    Files that can no longer be removed are skipped silently.  The
    tracking list is emptied in place and the module-wide opener, if one
    is set, is reset to None.
    """
    global _opener

    for leftover in _url_tempfiles:
        # Best effort: the file may already be gone.
        with contextlib.suppress(OSError):
            os.unlink(leftover)
    _url_tempfiles.clear()

    if _opener:
        _opener = None
298
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of
    ``request.full_url``; when that is empty, the request's "Host"
    header is used instead (defaulting to the empty string).  A trailing
    ``:<digits>`` port is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url).netloc
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    # (count is passed by keyword: passing it positionally to
    # Pattern.sub() is deprecated since Python 3.13)
    host = _cut_port_re.sub("", host, count=1)
    return host.lower()
316
class Request:
    """Encapsulate the state of one URL request.

    Assigning ``full_url`` splits the URL into ``fragment``, ``type``
    (the scheme), ``host`` and ``selector``.  Header names are normalised
    with ``str.capitalize()`` when they are added; the lookup helpers
    (has_header, get_header, remove_header) use the name exactly as
    given, so callers must pass the capitalised form.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # The full_url setter parses the URL and fills in the derived
        # attributes (fragment, type, host, selector).
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        # _data must exist before the data setter compares against it.
        self._data = None
        self.data = data
        self._tunnel_host = None
        for name, value in headers.items():
            self.add_header(name, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        # Only set when truthy, so get_method() can fall back via getattr.
        if method:
            self.method = method

    @property
    def full_url(self):
        """The URL with its fragment (if any) re-attached."""
        fragment = self.fragment
        if not fragment:
            return self._full_url
        return '{}#{}'.format(self._full_url, fragment)

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        unwrapped = unwrap(url)
        self._full_url, self.fragment = _splittag(unwrapped)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self.selector = ''
        self.fragment = None
        self._full_url = None

    @property
    def data(self):
        """The request payload, or None."""
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # A Content-length header was most probably computed for the
            # previous payload, so drop it when the payload changes.
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the stored URL into type, host and selector.

        Raises ValueError when the URL has no scheme.
        """
        scheme, remainder = _splittype(self._full_url)
        self.type = scheme
        if scheme is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        host, self.selector = _splithost(remainder)
        self.host = unquote(host) if host else host

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        # An explicit ``method`` attribute wins; otherwise the presence
        # of a payload decides between POST and GET.
        fallback = "GET" if self.data is None else "POST"
        return getattr(self, 'method', fallback)

    def get_full_url(self):
        """Return the value of the full_url property."""
        return self.full_url

    def set_proxy(self, host, type):
        """Redirect this request to the proxy at *host*.

        The first time an https request is proxied, the original host is
        remembered in ``_tunnel_host`` and the type and selector are left
        alone.  In every other case the request takes the proxy's *type*
        and the selector becomes the full URL.
        """
        needs_tunnel = self.type == 'https' and not self._tunnel_host
        if needs_tunnel:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if the selector is the full URL (see set_proxy)."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        normalised = key.capitalize()
        self.headers[normalised] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        normalised = key.capitalize()
        self.unredirected_hdrs[normalised] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set in either header dict."""
        if header_name in self.headers:
            return True
        return header_name in self.unredirected_hdrs

    def get_header(self, header_name, default=None):
        """Return the header's value; regular headers take precedence."""
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def remove_header(self, header_name):
        """Drop *header_name* from both header dicts, if present."""
        for store in (self.headers, self.unredirected_hdrs):
            store.pop(header_name, None)

    def header_items(self):
        """Return all headers as a list of (name, value) pairs.

        Regular headers override unredirected ones of the same name.
        """
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return list(merged.items())
425
class OpenerDirector:
    """Route requests through the handlers registered with add_handler().

    Handlers are discovered by method name.  A method called
    ``<protocol>_open``, ``<protocol>_request``, ``<protocol>_response``
    or ``<protocol>_error_<kind>`` registers the handler in the matching
    lookup table; each table keeps its handlers sorted with
    ``bisect.insort``.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Raises TypeError if *handler* has no ``add_parent`` attribute.
        A handler with no recognised method is ignored: it is neither
        stored nor given a parent.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            protocol, sep, condition = meth.partition("_")
            if not sep:
                # Without an underscore the name cannot have the form
                # "<protocol>_<condition>".  Slicing on find() == -1 used
                # to turn an attribute named "open" into protocol "ope".
                continue

            if condition.startswith("error"):
                # "<protocol>_error_<kind>": the kind follows the first
                # underscore of the condition; numeric kinds (HTTP status
                # codes) are stored as ints.
                _, has_kind, tail = condition.partition("_")
                kind = tail if has_kind else condition
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call *meth_name* on each handler in ``chain[kind]`` in order.

        Returns the first result that is not None, or None if every
        handler declines (or none is registered).
        """
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return the response.

        The request is passed through the ``<protocol>_request``
        processors, then opened, and the response is passed through the
        ``<protocol>_response`` processors.  *timeout* is stored on the
        request as ``req.timeout``.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        # Both processor chains are selected by the protocol the request
        # had on entry, even if a pre-processor returns a different one.
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default', protocol-specific and 'unknown' open chains.

        Returns the first truthy result of the first two chains, else
        whatever the 'unknown' chain returns.
        """
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for *proto* to the registered error handlers.

        For 'http' and 'https', ``args[2]`` is used as the error kind and
        ``http_error_<kind>`` handlers are tried first, falling back to
        the ``http_error_default`` handlers.  Other protocols call
        ``<proto>_error`` on the chain stored under *proto*.  Returns the
        handlers' result, or None if nothing handled the error.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            chains = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = True
            orig_args = args
        else:
            chains = self.handle_error
            meth_name = proto + '_error'
            http_err = False
        result = self._call_chain(chains, proto, meth_name, *args)
        if result:
            return result

        if http_err:
            return self._call_chain(chains, 'default',
                                    'http_error_default', *orig_args)
564
565# XXX probably also want an abstract factory that knows when it makes
566# sense to skip a superclass in favor of a subclass and when it might
567# make sense to include both
568
def build_opener(*handlers):
    """Return an OpenerDirector set up with default and custom handlers.

    *handlers* may contain handler instances or handler classes; classes
    are instantiated with no arguments.  The default handlers are
    installed first (HTTPSHandler only when http.client provides
    HTTPSConnection), except that a default class is left out whenever
    one of *handlers* is an instance or a subclass of it.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def is_overridden(default_class):
        # True when a caller-supplied handler replaces default_class.
        for supplied in handlers:
            if isinstance(supplied, type):
                if issubclass(supplied, default_class):
                    return True
            elif isinstance(supplied, default_class):
                return True
        return False

    # Defaults first, in their listed order, minus the replaced ones.
    for default_class in default_classes:
        if not is_overridden(default_class):
            opener.add_handler(default_class())

    # Then the caller's handlers, instantiating any given as classes.
    for supplied in handlers:
        instance = supplied() if isinstance(supplied, type) else supplied
        opener.add_handler(instance)
    return opener
604
class BaseHandler:
    """Common base for handlers registered with an OpenerDirector.

    ``handler_order`` is the sort key used by ``__lt__``: lower values
    sort first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember *parent* as ``self.parent``."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
622
623
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route the rest to parent.error().

        For any other status code the result of
        ``self.parent.error('http', ...)`` is returned in place of the
        response.
        """
        status = response.code
        reason = response.msg
        headers = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= status < 300:
            return response

        return self.parent.error(
            'http', request, response, status, reason, headers)

    https_response = http_response
640
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn every HTTP error that reaches it into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError for *req*, carrying the code, message, headers and fp."""
        error = HTTPError(req.full_url, code, msg, hdrs, fp)
        raise error
644
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303, 307 and 308 redirects.

    The redirect target is read from the Location (or URI) response
    header, restricted to the http, https and ftp schemes (or a relative
    URL), and opened through ``self.parent``.  The URLs already visited
    are tracked on the request to detect redirect loops.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        A redirect is followed for GET and HEAD requests on 301, 302,
        303, 307 and 308, and for POST requests on 301, 302 and 303.
        The new request is sent without a body and without the
        Content-Length and Content-Type headers.
        """
        m = req.get_method()
        # 307 and 308 must keep the request method and body, so they are
        # only followed for the body-less GET and HEAD methods.
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response by opening the new location.

        Returns the response of the redirected request, or None when the
        response names no new location or redirect_request() declines.
        Raises HTTPError for a disallowed target scheme or when a
        redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # A URL with a host but an empty path gets the root path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        # The same dict is shared by the old and the new request.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # Every supported redirect status shares one implementation.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
756
757
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    A bare authority (``host``, ``host:port`` or ``user:pass@host``)
    yields a scheme of None.  A URL must carry an authority component,
    which per RFC 3986 means two slashes after the scheme; otherwise
    ValueError is raised.  user and password are None when the authority
    has no userinfo part.
    """
    scheme, remainder = _splittype(proxy)
    if remainder.startswith("/"):
        # URL
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'.  Start looking for
        # that slash after any '@', so a '/' inside the userinfo is not
        # mistaken for the start of the path.
        at_sign = remainder.find('@')
        search_from = 2 if at_sign == -1 else at_sign
        path_start = remainder.find("/", search_from)
        if path_start == -1:
            authority = remainder[2:]
        else:
            authority = remainder[2:path_start]
    else:
        # authority
        scheme = None
        authority = proxy

    userinfo, hostport = _splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = _splitpasswd(userinfo)
    return scheme, user, password, hostport
790
class ProxyHandler(BaseHandler):
    """Send requests through the proxies configured per URL scheme.

    For every ``scheme: proxy_url`` entry of *proxies* the instance gets
    a ``<scheme>_open`` attribute that forwards to proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        """Install one ``<scheme>_open`` callable per proxy entry.

        *proxies* is a mapping of scheme to proxy URL; when None, the
        result of getproxies() is used.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            scheme = scheme.lower()

            # Default arguments pin the current loop values to this
            # callable, avoiding late binding.
            def open_via_proxy(r, proxy=proxy_url, type=scheme,
                               meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, '%s_open' % scheme, open_via_proxy)

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*.

        Returns None when the host is bypassed or when the remaining
        handlers can process the rewritten request.  When the proxy's
        scheme differs from the request's (and the request is not https),
        the request is re-opened through ``self.parent`` and that
        response is returned.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            # A bare authority has no scheme: keep the request's own.
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + encoded)
        req.set_proxy(unquote(hostport), proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None

        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        # e.g. if we have a constructor arg proxies like so:
        # {'http': 'ftp://proxy.example.com'}, we may end up turning
        # a request for http://acme.example.com/a into one for
        # ftp://proxy.example.com/a
        return self.parent.open(req, timeout=req.timeout)
833
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and by URI prefix."""

    def __init__(self):
        # Maps realm -> {tuple of reduced URIs: (user, password)}.
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at uri.

        uri is either a single URI string or a sequence of URIs.  Each
        entry is stored twice: once with the scheme's default port added
        to the authority and once without, so that later lookups match
        whichever form the request URL uses.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for realm that covers authuri.

        Returns (None, None) when nothing registered for the realm is
        equal to, or a parent of, authuri.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the implicit port explicit for the schemes we know.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs as returned by reduce_uri().  The comparison works on whole
        path segments: "/foo" covers "/foo" and "/foo/bar" but not
        "/foobar".
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare against the base path terminated by '/', so that a plain
        # character-prefix match such as "/foo" vs "/foobar" cannot hand
        # credentials to an unrelated sibling path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
896
897
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the credentials of realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, retry with realm None."""
        credentials = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        user, password = credentials
        if user is None:
            # Nothing registered for this realm: use the catch-all entry.
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
906
907
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also records an "authenticated" flag per URI."""

    def __init__(self, *args, **kwargs):
        # Maps reduced URI (authority, path) -> flag given to
        # update_authenticated().
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and record the is_authenticated flag for uri.

        For a realm other than None the credentials are also stored under
        realm None, which serves as the default for prior auth requests.
        """
        self.update_authenticated(uri, is_authenticated)
        # The default (None) realm entry is written first, then the
        # named realm.
        realms = (realm,) if realm is None else (None, realm)
        for target_realm in realms:
            super().add_password(target_realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the flag for uri (one URI string or a sequence of URIs)."""
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for u in uris:
                key = self.reduce_uri(u, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the flag of the first recorded URI that covers authuri.

        Returns None when no recorded URI matches.
        """
        for default_port in (True, False):
            candidate = self.reduce_uri(authuri, default_port)
            for known_uri, flag in self.authenticated.items():
                if self.is_suburi(known_uri, candidate):
                    return flag
        return None
937
938
class AbstractBasicAuthHandler:
    """Shared HTTP Basic authentication logic.

    Subclasses provide ``auth_header`` (the request header that carries
    the credentials) and ``parent`` (the opener used to retry a request).
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'   # start of the string or ','
                    '[ \t]*'    # optional whitespaces
                    '([^ \t,]+)' # scheme like "Basic"
                    '[ \t]+'    # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        """Use password_mgr for credentials (a new HTTPPasswordMgr if None)."""
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Convenience alias so callers can register credentials directly
        # on the handler.
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        """Yield (scheme, realm) for each challenge found in header.

        When no challenge with a realm is found, yield a single
        (scheme, None) pair where scheme is the first word of the header
        ('' for an empty header).  A UserWarning is issued for every
        realm value that is not quoted.
        """
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials for the first Basic challenge.

        authreq is the name of the challenge header to read from headers.
        Returns the result of retry_http_basic_auth() for the first Basic
        challenge that carries a realm, or None when there is no
        challenge header.  Raises ValueError when no such Basic challenge
        exists but a challenge for another scheme was seen.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            # Report the scheme that was actually rejected; the loop
            # variable may by now hold a later (even a Basic) scheme.
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (unsupported,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send req with the Basic credentials stored for realm/host.

        Returns None when no password is stored, or when req already
        carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # Same credentials were already sent: retrying is pointless.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively add an Authorization header when possible.

        Only acts when the password manager offers is_authenticated() and
        reports the request URL as authenticated, the request has no
        Authorization header yet, and credentials are stored for the URL.
        """
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            if user is None:
                # http_response() flags every URL that answered 2xx, even
                # ones without stored credentials; never send the literal
                # string "None:None" for those.
                return req
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the request URL answered with a 2xx status."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1047
1048
1049
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req using the challenge in the www-authenticate header."""
        # The full request URL is what the password lookup is keyed on.
        return self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
1059
1060
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with HTTP Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry req using the challenge in the proxy-authenticate header."""
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        return self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
1074
1075
# _randombytes(n) returns n random bytes; it is an alias of os.urandom.
_randombytes = os.urandom
1078
1079
class AbstractDigestAuthHandler:
    """Shared HTTP Digest authentication logic.

    Subclasses provide ``auth_header`` (the request header that carries
    the credentials) and ``parent`` (the opener used to retry a request).
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Use passwd as password manager (a new HTTPPasswordMgr if None)."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        # Convenience alias so callers can register credentials directly
        # on the handler.
        self.add_password = self.passwd.add_password
        # Number of challenges handled since the last reset_retry_count().
        self.retried = 0
        # Request counter for last_nonce, sent as the "nc" parameter.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the counter that limits repeated authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer the challenge found in the auth_header response header.

        Returns the result of retry_http_digest_auth() for a Digest
        challenge and None for a Basic or missing challenge.  Raises
        HTTPError once more than five attempts were made without a reset,
        and ValueError for any other authentication scheme.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with an answer to the Digest challenge text auth.

        Returns the new response, or None when no answer can be computed
        or when req already carries exactly that answer.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Same answer was already sent: retrying is pointless.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16 character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials answering the challenge chal.

        chal is the dict of challenge parameters.  Returns the header
        value without the leading "Digest " token, or None when the
        challenge lacks a realm or nonce or no user is stored for the
        realm.  Raises URLError when the server offers a qop list that
        does not contain "auth", and ValueError (from
        get_algorithm_impls) for an unsupported algorithm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per  RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # The qop value is a comma separated list that may carry
        # whitespace around its items (e.g. "auth-int, auth"), so strip
        # each item before comparing.
        elif 'auth' in [option.strip() for option in qop.split(',')]:
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm ('MD5' or 'SHA').

        H hashes a string to a hex digest; KD(secret, data) hashes
        "secret:data".  Raises ValueError for any other algorithm name.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Placeholder for the entity digest; always returns None."""
        # XXX not implemented yet
        return None
1224
1225
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    handler_order = 490  # before Basic auth
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a www-authenticate challenge, then reset the retry count."""
        netloc = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed(
            'www-authenticate', netloc, req, headers)
        # Only reached when the attempt did not raise.
        self.reset_retry_count()
        return response
1242
1243
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with HTTP Digest proxy credentials."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a proxy-authenticate challenge, then reset the retry count."""
        response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        # Only reached when the attempt did not raise.
        self.reset_retry_count()
        return response
1255
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and connection handling shared by HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        # Passed to every connection through set_debuglevel() in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def _get_content_length(self, request):
        """Return the body length computed by http.client, or None."""
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in the default headers of request and return it.

        Adds Content-type and either Content-length or Transfer-encoding
        for requests that carry data, a Host header, and the parent
        opener's default headers; headers already present are kept.
        Raises URLError when the request has no host and TypeError when
        the data is a str.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Length unknown: fall back to chunked transfer.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # For a proxied request the Host header is taken from the
            # selector rather than from request.host.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        Extra keyword arguments are passed on to http_class.  Raises
        URLError when the request has no host or when sending the request
        fails with an OSError.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers win over regular headers of the same name.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            # Close the connection on any failure, then re-raise unchanged.
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with the reason phrase, because urllib clients expect the
        # response to have the reason in .msg.  It would be good to mark
        # this attribute as deprecated and get them to use info() or
        # .headers.
        r.msg = r.reason
        return r
1372
1373
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs through http.client.HTTPConnection."""

    # Requests are prepared by the shared implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req over a plain HTTP connection and return the response."""
        connection_class = http.client.HTTPConnection
        return self.do_open(connection_class, req)
1380
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs through http.client.HTTPSConnection."""

        # Requests are prepared by the shared implementation.
        https_request = AbstractHTTPHandler.do_request_

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            """Remember the context and check_hostname connection options."""
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._check_hostname = check_hostname
            self._context = context

        def https_open(self, req):
            """Send req over an HTTPS connection and return the response."""
            connection_options = {
                'context': self._context,
                'check_hostname': self._check_hostname,
            }
            return self.do_open(http.client.HTTPSConnection, req,
                                **connection_options)

    # The class is only exported when HTTPS support is available.
    __all__.append('HTTPSHandler')
1397
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new http.cookiejar.CookieJar when None."""
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Let the jar add its Cookie header to request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract the cookies set by response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1415
class UnknownHandler(BaseHandler):
    """Reject requests whose URL scheme no handler could open."""

    def unknown_open(self, req):
        """Always raise URLError naming the scheme of req."""
        raise URLError('unknown url type: %s' % req.type)
1420
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value.  A value enclosed in
    double quotes has the quotes removed; an empty value ("key=") is
    kept as the empty string.  Raises ValueError for an element that
    contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing: v may be empty, and v[0] would then
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1430
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []            # characters of the element being collected
    escaped = in_quotes = False

    for ch in s:
        if escaped:
            # The character after a backslash is taken literally.
            current.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is not kept.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            # An unquoted comma ends the current element.
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last part
    tail = ''.join(current)
    if tail:
        items.append(tail)

    return [item.strip() for item in items]
1473
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local host."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file.

        Raises URLError when the URL has the ``//host`` form and names a
        host other than 'localhost' that is not among get_names();
        returns None when such a host is among them.
        """
        url = req.selector
        host = req.host
        names_other_host = (url[:2] == '//' and url[2:3] != '/'
                            and host and host != 'localhost')
        if not names_other_host:
            return self.open_local_file(req)
        if host not in self.get_names():
            raise URLError("file:// scheme is supported only on localhost")
        return None

    # names for the localhost
    names = None
    def get_names(self):
        """Return the cached tuple of addresses regarded as this host."""
        if FileHandler.names is None:
            try:
                loopback = socket.gethostbyname_ex('localhost')[2]
                own = socket.gethostbyname_ex(socket.gethostname())[2]
                FileHandler.names = tuple(loopback + own)
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The response headers carry Content-type, Content-length and
        Last-modified.  Raises URLError when the file cannot be accessed
        or when the URL names a host that is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            info = os.stat(localfile)
            last_modified = email.utils.formatdate(info.st_mtime, usegmt=True)
            content_type = mimetypes.guess_type(filename)[0] or 'text/plain'
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (content_type, info.st_size, last_modified))
            if host:
                host, port = _splitport(host)
            # port is only looked at when a host is present.
            is_local = not host or (
                not port and _safe_gethostbyname(host) in self.get_names())
            if is_local:
                prefix = 'file://' + host if host else 'file://'
                return addinfourl(open(localfile, 'rb'), headers,
                                  prefix + filename)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1524
def _safe_gethostbyname(host):
    """Return the address host resolves to, or None on socket.gaierror."""
    address = None
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        pass
    return address
1530
class FTPHandler(BaseHandler):
    """Open ftp: URLs through an ftpwrapper connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by req.

        Returns an addinfourl whose headers carry Content-type and
        Content-length when they are known.  Raises URLError when no host
        is given, the host cannot be resolved, or the FTP exchange fails.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        port = ftplib.FTP_PORT if port is None else int(port)

        # username/password handling
        user, host = _splituser(host)
        passwd = None
        if user:
            user, passwd = _splitpasswd(user)
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)

        # The last path segment is the file; the ones before it are the
        # directories to change into.
        path, attrs = _splitattr(req.selector)
        *dirs, file = [unquote(segment) for segment in path.split('/')]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' for a file, 'D' when no file name is given; a ";type="
            # attribute overrides this.
            type = 'I' if file else 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if (attr.lower() == 'type'
                        and value in ('a', 'A', 'i', 'I', 'd', 'D')):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            header_lines = []
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                header_lines.append("Content-type: %s\n" % mtype)
            if retrlen is not None and retrlen >= 0:
                header_lines.append("Content-length: %d\n" % retrlen)
            headers = email.message_from_string(''.join(header_lines))
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            # Keep the traceback of the FTP error on the URLError.
            raise URLError('ftp error: %r' % exp).with_traceback(
                exp.__traceback__)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a new non-persistent ftpwrapper for the given target."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1588
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections in a cache for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        # key -> cached ftpwrapper
        self.cache = {}
        # key -> time after which the cached connection has expired
        self.timeout = {}
        # Smallest value in self.timeout (0 while nothing is cached).
        self.soonest = 0
        # Seconds a connection stays cached after it was last requested.
        self.delay = 60
        # Cache size at which the entry expiring soonest is evicted.
        self.max_conns = 16

    def setTimeout(self, t):
        """Set how many seconds a connection stays cached after use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached connection for the target, creating it if needed.

        Every call pushes the expiry time of the entry forward and then
        trims the cache.
        """
        key = user, host, port, '/'.join(dirs), timeout
        conn = self.cache.get(key)
        if conn is None:
            conn = ftpwrapper(user, passwd, host, port, dirs, timeout)
            self.cache[key] = conn
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        # check_cache() may have evicted this very entry (for instance
        # when max_conns is 1), so return the local reference instead of
        # looking the key up again.
        # NOTE(review): an evicted wrapper has had close() called on it;
        # confirm that ftpwrapper copes with being used afterwards.
        return conn

    def check_cache(self):
        """Close expired connections and evict one entry when the cache is full."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # default=0 keeps an emptied cache from raising ValueError.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection, as is done for expired
                    # entries above, instead of just dropping it.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close every cached connection and empty the cache."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1641
class DataHandler(BaseHandler):
    def data_open(self, req):
        """Open a "data:" URL (RFC 2397) and return it as an addinfourl.

        Any data POSTed with the request is ignored.

        Grammar:
            dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            mediatype := [ type "/" subtype ] *( ";" parameter )
            data      := *urlchar
            parameter := attribute "=" value
        """
        full_url = req.full_url

        _scheme, remainder = full_url.split(":", 1)
        mediatype, payload = remainder.split(",", 1)

        # Percent-decoding is applied unconditionally: base64 payloads may
        # be quoted as well.
        payload = unquote_to_bytes(payload)
        base64_marker = ";base64"
        if mediatype.endswith(base64_marker):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(base64_marker)]

        mediatype = mediatype or "text/plain;charset=US-ASCII"

        header_text = ("Content-type: %s\nContent-length: %d\n" %
                       (mediatype, len(payload)))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, full_url)
1671
1672
# Code moved from the old urllib module
1674
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Path <-> URL helpers: Windows ships its own implementation in nturl2path,
# every other platform simply percent-(un)quotes.
if os.name != 'nt':
    def url2pathname(pathname):
        """Convert a relative 'file' scheme URL to a file system path.

        OS-specific; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path to a relative 'file' scheme URL.

        OS-specific; not recommended for general use."""
        return quote(pathname)
else:
    from nturl2path import url2pathname, pathname2url


ftpcache = {}
1693
1694
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level defaults: __del__ runs cleanup() even when __init__ raised
    # before assigning these (e.g. the DeprecationWarning turned into an
    # error), so both attributes must always be resolvable.
    __tempfiles = None
    tempcache = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up proxies, optional client certificate files and headers.

        proxies -- mapping of URL scheme to proxy URL; getproxies() is
                   used when it is None.
        x509    -- only the 'key_file' and 'cert_file' keywords are read.

        A DeprecationWarning is issued on every instantiation.
        """
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        """Release resources held by this opener (see cleanup())."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary files made by retrieve() and clear tempcache."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    # Best effort: the file may already be gone.
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r').

        Dispatches to the open_<scheme>() method matching the URL (or the
        proxy configured for that scheme).  HTTPError/URLError propagate
        unchanged; any other OSError is re-raised as
        OSError('socket error', original).
        """
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # open_local_file is deliberately not reachable through a URL scheme.
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open a URL through an unusable proxy."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        reporthook, when given, is called as reporthook(blocknum, blocksize,
        totalsize) once before the first read and after every block;
        totalsize is -1 when no Content-Length header is present.

        Raises ContentTooShortError when fewer bytes than announced by
        Content-Length were received.  The result is stored in tempcache
        only after a complete, successful transfer.
        """
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                # Not a readable local file; fall through to a normal open().
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # Derive the temp file suffix from the path part of the URL.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        # Cache only now: caching before the transfer finished would make a
        # failed or truncated download be served from the cache afterwards.
        if self.tempcache is not None:
            self.tempcache[url] = result

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_factory.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieve or a (host, full-url) pair; open()
          passes the pair when the request goes through a proxy.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the user:password part.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        # Headers from addheaders override the ones computed above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            # NOTE(review): the reported URL always carries the "http:"
            # scheme, also when this is reached through open_https() --
            # confirm whether that is intended.
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result means the handler declined; use the default.
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise HTTPError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            """Return an HTTPSConnection using this opener's key/cert files."""
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject "//host/..." unless the host part is empty or "localhost".
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file.

        Raises URLError when the file cannot be stat()ed or when the host
        part of the URL does not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol.

        Connections are kept in self.ftpcache, keyed on
        (user, host, port, joined dirs).  Errors from ftplib are re-raised
        as URLError.
        """
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # No file name means a directory listing; otherwise binary.
            if not file: type = 'D'
            else: type = 'I'
            # An explicit ";type=" attribute overrides the default.
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without "=" is the encoding, not a parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        # NOTE(review): the file object wraps the whole message, header
        # lines included, not just the decoded payload -- confirm intended.
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2138
2139
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps).

    Adds redirect following (301, 302, 303, 307) and Basic authentication
    retries (401, 407) on top of URLopener.
    """

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # "realm@host" -> (user, passwd)
        self.tries = 0          # redirects followed in the current chain
        self.maxtries = 10      # redirect limit; a falsy value disables it

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            if self.maxtries and self.tries >= self.maxtries:
                # Too many redirects: report it as a synthetic 500 error.
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the redirect named by the 'location' or 'uri' header.

        Returns None when neither header is present.  Raises HTTPError when
        the target scheme is not http, https, ftp or empty.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'www-authenticate', 'retry_',
            url, fp, errcode, errmsg, headers, data, retry)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        return self._basic_auth_challenge(
            'proxy-authenticate', 'retry_proxy_',
            url, fp, errcode, errmsg, headers, data, retry)

    def _basic_auth_challenge(self, challenge_header, retry_prefix, url, fp,
                              errcode, errmsg, headers, data, retry):
        """Shared 401/407 logic: validate the challenge, then retry.

        Unless the challenge header is present, parses as a Basic challenge
        with a realm, and retry is true, the request is handed to
        URLopener.http_error_default (which raises HTTPError).  Otherwise
        the method named <retry_prefix><self.type>_basic_auth is called.
        """
        fail_args = (self, url, fp, errcode, errmsg, headers)
        if challenge_header not in headers:
            return URLopener.http_error_default(*fail_args)
        stuff = headers[challenge_header]
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            return URLopener.http_error_default(*fail_args)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return URLopener.http_error_default(*fail_args)
        if not retry:
            return URLopener.http_error_default(*fail_args)
        name = retry_prefix + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def _retry_proxy_basic_auth(self, scheme, url, realm, data):
        """Retry url after storing fresh credentials in the scheme's proxy URL.

        Returns None when no user name or password could be obtained.
        """
        host, selector = _splithost(url)
        newurl = scheme + '://' + host + selector
        proxy = self.proxies[scheme]
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # i is non-zero when the proxy URL already carried credentials; it
        # is passed on so that the cached (rejected) credentials get dropped.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies[scheme] = scheme + '://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def _retry_basic_auth(self, scheme, url, realm, data):
        """Retry url with fresh credentials embedded in its host part.

        Returns None when no user name or password could be obtained.
        """
        host, selector = _splithost(url)
        # i is non-zero when the URL already carried credentials; it is
        # passed on so that the cached (rejected) credentials get dropped.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = scheme + '://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 407 from the http proxy."""
        return self._retry_proxy_basic_auth('http', url, realm, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 407 from the https proxy."""
        return self._retry_proxy_basic_auth('https', url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after a 401 from the server."""
        return self._retry_basic_auth('http', url, realm, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after a 401 from the server."""
        return self._retry_basic_auth('https', url, realm, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm at host, prompting if not cached.

        A true clear_cache discards a cached entry and prompts again.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2350
2351
2352# Utility functions
2353
_localhost = None
def localhost():
    """Return the IP address that the name 'localhost' resolves to.

    The lookup is done once; later calls return the remembered value."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2361
_thishost = None
def thishost():
    """Return a tuple with the IP addresses of the current host.

    Falls back to the addresses of 'localhost' when the machine's own host
    name cannot be resolved.  The result is computed once and remembered."""
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2372
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on the first call; the value of
    ftplib.all_errors is then cached in the module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    from ftplib import all_errors
    _ftperrors = all_errors
    return _ftperrors
2381
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The object is created on the first call and the same instance is
    returned by every later call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2389
2390
2391# Utility classes
2392
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections.

    An instance owns one ftplib.FTP connection and counts the file
    objects handed out by retrfile() (refcount), so that the connection
    is really closed only once nothing uses it any more.
    """

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Store the connection parameters and connect immediately.

        dirs is the sequence of directory names joined with '/' and used
        as the working directory after login.  persistent becomes the
        keepalive flag: while it is true, closing the last file object
        does not close the connection.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Deliberately bare: release the connection on any failure
            # (including KeyboardInterrupt) and always re-raise.
            self.close()
            raise

    def init(self):
        """(Re)connect, log in and change to the configured directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, retrieval length).

        type is the FTP transfer type; 'd' or 'D' requests a directory
        listing.  For any other type a 'RETR file' is attempted first; if
        the server answers 550, or no file name is given, a 'LIST' is
        issued instead.  The length is whatever ftplib's ntransfercmd()
        reported.

        Raises URLError (chained to the ftplib error) when RETR fails
        with a permanent error other than 550, or when the directory to
        list cannot be entered.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = True
        else:
            cmd = 'TYPE ' + type
            isdir = False
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may be stale: reconnect once, retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 falls through to the directory listing below.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason) from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the previous working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the returned object is closed.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the connection as no longer busy with a transfer."""
        self.busy = 0

    def close(self):
        """Drop the keepalive flag; close now if no file object is open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Close hook of a file object returned by retrfile().

        Decrements the reference count and closes the connection when it
        reaches zero and the wrapper is not kept alive.
        """
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Close the FTP connection, ignoring ftplib errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2486
2487# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    The environment is scanned for variables named <scheme>_proxy
    (this seems to be the standard convention); the scheme is the
    lowercased part in front of '_proxy'.  If you need a different way,
    you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.

    """
    suffix = '_proxy'
    cut = len(suffix)
    proxies = {}
    # Lowercase variables are preferred, so the environment is walked
    # twice: the first pass accepts any spelling of the suffix ...
    for variable, setting in os.environ.items():
        lowered = variable.lower()
        if setting and lowered.endswith(suffix):
            proxies[lowered[:-cut]] = setting
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # ... and the second pass only accepts a lowercase '_proxy' suffix,
    # overriding (or, for an empty value, removing) earlier entries.
    for variable, setting in os.environ.items():
        if not variable.endswith(suffix):
            continue
        scheme = variable.lower()[:-cut]
        if setting:
            proxies[scheme] = setting
        else:
            proxies.pop(scheme, None)
    return proxies
2518
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    The 'no' entry of the proxy dict (taken from the environment when
    proxies is None) holds the value of no_proxy: a comma separated
    list of DNS suffixes, or '*' to bypass the proxy for all hosts.
    The host is compared case-insensitively, both with and without its
    port, and leading dots of the suffixes are ignored.

    """
    if proxies is None:
        proxies = getproxies_environment()
    try:
        no_proxy = proxies['no']
    except KeyError:
        # no_proxy isn't specified: never bypass
        return False
    if no_proxy == '*':
        # special case: always bypass
        return True
    host = host.lower()
    hostonly, _port = _splitport(host)
    candidates = (hostonly, host)
    for entry in no_proxy.split(','):
        suffix = entry.strip()
        if not suffix:
            continue
        # ignore leading dots and case
        suffix = suffix.lstrip('.').lower()
        if any(candidate == suffix for candidate in candidates):
            return True
        dotted = '.' + suffix
        if any(candidate.endswith(dotted) for candidate in candidates):
            return True
    # no suffix matched: don't bypass
    return False
2552
2553
2554# This code tests an OSX specific data structure but is testable on all
2555# platforms
2556def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2557    """
2558    Return True iff this host shouldn't be accessed using a proxy
2559
2560    This function uses the MacOSX framework SystemConfiguration
2561    to fetch the proxy information.
2562
2563    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2564    { 'exclude_simple': bool,
2565      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2566    }
2567    """
2568    from fnmatch import fnmatch
2569
2570    hostonly, port = _splitport(host)
2571
2572    def ip2num(ipAddr):
2573        parts = ipAddr.split('.')
2574        parts = list(map(int, parts))
2575        if len(parts) != 4:
2576            parts = (parts + [0, 0, 0, 0])[:4]
2577        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2578
2579    # Check for simple host names:
2580    if '.' not in host:
2581        if proxy_settings['exclude_simple']:
2582            return True
2583
2584    hostIP = None
2585
2586    for value in proxy_settings.get('exceptions', ()):
2587        # Items in the list are strings like these: *.local, 169.254/16
2588        if not value: continue
2589
2590        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2591        if m is not None:
2592            if hostIP is None:
2593                try:
2594                    hostIP = socket.gethostbyname(hostonly)
2595                    hostIP = ip2num(hostIP)
2596                except OSError:
2597                    continue
2598
2599            base = ip2num(m.group(1))
2600            mask = m.group(2)
2601            if mask is None:
2602                mask = 8 * (m.group(1).count('.') + 1)
2603            else:
2604                mask = int(mask[1:])
2605
2606            if mask < 0 or mask > 32:
2607                # System libraries ignore invalid prefix lengths
2608                continue
2609
2610            mask = 32 - mask
2611
2612            if (hostIP >> mask) == (base >> mask):
2613                return True
2614
2615        elif fnmatch(host, value):
2616            return True
2617
2618    return False
2619
2620
2621if sys.platform == 'darwin':
2622    from _scproxy import _get_proxy_settings, _get_proxies
2623
2624    def proxy_bypass_macosx_sysconf(host):
2625        proxy_settings = _get_proxy_settings()
2626        return _proxy_bypass_macosx_sysconf(host, proxy_settings)
2627
2628    def getproxies_macosx_sysconf():
2629        """Return a dictionary of scheme -> proxy server URL mappings.
2630
2631        This function uses the MacOSX framework SystemConfiguration
2632        to fetch the proxy information.
2633        """
2634        return _get_proxies()
2635
2636
2637
2638    def proxy_bypass(host):
2639        """Return True, if host should be bypassed.
2640
2641        Checks proxy settings gathered from the environment, if specified,
2642        or from the MacOSX framework SystemConfiguration.
2643
2644        """
2645        proxies = getproxies_environment()
2646        if proxies:
2647            return proxy_bypass_environment(host, proxies)
2648        else:
2649            return proxy_bypass_macosx_sysconf(host)
2650
2651    def getproxies():
2652        return getproxies_environment() or getproxies_macosx_sysconf()
2653
2654
2655elif os.name == 'nt':
2656    def getproxies_registry():
2657        """Return a dictionary of scheme -> proxy server URL mappings.
2658
2659        Win32 uses the registry to store proxies.
2660
2661        """
2662        proxies = {}
2663        try:
2664            import winreg
2665        except ImportError:
2666            # Std module, so should be around - but you never know!
2667            return proxies
2668        try:
2669            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
2670                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2671            proxyEnable = winreg.QueryValueEx(internetSettings,
2672                                               'ProxyEnable')[0]
2673            if proxyEnable:
2674                # Returned as Unicode but problems if not converted to ASCII
2675                proxyServer = str(winreg.QueryValueEx(internetSettings,
2676                                                       'ProxyServer')[0])
2677                if '=' in proxyServer:
2678                    # Per-protocol settings
2679                    for p in proxyServer.split(';'):
2680                        protocol, address = p.split('=', 1)
2681                        # See if address has a type:// prefix
2682                        if not re.match('(?:[^/:]+)://', address):
2683                            address = '%s://%s' % (protocol, address)
2684                        proxies[protocol] = address
2685                else:
2686                    # Use one setting for all protocols
2687                    if proxyServer[:5] == 'http:':
2688                        proxies['http'] = proxyServer
2689                    else:
2690                        proxies['http'] = 'http://%s' % proxyServer
2691                        proxies['https'] = 'https://%s' % proxyServer
2692                        proxies['ftp'] = 'ftp://%s' % proxyServer
2693            internetSettings.Close()
2694        except (OSError, ValueError, TypeError):
2695            # Either registry key not found etc, or the value in an
2696            # unexpected format.
2697            # proxies already set up to be empty so nothing to do
2698            pass
2699        return proxies
2700
2701    def getproxies():
2702        """Return a dictionary of scheme -> proxy server URL mappings.
2703
2704        Returns settings gathered from the environment, if specified,
2705        or the registry.
2706
2707        """
2708        return getproxies_environment() or getproxies_registry()
2709
2710    def proxy_bypass_registry(host):
2711        try:
2712            import winreg
2713        except ImportError:
2714            # Std modules, so should be around - but you never know!
2715            return 0
2716        try:
2717            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
2718                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2719            proxyEnable = winreg.QueryValueEx(internetSettings,
2720                                               'ProxyEnable')[0]
2721            proxyOverride = str(winreg.QueryValueEx(internetSettings,
2722                                                     'ProxyOverride')[0])
2723            # ^^^^ Returned as Unicode but problems if not converted to ASCII
2724        except OSError:
2725            return 0
2726        if not proxyEnable or not proxyOverride:
2727            return 0
2728        # try to make a host list from name and IP address.
2729        rawHost, port = _splitport(host)
2730        host = [rawHost]
2731        try:
2732            addr = socket.gethostbyname(rawHost)
2733            if addr != rawHost:
2734                host.append(addr)
2735        except OSError:
2736            pass
2737        try:
2738            fqdn = socket.getfqdn(rawHost)
2739            if fqdn != rawHost:
2740                host.append(fqdn)
2741        except OSError:
2742            pass
2743        # make a check value list from the registry entry: replace the
2744        # '<local>' string by the localhost entry and the corresponding
2745        # canonical entry.
2746        proxyOverride = proxyOverride.split(';')
2747        # now check if we match one of the registry values.
2748        for test in proxyOverride:
2749            if test == '<local>':
2750                if '.' not in rawHost:
2751                    return 1
2752            test = test.replace(".", r"\.")     # mask dots
2753            test = test.replace("*", r".*")     # change glob sequence
2754            test = test.replace("?", r".")      # change glob char
2755            for val in host:
2756                if re.match(test, val, re.I):
2757                    return 1
2758        return 0
2759
2760    def proxy_bypass(host):
2761        """Return True, if host should be bypassed.
2762
2763        Checks proxy settings gathered from the environment, if specified,
2764        or the registry.
2765
2766        """
2767        proxies = getproxies_environment()
2768        if proxies:
2769            return proxy_bypass_environment(host, proxies)
2770        else:
2771            return proxy_bypass_registry(host)
2772
2773else:
    # By default use environment variables: on platforms other than
    # darwin and nt both public names are plain aliases of the
    # environment-based implementations.
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2777