1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below).  It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
BaseHandler -- Base class of all handlers; subclasses implement protocol
openers, request pre-processors and response post-processors that an
OpenerDirector chains together.
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('http://www.python.org/')
68"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
110# check for SSL
111try:
112    import ssl
113except ImportError:
114    _have_ssl = False
115else:
116    _have_ssl = True
117
118__all__ = [
119    # Classes
120    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
121    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
122    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
123    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
124    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
125    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
126    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
127    'UnknownHandler', 'HTTPErrorProcessor',
128    # Functions
129    'urlopen', 'install_opener', 'build_opener',
130    'pathname2url', 'url2pathname', 'getproxies',
131    # Legacy interface
132    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
133]
134
# Version string used in the User-Agent header sent by this library
136__version__ = '%d.%d' % sys.version_info[:2]
137
138_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # Deprecated SSL parameters; "warnings" is already imported at module
        # level, so the former redundant local import has been removed.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        # A one-shot opener built around the derived SSL context; the cached
        # global opener is deliberately not used or replaced here.
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily build and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
223
def install_opener(opener):
    """Install *opener* as the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
227
# Paths of temporary files created by urlretrieve(); urlcleanup() removes them.
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.

    Raises ContentTooShortError when fewer bytes are received than the
    Content-Length header announced.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # delete=False so the caller can use the file after we return;
            # the path is remembered so urlcleanup() can remove it later.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8  # copy in 8 KiB blocks
            size = -1    # -1 means the server did not announce a length
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial callback (block 0) before any data has been read.
            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    # NOTE: bs is the *requested* block size; the final
                    # block actually read may be shorter.
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
291
def urlcleanup():
    """Remove temporary files created by urlretrieve() and drop the
    installed global opener, if any."""
    global _opener
    for temp_path in _url_tempfiles:
        try:
            os.unlink(temp_path)
        except OSError:
            # Best effort: the file may already be gone.
            pass
    _url_tempfiles.clear()
    if _opener:
        _opener = None
304
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL had no network location; fall back to the Host header.
        host = request.get_header("Host", "")

    # Strip a trailing :port, then normalize case.
    return _cut_port_re.sub("", host, 1).lower()
322
class Request:
    """Encapsulates one URL request: the target URL, optional body (*data*),
    normal and "unredirected" headers, and bookkeeping attributes used by
    the handler chain (proxy tunnelling host, origin request host, etc.)."""

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # NOTE: the mutable default for *headers* is safe here only because
        # it is iterated, never mutated.
        self.full_url = url            # property: also parses type/host/selector
        self.headers = {}
        self.unredirected_hdrs = {}    # headers that are dropped on redirect
        self._data = None
        self.data = data               # property: clears Content-length on change
        self._tunnel_host = None       # set by set_proxy() for https-over-proxy
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        """The request URL, with any fragment re-attached."""
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        # Split off '#fragment' and keep it so the getter can reassemble it.
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        """The request body, or None for a body-less request."""
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        # Route through the setter so the Content-length header is cleared.
        self.data = None

    def _parse(self):
        # Derive self.type (scheme), self.host and self.selector (path part)
        # from the already-defragmented URL.
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        """Return the full request URL (including any fragment)."""
        return self.full_url

    def set_proxy(self, host, type):
        # For https, remember the original host for tunnelling instead of
        # rewriting the request; otherwise switch the scheme and send the
        # absolute URL as the selector.
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type= type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # True once set_proxy() has rewritten the selector to the full URL.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        # Checks both header stores; names are stored capitalize()d, so the
        # lookup key must match that form.
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        # Remove from both stores; missing keys are ignored.
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Regular headers take precedence over unredirected ones on clash.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
431
class OpenerDirector:
    """Manages a chain of handlers and dispatches requests through them.

    Handlers register via add_handler(); open() then routes a request
    through the protocol's request pre-processors, the opener chain, and
    the response post-processors, in that order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}        # protocol -> [handlers with <proto>_open]
        self.handle_error = {}       # protocol -> {code -> [handlers]}
        self.process_response = {}   # protocol -> [handlers with <proto>_response]
        self.process_request = {}    # protocol -> [handlers with <proto>_request]

    def add_handler(self, handler):
        """Register *handler* for every capability its method names declare.

        Methods named '<protocol>_open', '<protocol>_request',
        '<protocol>_response' or '<protocol>_error_<code>' advertise the
        corresponding capability.  Handler lists are kept sorted by
        handler_order (via BaseHandler.__lt__ and bisect.insort).
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split '<protocol>_<condition>' at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. 'http_error_404' -> kind 404 (int);
                # 'http_error_default' -> kind 'default' (str).
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Insert sorted by handler_order; plain append when the list is
            # empty avoids the bisect machinery for the common first insert.
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return a response.

        Runs request pre-processors, the opener chain (_open), and
        response post-processors, in that order.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then the protocol-specific opener, and
        # finally unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error_* handlers.

        For http/https, *args* is (request, response, code, msg, hdrs) as
        passed by HTTPErrorProcessor; the status code becomes the lookup
        key, with http_error_default as the fallback.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK! args[2] is the HTTP status code
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # No code-specific handler produced a result; fall back to the
            # http_error_default chain with the original arguments.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
570
571# XXX probably also want an abstract factory that knows when it makes
572# sense to skip a superclass in favor of a subclass and when it might
573# make sense to include both
574
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def _overrides(check, klass):
        # A handler argument displaces a default handler when it is that
        # class or a subclass, supplied either as a class or an instance.
        if isinstance(check, type):
            return issubclass(check, klass)
        return isinstance(check, klass)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Keep only the defaults that no caller-supplied handler overrides.
    remaining = [klass for klass in default_classes
                 if not any(_overrides(check, klass) for check in handlers)]
    for klass in remaining:
        opener.add_handler(klass())

    # Caller-supplied handlers go in last; bare classes are instantiated.
    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener
610
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector.

    Ordering among handlers is controlled by the handler_order class
    attribute: lower values sort (and therefore run) earlier.
    """

    handler_order = 500

    def add_parent(self, parent):
        # Remember the OpenerDirector this handler was added to.
        self.parent = parent

    def close(self):
        # Retained for backwards compatibility only; nothing to release.
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of sorting custom classes that know
            # nothing about handler_order after the default handlers.
            return True
        return self.handler_order < other_order
628
629
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: a 2xx code means the request was successfully received,
        # understood, and accepted -- pass the response through untouched.
        if 200 <= code < 300:
            return response

        # Anything else goes through the parent's error-handler chain,
        # which may raise or substitute a replacement response.
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
646
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: convert any unhandled HTTP error response
    into an HTTPError exception."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only GET/HEAD may follow any of the four codes; POST may follow
        # 301/302/303 (and is converted to a body-less request below).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the new Request carries no data,
        # so its method defaults back to GET.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # An authority with an empty path gets a canonical '/' path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        # Resolve a relative Location against the request URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # 301, 303 and 307 are handled by the same logic as 302.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
762
763
764def _parse_proxy(proxy):
765    """Return (scheme, user, password, host/port) given a URL or an authority.
766
767    If a URL is supplied, it must have an authority (host:port) component.
768    According to RFC 3986, having an authority component means the URL must
769    have two slashes after the scheme.
770    """
771    scheme, r_scheme = _splittype(proxy)
772    if not r_scheme.startswith("/"):
773        # authority
774        scheme = None
775        authority = proxy
776    else:
777        # URL
778        if not r_scheme.startswith("//"):
779            raise ValueError("proxy URL with no authority: %r" % proxy)
780        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
781        # and 3.3.), path is empty or starts with '/'
782        if '@' in r_scheme:
783            host_separator = r_scheme.find('@')
784            end = r_scheme.find("/", host_separator)
785        else:
786            end = r_scheme.find("/", 2)
787        if end == -1:
788            end = None
789        authority = r_scheme[2:end]
790    userinfo, hostport = _splituser(authority)
791    if userinfo is not None:
792        user, password = _splitpasswd(userinfo)
793    else:
794        user = password = None
795    return scheme, user, password, hostport
796
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy_url} mapping.

    When no mapping is supplied, proxies are discovered from the
    environment via getproxies().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Dynamically create a '<scheme>_open' method on this instance for
        # every configured scheme.  The keyword defaults bind url/type at
        # definition time, avoiding the late-binding closure pitfall.
        for type, url in proxies.items():
            type = type.lower()
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy*.

        Returns None so the normal protocol handler performs the actual
        open, except when the proxy's scheme differs from the request's,
        in which case the whole chain is restarted.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honor bypass rules for this host; proxy_bypass is defined
        # elsewhere in this module (presumably platform-dependent --
        # confirm against the full file).
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # Pre-emptively attach Proxy-Authorization (Basic) credentials.
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
839
class HTTPPasswordMgr:
    """Store user/password credentials keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple_of_reduced_uris: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at *uri* (a URI or a sequence)."""
        uris = [uri] if isinstance(uri, str) else uri
        realm_map = self.passwd.setdefault(realm, {})
        # Index under both the default-port and as-given forms so lookups
        # succeed whether or not the port is spelled out.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri_group, authinfo in candidates.items():
                if any(self.is_suburi(u, target) for u in uri_group):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI with an authority component.
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit port for the known schemes.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
902
903
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the catch-all (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up credentials for *realm*, falling back to the default realm."""
        user, password = super().find_user_password(realm, authuri)
        if user is None:
            # Nothing registered for this realm; try the wildcard realm.
            user, password = super().find_user_password(None, authuri)
        return user, password
912
913
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also records which URIs may receive
    credentials pre-emptively ("prior auth")."""

    def __init__(self, *args, **kwargs):
        # reduced-uri -> bool "send credentials without a challenge" flag
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Also register under the default realm so prior-auth requests
        # (which carry no realm) can find the credentials.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record the prior-auth flag for *uri* (a URI or a sequence)."""
        uris = [uri] if isinstance(uri, str) else uri
        for default_port in (True, False):
            for u in uris:
                key = self.reduce_uri(u, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the recorded flag for *authuri*, or None if unknown."""
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri in self.authenticated:
                if self.is_suburi(uri, target):
                    return self.authenticated[uri]
        return None
943
944
class AbstractBasicAuthHandler:
    """Shared machinery for Basic auth against servers (401) and proxies
    (407): challenge parsing, credential lookup, and request retry."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'   # start of the string or ','
                    '[ \t]*'    # optional whitespaces
                    '([^ \t,]+)' # scheme like "Basic"
                    '[ \t]+'    # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # password_mgr defaults to a plain HTTPPasswordMgr.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        """Yield (scheme, realm) for every challenge found in *header*.

        realm is None when no realm= parameter was matched; in that case
        only the header's first word is reported as the scheme.
        """
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials for the first Basic
        challenge found in the *authreq* headers; raise ValueError when
        only unsupported schemes were offered; return None when there is
        no challenge to answer.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            # Report the scheme we actually rejected, not whatever
            # challenge happened to be parsed last (which may even have
            # been a realm-less Basic one).
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (unsupported,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with an Authorization/Proxy-authorization
        header, or return None when no credentials are known or the same
        credentials already failed."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # These exact credentials were already rejected; avoid
                # an endless retry loop.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively attach Basic credentials for URIs the password
        manager has marked as previously authenticated."""
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the exchange succeeded so later requests to the
        same URI can (or can no longer) use prior auth."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1053
1054
1055
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Delegate to the shared Basic-auth retry machinery.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1065
1066
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses from proxies with Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1080
1081
# Return n random bytes.  (Module-level alias for os.urandom; used below
# for digest-auth cnonce generation.)
_randombytes = os.urandom
1084
1085
class AbstractDigestAuthHandler:
    """Shared machinery for Digest auth against servers (401) and
    proxies (407): challenge parsing, response computation, retry."""

    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd defaults to a plain HTTPPasswordMgr.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # Failed attempts in the current request cycle.
        self.retried = 0
        # nonce-count (nc) bookkeeping per RFC 2617 s. 3.2.2.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* if the *auth_header* challenge uses the Digest
        scheme.  Raises HTTPError after too many failures, ValueError
        for schemes that are neither digest nor basic."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # 'basic' falls through silently so the Basic handler
                # can answer it.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with a Digest authorization header built from
        the server challenge *auth*; return None when no credentials are
        available or the same credentials already failed."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical header already sent and rejected: give up.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the Authorization header (minus the
        'Digest ' prefix) from challenge dict *chal*, or return None
        when the challenge is unusable or no credentials are known."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per  RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # Reuse of a nonce increments nc; a fresh nonce resets it.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) digest helpers of RFC 2617 s. 3.2.1 for
        *algorithm*; raises ValueError for unknown algorithms."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1230
1231
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The authority part of the URL is what credentials are keyed on.
        authority = urlparse(req.full_url)[1]
        result = self.http_error_auth_reqed('www-authenticate',
                                            authority, req, headers)
        self.reset_retry_count()
        return result
1248
1249
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 proxy challenges with Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
1261
class AbstractHTTPHandler(BaseHandler):
    """Common machinery for the HTTP and HTTPS handlers: request
    preprocessing (do_request_) and the network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        # Forwarded to the connection's set_debuglevel() in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client; may return None when the body length
        # cannot be determined up front.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in required headers on *request* in place and return it.

        Rejects str bodies (must be bytes) and host-less URLs; defaults
        Content-type for bodies; sets Content-length or, when the body
        length is unknown, Transfer-encoding: chunked; adds Host and the
        opener-wide addheaders.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # For proxied requests the selector is a full URL; Host must
            # name the origin server, not the proxy.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over the normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnel through a proxy (https via proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1378
1379
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs over a plain http.client.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    def http_request(self, req):
        # Standard AbstractHTTPHandler request preprocessing.
        return self.do_request_(req)
1386
if hasattr(http.client, 'HTTPSConnection'):
    # Only available when the interpreter was built with SSL support.

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs over http.client.HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # context: an ssl.SSLContext (or None for the default);
            # check_hostname is forwarded to HTTPSConnection --
            # presumably overriding the context's setting; confirm
            # against the http.client documentation.
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1403
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Add any applicable Cookie header before the request is sent.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record any Set-Cookie headers carried by the response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1421
class UnknownHandler(BaseHandler):
    """Last-resort handler: fail for URL schemes nothing else handles."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1426
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  Returns a dict
    mapping key to value; an element without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ('key='), which would otherwise
        # raise IndexError on v[0].
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1436
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []            # characters of the element being accumulated
    in_quotes = False
    escaped = False
    for ch in s:
        if escaped:
            # Previous char was a backslash inside quotes: take literally.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Element separator outside quotes (empty elements kept here,
            # stripped only of surrounding whitespace below).
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # Flush the trailing element, if any.
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1479
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL that refers to the local host.

        Raises URLError for a remote host.  Note: when the host is
        non-local in form but resolves to one of our own names, this
        falls through and returns None, letting other handlers try.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return all IP addresses considered local (cached on the class)."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Resolution failed; fall back to just localhost's address.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with Content-type,
        Content-length and Last-modified synthesized from the
        filesystem; raise URLError if the file is not local or cannot
        be stat'ed/opened."""
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Only serve the file if the URL names no host, or names this
            # machine without an explicit port.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1530
1531def _safe_gethostbyname(host):
1532    try:
1533        return socket.gethostbyname(host)
1534    except socket.gaierror:
1535        return None
1536
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Fetch a file or directory listing over FTP.

        Returns an addinfourl; all ftplib errors are re-raised as
        URLError.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        # Split off ";attr=value" URL attributes, then the path segments.
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            # Drop the leading empty segment from an absolute path.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary) for a file, 'D' (directory listing) otherwise;
            # may be overridden by a ";type=x" URL attribute below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1594
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches ftpwrapper connections for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # idle lifetime of a cached connection (seconds)
        self.max_conns = 16  # hard cap on cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one if
        needed, and refresh its expiry time."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size cap."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # default=0 guards against an empty cache, where min() would
        # raise ValueError (e.g. everything just expired above).
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection instead of leaking its
                    # socket (mirrors the expiry branch above).
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1647
class DataHandler(BaseHandler):
    """Handler for 'data:' URLs as specified in RFC 2397."""

    def data_open(self, req):
        # Any POSTed payload is ignored for data URLs.
        #
        # grammar:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        full_url = req.full_url

        _scheme, remainder = full_url.split(":", 1)
        mediatype, payload = remainder.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(";base64")]

        # RFC 2397 default media type when none is given.
        mediatype = mediatype or "text/plain;charset=US-ASCII"

        header_text = ("Content-type: %s\nContent-length: %d\n"
                       % (mediatype, len(payload)))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, full_url)
1677
1678
# Code move from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX the path and the URL differ only in percent-encoding.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # On POSIX the conversion is plain percent-quoting.
        return quote(pathname)


# Module-wide FTP connection cache, shared by every URLopener instance
# that does not install its own (see URLopener.__init__ below).
ftpcache = {}
1699
1700
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default keeps cleanup() safe even when __init__ never
    # completed (e.g. construction failed before the assignment below).
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # stacklevel=3 points the DeprecationWarning at the code that
        # instantiated the (sub)class, not at this frame.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for client-side SSL certs;
        # they are used by _https_connection() below.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        # Best-effort removal of temporary files at garbage collection.
        self.close()

    def close(self):
        """Release resources: delete temp files and clear the temp cache."""
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        # Quote characters outside the safe set so the scheme dispatch
        # below sees a normalized URL; '%' is safe, so sequences that are
        # already percent-quoted pass through unchanged.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # NOTE: 'open_local_file' is excluded from dynamic dispatch here,
        # so a crafted scheme (after the '-'/'_' mangling above) cannot
        # reach the local-file opener directly; local files must go
        # through open_file() and its host checks.
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # Fast path: a local file needs no copy; return its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError as msg:
                # Fall through and retry via the generic open() below.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file that keeps
                # the URL path's extension (path minus query/attributes).
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                # reporthook is called as (block number, block size,
                # total size): once up front, then once per block read.
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: credentials may be embedded in the netloc.
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is the (proxyhost, full-url) pair built
            # in open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        # Credentials (if any) are sent preemptively as HTTP Basic auth.
        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx: delegate to the (possibly subclass-provided)
            # http_error machinery.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Uses the client certificate/key captured from **x509 in
            # __init__, if any.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject 'file://host/...' forms whose authority component is a
        # remote host (anything but '', '/', or 'localhost').
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # A host is acceptable only when it resolves to this machine and
        # no explicit port was given.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing when no file part,
            # binary ('I') otherwise; a ';type=x' attribute overrides it.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        # NOTE: returns a text (str) stream here, unlike DataHandler
        # above, which returns a bytes stream.
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2144
2145
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}   # 'realm@host' -> (user, password)
        self.tries = 0         # consecutive redirects followed so far
        self.maxtries = 10     # redirect limit before giving up

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike the base class, errors come back as a response object
        # whose status carries the error code.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Guard against redirect loops: after maxtries hops, report a
            # synthesized 500 instead of recursing forever.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            # Reset the counter once the whole redirect chain unwinds.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # No redirect target supplied; give up (caller falls back to
            # http_error_default via http_error()).
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each URLopener.http_error_default() call below raises HTTPError,
        # so a failed precondition terminates processing at that point.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # depending on the scheme of the current request (self.type).
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but for the proxy's auth challenge;
        # the http_error_default() calls below raise HTTPError.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Obtain proxy credentials, embed them in the configured proxy URL
        # and retry the original request.
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip credentials already present in the proxy netloc.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Same as retry_proxy_http_basic_auth, for the https proxy entry.
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Strip any credentials already in the URL, get fresh ones (cache
        # or prompt) and retry with user:pass embedded in the netloc.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # Same as retry_http_basic_auth, over https.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, password) for the realm/host, consulting the
        cache unless clear_cache is true (then re-prompt)."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                # Drop the stale entry and fall through to re-prompt.
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Treat ^C at the prompt as "no credentials supplied".
            print()
            return None, None
2356
2357
2358# Utility functions
2359
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The result is cached in a module-level variable so the name is
    resolved at most once per process.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2367
_thishost = None
def thishost():
    """Return the IP addresses of the current host."""
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addrs = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        # The machine's own hostname does not resolve; fall back to the
        # loopback addresses.
        addrs = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addrs)
    return _thishost
2378
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    # ftplib is imported lazily: most users of this module never touch FTP.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2387
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    # Parse the empty string once; the resulting Message is shared.
    _noheaders = email.message_from_string("")
    return _noheaders
2395
2396
2397# Utility classes
2398
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        # Connection parameters are kept so init() can reconnect after the
        # server drops an idle control connection.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of file objects handed out by retrfile() that are still
        # open; the connection is only really closed once this hits zero.
        self.refcount = 0
        # When False, the connection is torn down as soon as it is unused.
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Don't leak a half-open connection if login/cwd fails.
            self.close()
            raise

    def init(self):
        """(Re)establish the FTP connection, log in and cwd to self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Begin retrieving *file* and return a (fileobj, length) pair.

        *type* 'd'/'D' forces a directory listing in ASCII mode; any other
        value is sent verbatim in a TYPE command.  When a plain RETR is
        refused with a 550, falls back to a LIST of *file* (after checking
        the directory exists).  *length* may be None when unknown.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The control connection may have timed out; reconnect and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply means "not a plain file" -- fall through to the
                # LIST attempt below; anything else is a genuine error.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the previous working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # The returned object invokes file_close() when closed, which
        # decrements refcount and may tear down the connection.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        # Mark the data transfer as finished.
        self.busy = 0

    def close(self):
        """Disable keepalive and close the connection once unreferenced."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Close hook for file objects returned by retrfile().
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Unconditionally shut down the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort: the connection may already be gone.
            pass
2492
2493# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    suffix = '_proxy'
    proxies = {}
    # Two passes so that all-lowercase variables win over any other casing:
    # the first pass accepts any case, the second only names that already
    # end in lowercase '_proxy'.
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered.endswith(suffix):
            proxies[lowered[:-len(suffix)]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name.endswith(suffix):
            scheme = name.lower()[:-len(suffix)]
            if value:
                proxies[scheme] = value
            else:
                # An empty value explicitly disables that scheme's proxy.
                proxies.pop(scheme, None)
    return proxies
2524
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.
    """
    if proxies is None:
        proxies = getproxies_environment()
    try:
        no_proxy = proxies['no']
    except KeyError:
        # No no_proxy setting at all -> never bypass.
        return False
    if no_proxy == '*':
        # '*' is the special "always bypass" value.
        return True
    host = host.lower()
    # Compare both with and without any :port suffix.
    hostonly, port = _splitport(host)
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        suffix = entry.lstrip('.').lower()  # leading dots are ignored
        if hostonly == suffix or host == suffix:
            return True
        dotted = '.' + suffix
        if hostonly.endswith(dotted) or host.endswith(dotted):
            return True
    return False
2558
2559
2560# This code tests an OSX specific data structure but is testable on all
2561# platforms
2562def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2563    """
2564    Return True iff this host shouldn't be accessed using a proxy
2565
2566    This function uses the MacOSX framework SystemConfiguration
2567    to fetch the proxy information.
2568
2569    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2570    { 'exclude_simple': bool,
2571      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2572    }
2573    """
2574    from fnmatch import fnmatch
2575
2576    hostonly, port = _splitport(host)
2577
2578    def ip2num(ipAddr):
2579        parts = ipAddr.split('.')
2580        parts = list(map(int, parts))
2581        if len(parts) != 4:
2582            parts = (parts + [0, 0, 0, 0])[:4]
2583        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2584
2585    # Check for simple host names:
2586    if '.' not in host:
2587        if proxy_settings['exclude_simple']:
2588            return True
2589
2590    hostIP = None
2591
2592    for value in proxy_settings.get('exceptions', ()):
2593        # Items in the list are strings like these: *.local, 169.254/16
2594        if not value: continue
2595
2596        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2597        if m is not None:
2598            if hostIP is None:
2599                try:
2600                    hostIP = socket.gethostbyname(hostonly)
2601                    hostIP = ip2num(hostIP)
2602                except OSError:
2603                    continue
2604
2605            base = ip2num(m.group(1))
2606            mask = m.group(2)
2607            if mask is None:
2608                mask = 8 * (m.group(1).count('.') + 1)
2609            else:
2610                mask = int(mask[1:])
2611
2612            if mask < 0 or mask > 32:
2613                # System libraries ignore invalid prefix lengths
2614                continue
2615
2616            mask = 32 - mask
2617
2618            if (hostIP >> mask) == (base >> mask):
2619                return True
2620
2621        elif fnmatch(host, value):
2622            return True
2623
2624    return False
2625
2626
2627if sys.platform == 'darwin':
2628    from _scproxy import _get_proxy_settings, _get_proxies
2629
    def proxy_bypass_macosx_sysconf(host):
        """Return True iff *host* should bypass the proxy according to the
        macOS system configuration (via the _scproxy extension)."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)
2633
    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        # _get_proxies (from _scproxy) reads the system-wide proxy settings.
        return _get_proxies()
2641
2642
2643
2644    def proxy_bypass(host):
2645        """Return True, if host should be bypassed.
2646
2647        Checks proxy settings gathered from the environment, if specified,
2648        or from the MacOSX framework SystemConfiguration.
2649
2650        """
2651        proxies = getproxies_environment()
2652        if proxies:
2653            return proxy_bypass_environment(host, proxies)
2654        else:
2655            return proxy_bypass_macosx_sysconf(host)
2656
2657    def getproxies():
2658        return getproxies_environment() or getproxies_macosx_sysconf()
2659
2660
2661elif os.name == 'nt':
2662    def getproxies_registry():
2663        """Return a dictionary of scheme -> proxy server URL mappings.
2664
2665        Win32 uses the registry to store proxies.
2666
2667        """
2668        proxies = {}
2669        try:
2670            import winreg
2671        except ImportError:
2672            # Std module, so should be around - but you never know!
2673            return proxies
2674        try:
2675            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
2676                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2677            proxyEnable = winreg.QueryValueEx(internetSettings,
2678                                               'ProxyEnable')[0]
2679            if proxyEnable:
2680                # Returned as Unicode but problems if not converted to ASCII
2681                proxyServer = str(winreg.QueryValueEx(internetSettings,
2682                                                       'ProxyServer')[0])
2683                if '=' in proxyServer:
2684                    # Per-protocol settings
2685                    for p in proxyServer.split(';'):
2686                        protocol, address = p.split('=', 1)
2687                        # See if address has a type:// prefix
2688                        if not re.match('(?:[^/:]+)://', address):
2689                            address = '%s://%s' % (protocol, address)
2690                        proxies[protocol] = address
2691                else:
2692                    # Use one setting for all protocols
2693                    if proxyServer[:5] == 'http:':
2694                        proxies['http'] = proxyServer
2695                    else:
2696                        proxies['http'] = 'http://%s' % proxyServer
2697                        proxies['https'] = 'https://%s' % proxyServer
2698                        proxies['ftp'] = 'ftp://%s' % proxyServer
2699            internetSettings.Close()
2700        except (OSError, ValueError, TypeError):
2701            # Either registry key not found etc, or the value in an
2702            # unexpected format.
2703            # proxies already set up to be empty so nothing to do
2704            pass
2705        return proxies
2706
2707    def getproxies():
2708        """Return a dictionary of scheme -> proxy server URL mappings.
2709
2710        Returns settings gathered from the environment, if specified,
2711        or the registry.
2712
2713        """
2714        return getproxies_environment() or getproxies_registry()
2715
2716    def proxy_bypass_registry(host):
2717        try:
2718            import winreg
2719        except ImportError:
2720            # Std modules, so should be around - but you never know!
2721            return 0
2722        try:
2723            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
2724                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
2725            proxyEnable = winreg.QueryValueEx(internetSettings,
2726                                               'ProxyEnable')[0]
2727            proxyOverride = str(winreg.QueryValueEx(internetSettings,
2728                                                     'ProxyOverride')[0])
2729            # ^^^^ Returned as Unicode but problems if not converted to ASCII
2730        except OSError:
2731            return 0
2732        if not proxyEnable or not proxyOverride:
2733            return 0
2734        # try to make a host list from name and IP address.
2735        rawHost, port = _splitport(host)
2736        host = [rawHost]
2737        try:
2738            addr = socket.gethostbyname(rawHost)
2739            if addr != rawHost:
2740                host.append(addr)
2741        except OSError:
2742            pass
2743        try:
2744            fqdn = socket.getfqdn(rawHost)
2745            if fqdn != rawHost:
2746                host.append(fqdn)
2747        except OSError:
2748            pass
2749        # make a check value list from the registry entry: replace the
2750        # '<local>' string by the localhost entry and the corresponding
2751        # canonical entry.
2752        proxyOverride = proxyOverride.split(';')
2753        # now check if we match one of the registry values.
2754        for test in proxyOverride:
2755            if test == '<local>':
2756                if '.' not in rawHost:
2757                    return 1
2758            test = test.replace(".", r"\.")     # mask dots
2759            test = test.replace("*", r".*")     # change glob sequence
2760            test = test.replace("?", r".")      # change glob char
2761            for val in host:
2762                if re.match(test, val, re.I):
2763                    return 1
2764        return 0
2765
2766    def proxy_bypass(host):
2767        """Return True, if host should be bypassed.
2768
2769        Checks proxy settings gathered from the environment, if specified,
2770        or the registry.
2771
2772        """
2773        proxies = getproxies_environment()
2774        if proxies:
2775            return proxy_bypass_environment(host, proxies)
2776        else:
2777            return proxy_bypass_registry(host)
2778
else:
    # By default use environment variables
    # (no platform-specific proxy store exists here, so both the proxy map
    # and the bypass test come straight from the <scheme>_proxy / no_proxy
    # environment variables).
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2783