1"""
2Ported using Python-Future from the Python 3.3 standard library.
3
4An extensible library for opening URLs using a variety of protocols
5
6The simplest way to use this module is to call the urlopen function,
7which accepts a string containing a URL or a Request object (described
8below).  It opens the URL and returns the results as file-like
9object; the returned object has some extra methods described below.
10
11The OpenerDirector manages a collection of Handler objects that do
12all the actual work.  Each Handler implements a particular protocol or
13option.  The OpenerDirector is a composite object that invokes the
14Handlers needed to open the requested URL.  For example, the
15HTTPHandler performs HTTP GET and POST requests and deals with
16non-error returns.  The HTTPRedirectHandler automatically deals with
17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
18deals with digest authentication.
19
20urlopen(url, data=None) -- Basic usage is the same as original
21urllib.  pass the url and optionally data to post to an HTTP URL, and
22get a file-like object back.  One difference is that you can also pass
23a Request instance instead of URL.  Raises a URLError (subclass of
24IOError); for HTTP errors, raises an HTTPError, which can also be
25treated as a valid response.
26
27build_opener -- Function that creates a new OpenerDirector instance.
28Will install the default handlers.  Accepts one or more Handlers as
29arguments, either instances or Handler classes that it will
30instantiate.  If one of the argument is a subclass of the default
31handler, the argument will be installed instead of the default.
32
33install_opener -- Installs a new opener as the default opener.
34
35objects of interest:
36
37OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
38the Handler classes, while dealing with requests and responses.
39
40Request -- An object that encapsulates the state of a request.  The
41state can be as simple as the URL.  It can also include extra HTTP
42headers, e.g. a User-Agent.
43
44BaseHandler --
45
46internals:
47BaseHandler and parent
48_call_chain conventions
49
50Example usage:
51
52import urllib.request
53
54# set up authentication info
55authinfo = urllib.request.HTTPBasicAuthHandler()
56authinfo.add_password(realm='PDQ Application',
57                      uri='https://mahler:8092/site-updates.py',
58                      user='klem',
59                      passwd='geheim$parole')
60
61proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
62
63# build a new opener that adds authentication and caching FTP handlers
64opener = urllib.request.build_opener(proxy_support, authinfo,
65                                     urllib.request.CacheFTPHandler)
66
67# install it
68urllib.request.install_opener(opener)
69
70f = urllib.request.urlopen('http://www.python.org/')
71"""
72
73# XXX issues:
74# If an authentication error handler that tries to perform
75# authentication for some reason but fails, how should the error be
76# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
79# pass that information along to the client, too.
80# ftp errors aren't handled cleanly
81# check digest against correct (i.e. non-apache) implementation
82
83# Possible extensions:
84# complex proxies  XXX not sure what exactly was meant by this
85# abstract factory for opener
86
87from __future__ import absolute_import, division, print_function, unicode_literals
88from future.builtins import bytes, dict, filter, input, int, map, open, str
89from future.utils import PY2, PY3, raise_with_traceback
90
91import base64
92import bisect
93import hashlib
94import array
95
96from future.backports import email
97from future.backports.http import client as http_client
98from .error import URLError, HTTPError, ContentTooShortError
99from .parse import (
100    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101    splittype, splithost, splitport, splituser, splitpasswd,
102    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
103from .response import addinfourl, addclosehook
104
105import io
106import os
107import posixpath
108import re
109import socket
110import sys
111import time
112import collections
113import tempfile
114import contextlib
115import warnings
116
117# check for SSL
118try:
119    import ssl
120    # Not available in the SSL module in Py2:
121    from ssl import SSLContext
122except ImportError:
123    _have_ssl = False
124else:
125    _have_ssl = True
126
# Names exported via ``from ... import *`` -- the public API of this module,
# mirroring urllib.request in Python 3.3.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
    'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
    'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]
142
143# used in User-Agent header sent
144__version__ = sys.version[:3]
145
146_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, **_3to2kwargs):
    """Open *url* (a string or a Request object) and return a file-like
    response object.

    The keyword-only arguments ``cafile``, ``capath`` and ``cadefault``
    (emulated through ``**kwargs`` because Py2 has no keyword-only
    syntax) configure certificate verification for HTTPS.
    """
    # Extract the emulated keyword-only arguments by hand.
    cadefault = _3to2kwargs.pop('cadefault', False)
    capath = _3to2kwargs.pop('capath', None)
    cafile = _3to2kwargs.pop('cafile', None)
    global _opener
    if cafile or capath or cadefault:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a one-off opener whose HTTPS handler verifies server
        # certificates against the requested CA material.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        context.verify_mode = ssl.CERT_REQUIRED
        if cafile or capath:
            context.load_verify_locations(cafile, capath)
        else:
            context.set_default_verify_paths()
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        # First plain call: create and cache the default opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
172
def install_opener(opener):
    """Install *opener* as the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
176
# Paths of temporary files created below, so urlcleanup() can delete them.
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as source:
        headers = source.info()

        # file:// URLs with no explicit filename need no copying at all;
        # just hand back the local path and the pseudo-headers.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Open the destination: either the caller-supplied path or a
        # named temporary file that urlcleanup() can remove later.
        if filename:
            target = open(filename, 'wb')
        else:
            target = tempfile.NamedTemporaryFile(delete=False)
            filename = target.name
            _url_tempfiles.append(filename)

        with target:
            result = filename, headers
            blocksize = 1024 * 8
            size = -1
            bytes_read = 0
            blocknum = 0
            # email-style header lookup is case-insensitive.
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                # Initial call before any data arrives.
                reporthook(blocknum, blocksize, size)

            while True:
                chunk = source.read(blocksize)
                if not chunk:
                    break
                bytes_read += len(chunk)
                target.write(chunk)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, blocksize, size)

    if size >= 0 and bytes_read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (bytes_read, size), result)

    return result
240
def urlcleanup():
    """Delete temporary files created by urlretrieve() and drop the
    installed default opener (best effort; unlink errors are ignored)."""
    global _opener
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except EnvironmentError:
            # File already gone or not removable -- nothing more to do.
            pass
    del _url_tempfiles[:]
    if _opener:
        _opener = None
252
# Pattern matching a trailing ":port" suffix on a host name.
# re.ASCII exists only on Py3; plain matching behaves the same on Py2.
_cut_port_re = re.compile(r":\d+$", re.ASCII) if PY3 else re.compile(r":\d+$")
257
def request_host(request):

    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    # netloc component of the request's URL.
    host = urlparse(request.full_url)[1]
    if host == "":
        # Relative URL: fall back to the Host header, if any.
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
274
class Request(object):
    """Encapsulate the state of a single URL request.

    Holds the target URL (with any fragment split off into ``fragment``),
    an optional request body ``data``, normal and unredirected headers,
    the origin request host used for RFC 2965 cookie handling, and an
    optional explicit HTTP ``method``.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Create a request for *url*.

        data -- request body; its presence flips the default method from
            GET to POST.
        headers -- optional mapping of initial headers.  The historical
            default was a mutable ``{}``; a None sentinel is used instead
            to avoid the shared-mutable-default pitfall (behavior for all
            existing callers is unchanged).
        origin_req_host -- request-host of the origin transaction; derived
            from the URL when not supplied.
        unverifiable -- True when the user cannot authorize the request,
            e.g. an image fetched as part of a document.
        method -- explicit HTTP method overriding the GET/POST default.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        # Split the URL into scheme ("type"), host and selector (path part).
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            # An explicitly set method always wins.
            return self.method
        elif self.data is not None:
            return "POST"
        else:
            return "GET"

    def get_full_url(self):
        """Return the original URL, re-attaching the fragment if present."""
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    # Begin deprecated methods

    def add_data(self, data):
        msg = "Request.add_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        self.data = data

    def has_data(self):
        msg = "Request.has_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data is not None

    def get_data(self):
        msg = "Request.get_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data

    def get_type(self):
        msg = "Request.get_type method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.type

    def get_host(self):
        msg = "Request.get_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.host

    def get_selector(self):
        msg = "Request.get_selector method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.selector

    def is_unverifiable(self):
        msg = "Request.is_unverifiable method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.unverifiable

    def get_origin_req_host(self):
        msg = "Request.get_origin_req_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through a proxy at *host* of scheme *type*.

        https requests are tunneled (CONNECT) through the proxy, so the
        original host is remembered and the selector is left untouched.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True if set_proxy() rewrote the selector to the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        """Set a header (key is capitalized); later calls overwrite."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        """Set a header that is not copied onto redirected requests."""
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is set (normal or unredirected)."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header's value, preferring normal over unredirected."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as (name, value) pairs; normal headers take
        precedence over unredirected ones with the same name."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
395
class OpenerDirector(object):
    """Manage a chain of handlers and dispatch URL requests to them.

    Handlers register themselves via add_handler(); open() then runs the
    request through per-protocol request processors, the open chain, and
    per-protocol response processors.
    """

    def __init__(self):
        # Default headers sent with every request (e.g. User-agent).
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}        # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}       # scheme -> {code -> [handlers]}
        self.process_response = {}   # scheme -> [response processors]
        self.process_request = {}    # scheme -> [request processors]

    def add_handler(self, handler):
        """Register *handler*, routing each of its ``<proto>_<action>``
        methods into the matching dispatch table."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        # Inspect the handler's method names to discover what it handles.
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split e.g. "http_error_404" into protocol and condition parts.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "http_error_404" -> kind 404; "http_error" alone keeps
                # a string kind.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each chain sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """
        Accept a URL or a Request object and return the response.

        Python-Future: if the URL is passed as a byte-string, decode it first.
        """
        if isinstance(fullurl, bytes):
            fullurl = fullurl.decode()
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then the protocol-specific chain, and
        # finally unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handler chains; for HTTP, try the
        code-specific chain first, then http_error_default."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
539
540# XXX probably also want an abstract factory that knows when it makes
541# sense to skip a superclass in favor of a subclass and when it might
542# make sense to include both
543
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # Covers new-style classes and old-style Py2 classes alike.
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http_client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def overridden(klass):
        # A default is dropped when the caller supplies a subclass of it
        # (as a class or as an instance).
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    return True
            elif isinstance(check, klass):
                return True
        return False

    for klass in default_classes:
        if not overridden(klass):
            opener.add_handler(klass())

    # Finally add the caller's own handlers, instantiating bare classes.
    for h in handlers:
        opener.add_handler(h() if isclass(h) else h)
    return opener
581
class BaseHandler(object):
    """Common base class for protocol handlers used by OpenerDirector."""

    # Handlers are tried in ascending handler_order within each chain.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """No-op; only exists for backwards compatibility."""
        pass

    def __lt__(self, other):
        """Order handlers by handler_order for bisect insertion."""
        if not hasattr(other, "handler_order"):
            # Preserve the old behavior of having custom classes that are
            # unaware of handler_order inserted after the default ones.
            return True
        return self.handler_order < other.handler_order
599
600
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass any non-2xx response to the opener's error machinery."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, a "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted;
        # such responses pass through untouched.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
617
class HTTPDefaultErrorHandler(BaseHandler):
    # Last-resort error handler: any HTTP error response that no other
    # handler dealt with becomes an HTTPError exception (which callers
    # may still treat as a response object).
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
621
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 responses by re-issuing the request,
    with loop detection via a per-request ``redirect_dict``."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Redirect GET/HEAD for all four codes, POST only for 301/302/303.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Drop body-describing headers; the redirected request has no body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect response, guarding against redirect loops."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path:
            # Normalize an empty path (e.g. "http://host") to "/".
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # Resolve a relative redirect target against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # All four redirect codes share the same handling logic.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    # Message used when a redirect loop is detected.
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
724
725
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if r_scheme.startswith("/"):
        # URL form.  RFC 3986 (ss 3. and 3.3.) says an authority is
        # introduced by '//', after which the path is empty or starts '/'.
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        end = r_scheme.find("/", 2)
        authority = r_scheme[2:] if end == -1 else r_scheme[2:end]
    else:
        # Bare authority: no scheme present.
        scheme = None
        authority = proxy
    userinfo, hostport = splituser(authority)
    user = password = None
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
797
class ProxyHandler(BaseHandler):
    """Route requests through proxies given as a {scheme: proxy-url} map.

    For each configured scheme a ``<scheme>_open`` method is generated on
    the instance, so OpenerDirector's normal dispatch finds it.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        # With no explicit mapping, fall back to the environment
        # (http_proxy et al.) via getproxies().
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Bind url/type as lambda defaults so each generated method
            # captures its own scheme's values, not the loop variables.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy*, adding Proxy-authorization
        when credentials are embedded in the proxy URL.

        Returns None to let the normal scheme handler proceed, or re-opens
        the request when the proxy itself uses a different scheme.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honor platform/no_proxy bypass rules for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
839
840class HTTPPasswordMgr(object):
841
    def __init__(self):
        # Maps realm -> {reduced-URI-tuple: (user, password)}.
        self.passwd = {}
844
845    def add_password(self, realm, uri, user, passwd):
846        # uri could be a single URI or a sequence
847        if isinstance(uri, str):
848            uri = [uri]
849        if realm not in self.passwd:
850            self.passwd[realm] = {}
851        for default_port in True, False:
852            reduced_uri = tuple(
853                [self.reduce_uri(u, default_port) for u in uri])
854            self.passwd[realm][reduced_uri] = (user, passwd)
855
856    def find_user_password(self, realm, authuri):
857        domains = self.passwd.get(realm, {})
858        for default_port in True, False:
859            reduced_authuri = self.reduce_uri(authuri, default_port)
860            for uris, authinfo in domains.items():
861                for uri in uris:
862                    if self.is_suburi(uri, reduced_authuri):
863                        return authinfo
864        return None, None
865
866    def reduce_uri(self, uri, default_port=True):
867        """Accept authority or URI and extract only the authority and path."""
868        # note HTTP URLs do not have a userinfo component
869        parts = urlsplit(uri)
870        if parts[1]:
871            # URI
872            scheme = parts[0]
873            authority = parts[1]
874            path = parts[2] or '/'
875        else:
876            # host or host:port
877            scheme = None
878            authority = uri
879            path = '/'
880        host, port = splitport(authority)
881        if default_port and port is None and scheme is not None:
882            dport = {"http": 80,
883                     "https": 443,
884                     }.get(scheme)
885            if dport is not None:
886                authority = "%s:%d" % (host, dport)
887        return authority, path
888
889    def is_suburi(self, base, test):
890        """Check if test is below base in a URI tree
891
892        Both args must be URIs in reduced form.
893        """
894        if base == test:
895            return True
896        if base[0] != test[0]:
897            return False
898        common = posixpath.commonprefix((base[1], test[1]))
899        if len(common) == len(base[1]):
900            return True
901        return False
902
903
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the None (default) realm."""

    def find_user_password(self, realm, authuri):
        # Try the requested realm first; if nothing matched, retry with
        # the wildcard realm None.
        credentials = HTTPPasswordMgr.find_user_password(self, realm,
                                                         authuri)
        if credentials[0] is not None:
            return credentials
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
912
913
class AbstractBasicAuthHandler(object):
    """Shared implementation of HTTP Basic authentication, used by both
    the origin-server (401) and proxy (407) handlers below; subclasses
    supply the ``auth_header`` attribute naming the header to send."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # add_password is re-exported so callers can register credentials
        # directly on the handler.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        # Called by the concrete handlers after each auth round-trip ends.
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Handle a 401/407 by retrying the request with credentials.

        authreq is the name of the challenge header ('www-authenticate'
        or 'proxy-authenticate').  Returns the retried response, or None
        when no retry is possible (no challenge, no realm match, or no
        known credentials).
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                    headers, None)
        else:
            self.retried += 1

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        if response and response.code != 401:
                            # Success: clear the counter for future challenges.
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send req with an Authorization header for (realm, host);
        returns None when no credentials are known or when the same
        header was already sent and rejected."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.headers.get(self.auth_header, None) == auth:
                # Identical credentials already failed: avoid looping.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
981
982
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry 401 responses with Basic credentials for the request URL."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL identifies the protection space for Basic auth.
        response = self.http_error_auth_reqed(
            'www-authenticate', req.full_url, req, headers)
        self.reset_retry_count()
        return response
993
994
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Retry 407 responses from a proxy with Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires an authority with no userinfo
        # component.  Assume req.host has none, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support URLs carrying
        # userinfo.
        response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return response
1009
1010
# _randombytes(n) returns n cryptographically strong random bytes
# (os.urandom); used to salt the digest-auth client nonce in
# AbstractDigestAuthHandler.get_cnonce.
_randombytes = os.urandom
1013
1014
class AbstractDigestAuthHandler(object):
    """Shared implementation of HTTP Digest access authentication
    (RFC 2617) for both server (401) and proxy (407) challenges.

    Concrete subclasses supply the ``auth_header`` attribute naming the
    request header to fill in ('Authorization' or 'Proxy-Authorization').
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count/last_nonce implement the "nc" counter used when
        # qop=auth: it restarts at 1 each time the server issues a new
        # nonce (see get_authorization).
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry the request with Digest credentials, or raise HTTPError
        after too many failed attempts.  Returns None when the challenge
        is absent or cannot be answered."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # A Basic challenge is silently ignored (another handler
                # may answer it); anything else is unsupported.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Build an Authorization header from the challenge and re-send."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical header already sent and rejected: give up.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Return the Authorization header value (without the 'Digest '
        prefix) computed from the challenge dict, or None when required
        challenge fields, the algorithm, or credentials are missing."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # Maintain the nc counter: it restarts for each fresh nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            # RFC 2069 compatibility mode.
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest primitives for the challenge algorithm,
        or (None, None) when the algorithm is unsupported."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # XXX MD5-sess not handled.  Bug fix: an unrecognized
            # algorithm previously left H unbound, so KD's closure raised
            # NameError/UnboundLocalError when used; returning
            # (None, None) lets get_authorization() bail out cleanly via
            # its existing `if H is None` check.
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1154
1155
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge is scoped to the authority part of the URL.
        authority = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed(
            'www-authenticate', authority, req, headers)
        self.reset_retry_count()
        return response
1172
1173
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the protection space is the request host.
        response = self.http_error_auth_reqed(
            'proxy-authenticate', req.host, req, headers)
        self.reset_retry_count()
        return response
1185
class AbstractHTTPHandler(BaseHandler):
    """Common machinery for HTTPHandler/HTTPSHandler: request fix-up
    (do_request_) and performing the network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Add required headers (Content-type/length, Host, and the
        opener's default headers) to the request and return it.

        Raises URLError when the request has no host and TypeError when
        POST data is a str instead of bytes.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. " \
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                size = None
                try:
                    mv = memoryview(data)
                    size = len(mv) * mv.itemsize
                except TypeError:
                    ### For Python-Future:
                    # Py2.7 cannot take a memoryview of an array.array
                    # (e.g. memoryview(array.array('I', [1, 2, 3, 4]))
                    # raises a TypeError), so compute its size directly.
                    if isinstance(data, array.array):
                        size = len(data) * data.itemsize
                    else:
                        # Bug fix: the bare collections.Iterable alias was
                        # removed in Python 3.10; import from
                        # collections.abc, falling back for Python 2.
                        try:
                            from collections.abc import Iterable
                        except ImportError:  # Python 2
                            from collections import Iterable
                        if isinstance(data, Iterable):
                            raise ValueError(
                                "Content-Length should be specified "
                                "for iterable data of type %r %r" %
                                (type(data), data))
                if size is not None:
                    request.add_unredirected_header(
                            'Content-length', '%d' % size)

        sel_host = host
        if request.has_proxy():
            # For proxied requests the selector is a full URL; the Host
            # header must still name the origin server.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        # Unredirected headers take priority over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
        except socket.error as err: # timeout error
            h.close()
            raise URLError(err)
        else:
            r = h.getresponse()
            # If the server does not send us a 'Connection: close' header,
            # HTTPConnection assumes the socket should be left open. Manually
            # mark the socket to be closed when this response object goes away.
            if h.sock:
                h.sock.close()
                h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1306
1307
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs via http.client.HTTPConnection."""

    def http_open(self, req):
        # All the real work happens in the shared do_open().
        return self.do_open(http_client.HTTPConnection, req)

    # Reuse the shared header fix-up as the http_request preprocessor.
    http_request = AbstractHTTPHandler.do_request_
1314
# HTTPS support is optional: only define HTTPSHandler when the http.client
# module provides HTTPSConnection (i.e. it was built with SSL support).
if hasattr(http_client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs via http.client.HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # context/check_hostname are passed straight through to
            # HTTPSConnection; context is presumably an ssl.SSLContext and
            # check_hostname toggles certificate hostname verification —
            # TODO confirm against the http.client implementation in use.
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http_client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        # Reuse the shared header fix-up as the https_request preprocessor.
        https_request = AbstractHTTPHandler.do_request_

    # Advertise the handler only when it actually exists.
    __all__.append('HTTPSHandler')
1331
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest Set-Cookie
    headers from responses, using a (possibly shared) CookieJar."""

    def __init__(self, cookiejar=None):
        import future.backports.http.cookiejar as http_cookiejar
        self.cookiejar = (http_cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add any cookies that match this request before it is sent.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Store cookies the server handed back.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1349
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1354
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict; surrounding double quotes are stripped from values.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Robustness fix: require len(v) > 1 so an empty value ('k=')
        # no longer raises IndexError and a lone '"' is not stripped.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1364
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take ch verbatim.
            current += ch
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is consumed, not kept.
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # Unquoted comma: element boundary.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    # Flush the trailing element, if any.
    if current:
        items.append(current)

    return [item.strip() for item in items]
1407
class FileHandler(BaseHandler):
    """Handle file:// URLs; only local files are served."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL whose host is empty, 'localhost', or an
        alias of the local machine; reject anything else."""
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Bug fix: this previously read `not req.host is
            # self.get_names()`, an identity comparison between a str and
            # a tuple that is always true after negation, so every
            # non-'localhost' host was rejected even when it names this
            # machine.  Use a membership test as CPython does.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the (cached) tuple of IP addresses considered local."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Name resolution failed: fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Map the URL to a filesystem path and return an addinfourl
        with synthesized Content-type/length/Last-modified headers."""
        import future.backports.email.utils as email_utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
1459
1460def _safe_gethostbyname(host):
1461    try:
1462        return socket.gethostbyname(host)
1463    except socket.gaierror:
1464        return None
1465
class FTPHandler(BaseHandler):
    """Handle ftp:// URLs, opening a fresh connection per request."""

    def ftp_open(self, req):
        """Retrieve a file or directory listing over FTP and return it
        as an addinfourl with synthesized headers."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split ";attr=value" parts off the path, then unquote each
        # path segment; the last segment is the file name (may be '').
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary image) transfer for files, 'D' (directory
            # listing) otherwise; a ";type=..." attribute may override.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            # Re-raise any ftplib failure as a URLError, preserving the
            # original traceback.
            exc = URLError('ftp error: %r' % exp)
            raise_with_traceback(exc)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1523
class CacheFTPHandler(FTPHandler):
    """FTPHandler variant that keeps recently-used connections alive in
    a small expiring cache.

    XXX would be nice to have pluggable cache strategies
    XXX this stuff is definitely not thread safe
    """

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # seconds a connection may stay cached
        self.max_conns = 16  # hard cap on cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one
        and refreshing its expiry as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # Refresh the expiry whether the connection is new or reused.
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce max_conns."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # Robustness fix: min() of an empty sequence raises ValueError
        # once every cached connection has expired; fall back to 0 so
        # the next call simply re-scans.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1576
1577
# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems: Windows URL<->path conversion needs
# drive-letter handling, provided by nturl2path; everywhere else a
# simple quote/unquote suffices.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
1595
1596# This really consists of two pieces:
1597# (1) a class which handles opening of all sorts of URLs
1598#     (plus assorted utilities etc.)
1599# (2) a set of functions for parsing URLs
1600# XXX Should these be separated out into different modules?
1601
1602
# Module-level FTP connection cache, shared by default between
# URLopener instances (assigned to self.ftpcache in URLopener.__init__).
ftpcache = {}
1604class URLopener(object):
1605    """Class to open URLs.
1606    This is a class rather than just a subroutine because we may need
1607    more than one set of global protocol-specific options.
1608    Note -- this is a base class for those who don't want the
1609    automatic handling of errors type 302 (relocated) and 401
1610    (authorization needed)."""
1611
1612    __tempfiles = None
1613
1614    version = "Python-urllib/%s" % __version__
1615
1616    # Constructor
1617    def __init__(self, proxies=None, **x509):
1618        msg = "%(class)s style of invoking requests is deprecated. " \
1619              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
1620        warnings.warn(msg, DeprecationWarning, stacklevel=3)
1621        if proxies is None:
1622            proxies = getproxies()
1623        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
1624        self.proxies = proxies
1625        self.key_file = x509.get('key_file')
1626        self.cert_file = x509.get('cert_file')
1627        self.addheaders = [('User-Agent', self.version)]
1628        self.__tempfiles = []
1629        self.__unlink = os.unlink # See cleanup()
1630        self.tempcache = None
1631        # Undocumented feature: if you assign {} to tempcache,
1632        # it is used to cache files retrieved with
1633        # self.retrieve().  This is not enabled by default
1634        # since it does not work for changing documents (and I
1635        # haven't got the logic to check expiration headers
1636        # yet).
1637        self.ftpcache = ftpcache
1638        # Undocumented feature: you can use a different
1639        # ftp cache by assigning to the .ftpcache member;
1640        # in case you want logically independent URL openers
1641        # XXX This is not threadsafe.  Bah.
1642
1643    def __del__(self):
1644        self.close()
1645
1646    def close(self):
1647        self.cleanup()
1648
1649    def cleanup(self):
1650        # This code sometimes runs when the rest of this module
1651        # has already been deleted, so it can't use any globals
1652        # or import anything.
1653        if self.__tempfiles:
1654            for file in self.__tempfiles:
1655                try:
1656                    self.__unlink(file)
1657                except OSError:
1658                    pass
1659            del self.__tempfiles[:]
1660        if self.tempcache:
1661            self.tempcache.clear()
1662
1663    def addheader(self, *args):
1664        """Add a header to be used by the HTTP interface only
1665        e.g. u.addheader('Accept', 'sound/basic')"""
1666        self.addheaders.append(args)
1667
1668    # External interface
1669    def open(self, fullurl, data=None):
1670        """Use URLopener().open(file) instead of open(file, 'r')."""
1671        fullurl = unwrap(to_bytes(fullurl))
1672        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
1673        if self.tempcache and fullurl in self.tempcache:
1674            filename, headers = self.tempcache[fullurl]
1675            fp = open(filename, 'rb')
1676            return addinfourl(fp, headers, fullurl)
1677        urltype, url = splittype(fullurl)
1678        if not urltype:
1679            urltype = 'file'
1680        if urltype in self.proxies:
1681            proxy = self.proxies[urltype]
1682            urltype, proxyhost = splittype(proxy)
1683            host, selector = splithost(proxyhost)
1684            url = (host, fullurl) # Signal special case to open_*()
1685        else:
1686            proxy = None
1687        name = 'open_' + urltype
1688        self.type = urltype
1689        name = name.replace('-', '_')
1690        if not hasattr(self, name):
1691            if proxy:
1692                return self.open_unknown_proxy(proxy, fullurl, data)
1693            else:
1694                return self.open_unknown(fullurl, data)
1695        try:
1696            if data is None:
1697                return getattr(self, name)(url)
1698            else:
1699                return getattr(self, name)(url, data)
1700        except HTTPError:
1701            raise
1702        except socket.error as msg:
1703            raise_with_traceback(IOError('socket error', msg))
1704
1705    def open_unknown(self, fullurl, data=None):
1706        """Overridable interface to open unknown URL type."""
1707        type, url = splittype(fullurl)
1708        raise IOError('url error', 'unknown url type', type)
1709
1710    def open_unknown_proxy(self, proxy, fullurl, data=None):
1711        """Overridable interface to open unknown URL type."""
1712        type, url = splittype(fullurl)
1713        raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1714
1715    # External interface
1716    def retrieve(self, url, filename=None, reporthook=None, data=None):
1717        """retrieve(url) returns (filename, headers) for a local object
1718        or (tempfilename, headers) for a remote object."""
1719        url = unwrap(to_bytes(url))
1720        if self.tempcache and url in self.tempcache:
1721            return self.tempcache[url]
1722        type, url1 = splittype(url)
1723        if filename is None and (not type or type == 'file'):
1724            try:
1725                fp = self.open_local_file(url1)
1726                hdrs = fp.info()
1727                fp.close()
1728                return url2pathname(splithost(url1)[1]), hdrs
1729            except IOError as msg:
1730                pass
1731        fp = self.open(url, data)
1732        try:
1733            headers = fp.info()
1734            if filename:
1735                tfp = open(filename, 'wb')
1736            else:
1737                import tempfile
1738                garbage, path = splittype(url)
1739                garbage, path = splithost(path or "")
1740                path, garbage = splitquery(path or "")
1741                path, garbage = splitattr(path or "")
1742                suffix = os.path.splitext(path)[1]
1743                (fd, filename) = tempfile.mkstemp(suffix)
1744                self.__tempfiles.append(filename)
1745                tfp = os.fdopen(fd, 'wb')
1746            try:
1747                result = filename, headers
1748                if self.tempcache is not None:
1749                    self.tempcache[url] = result
1750                bs = 1024*8
1751                size = -1
1752                read = 0
1753                blocknum = 0
1754                if "content-length" in headers:
1755                    size = int(headers["Content-Length"])
1756                if reporthook:
1757                    reporthook(blocknum, bs, size)
1758                while 1:
1759                    block = fp.read(bs)
1760                    if not block:
1761                        break
1762                    read += len(block)
1763                    tfp.write(block)
1764                    blocknum += 1
1765                    if reporthook:
1766                        reporthook(blocknum, bs, size)
1767            finally:
1768                tfp.close()
1769        finally:
1770            fp.close()
1771
1772        # raise exception if actual size does not match content-length header
1773        if size >= 0 and read < size:
1774            raise ContentTooShortError(
1775                "retrieval incomplete: got only %i out of %i bytes"
1776                % (read, size), result)
1777
1778        return result
1779
1780    # Each method named open_<type> knows how to open that type of URL
1781
1782    def _open_generic_http(self, connection_factory, url, data):
1783        """Make an HTTP connection using connection_class.
1784
1785        This is an internal method that should be called from
1786        open_http() or open_https().
1787
1788        Arguments:
1789        - connection_factory should take a host name and return an
1790          HTTPConnection instance.
1791        - url is the url to retrieval or a host, relative-path pair.
1792        - data is payload for a POST request or None.
1793        """
1794
1795        user_passwd = None
1796        proxy_passwd= None
1797        if isinstance(url, str):
1798            host, selector = splithost(url)
1799            if host:
1800                user_passwd, host = splituser(host)
1801                host = unquote(host)
1802            realhost = host
1803        else:
1804            host, selector = url
1805            # check whether the proxy contains authorization information
1806            proxy_passwd, host = splituser(host)
1807            # now we proceed with the url we want to obtain
1808            urltype, rest = splittype(selector)
1809            url = rest
1810            user_passwd = None
1811            if urltype.lower() != 'http':
1812                realhost = None
1813            else:
1814                realhost, rest = splithost(rest)
1815                if realhost:
1816                    user_passwd, realhost = splituser(realhost)
1817                if user_passwd:
1818                    selector = "%s://%s%s" % (urltype, realhost, rest)
1819                if proxy_bypass(realhost):
1820                    host = realhost
1821
1822        if not host: raise IOError('http error', 'no host given')
1823
1824        if proxy_passwd:
1825            proxy_passwd = unquote(proxy_passwd)
1826            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
1827        else:
1828            proxy_auth = None
1829
1830        if user_passwd:
1831            user_passwd = unquote(user_passwd)
1832            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
1833        else:
1834            auth = None
1835        http_conn = connection_factory(host)
1836        headers = {}
1837        if proxy_auth:
1838            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1839        if auth:
1840            headers["Authorization"] =  "Basic %s" % auth
1841        if realhost:
1842            headers["Host"] = realhost
1843
1844        # Add Connection:close as we don't support persistent connections yet.
1845        # This helps in closing the socket and avoiding ResourceWarning
1846
1847        headers["Connection"] = "close"
1848
1849        for header, value in self.addheaders:
1850            headers[header] = value
1851
1852        if data is not None:
1853            headers["Content-Type"] = "application/x-www-form-urlencoded"
1854            http_conn.request("POST", selector, data, headers)
1855        else:
1856            http_conn.request("GET", selector, headers=headers)
1857
1858        try:
1859            response = http_conn.getresponse()
1860        except http_client.BadStatusLine:
1861            # something went wrong with the HTTP status line
1862            raise URLError("http protocol error: bad status line")
1863
1864        # According to RFC 2616, "2xx" code indicates that the client's
1865        # request was successfully received, understood, and accepted.
1866        if 200 <= response.status < 300:
1867            return addinfourl(response, response.msg, "http:" + url,
1868                              response.status)
1869        else:
1870            return self.http_error(
1871                url, response.fp,
1872                response.status, response.reason, response.msg, data)
1873
1874    def open_http(self, url, data=None):
1875        """Use HTTP protocol."""
1876        return self._open_generic_http(http_client.HTTPConnection, url, data)
1877
1878    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1879        """Handle http errors.
1880
1881        Derived class can override this, or provide specific handlers
1882        named http_error_DDD where DDD is the 3-digit error code."""
1883        # First check if there's a specific handler for this error
1884        name = 'http_error_%d' % errcode
1885        if hasattr(self, name):
1886            method = getattr(self, name)
1887            if data is None:
1888                result = method(url, fp, errcode, errmsg, headers)
1889            else:
1890                result = method(url, fp, errcode, errmsg, headers, data)
1891            if result: return result
1892        return self.http_error_default(url, fp, errcode, errmsg, headers)
1893
1894    def http_error_default(self, url, fp, errcode, errmsg, headers):
1895        """Default error handler: close the connection and raise IOError."""
1896        fp.close()
1897        raise HTTPError(url, errcode, errmsg, headers, None)
1898
1899    if _have_ssl:
1900        def _https_connection(self, host):
1901            return http_client.HTTPSConnection(host,
1902                                           key_file=self.key_file,
1903                                           cert_file=self.cert_file)
1904
1905        def open_https(self, url, data=None):
1906            """Use HTTPS protocol."""
1907            return self._open_generic_http(self._https_connection, url, data)
1908
1909    def open_file(self, url):
1910        """Use local file or FTP depending on form of URL."""
1911        if not isinstance(url, str):
1912            raise URLError('file error: proxy support for file protocol currently not implemented')
1913        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1914            raise ValueError("file:// scheme is supported only on localhost")
1915        else:
1916            return self.open_local_file(url)
1917
1918    def open_local_file(self, url):
1919        """Use local file."""
1920        import future.backports.email.utils as email_utils
1921        import mimetypes
1922        host, file = splithost(url)
1923        localname = url2pathname(file)
1924        try:
1925            stats = os.stat(localname)
1926        except OSError as e:
1927            raise URLError(e.strerror, e.filename)
1928        size = stats.st_size
1929        modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
1930        mtype = mimetypes.guess_type(url)[0]
1931        headers = email.message_from_string(
1932            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1933            (mtype or 'text/plain', size, modified))
1934        if not host:
1935            urlfile = file
1936            if file[:1] == '/':
1937                urlfile = 'file://' + file
1938            return addinfourl(open(localname, 'rb'), headers, urlfile)
1939        host, port = splitport(host)
1940        if (not port
1941           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
1942            urlfile = file
1943            if file[:1] == '/':
1944                urlfile = 'file://' + file
1945            elif file[:2] == './':
1946                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
1947            return addinfourl(open(localname, 'rb'), headers, urlfile)
1948        raise URLError('local file error: not on local host')
1949
1950    def open_ftp(self, url):
1951        """Use FTP protocol."""
1952        if not isinstance(url, str):
1953            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
1954        import mimetypes
1955        host, path = splithost(url)
1956        if not host: raise URLError('ftp error: no host given')
1957        host, port = splitport(host)
1958        user, host = splituser(host)
1959        if user: user, passwd = splitpasswd(user)
1960        else: passwd = None
1961        host = unquote(host)
1962        user = unquote(user or '')
1963        passwd = unquote(passwd or '')
1964        host = socket.gethostbyname(host)
1965        if not port:
1966            import ftplib
1967            port = ftplib.FTP_PORT
1968        else:
1969            port = int(port)
1970        path, attrs = splitattr(path)
1971        path = unquote(path)
1972        dirs = path.split('/')
1973        dirs, file = dirs[:-1], dirs[-1]
1974        if dirs and not dirs[0]: dirs = dirs[1:]
1975        if dirs and not dirs[0]: dirs[0] = '/'
1976        key = user, host, port, '/'.join(dirs)
1977        # XXX thread unsafe!
1978        if len(self.ftpcache) > MAXFTPCACHE:
1979            # Prune the cache, rather arbitrarily
1980            for k in self.ftpcache.keys():
1981                if k != key:
1982                    v = self.ftpcache[k]
1983                    del self.ftpcache[k]
1984                    v.close()
1985        try:
1986            if key not in self.ftpcache:
1987                self.ftpcache[key] = \
1988                    ftpwrapper(user, passwd, host, port, dirs)
1989            if not file: type = 'D'
1990            else: type = 'I'
1991            for attr in attrs:
1992                attr, value = splitvalue(attr)
1993                if attr.lower() == 'type' and \
1994                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
1995                    type = value.upper()
1996            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
1997            mtype = mimetypes.guess_type("ftp:" + url)[0]
1998            headers = ""
1999            if mtype:
2000                headers += "Content-Type: %s\n" % mtype
2001            if retrlen is not None and retrlen >= 0:
2002                headers += "Content-Length: %d\n" % retrlen
2003            headers = email.message_from_string(headers)
2004            return addinfourl(fp, headers, "ftp:" + url)
2005        except ftperrors() as exp:
2006            raise_with_traceback(URLError('ftp error %r' % exp))
2007
2008    def open_data(self, url, data=None):
2009        """Use "data" URL."""
2010        if not isinstance(url, str):
2011            raise URLError('data error: proxy support for data protocol currently not implemented')
2012        # ignore POSTed data
2013        #
2014        # syntax of data URLs:
2015        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
2016        # mediatype := [ type "/" subtype ] *( ";" parameter )
2017        # data      := *urlchar
2018        # parameter := attribute "=" value
2019        try:
2020            [type, data] = url.split(',', 1)
2021        except ValueError:
2022            raise IOError('data error', 'bad data URL')
2023        if not type:
2024            type = 'text/plain;charset=US-ASCII'
2025        semi = type.rfind(';')
2026        if semi >= 0 and '=' not in type[semi:]:
2027            encoding = type[semi+1:]
2028            type = type[:semi]
2029        else:
2030            encoding = ''
2031        msg = []
2032        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
2033                                            time.gmtime(time.time())))
2034        msg.append('Content-type: %s' % type)
2035        if encoding == 'base64':
2036            # XXX is this encoding/decoding ok?
2037            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
2038        else:
2039            data = unquote(data)
2040        msg.append('Content-Length: %d' % len(data))
2041        msg.append('')
2042        msg.append(data)
2043        msg = '\n'.join(msg)
2044        headers = email.message_from_string(msg)
2045        f = io.StringIO(msg)
2046        #f.fileno = None     # needed for addinfourl
2047        return addinfourl(f, headers, url)
2048
2049
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        """Initialize, adding an auth cache and redirect-loop counters."""
        URLopener.__init__(self, *args, **kwargs)
        # Maps 'realm@host' -> (user, passwd) from earlier prompts.
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        # Guard against redirect loops: after maxtries hops give up and
        # report a synthesized 500.
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect to the URL in the Location (or URI) header.

        Returns None when no redirect target is present."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Note: URLopener.http_error_default raises HTTPError, so each of
        # the calls below terminates this handler.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # depending on the scheme recorded by open().
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but for the proxy's challenge;
        # the base-class http_error_default calls below raise HTTPError.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding credentials in the http proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any previous user:pass@ prefix from the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding credentials in the https proxy URL."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open url with user:pass@ credentials embedded in the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open https url with user:pass@ credentials in the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for host/realm, consulting the cache.

        A truthy clear_cache drops the cached entry and re-prompts."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2258
2259
2260# Utility functions
2261
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is resolved once and memoized in a module-level cache.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2269
_thishost = None
def thishost():
    """Return the tuple of IP addresses of the current host.

    Falls back to the addresses of 'localhost' when the machine's own
    hostname cannot be resolved.  The result is memoized.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2280
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on first use; the result is memoized.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2289
_noheaders = None
def noheaders():
    """Return an empty email Message object (memoized, shared instance)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2297
2298
2299# Utility classes
2300
class ftpwrapper(object):
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Store connection parameters and open the FTP session.

        dirs -- sequence of path components to cwd into after login.
        persistent -- when False, the connection is closed as soon as
            the last outstanding transfer file is closed.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of transfer files handed out and not yet closed.
        self.refcount = 0
        self.keepalive = persistent
        self.init()

    def init(self):
        """(Re)connect, log in, and cwd to the target directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Retrieve a file ('I'/'A' type) or a directory listing ('D').

        Returns (fileobj, length) where length may be None."""
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Connection probably dropped while cached; reconnect and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file"; fall through to try a
                # directory listing instead.  Anything else is fatal.
                if str(reason)[:3] != '550':
                    raise_with_traceback(URLError('ftp error: %r' % reason))
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        ### Was:
                        # raise URLError('ftp error: %r' % reason) from reason
                        exc = URLError('ftp error: %r' % reason)
                        exc.__cause__ = reason
                        raise exc
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object,
        # decrementing refcount and possibly closing the connection.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        """Mark the current transfer as finished."""
        self.busy = 0

    def close(self):
        """Request shutdown; deferred until outstanding files are closed."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        """Hook called when a handed-out transfer file is closed."""
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Actually close the underlying FTP connection, ignoring errors."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2393
2394# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    env_proxies = {}
    for var, url in os.environ.items():
        var = var.lower()
        # Only non-empty values of variables ending in '_proxy' count.
        if url and var.endswith('_proxy'):
            env_proxies[var[:-6]] = url
    return env_proxies
2410
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # A lone '*' means: always bypass the proxy.
    if no_proxy == '*':
        return 1
    # Drop any :port so entries match the bare host name too.
    hostonly, port = splitport(host)
    # Bypass if the host (with or without port) ends with any listed suffix.
    for suffix in (entry.strip() for entry in no_proxy.split(',')):
        if suffix and (hostonly.endswith(suffix) or host.endswith(suffix)):
            return 1
    # No entry matched: use the proxy.
    return 0
2430
2431
2432# This code tests an OSX specific data structure but is testable on all
2433# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    # Strip any :port suffix; exception entries match the bare host name.
    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Convert a (possibly partial) dotted-quad string to a 32-bit int.
        # Missing trailing components count as 0, e.g. '10.1' -> 10.1.0.0.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Resolved lazily: only the first numeric exception triggers a DNS lookup.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry: an IP address or network, optional /prefix mask.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    # Host doesn't resolve; no numeric entry can match it.
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit mask: each written component contributes 8
                # significant bits (e.g. '10.1' is treated as 10.1.0.0/16).
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])
            # Turn the prefix length into a right-shift that drops host bits.
            mask = 32 - mask

            # Bypass if host and entry agree on all network bits.
            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            # Non-numeric entry: shell-style wildcard match on the full host.
            return True

    return False
2492
2493
if sys.platform == 'darwin':
    # On OS X the proxy configuration lives in the SystemConfiguration
    # framework, exposed via the _scproxy extension module.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if the system configuration says to bypass proxies
        for *host*."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True if *host* should be accessed without a proxy.

        Environment variables, when set, take precedence over the
        system configuration.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment settings win over the system configuration.
        return getproxies_environment() or getproxies_macosx_sysconf()
2519
2520
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings, e.g. "http=host:port;ftp=host:port"
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            # No scheme on the address: reuse the protocol name.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        # Bare host:port value: expand it for each scheme.
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Return 1 if the registry's ProxyOverride list says to bypass the
        # proxy for *host*, else 0.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for plain (dotless) host names.
                if '.' not in rawHost:
                    return 1
            # Translate the shell-style glob entry into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2642