1"""
2Ported using Python-Future from the Python 3.3 standard library.
3
4An extensible library for opening URLs using a variety of protocols
5
6The simplest way to use this module is to call the urlopen function,
7which accepts a string containing a URL or a Request object (described
8below).  It opens the URL and returns the results as file-like
9object; the returned object has some extra methods described below.
10
11The OpenerDirector manages a collection of Handler objects that do
12all the actual work.  Each Handler implements a particular protocol or
13option.  The OpenerDirector is a composite object that invokes the
14Handlers needed to open the requested URL.  For example, the
15HTTPHandler performs HTTP GET and POST requests and deals with
16non-error returns.  The HTTPRedirectHandler automatically deals with
17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
18deals with digest authentication.
19
20urlopen(url, data=None) -- Basic usage is the same as original
21urllib.  pass the url and optionally data to post to an HTTP URL, and
22get a file-like object back.  One difference is that you can also pass
23a Request instance instead of URL.  Raises a URLError (subclass of
24IOError); for HTTP errors, raises an HTTPError, which can also be
25treated as a valid response.
26
27build_opener -- Function that creates a new OpenerDirector instance.
28Will install the default handlers.  Accepts one or more Handlers as
29arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, it will be installed instead of that default.
32
33install_opener -- Installs a new opener as the default opener.
34
35objects of interest:
36
37OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
38the Handler classes, while dealing with requests and responses.
39
40Request -- An object that encapsulates the state of a request.  The
41state can be as simple as the URL.  It can also include extra HTTP
42headers, e.g. a User-Agent.
43
44BaseHandler --
45
46internals:
47BaseHandler and parent
48_call_chain conventions
49
50Example usage:
51
52import urllib.request
53
54# set up authentication info
55authinfo = urllib.request.HTTPBasicAuthHandler()
56authinfo.add_password(realm='PDQ Application',
57                      uri='https://mahler:8092/site-updates.py',
58                      user='klem',
59                      passwd='geheim$parole')
60
61proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
62
63# build a new opener that adds authentication and caching FTP handlers
64opener = urllib.request.build_opener(proxy_support, authinfo,
65                                     urllib.request.CacheFTPHandler)
66
67# install it
68urllib.request.install_opener(opener)
69
70f = urllib.request.urlopen('http://www.python.org/')
71"""
72
73# XXX issues:
74# If an authentication error handler that tries to perform
75# authentication for some reason but fails, how should the error be
76# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
79# pass that information along to the client, too.
80# ftp errors aren't handled cleanly
81# check digest against correct (i.e. non-apache) implementation
82
83# Possible extensions:
84# complex proxies  XXX not sure what exactly was meant by this
85# abstract factory for opener
86
87from __future__ import absolute_import, division, print_function, unicode_literals
88from future.builtins import bytes, dict, filter, input, int, map, open, str
89from future.utils import PY2, PY3, raise_with_traceback
90
91import base64
92import bisect
93import hashlib
94import array
95
96from future.backports import email
97from future.backports.http import client as http_client
98from .error import URLError, HTTPError, ContentTooShortError
99from .parse import (
100    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101    splittype, splithost, splitport, splituser, splitpasswd,
102    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
103from .response import addinfourl, addclosehook
104
105import io
106import os
107import posixpath
108import re
109import socket
110import sys
111import time
112import tempfile
113import contextlib
114import warnings
115
116from future.utils import PY2
117
118if PY2:
119    from collections import Iterable
120else:
121    from collections.abc import Iterable
122
123# check for SSL
124try:
125    import ssl
126    # Not available in the SSL module in Py2:
127    from ssl import SSLContext
128except ImportError:
129    _have_ssl = False
130else:
131    _have_ssl = True
132
# Names exported by ``from ... import *``.
# NOTE(review): HTTPSHandler is not listed here -- presumably because it
# only exists when SSL support is available; confirm against the rest of
# the module before relying on that.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
    'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
    'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]
148
# Version string used in the default User-Agent header.  Built from
# sys.version_info because slicing sys.version breaks for two-digit
# minor versions (Python 3.10 would become "3.1").
__version__ = '%d.%d' % sys.version_info[:2]
151
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, **_3to2kwargs):
    """Open *url* (a string or a Request object) and return a file-like
    response object.

    ``cafile``, ``capath`` and ``cadefault`` are keyword-only arguments
    (emulated via **kwargs for Py2 compatibility) selecting certificate
    verification for https URLs.  When any of them is set, a one-off
    opener with a verifying HTTPS handler is used; otherwise the cached
    module-level opener is (created and) reused.

    Raises TypeError for unknown keyword arguments, ValueError when SSL
    verification is requested but the ssl module is unavailable.
    """
    # Emulated keyword-only arguments (the file must stay Py2-compatible,
    # so real keyword-only syntax is not available).
    cadefault = _3to2kwargs.pop('cadefault', False)
    capath = _3to2kwargs.pop('capath', None)
    cafile = _3to2kwargs.pop('cafile', None)
    if _3to2kwargs:
        # Previously any unknown keyword argument was silently ignored,
        # hiding caller typos; mirror the real Py3 signature and fail.
        raise TypeError('urlopen() got unexpected keyword arguments %r'
                        % sorted(_3to2kwargs))
    global _opener
    if cafile or capath or cadefault:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        context.verify_mode = ssl.CERT_REQUIRED
        if cafile or capath:
            context.load_verify_locations(cafile, capath)
        else:
            context.set_default_verify_paths()
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily build and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
178
def install_opener(opener):
    """Set *opener* as the process-wide default used by urlopen()."""
    global _opener
    _opener = opener
182
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Download *url* and store the body on disk.

    If *filename* is given it names the destination file; otherwise a
    named temporary file is created (and remembered so urlcleanup()
    can delete it later).  *reporthook*, if given, is called with
    (block number, block size, total size) before the first read and
    after every block.  *data* is passed straight to urlopen() (e.g.
    URL-encoded POST data).

    For "file:" URLs with no explicit *filename* no copy is made; the
    local path itself is returned.

    Returns a (filename, headers) tuple.  Raises ContentTooShortError
    when fewer bytes arrive than Content-Length promised.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as source:
        headers = source.info()

        # "file:" URL and no destination requested: hand back the
        # local path itself, no copying needed.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        if filename:
            out_file = open(filename, 'wb')
        else:
            # Anonymous destination; remember it for urlcleanup().
            out_file = tempfile.NamedTemporaryFile(delete=False)
            filename = out_file.name
            _url_tempfiles.append(filename)

        with out_file:
            result = filename, headers
            chunk_size = 1024*8
            expected_size = -1
            bytes_copied = 0
            block_index = 0
            if "content-length" in headers:
                expected_size = int(headers["Content-Length"])

            if reporthook:
                reporthook(block_index, chunk_size, expected_size)

            while True:
                chunk = source.read(chunk_size)
                if not chunk:
                    break
                bytes_copied += len(chunk)
                out_file.write(chunk)
                block_index += 1
                if reporthook:
                    reporthook(block_index, chunk_size, expected_size)

    if 0 <= expected_size and bytes_copied < expected_size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (bytes_copied, expected_size), result)

    return result
246
def urlcleanup():
    """Delete temp files made by urlretrieve() and drop the cached
    default opener (it will be rebuilt on the next urlopen call)."""
    for temp_file in _url_tempfiles:
        # Best effort: the file may already be gone or be undeletable.
        try:
            os.unlink(temp_file)
        except EnvironmentError:
            pass

    # Empty the list in place so aliases of it observe the reset.
    del _url_tempfiles[:]
    global _opener
    if _opener:
        _opener = None
258
# Pre-compiled pattern used by request_host() to strip a trailing
# ":port" from a host string.  re.ASCII (Py3-only) restricts \d to
# ASCII digits, matching the byte-oriented Py2 behaviour.
if PY3:
    _cut_port_re = re.compile(r":\d+$", re.ASCII)
else:
    _cut_port_re = re.compile(r":\d+$")
263
def request_host(request):
    """Return the request-host, as defined by RFC 2965.

    Variation from the RFC: the returned value is lowercased, for
    convenient comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # No netloc in the URL; fall back to the Host header.
        host = request.get_header("Host", "")

    # Strip an optional trailing ":port" before lowercasing.
    return _cut_port_re.sub("", host, 1).lower()
280
class Request(object):
    """Encapsulate the state of a single URL request.

    Stores the URL (with any fragment split off into ``self.fragment``),
    an optional request body ``data``, the headers to send, and the
    bookkeeping attributes used by the redirect and cookie machinery
    (``origin_req_host``, ``unverifiable``, ``_tunnel_host``).
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        # ``headers`` used to default to a shared mutable ``{}``; accept
        # None instead to avoid the mutable-default-argument pitfall.
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        # Split "scheme:rest", then "//host/selector"; a URL without a
        # scheme is rejected outright.
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        elif self.data is not None:
            # A request with a body defaults to POST.
            return "POST"
        else:
            return "GET"

    def get_full_url(self):
        """Return the URL, re-attaching the fragment if one was given."""
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    # Begin deprecated methods

    def add_data(self, data):
        msg = "Request.add_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        self.data = data

    def has_data(self):
        msg = "Request.has_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data is not None

    def get_data(self):
        msg = "Request.get_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data

    def get_type(self):
        msg = "Request.get_type method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.type

    def get_host(self):
        msg = "Request.get_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.host

    def get_selector(self):
        msg = "Request.get_selector method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.selector

    def is_unverifiable(self):
        msg = "Request.is_unverifiable method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.unverifiable

    def get_origin_req_host(self):
        msg = "Request.get_origin_req_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route this request through a proxy at *host*.

        https requests keep their original selector and record the real
        host in ``_tunnel_host`` so the connection can CONNECT-tunnel
        through the proxy instead of being rewritten.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True once set_proxy() has rewritten the selector."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """True if the header is present in either header store."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Look up a header, preferring the redirect-safe store."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
401
class OpenerDirector(object):
    """Manage a chain of handlers and use them to open URLs.

    Handlers register by method-name convention: ``<protocol>_open``,
    ``<protocol>_request``, ``<protocol>_response`` and
    ``<protocol>_error_<code>`` methods are discovered by add_handler()
    and dispatched in handler_order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler*'s protocol methods with this director."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "<protocol>_<condition>" at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "<protocol>_error_<kind>"; numeric kinds become ints.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each chain sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """
        Accept a URL or a Request object and return a response object.

        Python-Future: if the URL is passed as a byte-string, decode it first.
        """
        if isinstance(fullurl, bytes):
            fullurl = fullurl.decode()
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error through the <proto>_error handler chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # (renamed from ``dict``, which shadowed the builtin)
            err_map = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            err_map = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (err_map, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Fall back to the catch-all http_error_default chain.
            args = (err_map, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
545
546# XXX probably also want an abstract factory that knows when it makes
547# sense to skip a superclass in favor of a subclass and when it might
548# make sense to include both
549
def build_opener(*handlers):
    """Create an OpenerDirector from the given handlers.

    The default handlers (proxy, HTTP, FTP, file and error processing,
    plus HTTPS when the http client supports it) are installed as well,
    except that any default whose class is matched -- as an instance or
    as a subclass -- by one of *handlers* is omitted.
    """
    def isclass(obj):
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http_client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _overridden(default):
        # A default is dropped when a caller-supplied handler is an
        # instance of it or a subclass of it.
        for check in handlers:
            if isclass(check):
                if issubclass(check, default):
                    return True
            elif isinstance(check, default):
                return True
        return False

    for klass in default_classes:
        if not _overridden(klass):
            opener.add_handler(klass())

    for handler in handlers:
        opener.add_handler(handler() if isclass(handler) else handler)
    return opener
587
class BaseHandler(object):
    """Shared base class for the handlers an OpenerDirector manages."""

    # Default position in the handler chain; lower values run earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Record the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """Does nothing; kept only for backwards compatibility."""
        pass

    def __lt__(self, other):
        _missing = object()
        other_order = getattr(other, "handler_order", _missing)
        if other_order is _missing:
            # Custom classes unaware of handler_order sort after the
            # default handlers, preserving the historical ordering.
            return True
        return self.handler_order < other_order
605
606
class HTTPErrorProcessor(BaseHandler):
    """Convert non-2xx HTTP responses into error-chain dispatches."""

    # Run after every other response processor has had its turn.
    handler_order = 1000

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # Per RFC 2616, only a "2xx" code means the request was
        # successfully received, understood, and accepted; everything
        # else is routed through the protocol error chain, which may
        # recover (e.g. follow a redirect) or raise.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
623
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort error handler: raise for any unhandled HTTP error."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No more specific handler claimed this status code, so expose
        # the error response to the caller as an HTTPError exception.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
627
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirect responses.

    Loop protection: each redirected Request carries a ``redirect_dict``
    attribute mapping every visited URL to its visit count; exceeding
    ``max_repeats`` for one URL or ``max_redirections`` overall raises
    HTTPError (see http_error_302).
    """

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # GET/HEAD may follow all four codes; POST may follow 301/302/303
        # (and is converted to a body-less GET by the new Request below).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Body-describing headers must not follow the redirect: the new
        # request carries no body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Validate the redirect target, build the follow-up Request via
        redirect_request(), enforce the loop limits, and open it."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # Resolve relative redirect targets against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # 301/303/307 share the same logic; only the method/code check in
    # redirect_request() differs in effect.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
730
731
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    # Split off "scheme:" (inlined splittype, which is deprecated in the
    # stdlib urllib.parse): a scheme may not contain '/' or ':'.
    type_match = re.match('([^/:]+):', proxy)
    if type_match:
        scheme = type_match.group(1).lower()
        r_scheme = proxy[type_match.end():]
    else:
        scheme, r_scheme = None, proxy

    if not r_scheme.startswith("/"):
        # No slash after the colon: the whole string is a bare authority
        # such as "host", "host:port" or "user:pass@host:port".
        scheme = None
        authority = proxy
    else:
        # URL form: RFC 3986 requires '//' before the authority.
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'; the authority ends
        # at the first '/' after the two slashes.
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]

    # Inlined splituser: userinfo precedes the *last* '@'.
    userinfo, at_sign, hostport = authority.rpartition('@')
    if not at_sign:
        return scheme, None, None, authority
    # Inlined splitpasswd: the password follows the *first* ':'.
    user, colon, password = userinfo.partition(':')
    if not colon:
        password = None
    return scheme, user, password, hostport
803
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy_url} map."""

    # Proxies must be in front of all other handlers.
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        # Explicit validation instead of ``assert``, which disappears
        # when Python runs with -O.
        if not hasattr(proxies, 'keys'):
            raise TypeError("proxies must be a mapping")
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Bind the loop variables as lambda defaults so each
            # generated <scheme>_open method captures its own proxy URL
            # (avoids the late-binding-closure pitfall).
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy* (a URL or bare authority).

        Returns None when other handlers should continue processing the
        (possibly rewritten) request, or a response when the request was
        re-opened under the proxy's own scheme.
        """
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honour the platform proxy bypass list for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
845
class HTTPPasswordMgr(object):
    """Store (user, password) pairs indexed by realm and reduced URI."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        if isinstance(uri, str):
            uri = [uri]
        realm_map = self.passwd.setdefault(realm, {})
        # Index under both the port-defaulted and the literal authority so
        # lookups succeed whichever form the challenge URI uses.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri_group, creds in candidates.items():
                if any(self.is_suburi(u, target) for u in uri_group):
                    return creds
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI: pull scheme/authority/path out of the split result.
            scheme, authority = parts[0], parts[1]
            path = parts[2] if parts[2] else '/'
        else:
            # Bare host or host:port.
            scheme, authority, path = None, uri, '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
908
909
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        # Try the specific realm first; if nothing matched, retry with
        # the default realm (stored under key None).
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
918
919
class AbstractBasicAuthHandler(object):
    """Shared machinery for HTTP Basic auth against servers and proxies."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # password_mgr: an HTTPPasswordMgr-compatible credentials store.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Reset the failed-attempt counter (called after each auth cycle)."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry *req* with Basic credentials if *headers* demand them.

        authreq is the challenge header *name* (e.g. 'www-authenticate');
        it is immediately rebound to that header's value below.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                    headers, None)
        else:
            self.retried += 1

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    # Groups: (scheme, quote char or '', realm text).
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        # A non-401 response means the credentials worked;
                        # clear the counter for the next challenge.
                        if response and response.code != 401:
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open *req* with an Authorization header, or return None when
        no credentials are known or the same header was already sent."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # If this exact header was already sent, give up to avoid looping.
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
987
988
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full request URL serves as the credential lookup key.
        answer = self.http_error_auth_reqed('www-authenticate',
                                            req.full_url, req, headers)
        self.reset_retry_count()
        return answer
999
1000
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        answer = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return answer
1015
1016
# Return n random bytes.  Module-level alias for os.urandom; used by
# AbstractDigestAuthHandler.get_cnonce to add entropy to the client nonce.
_randombytes = os.urandom
1019
1020
class AbstractDigestAuthHandler(object):
    """Shared machinery for HTTP Digest auth against servers and proxies.

    Digest authentication is specified in RFC 2617.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd: an HTTPPasswordMgr-compatible credentials store.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count/last_nonce implement the "nc" bookkeeping required
        # for qop="auth" (RFC 2617 section 3.2.2).
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the failed-attempt counter (called after each auth cycle)."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry *req* with Digest credentials if *headers* demand them.

        auth_header is the challenge header name ('www-authenticate' or
        'proxy-authenticate').  Raises HTTPError after too many retries
        and ValueError for unsupported schemes; a Basic challenge is
        ignored so a Basic handler further down the chain can handle it.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-open *req* with an Authorization header built from the
        challenge in *auth*; return None when nothing new can be sent."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            # If this exact header was already sent, give up to avoid looping.
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value (without the
        'Digest ' prefix) from the parsed challenge *chal*, or return
        None when required fields or credentials are missing."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # Unsupported digest algorithm; nothing we can do.
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 per RFC 2617 section 3.2.2.x.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # Track the per-nonce request count ("nc") required by qop=auth.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # ncvalue/cnonce are guaranteed bound here: any truthy qop
            # other than 'auth' raised URLError above.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest callables for *algorithm*, with H None
        when the algorithm is not supported."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # Bug fix: previously H was left unbound here, so the
            # "return H, KD" below raised UnboundLocalError instead of
            # letting get_authorization's "if H is None" guard fire.
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1160
1161
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc component of the URL is the credential lookup key.
        netloc = urlparse(req.full_url)[1]
        result = self.http_error_auth_reqed('www-authenticate',
                                            netloc, req, headers)
        self.reset_retry_count()
        return result
1178
1179
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses by retrying with proxy Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # The proxy challenge is keyed by the request's host.
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            req.host, req, headers)
        self.reset_retry_count()
        return result
1191
class AbstractHTTPHandler(BaseHandler):
    """Shared implementation for the HTTP and HTTPS protocol handlers."""

    def __init__(self, debuglevel=0):
        # Debug level; forwarded by set_http_debuglevel/connection setup.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Set the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Preprocess *request*: validate POST data, fill in default headers.

        Returns the (mutated) request.  Raises URLError when no host is
        present, TypeError when POST data is str rather than bytes, and
        ValueError for iterable data without an explicit Content-Length.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. " \
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                size = None
                try:
                    ### For Python-Future:
                    if PY2 and isinstance(data, array.array):
                        # memoryviews of arrays aren't supported
                        # in Py2.7. (e.g. memoryview(array.array('I',
                        # [1, 2, 3, 4])) raises a TypeError.)
                        # So we calculate the size manually instead:
                        size = len(data) * data.itemsize
                    ###
                    else:
                        mv = memoryview(data)
                        size = len(mv) * mv.itemsize
                except TypeError:
                    # Data has no buffer interface.  For iterables the
                    # caller must supply Content-Length; other types are
                    # silently left without one.
                    if isinstance(data, Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    # Only reached when the size computation succeeded.
                    request.add_unredirected_header(
                            'Content-length', '%d' % size)

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server
            # (taken from the request selector), not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. "content-type" -> "Content-Type").
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # CONNECT tunnelling (e.g. https through an http proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
        except socket.error as err: # timeout error
            h.close()
            raise URLError(err)
        else:
            r = h.getresponse()
            # Drop the connection's reference to the socket so the
            # connection cannot be reused; the response object keeps its
            # own file object for reading the body.
            if h.sock:
                h.sock.close()
                h.sock = None


        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1312
1313
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs via http.client's HTTPConnection."""

    def http_open(self, req):
        # All transport work is delegated to the shared base-class code.
        return self.do_open(http_client.HTTPConnection, req)

    # Request preprocessing (default headers etc.) is shared as well.
    http_request = AbstractHTTPHandler.do_request_
1320
# HTTPSHandler only exists when http.client was built with SSL support.
if hasattr(http_client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs via http.client's HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # context and check_hostname are forwarded verbatim to the
            # HTTPSConnection constructor in https_open.
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http_client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1337
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        # Import is deferred to instantiation time.
        import future.backports.http.cookiejar as http_cookiejar
        self.cookiejar = (http_cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        """Add any matching stored cookies to *request* and return it."""
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Record cookies set by *response* and return it unchanged."""
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1355
class UnknownHandler(BaseHandler):
    """Last-resort handler: reject URL schemes no other handler claimed."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1360
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from each value.  An empty
    value (e.g. from 'key=') is kept as the empty string; previously it
    raised IndexError from the bare v[0]/v[-1] indexing.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices (v[:1]/v[-1:]) are safe on the empty string, unlike
        # v[0]/v[-1] indexing.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1370
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []            # characters of the element being accumulated
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: keep this
            # char literally (the backslash itself is dropped).
            buf.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma ends the current element.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # Flush the trailing element, if any.
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1413
class FileHandler(BaseHandler):
    """Open file:// URLs from the local file system."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # Bug fix: the original used "is" (identity) instead of "in"
            # (membership), comparing the host string against the tuple of
            # local names -- always False -- so every non-localhost file
            # URL was rejected even when the host named this machine.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (and cache on the class) the IP addresses that count as
        this machine for file:// purposes."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Resolution failed; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the named local file and wrap it with synthesized headers.

        Raises URLError when the file does not exist or the host part
        does not resolve to this machine.
        """
        import future.backports.email.utils as email_utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
1465
1466def _safe_gethostbyname(host):
1467    try:
1468        return socket.gethostbyname(host)
1469    except socket.gaierror:
1470        return None
1471
class FTPHandler(BaseHandler):
    """Open ftp:// URLs using a fresh (non-persistent) FTP connection."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by *req*.

        Raises URLError for a missing host, resolution failure, or any
        ftplib-level error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        # Split ";attr=value" suffixes off the path, then the path itself
        # into unquoted directory components plus a final filename.
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary) for a file, 'D' (directory listing) otherwise;
            # a ";type=..." URL attribute may override this below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            # Synthesize HTTP-style headers from what FTP tells us.
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise_with_traceback(exc)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot connection; CacheFTPHandler overrides this to reuse them.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1529
class CacheFTPHandler(FTPHandler):
    """FTP handler that reuses connections, expiring them after a delay."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}        # key -> live ftpwrapper connection
        self.timeout = {}      # key -> absolute expiry time
        self.soonest = 0       # earliest expiry seen so far
        self.delay = 60        # seconds a cached connection stays valid
        self.max_conns = 16    # hard cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set the per-connection keep-alive delay in seconds."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this destination, creating one
        (and refreshing its expiry) as needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections and enforce the size cap."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # NOTE(review): min() raises ValueError on an empty sequence --
        # confirm the cache can never be fully drained before this line.
        self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

    def clear_cache(self):
        """Close and forget every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1582
1583
# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
1587
# Helper for non-unix systems
if os.name == 'nt':
    # Windows: delegate to the stdlib nturl2path conversions, which handle
    # drive letters and backslashes.
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
1601
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Module-level FTP connection cache shared by URLopener instances by default.
ftpcache = {}
1610class URLopener(object):
1611    """Class to open URLs.
1612    This is a class rather than just a subroutine because we may need
1613    more than one set of global protocol-specific options.
1614    Note -- this is a base class for those who don't want the
1615    automatic handling of errors type 302 (relocated) and 401
1616    (authorization needed)."""
1617
1618    __tempfiles = None
1619
1620    version = "Python-urllib/%s" % __version__
1621
    # Constructor
    def __init__(self, proxies=None, **x509):
        # The whole URLopener API is deprecated in favour of the
        # OpenerDirector/urlopen machinery; warn at construction time.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 keyword args carry optional client-certificate material.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.
1648
1649    def __del__(self):
1650        self.close()
1651
1652    def close(self):
1653        self.cleanup()
1654
1655    def cleanup(self):
1656        # This code sometimes runs when the rest of this module
1657        # has already been deleted, so it can't use any globals
1658        # or import anything.
1659        if self.__tempfiles:
1660            for file in self.__tempfiles:
1661                try:
1662                    self.__unlink(file)
1663                except OSError:
1664                    pass
1665            del self.__tempfiles[:]
1666        if self.tempcache:
1667            self.tempcache.clear()
1668
1669    def addheader(self, *args):
1670        """Add a header to be used by the HTTP interface only
1671        e.g. u.addheader('Accept', 'sound/basic')"""
1672        self.addheaders.append(args)
1673
1674    # External interface
1675    def open(self, fullurl, data=None):
1676        """Use URLopener().open(file) instead of open(file, 'r')."""
1677        fullurl = unwrap(to_bytes(fullurl))
1678        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
1679        if self.tempcache and fullurl in self.tempcache:
1680            filename, headers = self.tempcache[fullurl]
1681            fp = open(filename, 'rb')
1682            return addinfourl(fp, headers, fullurl)
1683        urltype, url = splittype(fullurl)
1684        if not urltype:
1685            urltype = 'file'
1686        if urltype in self.proxies:
1687            proxy = self.proxies[urltype]
1688            urltype, proxyhost = splittype(proxy)
1689            host, selector = splithost(proxyhost)
1690            url = (host, fullurl) # Signal special case to open_*()
1691        else:
1692            proxy = None
1693        name = 'open_' + urltype
1694        self.type = urltype
1695        name = name.replace('-', '_')
1696        if not hasattr(self, name):
1697            if proxy:
1698                return self.open_unknown_proxy(proxy, fullurl, data)
1699            else:
1700                return self.open_unknown(fullurl, data)
1701        try:
1702            if data is None:
1703                return getattr(self, name)(url)
1704            else:
1705                return getattr(self, name)(url, data)
1706        except HTTPError:
1707            raise
1708        except socket.error as msg:
1709            raise_with_traceback(IOError('socket error', msg))
1710
1711    def open_unknown(self, fullurl, data=None):
1712        """Overridable interface to open unknown URL type."""
1713        type, url = splittype(fullurl)
1714        raise IOError('url error', 'unknown url type', type)
1715
1716    def open_unknown_proxy(self, proxy, fullurl, data=None):
1717        """Overridable interface to open unknown URL type."""
1718        type, url = splittype(fullurl)
1719        raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1720
1721    # External interface
1722    def retrieve(self, url, filename=None, reporthook=None, data=None):
1723        """retrieve(url) returns (filename, headers) for a local object
1724        or (tempfilename, headers) for a remote object."""
1725        url = unwrap(to_bytes(url))
1726        if self.tempcache and url in self.tempcache:
1727            return self.tempcache[url]
1728        type, url1 = splittype(url)
1729        if filename is None and (not type or type == 'file'):
1730            try:
1731                fp = self.open_local_file(url1)
1732                hdrs = fp.info()
1733                fp.close()
1734                return url2pathname(splithost(url1)[1]), hdrs
1735            except IOError as msg:
1736                pass
1737        fp = self.open(url, data)
1738        try:
1739            headers = fp.info()
1740            if filename:
1741                tfp = open(filename, 'wb')
1742            else:
1743                import tempfile
1744                garbage, path = splittype(url)
1745                garbage, path = splithost(path or "")
1746                path, garbage = splitquery(path or "")
1747                path, garbage = splitattr(path or "")
1748                suffix = os.path.splitext(path)[1]
1749                (fd, filename) = tempfile.mkstemp(suffix)
1750                self.__tempfiles.append(filename)
1751                tfp = os.fdopen(fd, 'wb')
1752            try:
1753                result = filename, headers
1754                if self.tempcache is not None:
1755                    self.tempcache[url] = result
1756                bs = 1024*8
1757                size = -1
1758                read = 0
1759                blocknum = 0
1760                if "content-length" in headers:
1761                    size = int(headers["Content-Length"])
1762                if reporthook:
1763                    reporthook(blocknum, bs, size)
1764                while 1:
1765                    block = fp.read(bs)
1766                    if not block:
1767                        break
1768                    read += len(block)
1769                    tfp.write(block)
1770                    blocknum += 1
1771                    if reporthook:
1772                        reporthook(blocknum, bs, size)
1773            finally:
1774                tfp.close()
1775        finally:
1776            fp.close()
1777
1778        # raise exception if actual size does not match content-length header
1779        if size >= 0 and read < size:
1780            raise ContentTooShortError(
1781                "retrieval incomplete: got only %i out of %i bytes"
1782                % (read, size), result)
1783
1784        return result
1785
1786    # Each method named open_<type> knows how to open that type of URL
1787
1788    def _open_generic_http(self, connection_factory, url, data):
1789        """Make an HTTP connection using connection_class.
1790
1791        This is an internal method that should be called from
1792        open_http() or open_https().
1793
1794        Arguments:
1795        - connection_factory should take a host name and return an
1796          HTTPConnection instance.
1797        - url is the url to retrieval or a host, relative-path pair.
1798        - data is payload for a POST request or None.
1799        """
1800
1801        user_passwd = None
1802        proxy_passwd= None
1803        if isinstance(url, str):
1804            host, selector = splithost(url)
1805            if host:
1806                user_passwd, host = splituser(host)
1807                host = unquote(host)
1808            realhost = host
1809        else:
1810            host, selector = url
1811            # check whether the proxy contains authorization information
1812            proxy_passwd, host = splituser(host)
1813            # now we proceed with the url we want to obtain
1814            urltype, rest = splittype(selector)
1815            url = rest
1816            user_passwd = None
1817            if urltype.lower() != 'http':
1818                realhost = None
1819            else:
1820                realhost, rest = splithost(rest)
1821                if realhost:
1822                    user_passwd, realhost = splituser(realhost)
1823                if user_passwd:
1824                    selector = "%s://%s%s" % (urltype, realhost, rest)
1825                if proxy_bypass(realhost):
1826                    host = realhost
1827
1828        if not host: raise IOError('http error', 'no host given')
1829
1830        if proxy_passwd:
1831            proxy_passwd = unquote(proxy_passwd)
1832            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
1833        else:
1834            proxy_auth = None
1835
1836        if user_passwd:
1837            user_passwd = unquote(user_passwd)
1838            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
1839        else:
1840            auth = None
1841        http_conn = connection_factory(host)
1842        headers = {}
1843        if proxy_auth:
1844            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
1845        if auth:
1846            headers["Authorization"] =  "Basic %s" % auth
1847        if realhost:
1848            headers["Host"] = realhost
1849
1850        # Add Connection:close as we don't support persistent connections yet.
1851        # This helps in closing the socket and avoiding ResourceWarning
1852
1853        headers["Connection"] = "close"
1854
1855        for header, value in self.addheaders:
1856            headers[header] = value
1857
1858        if data is not None:
1859            headers["Content-Type"] = "application/x-www-form-urlencoded"
1860            http_conn.request("POST", selector, data, headers)
1861        else:
1862            http_conn.request("GET", selector, headers=headers)
1863
1864        try:
1865            response = http_conn.getresponse()
1866        except http_client.BadStatusLine:
1867            # something went wrong with the HTTP status line
1868            raise URLError("http protocol error: bad status line")
1869
1870        # According to RFC 2616, "2xx" code indicates that the client's
1871        # request was successfully received, understood, and accepted.
1872        if 200 <= response.status < 300:
1873            return addinfourl(response, response.msg, "http:" + url,
1874                              response.status)
1875        else:
1876            return self.http_error(
1877                url, response.fp,
1878                response.status, response.reason, response.msg, data)
1879
1880    def open_http(self, url, data=None):
1881        """Use HTTP protocol."""
1882        return self._open_generic_http(http_client.HTTPConnection, url, data)
1883
1884    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1885        """Handle http errors.
1886
1887        Derived class can override this, or provide specific handlers
1888        named http_error_DDD where DDD is the 3-digit error code."""
1889        # First check if there's a specific handler for this error
1890        name = 'http_error_%d' % errcode
1891        if hasattr(self, name):
1892            method = getattr(self, name)
1893            if data is None:
1894                result = method(url, fp, errcode, errmsg, headers)
1895            else:
1896                result = method(url, fp, errcode, errmsg, headers, data)
1897            if result: return result
1898        return self.http_error_default(url, fp, errcode, errmsg, headers)
1899
1900    def http_error_default(self, url, fp, errcode, errmsg, headers):
1901        """Default error handler: close the connection and raise IOError."""
1902        fp.close()
1903        raise HTTPError(url, errcode, errmsg, headers, None)
1904
1905    if _have_ssl:
1906        def _https_connection(self, host):
1907            return http_client.HTTPSConnection(host,
1908                                           key_file=self.key_file,
1909                                           cert_file=self.cert_file)
1910
1911        def open_https(self, url, data=None):
1912            """Use HTTPS protocol."""
1913            return self._open_generic_http(self._https_connection, url, data)
1914
1915    def open_file(self, url):
1916        """Use local file or FTP depending on form of URL."""
1917        if not isinstance(url, str):
1918            raise URLError('file error: proxy support for file protocol currently not implemented')
1919        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1920            raise ValueError("file:// scheme is supported only on localhost")
1921        else:
1922            return self.open_local_file(url)
1923
1924    def open_local_file(self, url):
1925        """Use local file."""
1926        import future.backports.email.utils as email_utils
1927        import mimetypes
1928        host, file = splithost(url)
1929        localname = url2pathname(file)
1930        try:
1931            stats = os.stat(localname)
1932        except OSError as e:
1933            raise URLError(e.strerror, e.filename)
1934        size = stats.st_size
1935        modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
1936        mtype = mimetypes.guess_type(url)[0]
1937        headers = email.message_from_string(
1938            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
1939            (mtype or 'text/plain', size, modified))
1940        if not host:
1941            urlfile = file
1942            if file[:1] == '/':
1943                urlfile = 'file://' + file
1944            return addinfourl(open(localname, 'rb'), headers, urlfile)
1945        host, port = splitport(host)
1946        if (not port
1947           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
1948            urlfile = file
1949            if file[:1] == '/':
1950                urlfile = 'file://' + file
1951            elif file[:2] == './':
1952                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
1953            return addinfourl(open(localname, 'rb'), headers, urlfile)
1954        raise URLError('local file error: not on local host')
1955
1956    def open_ftp(self, url):
1957        """Use FTP protocol."""
1958        if not isinstance(url, str):
1959            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
1960        import mimetypes
1961        host, path = splithost(url)
1962        if not host: raise URLError('ftp error: no host given')
1963        host, port = splitport(host)
1964        user, host = splituser(host)
1965        if user: user, passwd = splitpasswd(user)
1966        else: passwd = None
1967        host = unquote(host)
1968        user = unquote(user or '')
1969        passwd = unquote(passwd or '')
1970        host = socket.gethostbyname(host)
1971        if not port:
1972            import ftplib
1973            port = ftplib.FTP_PORT
1974        else:
1975            port = int(port)
1976        path, attrs = splitattr(path)
1977        path = unquote(path)
1978        dirs = path.split('/')
1979        dirs, file = dirs[:-1], dirs[-1]
1980        if dirs and not dirs[0]: dirs = dirs[1:]
1981        if dirs and not dirs[0]: dirs[0] = '/'
1982        key = user, host, port, '/'.join(dirs)
1983        # XXX thread unsafe!
1984        if len(self.ftpcache) > MAXFTPCACHE:
1985            # Prune the cache, rather arbitrarily
1986            for k in self.ftpcache.keys():
1987                if k != key:
1988                    v = self.ftpcache[k]
1989                    del self.ftpcache[k]
1990                    v.close()
1991        try:
1992            if key not in self.ftpcache:
1993                self.ftpcache[key] = \
1994                    ftpwrapper(user, passwd, host, port, dirs)
1995            if not file: type = 'D'
1996            else: type = 'I'
1997            for attr in attrs:
1998                attr, value = splitvalue(attr)
1999                if attr.lower() == 'type' and \
2000                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
2001                    type = value.upper()
2002            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
2003            mtype = mimetypes.guess_type("ftp:" + url)[0]
2004            headers = ""
2005            if mtype:
2006                headers += "Content-Type: %s\n" % mtype
2007            if retrlen is not None and retrlen >= 0:
2008                headers += "Content-Length: %d\n" % retrlen
2009            headers = email.message_from_string(headers)
2010            return addinfourl(fp, headers, "ftp:" + url)
2011        except ftperrors() as exp:
2012            raise_with_traceback(URLError('ftp error %r' % exp))
2013
2014    def open_data(self, url, data=None):
2015        """Use "data" URL."""
2016        if not isinstance(url, str):
2017            raise URLError('data error: proxy support for data protocol currently not implemented')
2018        # ignore POSTed data
2019        #
2020        # syntax of data URLs:
2021        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
2022        # mediatype := [ type "/" subtype ] *( ";" parameter )
2023        # data      := *urlchar
2024        # parameter := attribute "=" value
2025        try:
2026            [type, data] = url.split(',', 1)
2027        except ValueError:
2028            raise IOError('data error', 'bad data URL')
2029        if not type:
2030            type = 'text/plain;charset=US-ASCII'
2031        semi = type.rfind(';')
2032        if semi >= 0 and '=' not in type[semi:]:
2033            encoding = type[semi+1:]
2034            type = type[:semi]
2035        else:
2036            encoding = ''
2037        msg = []
2038        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
2039                                            time.gmtime(time.time())))
2040        msg.append('Content-type: %s' % type)
2041        if encoding == 'base64':
2042            # XXX is this encoding/decoding ok?
2043            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
2044        else:
2045            data = unquote(data)
2046        msg.append('Content-Length: %d' % len(data))
2047        msg.append('')
2048        msg.append(data)
2049        msg = '\n'.join(msg)
2050        headers = email.message_from_string(msg)
2051        f = io.StringIO(msg)
2052        #f.fileno = None     # needed for addinfourl
2053        return addinfourl(f, headers, url)
2054
2055
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" -> (user, passwd); see get_user_passwd().
        self.auth_cache = {}
        # Redirect-recursion guard used by http_error_302().
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike the base class (which raises HTTPError), hand the error
        # page back to the caller as an ordinary response object.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report a synthetic 500
            # instead of following the chain forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow a redirect: open the URL named by Location (or URI)."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # No redirect target supplied; give up (returns None).
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE: URLopener.http_error_default raises HTTPError, so each of
        # the explicit base-class calls below aborts this method when its
        # guard condition fails.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # self.type was recorded by URLopener.open() (e.g. 'http'),
        # selecting retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401; the base-class calls raise,
        # terminating this handler when a guard fails.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding credentials in the http proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any old user:pass@ prefix from the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding credentials in the https proxy URL."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open url with user:pass@ credentials embedded in the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open url with user:pass@ credentials embedded in the host."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm@host, prompting if not cached."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                # A previous cached value failed; drop it and re-prompt.
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2264
2265
2266# Utility functions
2267
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup's result is memoized in a module-level cache.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2275
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    The tuple is computed once and memoized at module level; if the
    machine's own hostname cannot be resolved, fall back to 'localhost'.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2286
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily on first use and the result is memoized.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2295
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    A single shared instance is created lazily and reused.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2303
2304
2305# Utility classes
2306
class ftpwrapper(object):
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of outstanding transfer file objects handed out by
        # retrfile(); see file_close()/close().
        self.refcount = 0
        # When True, the control connection stays open after the last
        # transfer finishes, so the wrapper can be reused from the cache.
        self.keepalive = persistent
        self.init()

    def init(self):
        """Open (or reopen) the FTP control connection and cwd into dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start a retrieval; return (file-like object, length or None).

        type 'd'/'D' requests a directory listing (ASCII mode); any
        other value is sent verbatim as a 'TYPE <x>' command before
        attempting RETR.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection went away (e.g. stale cache entry):
            # reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply means RETR failed (e.g. it's a directory);
                # fall through to the LIST attempt below.  Anything else
                # is a genuine error.
                if str(reason)[:3] != '550':
                    raise_with_traceback(URLError('ftp error: %r' % reason))
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        ### Was:
                        # raise URLError('ftp error: %r' % reason) from reason
                        exc = URLError('ftp error: %r' % reason)
                        exc.__cause__ = reason
                        raise exc
                finally:
                    # Always restore the working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object,
        # decrementing refcount.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        # NOTE(review): closing conn here appears to rely on the object
        # returned by makefile() keeping the data connection alive until
        # it is itself closed -- confirm against the socket semantics.
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        """Request shutdown; actual close is deferred while transfers
        are still outstanding (file_close() finishes the job)."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Invoked via addclosehook when a transfer's file object closes.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Unconditionally close the FTP control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2399
2400# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    for var, value in os.environ.items():
        lowered = var.lower()
        # Only non-empty values count; the scheme is the variable name
        # minus the '_proxy' suffix.
        if value and lowered.endswith(suffix):
            proxies[lowered[:-len(suffix)]] = value
    return proxies
2416
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # A lone '*' means: bypass the proxy for every host.
    if no_proxy == '*':
        return 1
    # Compare against the host with any :port suffix removed.
    hostonly, port = splitport(host)
    # Bypass when the host (with or without port) ends with a listed suffix.
    for suffix in (entry.strip() for entry in no_proxy.split(',')):
        if suffix and (hostonly.endswith(suffix) or host.endswith(suffix)):
            return 1
    # No suffix matched: use the proxy.
    return 0
2436
2437
2438# This code tests an OSX specific data structure but is testable on all
2439# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly truncated, e.g. "10.1") dotted-quad string into
        # a 32-bit integer, padding missing trailing octets with zeros.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Resolved numeric address of the host; computed lazily on the first
    # numeric exception entry and reused afterwards.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry: compare the host's address against the
            # network given as a base address plus optional prefix length.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    # Unresolvable host: this entry cannot match.
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: infer 8 bits per dotted component,
                # e.g. "10.1" -> /16.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths; without
                # this guard a value like "10.0/40" would make the shift
                # below raise ValueError (negative shift count).
                continue

            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        # Non-numeric entry: treat it as a glob pattern (e.g. *.local).
        elif fnmatch(host, value):
            return True

    return False
2498
2499
if sys.platform == 'darwin':
    # On OS X, system-wide proxy settings live in the SystemConfiguration
    # framework, reached through the private _scproxy extension module.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True iff *host* should bypass the proxy according to the
        OS X SystemConfiguration settings."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True iff *host* should be accessed without a proxy.

        Environment variables, when any are set, take precedence over the
        system configuration.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return scheme -> proxy URL mappings from the environment, or,
        when none are set there, from the system configuration."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings, e.g. "http=host:port;ftp=host:port"
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            # No scheme given: default it to the protocol name.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if *host* matches the registry's ProxyOverride list
        (so the proxy should be bypassed), else 0."""
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            # Key or values missing: never bypass.
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for any host name without a dot.
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style override entry into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return 1 if *host* should be accessed without a proxy, else 0.

        Checks the environment's no_proxy setting if any proxy environment
        variables are set, otherwise the Windows registry's ProxyOverride.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables: there is no platform-specific
    # proxy store to consult on this OS.
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
