1# Copyright (C) 2005-2010 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17"""Base implementation of Transport over http using urllib.
18
19There are separate implementation modules for each http client implementation.
20"""
21
22from __future__ import absolute_import
23
# Module-wide debug level, propagated to the http_client/urllib machinery
# via set_debuglevel()/debuglevel= in the handlers below.
DEBUG = 0
25
26import base64
27import cgi
28import errno
29import os
30import re
31import socket
32import ssl
33import sys
34import time
35import urllib
36import weakref
37
38try:
39    import http.client as http_client
40except ImportError:
41    import httplib as http_client
42try:
43    import urllib.request as urllib_request
44except ImportError:  # python < 3
45    import urllib2 as urllib_request
46try:
47    from urllib.parse import urljoin, splitport, splittype, splithost, urlencode
48except ImportError:
49    from urlparse import urljoin
50    from urllib import splitport, splittype, splithost, urlencode
51
52# TODO: handle_response should be integrated into the http/__init__.py
53from .response import handle_response
54
# FIXME: Oversimplifying, two kinds of exceptions should be
# raised, once a request is issued: URLError before we have been
# able to process the response, HTTPError after that. Processing
# the response means we are able to leave the socket clean, so if
# we are not able to do that, we should close the connection. The
# actual code more or less does that; tests should be written to
# ensure that.
62
63from ... import __version__ as breezy_version
64from ... import (
65    config,
66    debug,
67    errors,
68    lazy_import,
69    osutils,
70    trace,
71    transport,
72    ui,
73    urlutils,
74)
75from ...bzr.smart import medium
76from ...trace import mutter
77from ...transport import (
78    ConnectedTransport,
79    UnusableRedirect,
80    )
81
82from . import default_user_agent, ssl
83
84
# Optional kerberos support: 'kerberos' stays None until an import is
# attempted and 'checked_kerberos' records whether that attempt happened.
# NOTE(review): the probing code is not in this chunk — confirm it lives in
# an auth handler elsewhere in this module.
checked_kerberos = False
kerberos = None
87
88
class addinfourl(urllib_request.addinfourl):
    '''Replacement addinfourl class compatible with python-2.7's xmlrpclib

    In python-2.7, xmlrpclib expects that the response object that it receives
    has a getheader method.  http_client.HTTPResponse provides this but
    urllib_request.addinfourl does not.  Add the necessary functions here, ported to
    use the internal data structures of addinfourl.
    '''

    def getheader(self, name, default=None):
        headers = self.headers
        if headers is None:
            # Mirror HTTPResponse: no headers yet means the response has not
            # been started.
            raise http_client.ResponseNotReady()
        return headers.getheader(name, default)

    def getheaders(self):
        headers = self.headers
        if headers is None:
            raise http_client.ResponseNotReady()
        return [item for item in headers.items()]
107
108
109class _ReportingFileSocket(object):
110
111    def __init__(self, filesock, report_activity=None):
112        self.filesock = filesock
113        self._report_activity = report_activity
114
115    def report_activity(self, size, direction):
116        if self._report_activity:
117            self._report_activity(size, direction)
118
119    def read(self, size=1):
120        s = self.filesock.read(size)
121        self.report_activity(len(s), 'read')
122        return s
123
124    def readline(self, size=-1):
125        s = self.filesock.readline(size)
126        self.report_activity(len(s), 'read')
127        return s
128
129    def readinto(self, b):
130        s = self.filesock.readinto(b)
131        self.report_activity(s, 'read')
132        return s
133
134    def __getattr__(self, name):
135        return getattr(self.filesock, name)
136
137
138class _ReportingSocket(object):
139
140    def __init__(self, sock, report_activity=None):
141        self.sock = sock
142        self._report_activity = report_activity
143
144    def report_activity(self, size, direction):
145        if self._report_activity:
146            self._report_activity(size, direction)
147
148    def sendall(self, s, *args):
149        self.sock.sendall(s, *args)
150        self.report_activity(len(s), 'write')
151
152    def recv(self, *args):
153        s = self.sock.recv(*args)
154        self.report_activity(len(s), 'read')
155        return s
156
157    def makefile(self, mode='r', bufsize=-1):
158        # http_client creates a fileobject that doesn't do buffering, which
159        # makes fp.readline() very expensive because it only reads one byte
160        # at a time.  So we wrap the socket in an object that forces
161        # sock.makefile to make a buffered file.
162        fsock = self.sock.makefile(mode, 65536)
163        # And wrap that into a reporting kind of fileobject
164        return _ReportingFileSocket(fsock, self._report_activity)
165
166    def __getattr__(self, name):
167        return getattr(self.sock, name)
168
169
170# We define our own Response class to keep our http_client pipe clean
class Response(http_client.HTTPResponse):
    """Custom HTTPResponse, to avoid the need to decorate.

    http_client prefers to decorate the returned objects, rather
    than using a custom object.
    """

    # Some responses have bodies in which we have no interest
    _body_ignored_responses = [301, 302, 303, 307, 308, 403, 404, 501]

    # in finish() below, we may have to discard several MB in the worst
    # case. To avoid buffering that much, we read and discard by chunks
    # instead. The underlying file is either a socket or a StringIO, so reading
    # 8k chunks should be fine.
    _discarded_buf_size = 8192

    def __init__(self, sock, debuglevel=0, method=None, url=None):
        # Remember the url this response answers.
        self.url = url
        super(Response, self).__init__(
            sock, debuglevel=debuglevel, method=method, url=url)

    def begin(self):
        """Begin to read the response from the server.

        http_client assumes that some responses get no content and do
        not even attempt to read the body in that case, leaving
        the body in the socket, blocking the next request. Let's
        try to workaround that.
        """
        http_client.HTTPResponse.begin(self)
        if self.status in self._body_ignored_responses:
            if self.debuglevel >= 2:
                print("For status: [%s], will ready body, length: %s" % (
                    self.status, self.length))
            if not (self.length is None or self.will_close):
                # In some cases, we just can't read the body not
                # even try or we may encounter a 104, 'Connection
                # reset by peer' error if there is indeed no body
                # and the server closed the connection just after
                # having issued the response headers (even if the
                # headers indicate a Content-Type...)
                body = self.read(self.length)
                if self.debuglevel >= 9:
                    # This one can be huge and is generally not interesting
                    print("Consumed body: [%s]" % body)
            self.close()
        elif self.status == 200:
            # Whatever the request is, it went ok, so we surely don't want to
            # close the connection. Some cases are not correctly detected by
            # http_client.HTTPConnection.getresponse (called by
            # http_client.HTTPResponse.begin). The CONNECT response for the https
            # through proxy case is one.  Note: the 'will_close' below refers
            # to the "true" socket between us and the server, whereas the
            # 'close()' above refers to the copy of that socket created by
            # http_client for the response itself. So, in the if above we close the
            # socket to indicate that we are done with the response whereas
            # below we keep the socket with the server opened.
            self.will_close = False

    def finish(self):
        """Finish reading the body.

        In some cases, the client may have left some bytes to read in the
        body. That will block the next request to succeed if we use a
        persistent connection. If we don't use a persistent connection, well,
        nothing will block the next request since a new connection will be
        issued anyway.

        :return: the number of bytes left on the socket (may be None)
        """
        pending = None
        if not self.isclosed():
            # Make sure nothing was left to be read on the socket
            pending = 0
            data = True
            while data and self.length:
                # read() will update self.length
                data = self.read(min(self.length, self._discarded_buf_size))
                pending += len(data)
            if pending:
                trace.mutter("%s bytes left on the HTTP socket", pending)
            self.close()
        return pending
254
255
256# Not inheriting from 'object' because http_client.HTTPConnection doesn't.
class AbstractHTTPConnection:
    """A custom HTTP(S) Connection, which can reset itself on a bad response"""

    response_class = Response

    # When we detect a server responding with the whole file to range requests,
    # we want to warn. But not below a given thresold.
    _range_warning_thresold = 1024 * 1024

    def __init__(self, report_activity=None):
        # Last response received on this connection; kept so cleanup_pipe()
        # can drain any unread body before the next request is sent.
        self._response = None
        # Optional callback(size, direction) used to report network activity.
        self._report_activity = report_activity
        # None until we detect (once) a server answering a range request with
        # the whole file; set to True afterwards so we warn only once.
        self._ranges_received_whole_file = None

    def _mutter_connect(self):
        # Trace the target (and proxy, if any) we are about to connect to.
        netloc = '%s:%s' % (self.host, self.port)
        if self.proxied_host is not None:
            netloc += '(proxy for %s)' % self.proxied_host
        trace.mutter('* About to connect() to %s' % netloc)

    def getresponse(self):
        """Capture the response to be able to cleanup"""
        self._response = http_client.HTTPConnection.getresponse(self)
        return self._response

    def cleanup_pipe(self):
        """Read the remaining bytes of the last response if any."""
        if self._response is not None:
            try:
                pending = self._response.finish()
                # Warn the user (once)
                if (self._ranges_received_whole_file is None
                        and self._response.status == 200
                        and pending
                        and pending > self._range_warning_thresold):
                    self._ranges_received_whole_file = True
                    trace.warning(
                        'Got a 200 response when asking for multiple ranges,'
                        ' does your server at %s:%s support range requests?',
                        self.host, self.port)
            except socket.error as e:
                # It's conceivable that the socket is in a bad state here
                # (including some test cases) and in this case, it doesn't need
                # cleaning anymore, so no need to fail, we just get rid of the
                # socket and let callers reconnect
                if (len(e.args) == 0
                        or e.args[0] not in (errno.ECONNRESET, errno.ECONNABORTED)):
                    raise
                self.close()
            self._response = None
        # Preserve our preciousss: close() would discard the socket, but we
        # want to keep the connection alive for reuse, so hide it first.
        sock = self.sock
        self.sock = None
        # Let http_client.HTTPConnection do its housekeeping
        self.close()
        # Restore our preciousss
        self.sock = sock

    def _wrap_socket_for_reporting(self, sock):
        """Wrap the socket before anybody use it."""
        self.sock = _ReportingSocket(sock, self._report_activity)
318
319
class HTTPConnection(AbstractHTTPConnection, http_client.HTTPConnection):
    """A plain http connection with activity reporting support."""

    # XXX: Needs refactoring at the caller level.
    def __init__(self, host, port=None, proxied_host=None,
                 report_activity=None, ca_certs=None):
        AbstractHTTPConnection.__init__(self, report_activity=report_activity)
        http_client.HTTPConnection.__init__(self, host, port)
        self.proxied_host = proxied_host
        # The ca_certs parameter is accepted only for signature compatibility
        # with HTTPSConnection; it is meaningless for plain http.

    def connect(self):
        if 'http' in debug.debug_flags:
            self._mutter_connect()
        http_client.HTTPConnection.connect(self)
        # Report activity from the moment the socket exists.
        self._wrap_socket_for_reporting(self.sock)
335
336
class HTTPSConnection(AbstractHTTPConnection, http_client.HTTPSConnection):
    """An https connection with activity reporting.

    The TLS handshake is performed by connect_to_origin(), not by the base
    class: connect() only establishes the TCP connection (directly, or to the
    proxy when one is configured).
    """

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 proxied_host=None,
                 report_activity=None, ca_certs=None):
        AbstractHTTPConnection.__init__(self, report_activity=report_activity)
        # NOTE(review): passing key_file/cert_file positionally relies on the
        # pre-3.12 http.client.HTTPSConnection signature -- confirm against
        # the Python versions this code must support.
        http_client.HTTPSConnection.__init__(
            self, host, port, key_file, cert_file)
        self.proxied_host = proxied_host
        self.ca_certs = ca_certs

    def connect(self):
        if 'http' in debug.debug_flags:
            self._mutter_connect()
        # Deliberately call the *plain* HTTPConnection.connect: only the TCP
        # connection is made here, the ssl layer is added separately below.
        http_client.HTTPConnection.connect(self)
        self._wrap_socket_for_reporting(self.sock)
        if self.proxied_host is None:
            # Direct connection: we can handshake with the origin right away.
            # (With a proxy, callers are expected to establish the CONNECT
            # tunnel first and then call connect_to_origin themselves.)
            self.connect_to_origin()

    def connect_to_origin(self):
        """Wrap self.sock with ssl, verifying certificates as configured.

        :raises ssl.SSLError: when certificate verification fails.
        """
        # FIXME JRV 2011-12-18: Use location config here?
        config_stack = config.GlobalStack()
        cert_reqs = config_stack.get('ssl.cert_reqs')
        if self.proxied_host is not None:
            # Name the origin server (not the proxy) in the warning below.
            host = self.proxied_host.split(":", 1)[0]
        else:
            host = self.host
        if cert_reqs == ssl.CERT_NONE:
            ui.ui_factory.show_user_warning('not_checking_ssl_cert', host=host)
            ui.ui_factory.suppressed_warnings.add('not_checking_ssl_cert')
            ca_certs = None
        else:
            # Explicit ca_certs takes precedence over the configured ones.
            if self.ca_certs is None:
                ca_certs = config_stack.get('ssl.ca_certs')
            else:
                ca_certs = self.ca_certs
            if ca_certs is None:
                trace.warning(
                    "No valid trusted SSL CA certificates file set. See "
                    "'brz help ssl.ca_certs' for more information on setting "
                    "trusted CAs.")
        try:
            ssl_context = ssl.create_default_context(
                purpose=ssl.Purpose.SERVER_AUTH, cafile=ca_certs)
            ssl_context.check_hostname = cert_reqs != ssl.CERT_NONE
            if self.cert_file:
                ssl_context.load_cert_chain(
                    keyfile=self.key_file, certfile=self.cert_file)
            ssl_context.verify_mode = cert_reqs
            ssl_sock = ssl_context.wrap_socket(
                self.sock, server_hostname=self.host)
        except ssl.SSLError:
            trace.note(
                "\n"
                "See `brz help ssl.ca_certs` for how to specify trusted CA"
                "certificates.\n"
                "Pass -Ossl.cert_reqs=none to disable certificate "
                "verification entirely.\n")
            raise
        # Wrap the ssl socket before anybody use it
        self._wrap_socket_for_reporting(ssl_sock)
398
399
class Request(urllib_request.Request):
    """A custom Request object.

    urllib_request determines the request method heuristically (based on
    the presence or absence of data). We set the method
    statically.

    The Request object tracks:
    - the connection the request will be made on.
    - the authentication parameters needed to preventively set
      the authentication header once a first authentication have
      been made.
    """

    def __init__(self, method, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 connection=None, parent=None):
        """Create a request.

        :param method: The HTTP method, set statically instead of letting
            urllib_request guess it from the presence of data.
        :param headers: Optional dict of headers. A fresh empty dict is used
            when omitted; the former ``headers={}`` default was a shared
            mutable default argument.
        :param connection: The connection the request will be sent on, when
            already known (connections are shared between requests).
        :param parent: The request this one was redirected from, if any.
        """
        # Compare against None (not simple truthiness) so that an explicitly
        # passed empty dict is still forwarded as given.
        urllib_request.Request.__init__(
            self, url, data, {} if headers is None else headers,
            origin_req_host, unverifiable)
        self.method = method
        self.connection = connection
        # To handle redirections
        self.parent = parent
        self.redirected_to = None
        # Unless told otherwise, redirections are not followed
        self.follow_redirections = False
        # auth and proxy_auth are dicts containing, at least
        # (scheme, host, port, realm, user, password, protocol, path).
        # The dict entries are mostly handled by the AuthHandler.
        # Some authentication schemes may add more entries.
        self.auth = {}
        self.proxy_auth = {}
        self.proxied_host = None

    def get_method(self):
        """Return the statically defined HTTP method."""
        return self.method

    def set_proxy(self, proxy, type):
        """Set the proxy and remember the proxied host."""
        host, port = splitport(self.host)
        if port is None:
            # We need to set the default port ourselves way before it gets set
            # in the HTTP[S]Connection object at build time.
            if self.type == 'https':
                conn_class = HTTPSConnection
            else:
                conn_class = HTTPConnection
            port = conn_class.default_port
        self.proxied_host = '%s:%s' % (host, port)
        urllib_request.Request.set_proxy(self, proxy, type)
        # When urllib_request makes a https request with our wrapper code and a proxy,
        # it sets Host to the https proxy, not the host we want to talk to.
        # I'm fairly sure this is our fault, but what is the cause is an open
        # question. -- Robert Collins May 8 2010.
        self.add_unredirected_header('Host', self.proxied_host)
456
457
class _ConnectRequest(Request):
    """A CONNECT request, used to set up tunneling through a proxy."""

    def __init__(self, request):
        """Constructor

        :param request: the first request sent to the proxied host, already
            processed by the opener (i.e. proxied_host is already set).
        """
        # A fake url is given here; the real target is provided by 'selector'
        # (redefined below), otherwise urllib_request gets confused.
        Request.__init__(self, 'CONNECT', request.get_full_url(),
                         connection=request.connection)
        if request.proxied_host is None:
            raise AssertionError()
        self.proxied_host = request.proxied_host

    @property
    def selector(self):
        # A CONNECT request targets the proxied host itself.
        return self.proxied_host

    def get_selector(self):
        return self.proxied_host

    def set_proxy(self, proxy, type):
        """Set the proxy without remembering the proxied host.

        The proxied host is already known by definition: a CONNECT request
        only happens when the connection goes through a proxy. The usual
        masquerading (directing the connection at the proxy while the
        request targets another host) therefore does not apply here: the
        connection with the proxy already exists and we just want to enable
        the SSL tunneling.
        """
        urllib_request.Request.set_proxy(self, proxy, type)
492
493
class ConnectionHandler(urllib_request.BaseHandler):
    """Provides connection-sharing by pre-processing requests.

    urllib_request provides no way to access the HTTPConnection object it
    uses internally. But we need it in order to achieve connection sharing.
    So, we add it to the request just before it is processed, and then we
    override the do_open method for http[s] requests in AbstractHTTPHandler.
    """

    handler_order = 1000  # after all pre-processings

    def __init__(self, report_activity=None, ca_certs=None):
        self._report_activity = report_activity
        self.ca_certs = ca_certs

    def create_connection(self, request, http_connection_class):
        """Build a fresh (not-yet-connected) connection for the request."""
        host = request.host
        if not host:
            # Just a bit of paranoia here, this should have been
            # handled in the higher levels
            raise urlutils.InvalidURL(request.get_full_url(), 'no host given.')

        # The connection object is created here but will only really connect
        # once the first request is issued on it.
        try:
            return http_connection_class(
                host, proxied_host=request.proxied_host,
                report_activity=self._report_activity,
                ca_certs=self.ca_certs)
        except http_client.InvalidURL:
            # There is only one occurrence of InvalidURL in http_client
            raise urlutils.InvalidURL(request.get_full_url(),
                                      extra='nonnumeric port')

    def capture_connection(self, request, http_connection_class):
        """Capture or inject the request connection.

        Two cases:
        - the request has no connection: create a new one,

        - the request has a connection: it has been used already, so let's
          capture it to hand it to another transport for reuse. We don't do
          that ourselves: the Transport object gets the connection from a
          first request and then propagates it, from request to request or
          to cloned transports.
        """
        if request.connection is None:
            # First request on this transport: create the connection now.
            request.connection = self.create_connection(
                request, http_connection_class)

        # All connections will pass here, propagate debug level
        request.connection.set_debuglevel(DEBUG)
        return request

    def http_request(self, request):
        return self.capture_connection(request, HTTPConnection)

    def https_request(self, request):
        return self.capture_connection(request, HTTPSConnection)
559
560
class AbstractHTTPHandler(urllib_request.AbstractHTTPHandler):
    """A custom handler for HTTP(S) requests.

    We override urllib_request.AbstractHTTPHandler to get better
    control of the connection, the ability to implement new
    request types and return a response able to cope with
    persistent connections.
    """

    # We change our order to be before urllib_request HTTP[S]Handlers
    # and be chosen instead of them (the first http_open called
    # wins).
    handler_order = 400

    # Headers added to every request unless the caller already set them.
    _default_headers = {'Pragma': 'no-cache',
                        'Cache-control': 'max-age=0',
                        'Connection': 'Keep-Alive',
                        'User-agent': default_user_agent(),
                        'Accept': '*/*',
                        }

    def __init__(self):
        urllib_request.AbstractHTTPHandler.__init__(self, debuglevel=DEBUG)

    def http_request(self, request):
        """Common headers setting"""

        for name, value in self._default_headers.items():
            if name not in request.headers:
                request.headers[name] = value
        # FIXME: We may have to add the Content-Length header if
        # we have data to send.
        return request

    def retry_or_raise(self, http_class, request, first_try):
        """Retry the request (once) or raise the exception.

        urllib_request raises exception of application level kind, we
        just have to translate them.

        http_client can raise exceptions of transport level (badly
        formatted dialog, loss of connexion or socket level
        problems). In that case we should issue the request again
        (http_client will close and reopen a new connection if
        needed).

        :return: the response of the retried request, when a retry was
            performed; otherwise an exception is raised.
        """
        # When an exception occurs, we give back the original
        # Traceback or the bugs are hard to diagnose.
        exc_type, exc_val, exc_tb = sys.exc_info()
        if exc_type == socket.gaierror:
            # No need to retry, that will not help
            origin_req_host = request.origin_req_host
            raise errors.ConnectionError("Couldn't resolve host '%s'"
                                         % origin_req_host,
                                         orig_error=exc_val)
        elif isinstance(exc_val, http_client.ImproperConnectionState):
            # The http_client pipeline is in incorrect state, it's a bug in our
            # implementation.
            raise exc_val.with_traceback(exc_tb)
        else:
            if first_try:
                if self._debuglevel >= 2:
                    print('Received exception: [%r]' % exc_val)
                    print('  On connection: [%r]' % request.connection)
                    method = request.get_method()
                    url = request.get_full_url()
                    print('  Will retry, %s %r' % (method, url))
                # Transport-level failure: drop the (possibly stale)
                # connection and retry the request once on a fresh one.
                request.connection.close()
                response = self.do_open(http_class, request, False)
            else:
                if self._debuglevel >= 2:
                    print('Received second exception: [%r]' % exc_val)
                    print('  On connection: [%r]' % request.connection)
                if exc_type in (http_client.BadStatusLine, http_client.UnknownProtocol):
                    # http_client.BadStatusLine and
                    # http_client.UnknownProtocol indicates that a
                    # bogus server was encountered or a bad
                    # connection (i.e. transient errors) is
                    # experimented, we have already retried once
                    # for that request so we raise the exception.
                    my_exception = errors.InvalidHttpResponse(
                        request.get_full_url(),
                        'Bad status line received',
                        orig_error=exc_val)
                elif (isinstance(exc_val, socket.error) and len(exc_val.args)
                      and exc_val.args[0] in (errno.ECONNRESET, 10053, 10054)):
                    # 10053 == WSAECONNABORTED
                    # 10054 == WSAECONNRESET
                    raise errors.ConnectionReset(
                        "Connection lost while sending request.")
                else:
                    # All other exception are considered connection related.

                    # socket errors generally occurs for reasons
                    # far outside our scope, so closing the
                    # connection and retrying is the best we can
                    # do.
                    selector = request.selector
                    my_exception = errors.ConnectionError(
                        msg='while sending %s %s:' % (request.get_method(),
                                                      selector),
                        orig_error=exc_val)

                if self._debuglevel >= 2:
                    print('On connection: [%r]' % request.connection)
                    method = request.get_method()
                    url = request.get_full_url()
                    print('  Failed again, %s %r' % (method, url))
                    print('  Will raise: [%r]' % my_exception)
                raise my_exception.with_traceback(exc_tb)
        return response

    def do_open(self, http_class, request, first_try=True):
        """See urllib_request.AbstractHTTPHandler.do_open for the general idea.

        The request will be retried once if it fails.

        :param http_class: Unused to open the connection (the connection is
            already attached to the request by ConnectionHandler), only
            forwarded to retry_or_raise.
        """
        connection = request.connection
        if connection is None:
            raise AssertionError(
                'Cannot process a request without a connection')

        # Get all the headers
        headers = {}
        headers.update(request.header_items())
        headers.update(request.unredirected_hdrs)
        # Some servers or proxies will choke on headers not properly
        # cased. http_client/urllib/urllib_request all use capitalize to get canonical
        # header names, but only python2.5 urllib_request use title() to fix them just
        # before sending the request. And not all versions of python 2.5 do
        # that. Since we replace urllib_request.AbstractHTTPHandler.do_open we do it
        # ourself below.
        headers = {name.title(): val for name, val in headers.items()}

        try:
            method = request.get_method()
            url = request.selector
            if sys.version_info[:2] >= (3, 6):
                connection._send_request(method, url,
                                         # FIXME: implements 100-continue
                                         # None, # We don't send the body yet
                                         request.data,
                                         headers, encode_chunked=False)
            else:
                connection._send_request(method, url,
                                         # FIXME: implements 100-continue
                                         # None, # We don't send the body yet
                                         request.data,
                                         headers)
            if 'http' in debug.debug_flags:
                trace.mutter('> %s %s' % (method, url))
                hdrs = []
                for k, v in headers.items():
                    # People are often told to paste -Dhttp output to help
                    # debug. Don't compromise credentials.
                    if k in ('Authorization', 'Proxy-Authorization'):
                        v = '<masked>'
                    hdrs.append('%s: %s' % (k, v))
                trace.mutter('> ' + '\n> '.join(hdrs) + '\n')
            if self._debuglevel >= 1:
                print('Request sent: [%r] from (%s)'
                      % (request, request.connection.sock.getsockname()))
            response = connection.getresponse()
        except (ssl.SSLError, ssl.CertificateError):
            # Something is wrong with either the certificate or the hostname,
            # re-trying won't help
            raise
        except (socket.gaierror, http_client.BadStatusLine, http_client.UnknownProtocol,
                socket.error, http_client.HTTPException):
            response = self.retry_or_raise(http_class, request, first_try)

        # FIXME: HTTPConnection does not fully support 100-continue (the
        # server responses are just ignored)

        # NOTE: the python2-era code that converted the response into an
        # urllib addinfourl object (for xmlrpclib) was unreachable here --
        # it followed an unconditional return -- and has been removed,
        # along with the 'convert_to_addinfourl' flag that only it used.

        # urllib_request handlers expect a 'msg' attribute holding the
        # reason phrase.
        response.msg = response.reason
        return response
784
785
class HTTPHandler(AbstractHTTPHandler):
    """Handler dispatching plain http requests.

    All the real work happens in AbstractHTTPHandler.do_open; this class
    only selects the clear-text connection implementation.
    """

    def http_open(self, request):
        # Delegate to the shared opening logic with an HTTPConnection.
        return self.do_open(HTTPConnection, request)
791
792
class HTTPSHandler(AbstractHTTPHandler):
    """A custom handler that just thunks into HTTPSConnection"""

    https_request = AbstractHTTPHandler.http_request

    def https_open(self, request):
        """Open the request, establishing a proxy tunnel first if needed.

        When the connection goes through a proxy and is not yet
        established, a CONNECT request is issued (via self.parent.open)
        to set up the tunnel, then the socket is wrapped for TLS before
        the real request is sent.
        """
        connection = request.connection
        # Only tunnel when: no socket yet, a proxy is involved, and the
        # request itself is not the CONNECT we are about to issue.
        if connection.sock is None and \
                connection.proxied_host is not None and \
                request.get_method() != 'CONNECT':  # Don't loop
            # FIXME: We need a gazillion connection tests here, but we still
            # miss a https server :-( :
            # - with and without proxy
            # - with and without certificate
            # - with self-signed certificate
            # - with and without authentication
            # - with good and bad credentials (especially the proxy auth around
            #   CONNECT)
            # - with basic and digest schemes
            # - reconnection on errors
            # - connection persistence behaviour (including reconnection)

            # We are about to connect for the first time via a proxy, we must
            # issue a CONNECT request first to establish the encrypted link
            connect = _ConnectRequest(request)
            response = self.parent.open(connect)
            if response.code != 200:
                raise errors.ConnectionError("Can't connect to %s via proxy %s" % (
                    connect.proxied_host, self.host))
            # Housekeeping
            connection.cleanup_pipe()
            # Establish the connection encryption
            connection.connect_to_origin()
            # Propagate the connection to the original request
            request.connection = connection
        return self.do_open(HTTPSConnection, request)
829
830
class HTTPRedirectHandler(urllib_request.HTTPRedirectHandler):
    """Handles redirect requests.

    We have to implement our own scheme because we use a specific
    Request object and because we want to implement a specific
    policy.
    """
    _debuglevel = DEBUG
    # RFC2616 says that only read requests should be redirected
    # without interacting with the user. But Breezy uses some
    # shortcuts to optimize against roundtrips which can lead to
    # write requests being issued before read requests of
    # containing dirs can be redirected. So we redirect write
    # requests in the same way which seems to respect the spirit
    # of the RFC if not its letter.

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """See urllib_request.HTTPRedirectHandler.redirect_request

        :return: A new Request for the redirection target, chained to the
            parent request via the ``parent`` attribute.
        :raise urllib_request.HTTPError: For redirection codes that make
            no sense in our context (anything but 301/302/303/307/308).
        """
        # We would have preferred to update the request instead
        # of creating a new one, but the urllib_request.Request object
        # has a too complicated creation process to provide a
        # simple enough equivalent update process. Instead, when
        # redirecting, we only update the following request in
        # the redirect chain with a reference to the parent
        # request .

        # Some codes make no sense in our context and are treated
        # as errors:

        # 300: Multiple choices for different representations of
        #      the URI. Using that mechanism with Breezy will violate the
        #      protocol neutrality of Transport.

        # 304: Not modified (SHOULD only occur with conditional
        #      GETs which are not used by our implementation)

        # 305: Use proxy. I can't imagine this one occurring in
        #      our context-- vila/20060909

        # 306: Unused (if the RFC says so...)

        # If the code is 302 and the request is HEAD, some may
        # think that it is a sufficient hint that the file exists
        # and that we MAY avoid following the redirections. But
        # if we want to be sure, we MUST follow them.

        origin_req_host = req.origin_req_host

        if code in (301, 302, 303, 307, 308):
            return Request(req.get_method(), newurl,
                           headers=req.headers,
                           origin_req_host=origin_req_host,
                           unverifiable=True,
                           # TODO: It will be nice to be able to
                           # detect virtual hosts sharing the same
                           # IP address, that will allow us to
                           # share the same connection...
                           connection=None,
                           parent=req,
                           )
        else:
            raise urllib_request.HTTPError(
                req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        """Requests the redirected to URI.

        Copied from urllib_request to be able to clean the pipe of the associated
        connection, *before* issuing the redirected request but *after* having
        eventually raised an error.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.

        # TODO: Once we get rid of addinfourl objects, the
        # following will need to be updated to use correct case
        # for headers.
        if 'location' in headers:
            newurl = headers.get('location')
        elif 'uri' in headers:
            newurl = headers.get('uri')
        else:
            # No redirection target given, give up (returning None lets
            # the HTTP error be handled elsewhere).
            return

        # The Location header may be relative, resolve it against the
        # original URL.
        newurl = urljoin(req.get_full_url(), newurl)

        if self._debuglevel >= 1:
            print('Redirected to: %s (followed: %r)' % (newurl,
                                                        req.follow_redirections))
        if req.follow_redirections is False:
            # The caller asked not to follow; just record the target so it
            # can decide what to do.
            req.redirected_to = newurl
            return fp

        # This call succeeds or raise an error. urllib_request returns
        # if redirect_request returns None, but our
        # redirect_request never returns None.
        redirected_req = self.redirect_request(req, fp, code, msg, headers,
                                               newurl)

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = redirected_req.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise urllib_request.HTTPError(req.get_full_url(), code,
                                               self.inf_msg + msg, headers, fp)
        else:
            visited = redirected_req.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # We can close the fp now that we are sure that we won't
        # use it with HTTPError.
        fp.close()
        # We have all we need already in the response
        req.connection.cleanup_pipe()

        return self.parent.open(redirected_req)

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = http_error_302
951
952
class ProxyHandler(urllib_request.ProxyHandler):
    """Handles proxy setting.

    Copied and modified from urllib_request to be able to modify the request during
    the request pre-processing instead of modifying it at _open time. As we
    capture (or create) the connection object during request processing, _open
    time was too late.

    The main task is to modify the request so that the connection is done to
    the proxy while the request still refers to the destination host.

    Note: the proxy handling *may* modify the protocol used; the request may be
    against an https server proxied through an http proxy. So, https_request
    will be called, but later it's really http_open that will be called. This
    explains why we don't have to call self.parent.open as the urllib_request did.
    """

    # Proxies must be in front
    handler_order = 100
    _debuglevel = DEBUG

    def __init__(self, proxies=None):
        urllib_request.ProxyHandler.__init__(self, proxies)
        # First, let's get rid of urllib_request implementation
        for type, proxy in self.proxies.items():
            if self._debuglevel >= 3:
                print('Will unbind %s_open for %r' % (type, proxy))
            delattr(self, '%s_open' % type)

        def bind_scheme_request(proxy, scheme):
            # Install a <scheme>_request method that routes matching
            # requests through the given proxy via set_proxy.
            if proxy is None:
                return
            scheme_request = scheme + '_request'
            if self._debuglevel >= 3:
                print('Will bind %s for %r' % (scheme_request, proxy))
            setattr(self, scheme_request,
                    lambda request: self.set_proxy(request, scheme))
        # We are interested only by the http[s] proxies
        http_proxy = self.get_proxy_env_var('http')
        bind_scheme_request(http_proxy, 'http')
        https_proxy = self.get_proxy_env_var('https')
        bind_scheme_request(https_proxy, 'https')

    def get_proxy_env_var(self, name, default_to='all'):
        """Get a proxy env var.

        Note that we indirectly rely on
        urllib.getproxies_environment taking into account the
        uppercased values for proxy variables.

        :param name: The scheme name ('http', 'https', 'no', ...).
        :param default_to: The fallback key to try ('all' by default),
            or None for no fallback.
        :return: The proxy value, or None if neither key is set.
        """
        try:
            return self.proxies[name.lower()]
        except KeyError:
            if default_to is not None:
                # Try to get the alternate environment variable
                try:
                    return self.proxies[default_to]
                except KeyError:
                    pass
        return None

    def proxy_bypass(self, host):
        """Check if host should be proxied or not.

        :returns: True to skip the proxy, False otherwise.
        """
        no_proxy = self.get_proxy_env_var('no', default_to=None)
        bypass = self.evaluate_proxy_bypass(host, no_proxy)
        if bypass is None:
            # Nevertheless, there are platform-specific ways to
            # ignore proxies...
            return urllib_request.proxy_bypass(host)
        else:
            return bypass

    def evaluate_proxy_bypass(self, host, no_proxy):
        """Check the host against a comma-separated no_proxy list as a string.

        :param host: ``host:port`` being requested

        :param no_proxy: comma-separated list of hosts to access directly.

        :returns: True to skip the proxy, False not to, or None to
            leave it to urllib.
        """
        if no_proxy is None:
            # All hosts are proxied
            return False
        hhost, hport = splitport(host)
        # Does host match any of the domains mentioned in
        # no_proxy ? The rules about what is authorized in no_proxy
        # are fuzzy (to say the least). We try to allow most
        # commonly seen values.
        for domain in no_proxy.split(','):
            domain = domain.strip()
            if domain == '':
                continue
            dhost, dport = splitport(domain)
            if hport == dport or dport is None:
                # Protect glob chars
                dhost = dhost.replace(".", r"\.")
                dhost = dhost.replace("*", r".*")
                dhost = dhost.replace("?", r".")
                # NOTE(review): re.match only anchors at the start of
                # hhost, so a no_proxy entry also matches hosts that
                # merely start with it -- confirm this is intended.
                if re.match(dhost, hhost, re.IGNORECASE):
                    return True
        # Nothing explicitly avoids the host
        return None

    def set_proxy(self, request, type):
        """Set the proxy for the request, unless the host is bypassed.

        :param request: The request to (possibly) route through the proxy.
        :param type: The scheme ('http' or 'https') to pick the proxy for.
        :return: The request, updated in place when a proxy applies.
        """
        host = request.host
        if self.proxy_bypass(host):
            return request

        proxy = self.get_proxy_env_var(type)
        if self._debuglevel >= 3:
            print('set_proxy %s_request for %r' % (type, proxy))
        # FIXME: python 2.5 urlparse provides a better _parse_proxy which can
        # grok user:password@host:port as well as
        # http://user:password@host:port

        parsed_url = transport.ConnectedTransport._split_url(proxy)
        if not parsed_url.host:
            raise urlutils.InvalidURL(proxy, 'No host component')

        if request.proxy_auth == {}:
            # No proxy auth parameters are available, we are handling the
            # first proxied request, initialize.  scheme (the authentication
            # scheme) and realm will be set by the AuthHandler
            request.proxy_auth = {
                'host': parsed_url.host,
                'port': parsed_url.port,
                'user': parsed_url.user,
                'password': parsed_url.password,
                'protocol': parsed_url.scheme,
                # We ignore path since we connect to a proxy
                'path': None}
        if parsed_url.port is None:
            phost = parsed_url.host
        else:
            phost = parsed_url.host + ':%d' % parsed_url.port
        request.set_proxy(phost, type)
        if self._debuglevel >= 3:
            print('set_proxy: proxy set to %s://%s' % (type, phost))
        return request
1097
1098
class AbstractAuthHandler(urllib_request.BaseHandler):
    """A custom abstract authentication handler for all http authentications.

    Provides the meat to handle authentication errors and
    preventively set authentication headers after the first
    successful authentication.

    This can be used for http and proxy, as well as for basic, negotiate and
    digest authentications.

    This provides an unified interface for all authentication handlers
    (urllib_request provides far too many with different policies).

    The interaction between this handler and the urllib_request
    framework is not obvious, it works as follows:

    opener.open(request) is called:

    - that may trigger http_request which will add an authentication header
      (self.build_header) if enough info is available.

    - the request is sent to the server,

    - if an authentication error is received self.auth_required is called,
      we acquire the authentication info in the error headers and call
      self.auth_match to check that we are able to try the
      authentication and complete the authentication parameters,

    - we call parent.open(request), that may trigger http_request
      and will add a header (self.build_header), but here we have
      all the required info (keep in mind that the request and
      authentication used in the recursive calls are really (and must be)
      the *same* objects).

    - if the call returns a response, the authentication has been
      successful and the request authentication parameters have been updated.
    """

    scheme = None
    """The scheme as it appears in the server header (lower cased)"""

    _max_retry = 3
    """We don't want to retry authenticating endlessly"""

    requires_username = True
    """Whether the auth mechanism requires a username."""

    # The following attributes should be defined by daughter
    # classes:
    # - auth_required_header:  the header received from the server
    # - auth_header: the header sent in the request

    def __init__(self):
        # We want to know when we enter into a try/fail cycle of
        # authentications so we initialize to None to indicate that we aren't
        # in such a cycle by default.
        self._retry_count = None

    def _parse_auth_header(self, server_header):
        """Parse the authentication header.

        :param server_header: The value of the header sent by the server
            describing the authentication request.

        :return: A tuple (scheme, remainder) scheme being the first word in the
            given header (lower cased), remainder may be None.
        """
        try:
            scheme, remainder = server_header.split(None, 1)
        except ValueError:
            # Scheme-only header, e.g. 'WWW-Authenticate: Negotiate'
            scheme = server_header
            remainder = None
        return (scheme.lower(), remainder)

    def update_auth(self, auth, key, value):
        """Update a value in auth marking the auth as modified if needed"""
        old_value = auth.get(key, None)
        if old_value != value:
            auth[key] = value
            # The 'modified' flag tells auth_required whether retrying with
            # the new parameters can make a difference.
            auth['modified'] = True

    def auth_required(self, request, headers):
        """Retry the request if the auth scheme is ours.

        :param request: The request needing authentication.
        :param headers: The headers for the authentication error response.
        :return: None or the response for the authenticated request.
        """
        # Don't try to authenticate endlessly
        if self._retry_count is None:
            # The retry being recursive calls, None identifies the first retry
            self._retry_count = 1
        else:
            self._retry_count += 1
            if self._retry_count > self._max_retry:
                # Let's be ready for next round
                self._retry_count = None
                return None
        server_headers = headers.get_all(self.auth_required_header)
        if not server_headers:
            # The http error MUST have the associated
            # header. This must never happen in production code.
            trace.mutter('%s not found', self.auth_required_header)
            return None

        auth = self.get_auth(request)
        auth['modified'] = False
        # Put some common info in auth if the caller didn't
        if auth.get('path', None) is None:
            parsed_url = urlutils.URL.from_string(request.get_full_url())
            self.update_auth(auth, 'protocol', parsed_url.scheme)
            self.update_auth(auth, 'host', parsed_url.host)
            self.update_auth(auth, 'port', parsed_url.port)
            self.update_auth(auth, 'path', parsed_url.path)
        # FIXME: the auth handler should be selected at a single place instead
        # of letting all handlers try to match all headers, but the current
        # design doesn't allow a simple implementation.
        for server_header in server_headers:
            # Several schemes can be proposed by the server, try to match each
            # one in turn
            matching_handler = self.auth_match(server_header, auth)
            if matching_handler:
                # auth_match may have modified auth (by adding the
                # password or changing the realm, for example)
                if (request.get_header(self.auth_header, None) is not None
                        and not auth['modified']):
                    # We already tried that, give up
                    return None

                # Only the most secure scheme proposed by the server should be
                # used, since the handlers use 'handler_order' to describe that
                # property, the first handler tried takes precedence, the
                # others should not attempt to authenticate if the best one
                # failed.
                best_scheme = auth.get('best_scheme', None)
                if best_scheme is None:
                    # At that point, even if the current handler doesn't
                    # succeed the credentials are wrong (or incomplete), but
                    # we know that the associated scheme should be used.
                    best_scheme = auth['best_scheme'] = self.scheme
                if best_scheme != self.scheme:
                    continue

                if self.requires_username and auth.get('user', None) is None:
                    # Without a known user, we can't authenticate
                    return None

                # Housekeeping
                request.connection.cleanup_pipe()
                # Retry the request with an authentication header added
                response = self.parent.open(request)
                if response:
                    self.auth_successful(request, response)
                return response
        # We are not qualified to handle the authentication.
        # Note: the authentication error handling will try all
        # available handlers. If one of them authenticates
        # successfully, a response will be returned. If none of
        # them succeeds, None will be returned and the error
        # handler will raise the 401 'Unauthorized' or the 407
        # 'Proxy Authentication Required' error.
        return None

    def add_auth_header(self, request, header):
        """Add the authentication header to the request"""
        request.add_unredirected_header(self.auth_header, header)

    def auth_match(self, header, auth):
        """Check that we are able to handle that authentication scheme.

        The request authentication parameters may need to be
        updated with info from the server. Some of these
        parameters, when combined, are considered to be the
        authentication key, if one of them change the
        authentication result may change. 'user' and 'password'
        are examples, but some auth schemes may have others
        (digest's nonce is an example, digest's nonce_count is a
        *counter-example*). Such parameters must be updated by
        using the update_auth() method.

        :param header: The authentication header sent by the server.
        :param auth: The auth parameters already known. They may be
             updated.
        :returns: True if we can try to handle the authentication.
        """
        raise NotImplementedError(self.auth_match)

    def build_auth_header(self, auth, request):
        """Build the value of the header used to authenticate.

        :param auth: The auth parameters needed to build the header.
        :param request: The request needing authentication.

        :return: None or header.
        """
        raise NotImplementedError(self.build_auth_header)

    def auth_successful(self, request, response):
        """The authentication was successful for the request.

        Additional info may be available in the response.

        :param request: The successfully authenticated request.
        :param response: The server response (may contain auth info).
        """
        # It may happen that we need to reconnect later, let's be ready
        self._retry_count = None

    def get_user_password(self, auth):
        """Ask user for a password if none is already available.

        :param auth: authentication info gathered so far (from the initial url
            and then during dialog with the server).
        :return: A (user, password) tuple; either may be None if the user
            did not provide it.
        """
        auth_conf = config.AuthenticationConfig()
        user = auth.get('user', None)
        password = auth.get('password', None)
        realm = auth['realm']
        port = auth.get('port', None)

        if user is None:
            user = auth_conf.get_user(auth['protocol'], auth['host'],
                                      port=port, path=auth['path'],
                                      realm=realm, ask=True,
                                      prompt=self.build_username_prompt(auth))
        if user is not None and password is None:
            password = auth_conf.get_password(
                auth['protocol'], auth['host'], user,
                port=port,
                path=auth['path'], realm=realm,
                prompt=self.build_password_prompt(auth))

        return user, password

    def _build_password_prompt(self, auth):
        """Build a prompt taking the protocol used into account.

        The AuthHandler is used by http and https, we want that information in
        the prompt, so we build the prompt from the authentication dict which
        contains all the needed parts.

        Also, http and proxy AuthHandlers present different prompts to the
        user. The daughter classes should implement a public
        build_password_prompt using this method.
        """
        prompt = u'%s' % auth['protocol'].upper() + u' %(user)s@%(host)s'
        realm = auth['realm']
        if realm is not None:
            prompt += u", Realm: '%s'" % realm
        prompt += u' password'
        return prompt

    def _build_username_prompt(self, auth):
        """Build a prompt taking the protocol used into account.

        The AuthHandler is used by http and https, we want that information in
        the prompt, so we build the prompt from the authentication dict which
        contains all the needed parts.

        Also, http and proxy AuthHandlers present different prompts to the
        user. The daughter classes should implement a public
        build_username_prompt using this method.
        """
        prompt = u'%s' % auth['protocol'].upper() + u' %(host)s'
        realm = auth['realm']
        if realm is not None:
            prompt += u", Realm: '%s'" % realm
        prompt += u' username'
        return prompt

    def http_request(self, request):
        """Insert an authentication header if information is available"""
        auth = self.get_auth(request)
        if self.auth_params_reusable(auth):
            self.add_auth_header(
                request, self.build_auth_header(auth, request))
        return request

    https_request = http_request  # FIXME: Need test
1378
1379
class NegotiateAuthHandler(AbstractAuthHandler):
    """An authentication handler that handles WWW-Authenticate: Negotiate.

    At the moment this handler supports just Kerberos. In the future,
    NTLM support may also be added.
    """

    scheme = 'negotiate'
    handler_order = 480
    requires_username = False

    def auth_match(self, header, auth):
        """See AbstractAuthHandler.auth_match."""
        scheme, raw_auth = self._parse_auth_header(header)
        if scheme != self.scheme:
            return False
        self.update_auth(auth, 'scheme', scheme)
        resp = self._auth_match_kerberos(auth)
        if resp is None:
            return False
        # Optionally should try to authenticate using NTLM here
        self.update_auth(auth, 'negotiate_response', resp)
        return True

    def _auth_match_kerberos(self, auth):
        """Try to create a GSSAPI response for authenticating against a host.

        :return: The GSSAPI client response string, or None when the
            kerberos module is unavailable or the GSSAPI exchange fails.
        """
        # The kerberos module is optional: import it lazily and remember
        # the outcome in module-level state so we only try once.
        global kerberos, checked_kerberos
        if kerberos is None and not checked_kerberos:
            try:
                import kerberos
            except ImportError:
                kerberos = None
            checked_kerberos = True
        if kerberos is None:
            return None
        ret, vc = kerberos.authGSSClientInit("HTTP@%(host)s" % auth)
        if ret < 1:
            trace.warning('Unable to create GSSAPI context for %s: %d',
                          auth['host'], ret)
            return None
        ret = kerberos.authGSSClientStep(vc, "")
        if ret < 0:
            trace.mutter('authGSSClientStep failed: %d', ret)
            return None
        return kerberos.authGSSClientResponse(vc)

    def build_auth_header(self, auth, request):
        """See AbstractAuthHandler.build_auth_header."""
        return "Negotiate %s" % auth['negotiate_response']

    def auth_params_reusable(self, auth):
        # If the auth scheme is known, it means a previous
        # authentication was successful, all information is
        # available, no further checks are needed.
        return (auth.get('scheme', None) == 'negotiate' and
                auth.get('negotiate_response', None) is not None)
1434
1435
class BasicAuthHandler(AbstractAuthHandler):
    """Handle the HTTP 'Basic' authentication scheme."""

    scheme = 'basic'
    handler_order = 500
    # Matches the realm announced in the WWW-Authenticate header
    auth_regexp = re.compile('realm="([^"]*)"', re.I)

    def build_auth_header(self, auth, request):
        """Build the 'Basic <base64(user:password)>' header value."""
        credentials = '%s:%s' % (auth['user'], auth['password'])
        encoded = base64.b64encode(credentials.encode('utf-8'))
        return 'Basic ' + encoded.decode('ascii')

    def extract_realm(self, header_value):
        """Find the realm in an authentication header value.

        :return: A (match, realm) tuple; both are None when no realm is
            present in the header.
        """
        match = self.auth_regexp.search(header_value)
        if match is None:
            return None, None
        return match, match.group(1)

    def auth_match(self, header, auth):
        """See AbstractAuthHandler.auth_match."""
        scheme, raw_auth = self._parse_auth_header(header)
        if scheme != self.scheme:
            return False

        match, realm = self.extract_realm(raw_auth)
        if match is None:
            return False
        # Record what the server announced
        self.update_auth(auth, 'scheme', scheme)
        self.update_auth(auth, 'realm', realm)
        credentials_missing = (auth.get('user', None) is None
                               or auth.get('password', None) is None)
        if credentials_missing:
            user, password = self.get_user_password(auth)
            self.update_auth(auth, 'user', user)
            self.update_auth(auth, 'password', password)
        return True

    def auth_params_reusable(self, auth):
        # If the auth scheme is known, it means a previous
        # authentication was successful, all information is
        # available, no further checks are needed.
        return auth.get('scheme', None) == 'basic'
1478
1479
def get_digest_algorithm_impls(algorithm):
    """Return the (H, KD) functions for a digest algorithm name.

    H hashes a byte string; KD combines a secret and data by hashing
    "secret:data". Both are None when the algorithm is unsupported.
    """
    if algorithm == 'MD5':
        def H(x):
            return osutils.md5(x).hexdigest()
    elif algorithm == 'SHA':
        H = osutils.sha_string
    else:
        return None, None

    def KD(secret, data):
        return H(("%s:%s" % (secret, data)).encode('utf-8'))

    return H, KD
1491
1492
def get_new_cnonce(nonce, nonce_count):
    """Generate a fresh 16-char client nonce for digest authentication."""
    seed = '%s:%d:%s:%s' % (nonce, nonce_count, time.ctime(),
                            osutils.rand_chars(8))
    return osutils.sha_string(seed.encode('utf-8'))[:16]
1497
1498
class DigestAuthHandler(AbstractAuthHandler):
    """A custom digest authentication handler."""

    scheme = 'digest'
    # Before basic as digest is a bit more secure and should be preferred
    handler_order = 490

    def auth_params_reusable(self, auth):
        # If the auth scheme is known, it means a previous
        # authentication was successful, all information is
        # available, no further checks are needed.
        return auth.get('scheme', None) == 'digest'

    def auth_match(self, header, auth):
        """Check that the digest challenge in ``header`` can be answered.

        On success the challenge parameters (realm, nonce, qop, algorithm,
        opaque) are recorded in ``auth`` for later use by
        build_auth_header.

        :param header: The authenticate header value sent by the server.
        :param auth: The auth parameters dict, updated in place.
        :returns: True if this is a digest challenge we can handle.
        """
        scheme, raw_auth = self._parse_auth_header(header)
        if scheme != self.scheme:
            return False

        # Put the requested authentication info into a dict
        req_auth = urllib_request.parse_keqv_list(
            urllib_request.parse_http_list(raw_auth))

        # Check that we can handle that authentication
        qop = req_auth.get('qop', None)
        if qop != 'auth':  # No auth-int so far
            return False

        # Only MD5 and SHA are implemented, see get_digest_algorithm_impls
        H, KD = get_digest_algorithm_impls(req_auth.get('algorithm', 'MD5'))
        if H is None:
            return False

        realm = req_auth.get('realm', None)
        # Put useful info into auth
        self.update_auth(auth, 'scheme', scheme)
        self.update_auth(auth, 'realm', realm)
        if auth.get('user', None) is None or auth.get('password', None) is None:
            user, password = self.get_user_password(auth)
            self.update_auth(auth, 'user', user)
            self.update_auth(auth, 'password', password)

        try:
            if req_auth.get('algorithm', None) is not None:
                self.update_auth(auth, 'algorithm', req_auth.get('algorithm'))
            nonce = req_auth['nonce']
            if auth.get('nonce', None) != nonce:
                # A new nonce, never used
                self.update_auth(auth, 'nonce_count', 0)
            self.update_auth(auth, 'nonce', nonce)
            self.update_auth(auth, 'qop', qop)
            auth['opaque'] = req_auth.get('opaque', None)
        except KeyError:
            # Some required field is not there
            return False

        return True

    def build_auth_header(self, auth, request):
        """Build the digest Authorization header value for ``request``.

        Uses the credentials and challenge parameters recorded by
        auth_match, and increments the nonce use count as a side effect.
        """
        selector = request.selector
        url_scheme, url_selector = splittype(selector)
        sel_host, uri = splithost(url_selector)

        # A1 covers the credentials, A2 the request itself
        A1 = ('%s:%s:%s' %
              (auth['user'], auth['realm'], auth['password'])).encode('utf-8')
        A2 = ('%s:%s' % (request.get_method(), uri)).encode('utf-8')

        nonce = auth['nonce']
        qop = auth['qop']

        # nc is the count of requests made with this server nonce,
        # rendered as 8 hex digits
        nonce_count = auth['nonce_count'] + 1
        ncvalue = '%08x' % nonce_count
        cnonce = get_new_cnonce(nonce, nonce_count)

        H, KD = get_digest_algorithm_impls(auth.get('algorithm', 'MD5'))
        nonce_data = '%s:%s:%s:%s:%s' % (nonce, ncvalue, cnonce, qop, H(A2))
        request_digest = KD(H(A1), nonce_data)

        header = 'Digest '
        header += 'username="%s", realm="%s", nonce="%s"' % (auth['user'],
                                                             auth['realm'],
                                                             nonce)
        header += ', uri="%s"' % uri
        header += ', cnonce="%s", nc=%s' % (cnonce, ncvalue)
        header += ', qop="%s"' % qop
        header += ', response="%s"' % request_digest
        # Append the optional fields
        opaque = auth.get('opaque', None)
        if opaque:
            header += ', opaque="%s"' % opaque
        if auth.get('algorithm', None):
            header += ', algorithm="%s"' % auth.get('algorithm')

        # We have used the nonce once more, update the count
        auth['nonce_count'] = nonce_count

        return header
1594
1595
class HTTPAuthHandler(AbstractAuthHandler):
    """Custom http authentication handler.

    Send the authentication preventively to avoid the roundtrip
    associated with the 401 error and keep the relevant info in
    the auth request attribute.
    """

    auth_required_header = 'www-authenticate'
    auth_header = 'Authorization'

    def get_auth(self, request):
        """Get the auth params from the request"""
        return request.auth

    def set_auth(self, request, auth):
        """Set the auth params for the request"""
        request.auth = auth

    def build_password_prompt(self, auth):
        # The default prompt is fine for plain http auth
        return self._build_password_prompt(auth)

    def build_username_prompt(self, auth):
        # The default prompt is fine for plain http auth
        return self._build_username_prompt(auth)

    def http_error_401(self, req, fp, code, msg, headers):
        # Triggered on 401: delegate to the common auth handling
        return self.auth_required(req, headers)
1623
1624
class ProxyAuthHandler(AbstractAuthHandler):
    """Custom proxy authentication handler.

    Send the authentication preventively to avoid the roundtrip
    associated with the 407 error and keep the relevant info in
    the proxy_auth request attribute.
    """

    auth_required_header = 'proxy-authenticate'
    # FIXME: the correct capitalization is Proxy-Authorization,
    # but python-2.4 urllib_request.Request insist on using capitalize()
    # instead of title().
    auth_header = 'Proxy-authorization'

    def get_auth(self, request):
        """Get the auth params from the request"""
        return request.proxy_auth

    def set_auth(self, request, auth):
        """Set the auth params for the request"""
        request.proxy_auth = auth

    def build_password_prompt(self, auth):
        # Reuse the common prompt, prefixed to mention the proxy
        prompt = self._build_password_prompt(auth)
        prompt = u'Proxy ' + prompt
        return prompt

    def build_username_prompt(self, auth):
        # Reuse the common prompt, prefixed to mention the proxy
        prompt = self._build_username_prompt(auth)
        prompt = u'Proxy ' + prompt
        return prompt

    def http_error_407(self, req, fp, code, msg, headers):
        # Triggered on 407: delegate to the common auth handling
        return self.auth_required(req, headers)
1659
1660
class HTTPBasicAuthHandler(BasicAuthHandler, HTTPAuthHandler):
    """Custom http basic authentication handler."""
1663
1664
class ProxyBasicAuthHandler(BasicAuthHandler, ProxyAuthHandler):
    """Custom proxy basic authentication handler."""
1667
1668
class HTTPDigestAuthHandler(DigestAuthHandler, HTTPAuthHandler):
    """Custom http digest authentication handler."""
1671
1672
class ProxyDigestAuthHandler(DigestAuthHandler, ProxyAuthHandler):
    """Custom proxy digest authentication handler."""
1675
1676
class HTTPNegotiateAuthHandler(NegotiateAuthHandler, HTTPAuthHandler):
    """Custom http negotiate authentication handler."""
1679
1680
class ProxyNegotiateAuthHandler(NegotiateAuthHandler, ProxyAuthHandler):
    """Custom proxy negotiate authentication handler."""
1683
1684
class HTTPErrorProcessor(urllib_request.HTTPErrorProcessor):
    """Process HTTP error responses.

    We don't really process the errors, quite the contrary
    instead, we leave our Transport handle them.
    """

    accepted_errors = [200,  # Ok
                       201,
                       202,
                       204,
                       206,  # Partial content
                       400,
                       403,
                       404,  # Not found
                       405,  # Method not allowed
                       406,  # Not Acceptable
                       409,  # Conflict
                       416,  # Range not satisfiable
                       422,  # Unprocessible entity
                       501,  # Not implemented
                       ]
    """The error codes the caller will handle.

    This can be specialized in the request on a case-by case basis, but the
    common cases are covered here.
    """

    def http_response(self, request, response):
        """Pass accepted status codes through; escalate the others."""
        code, msg, hdrs = response.code, response.msg, response.info()

        if code in self.accepted_errors:
            return response
        return self.parent.error('http', request, response, code, msg, hdrs)

    https_response = http_response
1722
1723
class HTTPDefaultErrorHandler(urllib_request.HTTPDefaultErrorHandler):
    """Translate common errors into Breezy Exceptions"""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise a Breezy exception for any unhandled http error code."""
        if code != 403:
            raise errors.UnexpectedHttpStatus(
                req.get_full_url(), code,
                'Unable to handle http code: %s' % msg)
        raise errors.TransportError(
            'Server refuses to fulfill the request (403 Forbidden)'
            ' for %s' % req.get_full_url())
1736
1737
class Opener(object):
    """A wrapper around urllib_request.build_opener

    Daughter classes can override to build their own specific opener
    """
    # TODO: Provides hooks for daughter classes.

    def __init__(self,
                 connection=ConnectionHandler,
                 redirect=HTTPRedirectHandler,
                 error=HTTPErrorProcessor,
                 report_activity=None,
                 ca_certs=None):
        handlers = [
            connection(report_activity=report_activity, ca_certs=ca_certs),
            redirect, error,
            ProxyHandler(),
            HTTPBasicAuthHandler(),
            HTTPDigestAuthHandler(),
            HTTPNegotiateAuthHandler(),
            ProxyBasicAuthHandler(),
            ProxyDigestAuthHandler(),
            ProxyNegotiateAuthHandler(),
            HTTPHandler,
            HTTPSHandler,
            HTTPDefaultErrorHandler,
        ]
        self._opener = urllib_request.build_opener(*handlers)

        self.open = self._opener.open
        if DEBUG >= 9:
            # When dealing with handler order, it's easy to mess
            # things up; dumping the opener internals shows which
            # handler is used, when and for what.
            import pprint
            pprint.pprint(self._opener.__dict__)
1773
1774
1775class HttpTransport(ConnectedTransport):
1776    """HTTP Client implementations.
1777
1778    The protocol can be given as e.g. http+urllib://host/ to use a particular
1779    implementation.
1780    """
1781
1782    # _unqualified_scheme: "http" or "https"
1783    # _scheme: may have "+pycurl", etc
1784
1785    # In order to debug we have to issue our traces in sync with
1786    # httplib, which use print :(
1787    _debuglevel = 0
1788
1789    def __init__(self, base, _from_transport=None, ca_certs=None):
1790        """Set the base path where files will be stored."""
1791        proto_match = re.match(r'^(https?)(\+\w+)?://', base)
1792        if not proto_match:
1793            raise AssertionError("not a http url: %r" % base)
1794        self._unqualified_scheme = proto_match.group(1)
1795        super(HttpTransport, self).__init__(
1796            base, _from_transport=_from_transport)
1797        self._medium = None
1798        # range hint is handled dynamically throughout the life
1799        # of the transport object. We start by trying multi-range
1800        # requests and if the server returns bogus results, we
1801        # retry with single range requests and, finally, we
1802        # forget about range if the server really can't
1803        # understand. Once acquired, this piece of info is
1804        # propagated to clones.
1805        if _from_transport is not None:
1806            self._range_hint = _from_transport._range_hint
1807            self._opener = _from_transport._opener
1808        else:
1809            self._range_hint = 'multi'
1810            self._opener = Opener(
1811                report_activity=self._report_activity, ca_certs=ca_certs)
1812
    def request(self, method, url, fields=None, headers=None, **urlopen_kw):
        """Perform an http request on the transport's shared connection.

        :param method: The http method: 'GET', 'POST', 'HEAD', ...
        :param url: The absolute url of the resource.
        :param fields: Optional dict of form fields, urlencoded into the
            request body (mutually exclusive with the 'body' keyword).
        :param headers: Optional dict of extra request headers.
        :param urlopen_kw: Only 'body' and 'retries' are understood.
        :returns: A Urllib3LikeResponse wrapping the urllib response.
        """
        body = urlopen_kw.pop('body', None)
        if fields is not None:
            data = urlencode(fields).encode()
            if body is not None:
                raise ValueError(
                    'body and fields are mutually exclusive')
        else:
            data = body
        if headers is None:
            headers = {}
        request = Request(method, url, data, headers)
        # Redirections are only followed when retries are allowed
        request.follow_redirections = (urlopen_kw.pop('retries', 0) > 0)
        if urlopen_kw:
            raise NotImplementedError(
                'unknown arguments: %r' % urlopen_kw.keys())
        connection = self._get_connection()
        if connection is not None:
            # Give back shared info
            request.connection = connection
            (auth, proxy_auth) = self._get_credentials()
            # Clean the httplib.HTTPConnection pipeline in case the previous
            # request couldn't do it
            connection.cleanup_pipe()
        else:
            # First request, initialize credentials.
            # scheme and realm will be set by the _urllib2_wrappers.AuthHandler
            auth = self._create_auth()
            # Proxy initialization will be done by the first proxied request
            proxy_auth = dict()
        # Ensure authentication info is provided
        request.auth = auth
        request.proxy_auth = proxy_auth

        if self._debuglevel > 0:
            print('perform: %s base: %s, url: %s' % (request.method, self.base,
                                                     request.get_full_url()))
        response = self._opener.open(request)
        if self._get_connection() is not request.connection:
            # First connection or reconnection
            self._set_connection(request.connection,
                                 (request.auth, request.proxy_auth))
        else:
            # http may change the credentials while keeping the
            # connection opened
            self._update_credentials((request.auth, request.proxy_auth))

        code = response.code
        if (request.follow_redirections is False
                and code in (301, 302, 303, 307, 308)):
            raise errors.RedirectRequested(request.get_full_url(),
                                           request.redirected_to,
                                           is_permanent=(code in (301, 308)))

        if request.redirected_to is not None:
            trace.mutter('redirected from: %s to: %s' % (request.get_full_url(),
                                                         request.redirected_to))

        class Urllib3LikeResponse(object):
            """Adapt the urllib response to a urllib3-like interface."""

            def __init__(self, actual):
                self._actual = actual
                # Body cache, filled lazily by the 'data' property
                self._data = None

            def getheader(self, name, default=None):
                if self._actual.headers is None:
                    raise http_client.ResponseNotReady()
                return self._actual.headers.get(name, default)

            def getheaders(self):
                if self._actual.headers is None:
                    raise http_client.ResponseNotReady()
                return list(self._actual.headers.items())

            @property
            def status(self):
                return self._actual.code

            @property
            def reason(self):
                return self._actual.reason

            @property
            def data(self):
                # Read the whole body once and cache it
                if self._data is None:
                    self._data = self._actual.read()
                return self._data

            @property
            def text(self):
                # 204 is 'No Content': there is no body to decode
                if self.status == 204:
                    return None
                charset = cgi.parse_header(
                    self._actual.headers['Content-Type'])[1].get('charset')
                if charset:
                    return self.data.decode(charset)
                else:
                    return self.data.decode()

            def read(self, amt=None):
                return self._actual.read(amt)

            def readlines(self):
                return self._actual.readlines()

            def readline(self, size=-1):
                return self._actual.readline(size)

        return Urllib3LikeResponse(response)
1922
1923    def disconnect(self):
1924        connection = self._get_connection()
1925        if connection is not None:
1926            connection.close()
1927
1928    def has(self, relpath):
1929        """Does the target location exist?
1930        """
1931        response = self._head(relpath)
1932
1933        code = response.status
1934        if code == 200:  # "ok",
1935            return True
1936        else:
1937            return False
1938
1939    def get(self, relpath):
1940        """Get the file at the given relative path.
1941
1942        :param relpath: The relative path to the file
1943        """
1944        code, response_file = self._get(relpath, None)
1945        return response_file
1946
1947    def _get(self, relpath, offsets, tail_amount=0):
1948        """Get a file, or part of a file.
1949
1950        :param relpath: Path relative to transport base URL
1951        :param offsets: None to get the whole file;
1952            or  a list of _CoalescedOffset to fetch parts of a file.
1953        :param tail_amount: The amount to get from the end of the file.
1954
1955        :returns: (http_code, result_file)
1956        """
1957        abspath = self._remote_path(relpath)
1958        headers = {}
1959        if offsets or tail_amount:
1960            range_header = self._attempted_range_header(offsets, tail_amount)
1961            if range_header is not None:
1962                bytes = 'bytes=' + range_header
1963                headers = {'Range': bytes}
1964        else:
1965            range_header = None
1966
1967        response = self.request('GET', abspath, headers=headers)
1968
1969        if response.status == 404:  # not found
1970            raise errors.NoSuchFile(abspath)
1971        elif response.status == 416:
1972            # We don't know which, but one of the ranges we specified was
1973            # wrong.
1974            raise errors.InvalidHttpRange(abspath, range_header,
1975                                          'Server return code %d' % response.status)
1976        elif response.status == 400:
1977            if range_header:
1978                # We don't know which, but one of the ranges we specified was
1979                # wrong.
1980                raise errors.InvalidHttpRange(
1981                    abspath, range_header,
1982                    'Server return code %d' % response.status)
1983            else:
1984                raise errors.BadHttpRequest(abspath, response.reason)
1985        elif response.status not in (200, 206):
1986            raise errors.UnexpectedHttpStatus(abspath, response.status)
1987
1988        data = handle_response(
1989            abspath, response.status, response.getheader, response)
1990        return response.status, data
1991
1992    def _remote_path(self, relpath):
1993        """See ConnectedTransport._remote_path.
1994
1995        user and passwords are not embedded in the path provided to the server.
1996        """
1997        url = self._parsed_url.clone(relpath)
1998        url.user = url.quoted_user = None
1999        url.password = url.quoted_password = None
2000        url.scheme = self._unqualified_scheme
2001        return str(url)
2002
2003    def _create_auth(self):
2004        """Returns a dict containing the credentials provided at build time."""
2005        auth = dict(host=self._parsed_url.host, port=self._parsed_url.port,
2006                    user=self._parsed_url.user, password=self._parsed_url.password,
2007                    protocol=self._unqualified_scheme,
2008                    path=self._parsed_url.path)
2009        return auth
2010
2011    def get_smart_medium(self):
2012        """See Transport.get_smart_medium."""
2013        if self._medium is None:
2014            # Since medium holds some state (smart server probing at least), we
2015            # need to keep it around. Note that this is needed because medium
2016            # has the same 'base' attribute as the transport so it can't be
2017            # shared between transports having different bases.
2018            self._medium = SmartClientHTTPMedium(self)
2019        return self._medium
2020
2021    def _degrade_range_hint(self, relpath, ranges):
2022        if self._range_hint == 'multi':
2023            self._range_hint = 'single'
2024            mutter('Retry "%s" with single range request' % relpath)
2025        elif self._range_hint == 'single':
2026            self._range_hint = None
2027            mutter('Retry "%s" without ranges' % relpath)
2028        else:
2029            # We tried all the tricks, but nothing worked, caller must reraise.
2030            return False
2031        return True
2032
    # _coalesce_offsets is a helper for readv, it tries to combine ranges
    # without degrading readv performance. _bytes_to_read_before_seek is the
    # value used for the limit parameter and has been tuned for other
    # transports. For HTTP, the name is inappropriate but the parameter is
    # still useful and helps reduce the number of chunks in the response. The
    # overhead for a chunk (headers, length, footer around the data itself) is
    # variable but around 50 bytes. We use 128 to reduce the range specifiers
    # that appear in the header; some servers (notably Apache) enforce a
    # maximum length for a header and issue a '400: Bad request' error when
    # too many ranges are specified.
    _bytes_to_read_before_seek = 128
    # No limit on the number of offsets that get combined into one, we are
    # trying to avoid downloading the whole file.
    _max_readv_combine = 0
    # By default Apache has a limit of ~400 ranges before replying with a 400
    # Bad Request. So we go underneath that amount to be safe.
    _max_get_ranges = 200
    # We impose no limit on the range size. But see _pycurl.py for a different
    # use.
    _get_max_size = 0
2053
    def _readv(self, relpath, offsets):
        """Get parts of the file at the given relative path.

        :param relpath: Path of the file, relative to the transport base.
        :param offsets: A list of (offset, size) tuples.
        :returns: A generator of (offset, data) tuples, served in the
            order the offsets were requested.
        """
        # offsets may be a generator, we will iterate it several times, so
        # build a list
        offsets = list(offsets)

        try_again = True
        retried_offset = None
        while try_again:
            try_again = False

            # Coalesce the offsets to minimize the GET requests issued
            sorted_offsets = sorted(offsets)
            coalesced = self._coalesce_offsets(
                sorted_offsets, limit=self._max_readv_combine,
                fudge_factor=self._bytes_to_read_before_seek,
                max_size=self._get_max_size)

            # Turn it into a list, we will iterate it several times
            coalesced = list(coalesced)
            if 'http' in debug.debug_flags:
                mutter('http readv of %s  offsets => %s collapsed %s',
                       relpath, len(offsets), len(coalesced))

            # Cache the data read, but only until it's been used
            data_map = {}
            # We will iterate on the data received from the GET requests and
            # serve the corresponding offsets respecting the initial order. We
            # need an offset iterator for that.
            iter_offsets = iter(offsets)
            try:
                cur_offset_and_size = next(iter_offsets)
            except StopIteration:
                # Nothing was requested, we are done
                return

            try:
                for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
                    # Split the received chunk
                    for offset, size in cur_coal.ranges:
                        start = cur_coal.start + offset
                        rfile.seek(start, os.SEEK_SET)
                        data = rfile.read(size)
                        data_len = len(data)
                        if data_len != size:
                            raise errors.ShortReadvError(relpath, start, size,
                                                         actual=data_len)
                        if (start, size) == cur_offset_and_size:
                            # The offset requested are sorted as the coalesced
                            # ones, no need to cache. Win !
                            yield cur_offset_and_size[0], data
                            try:
                                cur_offset_and_size = next(iter_offsets)
                            except StopIteration:
                                return
                        else:
                            # Different sorting. We need to cache.
                            data_map[(start, size)] = data

                    # Yield everything we can
                    while cur_offset_and_size in data_map:
                        # Clean the cached data since we use it
                        # XXX: will break if offsets contains duplicates --
                        # vila20071129
                        this_data = data_map.pop(cur_offset_and_size)
                        yield cur_offset_and_size[0], this_data
                        try:
                            cur_offset_and_size = next(iter_offsets)
                        except StopIteration:
                            return

            except (errors.ShortReadvError, errors.InvalidRange,
                    errors.InvalidHttpRange, errors.HttpBoundaryMissing) as e:
                mutter('Exception %r: %s during http._readv', e, e)
                if (not isinstance(e, errors.ShortReadvError)
                        or retried_offset == cur_offset_and_size):
                    # We don't degrade the range hint for ShortReadvError since
                    # they do not indicate a problem with the server ability to
                    # handle ranges. Except when we fail to get back a required
                    # offset twice in a row. In that case, falling back to
                    # single range or whole file should help.
                    if not self._degrade_range_hint(relpath, coalesced):
                        raise
                # Some offsets may have been already processed, so we retry
                # only the unsuccessful ones.
                offsets = [cur_offset_and_size] + [o for o in iter_offsets]
                retried_offset = cur_offset_and_size
                try_again = True
2145
    def _coalesce_readv(self, relpath, coalesced):
        """Issue several GET requests to satisfy the coalesced offsets.

        The requests are batched according to the current range hint:
        whole file, one range at a time, or up to _max_get_ranges ranges
        (and at most _get_max_size bytes when that limit is set) per GET.

        :returns: A generator of (coalesced_offset, file_like) tuples.
        """

        def get_and_yield(relpath, coalesced):
            # Issue one GET for the given batch and pair each coalesced
            # offset with the resulting file object.
            if coalesced:
                # Note that the _get below may raise
                # errors.InvalidHttpRange. It's the caller's responsibility to
                # decide how to retry since it may provide different coalesced
                # offsets.
                code, rfile = self._get(relpath, coalesced)
                for coal in coalesced:
                    yield coal, rfile

        if self._range_hint is None:
            # Download whole file
            for c, rfile in get_and_yield(relpath, coalesced):
                yield c, rfile
        else:
            total = len(coalesced)
            if self._range_hint == 'multi':
                max_ranges = self._max_get_ranges
            elif self._range_hint == 'single':
                max_ranges = total
            else:
                raise AssertionError("Unknown _range_hint %r"
                                     % (self._range_hint,))
            # TODO: Some web servers may ignore the range requests and return
            # the whole file, we may want to detect that and avoid further
            # requests.
            # Hint: test_readv_multiple_get_requests will fail once we do that
            cumul = 0
            ranges = []
            for coal in coalesced:
                if ((self._get_max_size > 0
                     and cumul + coal.length > self._get_max_size) or
                        len(ranges) >= max_ranges):
                    # Get that much and yield
                    for c, rfile in get_and_yield(relpath, ranges):
                        yield c, rfile
                    # Restart with the current offset
                    ranges = [coal]
                    cumul = coal.length
                else:
                    ranges.append(coal)
                    cumul += coal.length
            # Get the rest and yield
            for c, rfile in get_and_yield(relpath, ranges):
                yield c, rfile
2194
2195    def recommended_page_size(self):
2196        """See Transport.recommended_page_size().
2197
2198        For HTTP we suggest a large page size to reduce the overhead
2199        introduced by latency.
2200        """
2201        return 64 * 1024
2202
2203    def _post(self, body_bytes):
2204        """POST body_bytes to .bzr/smart on this transport.
2205
2206        :returns: (response code, response body file-like object).
2207        """
2208        # TODO: Requiring all the body_bytes to be available at the beginning of
2209        # the POST may require large client buffers.  It would be nice to have
2210        # an interface that allows streaming via POST when possible (and
2211        # degrades to a local buffer when not).
2212        abspath = self._remote_path('.bzr/smart')
2213        response = self.request(
2214            'POST', abspath, body=body_bytes,
2215            headers={'Content-Type': 'application/octet-stream'})
2216        if response.status not in (200, 403):
2217            raise errors.UnexpectedHttpStatus(abspath, response.status)
2218        code = response.status
2219        data = handle_response(
2220            abspath, code, response.getheader, response)
2221        return code, data
2222
2223    def _head(self, relpath):
2224        """Request the HEAD of a file.
2225
2226        Performs the request and leaves callers handle the results.
2227        """
2228        abspath = self._remote_path(relpath)
2229        response = self.request('HEAD', abspath)
2230        if response.status not in (200, 404):
2231            raise errors.UnexpectedHttpStatus(abspath, response.status)
2232
2233        return response
2234
2235        raise NotImplementedError(self._post)
2236
2237    def put_file(self, relpath, f, mode=None):
2238        """Copy the file-like object into the location.
2239
2240        :param relpath: Location to put the contents, relative to base.
2241        :param f:       File-like object.
2242        """
2243        raise errors.TransportNotPossible('http PUT not supported')
2244
2245    def mkdir(self, relpath, mode=None):
2246        """Create a directory at the given path."""
2247        raise errors.TransportNotPossible('http does not support mkdir()')
2248
2249    def rmdir(self, relpath):
2250        """See Transport.rmdir."""
2251        raise errors.TransportNotPossible('http does not support rmdir()')
2252
2253    def append_file(self, relpath, f, mode=None):
2254        """Append the text in the file-like object into the final
2255        location.
2256        """
2257        raise errors.TransportNotPossible('http does not support append()')
2258
2259    def copy(self, rel_from, rel_to):
2260        """Copy the item at rel_from to the location at rel_to"""
2261        raise errors.TransportNotPossible('http does not support copy()')
2262
2263    def copy_to(self, relpaths, other, mode=None, pb=None):
2264        """Copy a set of entries from self into another Transport.
2265
2266        :param relpaths: A list/generator of entries to be copied.
2267
2268        TODO: if other is LocalTransport, is it possible to
2269              do better than put(get())?
2270        """
2271        # At this point HttpTransport might be able to check and see if
2272        # the remote location is the same, and rather than download, and
2273        # then upload, it could just issue a remote copy_this command.
2274        if isinstance(other, HttpTransport):
2275            raise errors.TransportNotPossible(
2276                'http cannot be the target of copy_to()')
2277        else:
2278            return super(HttpTransport, self).\
2279                copy_to(relpaths, other, mode=mode, pb=pb)
2280
2281    def move(self, rel_from, rel_to):
2282        """Move the item at rel_from to the location at rel_to"""
2283        raise errors.TransportNotPossible('http does not support move()')
2284
2285    def delete(self, relpath):
2286        """Delete the item at relpath"""
2287        raise errors.TransportNotPossible('http does not support delete()')
2288
2289    def external_url(self):
2290        """See breezy.transport.Transport.external_url."""
2291        # HTTP URL's are externally usable as long as they don't mention their
2292        # implementation qualifier
2293        url = self._parsed_url.clone()
2294        url.scheme = self._unqualified_scheme
2295        return str(url)
2296
2297    def is_readonly(self):
2298        """See Transport.is_readonly."""
2299        return True
2300
2301    def listable(self):
2302        """See Transport.listable."""
2303        return False
2304
2305    def stat(self, relpath):
2306        """Return the stat information for a file.
2307        """
2308        raise errors.TransportNotPossible('http does not support stat()')
2309
2310    def lock_read(self, relpath):
2311        """Lock the given file for shared (read) access.
2312        :return: A lock object, which should be passed to Transport.unlock()
2313        """
2314        # The old RemoteBranch ignore lock for reading, so we will
2315        # continue that tradition and return a bogus lock object.
2316        class BogusLock(object):
2317            def __init__(self, path):
2318                self.path = path
2319
2320            def unlock(self):
2321                pass
2322        return BogusLock(relpath)
2323
2324    def lock_write(self, relpath):
2325        """Lock the given file for exclusive (write) access.
2326        WARNING: many transports do not support this, so trying avoid using it
2327
2328        :return: A lock object, which should be passed to Transport.unlock()
2329        """
2330        raise errors.TransportNotPossible('http does not support lock_write()')
2331
2332    def _attempted_range_header(self, offsets, tail_amount):
2333        """Prepare a HTTP Range header at a level the server should accept.
2334
2335        :return: the range header representing offsets/tail_amount or None if
2336            no header can be built.
2337        """
2338
2339        if self._range_hint == 'multi':
2340            # Generate the header describing all offsets
2341            return self._range_header(offsets, tail_amount)
2342        elif self._range_hint == 'single':
2343            # Combine all the requested ranges into a single
2344            # encompassing one
2345            if len(offsets) > 0:
2346                if tail_amount not in (0, None):
2347                    # Nothing we can do here to combine ranges with tail_amount
2348                    # in a single range, just returns None. The whole file
2349                    # should be downloaded.
2350                    return None
2351                else:
2352                    start = offsets[0].start
2353                    last = offsets[-1]
2354                    end = last.start + last.length - 1
2355                    whole = self._coalesce_offsets([(start, end - start + 1)],
2356                                                   limit=0, fudge_factor=0)
2357                    return self._range_header(list(whole), 0)
2358            else:
2359                # Only tail_amount, requested, leave range_header
2360                # do its work
2361                return self._range_header(offsets, tail_amount)
2362        else:
2363            return None
2364
2365    @staticmethod
2366    def _range_header(ranges, tail_amount):
2367        """Turn a list of bytes ranges into a HTTP Range header value.
2368
2369        :param ranges: A list of _CoalescedOffset
2370        :param tail_amount: The amount to get from the end of the file.
2371
2372        :return: HTTP range header string.
2373
2374        At least a non-empty ranges *or* a tail_amount must be
2375        provided.
2376        """
2377        strings = []
2378        for offset in ranges:
2379            strings.append('%d-%d' % (offset.start,
2380                                      offset.start + offset.length - 1))
2381
2382        if tail_amount:
2383            strings.append('-%d' % tail_amount)
2384
2385        return ','.join(strings)
2386
    def _redirected_to(self, source, target):
        """Returns a transport suitable to re-issue a redirected request.

        :param source: The source url as returned by the server.
        :param target: The target url as returned by the server.

        The redirection can be handled only if the relpath involved is not
        renamed by the redirection.

        :returns: A transport
        :raise UnusableRedirect: when the URL can not be reinterpreted
        """
        parsed_source = self._split_url(source)
        parsed_target = self._split_url(target)
        pl = len(self._parsed_url.path)
        # determine the excess tail - the relative path that was in
        # the original request but not part of this transports' URL.
        # NOTE(review): this assumes parsed_source.path starts with this
        # transport's path (i.e. the redirected request was issued below
        # this transport's base) — confirm against callers.
        excess_tail = parsed_source.path[pl:].strip("/")
        if not parsed_target.path.endswith(excess_tail):
            # The final part of the url has been renamed, we can't handle the
            # redirection.
            raise UnusableRedirect(
                source, target, "final part of the url was renamed")

        target_path = parsed_target.path
        if excess_tail:
            # Drop the tail that was in the redirect but not part of
            # the path of this transport.
            target_path = target_path[:-len(excess_tail)]

        if parsed_target.scheme in ('http', 'https'):
            # Same protocol family (i.e. http[s]), we will preserve the same
            # http client implementation when a redirection occurs from one to
            # the other (otherwise users may be surprised that bzr switches
            # from one implementation to the other, and devs may suffer
            # debugging it).
            if (parsed_target.scheme == self._unqualified_scheme
                and parsed_target.host == self._parsed_url.host
                and parsed_target.port == self._parsed_url.port
                and (parsed_target.user is None or
                     parsed_target.user == self._parsed_url.user)):
                # If a user is specified, it should match, we don't care about
                # passwords, wrong passwords will be rejected anyway.
                # Same server: keep this connection and clone onto the new
                # path.
                return self.clone(target_path)
            else:
                # Rebuild the url preserving the scheme qualification and the
                # credentials (if they don't apply, the redirected to server
                # will tell us, but if they do apply, we avoid prompting the
                # user)
                redir_scheme = parsed_target.scheme
                new_url = self._unsplit_url(redir_scheme,
                                            self._parsed_url.user,
                                            self._parsed_url.password,
                                            parsed_target.host, parsed_target.port,
                                            target_path)
                return transport.get_transport_from_url(new_url)
        else:
            # Redirected to a different protocol
            new_url = self._unsplit_url(parsed_target.scheme,
                                        parsed_target.user,
                                        parsed_target.password,
                                        parsed_target.host, parsed_target.port,
                                        target_path)
            return transport.get_transport_from_url(new_url)
2451
2452    def _options(self, relpath):
2453        abspath = self._remote_path(relpath)
2454        resp = self.request('OPTIONS', abspath)
2455        if resp.status == 404:
2456            raise errors.NoSuchFile(abspath)
2457        if resp.status in (403, 405):
2458            raise errors.InvalidHttpResponse(
2459                abspath,
2460                "OPTIONS not supported or forbidden for remote URL")
2461        return resp.getheaders()
2462
2463
2464# TODO: May be better located in smart/medium.py with the other
2465# SmartMedium classes
class SmartClientHTTPMedium(medium.SmartClientMedium):
    """A SmartClientMedium that tunnels the smart protocol over HTTP POST."""

    def __init__(self, http_transport):
        super(SmartClientHTTPMedium, self).__init__(http_transport.base)
        # Avoid a reference cycle between the http transport and its
        # associated medium: the transport outlives the medium, so the
        # medium keeps only a weak reference back to its transport.
        self._http_transport_ref = weakref.ref(http_transport)

    def get_request(self):
        return SmartClientHTTPMediumRequest(self)

    def should_probe(self):
        return True

    def remote_path_from_transport(self, transport):
        # Strip the optional 'bzr+' prefix so both URLs share a scheme
        # before computing the relative path.
        transport_base = transport.base
        if transport_base.startswith('bzr+'):
            transport_base = transport_base[4:]
        return urlutils.unquote(
            urlutils.relative_url(self.base, transport_base))

    def send_http_smart_request(self, bytes):
        try:
            # Dereference the weak ref to recover the http transport.
            t = self._http_transport_ref()
            code, body_filelike = t._post(bytes)
            if code != 200:
                raise errors.UnexpectedHttpStatus(
                    t._remote_path('.bzr/smart'), code)
        except (errors.InvalidHttpResponse, errors.ConnectionReset) as e:
            raise errors.SmartProtocolError(str(e))
        return body_filelike

    def _report_activity(self, bytes, direction):
        """See SmartMedium._report_activity.

        Does nothing; the underlying plain HTTP transport will report the
        activity that this medium would report.
        """

    def disconnect(self):
        """See SmartClientMedium.disconnect()."""
        self._http_transport_ref().disconnect()
2515
2516
2517# TODO: May be better located in smart/medium.py with the other
2518# SmartMediumRequest classes
class SmartClientHTTPMediumRequest(medium.SmartClientMediumRequest):
    """A SmartClientMediumRequest that works with an HTTP medium."""

    def __init__(self, client_medium):
        medium.SmartClientMediumRequest.__init__(self, client_medium)
        # Outgoing bytes are accumulated here until writing finishes,
        # because the whole request body must be POSTed at once.
        self._buffer = b''

    def _accept_bytes(self, bytes):
        self._buffer = self._buffer + bytes

    def _finished_writing(self):
        self._response_body = self._medium.send_http_smart_request(
            self._buffer)

    def _read_bytes(self, count):
        """See SmartClientMediumRequest._read_bytes."""
        return self._response_body.read(count)

    def _read_line(self):
        line, excess = medium._get_line(self._response_body.read)
        if excess != b'':
            raise AssertionError(
                '_get_line returned excess bytes, but this mediumrequest '
                'cannot handle excess. (%r)' % (excess,))
        return line

    def _finished_reading(self):
        """See SmartClientMediumRequest._finished_reading."""
2548
2549
def unhtml_roughly(maybe_html, length_limit=1000):
    """Very approximate html->text translation, for presenting error bodies.

    :param length_limit: Truncate the result to this many characters.

    >>> unhtml_roughly("<b>bad</b> things happened\\n")
    ' bad  things happened '
    """
    # Replace tags, newlines and non-breaking spaces with a plain space,
    # then truncate to keep error messages bounded.
    text = re.sub(r"(<[^>]*>|\n|&nbsp;)", " ", maybe_html)
    return text[:length_limit]
2559
2560
def get_test_permutations():
    """Return the permutations to be used in testing."""
    from breezy.tests import (
        features,
        http_server,
        )
    permutations = [(HttpTransport, http_server.HttpServer)]
    # Only add an https permutation when the test https server is usable.
    if features.HTTPSServerFeature.available():
        from breezy.tests import (
            https_server,
            ssl_certs,
            )

        class HTTPS_transport(HttpTransport):
            """HttpTransport that trusts the test-suite CA certificate."""

            def __init__(self, base, _from_transport=None):
                super(HTTPS_transport, self).__init__(
                    base, _from_transport=_from_transport,
                    ca_certs=ssl_certs.build_path('ca.crt'))

        permutations.append((HTTPS_transport, https_server.HTTPSServer))
    return permutations
2584