1from __future__ import absolute_import
2from contextlib import contextmanager
3import zlib
4import io
5import logging
6from socket import timeout as SocketTimeout
7from socket import error as SocketError
8
9try:
10    import brotli
11except ImportError:
12    brotli = None
13
14from ._collections import HTTPHeaderDict
15from .exceptions import (
16    BodyNotHttplibCompatible, ProtocolError, DecodeError, ReadTimeoutError,
17    ResponseNotChunked, IncompleteRead, InvalidHeader
18)
19from .packages.six import string_types as basestring, PY3
20from .packages.six.moves import http_client as httplib
21from .connection import HTTPException, BaseSSLError
22from .util.response import is_fp_closed, is_response_to_head
23
24log = logging.getLogger(__name__)
25
26
class DeflateDecoder(object):
    """Decoder for 'deflate' content-encoding.

    Tolerates both RFC 1950 (zlib-wrapped) and raw RFC 1951 streams:
    some servers send raw deflate data despite advertising 'deflate'.
    The first successful decode locks in zlib framing; a zlib error on
    the first attempt triggers a replay of all bytes seen so far through
    a raw-deflate decompressor.
    """

    def __init__(self):
        self._first_try = True
        self._data = b''
        self._obj = zlib.decompressobj()

    def __getattr__(self, name):
        # Delegate everything else (flush, unused_data, ...) to zlib's object.
        return getattr(self._obj, name)

    def decompress(self, data):
        if not data:
            return data

        if not self._first_try:
            return self._obj.decompress(data)

        # Still probing the framing: buffer everything seen so far so it
        # can be replayed if the zlib-wrapped attempt fails.
        self._data += data
        try:
            decompressed = self._obj.decompress(data)
        except zlib.error:
            # Not zlib-wrapped; switch to raw deflate and replay the buffer.
            self._first_try = False
            self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
            try:
                return self.decompress(self._data)
            finally:
                self._data = None
        if decompressed:
            # Produced output without error: zlib framing confirmed.
            self._first_try = False
            self._data = None
        return decompressed
58
59
class GzipDecoderState(object):
    """Tracks how far a gzip stream has progressed.

    FIRST_MEMBER: still decoding the initial gzip member.
    OTHER_MEMBERS: at least one member finished; more may follow.
    SWALLOW_DATA: a decode error occurred; ignore all further input.
    """

    FIRST_MEMBER = 0
    OTHER_MEMBERS = 1
    SWALLOW_DATA = 2


class GzipDecoder(object):
    """Decoder for 'gzip' content-encoding, including multi-member streams.

    Trailing garbage after a complete member is tolerated, matching the
    behaviour of other common gzip consumers.
    """

    def __init__(self):
        # wbits = 16 + MAX_WBITS selects gzip (not zlib) framing.
        self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
        self._state = GzipDecoderState.FIRST_MEMBER

    def __getattr__(self, name):
        # Delegate unknown attributes (flush, etc.) to the zlib object.
        return getattr(self._obj, name)

    def decompress(self, data):
        output = bytearray()
        if not data or self._state == GzipDecoderState.SWALLOW_DATA:
            return bytes(output)
        while True:
            try:
                output += self._obj.decompress(data)
            except zlib.error:
                past_first_member = (
                    self._state == GzipDecoderState.OTHER_MEMBERS)
                # Ignore data after the first error
                self._state = GzipDecoderState.SWALLOW_DATA
                if past_first_member:
                    # Allow trailing garbage acceptable in other gzip clients
                    return bytes(output)
                raise
            data = self._obj.unused_data
            if not data:
                return bytes(output)
            # Another gzip member follows; start a fresh decompressor.
            self._state = GzipDecoderState.OTHER_MEMBERS
            self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
96
97
if brotli is not None:
    class BrotliDecoder(object):
        """Decoder for 'br' content-encoding.

        Works with both the 'brotlipy' and 'Brotli' packages, which share
        the ``brotli`` import name but expose different decompressor APIs:
        brotlipy offers ``decompress``/``flush``, Brotli offers ``process``.
        """

        def __init__(self):
            self._obj = brotli.Decompressor()

        def decompress(self, data):
            # brotlipy path first; fall back to Brotli's ``process``.
            decode = getattr(self._obj, 'decompress', None)
            if decode is not None:
                return decode(data)
            return self._obj.process(data)

        def flush(self):
            # Only brotlipy's decompressor needs (or has) an explicit flush.
            finish = getattr(self._obj, 'flush', None)
            if finish is not None:
                return finish()
            return b''
115
116
class MultiDecoder(object):
    """
    Handles stacked content-codings, e.g. ``Content-Encoding: gzip, br``.

    From RFC7231:
        If one or more encodings have been applied to a representation, the
        sender that applied the encodings MUST generate a Content-Encoding
        header field that lists the content codings in the order in which
        they were applied.

    Decoding therefore proceeds in reverse of the listed order.
    """

    def __init__(self, modes):
        # One decoder per listed coding, kept in application order.
        self._decoders = [_get_decoder(mode.strip()) for mode in modes.split(',')]

    def flush(self):
        # The first-listed coding was applied first, so its decoder runs
        # last during decoding and is the one that may hold buffered output.
        return self._decoders[0].flush()

    def decompress(self, data):
        # Undo the encodings last-applied-first.
        for decoder in reversed(self._decoders):
            data = decoder.decompress(data)
        return data
136
137
def _get_decoder(mode):
    """Return a decoder instance for the given Content-Encoding value."""
    # A comma means multiple codings were stacked; delegate the split.
    if ',' in mode:
        return MultiDecoder(mode)
    if mode == 'gzip':
        return GzipDecoder()
    if mode == 'br' and brotli is not None:
        return BrotliDecoder()
    # 'deflate' (the decoder itself tolerates raw and zlib-wrapped input).
    return DeflateDecoder()
149
150
class HTTPResponse(io.IOBase):
    """
    HTTP Response container.

    Backwards-compatible to httplib's HTTPResponse but the response ``body`` is
    loaded and decoded on-demand when the ``data`` property is accessed.  This
    class is also compatible with the Python standard library's :mod:`io`
    module, and can hence be treated as a readable object in the context of that
    framework.

    Extra parameters for behaviour not present in httplib.HTTPResponse:

    :param preload_content:
        If True, the response's body will be preloaded during construction.

    :param decode_content:
        If True, will attempt to decode the body based on the
        'content-encoding' header.

    :param original_response:
        When this HTTPResponse wrapper is generated from an httplib.HTTPResponse
        object, it's convenient to include the original for debug purposes. It's
        otherwise unused.

    :param retries:
        The retries contains the last :class:`~urllib3.util.retry.Retry` that
        was used during the request.

    :param enforce_content_length:
        Enforce content length checking. Body returned by server must match
        value of Content-Length header, if present. Otherwise, raise error.
    """

    # Content-codings we can decode; 'br' only when a brotli package exists.
    CONTENT_DECODERS = ['gzip', 'deflate']
    if brotli is not None:
        CONTENT_DECODERS += ['br']
    REDIRECT_STATUSES = [301, 302, 303, 307, 308]

    def __init__(self, body='', headers=None, status=0, version=0, reason=None,
                 strict=0, preload_content=True, decode_content=True,
                 original_response=None, pool=None, connection=None, msg=None,
                 retries=None, enforce_content_length=False,
                 request_method=None, request_url=None):

        if isinstance(headers, HTTPHeaderDict):
            self.headers = headers
        else:
            self.headers = HTTPHeaderDict(headers)
        self.status = status
        self.version = version
        self.reason = reason
        self.strict = strict
        self.decode_content = decode_content
        self.retries = retries
        self.enforce_content_length = enforce_content_length

        self._decoder = None  # Lazily created by _init_decoder().
        self._body = None  # Cached (possibly decoded) body bytes.
        self._fp = None  # Underlying file-like object when streaming.
        self._original_response = original_response
        self._fp_bytes_read = 0  # Raw bytes pulled off the wire so far.
        self.msg = msg
        self._request_url = request_url

        # A string/bytes body is stored directly; file-like bodies stream.
        if body and isinstance(body, (basestring, bytes)):
            self._body = body

        self._pool = pool
        self._connection = connection

        if hasattr(body, 'read'):
            self._fp = body

        # Are we using the chunked-style of transfer encoding?
        self.chunked = False
        self.chunk_left = None
        tr_enc = self.headers.get('transfer-encoding', '').lower()
        # Don't incur the penalty of creating a list and then discarding it
        encodings = (enc.strip() for enc in tr_enc.split(","))
        if "chunked" in encodings:
            self.chunked = True

        # Determine length of response
        self.length_remaining = self._init_length(request_method)

        # If requested, preload the body.
        if preload_content and not self._body:
            self._body = self.read(decode_content=decode_content)

    def get_redirect_location(self):
        """
        Should we redirect and where to?

        :returns: Truthy redirect location string if we got a redirect status
            code and valid location. ``None`` if redirect status and no
            location. ``False`` if not a redirect status code.
        """
        if self.status in self.REDIRECT_STATUSES:
            return self.headers.get('location')

        return False

    def release_conn(self):
        """Return the underlying connection to its pool, if we hold both."""
        if not self._pool or not self._connection:
            return

        self._pool._put_conn(self._connection)
        self._connection = None

    @property
    def data(self):
        # For backwards-compat with urllib3 0.4 and earlier.
        # NOTE(review): the truthiness check means an already-cached empty
        # body (b'') falls through to the read branch below — confirm this
        # is intended before changing it.
        if self._body:
            return self._body

        if self._fp:
            return self.read(cache_content=True)

    @property
    def connection(self):
        # The connection this response is (or was) being read from; may be
        # None once released back to the pool.
        return self._connection

    def isclosed(self):
        """Return True when the underlying file-like object is closed."""
        return is_fp_closed(self._fp)

    def tell(self):
        """
        Obtain the number of bytes pulled over the wire so far. May differ from
        the amount of content returned by :meth:``HTTPResponse.read`` if bytes
        are encoded on the wire (e.g, compressed).
        """
        return self._fp_bytes_read

    def _init_length(self, request_method):
        """
        Set initial length value for Response content if available.

        Returns the integer Content-Length when it is present, consistent and
        applicable, otherwise ``None`` (read until EOF) or ``0`` for
        responses that must not carry a body.
        """
        length = self.headers.get('content-length')

        if length is not None:
            if self.chunked:
                # This Response will fail with an IncompleteRead if it can't be
                # received as chunked. This method falls back to attempt reading
                # the response before raising an exception.
                log.warning("Received response with both Content-Length and "
                            "Transfer-Encoding set. This is expressly forbidden "
                            "by RFC 7230 sec 3.3.2. Ignoring Content-Length and "
                            "attempting to process response as Transfer-Encoding: "
                            "chunked.")
                return None

            try:
                # RFC 7230 section 3.3.2 specifies multiple content lengths can
                # be sent in a single Content-Length header
                # (e.g. Content-Length: 42, 42). This line ensures the values
                # are all valid ints and that as long as the `set` length is 1,
                # all values are the same. Otherwise, the header is invalid.
                lengths = set([int(val) for val in length.split(',')])
                if len(lengths) > 1:
                    raise InvalidHeader("Content-Length contained multiple "
                                        "unmatching values (%s)" % length)
                length = lengths.pop()
            except ValueError:
                length = None
            else:
                if length < 0:
                    length = None

        # Convert status to int for comparison
        # In some cases, httplib returns a status of "_UNKNOWN"
        try:
            status = int(self.status)
        except ValueError:
            status = 0

        # Check for responses that shouldn't include a body
        if status in (204, 304) or 100 <= status < 200 or request_method == 'HEAD':
            length = 0

        return length

    def _init_decoder(self):
        """
        Set-up the _decoder attribute if necessary.
        """
        # Note: content-encoding value should be case-insensitive, per RFC 7230
        # Section 3.2
        content_encoding = self.headers.get('content-encoding', '').lower()
        if self._decoder is None:
            if content_encoding in self.CONTENT_DECODERS:
                self._decoder = _get_decoder(content_encoding)
            elif ',' in content_encoding:
                # Stacked encodings: only build a decoder when at least one
                # listed coding is something we know how to decode.
                encodings = [
                    e.strip() for e in content_encoding.split(',')
                    if e.strip() in self.CONTENT_DECODERS]
                if len(encodings):
                    self._decoder = _get_decoder(content_encoding)

    # Exceptions a decoder may raise mid-stream; wrapped into DecodeError.
    DECODER_ERROR_CLASSES = (IOError, zlib.error)
    if brotli is not None:
        DECODER_ERROR_CLASSES += (brotli.error,)

    def _decode(self, data, decode_content, flush_decoder):
        """
        Decode the data passed in and potentially flush the decoder.
        """
        if not decode_content:
            return data

        try:
            if self._decoder:
                data = self._decoder.decompress(data)
        except self.DECODER_ERROR_CLASSES as e:
            content_encoding = self.headers.get('content-encoding', '').lower()
            raise DecodeError(
                "Received response with content-encoding: %s, but "
                "failed to decode it." % content_encoding, e)
        if flush_decoder:
            data += self._flush_decoder()

        return data

    def _flush_decoder(self):
        """
        Flushes the decoder. Should only be called if the decoder is actually
        being used.
        """
        if self._decoder:
            buf = self._decoder.decompress(b'')
            return buf + self._decoder.flush()

        return b''

    @contextmanager
    def _error_catcher(self):
        """
        Catch low-level python exceptions, instead re-raising urllib3
        variants, so that low-level exceptions are not leaked in the
        high-level api.

        On exit, release the connection back to the pool.
        """
        clean_exit = False

        try:
            try:
                yield

            except SocketTimeout:
                # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but
                # there is yet no clean way to get at it from this context.
                raise ReadTimeoutError(self._pool, None, 'Read timed out.')

            except BaseSSLError as e:
                # FIXME: Is there a better way to differentiate between SSLErrors?
                if 'read operation timed out' not in str(e):  # Defensive:
                    # This shouldn't happen but just in case we're missing an edge
                    # case, let's avoid swallowing SSL errors.
                    raise

                raise ReadTimeoutError(self._pool, None, 'Read timed out.')

            except (HTTPException, SocketError) as e:
                # This includes IncompleteRead.
                raise ProtocolError('Connection broken: %r' % e, e)

            # If no exception is thrown, we should avoid cleaning up
            # unnecessarily.
            clean_exit = True
        finally:
            # If we didn't terminate cleanly, we need to throw away our
            # connection.
            if not clean_exit:
                # The response may not be closed but we're not going to use it
                # anymore so close it now to ensure that the connection is
                # released back to the pool.
                if self._original_response:
                    self._original_response.close()

                # Closing the response may not actually be sufficient to close
                # everything, so if we have a hold of the connection close that
                # too.
                if self._connection:
                    self._connection.close()

            # If we hold the original response but it's closed now, we should
            # return the connection back to the pool.
            if self._original_response and self._original_response.isclosed():
                self.release_conn()

    def read(self, amt=None, decode_content=None, cache_content=False):
        """
        Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
        parameters: ``decode_content`` and ``cache_content``.

        :param amt:
            How much of the content to read. If specified, caching is skipped
            because it doesn't make sense to cache partial content as the full
            response.

        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.

        :param cache_content:
            If True, will save the returned data such that the same result is
            returned regardless of the state of the underlying file object.
            This is useful if you want the ``.data`` property to continue
            working after having ``.read()`` the file object. (Overridden if
            ``amt`` is set.)
        """
        self._init_decoder()
        if decode_content is None:
            decode_content = self.decode_content

        if self._fp is None:
            return

        flush_decoder = False
        data = None

        with self._error_catcher():
            if amt is None:
                # cStringIO doesn't like amt=None
                data = self._fp.read()
                flush_decoder = True
            else:
                cache_content = False
                data = self._fp.read(amt)
                if amt != 0 and not data:  # Platform-specific: Buggy versions of Python.
                    # Close the connection when no data is returned
                    #
                    # This is redundant to what httplib/http.client _should_
                    # already do.  However, versions of python released before
                    # December 15, 2012 (http://bugs.python.org/issue16298) do
                    # not properly close the connection in all cases. There is
                    # no harm in redundantly calling close.
                    self._fp.close()
                    flush_decoder = True
                    if self.enforce_content_length and self.length_remaining not in (0, None):
                        # This is an edge case that httplib failed to cover due
                        # to concerns of backward compatibility. We're
                        # addressing it here to make sure IncompleteRead is
                        # raised during streaming, so all calls with incorrect
                        # Content-Length are caught.
                        raise IncompleteRead(self._fp_bytes_read, self.length_remaining)

        if data:
            self._fp_bytes_read += len(data)
            if self.length_remaining is not None:
                self.length_remaining -= len(data)

            data = self._decode(data, decode_content, flush_decoder)

            if cache_content:
                self._body = data

        return data

    def stream(self, amt=2**16, decode_content=None):
        """
        A generator wrapper for the read() method. A call will block until
        ``amt`` bytes have been read from the connection or until the
        connection is closed.

        :param amt:
            How much of the content to read. The generator will return up to
            much data per iteration, but may return less. This is particularly
            likely when using compressed data. However, the empty string will
            never be returned.

        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.
        """
        if self.chunked and self.supports_chunked_reads():
            for line in self.read_chunked(amt, decode_content=decode_content):
                yield line
        else:
            while not is_fp_closed(self._fp):
                data = self.read(amt=amt, decode_content=decode_content)

                if data:
                    yield data

    @classmethod
    def from_httplib(ResponseCls, r, **response_kw):
        """
        Given an :class:`httplib.HTTPResponse` instance ``r``, return a
        corresponding :class:`urllib3.response.HTTPResponse` object.

        Remaining parameters are passed to the HTTPResponse constructor, along
        with ``original_response=r``.
        """
        headers = r.msg

        if not isinstance(headers, HTTPHeaderDict):
            if PY3:
                headers = HTTPHeaderDict(headers.items())
            else:
                # Python 2.7
                headers = HTTPHeaderDict.from_httplib(headers)

        # HTTPResponse objects in Python 3 don't have a .strict attribute
        strict = getattr(r, 'strict', 0)
        resp = ResponseCls(body=r,
                           headers=headers,
                           status=r.status,
                           version=r.version,
                           reason=r.reason,
                           strict=strict,
                           original_response=r,
                           **response_kw)
        return resp

    # Backwards-compatibility methods for httplib.HTTPResponse
    def getheaders(self):
        return self.headers

    def getheader(self, name, default=None):
        return self.headers.get(name, default)

    # Backwards compatibility for http.cookiejar
    def info(self):
        return self.headers

    # Overrides from io.IOBase
    def close(self):
        """Close the underlying file object and, if held, the connection."""
        if not self.closed:
            self._fp.close()

        if self._connection:
            self._connection.close()

    @property
    def closed(self):
        # Mirror the closed state of whatever file-like object we wrap;
        # with no file object at all we consider ourselves closed.
        if self._fp is None:
            return True
        elif hasattr(self._fp, 'isclosed'):
            return self._fp.isclosed()
        elif hasattr(self._fp, 'closed'):
            return self._fp.closed
        else:
            return True

    def fileno(self):
        """Return the file descriptor of the wrapped object, if it has one."""
        if self._fp is None:
            raise IOError("HTTPResponse has no file to get a fileno from")
        elif hasattr(self._fp, "fileno"):
            return self._fp.fileno()
        else:
            raise IOError("The file-like object this HTTPResponse is wrapped "
                          "around has no file descriptor")

    def flush(self):
        # Required by io.IOBase; delegates when the wrapped object supports it.
        if self._fp is not None and hasattr(self._fp, 'flush'):
            return self._fp.flush()

    def readable(self):
        # This method is required for `io` module compatibility.
        return True

    def readinto(self, b):
        # This method is required for `io` module compatibility.
        temp = self.read(len(b))
        if len(temp) == 0:
            return 0
        else:
            b[:len(temp)] = temp
            return len(temp)

    def supports_chunked_reads(self):
        """
        Checks if the underlying file-like object looks like a
        httplib.HTTPResponse object. We do this by testing for the fp
        attribute. If it is present we assume it returns raw chunks as
        processed by read_chunked().
        """
        return hasattr(self._fp, 'fp')

    def _update_chunk_length(self):
        # First, we'll figure out length of a chunk and then
        # we'll try to read it from socket.
        if self.chunk_left is not None:
            return
        line = self._fp.fp.readline()
        # Chunk size may carry extension parameters after ';'; ignore them.
        line = line.split(b';', 1)[0]
        try:
            # Chunk sizes are hexadecimal per RFC 7230 section 4.1.
            self.chunk_left = int(line, 16)
        except ValueError:
            # Invalid chunked protocol response, abort.
            self.close()
            raise httplib.IncompleteRead(line)

    def _handle_chunk(self, amt):
        """Read up to ``amt`` bytes from the current chunk (all of it when
        ``amt`` is None), consuming the trailing CRLF when the chunk ends."""
        returned_chunk = None
        if amt is None:
            chunk = self._fp._safe_read(self.chunk_left)
            returned_chunk = chunk
            self._fp._safe_read(2)  # Toss the CRLF at the end of the chunk.
            self.chunk_left = None
        elif amt < self.chunk_left:
            value = self._fp._safe_read(amt)
            self.chunk_left = self.chunk_left - amt
            returned_chunk = value
        elif amt == self.chunk_left:
            value = self._fp._safe_read(amt)
            self._fp._safe_read(2)  # Toss the CRLF at the end of the chunk.
            self.chunk_left = None
            returned_chunk = value
        else:  # amt > self.chunk_left
            returned_chunk = self._fp._safe_read(self.chunk_left)
            self._fp._safe_read(2)  # Toss the CRLF at the end of the chunk.
            self.chunk_left = None
        return returned_chunk

    def read_chunked(self, amt=None, decode_content=None):
        """
        Similar to :meth:`HTTPResponse.read`, but with an additional
        parameter: ``decode_content``.

        :param amt:
            How much of the content to read. If specified, caching is skipped
            because it doesn't make sense to cache partial content as the full
            response.

        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.
        """
        self._init_decoder()
        # FIXME: Rewrite this method and make it a class with a better structured logic.
        if not self.chunked:
            raise ResponseNotChunked(
                "Response is not chunked. "
                "Header 'transfer-encoding: chunked' is missing.")
        if not self.supports_chunked_reads():
            raise BodyNotHttplibCompatible(
                "Body should be httplib.HTTPResponse like. "
                "It should have have an fp attribute which returns raw chunks.")

        with self._error_catcher():
            # Don't bother reading the body of a HEAD request.
            if self._original_response and is_response_to_head(self._original_response):
                self._original_response.close()
                return

            # If a response is already read and closed
            # then return immediately.
            if self._fp.fp is None:
                return

            while True:
                self._update_chunk_length()
                if self.chunk_left == 0:
                    # A zero-length chunk marks the end of the body.
                    break
                chunk = self._handle_chunk(amt)
                decoded = self._decode(chunk, decode_content=decode_content,
                                       flush_decoder=False)
                if decoded:
                    yield decoded

            if decode_content:
                # On CPython and PyPy, we should never need to flush the
                # decoder. However, on Jython we *might* need to, so
                # lets defensively do it anyway.
                decoded = self._flush_decoder()
                if decoded:  # Platform-specific: Jython.
                    yield decoded

            # Chunk content ends with \r\n: discard it.
            while True:
                line = self._fp.fp.readline()
                if not line:
                    # Some sites may not end with '\r\n'.
                    break
                if line == b'\r\n':
                    break

            # We read everything; close the "file".
            if self._original_response:
                self._original_response.close()

    def geturl(self):
        """
        Returns the URL that was the source of this response.
        If the request that generated this response redirected, this method
        will return the final redirect location.
        """
        if self.retries is not None and len(self.retries.history):
            return self.retries.history[-1].redirect_location
        else:
            return self._request_url

    def __iter__(self):
        # Yield the decoded body line by line (splitting on b"\n"), buffering
        # partial lines across stream() chunks.
        buffer = [b""]
        for chunk in self.stream(decode_content=True):
            if b"\n" in chunk:
                chunk = chunk.split(b"\n")
                yield b"".join(buffer) + chunk[0] + b"\n"
                for x in chunk[1:-1]:
                    yield x + b"\n"
                if chunk[-1]:
                    buffer = [chunk[-1]]
                else:
                    buffer = []
            else:
                buffer.append(chunk)
        if buffer:
            yield b"".join(buffer)