1from __future__ import absolute_import 2from contextlib import contextmanager 3import zlib 4import io 5import logging 6from socket import timeout as SocketTimeout 7from socket import error as SocketError 8 9try: 10 import brotli 11except ImportError: 12 brotli = None 13 14from ._collections import HTTPHeaderDict 15from .exceptions import ( 16 BodyNotHttplibCompatible, ProtocolError, DecodeError, ReadTimeoutError, 17 ResponseNotChunked, IncompleteRead, InvalidHeader 18) 19from .packages.six import string_types as basestring, PY3 20from .packages.six.moves import http_client as httplib 21from .connection import HTTPException, BaseSSLError 22from .util.response import is_fp_closed, is_response_to_head 23 24log = logging.getLogger(__name__) 25 26 27class DeflateDecoder(object): 28 29 def __init__(self): 30 self._first_try = True 31 self._data = b'' 32 self._obj = zlib.decompressobj() 33 34 def __getattr__(self, name): 35 return getattr(self._obj, name) 36 37 def decompress(self, data): 38 if not data: 39 return data 40 41 if not self._first_try: 42 return self._obj.decompress(data) 43 44 self._data += data 45 try: 46 decompressed = self._obj.decompress(data) 47 if decompressed: 48 self._first_try = False 49 self._data = None 50 return decompressed 51 except zlib.error: 52 self._first_try = False 53 self._obj = zlib.decompressobj(-zlib.MAX_WBITS) 54 try: 55 return self.decompress(self._data) 56 finally: 57 self._data = None 58 59 60class GzipDecoderState(object): 61 62 FIRST_MEMBER = 0 63 OTHER_MEMBERS = 1 64 SWALLOW_DATA = 2 65 66 67class GzipDecoder(object): 68 69 def __init__(self): 70 self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) 71 self._state = GzipDecoderState.FIRST_MEMBER 72 73 def __getattr__(self, name): 74 return getattr(self._obj, name) 75 76 def decompress(self, data): 77 ret = bytearray() 78 if self._state == GzipDecoderState.SWALLOW_DATA or not data: 79 return bytes(ret) 80 while True: 81 try: 82 ret += self._obj.decompress(data) 83 except zlib.error: 84 previous_state = self._state 85 # Ignore data after the first error 86 self._state = GzipDecoderState.SWALLOW_DATA 87 if previous_state == GzipDecoderState.OTHER_MEMBERS: 88 # Allow trailing garbage acceptable in other gzip clients 89 return bytes(ret) 90 raise 91 data = self._obj.unused_data 92 if not data: 93 return bytes(ret) 94 self._state = GzipDecoderState.OTHER_MEMBERS 95 self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) 96 97 98if brotli is not None: 99 class BrotliDecoder(object): 100 # Supports both 'brotlipy' and 'Brotli' packages 101 # since they share an import name. The top branches 102 # are for 'brotlipy' and bottom branches for 'Brotli' 103 def __init__(self): 104 self._obj = brotli.Decompressor() 105 106 def decompress(self, data): 107 if hasattr(self._obj, 'decompress'): 108 return self._obj.decompress(data) 109 return self._obj.process(data) 110 111 def flush(self): 112 if hasattr(self._obj, 'flush'): 113 return self._obj.flush() 114 return b'' 115 116 117class MultiDecoder(object): 118 """ 119 From RFC7231: 120 If one or more encodings have been applied to a representation, the 121 sender that applied the encodings MUST generate a Content-Encoding 122 header field that lists the content codings in the order in which 123 they were applied. 124 """ 125 126 def __init__(self, modes): 127 self._decoders = [_get_decoder(m.strip()) for m in modes.split(',')] 128 129 def flush(self): 130 return self._decoders[0].flush() 131 132 def decompress(self, data): 133 for d in reversed(self._decoders): 134 data = d.decompress(data) 135 return data 136 137 138def _get_decoder(mode): 139 if ',' in mode: 140 return MultiDecoder(mode) 141 142 if mode == 'gzip': 143 return GzipDecoder() 144 145 if brotli is not None and mode == 'br': 146 return BrotliDecoder() 147 148 return DeflateDecoder() 149 150 151class HTTPResponse(io.IOBase): 152 """ 153 HTTP Response container. 154 155 Backwards-compatible to httplib's HTTPResponse but the response ``body`` is 156 loaded and decoded on-demand when the ``data`` property is accessed. This 157 class is also compatible with the Python standard library's :mod:`io` 158 module, and can hence be treated as a readable object in the context of that 159 framework. 160 161 Extra parameters for behaviour not present in httplib.HTTPResponse: 162 163 :param preload_content: 164 If True, the response's body will be preloaded during construction. 165 166 :param decode_content: 167 If True, will attempt to decode the body based on the 168 'content-encoding' header. 169 170 :param original_response: 171 When this HTTPResponse wrapper is generated from an httplib.HTTPResponse 172 object, it's convenient to include the original for debug purposes. It's 173 otherwise unused. 174 175 :param retries: 176 The retries contains the last :class:`~urllib3.util.retry.Retry` that 177 was used during the request. 178 179 :param enforce_content_length: 180 Enforce content length checking. Body returned by server must match 181 value of Content-Length header, if present. Otherwise, raise error. 182 """ 183 184 CONTENT_DECODERS = ['gzip', 'deflate'] 185 if brotli is not None: 186 CONTENT_DECODERS += ['br'] 187 REDIRECT_STATUSES = [301, 302, 303, 307, 308] 188 189 def __init__(self, body='', headers=None, status=0, version=0, reason=None, 190 strict=0, preload_content=True, decode_content=True, 191 original_response=None, pool=None, connection=None, msg=None, 192 retries=None, enforce_content_length=False, 193 request_method=None, request_url=None): 194 195 if isinstance(headers, HTTPHeaderDict): 196 self.headers = headers 197 else: 198 self.headers = HTTPHeaderDict(headers) 199 self.status = status 200 self.version = version 201 self.reason = reason 202 self.strict = strict 203 self.decode_content = decode_content 204 self.retries = retries 205 self.enforce_content_length = enforce_content_length 206 207 self._decoder = None 208 self._body = None 209 self._fp = None 210 self._original_response = original_response 211 self._fp_bytes_read = 0 212 self.msg = msg 213 self._request_url = request_url 214 215 if body and isinstance(body, (basestring, bytes)): 216 self._body = body 217 218 self._pool = pool 219 self._connection = connection 220 221 if hasattr(body, 'read'): 222 self._fp = body 223 224 # Are we using the chunked-style of transfer encoding? 225 self.chunked = False 226 self.chunk_left = None 227 tr_enc = self.headers.get('transfer-encoding', '').lower() 228 # Don't incur the penalty of creating a list and then discarding it 229 encodings = (enc.strip() for enc in tr_enc.split(",")) 230 if "chunked" in encodings: 231 self.chunked = True 232 233 # Determine length of response 234 self.length_remaining = self._init_length(request_method) 235 236 # If requested, preload the body. 237 if preload_content and not self._body: 238 self._body = self.read(decode_content=decode_content) 239 240 def get_redirect_location(self): 241 """ 242 Should we redirect and where to? 243 244 :returns: Truthy redirect location string if we got a redirect status 245 code and valid location. ``None`` if redirect status and no 246 location. ``False`` if not a redirect status code. 247 """ 248 if self.status in self.REDIRECT_STATUSES: 249 return self.headers.get('location') 250 251 return False 252 253 def release_conn(self): 254 if not self._pool or not self._connection: 255 return 256 257 self._pool._put_conn(self._connection) 258 self._connection = None 259 260 @property 261 def data(self): 262 # For backwords-compat with earlier urllib3 0.4 and earlier. 263 if self._body: 264 return self._body 265 266 if self._fp: 267 return self.read(cache_content=True) 268 269 @property 270 def connection(self): 271 return self._connection 272 273 def isclosed(self): 274 return is_fp_closed(self._fp) 275 276 def tell(self): 277 """ 278 Obtain the number of bytes pulled over the wire so far. May differ from 279 the amount of content returned by :meth:``HTTPResponse.read`` if bytes 280 are encoded on the wire (e.g, compressed). 281 """ 282 return self._fp_bytes_read 283 284 def _init_length(self, request_method): 285 """ 286 Set initial length value for Response content if available. 287 """ 288 length = self.headers.get('content-length') 289 290 if length is not None: 291 if self.chunked: 292 # This Response will fail with an IncompleteRead if it can't be 293 # received as chunked. This method falls back to attempt reading 294 # the response before raising an exception. 295 log.warning("Received response with both Content-Length and " 296 "Transfer-Encoding set. This is expressly forbidden " 297 "by RFC 7230 sec 3.3.2. Ignoring Content-Length and " 298 "attempting to process response as Transfer-Encoding: " 299 "chunked.") 300 return None 301 302 try: 303 # RFC 7230 section 3.3.2 specifies multiple content lengths can 304 # be sent in a single Content-Length header 305 # (e.g. Content-Length: 42, 42). This line ensures the values 306 # are all valid ints and that as long as the `set` length is 1, 307 # all values are the same. Otherwise, the header is invalid. 308 lengths = set([int(val) for val in length.split(',')]) 309 if len(lengths) > 1: 310 raise InvalidHeader("Content-Length contained multiple " 311 "unmatching values (%s)" % length) 312 length = lengths.pop() 313 except ValueError: 314 length = None 315 else: 316 if length < 0: 317 length = None 318 319 # Convert status to int for comparison 320 # In some cases, httplib returns a status of "_UNKNOWN" 321 try: 322 status = int(self.status) 323 except ValueError: 324 status = 0 325 326 # Check for responses that shouldn't include a body 327 if status in (204, 304) or 100 <= status < 200 or request_method == 'HEAD': 328 length = 0 329 330 return length 331 332 def _init_decoder(self): 333 """ 334 Set-up the _decoder attribute if necessary. 335 """ 336 # Note: content-encoding value should be case-insensitive, per RFC 7230 337 # Section 3.2 338 content_encoding = self.headers.get('content-encoding', '').lower() 339 if self._decoder is None: 340 if content_encoding in self.CONTENT_DECODERS: 341 self._decoder = _get_decoder(content_encoding) 342 elif ',' in content_encoding: 343 encodings = [ 344 e.strip() for e in content_encoding.split(',') 345 if e.strip() in self.CONTENT_DECODERS] 346 if len(encodings): 347 self._decoder = _get_decoder(content_encoding) 348 349 DECODER_ERROR_CLASSES = (IOError, zlib.error) 350 if brotli is not None: 351 DECODER_ERROR_CLASSES += (brotli.error,) 352 353 def _decode(self, data, decode_content, flush_decoder): 354 """ 355 Decode the data passed in and potentially flush the decoder. 356 """ 357 if not decode_content: 358 return data 359 360 try: 361 if self._decoder: 362 data = self._decoder.decompress(data) 363 except self.DECODER_ERROR_CLASSES as e: 364 content_encoding = self.headers.get('content-encoding', '').lower() 365 raise DecodeError( 366 "Received response with content-encoding: %s, but " 367 "failed to decode it." % content_encoding, e) 368 if flush_decoder: 369 data += self._flush_decoder() 370 371 return data 372 373 def _flush_decoder(self): 374 """ 375 Flushes the decoder. Should only be called if the decoder is actually 376 being used. 377 """ 378 if self._decoder: 379 buf = self._decoder.decompress(b'') 380 return buf + self._decoder.flush() 381 382 return b'' 383 384 @contextmanager 385 def _error_catcher(self): 386 """ 387 Catch low-level python exceptions, instead re-raising urllib3 388 variants, so that low-level exceptions are not leaked in the 389 high-level api. 390 391 On exit, release the connection back to the pool. 392 """ 393 clean_exit = False 394 395 try: 396 try: 397 yield 398 399 except SocketTimeout: 400 # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but 401 # there is yet no clean way to get at it from this context. 402 raise ReadTimeoutError(self._pool, None, 'Read timed out.') 403 404 except BaseSSLError as e: 405 # FIXME: Is there a better way to differentiate between SSLErrors? 406 if 'read operation timed out' not in str(e): # Defensive: 407 # This shouldn't happen but just in case we're missing an edge 408 # case, let's avoid swallowing SSL errors. 409 raise 410 411 raise ReadTimeoutError(self._pool, None, 'Read timed out.') 412 413 except (HTTPException, SocketError) as e: 414 # This includes IncompleteRead. 415 raise ProtocolError('Connection broken: %r' % e, e) 416 417 # If no exception is thrown, we should avoid cleaning up 418 # unnecessarily. 419 clean_exit = True 420 finally: 421 # If we didn't terminate cleanly, we need to throw away our 422 # connection. 423 if not clean_exit: 424 # The response may not be closed but we're not going to use it 425 # anymore so close it now to ensure that the connection is 426 # released back to the pool. 427 if self._original_response: 428 self._original_response.close() 429 430 # Closing the response may not actually be sufficient to close 431 # everything, so if we have a hold of the connection close that 432 # too. 433 if self._connection: 434 self._connection.close() 435 436 # If we hold the original response but it's closed now, we should 437 # return the connection back to the pool. 438 if self._original_response and self._original_response.isclosed(): 439 self.release_conn() 440 441 def read(self, amt=None, decode_content=None, cache_content=False): 442 """ 443 Similar to :meth:`httplib.HTTPResponse.read`, but with two additional 444 parameters: ``decode_content`` and ``cache_content``. 445 446 :param amt: 447 How much of the content to read. If specified, caching is skipped 448 because it doesn't make sense to cache partial content as the full 449 response. 450 451 :param decode_content: 452 If True, will attempt to decode the body based on the 453 'content-encoding' header. 454 455 :param cache_content: 456 If True, will save the returned data such that the same result is 457 returned despite of the state of the underlying file object. This 458 is useful if you want the ``.data`` property to continue working 459 after having ``.read()`` the file object. (Overridden if ``amt`` is 460 set.) 461 """ 462 self._init_decoder() 463 if decode_content is None: 464 decode_content = self.decode_content 465 466 if self._fp is None: 467 return 468 469 flush_decoder = False 470 data = None 471 472 with self._error_catcher(): 473 if amt is None: 474 # cStringIO doesn't like amt=None 475 data = self._fp.read() 476 flush_decoder = True 477 else: 478 cache_content = False 479 data = self._fp.read(amt) 480 if amt != 0 and not data: # Platform-specific: Buggy versions of Python. 481 # Close the connection when no data is returned 482 # 483 # This is redundant to what httplib/http.client _should_ 484 # already do. However, versions of python released before 485 # December 15, 2012 (http://bugs.python.org/issue16298) do 486 # not properly close the connection in all cases. There is 487 # no harm in redundantly calling close. 488 self._fp.close() 489 flush_decoder = True 490 if self.enforce_content_length and self.length_remaining not in (0, None): 491 # This is an edge case that httplib failed to cover due 492 # to concerns of backward compatibility. We're 493 # addressing it here to make sure IncompleteRead is 494 # raised during streaming, so all calls with incorrect 495 # Content-Length are caught. 496 raise IncompleteRead(self._fp_bytes_read, self.length_remaining) 497 498 if data: 499 self._fp_bytes_read += len(data) 500 if self.length_remaining is not None: 501 self.length_remaining -= len(data) 502 503 data = self._decode(data, decode_content, flush_decoder) 504 505 if cache_content: 506 self._body = data 507 508 return data 509 510 def stream(self, amt=2**16, decode_content=None): 511 """ 512 A generator wrapper for the read() method. A call will block until 513 ``amt`` bytes have been read from the connection or until the 514 connection is closed. 515 516 :param amt: 517 How much of the content to read. The generator will return up to 518 much data per iteration, but may return less. This is particularly 519 likely when using compressed data. However, the empty string will 520 never be returned. 521 522 :param decode_content: 523 If True, will attempt to decode the body based on the 524 'content-encoding' header. 525 """ 526 if self.chunked and self.supports_chunked_reads(): 527 for line in self.read_chunked(amt, decode_content=decode_content): 528 yield line 529 else: 530 while not is_fp_closed(self._fp): 531 data = self.read(amt=amt, decode_content=decode_content) 532 533 if data: 534 yield data 535 536 @classmethod 537 def from_httplib(ResponseCls, r, **response_kw): 538 """ 539 Given an :class:`httplib.HTTPResponse` instance ``r``, return a 540 corresponding :class:`urllib3.response.HTTPResponse` object. 541 542 Remaining parameters are passed to the HTTPResponse constructor, along 543 with ``original_response=r``. 544 """ 545 headers = r.msg 546 547 if not isinstance(headers, HTTPHeaderDict): 548 if PY3: 549 headers = HTTPHeaderDict(headers.items()) 550 else: 551 # Python 2.7 552 headers = HTTPHeaderDict.from_httplib(headers) 553 554 # HTTPResponse objects in Python 3 don't have a .strict attribute 555 strict = getattr(r, 'strict', 0) 556 resp = ResponseCls(body=r, 557 headers=headers, 558 status=r.status, 559 version=r.version, 560 reason=r.reason, 561 strict=strict, 562 original_response=r, 563 **response_kw) 564 return resp 565 566 # Backwards-compatibility methods for httplib.HTTPResponse 567 def getheaders(self): 568 return self.headers 569 570 def getheader(self, name, default=None): 571 return self.headers.get(name, default) 572 573 # Backwards compatibility for http.cookiejar 574 def info(self): 575 return self.headers 576 577 # Overrides from io.IOBase 578 def close(self): 579 if not self.closed: 580 self._fp.close() 581 582 if self._connection: 583 self._connection.close() 584 585 @property 586 def closed(self): 587 if self._fp is None: 588 return True 589 elif hasattr(self._fp, 'isclosed'): 590 return self._fp.isclosed() 591 elif hasattr(self._fp, 'closed'): 592 return self._fp.closed 593 else: 594 return True 595 596 def fileno(self): 597 if self._fp is None: 598 raise IOError("HTTPResponse has no file to get a fileno from") 599 elif hasattr(self._fp, "fileno"): 600 return self._fp.fileno() 601 else: 602 raise IOError("The file-like object this HTTPResponse is wrapped " 603 "around has no file descriptor") 604 605 def flush(self): 606 if self._fp is not None and hasattr(self._fp, 'flush'): 607 return self._fp.flush() 608 609 def readable(self): 610 # This method is required for `io` module compatibility. 611 return True 612 613 def readinto(self, b): 614 # This method is required for `io` module compatibility. 615 temp = self.read(len(b)) 616 if len(temp) == 0: 617 return 0 618 else: 619 b[:len(temp)] = temp 620 return len(temp) 621 622 def supports_chunked_reads(self): 623 """ 624 Checks if the underlying file-like object looks like a 625 httplib.HTTPResponse object. We do this by testing for the fp 626 attribute. If it is present we assume it returns raw chunks as 627 processed by read_chunked(). 628 """ 629 return hasattr(self._fp, 'fp') 630 631 def _update_chunk_length(self): 632 # First, we'll figure out length of a chunk and then 633 # we'll try to read it from socket. 634 if self.chunk_left is not None: 635 return 636 line = self._fp.fp.readline() 637 line = line.split(b';', 1)[0] 638 try: 639 self.chunk_left = int(line, 16) 640 except ValueError: 641 # Invalid chunked protocol response, abort. 642 self.close() 643 raise httplib.IncompleteRead(line) 644 645 def _handle_chunk(self, amt): 646 returned_chunk = None 647 if amt is None: 648 chunk = self._fp._safe_read(self.chunk_left) 649 returned_chunk = chunk 650 self._fp._safe_read(2) # Toss the CRLF at the end of the chunk. 651 self.chunk_left = None 652 elif amt < self.chunk_left: 653 value = self._fp._safe_read(amt) 654 self.chunk_left = self.chunk_left - amt 655 returned_chunk = value 656 elif amt == self.chunk_left: 657 value = self._fp._safe_read(amt) 658 self._fp._safe_read(2) # Toss the CRLF at the end of the chunk. 659 self.chunk_left = None 660 returned_chunk = value 661 else: # amt > self.chunk_left 662 returned_chunk = self._fp._safe_read(self.chunk_left) 663 self._fp._safe_read(2) # Toss the CRLF at the end of the chunk. 664 self.chunk_left = None 665 return returned_chunk 666 667 def read_chunked(self, amt=None, decode_content=None): 668 """ 669 Similar to :meth:`HTTPResponse.read`, but with an additional 670 parameter: ``decode_content``. 671 672 :param amt: 673 How much of the content to read. If specified, caching is skipped 674 because it doesn't make sense to cache partial content as the full 675 response. 676 677 :param decode_content: 678 If True, will attempt to decode the body based on the 679 'content-encoding' header. 680 """ 681 self._init_decoder() 682 # FIXME: Rewrite this method and make it a class with a better structured logic. 683 if not self.chunked: 684 raise ResponseNotChunked( 685 "Response is not chunked. " 686 "Header 'transfer-encoding: chunked' is missing.") 687 if not self.supports_chunked_reads(): 688 raise BodyNotHttplibCompatible( 689 "Body should be httplib.HTTPResponse like. " 690 "It should have have an fp attribute which returns raw chunks.") 691 692 with self._error_catcher(): 693 # Don't bother reading the body of a HEAD request. 694 if self._original_response and is_response_to_head(self._original_response): 695 self._original_response.close() 696 return 697 698 # If a response is already read and closed 699 # then return immediately. 700 if self._fp.fp is None: 701 return 702 703 while True: 704 self._update_chunk_length() 705 if self.chunk_left == 0: 706 break 707 chunk = self._handle_chunk(amt) 708 decoded = self._decode(chunk, decode_content=decode_content, 709 flush_decoder=False) 710 if decoded: 711 yield decoded 712 713 if decode_content: 714 # On CPython and PyPy, we should never need to flush the 715 # decoder. However, on Jython we *might* need to, so 716 # lets defensively do it anyway. 717 decoded = self._flush_decoder() 718 if decoded: # Platform-specific: Jython. 719 yield decoded 720 721 # Chunk content ends with \r\n: discard it. 722 while True: 723 line = self._fp.fp.readline() 724 if not line: 725 # Some sites may not end with '\r\n'. 726 break 727 if line == b'\r\n': 728 break 729 730 # We read everything; close the "file". 731 if self._original_response: 732 self._original_response.close() 733 734 def geturl(self): 735 """ 736 Returns the URL that was the source of this response. 737 If the request that generated this response redirected, this method 738 will return the final redirect location. 739 """ 740 if self.retries is not None and len(self.retries.history): 741 return self.retries.history[-1].redirect_location 742 else: 743 return self._request_url 744 745 def __iter__(self): 746 buffer = [b""] 747 for chunk in self.stream(decode_content=True): 748 if b"\n" in chunk: 749 chunk = chunk.split(b"\n") 750 yield b"".join(buffer) + chunk[0] + b"\n" 751 for x in chunk[1:-1]: 752 yield x + b"\n" 753 if chunk[-1]: 754 buffer = [chunk[-1]] 755 else: 756 buffer = [] 757 else: 758 buffer.append(chunk) 759 if buffer: 760 yield b"".join(buffer) 761