1r"""HTTP/1.1 client library 2 3<intro stuff goes here> 4<other stuff, too> 5 6HTTPConnection goes through a number of "states", which define when a client 7may legally make another request or fetch the response for a particular 8request. This diagram details these state transitions: 9 10 (null) 11 | 12 | HTTPConnection() 13 v 14 Idle 15 | 16 | putrequest() 17 v 18 Request-started 19 | 20 | ( putheader() )* endheaders() 21 v 22 Request-sent 23 |\_____________________________ 24 | | getresponse() raises 25 | response = getresponse() | ConnectionError 26 v v 27 Unread-response Idle 28 [Response-headers-read] 29 |\____________________ 30 | | 31 | response.read() | putrequest() 32 v v 33 Idle Req-started-unread-response 34 ______/| 35 / | 36 response.read() | | ( putheader() )* endheaders() 37 v v 38 Request-started Req-sent-unread-response 39 | 40 | response.read() 41 v 42 Request-sent 43 44This diagram presents the following rules: 45 -- a second request may not be started until {response-headers-read} 46 -- a response [object] cannot be retrieved until {request-sent} 47 -- there is no differentiation between an unread response body and a 48 partially read response body 49 50Note: this enforcement is applied by the HTTPConnection class. The 51 HTTPResponse class does not enforce this state machine, which 52 implies sophisticated clients may accelerate the request/response 53 pipeline. Caution should be taken, though: accelerating the states 54 beyond the above pattern may imply knowledge of the server's 55 connection-close behavior for certain requests. For example, it 56 is impossible to tell whether the server will close the connection 57 UNTIL the response headers have been read; this means that further 58 requests cannot be placed into the pipeline until it is known that 59 the server will NOT be closing the connection. 
60 61Logical State __state __response 62------------- ------- ---------- 63Idle _CS_IDLE None 64Request-started _CS_REQ_STARTED None 65Request-sent _CS_REQ_SENT None 66Unread-response _CS_IDLE <response_class> 67Req-started-unread-response _CS_REQ_STARTED <response_class> 68Req-sent-unread-response _CS_REQ_SENT <response_class> 69""" 70 71import email.parser 72import email.message 73import http 74import io 75import re 76import socket 77import collections.abc 78from urllib.parse import urlsplit 79 80# HTTPMessage, parse_headers(), and the HTTP status code constants are 81# intentionally omitted for simplicity 82__all__ = ["HTTPResponse", "HTTPConnection", 83 "HTTPException", "NotConnected", "UnknownProtocol", 84 "UnknownTransferEncoding", "UnimplementedFileMode", 85 "IncompleteRead", "InvalidURL", "ImproperConnectionState", 86 "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", 87 "BadStatusLine", "LineTooLong", "RemoteDisconnected", "error", 88 "responses"] 89 90HTTP_PORT = 80 91HTTPS_PORT = 443 92 93_UNKNOWN = 'UNKNOWN' 94 95# connection states 96_CS_IDLE = 'Idle' 97_CS_REQ_STARTED = 'Request-started' 98_CS_REQ_SENT = 'Request-sent' 99 100 101# hack to maintain backwards compatibility 102globals().update(http.HTTPStatus.__members__) 103 104# another hack to maintain backwards compatibility 105# Mapping status codes to official W3C names 106responses = {v: v.phrase for v in http.HTTPStatus.__members__.values()} 107 108# maximal line length when calling readline(). 
_MAXLINE = 65536
# upper bound on the number of header lines accepted for a single message
_MAXHEADERS = 100

# Header name/value ABNF (http://tools.ietf.org/html/rfc7230#section-3.2)
#
# VCHAR          = %x21-7E
# obs-text       = %x80-FF
# header-field   = field-name ":" OWS field-value OWS
# field-name     = token
# field-value    = *( field-content / obs-fold )
# field-content  = field-vchar [ 1*( SP / HTAB ) field-vchar ]
# field-vchar    = VCHAR / obs-text
#
# obs-fold       = CRLF 1*( SP / HTAB )
#                ; obsolete line folding
#                ; see Section 3.2.4

# token          = 1*tchar
#
# tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
#                / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
#                / DIGIT / ALPHA
#                ; any VCHAR, except delimiters
#
# VCHAR defined in http://tools.ietf.org/html/rfc5234#appendix-B.1

# Both patterns are deliberately more lenient than the RFC grammar above,
# to allow for backwards compatibility.
#
# A header name is anything that does not start with ":" or whitespace and
# contains neither ":" nor a line break.
_header_name_re = re.compile(rb'[^:\s][^:\r\n]*')
# A header value is illegal when it contains a line break that is not
# followed by folding whitespace (a bare CR may also be followed by LF).
_header_value_bad_newline_re = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])')

_is_legal_header_name = _header_name_re.fullmatch
_is_illegal_header_value = _header_value_bad_newline_re.search

# These characters are not allowed within HTTP URL paths.
#  See https://tools.ietf.org/html/rfc3986#section-3.3 and the
#  https://tools.ietf.org/html/rfc3986#appendix-A pchar definition.
# Prevents CVE-2019-9740.  Includes control characters such as \r\n.
# We don't restrict chars above \x7f as putrequest() limits us to ASCII.
_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]')
# Arguably only these _should_ be allowed:
#  _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
# We are more lenient for assumed real world compatibility purposes.

# These characters are not allowed within HTTP method names
# to prevent http header injection.
_contains_disallowed_method_pchar_re = re.compile('[\x00-\x1f]')

# We always set the Content-Length header for these methods because some
# servers will otherwise respond with a 411
_METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'}


def _encode(data, name='data'):
    """Call data.encode("latin-1") but show a better error message.

    *name* is only used to label the offending value in the error text.
    Raises UnicodeEncodeError (with the original exception context
    suppressed) when *data* contains characters outside Latin-1.
    """
    try:
        return data.encode("latin-1")
    except UnicodeEncodeError as err:
        raise UnicodeEncodeError(
            err.encoding,
            err.object,
            err.start,
            err.end,
            "%s (%.20r) is not valid Latin-1. Use %s.encode('utf-8') "
            "if you want to send it encoded in UTF-8." %
            (name.title(), data[err.start:err.end], name)) from None


class HTTPMessage(email.message.Message):
    """email.message.Message subclass used for HTTP header blocks."""

    # XXX The only usage of this method is in
    # http.server.CGIHTTPRequestHandler.  Maybe move the code there so
    # that it doesn't need to be part of the public API.  The API has
    # never been defined so this could cause backwards compatibility
    # issues.

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Returns a list with one "Name: value" string per occurrence of the
        header, in message order.  The name keeps the spelling stored in
        the message, and the value is whatever the message stores for that
        occurrence.  If the header does not occur, an empty list is
        returned.  Case is not important in the header name.

        """
        # Message.keys() yields bare field names (no trailing ':'), so the
        # comparison has to be made against the name alone; comparing
        # against "name:" can never match.
        wanted = name.lower()
        return ['%s: %s' % (key, value)
                for key, value in self.items()
                if key.lower() == wanted]

def _read_headers(fp):
    """Reads potential header lines into a list from a file pointer.

    Length of line is limited by _MAXLINE, and number of
    headers is limited by _MAXHEADERS.

    The terminating blank line (or the empty bytes object produced at
    EOF) is included as the last element of the returned list.  Raises
    LineTooLong or HTTPException when a limit is exceeded.
    """
    headers = []
    while True:
        # ask for one byte more than allowed so an over-long line is
        # detectable without reading it completely
        line = fp.readline(_MAXLINE + 1)
        if len(line) > _MAXLINE:
            raise LineTooLong("header line")
        headers.append(line)
        if len(headers) > _MAXHEADERS:
            raise HTTPException("got more than %d headers" % _MAXHEADERS)
        if line in (b'\r\n', b'\n', b''):
            break
    return headers

def parse_headers(fp, _class=HTTPMessage):
    """Parses only RFC2822 headers from a file pointer.

    email Parser wants to see strings rather than bytes.
    But a TextIOWrapper around self.rfile would buffer too many bytes
    from the stream, bytes which we later need to read as bytes.
    So we read the correct bytes here, as bytes, for email Parser
    to parse.

    Returns an instance of *_class* built by email.parser.Parser.
    """
    headers = _read_headers(fp)
    hstring = b''.join(headers).decode('iso-8859-1')
    return email.parser.Parser(_class=_class).parsestr(hstring)


class HTTPResponse(io.BufferedIOBase):
    """File-like object wrapping one HTTP response read from a socket."""

    # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details.

    # The bytes from the socket object are iso-8859-1 strings.
    # See RFC 2616 sec 2.2 which notes an exception for MIME-encoded
    # text following RFC 2047.  The basic status line parsing only
    # accepts iso-8859-1.

    def __init__(self, sock, debuglevel=0, method=None, url=None):
        """Wrap *sock* (anything with makefile("rb")) for reading.

        *method* is the request method; "HEAD" makes the body empty.
        *url* is stored as the ``url`` attribute.
        """
        # If the response includes a content-length header, we need to
        # make sure that the client doesn't read more than the
        # specified number of bytes.  If it does, it will block until
        # the server times out and closes the connection.  This will
        # happen if a self.fp.read() is done (without a size) whether
        # self.fp is buffered or not.  So, no self.fp.read() by
        # clients unless they know what they are doing.
        self.fp = sock.makefile("rb")
        self.debuglevel = debuglevel
        self._method = method
        # Keep the URL the caller handed in; it used to be silently dropped.
        self.url = url

        # The HTTPResponse object is returned via urllib.  The clients
        # of http and urllib expect different attributes for the
        # headers.  headers is used here and supports urllib.  msg is
        # provided as a backwards compatibility layer for http
        # clients.

        self.headers = self.msg = None

        # from the Status-Line of the response
        self.version = _UNKNOWN # HTTP-Version
        self.status = _UNKNOWN  # Status-Code
        self.reason = _UNKNOWN  # Reason-Phrase

        self.chunked = _UNKNOWN         # is "chunked" being used?
        self.chunk_left = _UNKNOWN      # bytes left to read in current chunk
        self.length = _UNKNOWN          # number of bytes left in response
        self.will_close = _UNKNOWN      # conn will close at end of response

    def _read_status(self):
        """Read and parse the status line.

        Returns a (version, status, reason) tuple where status is an int.
        Raises LineTooLong, RemoteDisconnected or BadStatusLine.
        """
        line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
        if len(line) > _MAXLINE:
            raise LineTooLong("status line")
        if self.debuglevel > 0:
            print("reply:", repr(line))
        if not line:
            # Presumably, the server closed the connection before
            # sending a valid response.
            raise RemoteDisconnected("Remote end closed connection without"
                                     " response")
        try:
            version, status, reason = line.split(None, 2)
        except ValueError:
            try:
                version, status = line.split(None, 1)
                reason = ""
            except ValueError:
                # empty version will cause next test to fail.
                version = ""
        if not version.startswith("HTTP/"):
            self._close_conn()
            raise BadStatusLine(line)

        # The status code is a three-digit number
        try:
            status = int(status)
            if status < 100 or status > 999:
                raise BadStatusLine(line)
        except ValueError:
            raise BadStatusLine(line)
        return version, status, reason

    def begin(self):
        """Read the status line and headers and set up body bookkeeping.

        Does nothing if the headers have already been read.  Sets status,
        reason, version, headers/msg, chunked, length and will_close.
        Raises UnknownProtocol for an unsupported HTTP version.
        """
        if self.headers is not None:
            # we've already started reading the response
            return

        # read until we get a non-100 response
        while True:
            version, status, reason = self._read_status()
            if status != http.HTTPStatus.CONTINUE:
                break
            # skip the header from the 100 response
            skipped_headers = _read_headers(self.fp)
            if self.debuglevel > 0:
                print("headers:", skipped_headers)
            del skipped_headers

        self.code = self.status = status
        self.reason = reason.strip()
        if version in ("HTTP/1.0", "HTTP/0.9"):
            # Some servers might still return "0.9", treat it as 1.0 anyway
            self.version = 10
        elif version.startswith("HTTP/1."):
            self.version = 11   # use HTTP/1.1 code for HTTP/1.x where x>=1
        else:
            raise UnknownProtocol(version)

        self.headers = self.msg = parse_headers(self.fp)

        if self.debuglevel > 0:
            for hdr, val in self.headers.items():
                print("header:", hdr + ":", val)

        # are we using the chunked-style of transfer encoding?
        tr_enc = self.headers.get("transfer-encoding")
        if tr_enc and tr_enc.lower() == "chunked":
            self.chunked = True
            self.chunk_left = None
        else:
            self.chunked = False

        # will the connection close at the end of the response?
        self.will_close = self._check_close()

        # do we have a Content-Length?
        # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked"
        self.length = None
        length = self.headers.get("content-length")
        if length and not self.chunked:
            try:
                self.length = int(length)
            except ValueError:
                self.length = None
            else:
                if self.length < 0:  # ignore nonsensical negative lengths
                    self.length = None
        else:
            self.length = None

        # does the body have a fixed length? (of zero)
        if (status == http.HTTPStatus.NO_CONTENT or
            status == http.HTTPStatus.NOT_MODIFIED or
            100 <= status < 200 or      # 1xx codes
            self._method == "HEAD"):
            self.length = 0

        # if the connection remains open, and we aren't using chunked, and
        # a content-length was not provided, then assume that the connection
        # WILL close.
        if (not self.will_close and
            not self.chunked and
            self.length is None):
            self.will_close = True

    def _check_close(self):
        """Return True if the headers say the connection will be closed."""
        conn = self.headers.get("connection")
        if self.version == 11:
            # An HTTP/1.1 proxy is assumed to stay open unless
            # explicitly closed.
            if conn and "close" in conn.lower():
                return True
            return False

        # Some HTTP/1.0 implementations have support for persistent
        # connections, using rules different than HTTP/1.1.

        # For older HTTP, Keep-Alive indicates persistent connection.
        if self.headers.get("keep-alive"):
            return False

        # At least Akamai returns a "Connection: Keep-Alive" header,
        # which was supposed to be sent by the client.
        if conn and "keep-alive" in conn.lower():
            return False

        # Proxy-Connection is a netscape hack.
        pconn = self.headers.get("proxy-connection")
        if pconn and "keep-alive" in pconn.lower():
            return False

        # otherwise, assume it will close
        return True

    def _close_conn(self):
        """Close the underlying file object and mark the response closed."""
        fp = self.fp
        # clear the attribute first so isclosed() is true even if
        # fp.close() raises
        self.fp = None
        fp.close()

    def close(self):
        """Close the response and, if still open, the underlying file."""
        try:
            super().close() # set "closed" flag
        finally:
            if self.fp:
                self._close_conn()

    # These implementations are for the benefit of io.BufferedReader.

    # XXX This class should probably be revised to act more like
    # the "raw stream" that BufferedReader expects.

    def flush(self):
        """Flush this object and the underlying file, if any."""
        super().flush()
        if self.fp:
            self.fp.flush()

    def readable(self):
        """Always returns True"""
        return True

    # End of "raw stream" methods

    def isclosed(self):
        """True if the connection is closed."""
        # NOTE: it is possible that we will not ever call self.close(). This
        #       case occurs when will_close is TRUE, length is None, and we
        #       read up to the last byte, but NOT past it.
        #
        # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be
        #          called, meaning self.isclosed() is meaningful.
        return self.fp is None

    def read(self, amt=None):
        """Read and return the response body, or up to the next amt bytes.

        With amt None or negative the remainder of the body is returned
        and the response is closed.  Raises IncompleteRead if a body with
        a known length ends early during such an unbounded read.
        """
        if self.fp is None:
            return b""

        if self._method == "HEAD":
            self._close_conn()
            return b""

        if amt is not None and amt >= 0:
            # Amount is given, implement using readinto.  Clip the request
            # to what is left of a Content-Length body so a huge amt does
            # not allocate a huge scratch buffer; readinto() would clip
            # the read to the same size anyway.
            if (not self.chunked and self.length is not None
                    and amt > self.length):
                amt = self.length
            b = bytearray(amt)
            n = self.readinto(b)
            return memoryview(b)[:n].tobytes()

        # Amount is not given (unbounded read) so we must check self.length
        # and self.chunked

        if self.chunked:
            return self._readall_chunked()

        if self.length is None:
            s = self.fp.read()
        else:
            try:
                s = self._safe_read(self.length)
            except IncompleteRead:
                self._close_conn()
                raise
            self.length = 0
        self._close_conn()        # we read everything
        return s

    def readinto(self, b):
        """Read up to len(b) bytes into bytearray b and return the number
        of bytes read.
        """

        if self.fp is None:
            return 0

        if self._method == "HEAD":
            self._close_conn()
            return 0

        if self.chunked:
            return self._readinto_chunked(b)

        if self.length is not None:
            if len(b) > self.length:
                # clip the read to the "end of response"
                b = memoryview(b)[0:self.length]

        # we do not use _safe_read() here because this may be a .will_close
        # connection, and the user is reading more bytes than will be provided
        # (for example, reading in 1k chunks)
        n = self.fp.readinto(b)
        if not n and b:
            # Ideally, we would raise IncompleteRead if the content-length
            # wasn't satisfied, but it might break compatibility.
            self._close_conn()
        elif self.length is not None:
            self.length -= n
            if not self.length:
                self._close_conn()
        return n

    def _read_next_chunk_size(self):
        """Read a chunk-size line and return the size as an int.

        Raises LineTooLong, or ValueError (after closing) if the size is
        not valid hexadecimal.
        """
        # Read the next chunk size from the file
        line = self.fp.readline(_MAXLINE + 1)
        if len(line) > _MAXLINE:
            raise LineTooLong("chunk size")
        i = line.find(b";")
        if i >= 0:
            line = line[:i] # strip chunk-extensions
        try:
            return int(line, 16)
        except ValueError:
            # close the connection as protocol synchronisation is
            # probably lost
            self._close_conn()
            raise

    def _read_and_discard_trailer(self):
        """Consume trailer lines up to the blank line (or EOF)."""
        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline(_MAXLINE + 1)
            if len(line) > _MAXLINE:
                raise LineTooLong("trailer line")
            # b'' is EOF: a vanishingly small number of sites EOF without
            # sending the trailer
            if line in (b'\r\n', b'\n', b''):
                break

    def _get_chunk_left(self):
        """Return the bytes left in the current chunk, or None at the end."""
        # return self.chunk_left, reading a new chunk if necessary.
        # chunk_left == 0: at the end of the current chunk, need to close it
        # chunk_left == None: No current chunk, should read next.
        # This function returns non-zero or None if the last chunk has
        # been read.
        chunk_left = self.chunk_left
        if not chunk_left: # Can be 0 or None
            if chunk_left is not None:
                # We are at the end of chunk, discard chunk end
                self._safe_read(2)  # toss the CRLF at the end of the chunk
            try:
                chunk_left = self._read_next_chunk_size()
            except ValueError:
                raise IncompleteRead(b'')
            if chunk_left == 0:
                # last chunk: 1*("0") [ chunk-extension ] CRLF
                self._read_and_discard_trailer()
                # we read everything; close the "file"
                self._close_conn()
                chunk_left = None
            self.chunk_left = chunk_left
        return chunk_left

    def _readall_chunked(self):
        """Read and return the whole remaining chunked body.

        Raises IncompleteRead carrying the data read so far if the stream
        ends early.
        """
        assert self.chunked != _UNKNOWN
        value = []
        try:
            while True:
                chunk_left = self._get_chunk_left()
                if chunk_left is None:
                    break
                value.append(self._safe_read(chunk_left))
                self.chunk_left = 0
            return b''.join(value)
        except IncompleteRead:
            raise IncompleteRead(b''.join(value))

    def _readinto_chunked(self, b):
        """Fill *b* from the chunked body; return the number of bytes read.

        Raises IncompleteRead carrying the data read so far if the stream
        ends early.
        """
        assert self.chunked != _UNKNOWN
        total_bytes = 0
        mvb = memoryview(b)
        try:
            while True:
                chunk_left = self._get_chunk_left()
                if chunk_left is None:
                    return total_bytes

                if len(mvb) <= chunk_left:
                    # the rest of the buffer fits inside the current chunk
                    n = self._safe_readinto(mvb)
                    self.chunk_left = chunk_left - n
                    return total_bytes + n

                # consume the rest of this chunk and continue with the next
                temp_mvb = mvb[:chunk_left]
                n = self._safe_readinto(temp_mvb)
                mvb = mvb[n:]
                total_bytes += n
                self.chunk_left = 0

        except IncompleteRead:
            raise IncompleteRead(bytes(b[0:total_bytes]))

    def _safe_read(self, amt):
        """Read the number of bytes requested.

        This function should be used when <amt> bytes "should" be present for
        reading. If the bytes are truly not available (due to EOF), then the
        IncompleteRead exception can be used to detect the problem.
        """
        data = self.fp.read(amt)
        if len(data) < amt:
            raise IncompleteRead(data, amt-len(data))
        return data

    def _safe_readinto(self, b):
        """Same as _safe_read, but for reading into a buffer."""
        amt = len(b)
        n = self.fp.readinto(b)
        if n < amt:
            raise IncompleteRead(bytes(b[:n]), amt-n)
        return n

    def read1(self, n=-1):
        """Read with at most one underlying system call.  If at least one
        byte is buffered, return that instead.
        """
        if self.fp is None or self._method == "HEAD":
            return b""
        if self.chunked:
            return self._read1_chunked(n)
        if self.length is not None and (n < 0 or n > self.length):
            n = self.length
        result = self.fp.read1(n)
        if not result and n:
            self._close_conn()
        elif self.length is not None:
            self.length -= len(result)
            if not self.length:
                # whole body consumed: close, exactly as readinto() does,
                # so isclosed() reports completion
                self._close_conn()
        return result

    def peek(self, n=-1):
        """Return buffered body bytes without consuming them."""
        # Having this enables IOBase.readline() to read more than one
        # byte at a time
        if self.fp is None or self._method == "HEAD":
            return b""
        if self.chunked:
            return self._peek_chunked(n)
        return self.fp.peek(n)

    def readline(self, limit=-1):
        """Read and return one line of the body, at most *limit* bytes."""
        if self.fp is None or self._method == "HEAD":
            return b""
        if self.chunked:
            # Fallback to IOBase readline which uses peek() and read()
            return super().readline(limit)
        if self.length is not None and (limit < 0 or limit > self.length):
            limit = self.length
        result = self.fp.readline(limit)
        if not result and limit:
            self._close_conn()
        elif self.length is not None:
            self.length -= len(result)
            if not self.length:
                # whole body consumed: close, exactly as readinto() does
                self._close_conn()
        return result

    def _read1_chunked(self, n):
        """read1() for chunked bodies; never reads past the current chunk."""
        # Strictly speaking, _get_chunk_left() may cause more than one read,
        # but that is ok, since that is to satisfy the chunked protocol.
        chunk_left = self._get_chunk_left()
        if chunk_left is None or n == 0:
            return b''
        if not (0 <= n <= chunk_left):
            n = chunk_left # if n is negative or larger than chunk_left
        read = self.fp.read1(n)
        self.chunk_left -= len(read)
        if not read:
            raise IncompleteRead(b"")
        return read

    def _peek_chunked(self, n):
        """peek() for chunked bodies; never looks past the current chunk."""
        # Strictly speaking, _get_chunk_left() may cause more than one read,
        # but that is ok, since that is to satisfy the chunked protocol.
        try:
            chunk_left = self._get_chunk_left()
        except IncompleteRead:
            return b'' # peek doesn't worry about protocol
        if chunk_left is None:
            return b'' # eof
        # peek is allowed to return more than requested.  Just request the
        # entire chunk, and truncate what we get.
        return self.fp.peek(chunk_left)[:chunk_left]

    def fileno(self):
        """Return the file descriptor of the underlying file object."""
        return self.fp.fileno()

    def getheader(self, name, default=None):
        '''Returns the value of the header matching *name*.

        If there are multiple matching headers, the values are
        combined into a single string separated by commas and spaces.

        If no matching header is found, returns *default* or None if
        the *default* is not specified.

        If the headers are unknown, raises http.client.ResponseNotReady.

        '''
        if self.headers is None:
            raise ResponseNotReady()
        headers = self.headers.get_all(name) or default
        if isinstance(headers, str) or not hasattr(headers, '__iter__'):
            return headers
        else:
            return ', '.join(headers)

    def getheaders(self):
        """Return list of (header, value) tuples."""
        if self.headers is None:
            raise ResponseNotReady()
        return list(self.headers.items())

    # We override IOBase.__iter__ so that it doesn't check for closed-ness

    def __iter__(self):
        return self

    # For compatibility with old-style urllib responses.
729 730 def info(self): 731 '''Returns an instance of the class mimetools.Message containing 732 meta-information associated with the URL. 733 734 When the method is HTTP, these headers are those returned by 735 the server at the head of the retrieved HTML page (including 736 Content-Length and Content-Type). 737 738 When the method is FTP, a Content-Length header will be 739 present if (as is now usual) the server passed back a file 740 length in response to the FTP retrieval request. A 741 Content-Type header will be present if the MIME type can be 742 guessed. 743 744 When the method is local-file, returned headers will include 745 a Date representing the file's last-modified time, a 746 Content-Length giving file size, and a Content-Type 747 containing a guess at the file's type. See also the 748 description of the mimetools module. 749 750 ''' 751 return self.headers 752 753 def geturl(self): 754 '''Return the real URL of the page. 755 756 In some cases, the HTTP server redirects a client to another 757 URL. The urlopen() function handles this transparently, but in 758 some cases the caller needs to know which URL the client was 759 redirected to. The geturl() method can be used to get at this 760 redirected URL. 761 762 ''' 763 return self.url 764 765 def getcode(self): 766 '''Return the HTTP status code that was sent with the response, 767 or None if the URL is not an HTTP URL. 768 769 ''' 770 return self.status 771 772class HTTPConnection: 773 774 _http_vsn = 11 775 _http_vsn_str = 'HTTP/1.1' 776 777 response_class = HTTPResponse 778 default_port = HTTP_PORT 779 auto_open = 1 780 debuglevel = 0 781 782 @staticmethod 783 def _is_textIO(stream): 784 """Test whether a file-like object is a text or a binary stream. 785 """ 786 return isinstance(stream, io.TextIOBase) 787 788 @staticmethod 789 def _get_content_length(body, method): 790 """Get the content-length based on the body. 
791 792 If the body is None, we set Content-Length: 0 for methods that expect 793 a body (RFC 7230, Section 3.3.2). We also set the Content-Length for 794 any method if the body is a str or bytes-like object and not a file. 795 """ 796 if body is None: 797 # do an explicit check for not None here to distinguish 798 # between unset and set but empty 799 if method.upper() in _METHODS_EXPECTING_BODY: 800 return 0 801 else: 802 return None 803 804 if hasattr(body, 'read'): 805 # file-like object. 806 return None 807 808 try: 809 # does it implement the buffer protocol (bytes, bytearray, array)? 810 mv = memoryview(body) 811 return mv.nbytes 812 except TypeError: 813 pass 814 815 if isinstance(body, str): 816 return len(body) 817 818 return None 819 820 def __init__(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 821 source_address=None, blocksize=8192): 822 self.timeout = timeout 823 self.source_address = source_address 824 self.blocksize = blocksize 825 self.sock = None 826 self._buffer = [] 827 self.__response = None 828 self.__state = _CS_IDLE 829 self._method = None 830 self._tunnel_host = None 831 self._tunnel_port = None 832 self._tunnel_headers = {} 833 834 (self.host, self.port) = self._get_hostport(host, port) 835 836 self._validate_host(self.host) 837 838 # This is stored as an instance variable to allow unit 839 # tests to replace it with a suitable mockup 840 self._create_connection = socket.create_connection 841 842 def set_tunnel(self, host, port=None, headers=None): 843 """Set up host and port for HTTP CONNECT tunnelling. 844 845 In a connection that uses HTTP CONNECT tunneling, the host passed to the 846 constructor is used as a proxy server that relays all communication to 847 the endpoint passed to `set_tunnel`. This done by sending an HTTP 848 CONNECT request to the proxy server when the connection is established. 849 850 This method must be called before the HTTP connection has been 851 established. 
852 853 The headers argument should be a mapping of extra HTTP headers to send 854 with the CONNECT request. 855 """ 856 857 if self.sock: 858 raise RuntimeError("Can't set up tunnel for established connection") 859 860 self._tunnel_host, self._tunnel_port = self._get_hostport(host, port) 861 if headers: 862 self._tunnel_headers = headers 863 else: 864 self._tunnel_headers.clear() 865 866 def _get_hostport(self, host, port): 867 if port is None: 868 i = host.rfind(':') 869 j = host.rfind(']') # ipv6 addresses have [...] 870 if i > j: 871 try: 872 port = int(host[i+1:]) 873 except ValueError: 874 if host[i+1:] == "": # http://foo.com:/ == http://foo.com/ 875 port = self.default_port 876 else: 877 raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) 878 host = host[:i] 879 else: 880 port = self.default_port 881 if host and host[0] == '[' and host[-1] == ']': 882 host = host[1:-1] 883 884 return (host, port) 885 886 def set_debuglevel(self, level): 887 self.debuglevel = level 888 889 def _tunnel(self): 890 connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (self._tunnel_host, 891 self._tunnel_port) 892 connect_bytes = connect_str.encode("ascii") 893 self.send(connect_bytes) 894 for header, value in self._tunnel_headers.items(): 895 header_str = "%s: %s\r\n" % (header, value) 896 header_bytes = header_str.encode("latin-1") 897 self.send(header_bytes) 898 self.send(b'\r\n') 899 900 response = self.response_class(self.sock, method=self._method) 901 (version, code, message) = response._read_status() 902 903 if code != http.HTTPStatus.OK: 904 self.close() 905 raise OSError("Tunnel connection failed: %d %s" % (code, 906 message.strip())) 907 while True: 908 line = response.fp.readline(_MAXLINE + 1) 909 if len(line) > _MAXLINE: 910 raise LineTooLong("header line") 911 if not line: 912 # for sites which EOF without sending a trailer 913 break 914 if line in (b'\r\n', b'\n', b''): 915 break 916 917 if self.debuglevel > 0: 918 print('header:', line.decode()) 919 920 def 
connect(self): 921 """Connect to the host and port specified in __init__.""" 922 self.sock = self._create_connection( 923 (self.host,self.port), self.timeout, self.source_address) 924 self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 925 926 if self._tunnel_host: 927 self._tunnel() 928 929 def close(self): 930 """Close the connection to the HTTP server.""" 931 self.__state = _CS_IDLE 932 try: 933 sock = self.sock 934 if sock: 935 self.sock = None 936 sock.close() # close it manually... there may be other refs 937 finally: 938 response = self.__response 939 if response: 940 self.__response = None 941 response.close() 942 943 def send(self, data): 944 """Send `data' to the server. 945 ``data`` can be a string object, a bytes object, an array object, a 946 file-like object that supports a .read() method, or an iterable object. 947 """ 948 949 if self.sock is None: 950 if self.auto_open: 951 self.connect() 952 else: 953 raise NotConnected() 954 955 if self.debuglevel > 0: 956 print("send:", repr(data)) 957 if hasattr(data, "read") : 958 if self.debuglevel > 0: 959 print("sendIng a read()able") 960 encode = self._is_textIO(data) 961 if encode and self.debuglevel > 0: 962 print("encoding file using iso-8859-1") 963 while 1: 964 datablock = data.read(self.blocksize) 965 if not datablock: 966 break 967 if encode: 968 datablock = datablock.encode("iso-8859-1") 969 self.sock.sendall(datablock) 970 return 971 try: 972 self.sock.sendall(data) 973 except TypeError: 974 if isinstance(data, collections.abc.Iterable): 975 for d in data: 976 self.sock.sendall(d) 977 else: 978 raise TypeError("data should be a bytes-like object " 979 "or an iterable, got %r" % type(data)) 980 981 def _output(self, s): 982 """Add a line of output to the current request buffer. 983 984 Assumes that the line does *not* end with \\r\\n. 
985 """ 986 self._buffer.append(s) 987 988 def _read_readable(self, readable): 989 if self.debuglevel > 0: 990 print("sendIng a read()able") 991 encode = self._is_textIO(readable) 992 if encode and self.debuglevel > 0: 993 print("encoding file using iso-8859-1") 994 while True: 995 datablock = readable.read(self.blocksize) 996 if not datablock: 997 break 998 if encode: 999 datablock = datablock.encode("iso-8859-1") 1000 yield datablock 1001 1002 def _send_output(self, message_body=None, encode_chunked=False): 1003 """Send the currently buffered request and clear the buffer. 1004 1005 Appends an extra \\r\\n to the buffer. 1006 A message_body may be specified, to be appended to the request. 1007 """ 1008 self._buffer.extend((b"", b"")) 1009 msg = b"\r\n".join(self._buffer) 1010 del self._buffer[:] 1011 self.send(msg) 1012 1013 if message_body is not None: 1014 1015 # create a consistent interface to message_body 1016 if hasattr(message_body, 'read'): 1017 # Let file-like take precedence over byte-like. This 1018 # is needed to allow the current position of mmap'ed 1019 # files to be taken into account. 1020 chunks = self._read_readable(message_body) 1021 else: 1022 try: 1023 # this is solely to check to see if message_body 1024 # implements the buffer API. it /would/ be easier 1025 # to capture if PyObject_CheckBuffer was exposed 1026 # to Python. 
def putrequest(self, method, url, skip_host=False,
               skip_accept_encoding=False):
    """Start a new request: buffer the request line and default headers.

    `method' specifies an HTTP request method, e.g. 'GET'.
    `url' specifies the object being requested, e.g. '/index.html'.
    `skip_host' if True does not add automatically a 'Host:' header
    `skip_accept_encoding' if True does not add automatically an
       'Accept-Encoding:' header

    Raises CannotSendRequest when the connection is not idle.
    """

    def to_wire(name):
        # Plain ASCII when possible, IDNA for anything else.
        try:
            return name.encode("ascii")
        except UnicodeEncodeError:
            return name.encode("idna")

    # A previous response that has been fully consumed no longer matters.
    previous = self.__response
    if previous and previous.isclosed():
        self.__response = None

    # A new request may only begin from the idle state.  An unread prior
    # response does not block this step; it only blocks fetching the
    # response to the new request (enforced in getresponse()).
    if self.__state != _CS_IDLE:
        raise CannotSendRequest(self.__state)
    self.__state = _CS_REQ_STARTED

    self._validate_method(method)

    # Remembered for the response phase.
    self._method = method

    url = url or '/'
    self._validate_path(url)

    request_line = '%s %s %s' % (method, url, self._http_vsn_str)
    self._output(self._encode_request(request_line))

    if self._http_vsn != 11:
        # HTTP/1.0: no automatic headers; the server assumes "not chunked".
        return

    if not skip_host:
        # Host is only issued automatically for HTTP/1.1.  It must not be
        # sent twice, so callers that supply their own pass skip_host.
        # For an absolute URL (e.g. a request going through a proxy) the
        # header names the URL's host, not the host we are connected to.
        netloc = urlsplit(url).netloc if url.startswith('http') else ''

        if netloc:
            self.putheader('Host', to_wire(netloc))
        else:
            if self._tunnel_host:
                host, port = self._tunnel_host, self._tunnel_port
            else:
                host, port = self.host, self.port

            host_enc = to_wire(host)

            # IPv6 literals are bracketed in a Host header (RFC 2732).
            if ':' in host:
                host_enc = b'[' + host_enc + b']'

            if port == self.default_port:
                self.putheader('Host', host_enc)
            else:
                # A non-default port is appended to the host name.
                self.putheader(
                    'Host', "%s:%s" % (host_enc.decode("ascii"), port))

    # Only "identity" is requested because encodings such as x-gzip or
    # x-deflate are not supported here.  Callers that set this header
    # themselves pass skip_accept_encoding.
    if not skip_accept_encoding:
        self.putheader('Accept-Encoding', 'identity')

    # No TE header is sent: its absence implies *only* "chunked".
def _validate_method(self, method):
    """Validate a method name for putrequest.

    Raises ValueError when the method contains a character matched by
    the disallowed-method-character pattern (guards against HTTP header
    injection).
    """
    offending = _contains_disallowed_method_pchar_re.search(method)
    if not offending:
        return
    raise ValueError(
        f"method can't contain control characters. {method!r} "
        f"(found at least {offending.group()!r})")
def request(self, method, url, body=None, headers=None, *,
            encode_chunked=False):
    """Send a complete request to the server.

    method         -- HTTP request method, e.g. 'GET'
    url            -- the object being requested, e.g. '/index.html'
    body           -- optional message body, forwarded unchanged
    headers        -- optional mapping of extra header names to values;
                      omitted or None means "no extra headers"
    encode_chunked -- forwarded unchanged to _send_request()

    All of the work is delegated to ``self._send_request()``.
    """
    # Use a fresh dict per call rather than a shared mutable default, so
    # nothing done to it downstream can leak into later requests.
    if headers is None:
        headers = {}
    self._send_request(method, url, body, headers, encode_chunked)
def getresponse(self):
    """Get the response from the server.

    Returns an instance of ``self.response_class`` whose headers have
    already been read via ``begin()``.

    Raises ResponseNotReady unless a request has been sent and any
    previous response has been completed.  If the response indicates
    that the connection will close, this connection is closed before the
    response is returned; otherwise the response is remembered so that
    its completion can be detected later.
    """
    # A previous response that has been fully consumed no longer matters.
    if self.__response and self.__response.isclosed():
        self.__response = None

    # An outstanding persistent response must be completed first.  A
    # prior connection-close response was detached from this object when
    # it was returned, so it never blocks us here.
    if self.__state != _CS_REQ_SENT or self.__response:
        raise ResponseNotReady(self.__state)

    # The debug level is only passed along when debugging is enabled.
    if self.debuglevel > 0:
        ctor_args = (self.sock, self.debuglevel)
    else:
        ctor_args = (self.sock,)
    response = self.response_class(*ctor_args, method=self._method)

    try:
        try:
            response.begin()
        except ConnectionError:
            self.close()
            raise
        assert response.will_close != _UNKNOWN
        self.__state = _CS_IDLE

        if response.will_close:
            # Effectively hands the connection over to the response.
            self.close()
        else:
            # Keep it so we can tell when it has been completed.
            self.__response = response

        return response
    except BaseException:
        # Whatever went wrong, do not leave the response open.
        response.close()
        raise
def __init__(self, host, port=None, key_file=None, cert_file=None,
             timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
             source_address=None, *, context=None,
             check_hostname=None, blocksize=8192):
    """Set up an HTTPS connection and the SSL context it will use.

    host, port, timeout, source_address and blocksize are passed on to
    the base class initializer.  key_file, cert_file and check_hostname
    are deprecated in favour of a custom ``context`` and trigger a
    DeprecationWarning when given.  Without a ``context`` a default
    HTTPS context is created.

    Raises ValueError if hostname checking is requested while the
    context's verify mode is CERT_NONE.
    """
    super().__init__(host, port, timeout, source_address,
                     blocksize=blocksize)

    legacy_args = (key_file, cert_file, check_hostname)
    if any(arg is not None for arg in legacy_args):
        import warnings
        warnings.warn("key_file, cert_file and check_hostname are "
                      "deprecated, use a custom context instead.",
                      DeprecationWarning, 2)
    self.key_file = key_file
    self.cert_file = cert_file

    if context is None:
        context = ssl._create_default_https_context()
        # Turn on post-handshake auth for TLS 1.3 where available.
        if context.post_handshake_auth is not None:
            context.post_handshake_auth = True

    verification_off = context.verify_mode == ssl.CERT_NONE
    if check_hostname is None:
        check_hostname = context.check_hostname
    if check_hostname and verification_off:
        raise ValueError("check_hostname needs a SSL context with "
                         "either CERT_OPTIONAL or CERT_REQUIRED")

    if key_file or cert_file:
        context.load_cert_chain(cert_file, key_file)
        # A client certificate means the caller wants to authenticate,
        # so post-handshake auth is enabled even on a custom context.
        if context.post_handshake_auth is not None:
            context.post_handshake_auth = True

    self._context = context
    if check_hostname is not None:
        self._context.check_hostname = check_hostname
class IncompleteRead(HTTPException):
    """HTTP exception carrying the data read so far.

    ``partial`` holds what was read; ``expected``, when given, is the
    number of additional bytes that were still expected.
    """

    def __init__(self, partial, expected=None):
        self.partial = partial
        self.expected = expected
        self.args = (partial,)

    def __repr__(self):
        if self.expected is None:
            suffix = ''
        else:
            suffix = ', %i more expected' % self.expected
        return '%s(%i bytes read%s)' % (self.__class__.__name__,
                                        len(self.partial), suffix)

    # str() shows the compact repr above instead of the raw partial data.
    __str__ = object.__str__
BadStatusLine.__init__(self, "") 1493 ConnectionResetError.__init__(self, *pos, **kw) 1494 1495# for backwards compatibility 1496error = HTTPException 1497