##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""HTTP Request Parser

This server uses asyncore to accept connections and do initial
processing but threads to do work.
"""
from io import BytesIO
import re
from urllib import parse
from urllib.parse import unquote_to_bytes

from waitress.buffers import OverflowableBuffer
from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
from waitress.utilities import (
    BadRequest,
    RequestEntityTooLarge,
    RequestHeaderFieldsTooLarge,
    ServerNotImplemented,
    find_double_newline,
)

from .rfc7230 import HEADER_FIELD


def unquote_bytes_to_wsgi(bytestring):
    """Percent-decode *bytestring* and return it as a WSGI (latin-1) str."""
    return unquote_to_bytes(bytestring).decode("latin-1")


class ParsingError(Exception):
    """Raised internally when the HTTP message is malformed."""


class TransferEncodingNotImplemented(Exception):
    """Raised internally for an unsupported Transfer-Encoding value."""


class HTTPRequestParser:
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""
    chunked = False
    content_length = 0
    header_bytes_received = 0
    body_bytes_received = 0
    body_rcv = None
    version = "1.0"
    error = None
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request. Returns the number of
        bytes consumed. Sets the completed flag once both the header and the
        body have been received.
        """

        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv

        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                self.header_bytes_received += index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received += datalen
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.

            if self.header_bytes_received >= max_header:
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True

                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large

                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                self.headers_finished = True

                return consumed

            # Header not finished yet.
            self.header_plus = s

            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here. We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(len(br))

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).
        """
        index = header_plus.find(b"\r\n")

        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers

        for line in lines:
            header = HEADER_FIELD.match(line)

            if not header:
                raise ParsingError("Invalid header")

            key, value = header.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?

                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = key.upper().replace(b"-", b"_").decode("latin-1")
            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += (b", " + value).decode("latin-1")
            except KeyError:
                headers[key1] = value.decode("latin-1")

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = version.decode("latin-1")
        command = command.decode("latin-1")
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported

                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

        expect = headers.get("EXPECT", "").lower()
        self.expect_continue = expect == "100-continue"

        if connection.lower() == "close":
            self.connection_close = True

        if not self.chunked:
            try:
                cl = int(headers.get("CONTENT_LENGTH", 0))
            except ValueError:
                raise ParsingError("Content-Length is invalid")

            self.content_length = cl

            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """Return a file-like object for the request body (empty if none)."""
        body_rcv = self.body_rcv

        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """Release the body receiver's buffer, if a body was received."""
        body_rcv = self.body_rcv

        if body_rcv is not None:
            body_rcv.getbuf().close()


def split_uri(uri):
    """Split a request-target into (scheme, netloc, path, query, fragment).

    scheme, netloc, query, and fragment are returned as latin-1 str; path
    is percent-decoded to a WSGI str.
    """
    # urlsplit handles byte input by returning bytes on py3, so
    # scheme, netloc, path, query, and fragment are bytes

    scheme = netloc = path = query = fragment = b""

    # urlsplit below will treat this as a scheme-less netloc, thereby losing
    # the original intent of the request. Here we shamelessly stole 4 lines of
    # code from the CPython stdlib to parse out the fragment and query but
    # leave the path alone. See
    # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
    # and https://github.com/Pylons/waitress/issues/260

    if uri[:2] == b"//":
        path = uri

        if b"#" in path:
            path, fragment = path.split(b"#", 1)

        if b"?" in path:
            path, query = path.split(b"?", 1)
    else:
        try:
            scheme, netloc, path, query, fragment = parse.urlsplit(uri)
        except UnicodeError:
            raise ParsingError("Bad URI")

    return (
        scheme.decode("latin-1"),
        netloc.decode("latin-1"),
        unquote_bytes_to_wsgi(path),
        query.decode("latin-1"),
        fragment.decode("latin-1"),
    )


def get_header_lines(header):
    """
    Splits the header into lines, putting multi-line headers together.
    """
    r = []
    lines = header.split(b"\r\n")

    for line in lines:
        if not line:
            continue

        if b"\r" in line or b"\n" in line:
            raise ParsingError(
                'Bare CR or LF found in header line "%s"' % str(line, "latin-1")
            )

        if line.startswith((b" ", b"\t")):
            if not r:
                # https://corte.si/posts/code/pathod/pythonservers/index.html
                raise ParsingError('Malformed header line "%s"' % str(line, "latin-1"))
            r[-1] += line
        else:
            r.append(line)

    return r


first_line_re = re.compile(
    b"([^ ]+) "
    b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
    b"(( HTTP/([0-9.]+))$|$)"
)


def crack_first_line(line):
    """Parse an HTTP request line into (method, uri, version) bytes.

    Returns (b"", b"", b"") when the line does not match the expected
    shape; version is b"" for HTTP/0.9-style requests with no version.
    """
    m = first_line_re.match(line)

    if m is not None and m.end() == len(line):
        if m.group(3):
            version = m.group(5)
        else:
            version = b""
        method = m.group(1)

        # the request methods that are currently defined are all uppercase:
        # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
        # the request method is case sensitive according to
        # https://tools.ietf.org/html/rfc7231#section-4.1

        # By disallowing anything but uppercase methods we save poor
        # unsuspecting souls from sending lowercase HTTP methods to waitress
        # and having the request complete, while servers like nginx drop the
        # request onto the floor.

        if method != method.upper():
            raise ParsingError('Malformed HTTP method "%s"' % str(method, "latin-1"))
        uri = m.group(2)

        return method, uri, version
    else:
        return b"", b"", b""