1##############################################################################
2#
3# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
4# All Rights Reserved.
5#
6# This software is subject to the provisions of the Zope Public License,
7# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
8# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11# FOR A PARTICULAR PURPOSE.
12#
13##############################################################################
14"""HTTP Request Parser
15
16This server uses asyncore to accept connections and do initial
17processing but threads to do work.
18"""
19from io import BytesIO
20import re
21from urllib import parse
22from urllib.parse import unquote_to_bytes
23
24from waitress.buffers import OverflowableBuffer
25from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
26from waitress.utilities import (
27    BadRequest,
28    RequestEntityTooLarge,
29    RequestHeaderFieldsTooLarge,
30    ServerNotImplemented,
31    find_double_newline,
32)
33
34from .rfc7230 import HEADER_FIELD
35
36
def unquote_bytes_to_wsgi(bytestring):
    """Percent-decode ``bytestring`` and return the result as a ``str``.

    The decoded bytes are mapped one-to-one onto code points by decoding
    them as latin-1.
    """
    decoded = unquote_to_bytes(bytestring)
    text = decoded.decode("latin-1")
    return text
39
40
class ParsingError(Exception):
    """Signals that part of the incoming HTTP message could not be parsed."""
43
44
class TransferEncodingNotImplemented(Exception):
    """Signals that a request asked for an unsupported Transfer-Encoding."""
47
48
class HTTPRequestParser:
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""  # request line + headers buffered so far
    chunked = False  # True when the body uses chunked transfer encoding
    content_length = 0  # value of the Content-Length header, if any
    header_bytes_received = 0  # total size of the header block seen so far
    body_bytes_received = 0  # body bytes consumed by the body receiver
    body_rcv = None  # receiver object for the body, None when no body
    version = "1.0"
    error = None  # error object describing why the request was rejected
    connection_close = False

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request.  Returns the number of
        bytes consumed.  Sets the completed flag once both the header and the
        body have been received.

        While the header block is incomplete, data is buffered in
        ``header_plus``.  Once a body receiver exists, data is handed to it
        instead.  ``ParsingError`` and ``TransferEncodingNotImplemented``
        raised while parsing the header are not propagated; they are stored
        in ``self.error`` and the request is marked completed.
        """

        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv

        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            # s is everything received for the header so far, including
            # what earlier calls buffered in header_plus.
            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                #
                # index is an offset into the cumulative buffer, so it is
                # already the total header size: assign it rather than adding
                # it to the bytes counted by previous calls.
                self.header_bytes_received = index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received = len(s)
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.

            if self.header_bytes_received >= max_header:
                # Parse a stub request line so that the attributes set by
                # parse_header (command, version, path, ...) exist.
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True

                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large

                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                self.headers_finished = True

                return consumed

            # Header not finished yet.
            self.header_plus = s

            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here.  We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(len(br))

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).

        Populates ``headers``, ``command``, ``version``, the URI components
        and the connection/body related attributes, and creates ``body_rcv``
        when the request carries a body.

        Raises ParsingError when the block is malformed and
        TransferEncodingNotImplemented when a transfer-coding other than
        chunked is requested.
        """
        index = header_plus.find(b"\r\n")

        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers

        for line in lines:
            field = HEADER_FIELD.match(line)

            if not field:
                raise ParsingError("Invalid header")

            key, value = field.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?

                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = key.upper().replace(b"-", b"_").decode("latin-1")
            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += (b", " + value).decode("latin-1")
            except KeyError:
                headers[key1] = value.decode("latin-1")

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)
        version = version.decode("latin-1")
        command = command.decode("latin-1")
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported

                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

            expect = headers.get("EXPECT", "").lower()
            self.expect_continue = expect == "100-continue"

            if connection.lower() == "close":
                self.connection_close = True

        if not self.chunked:
            raw_length = headers.get("CONTENT_LENGTH", "0")

            # Content-Length is 1*DIGIT.  int() alone is too lenient: it also
            # accepts a sign ("+10", "-5"), underscores ("1_0") and
            # surrounding whitespace, and a front end may interpret such a
            # value differently than we would, which enables request
            # smuggling.
            if re.fullmatch("[0-9]+", raw_length) is None:
                raise ParsingError("Content-Length is invalid")

            try:
                cl = int(raw_length)
            except ValueError:
                # int() can still refuse an all-digit string, e.g. one longer
                # than the interpreter's integer string conversion limit.
                raise ParsingError("Content-Length is invalid")

            self.content_length = cl

            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """
        Returns a file-like object for the request body: the body receiver's
        file when there is one, otherwise an empty BytesIO.
        """
        body_rcv = self.body_rcv

        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """
        Closes the buffer behind the body receiver, if there is one.
        """
        body_rcv = self.body_rcv

        if body_rcv is not None:
            body_rcv.getbuf().close()
341
342
def split_uri(uri):
    """
    Splits the request target into its components.

    uri is the request target as bytes.  Returns a 5-tuple of str:
    (scheme, netloc, path, query, fragment).  The path is percent-decoded;
    every component is decoded as latin-1.

    Raises ParsingError("Bad URI") when the URI can not be split.
    """
    # urlsplit handles byte input by returning bytes on py3, so
    # scheme, netloc, path, query, and fragment are bytes

    scheme = netloc = path = query = fragment = b""

    # urlsplit below will treat this as a scheme-less netloc, thereby losing
    # the original intent of the request. Here we shamelessly stole 4 lines of
    # code from the CPython stdlib to parse out the fragment and query but
    # leave the path alone. See
    # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
    # and https://github.com/Pylons/waitress/issues/260

    if uri[:2] == b"//":
        path = uri

        if b"#" in path:
            path, fragment = path.split(b"#", 1)

        if b"?" in path:
            path, query = path.split(b"?", 1)
    else:
        try:
            scheme, netloc, path, query, fragment = parse.urlsplit(uri)
        except ValueError:
            # urlsplit raises UnicodeError (a ValueError subclass) for
            # non-ASCII input, and a plain ValueError for a malformed
            # authority such as an unbalanced "[" in an IPv6 literal.  Both
            # mean the client sent a bad URI, so neither may escape as an
            # unhandled exception.
            raise ParsingError("Bad URI")

    return (
        scheme.decode("latin-1"),
        netloc.decode("latin-1"),
        unquote_bytes_to_wsgi(path),
        query.decode("latin-1"),
        fragment.decode("latin-1"),
    )
377
378
def get_header_lines(header):
    """
    Breaks the raw header block into logical header lines.

    The block is split on CRLF and empty pieces are dropped.  A piece that
    starts with a space or a tab is a continuation and is appended, unchanged,
    to the previous logical line.

    Raises ParsingError when a piece contains a bare CR or LF, or when a
    continuation appears before any header line.
    """
    joined = []

    for piece in header.split(b"\r\n"):
        if len(piece) == 0:
            continue

        if b"\r" in piece or b"\n" in piece:
            raise ParsingError(
                'Bare CR or LF found in header line "%s"' % piece.decode("latin-1")
            )

        is_continuation = piece[:1] in (b" ", b"\t")

        if not is_continuation:
            joined.append(piece)
            continue

        if not joined:
            # https://corte.si/posts/code/pathod/pythonservers/index.html
            raise ParsingError('Malformed header line "%s"' % piece.decode("latin-1"))

        joined[-1] = joined[-1] + piece

    return joined
404
405
# Request line pattern, as consumed by crack_first_line:
#   group 1 - the method (a run of non-space characters)
#   group 2 - the request target, optionally starting with scheme://authority
#   group 3 - the optional " HTTP/<version>" suffix (empty when absent)
#   group 5 - the version number inside group 3
first_line_re = re.compile(
    b"([^ ]+) "
    b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
    b"(( HTTP/([0-9.]+))$|$)"
)


def crack_first_line(line):
    """
    Splits the request line into (method, uri, version), all bytes.

    version is b"" when the line carries no " HTTP/x.y" suffix.  When the
    line does not match the request line pattern in full, three empty byte
    strings are returned.

    Raises ParsingError when the method is not entirely uppercase.
    """
    matched = first_line_re.match(line)

    # The pattern's "$" also matches just before a trailing newline, so the
    # match must additionally reach the very end of the line.
    if matched is None or matched.end() != len(line):
        return b"", b"", b""

    verb = matched.group(1)
    target = matched.group(2)
    http_version = matched.group(5) if matched.group(3) else b""

    # the request methods that are currently defined are all uppercase:
    # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
    # the request method is case sensitive according to
    # https://tools.ietf.org/html/rfc7231#section-4.1

    # By disallowing anything but uppercase methods we save poor
    # unsuspecting souls from sending lowercase HTTP methods to waitress
    # and having the request complete, while servers like nginx drop the
    # request onto the floor.

    if verb.upper() != verb:
        raise ParsingError('Malformed HTTP method "%s"' % str(verb, "latin-1"))

    return verb, target, http_version
440