1 /*
2  * Copyright (C) 1996-2021 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 #include "squid.h"
10 #include "Debug.h"
11 #include "http/one/RequestParser.h"
12 #include "http/one/Tokenizer.h"
13 #include "http/ProtocolVersion.h"
14 #include "profiler/Profiler.h"
15 #include "SquidConfig.h"
16 
RequestParser(bool preserveParsed)17 Http::One::RequestParser::RequestParser(bool preserveParsed) :
18     Parser(),
19     preserveParsed_(preserveParsed)
20 {}
21 
22 Http1::Parser::size_type
firstLineSize() const23 Http::One::RequestParser::firstLineSize() const
24 {
25     // RFC 7230 section 2.6
26     /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
27     return method_.image().length() + uri_.length() + 12;
28 }
29 
30 /**
31  * Attempt to parse the first line of a new request message.
32  *
33  * Governed by RFC 7230 section 3.5
34  *  "
35  *    In the interest of robustness, a server that is expecting to receive
36  *    and parse a request-line SHOULD ignore at least one empty line (CRLF)
37  *    received prior to the request-line.
38  *  "
39  *
40  * Parsing state is stored between calls to avoid repeating buffer scans.
41  * If garbage is found the parsing offset is incremented.
42  */
43 void
skipGarbageLines()44 Http::One::RequestParser::skipGarbageLines()
45 {
46     if (Config.onoff.relaxed_header_parser) {
47         if (Config.onoff.relaxed_header_parser < 0 && (buf_[0] == '\r' || buf_[0] == '\n'))
48             debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
49                    "CRLF bytes received ahead of request-line. " <<
50                    "Ignored due to relaxed_header_parser.");
51         // Be tolerant of prefix empty lines
52         // ie any series of either \n or \r\n with no other characters and no repeated \r
53         while (!buf_.isEmpty() && (buf_[0] == '\n' || (buf_[0] == '\r' && buf_[1] == '\n'))) {
54             buf_.consume(1);
55         }
56     }
57 }
58 
59 /**
60  * Attempt to parse the method field out of an HTTP message request-line.
61  *
62  * Governed by:
63  *  RFC 1945 section 5.1
64  *  RFC 7230 section 2.6, 3.1 and 3.5
65  */
66 bool
parseMethodField(Http1::Tokenizer & tok)67 Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok)
68 {
69     // method field is a sequence of TCHAR.
70     // Limit to 32 characters to prevent overly long sequences of non-HTTP
71     // being sucked in before mismatch is detected. 32 is itself annoyingly
72     // big but there are methods registered by IANA that reach 17 bytes:
73     //  http://www.iana.org/assignments/http-methods
74     static const size_t maxMethodLength = 32; // TODO: make this configurable?
75 
76     SBuf methodFound;
77     if (!tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength)) {
78         debugs(33, ErrorLevel(), "invalid request-line: missing or malformed method");
79         parseStatusCode = Http::scBadRequest;
80         return false;
81     }
82     method_ = HttpRequestMethod(methodFound);
83 
84     if (!skipDelimiter(tok.skipAll(DelimiterCharacters()), "after method"))
85         return false;
86 
87     return true;
88 }
89 
90 /// the characters which truly are valid within URI
91 static const CharacterSet &
UriValidCharacters()92 UriValidCharacters()
93 {
94     /* RFC 3986 section 2:
95      * "
96      *   A URI is composed from a limited set of characters consisting of
97      *   digits, letters, and a few graphic symbols.
98      * "
99      */
100     static const CharacterSet UriChars =
101         CharacterSet("URI-Chars","") +
102         // RFC 3986 section 2.2 - reserved characters
103         CharacterSet("gen-delims", ":/?#[]@") +
104         CharacterSet("sub-delims", "!$&'()*+,;=") +
105         // RFC 3986 section 2.3 - unreserved characters
106         CharacterSet::ALPHA +
107         CharacterSet::DIGIT +
108         CharacterSet("unreserved", "-._~") +
109         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
110         CharacterSet("pct-encoded", "%") +
111         CharacterSet::HEXDIG;
112 
113     return UriChars;
114 }
115 
116 /// characters which Squid will accept in the HTTP request-target (URI)
117 const CharacterSet &
RequestTargetCharacters()118 Http::One::RequestParser::RequestTargetCharacters()
119 {
120     if (Config.onoff.relaxed_header_parser) {
121 #if USE_HTTP_VIOLATIONS
122         static const CharacterSet RelaxedExtended =
123             UriValidCharacters() +
124             // accept whitespace (extended), it will be dealt with later
125             DelimiterCharacters() +
126             // RFC 2396 unwise character set which must never be transmitted
127             // in un-escaped form. But many web services do anyway.
128             CharacterSet("RFC2396-unwise","\"\\|^<>`{}") +
129             // UTF-8 because we want to be future-proof
130             CharacterSet("UTF-8", 128, 255);
131 
132         return RelaxedExtended;
133 #else
134         static const CharacterSet RelaxedCompliant =
135             UriValidCharacters() +
136             // accept whitespace (extended), it will be dealt with later.
137             DelimiterCharacters();
138 
139         return RelaxedCompliant;
140 #endif
141     }
142 
143     // strict parse only accepts what the RFC say we can
144     return UriValidCharacters();
145 }
146 
147 bool
parseUriField(Http1::Tokenizer & tok)148 Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
149 {
150     /* Arbitrary 64KB URI upper length limit.
151      *
152      * Not quite as arbitrary as it seems though. Old SquidString objects
153      * cannot store strings larger than 64KB, so we must limit until they
154      * have all been replaced with SBuf.
155      *
156      * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
157      * at least 8000 octets for the whole line, including method and version.
158      */
159     const size_t maxUriLength = static_cast<size_t>((64*1024)-1);
160 
161     SBuf uriFound;
162     if (!tok.prefix(uriFound, RequestTargetCharacters())) {
163         parseStatusCode = Http::scBadRequest;
164         debugs(33, ErrorLevel(), "invalid request-line: missing or malformed URI");
165         return false;
166     }
167 
168     if (uriFound.length() > maxUriLength) {
169         // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
170         parseStatusCode = Http::scUriTooLong;
171         debugs(33, ErrorLevel(), "invalid request-line: " << uriFound.length() <<
172                "-byte URI exceeds " << maxUriLength << "-byte limit");
173         return false;
174     }
175 
176     uri_ = uriFound;
177     return true;
178 }
179 
180 bool
parseHttpVersionField(Http1::Tokenizer & tok)181 Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
182 {
183     static const SBuf http1p0("HTTP/1.0");
184     static const SBuf http1p1("HTTP/1.1");
185     const auto savedTok = tok;
186 
187     // Optimization: Expect (and quickly parse) HTTP/1.1 or HTTP/1.0 in
188     // the vast majority of cases.
189     if (tok.skipSuffix(http1p1)) {
190         msgProtocol_ = Http::ProtocolVersion(1, 1);
191         return true;
192     } else if (tok.skipSuffix(http1p0)) {
193         msgProtocol_ = Http::ProtocolVersion(1, 0);
194         return true;
195     } else {
196         // RFC 7230 section 2.6:
197         // HTTP-version  = HTTP-name "/" DIGIT "." DIGIT
198         static const CharacterSet period("Decimal point", ".");
199         static const SBuf proto("HTTP/");
200         SBuf majorDigit;
201         SBuf minorDigit;
202         if (tok.suffix(minorDigit, CharacterSet::DIGIT) &&
203                 tok.skipOneTrailing(period) &&
204                 tok.suffix(majorDigit, CharacterSet::DIGIT) &&
205                 tok.skipSuffix(proto)) {
206             const bool multiDigits = majorDigit.length() > 1 || minorDigit.length() > 1;
207             // use '0.0' for unsupported multiple digit version numbers
208             const unsigned int major = multiDigits ? 0 : (*majorDigit.rawContent() - '0');
209             const unsigned int minor = multiDigits ? 0 : (*minorDigit.rawContent() - '0');
210             msgProtocol_ = Http::ProtocolVersion(major, minor);
211             return true;
212         }
213     }
214 
215     // A GET request might use HTTP/0.9 syntax
216     if (method_ == Http::METHOD_GET) {
217         // RFC 1945 - no HTTP version field at all
218         tok = savedTok; // in case the URI ends with a digit
219         // report this assumption as an error if configured to triage parsing
220         debugs(33, ErrorLevel(), "assuming HTTP/0.9 request-line");
221         msgProtocol_ = Http::ProtocolVersion(0,9);
222         return true;
223     }
224 
225     debugs(33, ErrorLevel(), "invalid request-line: not HTTP");
226     parseStatusCode = Http::scBadRequest;
227     return false;
228 }
229 
230 /**
231  * Skip characters separating request-line fields.
232  * To handle bidirectional parsing, the caller does the actual skipping and
233  * we just check how many character the caller has skipped.
234  */
235 bool
skipDelimiter(const size_t count,const char * where)236 Http::One::RequestParser::skipDelimiter(const size_t count, const char *where)
237 {
238     if (count <= 0) {
239         debugs(33, ErrorLevel(), "invalid request-line: missing delimiter " << where);
240         parseStatusCode = Http::scBadRequest;
241         return false;
242     }
243 
244     // tolerant parser allows multiple whitespace characters between request-line fields
245     if (count > 1 && !Config.onoff.relaxed_header_parser) {
246         debugs(33, ErrorLevel(), "invalid request-line: too many delimiters " << where);
247         parseStatusCode = Http::scBadRequest;
248         return false;
249     }
250 
251     return true;
252 }
253 
254 /// Parse CRs at the end of request-line, just before the terminating LF.
255 bool
skipTrailingCrs(Http1::Tokenizer & tok)256 Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok)
257 {
258     if (Config.onoff.relaxed_header_parser) {
259         (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK
260     } else {
261         if (!tok.skipOneTrailing(CharacterSet::CR)) {
262             debugs(33, ErrorLevel(), "invalid request-line: missing CR before LF");
263             parseStatusCode = Http::scBadRequest;
264             return false;
265         }
266     }
267     return true;
268 }
269 
270 /**
271  * Attempt to parse the first line of a new request message.
272  *
273  * Governed by:
274  *  RFC 1945 section 5.1
275  *  RFC 7230 section 2.6, 3.1 and 3.5
276  *
277  * \retval -1  an error occurred. parseStatusCode indicates HTTP status result.
278  * \retval  1  successful parse. member fields contain the request-line items
279  * \retval  0  more data is needed to complete the parse
280  */
281 int
parseRequestFirstLine()282 Http::One::RequestParser::parseRequestFirstLine()
283 {
284     debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
285     debugs(74, DBG_DATA, buf_);
286 
287     SBuf line;
288 
289     // Earlier, skipGarbageLines() took care of any leading LFs (if allowed).
290     // Now, the request line has to end at the first LF.
291     static const CharacterSet lineChars = CharacterSet::LF.complement("notLF");
292     ::Parser::Tokenizer lineTok(buf_);
293     if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) {
294         if (buf_.length() >= Config.maxRequestHeaderSize) {
295             /* who should we blame for our failure to parse this line? */
296 
297             Http1::Tokenizer methodTok(buf_);
298             if (!parseMethodField(methodTok))
299                 return -1; // blame a bad method (or its delimiter)
300 
301             // assume it is the URI
302             debugs(74, ErrorLevel(), "invalid request-line: URI exceeds " <<
303                    Config.maxRequestHeaderSize << "-byte limit");
304             parseStatusCode = Http::scUriTooLong;
305             return -1;
306         }
307         debugs(74, 5, "Parser needs more data");
308         return 0;
309     }
310 
311     Http1::Tokenizer tok(line);
312 
313     if (!parseMethodField(tok))
314         return -1;
315 
316     /* now parse backwards, to leave just the URI */
317     if (!skipTrailingCrs(tok))
318         return -1;
319 
320     if (!parseHttpVersionField(tok))
321         return -1;
322 
323     if (!http0() && !skipDelimiter(tok.skipAllTrailing(DelimiterCharacters()), "before protocol version"))
324         return -1;
325 
326     /* parsed everything before and after the URI */
327 
328     if (!parseUriField(tok))
329         return -1;
330 
331     if (!tok.atEnd()) {
332         debugs(33, ErrorLevel(), "invalid request-line: garbage after URI");
333         parseStatusCode = Http::scBadRequest;
334         return -1;
335     }
336 
337     parseStatusCode = Http::scOkay;
338     buf_ = lineTok.remaining(); // incremental parse checkpoint
339     return 1;
340 }
341 
342 bool
parse(const SBuf & aBuf)343 Http::One::RequestParser::parse(const SBuf &aBuf)
344 {
345     const bool result = doParse(aBuf);
346     if (preserveParsed_) {
347         assert(aBuf.length() >= remaining().length());
348         parsed_.append(aBuf.substr(0, aBuf.length() - remaining().length())); // newly parsed bytes
349     }
350 
351     return result;
352 }
353 
354 // raw is not a reference because a reference might point back to our own buf_ or parsed_
355 bool
doParse(const SBuf & aBuf)356 Http::One::RequestParser::doParse(const SBuf &aBuf)
357 {
358     buf_ = aBuf;
359     debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
360 
361     // stage 1: locate the request-line
362     if (parsingStage_ == HTTP_PARSE_NONE) {
363         skipGarbageLines();
364 
365         // if we hit something before EOS treat it as a message
366         if (!buf_.isEmpty())
367             parsingStage_ = HTTP_PARSE_FIRST;
368         else
369             return false;
370     }
371 
372     // stage 2: parse the request-line
373     if (parsingStage_ == HTTP_PARSE_FIRST) {
374         PROF_start(HttpParserParseReqLine);
375         const int retcode = parseRequestFirstLine();
376 
377         // first-line (or a look-alike) found successfully.
378         if (retcode > 0) {
379             parsingStage_ = HTTP_PARSE_MIME;
380         }
381 
382         debugs(74, 5, "request-line: retval " << retcode << ": line={" << aBuf.length() << ", data='" << aBuf << "'}");
383         debugs(74, 5, "request-line: method: " << method_);
384         debugs(74, 5, "request-line: url: " << uri_);
385         debugs(74, 5, "request-line: proto: " << msgProtocol_);
386         debugs(74, 5, "Parser: bytes processed=" << (aBuf.length()-buf_.length()));
387         PROF_stop(HttpParserParseReqLine);
388 
389         // syntax errors already
390         if (retcode < 0) {
391             parsingStage_ = HTTP_PARSE_DONE;
392             return false;
393         }
394     }
395 
396     // stage 3: locate the mime header block
397     if (parsingStage_ == HTTP_PARSE_MIME) {
398         // HTTP/1.x request-line is valid and parsing completed.
399         if (!grabMimeBlock("Request", Config.maxRequestHeaderSize)) {
400             if (parseStatusCode == Http::scHeaderTooLarge)
401                 parseStatusCode = Http::scRequestHeaderFieldsTooLarge;
402             return false;
403         }
404     }
405 
406     return !needsMoreData();
407 }
408 
409