1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 using System.Diagnostics;
6 using System.Diagnostics.Contracts;
7 using System.Globalization;
8 using System.Text;
9 
10 namespace System.Net.Http
11 {
12     internal static class HttpRuleParser
13     {
14         private static readonly bool[] s_tokenChars = CreateTokenChars();
15         private const int maxNestedCount = 5;
16         private static readonly string[] s_dateFormats = new string[] {
17             // "r", // RFC 1123, required output format but too strict for input
18             "ddd, d MMM yyyy H:m:s 'GMT'", // RFC 1123 (r, except it allows both 1 and 01 for date and time)
19             "ddd, d MMM yyyy H:m:s", // RFC 1123, no zone - assume GMT
20             "d MMM yyyy H:m:s 'GMT'", // RFC 1123, no day-of-week
21             "d MMM yyyy H:m:s", // RFC 1123, no day-of-week, no zone
22             "ddd, d MMM yy H:m:s 'GMT'", // RFC 1123, short year
23             "ddd, d MMM yy H:m:s", // RFC 1123, short year, no zone
24             "d MMM yy H:m:s 'GMT'", // RFC 1123, no day-of-week, short year
25             "d MMM yy H:m:s", // RFC 1123, no day-of-week, short year, no zone
26 
27             "dddd, d'-'MMM'-'yy H:m:s 'GMT'", // RFC 850
28             "dddd, d'-'MMM'-'yy H:m:s", // RFC 850 no zone
29             "ddd MMM d H:m:s yyyy", // ANSI C's asctime() format
30 
31             "ddd, d MMM yyyy H:m:s zzz", // RFC 5322
32             "ddd, d MMM yyyy H:m:s", // RFC 5322 no zone
33             "d MMM yyyy H:m:s zzz", // RFC 5322 no day-of-week
34             "d MMM yyyy H:m:s", // RFC 5322 no day-of-week, no zone
35         };
36 
37         internal const char CR = (char)13;
38         internal const char LF = (char)10;
39         internal const int MaxInt64Digits = 19;
40         internal const int MaxInt32Digits = 10;
41 
42         // iso-8859-1, Western European (ISO)
43 #if uap
44         internal static readonly Encoding DefaultHttpEncoding = Encoding.GetEncoding("iso-8859-1");
45 #else
46         internal static readonly Encoding DefaultHttpEncoding = Encoding.GetEncoding(28591);
47 #endif
48 
CreateTokenChars()49         private static bool[] CreateTokenChars()
50         {
51             // token = 1*<any CHAR except CTLs or separators>
52             // CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
53 
54             var tokenChars = new bool[128]; // All elements default to "false".
55 
56             for (int i = 33; i < 127; i++) // Skip Space (32) & DEL (127).
57             {
58                 tokenChars[i] = true;
59             }
60 
61             // Remove separators: these are not valid token characters.
62             tokenChars[(byte)'('] = false;
63             tokenChars[(byte)')'] = false;
64             tokenChars[(byte)'<'] = false;
65             tokenChars[(byte)'>'] = false;
66             tokenChars[(byte)'@'] = false;
67             tokenChars[(byte)','] = false;
68             tokenChars[(byte)';'] = false;
69             tokenChars[(byte)':'] = false;
70             tokenChars[(byte)'\\'] = false;
71             tokenChars[(byte)'"'] = false;
72             tokenChars[(byte)'/'] = false;
73             tokenChars[(byte)'['] = false;
74             tokenChars[(byte)']'] = false;
75             tokenChars[(byte)'?'] = false;
76             tokenChars[(byte)'='] = false;
77             tokenChars[(byte)'{'] = false;
78             tokenChars[(byte)'}'] = false;
79 
80             return tokenChars;
81         }
82 
IsTokenChar(char character)83         internal static bool IsTokenChar(char character)
84         {
85             // Must be between 'space' (32) and 'DEL' (127).
86             if (character > 127)
87             {
88                 return false;
89             }
90 
91             return s_tokenChars[character];
92         }
93 
94         [Pure]
GetTokenLength(string input, int startIndex)95         internal static int GetTokenLength(string input, int startIndex)
96         {
97             Debug.Assert(input != null);
98             Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
99 
100             if (startIndex >= input.Length)
101             {
102                 return 0;
103             }
104 
105             int current = startIndex;
106 
107             while (current < input.Length)
108             {
109                 if (!IsTokenChar(input[current]))
110                 {
111                     return current - startIndex;
112                 }
113                 current++;
114             }
115             return input.Length - startIndex;
116         }
117 
118         [Pure]
IsToken(string input)119         internal static bool IsToken(string input)
120         {
121             for (int i = 0; i < input.Length; i++)
122             {
123                 if (!IsTokenChar(input[i]))
124                 {
125                     return false;
126                 }
127             }
128 
129             return true;
130         }
131 
132         [Pure]
IsToken(ReadOnlySpan<byte> input)133         internal static bool IsToken(ReadOnlySpan<byte> input)
134         {
135             for (int i = 0; i < input.Length; i++)
136             {
137                 if (!IsTokenChar((char) input[i]))
138                 {
139                     return false;
140                 }
141             }
142 
143             return true;
144         }
145 
GetWhitespaceLength(string input, int startIndex)146         internal static int GetWhitespaceLength(string input, int startIndex)
147         {
148             Debug.Assert(input != null);
149             Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
150 
151             if (startIndex >= input.Length)
152             {
153                 return 0;
154             }
155 
156             int current = startIndex;
157 
158             char c;
159             while (current < input.Length)
160             {
161                 c = input[current];
162 
163                 if ((c == ' ') || (c == '\t'))
164                 {
165                     current++;
166                     continue;
167                 }
168 
169                 if (c == '\r')
170                 {
171                     // If we have a #13 char, it must be followed by #10 and then at least one SP or HT.
172                     if ((current + 2 < input.Length) && (input[current + 1] == '\n'))
173                     {
174                         char spaceOrTab = input[current + 2];
175                         if ((spaceOrTab == ' ') || (spaceOrTab == '\t'))
176                         {
177                             current += 3;
178                             continue;
179                         }
180                     }
181                 }
182 
183                 return current - startIndex;
184             }
185 
186             // All characters between startIndex and the end of the string are LWS characters.
187             return input.Length - startIndex;
188         }
189 
ContainsInvalidNewLine(string value)190         internal static bool ContainsInvalidNewLine(string value)
191         {
192             return ContainsInvalidNewLine(value, 0);
193         }
194 
ContainsInvalidNewLine(string value, int startIndex)195         internal static bool ContainsInvalidNewLine(string value, int startIndex)
196         {
197             // Search for newlines followed by non-whitespace: This is not allowed in any header (be it a known or
198             // custom header). E.g. "value\r\nbadformat: header" is invalid. However "value\r\n goodformat: header"
199             // is valid: newlines followed by whitespace are allowed in header values.
200             int current = startIndex;
201             while (current < value.Length)
202             {
203                 if (value[current] == '\r')
204                 {
205                     int char10Index = current + 1;
206                     if ((char10Index < value.Length) && (value[char10Index] == '\n'))
207                     {
208                         current = char10Index + 1;
209 
210                         if (current == value.Length)
211                         {
212                             return true; // We have a string terminating with \r\n. This is invalid.
213                         }
214 
215                         char c = value[current];
216                         if ((c != ' ') && (c != '\t'))
217                         {
218                             return true;
219                         }
220                     }
221                 }
222                 current++;
223             }
224 
225             return false;
226         }
227 
GetNumberLength(string input, int startIndex, bool allowDecimal)228         internal static int GetNumberLength(string input, int startIndex, bool allowDecimal)
229         {
230             Debug.Assert(input != null);
231             Debug.Assert((startIndex >= 0) && (startIndex < input.Length));
232             Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
233 
234             int current = startIndex;
235             char c;
236 
237             // If decimal values are not allowed, we pretend to have read the '.' character already. I.e. if a dot is
238             // found in the string, parsing will be aborted.
239             bool haveDot = !allowDecimal;
240 
241             // The RFC doesn't allow decimal values starting with dot. I.e. value ".123" is invalid. It must be in the
242             // form "0.123". Also, there are no negative values defined in the RFC. So we'll just parse non-negative
243             // values.
244             // The RFC only allows decimal dots not ',' characters as decimal separators. Therefore value "1,23" is
245             // considered invalid and must be represented as "1.23".
246             if (input[current] == '.')
247             {
248                 return 0;
249             }
250 
251             while (current < input.Length)
252             {
253                 c = input[current];
254                 if ((c >= '0') && (c <= '9'))
255                 {
256                     current++;
257                 }
258                 else if (!haveDot && (c == '.'))
259                 {
260                     // Note that value "1." is valid.
261                     haveDot = true;
262                     current++;
263                 }
264                 else
265                 {
266                     break;
267                 }
268             }
269 
270             return current - startIndex;
271         }
272 
GetHostLength(string input, int startIndex, bool allowToken, out string host)273         internal static int GetHostLength(string input, int startIndex, bool allowToken, out string host)
274         {
275             Debug.Assert(input != null);
276             Debug.Assert(startIndex >= 0);
277             Contract.Ensures((Contract.Result<int>() >= 0) && (Contract.Result<int>() <= (input.Length - startIndex)));
278 
279             host = null;
280             if (startIndex >= input.Length)
281             {
282                 return 0;
283             }
284 
285             // A 'host' is either a token (if 'allowToken' == true) or a valid host name as defined by the URI RFC.
286             // So we first iterate through the string and search for path delimiters and whitespace. When found, stop
287             // and try to use the substring as token or URI host name. If it works, we have a host name, otherwise not.
288             int current = startIndex;
289             bool isToken = true;
290             while (current < input.Length)
291             {
292                 char c = input[current];
293                 if (c == '/')
294                 {
295                     return 0; // Host header must not contain paths.
296                 }
297 
298                 if ((c == ' ') || (c == '\t') || (c == '\r') || (c == ','))
299                 {
300                     break; // We hit a delimiter (',' or whitespace). Stop here.
301                 }
302 
303                 isToken = isToken && IsTokenChar(c);
304 
305                 current++;
306             }
307 
308             int length = current - startIndex;
309             if (length == 0)
310             {
311                 return 0;
312             }
313 
314             string result = input.Substring(startIndex, length);
315             if ((!allowToken || !isToken) && !IsValidHostName(result))
316             {
317                 return 0;
318             }
319 
320             host = result;
321             return length;
322         }
323 
GetCommentLength(string input, int startIndex, out int length)324         internal static HttpParseResult GetCommentLength(string input, int startIndex, out int length)
325         {
326             int nestedCount = 0;
327             return GetExpressionLength(input, startIndex, '(', ')', true, ref nestedCount, out length);
328         }
329 
GetQuotedStringLength(string input, int startIndex, out int length)330         internal static HttpParseResult GetQuotedStringLength(string input, int startIndex, out int length)
331         {
332             int nestedCount = 0;
333             return GetExpressionLength(input, startIndex, '"', '"', false, ref nestedCount, out length);
334         }
335 
336         // quoted-pair = "\" CHAR
337         // CHAR = <any US-ASCII character (octets 0 - 127)>
GetQuotedPairLength(string input, int startIndex, out int length)338         internal static HttpParseResult GetQuotedPairLength(string input, int startIndex, out int length)
339         {
340             Debug.Assert(input != null);
341             Debug.Assert((startIndex >= 0) && (startIndex < input.Length));
342             Contract.Ensures((Contract.ValueAtReturn(out length) >= 0) &&
343                 (Contract.ValueAtReturn(out length) <= (input.Length - startIndex)));
344 
345             length = 0;
346 
347             if (input[startIndex] != '\\')
348             {
349                 return HttpParseResult.NotParsed;
350             }
351 
352             // Quoted-char has 2 characters. Check whether there are 2 chars left ('\' + char)
353             // If so, check whether the character is in the range 0-127. If not, it's an invalid value.
354             if ((startIndex + 2 > input.Length) || (input[startIndex + 1] > 127))
355             {
356                 return HttpParseResult.InvalidFormat;
357             }
358 
359             // It doesn't matter what the char next to '\' is so we can skip along.
360             length = 2;
361             return HttpParseResult.Parsed;
362         }
363 
DateToString(DateTimeOffset dateTime)364         internal static string DateToString(DateTimeOffset dateTime)
365         {
366             // Format according to RFC1123; 'r' uses invariant info (DateTimeFormatInfo.InvariantInfo).
367             return dateTime.ToUniversalTime().ToString("r", CultureInfo.InvariantCulture);
368         }
369 
TryStringToDate(string input, out DateTimeOffset result)370         internal static bool TryStringToDate(string input, out DateTimeOffset result)
371         {
372             // Try the various date formats in the order listed above.
373             // We should accept a wide variety of common formats, but only output RFC 1123 style dates.
374             if (DateTimeOffset.TryParseExact(input, s_dateFormats, DateTimeFormatInfo.InvariantInfo,
375                 DateTimeStyles.AllowWhiteSpaces | DateTimeStyles.AssumeUniversal, out result))
376             {
377                 return true;
378             }
379 
380             return false;
381         }
382 
383         // TEXT = <any OCTET except CTLs, but including LWS>
384         // LWS = [CRLF] 1*( SP | HT )
385         // CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
386         //
387         // Since we don't really care about the content of a quoted string or comment, we're more tolerant and
388         // allow these characters. We only want to find the delimiters ('"' for quoted string and '(', ')' for comment).
389         //
390         // 'nestedCount': Comments can be nested. We allow a depth of up to 5 nested comments, i.e. something like
391         // "(((((comment)))))". If we wouldn't define a limit an attacker could send a comment with hundreds of nested
392         // comments, resulting in a stack overflow exception. In addition having more than 1 nested comment (if any)
393         // is unusual.
GetExpressionLength(string input, int startIndex, char openChar, char closeChar, bool supportsNesting, ref int nestedCount, out int length)394         private static HttpParseResult GetExpressionLength(string input, int startIndex, char openChar,
395             char closeChar, bool supportsNesting, ref int nestedCount, out int length)
396         {
397             Debug.Assert(input != null);
398             Debug.Assert((startIndex >= 0) && (startIndex < input.Length));
399             Contract.Ensures((Contract.Result<HttpParseResult>() != HttpParseResult.Parsed) ||
400                 (Contract.ValueAtReturn<int>(out length) > 0));
401 
402             length = 0;
403 
404             if (input[startIndex] != openChar)
405             {
406                 return HttpParseResult.NotParsed;
407             }
408 
409             int current = startIndex + 1; // Start parsing with the character next to the first open-char.
410             while (current < input.Length)
411             {
412                 // Only check whether we have a quoted char, if we have at least 3 characters left to read (i.e.
413                 // quoted char + closing char). Otherwise the closing char may be considered part of the quoted char.
414                 int quotedPairLength = 0;
415                 if ((current + 2 < input.Length) &&
416                     (GetQuotedPairLength(input, current, out quotedPairLength) == HttpParseResult.Parsed))
417                 {
418                     // We ignore invalid quoted-pairs. Invalid quoted-pairs may mean that it looked like a quoted pair,
419                     // but we actually have a quoted-string: e.g. "\ü" ('\' followed by a char >127 - quoted-pair only
420                     // allows ASCII chars after '\'; qdtext allows both '\' and >127 chars).
421                     current = current + quotedPairLength;
422                     continue;
423                 }
424 
425                 // If we support nested expressions and we find an open-char, then parse the nested expressions.
426                 if (supportsNesting && (input[current] == openChar))
427                 {
428                     nestedCount++;
429                     try
430                     {
431                         // Check if we exceeded the number of nested calls.
432                         if (nestedCount > maxNestedCount)
433                         {
434                             return HttpParseResult.InvalidFormat;
435                         }
436 
437                         int nestedLength = 0;
438                         HttpParseResult nestedResult = GetExpressionLength(input, current, openChar, closeChar,
439                             supportsNesting, ref nestedCount, out nestedLength);
440 
441                         switch (nestedResult)
442                         {
443                             case HttpParseResult.Parsed:
444                                 current += nestedLength; // Add the length of the nested expression and continue.
445                                 break;
446 
447                             case HttpParseResult.NotParsed:
448                                 Debug.Assert(false, "'NotParsed' is unexpected: We started nested expression " +
449                                     "parsing, because we found the open-char. So either it's a valid nested " +
450                                     "expression or it has invalid format.");
451                                 break;
452 
453                             case HttpParseResult.InvalidFormat:
454                                 // If the nested expression is invalid, we can't continue, so we fail with invalid format.
455                                 return HttpParseResult.InvalidFormat;
456 
457                             default:
458                                 Debug.Assert(false, "Unknown enum result: " + nestedResult);
459                                 break;
460                         }
461                     }
462                     finally
463                     {
464                         nestedCount--;
465                     }
466                 }
467 
468                 if (input[current] == closeChar)
469                 {
470                     length = current - startIndex + 1;
471                     return HttpParseResult.Parsed;
472                 }
473                 current++;
474             }
475 
476             // We didn't find the final quote, therefore we have an invalid expression string.
477             return HttpParseResult.InvalidFormat;
478         }
479 
IsValidHostName(string host)480         private static bool IsValidHostName(string host)
481         {
482             // Also add user info (u@) to make sure 'host' doesn't include user info.
483             Uri hostUri;
484             return Uri.TryCreate("http://u@" + host + "/", UriKind.Absolute, out hostUri);
485         }
486     }
487 }
488