1 /* Based on src/http/ngx_http_parse.c from NGINX copyright Igor Sysoev
2 *
3 * Additional changes are licensed under the same terms as NGINX and
4 * copyright Joyent, Inc. and other Node contributors. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * This is the http_parser_parse_url function extracted from joyent's
27 * http-parser library.
28 */
29
30 #include <assert.h>
31 #include <stdint.h>
32 #include <stdlib.h>
33
34 #include "url_parser.h"
35
/* Character-class helpers.
 *
 * Every macro parenthesizes its argument, so a compound expression (for
 * example a conditional) is classified as a whole. Arguments may be
 * evaluated more than once: pass expressions without side effects.
 * Some classes depend on HTTP_PARSER_STRICT.
 */
#define CR                  '\r'
#define LF                  '\n'

/* Sets bit 0x20, which maps ASCII 'A'..'Z' onto 'a'..'z'. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

/* NOTE: `tokens` is not defined in the part of this file reviewed here; this
 * macro (and TOKEN below) only expands successfully where such a table is in
 * scope.
 */
#define STRICT_TOKEN(c)     (tokens[(unsigned char)(c)])

/* Tests bit `i` of the byte array `a`: byte i / 8, bit i % 8. Yields 0 or 1. */
#ifndef BIT_AT
# define BIT_AT(a, i)                                                \
  (!!((unsigned int) (a)[(unsigned int) (i) >> 3] &                  \
   (1 << ((unsigned int) (i) & 7))))
#endif

/* T(v) wraps table bits that are only accepted in non-strict mode. */
#if HTTP_PARSER_STRICT
# define T(v) 0
#else
# define T(v) (v)
#endif

/* Bitmap of characters accepted by IS_URL_CHAR, indexed through BIT_AT: one
 * bit per character code, eight codes per byte. Only codes 0..127 are
 * initialized explicitly; the remaining bytes of the 32-byte array are zero.
 * Tab (9) and form feed (12) are set only in non-strict mode; space, '#',
 * '?' and DEL are never set.
 */
static const uint8_t normal_url_char[32] = {
/*   0 nul    1 soh    2 stx    3 etx    4 eot    5 enq    6 ack    7 bel  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*   8 bs     9 ht    10 nl    11 vt    12 np    13 cr    14 so    15 si   */
        0    | T(2)   |   0    |   0    | T(16)  |   0    |   0    |   0,
/*  16 dle   17 dc1   18 dc2   19 dc3   20 dc4   21 nak   22 syn   23 etb */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  24 can   25 em    26 sub   27 esc   28 fs    29 gs    30 rs    31 us  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  32 sp    33  !    34  "    35  #    36  $    37  %    38  &    39  '  */
        0    |   2    |   4    |   0    |   16   |   32   |   64   |  128,
/*  40  (    41  )    42  *    43  +    44  ,    45  -    46  .    47  /  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  48  0    49  1    50  2    51  3    52  4    53  5    54  6    55  7  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  56  8    57  9    58  :    59  ;    60  <    61  =    62  >    63  ?  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0,
/*  64  @    65  A    66  B    67  C    68  D    69  E    70  F    71  G  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  72  H    73  I    74  J    75  K    76  L    77  M    78  N    79  O  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  80  P    81  Q    82  R    83  S    84  T    85  U    86  V    87  W  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  88  X    89  Y    90  Z    91  [    92  \    93  ]    94  ^    95  _  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  96  `    97  a    98  b    99  c   100  d   101  e   102  f   103  g  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 104  h   105  i   106  j   107  k   108  l   109  m   110  n   111  o  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 112  p   113  q   114  r   115  s   116  t   117  u   118  v   119  w  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 120  x   121  y   122  z   123  {   124  |   125  }   126  ~   127 del */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0, };

/* Mode-dependent classes.
 *   strict:     URL characters are exactly the bits set in normal_url_char;
 *               host characters are alphanumerics, '.' and '-'.
 *   non-strict: any byte with the high bit set is also a URL character, and
 *               '_' is also a host character.
 */
#if HTTP_PARSER_STRICT
#define TOKEN(c)            (tokens[(unsigned char)(c)])
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
#define TOKEN(c)            (((c) == ' ') ? ' ' : tokens[(unsigned char)(c)])
#define IS_URL_CHAR(c)                                                         \
  (BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
110
111
112
/* Parser states, numbered consecutively from 1.
 *
 * The functions in this file only use s_dead and the s_req_* URL states
 * (s_req_spaces_before_url .. s_req_fragment). The other members are kept in
 * their original order so that every state keeps its numeric value.
 */
enum state {
  s_dead = 1,  /* important that this is > 0 */

  s_start_req_or_res,
  s_res_or_resp_H,
  s_start_res,
  s_res_H,
  s_res_HT,
  s_res_HTT,
  s_res_HTTP,
  s_res_first_http_major,
  s_res_http_major,
  s_res_first_http_minor,
  s_res_http_minor,
  s_res_first_status_code,
  s_res_status_code,
  s_res_status_start,
  s_res_status,
  s_res_line_almost_done,

  s_start_req,

  s_req_method,
  s_req_spaces_before_url,
  s_req_schema,
  s_req_schema_slash,
  s_req_schema_slash_slash,
  s_req_server_start,
  s_req_server,
  s_req_server_with_at,
  s_req_path,
  s_req_query_string_start,
  s_req_query_string,
  s_req_fragment_start,
  s_req_fragment,
  s_req_http_start,
  s_req_http_H,
  s_req_http_HT,
  s_req_http_HTT,
  s_req_http_HTTP,
  s_req_first_http_major,
  s_req_http_major,
  s_req_first_http_minor,
  s_req_http_minor,
  s_req_line_almost_done,

  s_header_field_start,
  s_header_field,
  s_header_value_discard_ws,
  s_header_value_discard_ws_almost_done,
  s_header_value_discard_lws,
  s_header_value_start,
  s_header_value,
  s_header_value_lws,

  s_header_almost_done,

  s_chunk_size_start,
  s_chunk_size,
  s_chunk_parameters,
  s_chunk_size_almost_done,

  s_headers_almost_done,
  s_headers_done,

  /* Important: 's_headers_done' must be the last 'header' state; every state
   * after it must be a 'body' state. The ordering is used for overflow
   * checking by the PARSING_HEADER() macro (not defined in this file).
   */

  s_chunk_data,
  s_chunk_data_almost_done,
  s_chunk_data_done,

  s_body_identity,
  s_body_identity_eof,

  s_message_done
};
192
193
/* States of the authority ("userinfo@host:port") sub-parser, numbered
 * consecutively from 1. s_http_host_dead is the rejection state.
 */
enum http_host_state {
  s_http_host_dead = 1,
  s_http_userinfo_start,
  s_http_userinfo,
  s_http_host_start,
  s_http_host_v6_start,
  s_http_host,
  s_http_host_v6,
  s_http_host_v6_end,
  s_http_host_v6_zone_start,
  s_http_host_v6_zone,
  s_http_host_port_start,
  s_http_host_port
};
209
210 /* Our URL parser.
211 *
212 * This is designed to be shared by http_parser_execute() for URL validation,
213 * hence it has a state transition + byte-for-byte interface. In addition, it
214 * is meant to be embedded in http_parser_parse_url(), which does the dirty
215 * work of turning state transitions URL components for its API.
216 *
217 * This function should only be invoked with non-space characters. It is
218 * assumed that the caller cares about (and can detect) the transition between
219 * URL and non-URL states by looking for these.
220 */
221 static enum state
parse_url_char(enum state s,const char ch)222 parse_url_char(enum state s, const char ch)
223 {
224 if (ch == ' ' || ch == '\r' || ch == '\n') {
225 return s_dead;
226 }
227
228 #if HTTP_PARSER_STRICT
229 if (ch == '\t' || ch == '\f') {
230 return s_dead;
231 }
232 #endif
233
234 switch (s) {
235 case s_req_spaces_before_url:
236 /* Proxied requests are followed by scheme of an absolute URI (alpha).
237 * All methods except CONNECT are followed by '/' or '*'.
238 */
239
240 if (ch == '/' || ch == '*') {
241 return s_req_path;
242 }
243
244 if (IS_ALPHA(ch)) {
245 return s_req_schema;
246 }
247
248 break;
249
250 case s_req_schema:
251 if (IS_ALPHA(ch)) {
252 return s;
253 }
254
255 if (ch == ':') {
256 return s_req_schema_slash;
257 }
258
259 break;
260
261 case s_req_schema_slash:
262 if (ch == '/') {
263 return s_req_schema_slash_slash;
264 }
265
266 break;
267
268 case s_req_schema_slash_slash:
269 if (ch == '/') {
270 return s_req_server_start;
271 }
272
273 break;
274
275 case s_req_server_with_at:
276 if (ch == '@') {
277 return s_dead;
278 }
279
280 /* FALLTHROUGH */
281 case s_req_server_start:
282 case s_req_server:
283 if (ch == '/') {
284 return s_req_path;
285 }
286
287 if (ch == '?') {
288 return s_req_query_string_start;
289 }
290
291 if (ch == '@') {
292 return s_req_server_with_at;
293 }
294
295 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
296 return s_req_server;
297 }
298
299 break;
300
301 case s_req_path:
302 if (IS_URL_CHAR(ch)) {
303 return s;
304 }
305
306 switch (ch) {
307 case '?':
308 return s_req_query_string_start;
309
310 case '#':
311 return s_req_fragment_start;
312 }
313
314 break;
315
316 case s_req_query_string_start:
317 case s_req_query_string:
318 if (IS_URL_CHAR(ch)) {
319 return s_req_query_string;
320 }
321
322 switch (ch) {
323 case '?':
324 /* allow extra '?' in query string */
325 return s_req_query_string;
326
327 case '#':
328 return s_req_fragment_start;
329 }
330
331 break;
332
333 case s_req_fragment_start:
334 if (IS_URL_CHAR(ch)) {
335 return s_req_fragment;
336 }
337
338 switch (ch) {
339 case '?':
340 return s_req_fragment;
341
342 case '#':
343 return s;
344 }
345
346 break;
347
348 case s_req_fragment:
349 if (IS_URL_CHAR(ch)) {
350 return s;
351 }
352
353 switch (ch) {
354 case '?':
355 case '#':
356 return s;
357 }
358
359 break;
360
361 default:
362 break;
363 }
364
365 /* We should never fall out of the switch above unless there's an error */
366 return s_dead;
367 }
368
369 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)370 http_parse_host_char(enum http_host_state s, const char ch) {
371 switch(s) {
372 case s_http_userinfo:
373 case s_http_userinfo_start:
374 if (ch == '@') {
375 return s_http_host_start;
376 }
377
378 if (IS_USERINFO_CHAR(ch)) {
379 return s_http_userinfo;
380 }
381 break;
382
383 case s_http_host_start:
384 if (ch == '[') {
385 return s_http_host_v6_start;
386 }
387
388 if (IS_HOST_CHAR(ch)) {
389 return s_http_host;
390 }
391
392 break;
393
394 case s_http_host:
395 if (IS_HOST_CHAR(ch)) {
396 return s_http_host;
397 }
398
399 /* FALLTHROUGH */
400 case s_http_host_v6_end:
401 if (ch == ':') {
402 return s_http_host_port_start;
403 }
404
405 break;
406
407 case s_http_host_v6:
408 if (ch == ']') {
409 return s_http_host_v6_end;
410 }
411
412 /* FALLTHROUGH */
413 case s_http_host_v6_start:
414 if (IS_HEX(ch) || ch == ':' || ch == '.') {
415 return s_http_host_v6;
416 }
417
418 if (s == s_http_host_v6 && ch == '%') {
419 return s_http_host_v6_zone_start;
420 }
421 break;
422
423 case s_http_host_v6_zone:
424 if (ch == ']') {
425 return s_http_host_v6_end;
426 }
427
428 /* FALLTHROUGH */
429 case s_http_host_v6_zone_start:
430 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
431 if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' ||
432 ch == '~') {
433 return s_http_host_v6_zone;
434 }
435 break;
436
437 case s_http_host_port:
438 case s_http_host_port_start:
439 if (IS_NUM(ch)) {
440 return s_http_host_port;
441 }
442
443 break;
444
445 default:
446 break;
447 }
448 return s_http_host_dead;
449 }
450
451 static int
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)452 http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
453 assert(u->field_set & (1 << UF_HOST));
454 enum http_host_state s;
455
456 const char *p;
457 size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
458
459 u->field_data[UF_HOST].len = 0;
460
461 s = found_at ? s_http_userinfo_start : s_http_host_start;
462
463 for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
464 enum http_host_state new_s = http_parse_host_char(s, *p);
465
466 if (new_s == s_http_host_dead) {
467 return 1;
468 }
469
470 switch(new_s) {
471 case s_http_host:
472 if (s != s_http_host) {
473 u->field_data[UF_HOST].off = p - buf;
474 }
475 u->field_data[UF_HOST].len++;
476 break;
477
478 case s_http_host_v6:
479 if (s != s_http_host_v6) {
480 u->field_data[UF_HOST].off = p - buf;
481 }
482 u->field_data[UF_HOST].len++;
483 break;
484
485 case s_http_host_v6_zone_start:
486 case s_http_host_v6_zone:
487 u->field_data[UF_HOST].len++;
488 break;
489
490 case s_http_host_port:
491 if (s != s_http_host_port) {
492 u->field_data[UF_PORT].off = p - buf;
493 u->field_data[UF_PORT].len = 0;
494 u->field_set |= (1 << UF_PORT);
495 }
496 u->field_data[UF_PORT].len++;
497 break;
498
499 case s_http_userinfo:
500 if (s != s_http_userinfo) {
501 u->field_data[UF_USERINFO].off = p - buf ;
502 u->field_data[UF_USERINFO].len = 0;
503 u->field_set |= (1 << UF_USERINFO);
504 }
505 u->field_data[UF_USERINFO].len++;
506 break;
507
508 default:
509 break;
510 }
511 s = new_s;
512 }
513
514 /* Make sure we don't end somewhere unexpected */
515 switch (s) {
516 case s_http_host_start:
517 case s_http_host_v6_start:
518 case s_http_host_v6:
519 case s_http_host_v6_zone_start:
520 case s_http_host_v6_zone:
521 case s_http_host_port_start:
522 case s_http_userinfo:
523 case s_http_userinfo_start:
524 return 1;
525 default:
526 break;
527 }
528
529 return 0;
530 }
531
532
533 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)534 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
535 struct http_parser_url *u)
536 {
537 enum state s;
538 const char *p;
539 enum http_parser_url_fields uf, old_uf;
540 int found_at = 0;
541
542 u->port = u->field_set = 0;
543 s = is_connect ? s_req_server_start : s_req_spaces_before_url;
544 old_uf = UF_MAX;
545
546 for (p = buf; p < buf + buflen; p++) {
547 s = parse_url_char(s, *p);
548
549 /* Figure out the next field that we're operating on */
550 switch (s) {
551 case s_dead:
552 return 1;
553
554 /* Skip delimeters */
555 case s_req_schema_slash:
556 case s_req_schema_slash_slash:
557 case s_req_server_start:
558 case s_req_query_string_start:
559 case s_req_fragment_start:
560 continue;
561
562 case s_req_schema:
563 uf = UF_SCHEMA;
564 break;
565
566 case s_req_server_with_at:
567 found_at = 1;
568
569 /* FALLTROUGH */
570 case s_req_server:
571 uf = UF_HOST;
572 break;
573
574 case s_req_path:
575 uf = UF_PATH;
576 break;
577
578 case s_req_query_string:
579 uf = UF_QUERY;
580 break;
581
582 case s_req_fragment:
583 uf = UF_FRAGMENT;
584 break;
585
586 default:
587 assert(!"Unexpected state");
588 return 1;
589 }
590
591 /* Nothing's changed; soldier on */
592 if (uf == old_uf) {
593 u->field_data[uf].len++;
594 continue;
595 }
596
597 u->field_data[uf].off = p - buf;
598 u->field_data[uf].len = 1;
599
600 u->field_set |= (1 << uf);
601 old_uf = uf;
602 }
603
604 /* host must be present if there is a schema */
605 /* parsing http:///toto will fail */
606 if ((u->field_set & (1 << UF_SCHEMA)) &&
607 (u->field_set & (1 << UF_HOST)) == 0) {
608 return 1;
609 }
610
611 if (u->field_set & (1 << UF_HOST)) {
612 if (http_parse_host(buf, u, found_at) != 0) {
613 return 1;
614 }
615 }
616
617 /* CONNECT requests can only contain "hostname:port" */
618 if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
619 return 1;
620 }
621
622 if (u->field_set & (1 << UF_PORT)) {
623 /* Don't bother with endp; we've already validated the string */
624 unsigned long v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10);
625
626 /* Ports have a max value of 2^16 */
627 if (v > 0xffff) {
628 return 1;
629 }
630
631 u->port = (uint16_t) v;
632 }
633
634 return 0;
635 }
636