1 /* Based on src/http/ngx_http_parse.c from NGINX copyright Igor Sysoev
2  *
3  * Additional changes are licensed under the same terms as NGINX and
4  * copyright Joyent, Inc. and other Node contributors. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 /**
26  * This is the http_parser_parse_url function extracted from joyent's
27  * http-parser library.
28  */
29 
30 #include <assert.h>
31 #include <stdint.h>
32 #include <stdlib.h>
33 
34 #include "url_parser.h"
35 
/* Character-class helper macros.
 *
 * Every macro argument is parenthesized in the expansion so that passing an
 * expression (e.g. a conditional or an arithmetic expression) classifies the
 * value of the whole expression. Several macros evaluate their argument more
 * than once, so do not pass expressions with side effects.
 */
#define CR                  '\r'
#define LF                  '\n'
/* ASCII lower-casing by setting bit 0x20; only meaningful for letters. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
/* Characters accepted in the userinfo part of an authority. */
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

/* NOTE(review): the `tokens` table is not defined in the visible part of this
 * file; this macro only compiles where such a table is in scope. */
#define STRICT_TOKEN(c)     (tokens[(unsigned char)(c)])

/* BIT_AT(a, i): 1 if bit (i & 7) of byte a[i >> 3] is set, 0 otherwise. */
#ifndef BIT_AT
# define BIT_AT(a, i)                                                \
  (!!((unsigned int) (a)[(unsigned int) (i) >> 3] &                  \
   (1 << ((unsigned int) (i) & 7))))
#endif
58 
/* T(v) marks table entries that are accepted only in non-strict mode: it
 * expands to 0 when HTTP_PARSER_STRICT is non-zero and to v otherwise.
 */
#if HTTP_PARSER_STRICT
# define T(v) 0
#else
# define T(v) v
#endif

/* Bitmap of the characters accepted by IS_URL_CHAR().
 *
 * The bit for character code i is bit (i & 7) of byte (i >> 3), which is how
 * BIT_AT() reads it. Each initializer row below is one byte covering eight
 * consecutive character codes; the comment above each row names them.
 *
 * Cleared bits: every control character, space, '#', '?' and DEL. Tab (9) and
 * form feed (12) are set only in non-strict mode, via T().
 *
 * Only the first 16 bytes (codes 0-127) are listed; the remaining 16 elements
 * are implicitly zero-initialized.
 */
static const uint8_t normal_url_char[32] = {
/*   0 nul    1 soh    2 stx    3 etx    4 eot    5 enq    6 ack    7 bel  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*   8 bs     9 ht    10 nl    11 vt    12 np    13 cr    14 so    15 si   */
        0    | T(2)   |   0    |   0    | T(16)  |   0    |   0    |   0,
/*  16 dle   17 dc1   18 dc2   19 dc3   20 dc4   21 nak   22 syn   23 etb */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  24 can   25 em    26 sub   27 esc   28 fs    29 gs    30 rs    31 us  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  32 sp    33  !    34  "    35  #    36  $    37  %    38  &    39  '  */
        0    |   2    |   4    |   0    |   16   |   32   |   64   |  128,
/*  40  (    41  )    42  *    43  +    44  ,    45  -    46  .    47  /  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  48  0    49  1    50  2    51  3    52  4    53  5    54  6    55  7  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  56  8    57  9    58  :    59  ;    60  <    61  =    62  >    63  ?  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0,
/*  64  @    65  A    66  B    67  C    68  D    69  E    70  F    71  G  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  72  H    73  I    74  J    75  K    76  L    77  M    78  N    79  O  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  80  P    81  Q    82  R    83  S    84  T    85  U    86  V    87  W  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  88  X    89  Y    90  Z    91  [    92  \    93  ]    94  ^    95  _  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  96  `    97  a    98  b    99  c   100  d   101  e   102  f   103  g  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 104  h   105  i   106  j   107  k   108  l   109  m   110  n   111  o  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 112  p   113  q   114  r   115  s   116  t   117  u   118  v   119  w  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 120  x   121  y   122  z   123  {   124  |   125  }   126  ~   127 del */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0, };
98 
/* Mode-dependent character classes.
 *
 * Strict mode: a URL character is exactly one whose bit is set in
 * normal_url_char, and host names do not accept '_'.
 * Non-strict mode: any byte with the high bit (0x80) set also counts as a URL
 * character, host names additionally accept '_', and TOKEN() maps a space to
 * itself instead of consulting the table.
 *
 * Arguments are parenthesized in every expansion so that passing an
 * expression classifies the value of the whole expression. The argument may
 * be evaluated more than once; do not pass expressions with side effects.
 *
 * NOTE(review): the `tokens` table used by TOKEN() is not defined in the
 * visible part of this file.
 */
#if HTTP_PARSER_STRICT
#define TOKEN(c)            (tokens[(unsigned char)(c)])
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
#define TOKEN(c)            (((c) == ' ') ? ' ' : tokens[(unsigned char)(c)])
#define IS_URL_CHAR(c)                                                         \
  (BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
110 
111 
112 
/* Parser states.
 *
 * The enumerators are numbered consecutively starting at 1. Within this file
 * only s_dead and the s_req_spaces_before_url .. s_req_fragment range are
 * used (by parse_url_char() and http_parser_parse_url()); the order of all
 * enumerators is kept so the numeric values stay unchanged.
 */
enum state {
  s_dead = 1,                 /* must remain greater than zero */

  s_start_req_or_res,
  s_res_or_resp_H,
  s_start_res,
  s_res_H,
  s_res_HT,
  s_res_HTT,
  s_res_HTTP,
  s_res_first_http_major,
  s_res_http_major,
  s_res_first_http_minor,
  s_res_http_minor,
  s_res_first_status_code,
  s_res_status_code,
  s_res_status_start,
  s_res_status,
  s_res_line_almost_done,

  s_start_req,

  s_req_method,
  s_req_spaces_before_url,
  s_req_schema,
  s_req_schema_slash,
  s_req_schema_slash_slash,
  s_req_server_start,
  s_req_server,
  s_req_server_with_at,
  s_req_path,
  s_req_query_string_start,
  s_req_query_string,
  s_req_fragment_start,
  s_req_fragment,
  s_req_http_start,
  s_req_http_H,
  s_req_http_HT,
  s_req_http_HTT,
  s_req_http_HTTP,
  s_req_first_http_major,
  s_req_http_major,
  s_req_first_http_minor,
  s_req_http_minor,
  s_req_line_almost_done,

  s_header_field_start,
  s_header_field,
  s_header_value_discard_ws,
  s_header_value_discard_ws_almost_done,
  s_header_value_discard_lws,
  s_header_value_start,
  s_header_value,
  s_header_value_lws,

  s_header_almost_done,

  s_chunk_size_start,
  s_chunk_size,
  s_chunk_parameters,
  s_chunk_size_almost_done,

  s_headers_almost_done,
  s_headers_done,

  /* 's_headers_done' has to be the final 'header' state: every state that
   * follows it must be a 'body' state. Overflow checking relies on this
   * ordering; see the PARSING_HEADER() macro (not defined in this file).
   */

  s_chunk_data,
  s_chunk_data_almost_done,
  s_chunk_data_done,

  s_body_identity,
  s_body_identity_eof,

  s_message_done
};
192 
193 
/* States of the authority sub-parser driven by http_parse_host() and
 * http_parse_host_char(). Values are consecutive, starting at 1.
 */
enum http_host_state {
  s_http_host_dead = 1,       /* rejected input */
  s_http_userinfo_start,
  s_http_userinfo,
  s_http_host_start,
  s_http_host_v6_start,       /* just consumed '[' */
  s_http_host,
  s_http_host_v6,
  s_http_host_v6_end,         /* just consumed ']' */
  s_http_host_v6_zone_start,  /* just consumed '%' inside brackets */
  s_http_host_v6_zone,
  s_http_host_port_start,     /* just consumed ':' after the host */
  s_http_host_port
};
209 
210 /* Our URL parser.
211  *
212  * This is designed to be shared by http_parser_execute() for URL validation,
213  * hence it has a state transition + byte-for-byte interface. In addition, it
214  * is meant to be embedded in http_parser_parse_url(), which does the dirty
215  * work of turning state transitions URL components for its API.
216  *
217  * This function should only be invoked with non-space characters. It is
218  * assumed that the caller cares about (and can detect) the transition between
219  * URL and non-URL states by looking for these.
220  */
221 static enum state
parse_url_char(enum state s,const char ch)222 parse_url_char(enum state s, const char ch)
223 {
224   if (ch == ' ' || ch == '\r' || ch == '\n') {
225     return s_dead;
226   }
227 
228 #if HTTP_PARSER_STRICT
229   if (ch == '\t' || ch == '\f') {
230     return s_dead;
231   }
232 #endif
233 
234   switch (s) {
235     case s_req_spaces_before_url:
236       /* Proxied requests are followed by scheme of an absolute URI (alpha).
237        * All methods except CONNECT are followed by '/' or '*'.
238        */
239 
240       if (ch == '/' || ch == '*') {
241         return s_req_path;
242       }
243 
244       if (IS_ALPHA(ch)) {
245         return s_req_schema;
246       }
247 
248       break;
249 
250     case s_req_schema:
251       if (IS_ALPHA(ch)) {
252         return s;
253       }
254 
255       if (ch == ':') {
256         return s_req_schema_slash;
257       }
258 
259       break;
260 
261     case s_req_schema_slash:
262       if (ch == '/') {
263         return s_req_schema_slash_slash;
264       }
265 
266       break;
267 
268     case s_req_schema_slash_slash:
269       if (ch == '/') {
270         return s_req_server_start;
271       }
272 
273       break;
274 
275     case s_req_server_with_at:
276       if (ch == '@') {
277         return s_dead;
278       }
279 
280     /* FALLTHROUGH */
281     case s_req_server_start:
282     case s_req_server:
283       if (ch == '/') {
284         return s_req_path;
285       }
286 
287       if (ch == '?') {
288         return s_req_query_string_start;
289       }
290 
291       if (ch == '@') {
292         return s_req_server_with_at;
293       }
294 
295       if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
296         return s_req_server;
297       }
298 
299       break;
300 
301     case s_req_path:
302       if (IS_URL_CHAR(ch)) {
303         return s;
304       }
305 
306       switch (ch) {
307         case '?':
308           return s_req_query_string_start;
309 
310         case '#':
311           return s_req_fragment_start;
312       }
313 
314       break;
315 
316     case s_req_query_string_start:
317     case s_req_query_string:
318       if (IS_URL_CHAR(ch)) {
319         return s_req_query_string;
320       }
321 
322       switch (ch) {
323         case '?':
324           /* allow extra '?' in query string */
325           return s_req_query_string;
326 
327         case '#':
328           return s_req_fragment_start;
329       }
330 
331       break;
332 
333     case s_req_fragment_start:
334       if (IS_URL_CHAR(ch)) {
335         return s_req_fragment;
336       }
337 
338       switch (ch) {
339         case '?':
340           return s_req_fragment;
341 
342         case '#':
343           return s;
344       }
345 
346       break;
347 
348     case s_req_fragment:
349       if (IS_URL_CHAR(ch)) {
350         return s;
351       }
352 
353       switch (ch) {
354         case '?':
355         case '#':
356           return s;
357       }
358 
359       break;
360 
361     default:
362       break;
363   }
364 
365   /* We should never fall out of the switch above unless there's an error */
366   return s_dead;
367 }
368 
369 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)370 http_parse_host_char(enum http_host_state s, const char ch) {
371   switch(s) {
372     case s_http_userinfo:
373     case s_http_userinfo_start:
374       if (ch == '@') {
375         return s_http_host_start;
376       }
377 
378       if (IS_USERINFO_CHAR(ch)) {
379         return s_http_userinfo;
380       }
381       break;
382 
383     case s_http_host_start:
384       if (ch == '[') {
385         return s_http_host_v6_start;
386       }
387 
388       if (IS_HOST_CHAR(ch)) {
389         return s_http_host;
390       }
391 
392       break;
393 
394     case s_http_host:
395       if (IS_HOST_CHAR(ch)) {
396         return s_http_host;
397       }
398 
399     /* FALLTHROUGH */
400     case s_http_host_v6_end:
401       if (ch == ':') {
402         return s_http_host_port_start;
403       }
404 
405       break;
406 
407     case s_http_host_v6:
408       if (ch == ']') {
409         return s_http_host_v6_end;
410       }
411 
412     /* FALLTHROUGH */
413     case s_http_host_v6_start:
414       if (IS_HEX(ch) || ch == ':' || ch == '.') {
415         return s_http_host_v6;
416       }
417 
418       if (s == s_http_host_v6 && ch == '%') {
419         return s_http_host_v6_zone_start;
420       }
421       break;
422 
423     case s_http_host_v6_zone:
424       if (ch == ']') {
425         return s_http_host_v6_end;
426       }
427 
428     /* FALLTHROUGH */
429     case s_http_host_v6_zone_start:
430       /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
431       if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' ||
432           ch == '~') {
433         return s_http_host_v6_zone;
434       }
435       break;
436 
437     case s_http_host_port:
438     case s_http_host_port_start:
439       if (IS_NUM(ch)) {
440         return s_http_host_port;
441       }
442 
443       break;
444 
445     default:
446       break;
447   }
448   return s_http_host_dead;
449 }
450 
451 static int
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)452 http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
453   assert(u->field_set & (1 << UF_HOST));
454   enum http_host_state s;
455 
456   const char *p;
457   size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
458 
459   u->field_data[UF_HOST].len = 0;
460 
461   s = found_at ? s_http_userinfo_start : s_http_host_start;
462 
463   for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
464     enum http_host_state new_s = http_parse_host_char(s, *p);
465 
466     if (new_s == s_http_host_dead) {
467       return 1;
468     }
469 
470     switch(new_s) {
471       case s_http_host:
472         if (s != s_http_host) {
473           u->field_data[UF_HOST].off = p - buf;
474         }
475         u->field_data[UF_HOST].len++;
476         break;
477 
478       case s_http_host_v6:
479         if (s != s_http_host_v6) {
480           u->field_data[UF_HOST].off = p - buf;
481         }
482         u->field_data[UF_HOST].len++;
483         break;
484 
485       case s_http_host_v6_zone_start:
486       case s_http_host_v6_zone:
487         u->field_data[UF_HOST].len++;
488         break;
489 
490       case s_http_host_port:
491         if (s != s_http_host_port) {
492           u->field_data[UF_PORT].off = p - buf;
493           u->field_data[UF_PORT].len = 0;
494           u->field_set |= (1 << UF_PORT);
495         }
496         u->field_data[UF_PORT].len++;
497         break;
498 
499       case s_http_userinfo:
500         if (s != s_http_userinfo) {
501           u->field_data[UF_USERINFO].off = p - buf ;
502           u->field_data[UF_USERINFO].len = 0;
503           u->field_set |= (1 << UF_USERINFO);
504         }
505         u->field_data[UF_USERINFO].len++;
506         break;
507 
508       default:
509         break;
510     }
511     s = new_s;
512   }
513 
514   /* Make sure we don't end somewhere unexpected */
515   switch (s) {
516     case s_http_host_start:
517     case s_http_host_v6_start:
518     case s_http_host_v6:
519     case s_http_host_v6_zone_start:
520     case s_http_host_v6_zone:
521     case s_http_host_port_start:
522     case s_http_userinfo:
523     case s_http_userinfo_start:
524       return 1;
525     default:
526       break;
527   }
528 
529   return 0;
530 }
531 
532 
533 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)534 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
535                       struct http_parser_url *u)
536 {
537   enum state s;
538   const char *p;
539   enum http_parser_url_fields uf, old_uf;
540   int found_at = 0;
541 
542   u->port = u->field_set = 0;
543   s = is_connect ? s_req_server_start : s_req_spaces_before_url;
544   old_uf = UF_MAX;
545 
546   for (p = buf; p < buf + buflen; p++) {
547     s = parse_url_char(s, *p);
548 
549     /* Figure out the next field that we're operating on */
550     switch (s) {
551       case s_dead:
552         return 1;
553 
554       /* Skip delimeters */
555       case s_req_schema_slash:
556       case s_req_schema_slash_slash:
557       case s_req_server_start:
558       case s_req_query_string_start:
559       case s_req_fragment_start:
560         continue;
561 
562       case s_req_schema:
563         uf = UF_SCHEMA;
564         break;
565 
566       case s_req_server_with_at:
567         found_at = 1;
568 
569       /* FALLTROUGH */
570       case s_req_server:
571         uf = UF_HOST;
572         break;
573 
574       case s_req_path:
575         uf = UF_PATH;
576         break;
577 
578       case s_req_query_string:
579         uf = UF_QUERY;
580         break;
581 
582       case s_req_fragment:
583         uf = UF_FRAGMENT;
584         break;
585 
586       default:
587         assert(!"Unexpected state");
588         return 1;
589     }
590 
591     /* Nothing's changed; soldier on */
592     if (uf == old_uf) {
593       u->field_data[uf].len++;
594       continue;
595     }
596 
597     u->field_data[uf].off = p - buf;
598     u->field_data[uf].len = 1;
599 
600     u->field_set |= (1 << uf);
601     old_uf = uf;
602   }
603 
604   /* host must be present if there is a schema */
605   /* parsing http:///toto will fail */
606   if ((u->field_set & (1 << UF_SCHEMA)) &&
607       (u->field_set & (1 << UF_HOST)) == 0) {
608     return 1;
609   }
610 
611   if (u->field_set & (1 << UF_HOST)) {
612     if (http_parse_host(buf, u, found_at) != 0) {
613       return 1;
614     }
615   }
616 
617   /* CONNECT requests can only contain "hostname:port" */
618   if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
619     return 1;
620   }
621 
622   if (u->field_set & (1 << UF_PORT)) {
623     /* Don't bother with endp; we've already validated the string */
624     unsigned long v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10);
625 
626     /* Ports have a max value of 2^16 */
627     if (v > 0xffff) {
628       return 1;
629     }
630 
631     u->port = (uint16_t) v;
632   }
633 
634   return 0;
635 }
636