1 //
2 // Copyright 2020 Staysail Systems, Inc. <info@staysail.tech>
3 // Copyright 2018 Capitar IT Group BV <info@capitar.com>
4 //
5 // This software is supplied under the terms of the MIT License, a
6 // copy of which should be located in the distribution where this
7 // file was obtained (LICENSE.txt). A copy of the license may also be
8 // found online at https://opensource.org/licenses/MIT.
9 //
10
11 #include "core/nng_impl.h"
12
13 #include <ctype.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <string.h>
17
18 #include "url.h"
19
// url_hex_val converts one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F')
// into its numeric value (0 through 15).  Any other character yields 0,
// which is indistinguishable from the digit '0'; the callers in this file
// check the character with isxdigit() before converting it.
static uint8_t
url_hex_val(char c)
{
	uint8_t val = 0;

	if ((c >= 'a') && (c <= 'f')) {
		val = (uint8_t) ((c - 'a') + 10);
	} else if ((c >= 'A') && (c <= 'F')) {
		val = (uint8_t) ((c - 'A') + 10);
	} else if ((c >= '0') && (c <= '9')) {
		val = (uint8_t) (c - '0');
	}
	return (val);
}
34
// url_utf8_validate checks that the NUL-terminated byte string "arg" is
// well-formed UTF-8.  It returns 0 if so, or NNG_EINVAL if the string is
// malformed.  We consider UTF-8 malformed when a sequence:
//
//  * starts with a continuation byte or a byte with too many leading 1 bits,
//  * is incomplete (a continuation byte is missing, including at the end
//    of the string),
//  * is not the shortest possible encoding of its code point,
//  * encodes a surrogate (U+D800 through U+DFFF), or
//  * encodes a value above U+10FFFF.
//
// The string is only read, never modified.
static int
url_utf8_validate(void *arg)
{
	const uint8_t *s = arg;
	uint32_t       v, minv;
	int            nb;

	while (*s) {
		if ((s[0] & 0x80u) == 0) {
			// Plain ASCII.
			s++;
			continue;
		}
		if ((s[0] & 0xe0u) == 0xc0) {
			// 0x80 thru 0x7ff
			v    = (s[0] & 0x1fu);
			minv = 0x80;
			nb   = 1;
		} else if ((s[0] & 0xf0u) == 0xe0) {
			// 0x800 thru 0xffff
			v    = (s[0] & 0xfu);
			minv = 0x800;
			nb   = 2;
		} else if ((s[0] & 0xf8u) == 0xf0) {
			// 0x10000 thru 0x10ffff
			v    = (s[0] & 0x7u);
			minv = 0x10000;
			nb   = 3;
		} else {
			// invalid byte, either continuation, or too many
			// leading 1 bits.
			return (NNG_EINVAL);
		}
		s++;
		for (int i = 0; i < nb; i++) {
			// A NUL terminator fails this test too, so we never
			// read beyond the end of a truncated sequence.
			if ((s[0] & 0xc0u) != 0x80) {
				return (NNG_EINVAL); // not continuation
			}
			// Fold in the payload of the byte we just checked,
			// and only then advance.  (Advancing first would
			// accumulate the *following* byte, which makes the
			// range checks below meaningless.)
			v <<= 6u;
			v += s[0] & 0x3fu;
			s++;
		}
		if (v < minv) {
			return (NNG_EINVAL); // not the shortest encoding
		}
		if ((v >= 0xd800) && (v <= 0xdfff)) {
			return (NNG_EINVAL); // surrogate
		}
		if (v > 0x10ffff) {
			return (NNG_EINVAL); // beyond the Unicode range
		}
	}
	return (0);
}
90
// nni_url_decode percent-decodes the NUL-terminated string "in" into the
// buffer "out", which has room for "max_len" bytes.  Each "%XX" sequence
// (two hexadecimal digits) becomes the single byte it encodes; every other
// byte is copied unchanged.
//
// Returns the number of bytes stored in "out".  The output is NOT NUL
// terminated.  Returns (size_t) -1 if the decoded data does not fit in
// max_len bytes, or if a '%' is not followed by two hexadecimal digits.
size_t
nni_url_decode(uint8_t *out, const char *in, size_t max_len)
{
	size_t  len = 0;
	uint8_t c;

	while ((c = (uint8_t) *in) != '\0') {
		if (len >= max_len) {
			return ((size_t) -1);
		}
		if (c == '%') {
			// The <ctype.h> functions have undefined behavior
			// for negative arguments, which is what a byte >=
			// 0x80 becomes where plain char is signed, so
			// convert to unsigned char first.  The second test
			// is only evaluated when in[1] is a digit (and thus
			// not the terminator), so in[2] is always in bounds.
			if ((!isxdigit((unsigned char) in[1])) ||
			    (!isxdigit((unsigned char) in[2]))) {
				return ((size_t) -1);
			}
			out[len] = url_hex_val(in[1]);
			out[len] <<= 4u;
			out[len] += url_hex_val(in[2]);
			len++;
			in += 3;
		} else {
			out[len++] = c;
			in++;
		}
	}
	return (len);
}
118
119 static int
url_canonify_uri(char ** outp,const char * in)120 url_canonify_uri(char **outp, const char *in)
121 {
122 char * out;
123 size_t src, dst, len;
124 uint8_t c;
125 int rv;
126 bool skip;
127
128 // We know that the transform is strictly "reducing".
129 if ((out = nni_strdup(in)) == NULL) {
130 return (NNG_ENOMEM);
131 }
132 len = strlen(out);
133
134 // First pass, convert '%xx' for safe characters to unescaped forms.
135 src = dst = 0;
136 while ((c = out[src]) != 0) {
137 if (c == '%') {
138 if ((!isxdigit(out[src + 1])) ||
139 (!isxdigit(out[src + 2]))) {
140 nni_free(out, len);
141 return (NNG_EINVAL);
142 }
143 c = url_hex_val(out[src + 1]);
144 c *= 16;
145 c += url_hex_val(out[src + 2]);
146 // If it's a safe character, decode, otherwise leave
147 // it alone. We also decode valid high-bytes for
148 // UTF-8, which will let us validate them and use
149 // those characters in file names later.
150 if (((c >= 'A') && (c <= 'Z')) ||
151 ((c >= 'a') && (c <= 'z')) ||
152 ((c >= '0') && (c <= '9')) || (c == '.') ||
153 (c == '~') || (c == '_') || (c == '-') ||
154 (c >= 0x80)) {
155 out[dst++] = (char) c;
156 } else {
157 out[dst++] = '%';
158 out[dst++] = toupper((uint8_t) out[src + 1]);
159 out[dst++] = toupper((uint8_t) out[src + 2]);
160 }
161 src += 3;
162 continue;
163 } else {
164 out[dst++] = out[src++];
165 }
166 }
167 out[dst] = 0;
168
169 // Second pass, eliminate redundant //.
170 src = dst = 0;
171 skip = false;
172 while ((c = out[src]) != 0) {
173 if ((c == '/') && (!skip)) {
174 out[dst++] = '/';
175 while (out[src] == '/') {
176 src++;
177 }
178 continue;
179 }
180 if ((c == '?') || (c == '#')) {
181 skip = true;
182 }
183 out[dst++] = (char) c;
184 src++;
185 }
186 out[dst] = 0;
187
188 // Second pass, reduce /. and /.. elements, but only in the path.
189 src = dst = 0;
190 skip = false;
191 while ((c = out[src]) != 0) {
192 if ((c == '/') && (!skip)) {
193 if ((strncmp(out + src, "/..", 3) == 0) &&
194 (out[src + 3] == 0 || out[src + 3] == '#' ||
195 out[src + 3] == '?' || out[src + 3] == '/')) {
196
197 if (dst > 0) {
198 do {
199 dst--;
200 } while ((dst) && (out[dst] != '/'));
201 }
202 src += 3;
203 continue;
204 }
205 if ((strncmp(out + src, "/.", 2) == 0) &&
206 (out[src + 2] == 0 || out[src + 2] == '#' ||
207 out[src + 2] == '?' || out[src + 2] == '/')) {
208 src += 2; // just skip over it
209 continue;
210 }
211 out[dst++] = '/';
212 src++;
213 } else {
214 if ((c == '?') || (c == '#')) {
215 skip = true;
216 }
217 out[dst++] = (char) c;
218 src++;
219 }
220 }
221 out[dst] = 0;
222
223 // Finally lets make sure that the results are valid UTF-8.
224 // This guards against using UTF-8 redundancy to break security.
225 if ((rv = url_utf8_validate(out)) != 0) {
226 nni_free(out, len);
227 return (rv);
228 }
229
230 *outp = nni_strdup(out);
231 nni_free(out, len);
232 return (*outp == NULL ? NNG_ENOMEM : 0);
233 }
234
// Table of well-known schemes and the port each uses when the URL does
// not name one.  The table is terminated by an entry with a NULL scheme.
static const struct {
	const char *scheme;
	const char *port;
} nni_url_default_ports[] = {
	// This list is not exhaustive, but likely covers the main ones we
	// care about.  Feel free to add additional ones as use cases arise.
	// Note also that we don't use "default" ports for SP protocols
	// that have no "default" port, like tcp:// or tls+tcp://.
	// clang-format off
	{ .scheme = "git",    .port = "9418" },
	{ .scheme = "gopher", .port = "70"   },
	{ .scheme = "http",   .port = "80"   },
	{ .scheme = "https",  .port = "443"  },
	{ .scheme = "ssh",    .port = "22"   },
	{ .scheme = "telnet", .port = "23"   },
	{ .scheme = "ws",     .port = "80"   },
	{ .scheme = "wss",    .port = "443"  },
	{ .scheme = NULL,     .port = NULL   },
	// clang-format on
};

// nni_url_default_port returns the default port (as a string) for the
// given scheme, or the empty string "" if the scheme has no entry in the
// table above.  The returned string is a constant; the caller must not
// modify or free it.
const char *
nni_url_default_port(const char *scheme)
{
	for (size_t i = 0; nni_url_default_ports[i].scheme != NULL; i++) {
		const char *name = nni_url_default_ports[i].scheme;
		size_t      n    = strlen(name);
		char        suffix;

		if (strncmp(name, scheme, n) != 0) {
			continue;
		}

		// The scheme starts with this table entry.  It is a match
		// if nothing follows, or if exactly one "4" or "6" follows
		// to restrict the address family.  This is an NNG extension.
		// Anything else (e.g. the "s" of "https" while looking at
		// "http") means we keep searching.
		suffix = scheme[n];
		if (suffix == '\0') {
			return (nni_url_default_ports[i].port);
		}
		if (((suffix == '4') || (suffix == '6')) &&
		    (scheme[n + 1] == '\0')) {
			return (nni_url_default_ports[i].port);
		}
	}
	return ("");
}
281
282 // URLs usually follow the following format:
283 //
284 // scheme:[//[userinfo@]host][/]path[?query][#fragment]
285 //
286 // There are other URL formats, for example mailto: but these are
287 // generally not used with nanomsg transports. Golang calls these
288 //
289 // scheme:opaque[?query][#fragment]
290 //
291 // Nanomsg URLs are always of the first form, we always require a
292 // scheme with a leading //, such as http:// or tcp://. So our parser
293 // is a bit more restricted, but sufficient for our needs.
294 int
nni_url_parse(nni_url ** urlp,const char * raw)295 nni_url_parse(nni_url **urlp, const char *raw)
296 {
297 nni_url * url;
298 size_t len;
299 const char *s;
300 char c;
301 int rv;
302
303 if ((url = NNI_ALLOC_STRUCT(url)) == NULL) {
304 return (NNG_ENOMEM);
305 }
306
307 if ((url->u_rawurl = nni_strdup(raw)) == NULL) {
308 rv = NNG_ENOMEM;
309 goto error;
310 }
311
312 // Grab the scheme.
313 s = raw;
314 for (len = 0; (c = s[len]) != ':'; len++) {
315 if (c == 0) {
316 break;
317 }
318 }
319 if (strncmp(s + len, "://", 3) != 0) {
320 rv = NNG_EINVAL;
321 goto error;
322 }
323
324 if ((url->u_scheme = nni_alloc(len + 1)) == NULL) {
325 rv = NNG_ENOMEM;
326 goto error;
327 }
328 for (size_t i = 0; i < len; i++) {
329 url->u_scheme[i] = (char) tolower(s[i]);
330 }
331 url->u_scheme[len] = '\0';
332 s += len + 3; // strlen("://")
333
334 // For compatibility reasons, we treat ipc:// and inproc:// paths
335 // specially. These names URLs have a path name (ipc) or arbitrary
336 // string (inproc) and don't include anything like a host. Note that
337 // in the case of path names, it is incumbent upon the application to
338 // ensure that valid and safe path names are used. Note also that
339 // path names are not canonicalized, which means that the address and
340 // URL properties for relative paths won't be portable to other
341 // processes unless they are in the same directory. When in doubt,
342 // we recommend using absolute paths, such as ipc:///var/run/socket.
343
344 if ((strcmp(url->u_scheme, "ipc") == 0) ||
345 (strcmp(url->u_scheme, "unix") == 0) ||
346 (strcmp(url->u_scheme, "abstract") == 0) ||
347 (strcmp(url->u_scheme, "inproc") == 0)) {
348 if ((url->u_path = nni_strdup(s)) == NULL) {
349 rv = NNG_ENOMEM;
350 goto error;
351 }
352 *urlp = url;
353 return (0);
354 }
355
356 // Look for host part (including colon). Will be terminated by
357 // a path, or NUL. May also include an "@", separating a user
358 // field.
359 for (len = 0; (c = s[len]) != '/'; len++) {
360 if ((c == '\0') || (c == '#') || (c == '?')) {
361 break;
362 }
363 if (c == '@') {
364 // This is a username.
365 if (url->u_userinfo != NULL) { // we already have one
366 rv = NNG_EINVAL;
367 goto error;
368 }
369 if ((url->u_userinfo = nni_alloc(len + 1)) == NULL) {
370 rv = NNG_ENOMEM;
371 goto error;
372 }
373 memcpy(url->u_userinfo, s, len);
374 url->u_userinfo[len] = '\0';
375 s += len + 1; // skip past user@ ...
376 len = 0;
377 }
378 }
379
380 // If the hostname part is just '*', skip over it. (We treat it
381 // as an empty host for legacy nanomsg compatibility. This may be
382 // non-RFC compliant, but we're really only interested in parsing
383 // nanomsg URLs.)
384 if (((len == 1) && (s[0] == '*')) ||
385 ((len > 1) && (strncmp(s, "*:", 2) == 0))) {
386 s++;
387 len--;
388 }
389
390 if ((url->u_host = nni_alloc(len + 1)) == NULL) {
391 rv = NNG_ENOMEM;
392 goto error;
393 }
394 // Copy the host portion, but make it lower case (hostnames are
395 // case insensitive).
396 for (size_t i = 0; i < len; i++) {
397 url->u_host[i] = (char) tolower(s[i]);
398 }
399 url->u_host[len] = '\0';
400 s += len;
401
402 if ((rv = url_canonify_uri(&url->u_requri, s)) != 0) {
403 goto error;
404 }
405
406 s = url->u_requri;
407 for (len = 0; (c = s[len]) != '\0'; len++) {
408 if ((c == '?') || (c == '#')) {
409 break;
410 }
411 }
412
413 if ((url->u_path = nni_alloc(len + 1)) == NULL) {
414 rv = NNG_ENOMEM;
415 goto error;
416 }
417 memcpy(url->u_path, s, len);
418 url->u_path[len] = '\0';
419
420 s += len;
421
422 // Look for query info portion.
423 if (s[0] == '?') {
424 s++;
425 for (len = 0; (c = s[len]) != '\0'; len++) {
426 if (c == '#') {
427 break;
428 }
429 }
430 if ((url->u_query = nni_alloc(len + 1)) == NULL) {
431 rv = NNG_ENOMEM;
432 goto error;
433 }
434 memcpy(url->u_query, s, len);
435 url->u_query[len] = '\0';
436 s += len;
437 }
438
439 // Look for fragment. Will always be last, so we just use
440 // strdup.
441 if (s[0] == '#') {
442 if ((url->u_fragment = nni_strdup(s + 1)) == NULL) {
443 rv = NNG_ENOMEM;
444 goto error;
445 }
446 }
447
448 // Now go back to the host portion, and look for a separate
449 // port We also yank off the "[" part for IPv6 addresses.
450 s = url->u_host;
451 if (s[0] == '[') {
452 s++;
453 for (len = 0; s[len] != ']'; len++) {
454 if (s[len] == '\0') {
455 rv = NNG_EINVAL;
456 goto error;
457 }
458 }
459 if ((s[len + 1] != ':') && (s[len + 1] != '\0')) {
460 rv = NNG_EINVAL;
461 goto error;
462 }
463 } else {
464 for (len = 0; s[len] != ':'; len++) {
465 if (s[len] == '\0') {
466 break;
467 }
468 }
469 }
470 if ((url->u_hostname = nni_alloc(len + 1)) == NULL) {
471 rv = NNG_ENOMEM;
472 goto error;
473 }
474 memcpy(url->u_hostname, s, len);
475 url->u_hostname[len] = '\0';
476 s += len;
477
478 if (s[0] == ']') {
479 s++; // skip over ']', only used with IPv6 addresses
480 }
481 if (s[0] == ':') {
482 // If a colon was present, but no port value present, then
483 // that is an error.
484 if (s[1] == '\0') {
485 rv = NNG_EINVAL;
486 goto error;
487 }
488 url->u_port = nni_strdup(s + 1);
489 } else {
490 url->u_port = nni_strdup(nni_url_default_port(url->u_scheme));
491 }
492 if (url->u_port == NULL) {
493 rv = NNG_ENOMEM;
494 goto error;
495 }
496
497 *urlp = url;
498 return (0);
499
500 error:
501 nni_url_free(url);
502 return (rv);
503 }
504
505 void
nni_url_free(nni_url * url)506 nni_url_free(nni_url *url)
507 {
508 if (url != NULL) {
509 nni_strfree(url->u_rawurl);
510 nni_strfree(url->u_scheme);
511 nni_strfree(url->u_userinfo);
512 nni_strfree(url->u_host);
513 nni_strfree(url->u_hostname);
514 nni_strfree(url->u_port);
515 nni_strfree(url->u_path);
516 nni_strfree(url->u_query);
517 nni_strfree(url->u_fragment);
518 nni_strfree(url->u_requri);
519 NNI_FREE_STRUCT(url);
520 }
521 }
522
523 int
nni_url_asprintf(char ** str,const nni_url * url)524 nni_url_asprintf(char **str, const nni_url *url)
525 {
526 const char *scheme = url->u_scheme;
527 const char *port = url->u_port;
528 const char *host = url->u_hostname;
529 const char *hostob = "";
530 const char *hostcb = "";
531
532 if ((strcmp(scheme, "ipc") == 0) || (strcmp(scheme, "inproc") == 0) ||
533 (strcmp(scheme, "unix") == 0) ||
534 (strcmp(scheme, "ipc+abstract") == 0) ||
535 (strcmp(scheme, "unix+abstract") == 0)) {
536 return (nni_asprintf(str, "%s://%s", scheme, url->u_path));
537 }
538
539 if (port != NULL) {
540 if ((strlen(port) == 0) ||
541 (strcmp(nni_url_default_port(scheme), port) == 0)) {
542 port = NULL;
543 }
544 }
545 if (strcmp(host, "*") == 0) {
546 host = "";
547 }
548 if (strchr(host, ':') != 0) {
549 hostob = "[";
550 hostcb = "]";
551 }
552 return (nni_asprintf(str, "%s://%s%s%s%s%s%s", scheme, hostob, host,
553 hostcb, port != NULL ? ":" : "", port != NULL ? port : "",
554 url->u_requri != NULL ? url->u_requri : ""));
555 }
556
557 // nni_url_asprintf_port is like nni_url_asprintf, but includes a port
558 // override. If non-zero, this port number replaces the port number
559 // in the port string.
560 int
nni_url_asprintf_port(char ** str,const nni_url * url,int port)561 nni_url_asprintf_port(char **str, const nni_url *url, int port)
562 {
563 char portstr[16];
564 nni_url myurl = *url;
565
566 if (port > 0) {
567 (void) snprintf(portstr, sizeof(portstr), "%d", port);
568 myurl.u_port = portstr;
569 }
570 return (nni_url_asprintf(str, &myurl));
571 }
572
573 #define URL_COPYSTR(d, s) ((s != NULL) && ((d = nni_strdup(s)) == NULL))
574
575 int
nni_url_clone(nni_url ** dstp,const nni_url * src)576 nni_url_clone(nni_url **dstp, const nni_url *src)
577 {
578 nni_url *dst;
579
580 if ((dst = NNI_ALLOC_STRUCT(dst)) == NULL) {
581 return (NNG_ENOMEM);
582 }
583 if (URL_COPYSTR(dst->u_rawurl, src->u_rawurl) ||
584 URL_COPYSTR(dst->u_scheme, src->u_scheme) ||
585 URL_COPYSTR(dst->u_userinfo, src->u_userinfo) ||
586 URL_COPYSTR(dst->u_host, src->u_host) ||
587 URL_COPYSTR(dst->u_hostname, src->u_hostname) ||
588 URL_COPYSTR(dst->u_port, src->u_port) ||
589 URL_COPYSTR(dst->u_requri, src->u_requri) ||
590 URL_COPYSTR(dst->u_path, src->u_path) ||
591 URL_COPYSTR(dst->u_query, src->u_query) ||
592 URL_COPYSTR(dst->u_fragment, src->u_fragment)) {
593 nni_url_free(dst);
594 return (NNG_ENOMEM);
595 }
596 *dstp = dst;
597 return (0);
598 }
599
600 #undef URL_COPYSTR
601