1 //
2 // Copyright 2020 Staysail Systems, Inc. <info@staysail.tech>
3 // Copyright 2018 Capitar IT Group BV <info@capitar.com>
4 //
5 // This software is supplied under the terms of the MIT License, a
6 // copy of which should be located in the distribution where this
7 // file was obtained (LICENSE.txt).  A copy of the license may also be
8 // found online at https://opensource.org/licenses/MIT.
9 //
10 
11 #include "core/nng_impl.h"
12 
13 #include <ctype.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <string.h>
17 
18 #include "url.h"
19 
// url_hex_val maps one hexadecimal digit (upper or lower case) to its
// numeric value, 0 through 15.  Any other character maps to 0, so the
// result alone cannot distinguish '0' from a non-digit.
static uint8_t
url_hex_val(char c)
{
	uint8_t digit = 0;

	if ((c >= 'a') && (c <= 'f')) {
		digit = (uint8_t) ((c - 'a') + 10);
	} else if ((c >= 'A') && (c <= 'F')) {
		digit = (uint8_t) ((c - 'A') + 10);
	} else if ((c >= '0') && (c <= '9')) {
		digit = (uint8_t) (c - '0');
	}
	return (digit);
}
34 
35 // This returns either 0, or NNG_EINVAL, if the supplied input string
36 // is malformed UTF-8.  We consider UTF-8 malformed when the sequence
37 // is an invalid code point, not the shortest possible code point, or
38 // incomplete.
39 static int
url_utf8_validate(void * arg)40 url_utf8_validate(void *arg)
41 {
42 	uint8_t *s = arg;
43 	uint32_t v, minv;
44 	int      nb;
45 
46 	while (*s) {
47 		if ((s[0] & 0x80u) == 0) {
48 			s++;
49 			continue;
50 		}
51 		if ((s[0] & 0xe0u) == 0xc0) {
52 			// 0x80 thru 0x7ff
53 			v    = (s[0] & 0x1fu);
54 			minv = 0x80;
55 			nb   = 1;
56 		} else if ((s[0] & 0xf0u) == 0xe0) {
57 			v    = (s[0] & 0xfu);
58 			minv = 0x800;
59 			nb   = 2;
60 		} else if ((s[0] & 0xf8u) == 0xf0) {
61 			v    = (s[0] & 0x7u);
62 			minv = 0x10000;
63 			nb   = 3;
64 		} else {
65 			// invalid byte, either continuation, or too many
66 			// leading 1 bits.
67 			return (NNG_EINVAL);
68 		}
69 		s++;
70 		for (int i = 0; i < nb; i++) {
71 			if ((s[0] & 0xc0u) != 0x80) {
72 				return (NNG_EINVAL); // not continuation
73 			}
74 			s++;
75 			v <<= 6u;
76 			v += s[0] & 0x3fu;
77 		}
78 		if (v < minv) {
79 			return (NNG_EINVAL);
80 		}
81 		if ((v >= 0xd800) && (v <= 0xdfff)) {
82 			return (NNG_EINVAL);
83 		}
84 		if (v > 0x10ffff) {
85 			return (NNG_EINVAL);
86 		}
87 	}
88 	return (0);
89 }
90 
// nni_url_decode converts the percent-encoded, NUL-terminated string in
// into raw bytes stored at out.  Every "%xx" escape becomes the single
// byte it names; all other bytes are copied unchanged.  No terminator is
// written to out.
//
// At most max_len bytes are written.  The return value is the number of
// bytes stored, or (size_t) -1 if an escape is malformed ('%' not followed
// by two hex digits) or the decoded data does not fit in max_len bytes.
size_t
nni_url_decode(uint8_t *out, const char *in, size_t max_len)
{
	size_t  len = 0;
	uint8_t c;

	while ((c = (uint8_t) *in) != '\0') {
		if (len >= max_len) {
			return ((size_t) -1);
		}
		if (c != '%') {
			out[len++] = c;
			in++;
			continue;
		}
		// The <ctype.h> functions require a value representable as
		// unsigned char; plain char may be negative for bytes >= 0x80.
		// The || short-circuits, so in[2] is not read when in[1] is
		// the terminator.
		if ((!isxdigit((unsigned char) in[1])) ||
		    (!isxdigit((unsigned char) in[2]))) {
			return ((size_t) -1);
		}
		out[len++] = (uint8_t) ((url_hex_val(in[1]) << 4u) |
		    url_hex_val(in[2]));
		in += 3;
	}
	return (len);
}
118 
119 static int
url_canonify_uri(char ** outp,const char * in)120 url_canonify_uri(char **outp, const char *in)
121 {
122 	char *  out;
123 	size_t  src, dst, len;
124 	uint8_t c;
125 	int     rv;
126 	bool    skip;
127 
128 	// We know that the transform is strictly "reducing".
129 	if ((out = nni_strdup(in)) == NULL) {
130 		return (NNG_ENOMEM);
131 	}
132 	len = strlen(out);
133 
134 	// First pass, convert '%xx' for safe characters to unescaped forms.
135 	src = dst = 0;
136 	while ((c = out[src]) != 0) {
137 		if (c == '%') {
138 			if ((!isxdigit(out[src + 1])) ||
139 			    (!isxdigit(out[src + 2]))) {
140 				nni_free(out, len);
141 				return (NNG_EINVAL);
142 			}
143 			c = url_hex_val(out[src + 1]);
144 			c *= 16;
145 			c += url_hex_val(out[src + 2]);
146 			// If it's a safe character, decode, otherwise leave
147 			// it alone.  We also decode valid high-bytes for
148 			// UTF-8, which will let us validate them and use
149 			// those characters in file names later.
150 			if (((c >= 'A') && (c <= 'Z')) ||
151 			    ((c >= 'a') && (c <= 'z')) ||
152 			    ((c >= '0') && (c <= '9')) || (c == '.') ||
153 			    (c == '~') || (c == '_') || (c == '-') ||
154 			    (c >= 0x80)) {
155 				out[dst++] = (char) c;
156 			} else {
157 				out[dst++] = '%';
158 				out[dst++] = toupper((uint8_t) out[src + 1]);
159 				out[dst++] = toupper((uint8_t) out[src + 2]);
160 			}
161 			src += 3;
162 			continue;
163 		} else {
164 			out[dst++] = out[src++];
165 		}
166 	}
167 	out[dst] = 0;
168 
169 	// Second pass, eliminate redundant //.
170 	src = dst = 0;
171 	skip      = false;
172 	while ((c = out[src]) != 0) {
173 		if ((c == '/') && (!skip)) {
174 			out[dst++] = '/';
175 			while (out[src] == '/') {
176 				src++;
177 			}
178 			continue;
179 		}
180 		if ((c == '?') || (c == '#')) {
181 			skip = true;
182 		}
183 		out[dst++] = (char) c;
184 		src++;
185 	}
186 	out[dst] = 0;
187 
188 	// Second pass, reduce /. and /.. elements, but only in the path.
189 	src = dst = 0;
190 	skip      = false;
191 	while ((c = out[src]) != 0) {
192 		if ((c == '/') && (!skip)) {
193 			if ((strncmp(out + src, "/..", 3) == 0) &&
194 			    (out[src + 3] == 0 || out[src + 3] == '#' ||
195 			        out[src + 3] == '?' || out[src + 3] == '/')) {
196 
197 				if (dst > 0) {
198 					do {
199 						dst--;
200 					} while ((dst) && (out[dst] != '/'));
201 				}
202 				src += 3;
203 				continue;
204 			}
205 			if ((strncmp(out + src, "/.", 2) == 0) &&
206 			    (out[src + 2] == 0 || out[src + 2] == '#' ||
207 			        out[src + 2] == '?' || out[src + 2] == '/')) {
208 				src += 2; // just skip over it
209 				continue;
210 			}
211 			out[dst++] = '/';
212 			src++;
213 		} else {
214 			if ((c == '?') || (c == '#')) {
215 				skip = true;
216 			}
217 			out[dst++] = (char) c;
218 			src++;
219 		}
220 	}
221 	out[dst] = 0;
222 
223 	// Finally lets make sure that the results are valid UTF-8.
224 	// This guards against using UTF-8 redundancy to break security.
225 	if ((rv = url_utf8_validate(out)) != 0) {
226 		nni_free(out, len);
227 		return (rv);
228 	}
229 
230 	*outp = nni_strdup(out);
231 	nni_free(out, len);
232 	return (*outp == NULL ? NNG_ENOMEM : 0);
233 }
234 
// Well-known default ports, keyed by URL scheme.  The table ends with an
// entry whose scheme is NULL.
static const struct {
	const char *scheme;
	const char *port;
} nni_url_default_ports[] = {
	// This list is not exhaustive, but likely covers the main ones we
	// care about.  Feel free to add additional ones as use cases arise.
	// Note also that we don't use "default" ports for SP protocols
	// that have no "default" port, like tcp:// or tls+tcp://.
	// clang-format off
	{ "git", "9418" },
	{ "gopher", "70" },
	{ "http", "80" },
	{ "https", "443" },
	{ "ssh", "22" },
	{ "telnet", "23" },
	{ "ws", "80" },
	{ "wss", "443" },
	{ NULL, NULL },
	// clang-format on
};

// nni_url_default_port returns the default port (as a decimal string) for
// the given scheme, or the empty string if the scheme has no entry in the
// table.  The result points at static storage and must not be freed.
const char *
nni_url_default_port(const char *scheme)
{
	for (size_t i = 0; nni_url_default_ports[i].scheme != NULL; i++) {
		const char *name = nni_url_default_ports[i].scheme;
		size_t      nlen = strlen(name);
		char        sfx;

		if (strncmp(name, scheme, nlen) != 0) {
			continue;
		}
		// The scheme starts with this table entry.  It matches if
		// nothing follows, or if exactly one "4" or "6" follows.
		// That suffix restricts the address family, and is an NNG
		// extension.  Anything else (e.g. "https" against "http")
		// falls through to the remaining entries.
		sfx = scheme[nlen];
		if (sfx == '\0') {
			return (nni_url_default_ports[i].port);
		}
		if (((sfx == '4') || (sfx == '6')) &&
		    (scheme[nlen + 1] == '\0')) {
			return (nni_url_default_ports[i].port);
		}
	}
	return ("");
}
281 
282 // URLs usually follow the following format:
283 //
284 // scheme:[//[userinfo@]host][/]path[?query][#fragment]
285 //
286 // There are other URL formats, for example mailto: but these are
287 // generally not used with nanomsg transports.  Golang calls these
288 //
289 // scheme:opaque[?query][#fragment]
290 //
291 // Nanomsg URLs are always of the first form, we always require a
292 // scheme with a leading //, such as http:// or tcp://. So our parser
293 // is a bit more restricted, but sufficient for our needs.
294 int
nni_url_parse(nni_url ** urlp,const char * raw)295 nni_url_parse(nni_url **urlp, const char *raw)
296 {
297 	nni_url *   url;
298 	size_t      len;
299 	const char *s;
300 	char        c;
301 	int         rv;
302 
303 	if ((url = NNI_ALLOC_STRUCT(url)) == NULL) {
304 		return (NNG_ENOMEM);
305 	}
306 
307 	if ((url->u_rawurl = nni_strdup(raw)) == NULL) {
308 		rv = NNG_ENOMEM;
309 		goto error;
310 	}
311 
312 	// Grab the scheme.
313 	s = raw;
314 	for (len = 0; (c = s[len]) != ':'; len++) {
315 		if (c == 0) {
316 			break;
317 		}
318 	}
319 	if (strncmp(s + len, "://", 3) != 0) {
320 		rv = NNG_EINVAL;
321 		goto error;
322 	}
323 
324 	if ((url->u_scheme = nni_alloc(len + 1)) == NULL) {
325 		rv = NNG_ENOMEM;
326 		goto error;
327 	}
328 	for (size_t i = 0; i < len; i++) {
329 		url->u_scheme[i] = (char) tolower(s[i]);
330 	}
331 	url->u_scheme[len] = '\0';
332 	s += len + 3; // strlen("://")
333 
334 	// For compatibility reasons, we treat ipc:// and inproc:// paths
335 	// specially. These names URLs have a path name (ipc) or arbitrary
336 	// string (inproc) and don't include anything like a host.  Note that
337 	// in the case of path names, it is incumbent upon the application to
338 	// ensure that valid and safe path names are used.  Note also that
339 	// path names are not canonicalized, which means that the address and
340 	// URL properties for relative paths won't be portable to other
341 	// processes unless they are in the same directory.  When in doubt,
342 	// we recommend using absolute paths, such as ipc:///var/run/socket.
343 
344 	if ((strcmp(url->u_scheme, "ipc") == 0) ||
345 	    (strcmp(url->u_scheme, "unix") == 0) ||
346 	    (strcmp(url->u_scheme, "abstract") == 0) ||
347 	    (strcmp(url->u_scheme, "inproc") == 0)) {
348 		if ((url->u_path = nni_strdup(s)) == NULL) {
349 			rv = NNG_ENOMEM;
350 			goto error;
351 		}
352 		*urlp = url;
353 		return (0);
354 	}
355 
356 	// Look for host part (including colon).  Will be terminated by
357 	// a path, or NUL.  May also include an "@", separating a user
358 	// field.
359 	for (len = 0; (c = s[len]) != '/'; len++) {
360 		if ((c == '\0') || (c == '#') || (c == '?')) {
361 			break;
362 		}
363 		if (c == '@') {
364 			// This is a username.
365 			if (url->u_userinfo != NULL) { // we already have one
366 				rv = NNG_EINVAL;
367 				goto error;
368 			}
369 			if ((url->u_userinfo = nni_alloc(len + 1)) == NULL) {
370 				rv = NNG_ENOMEM;
371 				goto error;
372 			}
373 			memcpy(url->u_userinfo, s, len);
374 			url->u_userinfo[len] = '\0';
375 			s += len + 1; // skip past user@ ...
376 			len = 0;
377 		}
378 	}
379 
380 	// If the hostname part is just '*', skip over it.  (We treat it
381 	// as an empty host for legacy nanomsg compatibility.  This may be
382 	// non-RFC compliant, but we're really only interested in parsing
383 	// nanomsg URLs.)
384 	if (((len == 1) && (s[0] == '*')) ||
385 	    ((len > 1) && (strncmp(s, "*:", 2) == 0))) {
386 		s++;
387 		len--;
388 	}
389 
390 	if ((url->u_host = nni_alloc(len + 1)) == NULL) {
391 		rv = NNG_ENOMEM;
392 		goto error;
393 	}
394 	// Copy the host portion, but make it lower case (hostnames are
395 	// case insensitive).
396 	for (size_t i = 0; i < len; i++) {
397 		url->u_host[i] = (char) tolower(s[i]);
398 	}
399 	url->u_host[len] = '\0';
400 	s += len;
401 
402 	if ((rv = url_canonify_uri(&url->u_requri, s)) != 0) {
403 		goto error;
404 	}
405 
406 	s = url->u_requri;
407 	for (len = 0; (c = s[len]) != '\0'; len++) {
408 		if ((c == '?') || (c == '#')) {
409 			break;
410 		}
411 	}
412 
413 	if ((url->u_path = nni_alloc(len + 1)) == NULL) {
414 		rv = NNG_ENOMEM;
415 		goto error;
416 	}
417 	memcpy(url->u_path, s, len);
418 	url->u_path[len] = '\0';
419 
420 	s += len;
421 
422 	// Look for query info portion.
423 	if (s[0] == '?') {
424 		s++;
425 		for (len = 0; (c = s[len]) != '\0'; len++) {
426 			if (c == '#') {
427 				break;
428 			}
429 		}
430 		if ((url->u_query = nni_alloc(len + 1)) == NULL) {
431 			rv = NNG_ENOMEM;
432 			goto error;
433 		}
434 		memcpy(url->u_query, s, len);
435 		url->u_query[len] = '\0';
436 		s += len;
437 	}
438 
439 	// Look for fragment.  Will always be last, so we just use
440 	// strdup.
441 	if (s[0] == '#') {
442 		if ((url->u_fragment = nni_strdup(s + 1)) == NULL) {
443 			rv = NNG_ENOMEM;
444 			goto error;
445 		}
446 	}
447 
448 	// Now go back to the host portion, and look for a separate
449 	// port We also yank off the "[" part for IPv6 addresses.
450 	s = url->u_host;
451 	if (s[0] == '[') {
452 		s++;
453 		for (len = 0; s[len] != ']'; len++) {
454 			if (s[len] == '\0') {
455 				rv = NNG_EINVAL;
456 				goto error;
457 			}
458 		}
459 		if ((s[len + 1] != ':') && (s[len + 1] != '\0')) {
460 			rv = NNG_EINVAL;
461 			goto error;
462 		}
463 	} else {
464 		for (len = 0; s[len] != ':'; len++) {
465 			if (s[len] == '\0') {
466 				break;
467 			}
468 		}
469 	}
470 	if ((url->u_hostname = nni_alloc(len + 1)) == NULL) {
471 		rv = NNG_ENOMEM;
472 		goto error;
473 	}
474 	memcpy(url->u_hostname, s, len);
475 	url->u_hostname[len] = '\0';
476 	s += len;
477 
478 	if (s[0] == ']') {
479 		s++; // skip over ']', only used with IPv6 addresses
480 	}
481 	if (s[0] == ':') {
482 		// If a colon was present, but no port value present, then
483 		// that is an error.
484 		if (s[1] == '\0') {
485 			rv = NNG_EINVAL;
486 			goto error;
487 		}
488 		url->u_port = nni_strdup(s + 1);
489 	} else {
490 		url->u_port = nni_strdup(nni_url_default_port(url->u_scheme));
491 	}
492 	if (url->u_port == NULL) {
493 		rv = NNG_ENOMEM;
494 		goto error;
495 	}
496 
497 	*urlp = url;
498 	return (0);
499 
500 error:
501 	nni_url_free(url);
502 	return (rv);
503 }
504 
505 void
nni_url_free(nni_url * url)506 nni_url_free(nni_url *url)
507 {
508 	if (url != NULL) {
509 		nni_strfree(url->u_rawurl);
510 		nni_strfree(url->u_scheme);
511 		nni_strfree(url->u_userinfo);
512 		nni_strfree(url->u_host);
513 		nni_strfree(url->u_hostname);
514 		nni_strfree(url->u_port);
515 		nni_strfree(url->u_path);
516 		nni_strfree(url->u_query);
517 		nni_strfree(url->u_fragment);
518 		nni_strfree(url->u_requri);
519 		NNI_FREE_STRUCT(url);
520 	}
521 }
522 
523 int
nni_url_asprintf(char ** str,const nni_url * url)524 nni_url_asprintf(char **str, const nni_url *url)
525 {
526 	const char *scheme = url->u_scheme;
527 	const char *port   = url->u_port;
528 	const char *host   = url->u_hostname;
529 	const char *hostob = "";
530 	const char *hostcb = "";
531 
532 	if ((strcmp(scheme, "ipc") == 0) || (strcmp(scheme, "inproc") == 0) ||
533             (strcmp(scheme, "unix") == 0) ||
534             (strcmp(scheme, "ipc+abstract") == 0) ||
535 	    (strcmp(scheme, "unix+abstract") == 0)) {
536 		return (nni_asprintf(str, "%s://%s", scheme, url->u_path));
537 	}
538 
539 	if (port != NULL) {
540 		if ((strlen(port) == 0) ||
541 		    (strcmp(nni_url_default_port(scheme), port) == 0)) {
542 			port = NULL;
543 		}
544 	}
545 	if (strcmp(host, "*") == 0) {
546 		host = "";
547 	}
548 	if (strchr(host, ':') != 0) {
549 		hostob = "[";
550 		hostcb = "]";
551 	}
552 	return (nni_asprintf(str, "%s://%s%s%s%s%s%s", scheme, hostob, host,
553 	    hostcb, port != NULL ? ":" : "", port != NULL ? port : "",
554 	    url->u_requri != NULL ? url->u_requri : ""));
555 }
556 
557 // nni_url_asprintf_port is like nni_url_asprintf, but includes a port
558 // override.  If non-zero, this port number replaces the port number
559 // in the port string.
560 int
nni_url_asprintf_port(char ** str,const nni_url * url,int port)561 nni_url_asprintf_port(char **str, const nni_url *url, int port)
562 {
563 	char    portstr[16];
564 	nni_url myurl = *url;
565 
566 	if (port > 0) {
567 		(void) snprintf(portstr, sizeof(portstr), "%d", port);
568 		myurl.u_port = portstr;
569 	}
570 	return (nni_url_asprintf(str, &myurl));
571 }
572 
573 #define URL_COPYSTR(d, s) ((s != NULL) && ((d = nni_strdup(s)) == NULL))
574 
575 int
nni_url_clone(nni_url ** dstp,const nni_url * src)576 nni_url_clone(nni_url **dstp, const nni_url *src)
577 {
578 	nni_url *dst;
579 
580 	if ((dst = NNI_ALLOC_STRUCT(dst)) == NULL) {
581 		return (NNG_ENOMEM);
582 	}
583 	if (URL_COPYSTR(dst->u_rawurl, src->u_rawurl) ||
584 	    URL_COPYSTR(dst->u_scheme, src->u_scheme) ||
585 	    URL_COPYSTR(dst->u_userinfo, src->u_userinfo) ||
586 	    URL_COPYSTR(dst->u_host, src->u_host) ||
587 	    URL_COPYSTR(dst->u_hostname, src->u_hostname) ||
588 	    URL_COPYSTR(dst->u_port, src->u_port) ||
589 	    URL_COPYSTR(dst->u_requri, src->u_requri) ||
590 	    URL_COPYSTR(dst->u_path, src->u_path) ||
591 	    URL_COPYSTR(dst->u_query, src->u_query) ||
592 	    URL_COPYSTR(dst->u_fragment, src->u_fragment)) {
593 		nni_url_free(dst);
594 		return (NNG_ENOMEM);
595 	}
596 	*dstp = dst;
597 	return (0);
598 }
599 
600 #undef URL_COPYSTR
601