1 /* URL parser and translator; implementation of RFC 2396. */
2 
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
6 
7 #include <ctype.h>
8 #include <errno.h>
9 #ifdef HAVE_IDNA_H
10 #include <idna.h>
11 #endif
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <sys/types.h>
16 #ifdef HAVE_NETDB_H
17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
18 #endif
19 
20 #ifdef HAVE_SYS_SOCKET_H
21 #include <sys/socket.h>
22 #endif
23 #ifdef HAVE_NETINET_IN_H
24 #include <netinet/in.h>
25 #endif
26 #ifdef HAVE_ARPA_INET_H
27 #include <arpa/inet.h>
28 #endif
29 
30 #include "elinks.h"
31 
32 #include "main/object.h"
33 #include "protocol/protocol.h"
34 #include "protocol/uri.h"
35 #include "util/conv.h"
36 #include "util/error.h"
37 #include "util/file.h"
38 #include "util/hash.h"
39 #include "util/memory.h"
40 #include "util/string.h"
41 
42 
43 static inline int
end_of_dir(unsigned char c)44 end_of_dir(unsigned char c)
45 {
46 	return c == POST_CHAR || c == '#' || c == ';' || c == '?';
47 }
48 
49 static inline int
is_uri_dir_sep(struct uri * uri,unsigned char pos)50 is_uri_dir_sep(struct uri *uri, unsigned char pos)
51 {
52 	return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
53 }
54 
55 
56 int
is_ip_address(unsigned char * address,int addresslen)57 is_ip_address(unsigned char *address, int addresslen)
58 {
59 	/* The @address has well defined limits so it would be a shame to
60 	 * allocate it. */
61 	unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
62 
63 	if (addresslen >= sizeof(buffer))
64 		return 0;
65 
66 	safe_strncpy(buffer, address, addresslen + 1);
67 
68 #ifdef HAVE_INET_PTON
69 #ifdef CONFIG_IPV6
70 	{
71 		struct sockaddr_in6 addr6;
72 
73 		if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
74 			return 1;
75 	}
76 #endif /* CONFIG_IPV6 */
77 	{
78 		struct in_addr addr4;
79 
80 		if (inet_pton(AF_INET, buffer, &addr4) > 0)
81 			return 1;
82 	}
83 
84 	return 0;
85 #else
86 	/* FIXME: Is this ever the case? */
87 	return 0;
88 #endif /* HAVE_INET_PTON */
89 }
90 
91 
/* Checks whether the string @s of length @slen ends with one of the TLDs
 * listed below, compared using c_strncasecmp(). A negative @slen means the
 * length is taken with strlen(@s).
 * Returns the offset in @s where the matching TLD starts, or -1 when @s is
 * empty or no listed TLD matches. Note that this is a plain suffix test; it
 * does not check that the TLD is preceded by a dot. */
int
end_with_known_tld(unsigned char *s, int slen)
{
	static const unsigned char *known[] = {
		"com", "edu", "net",
		"org", "gov", "mil",
		"int", "biz", "arpa",
		"aero", "coop",
		"info", "museum",
		"name", "pro",
		NULL
	};
	const unsigned char **entry;

	if (slen == 0)
		return -1;

	if (slen < 0)
		slen = strlen(s);

	for (entry = known; *entry; entry++) {
		int len = strlen(*entry);
		int start = slen - len;

		/* The candidate is longer than the string itself. */
		if (start < 0)
			continue;

		if (c_strncasecmp(&s[start], *entry, len) == 0)
			return start;
	}

	return -1;
}
117 
118 /* XXX: this function writes to @name. */
119 static int
check_whether_file_exists(unsigned char * name)120 check_whether_file_exists(unsigned char *name)
121 {
122 	/* Check POST_CHAR etc ... */
123 	static const unsigned char chars[] = POST_CHAR_S "#?";
124 	int i;
125 	int namelen = strlen(name);
126 
127 	if (file_exists(name))
128 		return namelen;
129 
130 	for (i = 0; i < sizeof(chars) - 1; i++) {
131 		unsigned char *pos = memchr(name, chars[i], namelen);
132 		int exists;
133 
134 		if (!pos) continue;
135 
136 		*pos = 0;
137 		exists = file_exists(name);
138 		*pos = chars[i];
139 
140 		if (exists) {
141 			return pos - name;
142 		}
143 	}
144 
145 	return -1;
146 }
147 
148 static int
check_uri_file(unsigned char * name)149 check_uri_file(unsigned char *name)
150 {
151 	/* Check POST_CHAR etc ... */
152 	static const unsigned char chars[] = POST_CHAR_S "#";
153 
154 	return strcspn(name, chars);
155 }
156 
/* Appends @uristring to @string, URI-encoding only the part that
 * check_whether_file_exists() identifies as an existing file name. Whatever
 * follows that part (fragment, query, post data) is appended unencoded.
 * When no file is found the length passed to encode_uri_string() is -1. */
static void
encode_file_uri_string(struct string *string, unsigned char *uristring)
{
	const int pathlen = check_whether_file_exists(uristring);

	encode_uri_string(string, uristring, pathlen, 0);

	if (pathlen <= 0)
		return;

	add_to_string(string, uristring + pathlen);
}
166 
167 
/* Returns the length of the protocol (scheme) name at the start of @url, or
 * 0 if @url does not start with one.
 *
 * RFC1738:
 * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
 * (but per its recommendations we accept "upalpha" too)
 *
 * A single trailing digit is not counted as part of the name; it is our "IP
 * version in protocol scheme name" hack. Note that a name followed by such a
 * digit is reported even when no ':' follows the digit; callers have to
 * check for the colon themselves. */
static inline int
get_protocol_length(const unsigned char *url)
{
	int len = 0;

	/* Seek the end of the protocol name if any. */
	for (;;) {
		unsigned char ch = url[len];

		if (!isalnum(ch) && ch != '+' && ch != '-' && ch != '.')
			break;
		len++;
	}

	/* Silently chop off the last digit if it's there. */
	if (len > 0 && isdigit(url[len - 1]))
		len--;

	/* The name has to be followed by ':' or by the chopped off digit.
	 * An empty name ends up here with len == 0 and so also yields 0. */
	if (url[len] == ':' || isdigit(url[len]))
		return len;

	return 0;
}
189 
190 enum uri_errno
parse_uri(struct uri * uri,unsigned char * uristring)191 parse_uri(struct uri *uri, unsigned char *uristring)
192 {
193 	unsigned char *prefix_end, *host_end;
194 #ifdef CONFIG_IPV6
195 	unsigned char *lbracket, *rbracket;
196 #endif
197 
198 	assertm(uristring, "No uri to parse.");
199 	memset(uri, 0, sizeof(*uri));
200 
201 	/* Nothing to do for an empty url. */
202 	if_assert_failed return 0;
203 	if (!*uristring) return URI_ERRNO_EMPTY;
204 
205 	uri->string = uristring;
206 	uri->protocollen = get_protocol_length(uristring);
207 
208 	/* Invalid */
209 	if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
210 
211 	/* Figure out whether the protocol is known */
212 	uri->protocol = get_protocol(struri(uri), uri->protocollen);
213 
214 	prefix_end = uristring + uri->protocollen; /* ':' */
215 
216 	/* Check if there's a digit after the protocol name. */
217 	if (isdigit(*prefix_end)) {
218 		uri->ip_family = uristring[uri->protocollen] - '0';
219 		prefix_end++;
220 	}
221 	if (*prefix_end != ':')
222 		return URI_ERRNO_INVALID_PROTOCOL;
223 	prefix_end++;
224 
225 	/* Skip slashes */
226 
227 	if (prefix_end[0] == '/' && prefix_end[1] == '/') {
228 		if (prefix_end[2] == '/'
229 		    && get_protocol_need_slash_after_host(uri->protocol))
230 			return URI_ERRNO_TOO_MANY_SLASHES;
231 
232 		prefix_end += 2;
233 
234 	} else if (get_protocol_need_slashes(uri->protocol)) {
235 		return URI_ERRNO_NO_SLASHES;
236 	}
237 
238 	if (get_protocol_free_syntax(uri->protocol)) {
239 		uri->data = prefix_end;
240 		uri->datalen = strlen(prefix_end);
241 		return URI_ERRNO_OK;
242 
243 	} else if (uri->protocol == PROTOCOL_FILE) {
244 		int datalen = check_uri_file(prefix_end);
245 		unsigned char *frag_or_post = prefix_end + datalen;
246 
247 		/* Extract the fragment part. */
248 		if (datalen >= 0) {
249 			if (*frag_or_post == '#') {
250 				uri->fragment = frag_or_post + 1;
251 				uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
252 				frag_or_post = uri->fragment + uri->fragmentlen;
253 			}
254 			if (*frag_or_post == POST_CHAR) {
255 				uri->post = frag_or_post + 1;
256 			}
257 		} else {
258 			datalen = strlen(prefix_end);
259 		}
260 
261 		uri->data = prefix_end;
262 		uri->datalen = datalen;
263 
264 		return URI_ERRNO_OK;
265 	}
266 
267 	/* Isolate host */
268 
269 #ifdef CONFIG_IPV6
270 	/* Get brackets enclosing IPv6 address */
271 	lbracket = strchr(prefix_end, '[');
272 	if (lbracket) {
273 		rbracket = strchr(lbracket, ']');
274 		/* [address] is handled only inside of hostname part (surprisingly). */
275 		if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
276 			uri->ipv6 = 1;
277 		else
278 			lbracket = rbracket = NULL;
279 	} else {
280 		rbracket = NULL;
281 	}
282 #endif
283 
284 	/* Possibly skip auth part */
285 	host_end = prefix_end + strcspn(prefix_end, "@");
286 
287 	if (prefix_end + strcspn(prefix_end, "/") > host_end
288 	    && *host_end) { /* we have auth info here */
289 		unsigned char *user_end;
290 
291 		/* Allow '@' in the password component */
292 		while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
293 			host_end = host_end + 1 + strcspn(host_end + 1, "@");
294 
295 		user_end = strchr(prefix_end, ':');
296 
297 		if (!user_end || user_end > host_end) {
298 			uri->user = prefix_end;
299 			uri->userlen = host_end - prefix_end;
300 		} else {
301 			uri->user = prefix_end;
302 			uri->userlen = user_end - prefix_end;
303 			uri->password = user_end + 1;
304 			uri->passwordlen = host_end - user_end - 1;
305 		}
306 		prefix_end = host_end + 1;
307 	}
308 
309 #ifdef CONFIG_IPV6
310 	if (uri->ipv6)
311 		host_end = rbracket + strcspn(rbracket, ":/?");
312 	else
313 #endif
314 		host_end = prefix_end + strcspn(prefix_end, ":/?");
315 
316 #ifdef CONFIG_IPV6
317 	if (uri->ipv6) {
318 		int addrlen = rbracket - lbracket - 1;
319 
320 		/* Check for valid length.
321 		 * addrlen >= sizeof(hostbuf) is theorically impossible
322 		 * but i keep the test in case of... Safer, imho --Zas */
323 		assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
324 			"parse_uri(): addrlen value is bad (%d) for URL '%s'. "
325 			"Problems are likely to be encountered. Please report "
326 			"this, it is a security bug!", addrlen, uristring);
327 		if_assert_failed return URI_ERRNO_IPV6_SECURITY;
328 
329 		uri->host = lbracket + 1;
330 		uri->hostlen = addrlen;
331 	} else
332 #endif
333 	{
334 		uri->host = prefix_end;
335 		uri->hostlen = host_end - prefix_end;
336 
337 		/* Trim trailing '.'s */
338 		if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
339 			return URI_ERRNO_TRAILING_DOTS;
340 	}
341 
342 	if (*host_end == ':') { /* we have port here */
343 		unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");
344 
345 		host_end++;
346 
347 		uri->port = host_end;
348 		uri->portlen = port_end - host_end;
349 
350 		if (uri->portlen == 0)
351 			return URI_ERRNO_NO_PORT_COLON;
352 
353 		/* We only use 8 bits for portlen so better check */
354 		if (uri->portlen != port_end - host_end)
355 			return URI_ERRNO_INVALID_PORT;
356 
357 		/* test if port is number */
358 		/* TODO: possibly lookup for the service otherwise? --pasky */
359 		for (; host_end < port_end; host_end++)
360 			if (!isdigit(*host_end))
361 				return URI_ERRNO_INVALID_PORT;
362 
363 		/* Check valid port value, and let show an error message
364 		 * about invalid url syntax. */
365 		if (uri->port && uri->portlen) {
366 			int n;
367 
368 			errno = 0;
369 			n = strtol(uri->port, NULL, 10);
370 			if (errno || !uri_port_is_valid(n))
371 				return URI_ERRNO_INVALID_PORT;
372 		}
373 	}
374 
375 	if (*host_end == '/') {
376 		host_end++;
377 
378 	} else if (get_protocol_need_slash_after_host(uri->protocol)) {
379 		/* The need for slash after the host component depends on the
380 		 * need for a host component. -- The dangerous mind of Jonah */
381 		if (!uri->hostlen)
382 			return URI_ERRNO_NO_HOST;
383 
384 		return URI_ERRNO_NO_HOST_SLASH;
385 	}
386 
387 	/* Look for #fragment or POST_CHAR */
388 	prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
389 	uri->data = host_end;
390 	uri->datalen = prefix_end - host_end;
391 
392 	if (*prefix_end == '#') {
393 		uri->fragment = prefix_end + 1;
394 		uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
395 		prefix_end = uri->fragment + uri->fragmentlen;
396 	}
397 
398 	if (*prefix_end == POST_CHAR) {
399 		uri->post = prefix_end + 1;
400 	}
401 
402 	return URI_ERRNO_OK;
403 }
404 
405 int
get_uri_port(struct uri * uri)406 get_uri_port(struct uri *uri)
407 {
408 	if (uri->port && uri->portlen) {
409 		unsigned char *end = uri->port;
410 		int port = strtol(uri->port, (char **) &end, 10);
411 
412 		if (end != uri->port) {
413 			assert(uri_port_is_valid(port));
414 			return port;
415 		}
416 	}
417 
418 	return get_protocol_port(uri->protocol);
419 }
420 
/* Non-zero when none of the components that compare_uri() cannot handle
 * (URI_SPECIAL, URI_IDN) are set in @comp. */
#define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))

/* Compares the component @a of length @alen with the component @b of length
 * @blen. Returns 1 when they are equal and 0 otherwise. Two components are
 * equal when their lengths match, both or neither of the pointers is NULL
 * and, for set non-empty components, the bytes are identical. */
static inline int
compare_component(unsigned char *a, int alen, unsigned char *b, int blen)
{
	const int a_set = (a != NULL);
	const int b_set = (b != NULL);

	/* Check that the length and the strings are both set or unset */
	if (alen != blen) return 0;
	if (a_set != b_set) return 0;

	/* Both are unset or empty so that will make a perfect match */
	if (!a_set || alen == 0) return 1;

	/* Let the higher forces decide */
	return memcmp(a, b, blen) == 0;
}
435 
436 #define wants(x) (components & (x))
437 
438 int
compare_uri(struct uri * a,struct uri * b,enum uri_component components)439 compare_uri(struct uri *a, struct uri *b, enum uri_component components)
440 {
441 	if (a == b) return 1;
442 	if (!components) return 0;
443 
444 	assertm(can_compare_uri_components(components),
445 		"compare_uri() is a work in progress. Component unsupported");
446 
447 	return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
448 		&& (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
449 		&& (!wants(URI_USER)
450 		    || compare_component(a->user, a->userlen, b->user, b->userlen))
451 		&& (!wants(URI_PASSWORD)
452 		    || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
453 		&& (!wants(URI_HOST)
454 		    || compare_component(a->host, a->hostlen, b->host, b->hostlen))
455 		&& (!wants(URI_PORT)
456 		    || compare_component(a->port, a->portlen, b->port, b->portlen))
457 		&& (!wants(URI_DATA)
458 		    || compare_component(a->data, a->datalen, b->data, b->datalen))
459 		&& (!wants(URI_FRAGMENT)
460 		    || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
461 		&& (!wants(URI_POST)
462 		    || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
463 }
464 
465 
/* Appends the parts of @uri selected by the @components bit mask to @string.
 *
 * URIs whose protocol is PROTOCOL_UNKNOWN are appended verbatim, whatever
 * @components says. The URI_PATH/URI_FILENAME and URI_QUERY modes return as
 * soon as their part has been added, so fragment and post data are never
 * appended in those modes.
 *
 * Returns @string on the ordinary path; the early exits return whatever the
 * last add_to_string()/add_bytes_to_string() call returned. The results of
 * the other add_*_to_string() calls are not checked.
 *
 * We might need something more intelligent than this Swiss army knife. */
struct string *
add_uri_to_string(struct string *string, struct uri *uri,
		  enum uri_component components)
{
	/* Custom or unknown keep the URI untouched. */
	if (uri->protocol == PROTOCOL_UNKNOWN)
		return add_to_string(string, struri(uri));

	/* "<protocol>[<ip family digit>]:" plus "//" where the protocol
	 * uses it. */
	if (wants(URI_PROTOCOL)) {
		add_bytes_to_string(string, uri->string, uri->protocollen);
		if (wants(URI_IP_FAMILY) && uri->ip_family)
			add_long_to_string(string, uri->ip_family);
		add_char_to_string(string, ':');
		if (get_protocol_need_slashes(uri->protocol))
			add_to_string(string, "//");
	}

	/* "<user>[:<password>]@" -- the password is only ever added together
	 * with the user. */
	if (wants(URI_USER) && uri->userlen) {
		add_bytes_to_string(string, uri->user, uri->userlen);

		if (wants(URI_PASSWORD) && uri->passwordlen) {
			add_char_to_string(string, ':');
			add_bytes_to_string(string, uri->password,
						    uri->passwordlen);
		}

		add_char_to_string(string, '@');
	}

	if (wants(URI_HOST) && uri->hostlen) {
		/* Cleared when the IDN encoded host name was added instead of
		 * the raw one. */
		int add_host = 1;

#ifdef CONFIG_IPV6
		/* Rationale for wants(URI_PORT): The [notation] was invented
		 * so that you can have an IPv6 addy and a port together. So
		 * we want to use it when that happens, otherwise we need not
		 * bother (that happens only when we want it for DNS anyway).
		 * I insist on an implied elegancy of this way, but YMMV. ;-)
		 * --pasky */
		if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
#endif
#ifdef CONFIG_IDN
		/* Support for the GNU International Domain Name library.
		 *
		 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
		 *
		 * Now it is probably not perfect because idna_to_ascii_lz()
		 * will be using a ``zero terminated input string encoded in
		 * the current locale's character set''. Anyway I don't know
		 * how to convert anything to UTF-8 or Unicode. --jonas */
		if (wants(URI_IDN)) {
			/* idna_to_ascii_lz() needs a NUL-terminated string so
			 * work on a temporary copy of the host. */
			unsigned char *host = memacpy(uri->host, uri->hostlen);

			if (host) {
				char *idname;
				int code = idna_to_ascii_lz(host, &idname, 0);

				/* FIXME: Return NULL if it coughed? --jonas */
				if (code == IDNA_SUCCESS) {
					add_to_string(string, idname);
					/* @idname is released with plain
					 * free(), not mem_free(). */
					free(idname);
					add_host = 0;
				}

				mem_free(host);
			}
		}

#endif
		/* No IDN wanted, or the conversion failed: add the host as
		 * it is. */
		if (add_host)
			add_bytes_to_string(string, uri->host, uri->hostlen);

#ifdef CONFIG_IPV6
		if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
#endif
	}

	/* An explicit port is added for both URI_PORT and URI_DEFAULT_PORT;
	 * the protocol's default port only for URI_DEFAULT_PORT. */
	if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
		if (uri->portlen) {
			add_char_to_string(string, ':');
			add_bytes_to_string(string, uri->port, uri->portlen);

		} else if (wants(URI_DEFAULT_PORT)
			   && uri->protocol != PROTOCOL_USER) {
			/* For user protocols we don't know a default port.
			 * Should user protocols ports be configurable? */
			int port = get_protocol_port(uri->protocol);

			add_char_to_string(string, ':');
			add_long_to_string(string, port);
		}
	}

	/* Only add slash if we need to separate: data, post data or the
	 * referrer host was asked for, at least one component other than
	 * URI_DATA and URI_PORT is set, and the protocol puts a slash after
	 * the host. */
	if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
	    && wants(~(URI_DATA | URI_PORT))
	    && get_protocol_need_slash_after_host(uri->protocol))
		add_char_to_string(string, '/');

	if (wants(URI_DATA) && uri->datalen)
		add_bytes_to_string(string, uri->data, uri->datalen);

	/* We can not test uri->datalen here since we need to always
	 * add '/'. */
	if (wants(URI_PATH) || wants(URI_FILENAME)) {
		unsigned char *filename = uri->data;
		unsigned char *pos;

		assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
			"URI_FILENAME should be used alone %d", components);

		if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
			/* FIXME: Add correct separator */
			add_char_to_string(string, '/');
		}

		if (!uri->datalen) return string;

		/* Walk up to the end of the path part. For URI_FILENAME
		 * @filename is moved behind every directory separator met,
		 * so only the last path element remains. */
		for (pos = filename; *pos && !end_of_dir(*pos); pos++)
			if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
				filename = pos + 1;

		return add_bytes_to_string(string, filename, pos - filename);
	}

	if (wants(URI_QUERY) && uri->datalen) {
		unsigned char *query = memchr(uri->data, '?', uri->datalen);

		assertm(URI_QUERY == components,
			"URI_QUERY should be used alone %d", components);

		if (!query) return string;

		/* Skip the '?' itself. */
		query++;
		/* Check fragment and POST_CHAR */
		return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
	}

	if (wants(URI_FRAGMENT) && uri->fragmentlen) {
		add_char_to_string(string, '#');
		add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
	}

	if (wants(URI_POST) && uri->post) {
		/* The raw post data, introduced by the POST_CHAR marker. */
		add_char_to_string(string, POST_CHAR);
		add_to_string(string, uri->post);

	} else if (wants(URI_POST_INFO) && uri->post) {
		/* Only a short note telling what kind of post data there is,
		 * judged by how the post data starts. */
		if (!strncmp(uri->post, "text/plain", 10)) {
			add_to_string(string, " (PLAIN TEXT DATA)");

		} else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
			add_to_string(string, " (MULTIPART FORM DATA)");

		} else {
			add_to_string(string, " (POST DATA)");
		}

	}

	return string;
}

#undef wants
631 
632 unsigned char *
get_uri_string(struct uri * uri,enum uri_component components)633 get_uri_string(struct uri *uri, enum uri_component components)
634 {
635 	struct string string;
636 
637 	if (init_string(&string)
638 	    && add_uri_to_string(&string, uri, components))
639 		return string.source;
640 
641 	done_string(&string);
642 	return NULL;
643 }
644 
645 
646 struct string *
add_string_uri_to_string(struct string * string,unsigned char * uristring,enum uri_component components)647 add_string_uri_to_string(struct string *string, unsigned char *uristring,
648 			 enum uri_component components)
649 {
650 	struct uri uri;
651 
652 	if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
653 		return NULL;
654 
655 	return add_uri_to_string(string, &uri, components);
656 }
657 
658 
/* normalize_uri_reparse() lets normalize_uri() parse @str on its own;
 * normalize_uri_noparse() is for when @uri has already been parsed. */
#define normalize_uri_reparse(str)	normalize_uri(NULL, str)
#define normalize_uri_noparse(uri)	normalize_uri(uri, struri(uri))

/* Normalizes the URI in @uristring in place: the protocol name and the host
 * are lowercased and the path is cleaned from "/./", "/../" and "//".
 *
 * When @uri is NULL, @uristring is parsed into a local struct uri first. When
 * @uri is given it is taken to already describe @uristring and no parsing is
 * done for the outermost URI. For PROTOCOL_PROXY URIs the URI contained in
 * the data part is parsed and processed in turn, so it is the innermost URI
 * whose path gets cleaned up.
 *
 * Always returns @uristring, also when parsing fails or the protocol has a
 * free syntax; the string is then left as far as it had been processed. */
unsigned char *
normalize_uri(struct uri *uri, unsigned char *uristring)
{
	unsigned char *parse_string = uristring;
	unsigned char *src, *dest, *path;
	int need_slash = 0;
	int parse = (uri == NULL);
	struct uri uri_struct;

	if (!uri) uri = &uri_struct;

	/* We need to get the real (proxied) URI but lowercase relevant URI
	 * parts along the way. */
	do {
		if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
			return uristring;

		assert(uri->data);

		/* This is maybe not the right place, but both join_urls() and
		 * get_translated_uri() (through translate_url()) call this
		 * function, and by then it already works on and modifies an
		 * allocated copy. */
		convert_to_lowercase_locale_indep(uri->string, uri->protocollen);
		if (uri->hostlen) convert_to_lowercase(uri->host, uri->hostlen);

		/* Any further round handles the URI nested in the data part
		 * and that one always has to be parsed. */
		parse = 1;
		parse_string = uri->data;
	} while (uri->protocol == PROTOCOL_PROXY);

	if (get_protocol_free_syntax(uri->protocol))
		return uristring;

	if (uri->protocol != PROTOCOL_UNKNOWN)
		need_slash = get_protocol_need_slash_after_host(uri->protocol);

	/* We want to start at the first slash to also reduce URIs like
	 * http://host//index.html to http://host/index.html */
	path = uri->data - need_slash;
	dest = src = path;

	/* This loop mangles the URI string by removing directory elevators and
	 * other cruft. Example: /.././etc////..//usr/ -> /usr/
	 * @src is where we read from and @dest where we write to; @dest never
	 * gets ahead of @src and never goes below @path. */
	while (*dest) {
		/* If the following pieces are the LAST parts of URL, we remove
		 * them as well. See RFC 1808 for details. */

		if (end_of_dir(src[0])) {
			/* URL data contains no more path. Move the rest
			 * (including the NUL) down to @dest unchanged. */
			memmove(dest, src, strlen(src) + 1);
			break;
		}

		if (!is_uri_dir_sep(uri, src[0])) {
			/* This is to reduce indentation */

		} else if (src[1] == '.') {
			if (!src[2]) {
				/* /. - skip the dot */
				*dest++ = *src;
				*dest = 0;
				break;

			} else if (is_uri_dir_sep(uri, src[2])) {
				/* /./ - strip that.. */
				src += 2;
				continue;

			} else if (src[2] == '.'
				   && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
				/* /../ or /.. - skip it and preceding element. */

				/* First back out the last incrementation of
				 * @dest (dest++) to get the position that was
				 * last assigned to. */
				if (dest > path) dest--;

				/* @dest might be pointing to a dir separator
				 * so we decrement before any testing. */
				while (dest > path) {
					dest--;
					if (is_uri_dir_sep(uri, *dest)) break;
				}

				if (!src[3]) {
					/* /.. - add ending slash and stop */
					*dest++ = *src;
					*dest = 0;
					break;
				}

				src += 3;
				continue;
			}

			/* Anything else starting with "/." (like "/.name" or
			 * "/..name") is an ordinary path element and gets
			 * copied below. */

		} else if (is_uri_dir_sep(uri, src[1])) {
			/* // - ignore first '/'. */
			src += 1;
			continue;
		}

		/* We don't want to access memory past the NUL char. */
		*dest = *src++;
		if (*dest) dest++;
	}

	return uristring;
}
770 
771 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
772  * of just the complete path to file/directory, which the dumb 'file' protocol
773  * backend can understand. No host parts etc, that is what this function is
774  * supposed to chew. */
775 static struct uri *
transform_file_url(struct uri * uri,unsigned char * cwd)776 transform_file_url(struct uri *uri, unsigned char *cwd)
777 {
778 	unsigned char *path = uri->data;
779 
780 	assert(uri->protocol == PROTOCOL_FILE && uri->data);
781 
782 	/* Sort out the host part. We currently support only host "localhost"
783 	 * (plus empty host part will be assumed to be "localhost" as well).
784 	 * As our extensions, '.' will reference to the cwd on localhost
785 	 * (originally, when the first thing after file:// wasn't "localhost/",
786 	 * we assumed the cwd as well, and pretended that there's no host part
787 	 * at all) and '..' to the directory parent to cwd. Another extension
788 	 * is that if this is a DOS-like system, the first char in two-char
789 	 * host part is uppercase letter and the second char is a colon, it is
790 	 * assumed to be a local disk specification. */
791 	/* TODO: Use FTP for non-localhost hosts. --pasky */
792 
793 	/* For URL "file://", we open the current directory. Some other
794 	 * browsers instead open root directory, but AFAIK the standard does
795 	 * not specify that and this was the original behaviour and it is more
796 	 * consistent with our file://./ notation. */
797 
798 	/* Who would name their file/dir '...' ? */
799 	if (*path == '.' || !*path) {
800 		struct string dir;
801 
802 		if (!init_string(&dir))
803 			return NULL;
804 
805 		encode_uri_string(&dir, cwd, -1, 0);
806 
807 		/* Either we will end up with '//' and translate_directories()
808 		 * will shorten it or the '/' will mark the inserted cwd as a
809 		 * directory. */
810 		if (*path == '.') *path = '/';
811 
812 		/* Insert the current working directory. */
813 		/* The offset is 7 == sizeof("file://") - 1. */
814 		insert_in_string(&struri(uri), 7, dir.source, dir.length);
815 
816 		done_string(&dir);
817 		return uri;
818 	}
819 
820 #ifdef DOS_FS
821 	if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
822 		return NULL;
823 #endif
824 
825 	for (; *path && !dir_sep(*path); path++);
826 
827 	/* FIXME: We will in fact assume localhost even for non-local hosts,
828 	 * until we will support the FTP transformation. --pasky */
829 
830 	memmove(uri->data, path, strlen(path) + 1);
831 	return uri;
832 }
833 
/* Defined further down; join_urls() needs it for URIs that must be
 * sanitized. */
static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);

/* Resolves the reference @rel against the parsed URI @base and returns the
 * resulting URI as a new string, or NULL on failure.
 *
 * The cases, in the order they are tried:
 *  - "#..."  replaces the fragment (and post data) of @base
 *  - "?..."  replaces everything from the query of @base on
 *  - "//..." keeps only "<protocol>:" of @base
 *  - @rel starts with a protocol name that is known and not the proxy one:
 *    it is taken as a full URI and handed to translate_url()
 *  - otherwise @rel is a path, either absolute (replaces the path of @base)
 *    or relative to the directory part of the path of @base.
 *
 * NOTE(review): the result comes from memacpy()/mem_alloc() or from
 * translate_url(), so presumably the caller has to mem_free() it -- confirm
 * against the callers. */
unsigned char *
join_urls(struct uri *base, unsigned char *rel)
{
	unsigned char *uristring, *path;
	int add_slash = 0;
	int translate = 0;
	/* How many leading bytes of the base URI string to keep. Stays zero
	 * when none of the special cases below applies. */
	int length = 0;

	/* See RFC 1808 */
	/* TODO: Support for ';' ? (see the RFC) --pasky */

	/* For '#', '?' and '//' we could use get_uri_string() but it might be
	 * too expensive since it uses granular allocation scheme. I wouldn't
	 * personally mind tho' because it would be cleaner. --jonas */
	if (rel[0] == '#') {
		/* Strip fragment and post part from the base URI and append
		 * the fragment string in @rel. base->fragment points right
		 * behind the '#', hence the - 1. */
		length  = base->fragment
			? base->fragment - struri(base) - 1
			: get_real_uri_length(base);

	} else if (rel[0] == '?') {
		/* Strip query, fragment and post part from the base URI and
		 * append the query string in @rel. */
		length  = base->fragment ? base->fragment - struri(base) - 1
					 : get_real_uri_length(base);

		/* If the base has a query, cut already there. @uristring is
		 * only borrowed as a scratch pointer here. */
		uristring = memchr(base->data, '?', base->datalen);
		if (uristring) length = uristring - struri(base);

	} else if (rel[0] == '/' && rel[1] == '/') {
		if (!get_protocol_need_slashes(base->protocol))
			return NULL;

		/* Get `<protocol>:' from the base URI and append the `//' part
		 * from @rel. */
		length = base->protocollen + 1;

		/* We need to sanitize the relative part and add stuff like
		 * host slash. */
		translate = 1;
	}

	/* If one of the tests above set @length to something useful */
	if (length) {
		uristring = memacpy(struri(base), length);
		if (!uristring) return NULL;

		/* NOTE(review): the result of add_to_strn() is not checked. */
		add_to_strn(&uristring, rel);

		if (translate) {
			unsigned char *translated;

			/* translate_url() hands back a string of its own, so
			 * the intermediate one is released here. */
			translated = translate_url(uristring, NULL);
			mem_free(uristring);
			return translated;
		}
		return normalize_uri_reparse(uristring);
	}

	/* Check if there is some protocol name to go for */
	length = get_protocol_length(rel);
	if (length) {
		switch (get_protocol(rel, length)) {
		case PROTOCOL_UNKNOWN:
		case PROTOCOL_PROXY:
			/* Mysteriously proxy URIs are breaking here ... */
			break;

		case PROTOCOL_FILE:
			/* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
			 * to translate_url(). */
			/* Falls through on purpose. */
		default:
			/* If the translation fails, @rel is treated as a
			 * path below. */
			uristring = translate_url(rel, NULL);
			if (uristring) return uristring;
		}
	}

	assertm(base->data, "bad base url");
	if_assert_failed return NULL;

	path = base->data;

	/* Either is path blank, but we've slash char before, or path is not
	 * blank, but doesn't start by a slash (if we'd just stay along with
	 * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
	 * should be enough, but I'm not sure and I don't want to break
	 * anything --pasky). */
	/* We skip first char of URL ('/') in parse_url() (ARGH). This
	 * is reason of all this bug-bearing magic.. */
	if (*path) {
		if (!is_uri_dir_sep(base, *path)) path--;
	} else {
		if (is_uri_dir_sep(base, path[-1])) path--;
	}

	if (!is_uri_dir_sep(base, rel[0])) {
		unsigned char *path_end;

		/* The URL is relative. */

		if (!*path) {
			/* There's no path in the URL, but we're going to add
			 * something there, and the something doesn't start by
			 * a slash. So we need to insert a slash after the base
			 * URL. Clever, eh? ;) */
			add_slash = 1;
		}

		for (path_end = path; *path_end; path_end++) {
			if (end_of_dir(*path_end)) break;
			/* Modify the path pointer, so that it'll always point
			 * above the last '/' in the URL; later, we'll copy the
			 * URL only _TO_ this point, and anything after last
			 * slash will be substituted by 'rel'. */
			if (is_uri_dir_sep(base, *path_end))
				path = path_end + 1;
		}
	}

	/* Everything of the base in front of @path is kept, then comes the
	 * optional slash, @rel and the terminating NUL. */
	length = path - struri(base);
	uristring = mem_alloc(length + strlen(rel) + add_slash + 1);
	if (!uristring) return NULL;

	memcpy(uristring, struri(base), length);
	if (add_slash) uristring[length] = '/';
	strcpy(uristring + length + add_slash, rel);

	return normalize_uri_reparse(uristring);
}
966 
967 
968 /* Tries to figure out what protocol @newurl might be specifying by checking if
969  * it exists as a file locally or by checking parts of the host name. */
970 static enum protocol
find_uri_protocol(unsigned char * newurl)971 find_uri_protocol(unsigned char *newurl)
972 {
973 	unsigned char *ch;
974 
975 	/* First see if it is a file so filenames that look like hostnames
976 	 * won't confuse us below. */
977 	if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
978 
979 	/* Yes, it would be simpler to make test for IPv6 address first,
980 	 * but it would result in confusing mix of ifdefs ;-). */
981 	/* FIXME: Ideas for improve protocol detection
982 	 *
983 	 * - Handle common hostnames. It could be part of the protocol backend
984 	 *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
985 	 *
986 	 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
987 	 */
988 
989 	ch = newurl + strcspn(newurl, ".:/@");
990 	if (*ch == '@'
991 	    || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
992 	    || !c_strncasecmp(newurl, "ftp.", 4)) {
993 		/* Contains user/password/ftp-hostname */
994 		return PROTOCOL_FTP;
995 
996 #ifdef CONFIG_IPV6
997 	} else if (*newurl == '[' && *ch == ':') {
998 		/* Candidate for IPv6 address */
999 		unsigned char *bracket2, *colon2;
1000 
1001 		ch++;
1002 		bracket2 = strchr(ch, ']');
1003 		colon2 = strchr(ch, ':');
1004 		if (bracket2 && colon2 && bracket2 > colon2)
1005 			return PROTOCOL_HTTP;
1006 #endif
1007 
1008 	} else if (*newurl != '.' && *ch == '.') {
1009 		/* Contains domain name? */
1010 		unsigned char *host_end, *domain;
1011 		unsigned char *ipscan;
1012 
1013 		/* Process the hostname */
1014 		for (domain = ch + 1;
1015 			*(host_end = domain + strcspn(domain, ".:/?")) == '.';
1016 			domain = host_end + 1);
1017 
1018 		/* It's IP? */
1019 		for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1020 			ipscan++);
1021 
1022 		if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1023 			return PROTOCOL_HTTP;
1024 
1025 		/* It's two-letter or known TLD? */
1026 		if (host_end - domain == 2
1027 		    || end_with_known_tld(domain, host_end - domain) >= 0)
1028 			return PROTOCOL_HTTP;
1029 	}
1030 
1031 	return PROTOCOL_UNKNOWN;
1032 }
1033 
1034 
1035 #define MAX_TRANSLATION_ATTEMPTS	32
1036 
1037 /* Returns an URI string that can be used internally. Adding protocol prefix,
1038  * missing slashes etc. */
1039 static unsigned char *
translate_url(unsigned char * url,unsigned char * cwd)1040 translate_url(unsigned char *url, unsigned char *cwd)
1041 {
1042 	unsigned char *newurl;
1043 	struct uri uri;
1044 	enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1045 	int retries = 0;
1046 
1047 	/* Strip starting spaces */
1048 	while (*url == ' ') url++;
1049 	if (!*url) return NULL;
1050 
1051 	newurl = expand_tilde(url); /* XXX: Post data copy. */
1052 	if (!newurl) return NULL;
1053 
1054 parse_uri:
1055 	/* Yay a goto loop. If we get some URI parse error and try to
1056 	 * fix it we go back to here and try again. */
1057 	/* Ordinary parse */
1058 	uri_errno = parse_uri(&uri, newurl);
1059 
1060 	/* Bail out if the same error occurs twice */
1061 	if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1062 		if (retries > MAX_TRANSLATION_ATTEMPTS) {
1063 			ERROR("Maximum number of parsing attempts exceeded "
1064 			      "for %s.", url);
1065 		}
1066 		mem_free(newurl);
1067 		return NULL;
1068 	}
1069 
1070 	prev_errno = uri_errno;
1071 
1072 	switch (uri_errno) {
1073 	case URI_ERRNO_OK:
1074 		/* Fix translation of 1.2.3.4:5 so IP address part won't be
1075 		 * interpreted as the protocol name. */
1076 		if (uri.protocol == PROTOCOL_UNKNOWN) {
1077 			enum protocol protocol = find_uri_protocol(newurl);
1078 
1079 			/* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1080 			 * case. */
1081 			if (protocol != PROTOCOL_UNKNOWN) {
1082 				struct string str;
1083 
1084 				if (!init_string(&str)) return NULL;
1085 
1086 				switch (protocol) {
1087 				case PROTOCOL_FTP:
1088 					add_to_string(&str, "ftp://");
1089 					encode_uri_string(&str, newurl, -1, 0);
1090 					break;
1091 
1092 				case PROTOCOL_HTTP:
1093 					add_to_string(&str, "http://");
1094 					add_to_string(&str, newurl);
1095 					break;
1096 
1097 				case PROTOCOL_UNKNOWN:
1098 					break;
1099 
1100 				case PROTOCOL_FILE:
1101 				default:
1102 					add_to_string(&str, "file://");
1103 					if (!dir_sep(*newurl))
1104 						add_to_string(&str, "./");
1105 
1106 					add_to_string(&str, newurl);
1107 				}
1108 
1109 				mem_free(newurl);
1110 				newurl = str.source;
1111 
1112 				/* Work around the infinite loop prevention */
1113 				prev_errno = URI_ERRNO_EMPTY;
1114 				goto parse_uri;
1115 			}
1116 		}
1117 
1118 		/* If file:// URI is transformed we need to reparse. */
1119 		if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1120 		    && transform_file_url(&uri, cwd))
1121 			return normalize_uri_reparse(struri(&uri));
1122 
1123 		/* Translate the proxied URI too if proxy:// */
1124 		if (uri.protocol == PROTOCOL_PROXY) {
1125 			unsigned char *data = translate_url(uri.data, cwd);
1126 			int pos = uri.data - struri(&uri);
1127 
1128 			if (!data) break;
1129 			struri(&uri)[pos] = 0;
1130 			insert_in_string(&struri(&uri), pos, data, strlen(data));
1131 			mem_free(data);
1132 			return normalize_uri_reparse(struri(&uri));
1133 		}
1134 
1135 		return normalize_uri_noparse(&uri);
1136 
1137 	case URI_ERRNO_TOO_MANY_SLASHES:
1138 	{
1139 		unsigned char *from, *to;
1140 
1141 		assert(uri.string[uri.protocollen] == ':'
1142 		       && uri.string[uri.protocollen + 1] == '/'
1143 		       && uri.string[uri.protocollen + 2] == '/');
1144 
1145 		from = to = uri.string + uri.protocollen + 3;
1146 		while (*from == '/') from++;
1147 
1148 		assert(to < from);
1149 		memmove(to, from, strlen(from) + 1);
1150 		goto parse_uri;
1151 	}
1152 	case URI_ERRNO_NO_SLASHES:
1153 	{
1154 		/* Try prefix:some.url -> prefix://some.url.. */
1155 		int slashes = 2;
1156 
1157 		/* Check if only one '/' is needed. */
1158 		if (uri.string[uri.protocollen + 1] == '/')
1159 			slashes--;
1160 
1161 		insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1162 		goto parse_uri;
1163 	}
1164 	case URI_ERRNO_TRAILING_DOTS:
1165 	{
1166 		/* Trim trailing '.'s */
1167 		unsigned char *from = uri.host + uri.hostlen;
1168 		unsigned char *to = from;
1169 
1170 		assert(uri.host < to && to[-1] == '.' && *from != '.');
1171 
1172 		while (uri.host < to && to[-1] == '.') to--;
1173 
1174 		assert(to < from);
1175 		memmove(to, from, strlen(from) + 1);
1176 		goto parse_uri;
1177 	}
1178 	case URI_ERRNO_NO_PORT_COLON:
1179 		assert(uri.portlen == 0
1180 		       && uri.string < uri.port
1181 		       && uri.port[-1] == ':');
1182 
1183 		memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1184 		goto parse_uri;
1185 
1186 	case URI_ERRNO_NO_HOST_SLASH:
1187 	{
1188 		int offset = uri.port
1189 			   ? uri.port + uri.portlen - struri(&uri)
1190 			   : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1191 
1192 		assertm(uri.host, "uri.host not set after no host slash error");
1193 		insert_in_string(&newurl, offset, "/", 1);
1194 		goto parse_uri;
1195 	}
1196 	case URI_ERRNO_INVALID_PROTOCOL:
1197 	{
1198 		/* No protocol name */
1199 		enum protocol protocol = find_uri_protocol(newurl);
1200 		struct string str;
1201 
1202 		if (!init_string(&str)) return NULL;
1203 
1204 		switch (protocol) {
1205 			case PROTOCOL_FTP:
1206 				add_to_string(&str, "ftp://");
1207 				encode_uri_string(&str, newurl, -1, 0);
1208 				break;
1209 
1210 			case PROTOCOL_HTTP:
1211 				add_to_string(&str, "http://");
1212 				add_to_string(&str, newurl);
1213 				break;
1214 
1215 			case PROTOCOL_UNKNOWN:
1216 				/* We default to file:// even though we already
1217 				 * tested if the file existed since it will give
1218 				 * a "No such file or directory" error.  which
1219 				 * might better hint the user that there was
1220 				 * problem figuring out the URI. */
1221 			case PROTOCOL_FILE:
1222 			default:
1223 				add_to_string(&str, "file://");
1224 				if (!dir_sep(*newurl))
1225 					add_to_string(&str, "./");
1226 
1227 				encode_file_uri_string(&str, newurl);
1228 		}
1229 
1230 		mem_free(newurl);
1231 		newurl = str.source;
1232 
1233 		goto parse_uri;
1234 	}
1235 	case URI_ERRNO_EMPTY:
1236 	case URI_ERRNO_IPV6_SECURITY:
1237 	case URI_ERRNO_NO_HOST:
1238 	case URI_ERRNO_INVALID_PORT:
1239 	case URI_ERRNO_INVALID_PORT_RANGE:
1240 		/* None of these can be handled properly. */
1241 		break;
1242 	}
1243 
1244 	mem_free(newurl);
1245 	return NULL;
1246 }
1247 
1248 
1249 struct uri *
get_composed_uri(struct uri * uri,enum uri_component components)1250 get_composed_uri(struct uri *uri, enum uri_component components)
1251 {
1252 	unsigned char *string;
1253 
1254 	assert(uri);
1255 	if_assert_failed return NULL;
1256 
1257 	string = get_uri_string(uri, components);
1258 	if (!string) return NULL;
1259 
1260 	uri = get_uri(string, 0);
1261 	mem_free(string);
1262 
1263 	return uri;
1264 }
1265 
1266 struct uri *
get_translated_uri(unsigned char * uristring,unsigned char * cwd)1267 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1268 {
1269 	struct uri *uri;
1270 
1271 	uristring = translate_url(uristring, cwd);
1272 	if (!uristring) return NULL;
1273 
1274 	uri = get_uri(uristring, 0);
1275 	mem_free(uristring);
1276 
1277 	return uri;
1278 }
1279 
1280 
1281 unsigned char *
get_extension_from_uri(struct uri * uri)1282 get_extension_from_uri(struct uri *uri)
1283 {
1284 	unsigned char *extension = NULL;
1285 	int afterslash = 1;
1286 	unsigned char *pos = uri->data;
1287 
1288 	assert(pos);
1289 
1290 	for (; *pos && !end_of_dir(*pos); pos++) {
1291 		if (!afterslash && !extension && *pos == '.') {
1292 			extension = pos;
1293 		} else if (is_uri_dir_sep(uri, *pos)) {
1294 			extension = NULL;
1295 			afterslash = 1;
1296 		} else {
1297 			afterslash = 0;
1298 		}
1299 	}
1300 
1301 	if (extension && extension < pos)
1302 		return memacpy(extension, pos - extension);
1303 
1304 	return NULL;
1305 }
1306 
/* URI encoding, escaping unallowed characters. */

/* Returns 1 if @c may be copied to an encoded URI as is, 0 if it has to be
 * escaped. */
static inline int
safe_char(unsigned char c)
{
	if (isident(c))
		return 1;

	/* RFC 2396, Page 8, Section 2.3 ;-) */
	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;
	default:
		return 0;
	}
}
1315 
/* Appends the first @namelen bytes of @name to @string, replacing every byte
 * that is not safe_char() with a "%XY" escape (hex digits from hx()).
 *
 * A negative @namelen means "use strlen(@name)". '/' is copied unescaped
 * unless @convert_slashes is set. Spaces are escaped like any other unsafe
 * byte; they are not turned into '+', which would probably be correct only
 * for the query part of a URI. */
void
encode_uri_string(struct string *string, unsigned char *name, int namelen,
		  int convert_slashes)
{
	unsigned char escape[4] = { '%', '\0', '\0', '\0' };
	int i;

	if (namelen < 0) namelen = strlen(name);

	for (i = 0; i < namelen; i++) {
		unsigned char c = name[i];
		int literal = safe_char(c) || (c == '/' && !convert_slashes);

		if (literal) {
			add_char_to_string(string, c);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = hx((c >> 4) & 0xF);
		escape[2] = hx(c & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
1344 
/* Decodes the "%XY" escapes of @src in place.
 *
 * Overwriting the argument is deliberate: the decoded text is never longer
 * than the encoded one, so no second buffer is needed.
 *
 * A '%' that is not followed by two hex digits is copied unchanged, and so
 * is "%00", so a NUL byte can never be smuggled into the result. '+' is not
 * translated to a space; that would only be right for the query part. */
void
decode_uri(unsigned char *src)
{
	unsigned char *in = src;
	unsigned char *out = src;

	for (;;) {
		unsigned char ch = *in++;

		if (ch == '%') {
			/* The second digit is only looked at when the first
			 * one is valid, so we never read past the NUL. */
			int hi = unhx(in[0]);
			int lo = hi >= 0 ? unhx(in[1]) : -1;

			if (lo >= 0) {
				int value = (hi << 4) + lo;

				if (value != 0) { /* don't allow %00 */
					ch = (unsigned char) value;
					in += 2;
				}
			}
		}

		*out++ = ch;

		/* The terminating NUL has been copied as well; done. */
		if (ch == '\0')
			break;
	}
}
1385 
1386 void
decode_uri_string(struct string * string)1387 decode_uri_string(struct string *string)
1388 {
1389 	decode_uri(string->source);
1390 	string->length = strlen(string->source);
1391 }
1392 
/* Decodes @src in place like decode_uri() and then replaces every byte that
 * is a control character or not printable with '*'. */
void
decode_uri_for_display(unsigned char *src)
{
	unsigned char *p;

	decode_uri(src);

	for (p = src; *p != '\0'; p++) {
		if (iscntrl(*p) || !isprint(*p))
			*p = '*';
	}
}
1402 
1403 void
decode_uri_string_for_display(struct string * string)1404 decode_uri_string_for_display(struct string *string)
1405 {
1406 	decode_uri_for_display(string->source);
1407 	string->length = strlen(string->source);
1408 }
1409 
1410 
1411 /* URI list */
1412 
1413 #define URI_LIST_GRANULARITY 0x3
1414 
1415 #define realloc_uri_list(list) \
1416 	mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1417 			struct uri *, URI_LIST_GRANULARITY)
1418 
1419 struct uri *
add_to_uri_list(struct uri_list * list,struct uri * uri)1420 add_to_uri_list(struct uri_list *list, struct uri *uri)
1421 {
1422 	if (!realloc_uri_list(list))
1423 		return NULL;
1424 
1425 	list->uris[list->size++] = get_uri_reference(uri);
1426 
1427 	return uri;
1428 };
1429 
1430 void
free_uri_list(struct uri_list * list)1431 free_uri_list(struct uri_list *list)
1432 {
1433 	struct uri *uri;
1434 	int index;
1435 
1436 	if (!list->uris) return;
1437 
1438 	foreach_uri (uri, index, list) {
1439 		done_uri(uri);
1440 	}
1441 
1442 	mem_free_set(&list->uris, NULL);
1443 	list->size = 0;
1444 }
1445 
1446 /* URI cache */
1447 
1448 struct uri_cache_entry {
1449 	struct uri uri;
1450 	unsigned char string[1];
1451 };
1452 
1453 struct uri_cache {
1454 	struct hash *map;
1455 	struct object object;
1456 };
1457 
1458 static struct uri_cache uri_cache;
1459 
#ifdef CONFIG_DEBUG
/* Debug aid: reports an internal error if the protocol or the host part of
 * @uri contains an uppercase letter (as judged by c_isupper()). */
static inline void
check_uri_sanity(struct uri *uri)
{
	int uppercase = 0;
	int i;

	/* The protocol name sits at the very start of the URI string. */
	for (i = 0; !uppercase && i < uri->protocollen; i++)
		uppercase = c_isupper(uri->string[i]);

	for (i = 0; !uppercase && i < uri->hostlen; i++)
		uppercase = c_isupper(uri->host[i]);

	if (uppercase) {
		INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
	}
}
#else
/* Compiled out when CONFIG_DEBUG is not defined. */
#define check_uri_sanity(uri)
#endif
1479 
1480 static inline struct uri_cache_entry *
get_uri_cache_entry(unsigned char * string,int length)1481 get_uri_cache_entry(unsigned char *string, int length)
1482 {
1483 	struct uri_cache_entry *entry;
1484 	struct hash_item *item;
1485 
1486 	assert(string && length > 0);
1487 	if_assert_failed return NULL;
1488 
1489 	item = get_hash_item(uri_cache.map, string, length);
1490 	if (item) return item->value;
1491 
1492 	/* Setup a new entry */
1493 
1494 	entry = mem_calloc(1, sizeof(*entry) + length);
1495 	if (!entry) return NULL;
1496 
1497 	object_nolock(&entry->uri, "uri");
1498 	memcpy(&entry->string, string, length);
1499 	string = entry->string;
1500 
1501 	if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1502 	    || !add_hash_item(uri_cache.map, string, length, entry)) {
1503 		mem_free(entry);
1504 		return NULL;
1505 	}
1506 
1507 	object_lock(&uri_cache);
1508 
1509 	return entry;
1510 }
1511 
1512 struct uri *
get_uri(unsigned char * string,enum uri_component components)1513 get_uri(unsigned char *string, enum uri_component components)
1514 {
1515 	struct uri_cache_entry *entry;
1516 
1517 	assert(string);
1518 
1519 	if (components) {
1520 		struct uri uri;
1521 
1522 		if (parse_uri(&uri, string) != URI_ERRNO_OK)
1523 			return NULL;
1524 
1525 		return get_composed_uri(&uri, components);
1526 	}
1527 
1528 	if (!is_object_used(&uri_cache)) {
1529 		uri_cache.map = init_hash(hash_size(3), strhash);
1530 		if (!uri_cache.map) return NULL;
1531 		object_nolock(&uri_cache, "uri_cache");
1532 	}
1533 
1534 	entry = get_uri_cache_entry(string, strlen(string));
1535 	if (!entry) {
1536 		if (!is_object_used(&uri_cache))
1537 			free_hash(uri_cache.map);
1538 		return NULL;
1539 	}
1540 
1541 	check_uri_sanity(&entry->uri);
1542 	object_nolock(&entry->uri, "uri");
1543 	object_lock(&entry->uri);
1544 
1545 	return &entry->uri;
1546 }
1547 
1548 void
done_uri(struct uri * uri)1549 done_uri(struct uri *uri)
1550 {
1551 	unsigned char *string = struri(uri);
1552 	int length = strlen(string);
1553 	struct hash_item *item;
1554 	struct uri_cache_entry *entry;
1555 
1556 	assert(is_object_used(&uri_cache));
1557 
1558 	object_unlock(uri);
1559 	if (is_object_used(uri)) return;
1560 
1561 	item = get_hash_item(uri_cache.map, string, length);
1562 	entry = item ? item->value : NULL;
1563 
1564 	assertm(entry, "Releasing unknown URI [%s]", string);
1565 	del_hash_item(uri_cache.map, item);
1566 	mem_free(entry);
1567 
1568 	/* Last URI frees the cache */
1569 	object_unlock(&uri_cache);
1570 	if (!is_object_used(&uri_cache))
1571 		free_hash(uri_cache.map);
1572 }
1573