1 /*
2  * Copyright (c) 2012 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * URI/IRI routines
22  * about encoding see http://nikitathespider.com/articles/EncodingDivination.html
23  * about GET encoding see https://stackoverflow.com/questions/1549213/whats-the-correct-encoding-of-http-get-request-strings
24  * RFC 3986: URI generic syntax
25  *
26  *
27  * Changelog
28  * 25.04.2012  Tim Ruehsen  created
29  *
30  */
31 
32 #include <config.h>
33 
34 #include <string.h>
35 #include <errno.h>
36 #include "c-ctype.h"
37 
38 #include <wget.h>
39 #include "private.h"
40 
41 /**
42  * \file
43  * \brief Functions to work with URIs and IRIs
44  * \defgroup libwget-iri URIs/IRIs
45  *
46  * @{
47  *
48  * URI/IRI parsing and manipulation functions.
49  *
50  * IRIs are processed according to [RFC 3987](https://datatracker.ietf.org/doc/rfc3987/).
51  * Functions that escape certain characters (such as wget_iri_escape()) work according to
52  * [RFC 3986](https://datatracker.ietf.org/doc/rfc3986/).
53  *
54  * The \ref wget_iri_st "wget_iri" structure represents an IRI. You generate one from a string with wget_iri_parse() or
55  * wget_iri_parse_base(). You can use wget_iri_clone() to generate another identical \ref wget_iri_st "wget_iri".
56  *
57  * You can access each of the fields of a \ref wget_iri_st "wget_iri" (such as `path`) independently, and you can use
58  * the getters here to escape each of those parts, or for convenience (e.g wget_iri_get_escaped_host(),
59  * wget_iri_get_escaped_resource(), etc.).
60  *
61  * URIs/IRIs are all internally treated in UTF-8. The parsing functions that generate a \ref wget_iri_st "wget_iri" structure
62  * (wget_iri_parse() and wget_iri_parse_base()) thus convert the input string to UTF-8 before anything else.
63  * These functions take an `encoding` parameter that tells which is the original encoding of that string.
64  *
65  * Conversely, the getters (for example, wget_iri_get_path()) can convert the output string from UTF-8
66  * to an encoding of choice. The desired encoding is also specified in the `encoding` parameter.
67  *
68  * The `encoding` parameter, in all functions that accept it, is a string with the name of a character set
69  * supported by GNU libiconv. You can find such a list elsewhere, but popular examples are "utf-8", "utf-16" or "iso-8859-1".
70  */
71 
// Name of the default page and its length in bytes (excluding the trailing NUL).
// default_page_length must always equal strlen(default_page).
static const char *default_page = "index.html";
static size_t default_page_length = 10;
76 
77 static struct iri_scheme {
78 	uint16_t port;
79 	const char name[6];
80 } schemes[] = {
81 	[WGET_IRI_SCHEME_HTTP]  = {  80, "http"  },
82 	[WGET_IRI_SCHEME_HTTPS] = { 443, "https" },
83 };
84 
85 /**
86  * \param[in] scheme Scheme to get name for
87  * \return Name of \p scheme (e.g. "http" or "https") or NULL is not supported
88  *
89  * Maps \p scheme to it's string representation.
90  */
wget_iri_scheme_get_name(wget_iri_scheme scheme)91 const char *wget_iri_scheme_get_name(wget_iri_scheme scheme)
92 {
93 	if ((unsigned) scheme < countof(schemes))
94 		return schemes[scheme].name;
95 
96 	return NULL;
97 }
98 
99 /**
100  * \param[in] iri An IRI
101  * \return 1 if the scheme is supported, 0 if not
102  *
103  * Tells whether the IRI's scheme is supported or not.
104  */
wget_iri_supported(const wget_iri * iri)105 bool wget_iri_supported(const wget_iri *iri)
106 {
107 	return (unsigned) iri->scheme < countof(schemes);
108 }
109 
110 
/* \cond _hide_internal_symbols */
// Character class bits stored in iri_ctype[]. The iri_is*() macros look up
// a character in that table; the cast to unsigned char keeps the index in
// range even where plain 'char' is signed.

// gen-delims, RFC 3986 sect. 2.2
#define IRI_CTYPE_GENDELIM (1<<0)
#define iri_isgendelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_GENDELIM)

// sub-delims, RFC 3986 sect. 2.2
#define IRI_CTYPE_SUBDELIM (1<<1)
#define iri_issubdelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_SUBDELIM)

// unreserved characters, RFC 3986 sect. 2.3
#define IRI_CTYPE_UNRESERVED (1<<2)
#define iri_isunreserved(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_UNRESERVED)

// Characters allowed in a scheme name: ALPHA / DIGIT / "+" / "-" / ".".
// Every use of the argument is parenthesized so that expression arguments
// are not torn apart by operator precedence. The argument is still
// evaluated more than once, so it must be free of side effects.
#define iri_isscheme(c) (c_isalnum(c) || (c) == '+' || (c) == '-' || (c) == '.')
/* \endcond */

// Lookup table mapping each byte value to its IRI_CTYPE_* class (0 = none).
static const unsigned char
	iri_ctype[256] = {
		// gen-delims
		[':'] = IRI_CTYPE_GENDELIM, ['/'] = IRI_CTYPE_GENDELIM,
		['?'] = IRI_CTYPE_GENDELIM, ['#'] = IRI_CTYPE_GENDELIM,
		['['] = IRI_CTYPE_GENDELIM, [']'] = IRI_CTYPE_GENDELIM,
		['@'] = IRI_CTYPE_GENDELIM,

		// sub-delims
		['!'] = IRI_CTYPE_SUBDELIM, ['$'] = IRI_CTYPE_SUBDELIM,
		['&'] = IRI_CTYPE_SUBDELIM, ['\''] = IRI_CTYPE_SUBDELIM,
		['('] = IRI_CTYPE_SUBDELIM, [')'] = IRI_CTYPE_SUBDELIM,
		['*'] = IRI_CTYPE_SUBDELIM, ['+'] = IRI_CTYPE_SUBDELIM,
		[','] = IRI_CTYPE_SUBDELIM, [';'] = IRI_CTYPE_SUBDELIM,
		['='] = IRI_CTYPE_SUBDELIM,

		// unreserved: digits
		['0'] = IRI_CTYPE_UNRESERVED, ['1'] = IRI_CTYPE_UNRESERVED,
		['2'] = IRI_CTYPE_UNRESERVED, ['3'] = IRI_CTYPE_UNRESERVED,
		['4'] = IRI_CTYPE_UNRESERVED, ['5'] = IRI_CTYPE_UNRESERVED,
		['6'] = IRI_CTYPE_UNRESERVED, ['7'] = IRI_CTYPE_UNRESERVED,
		['8'] = IRI_CTYPE_UNRESERVED, ['9'] = IRI_CTYPE_UNRESERVED,

		// unreserved: lowercase letters
		['a'] = IRI_CTYPE_UNRESERVED, ['b'] = IRI_CTYPE_UNRESERVED,
		['c'] = IRI_CTYPE_UNRESERVED, ['d'] = IRI_CTYPE_UNRESERVED,
		['e'] = IRI_CTYPE_UNRESERVED, ['f'] = IRI_CTYPE_UNRESERVED,
		['g'] = IRI_CTYPE_UNRESERVED, ['h'] = IRI_CTYPE_UNRESERVED,
		['i'] = IRI_CTYPE_UNRESERVED, ['j'] = IRI_CTYPE_UNRESERVED,
		['k'] = IRI_CTYPE_UNRESERVED, ['l'] = IRI_CTYPE_UNRESERVED,
		['m'] = IRI_CTYPE_UNRESERVED, ['n'] = IRI_CTYPE_UNRESERVED,
		['o'] = IRI_CTYPE_UNRESERVED, ['p'] = IRI_CTYPE_UNRESERVED,
		['q'] = IRI_CTYPE_UNRESERVED, ['r'] = IRI_CTYPE_UNRESERVED,
		['s'] = IRI_CTYPE_UNRESERVED, ['t'] = IRI_CTYPE_UNRESERVED,
		['u'] = IRI_CTYPE_UNRESERVED, ['v'] = IRI_CTYPE_UNRESERVED,
		['w'] = IRI_CTYPE_UNRESERVED, ['x'] = IRI_CTYPE_UNRESERVED,
		['y'] = IRI_CTYPE_UNRESERVED, ['z'] = IRI_CTYPE_UNRESERVED,

		// unreserved: uppercase letters
		['A'] = IRI_CTYPE_UNRESERVED, ['B'] = IRI_CTYPE_UNRESERVED,
		['C'] = IRI_CTYPE_UNRESERVED, ['D'] = IRI_CTYPE_UNRESERVED,
		['E'] = IRI_CTYPE_UNRESERVED, ['F'] = IRI_CTYPE_UNRESERVED,
		['G'] = IRI_CTYPE_UNRESERVED, ['H'] = IRI_CTYPE_UNRESERVED,
		['I'] = IRI_CTYPE_UNRESERVED, ['J'] = IRI_CTYPE_UNRESERVED,
		['K'] = IRI_CTYPE_UNRESERVED, ['L'] = IRI_CTYPE_UNRESERVED,
		['M'] = IRI_CTYPE_UNRESERVED, ['N'] = IRI_CTYPE_UNRESERVED,
		['O'] = IRI_CTYPE_UNRESERVED, ['P'] = IRI_CTYPE_UNRESERVED,
		['Q'] = IRI_CTYPE_UNRESERVED, ['R'] = IRI_CTYPE_UNRESERVED,
		['S'] = IRI_CTYPE_UNRESERVED, ['T'] = IRI_CTYPE_UNRESERVED,
		['U'] = IRI_CTYPE_UNRESERVED, ['V'] = IRI_CTYPE_UNRESERVED,
		['W'] = IRI_CTYPE_UNRESERVED, ['X'] = IRI_CTYPE_UNRESERVED,
		['Y'] = IRI_CTYPE_UNRESERVED, ['Z'] = IRI_CTYPE_UNRESERVED,

		// unreserved: punctuation
		['-'] = IRI_CTYPE_UNRESERVED, ['.'] = IRI_CTYPE_UNRESERVED,
		['_'] = IRI_CTYPE_UNRESERVED, ['~'] = IRI_CTYPE_UNRESERVED
	};
213 
214 /**
215  * \param[in] c A character
216  * \return 1 if \p c is a generic delimiter, 0 if not
217  *
218  * Tests whether \p c is a generic delimiter (gen-delim),
219  * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
220  */
wget_iri_isgendelim(char c)221 bool wget_iri_isgendelim(char c)
222 {
223 	// return strchr(":/?#[]@",c)!=NULL;
224 	return iri_isgendelim(c);
225 }
226 
227 /**
228  * \param[in] c A character
229  * \return 1 if \p c is a subcomponent delimiter, 0 if not
230  *
231  * Tests whether \p c is a subcomponent delimiter (sub-delim)
232  * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
233  */
wget_iri_issubdelim(char c)234 bool wget_iri_issubdelim(char c)
235 {
236 	// return strchr("!$&\'()*+,;=",c)!=NULL;
237 	return iri_issubdelim(c);
238 }
239 
240 /**
241  * \param[in] c A character
242  * \return 1 if \p c is a reserved character, 0 if not
243  *
244  * Tests whether \p c is a reserved character.
245  *
246  * According to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2),
247  * the set of reserved characters is formed
248  * by the generic delimiters (gen-delims, wget_iri_isgendelim()) and the
249  * subcomponent delimiters (sub-delims, wget_iri_is_subdelim()).
250  *
251  * This function is thus equivalent to:
252  *
253  *     return wget_iri_isgendelim(c) || wget_iri_issubdelim(c);
254  *
255  */
wget_iri_isreserved(char c)256 bool wget_iri_isreserved(char c)
257 {
258 	return wget_iri_isgendelim(c) || wget_iri_issubdelim(c);
259 }
260 
261 /**
262  * \param[in] c A character
263  * \return 1 if \p c is an unreserved character, 0 if not
264  *
265  * Tests whether \p c is an unreserved character.
266  */
wget_iri_isunreserved(char c)267 bool wget_iri_isunreserved(char c)
268 {
269 	return iri_isunreserved(c);
270 }
271 
// Convert a single hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f') to its value 0..15.
// The input is not validated: callers must check it (e.g. with c_isxdigit()) first.
static unsigned char WGET_GCC_CONST unhex(unsigned char c)
{
	if (c <= '9')
		return (unsigned char) (c - '0');

	if (c <= 'F')
		return (unsigned char) (c - 'A' + 10);

	return (unsigned char) (c - 'a' + 10);
}
276 
iri_unescape_inline(char * src,int ctype)277 static char *iri_unescape_inline(char *src, int ctype)
278 {
279 	char *ret = NULL;
280 	unsigned char *s = (unsigned char *)src; // just a helper to avoid casting a lot
281 	unsigned char *d = s;
282 
283 	while (*s) {
284 		if (*s == '%') {
285 			if (c_isxdigit(s[1]) && c_isxdigit(s[2])) {
286 				unsigned char c = (unsigned char) (unhex(s[1]) << 4) | unhex(s[2]);
287 				if (!ctype || (!(iri_ctype[(unsigned char)(c)] & ctype) && c != '%')) {
288 					*d++ = c;
289 					s += 3;
290 					ret = src;
291 					continue;
292 				}
293 			}
294 		} else if (*s == '&') {
295 			// entities are case sensitive (RFC1866, 3.2.3)
296 			if (!strncmp((char *) s + 1, "amp;", 4)) {
297 				*d++ = '&';
298 				s += 5;
299 				ret = src;
300 				continue;
301 			} else if (!strncmp((char *) s + 1, "gt;", 3)) {
302 				*d++ = '>';
303 				s += 4;
304 				ret = src;
305 				continue;
306 			} else if (!strncmp((char *) s + 1, "lt;", 3)) {
307 				*d++ = '<';
308 				s += 4;
309 				ret = src;
310 				continue;
311 			} else if (!strncmp((char *) s + 1, "quot;", 5)) {
312 				*d++ = '\"';
313 				s += 6;
314 				ret = src;
315 				continue;
316 			} else if (!strncmp((char *) s + 1, "apos;", 5)) {
317 				*d++ = '\'';
318 				s += 6;
319 				ret = src;
320 				continue;
321 			}
322 		} else if (*s == '#') {
323 			uint32_t value = 0;
324 
325 			if (s[1] == 'x') {
326 				unsigned char *p = s + 2;
327 				while (c_isxdigit(*p)) {
328 					value = (value << 4) | unhex(*p);
329 					p++;
330 				}
331 				if (*p == ';') {
332 					if (value > 0 && value < 128) {
333 						*d++ = (unsigned char) value;
334 						s = p + 1;
335 						continue;
336 					}
337 					// else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
338 					// this cannot be done inline since the URL's length may increase
339 				}
340 			} else {
341 				unsigned char *p = s + 1;
342 				while (c_isdigit(*p) && value <= 0x10FFFF) { // max. Unicode value
343 					value = value * 10 + (*p - '0');
344 					p++;
345 				}
346 				if (*p == ';') {
347 					if (value > 0 && value < 128) {
348 						*d++ = (unsigned char) value;
349 						s = p + 1;
350 						continue;
351 					}
352 					// else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
353 					// this cannot be done inline since the URL's length may increase
354 				}
355 			}
356 		} else if (*s == '\r' || *s == '\n') {
357 			// Ignore / remove CR and LF from URLs. See https://gitlab.com/gnuwget/wget2/-/issues/522
358 			s++;
359 			continue;
360 		}
361 
362 		*d++ = *s++;
363 	}
364 	*d = 0;
365 
366 	return ret;
367 }
368 
/**
 * \param[in] src A string
 * \return \p src if at least one `%XX` sequence or named entity was decoded, else normally NULL
 *
 * Unescape a string. All the percent-encoded characters (`%XX`) are converted
 * back to their original form. The HTML entities `&amp;`, `&gt;`, `&lt;`, `&quot;`
 * and `&apos;` as well as numeric character references for ASCII characters
 * are decoded too, and CR/LF characters are removed.
 *
 * **The transformation is done inline**, so `src` will be modified after this function returns.
 * If nothing of the above is found, the string is left untouched.
 */
char *wget_iri_unescape_inline(char *src)
{
	// no character class is exempt from unescaping
	const int keep_escaped = 0;

	return iri_unescape_inline(src, keep_escaped);
}
383 
384 /**
385  * \param[in] src A string
386  * \return A pointer to \p src, after the transformation is done
387  *
388  * Unescape a string except escaped generic delimiters (and escaped '%'.
389  * The percent-encoded characters (`%XX`) are converted back to their original form.
390  *
391  * This variant of unescaping is helpful before an URL is being parsed, so that
392  * the parser recognizes e.g. 'http%3A//' as relative URL (path) and not as a scheme.
393  *
394  * **The transformation is done inline**, so `src` will be modified after this function returns.
395  * If no characters were unescaped, the string is left untouched.
396  */
wget_iri_unescape_url_inline(char * src)397 char *wget_iri_unescape_url_inline(char *src)
398 {
399 	return iri_unescape_inline(src, IRI_CTYPE_GENDELIM);
400 }
401 
402 /**
403  * \param[in] iri An IRI
404  *
405  * Free the heap-allocated content of the provided IRI, but leave the rest
406  * of the fields.
407  *
408  * This function frees the following fields of \ref wget_iri_st "wget_iri":
409  *
410  *  - `host`
411  *  - `path`
412  *  - `query`
413  *  - `fragment`
414  *  - `connection_part`
415  */
wget_iri_free_content(wget_iri * iri)416 void wget_iri_free_content(wget_iri *iri)
417 {
418 	if (iri) {
419 		if (iri->uri_allocated)
420 			xfree(iri->uri);
421 		if (iri->host_allocated)
422 			xfree(iri->host);
423 		if (iri->path_allocated)
424 			xfree(iri->path);
425 		if (iri->query_allocated)
426 			xfree(iri->query);
427 		if (iri->fragment_allocated)
428 			xfree(iri->fragment);
429 		xfree(iri->connection_part);
430 	}
431 }
432 
433 /**
434  * \param[in] iri A pointer to a pointer to an IRI (a \ref wget_iri_st "wget_iri")
435  *
436  * Destroy a \ref wget_iri_st "wget_iri" structure.
437  *
438  * The provided pointer is set to NULL.
439  */
wget_iri_free(wget_iri ** iri)440 void wget_iri_free(wget_iri **iri)
441 {
442 	if (iri && *iri) {
443 		wget_iri_free_content(*iri);
444 		xfree(*iri);
445 	}
446 }
447 
448 // URIs are assumed to be unescaped at this point
449 
450 /**
451  * \param[in] url A URL/IRI
452  * \param[in] encoding Original encoding of \p url
453  * \return A libwget IRI (`wget_iri`)
454  *
455  * The host, path, query and fragment parts will be converted to UTF-8 from
456  * the encoding given in the parameter \p encoding. GNU libiconv is used
457  * to perform the conversion, so this value should be the name of a valid character set
458  * supported by that library, such as "utf-8" or "iso-8859-1".
459  */
wget_iri_parse(const char * url,const char * encoding)460 wget_iri *wget_iri_parse(const char *url, const char *encoding)
461 {
462 	wget_iri *iri;
463 	char *p, *s, *authority, c;
464 	size_t slen, extra;
465 	int have_scheme;
466 
467 	if (!url)
468 		return NULL;
469 
470 	/*
471 		URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
472 		hier-part   = "//" authority path-abempty / path-absolute / path-rootless / path-empty
473 		scheme      =  ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
474 	 */
475 	while (c_isspace(*url)) url++;
476 	if (!*url) return NULL;
477 /*
478 	// first unescape, than convert to UTF-8
479 	if (strchr(url, '%')) {
480 		char *unesc_url = wget_strdup(url);
481 
482 		wget_percent_unescape(unesc_url);
483 
484 		if (wget_str_needs_encoding(unesc_url)) {
485 			if ((url = wget_str_to_utf8(unesc_url, encoding)))
486 				xfree(unesc_url);
487 			else
488 				url = unesc_url; // on error, use what we have
489 		} else
490 			url = unesc_url;
491 
492 		url_allocated = 1;
493 	} else {
494 		url_allocated = 0;
495 
496 		if (wget_str_needs_encoding(url)) {
497 			if ((s = wget_str_to_utf8(url, encoding))) {
498 				url = s;
499 				url_allocated = 1;
500 			}
501 		}
502 	}
503 */
504 
505 	if (c_isalpha(*url)) {
506 		const char *x;
507 		have_scheme = 1;
508 
509 		for (x = url; *x && iri_isscheme(*x); x++)
510 			;
511 
512 		if (*x != ':' || c_isdigit(x[1]))
513 			have_scheme = 0; // not a scheme
514 	} else
515 		have_scheme = 0;
516 
517 	// just use one block of memory for all parsed URI parts
518 	slen = strlen(url);
519 	extra = have_scheme ? 0 : sizeof("http://") - 1; // extra space for http://
520 
521 	iri = wget_malloc(sizeof(wget_iri) + (slen + extra + 1) * 2);
522 	if (!iri)
523 		return NULL;
524 
525 	memset(iri, 0, sizeof(wget_iri));
526 
527 	if (have_scheme) {
528 		iri->msize = slen + 1;
529 		iri->uri = memcpy(((char *)iri) + sizeof(wget_iri), url, iri->msize);
530 		p = s = memcpy((char *)iri->uri + iri->msize, url, iri->msize);
531 		s = strchr(s, ':'); // we know there is a :
532 		*s++ = 0;
533 
534 		// p points to scheme
535 		wget_iri_unescape_inline(p); // percent unescape
536 		wget_strtolower(p); // convert to lowercase
537 
538 		bool found = false; // assume the scheme is unsupported
539 
540 		// find the scheme in our static list of supported schemes
541 		// for later comparisons we compare pointers (avoiding strcasecmp())
542 		for (unsigned it = 0; it < countof(schemes); it++) {
543 			if (!strcmp(schemes[it].name, p)) {
544 				iri->scheme = it;
545 				iri->port = schemes[it].port;
546 				found = true;
547 				break;
548 			}
549 		}
550 
551 		if (!found) {
552 			debug_printf("Unsupported scheme in '%s'\n", url);
553 			wget_iri_free(&iri);
554 			return NULL;
555 		}
556 	} else {
557 		// add http:// scheme to url
558 		iri->uri = memcpy(((char *)iri) + sizeof(wget_iri), "http://", extra);
559 		memcpy(((char *)iri) + sizeof(wget_iri) + extra, url, slen + 1);
560 		iri->msize = slen + 1 + extra;
561 		s = memcpy((char *)iri->uri + iri->msize, "http://", extra);
562 		memcpy((char *)iri->uri + iri->msize + extra, url, slen + 1);
563 		s[extra - 3] = 0;
564 		s += extra;
565 
566 		iri->scheme = WGET_IRI_SCHEME_HTTP;
567 		iri->port = schemes[WGET_IRI_SCHEME_HTTP].port;
568 	}
569 
570 //	if (url_allocated)
571 //		xfree(url);
572 
573 	// this is true for http, https, ftp, file (accept any number of /, like most browsers)
574 	while (*s == '/')
575 		s++;
576 
577 	// authority
578 	authority = s;
579 	while (*s && *s != '/' && *s != '?' && *s != '#')
580 		s++;
581 	c = *s;
582 	if (c) *s++ = 0;
583 	wget_iri_unescape_inline(authority);
584 
585 	// left over: [path][?query][#fragment]
586 	if (c == '/') {
587 		iri->path = s;
588 		while (*s && *s != '?' && *s != '#')
589 			s++;
590 		c = *s;
591 		if (c) *s++ = 0;
592 		wget_iri_unescape_inline((char *)iri->path);
593 	}
594 
595 	if (c == '?') {
596 		iri->query = s;
597 		while (*s && *s != '#') {
598 			if (*s == '+')
599 				*s = ' ';
600 			s++;
601 		}
602 		c = *s;
603 		if (c) *s++ = 0;
604 		/* do not unescape query else we get ambiguity for chars like &, =, +, ... */
605 	}
606 
607 	if (c == '#') {
608 		iri->fragment = s;
609 		s += strlen(s);
610 		wget_iri_unescape_inline((char *)iri->fragment);
611 	}
612 
613 	if (*s) {
614 		debug_printf("unparsed rest '%s'\n", s);
615 	}
616 
617 	if (*authority) {
618 		s = authority;
619 		p = strchr(authority, '@');
620 		if (p) {
621 			iri->userinfo = s;
622 			*p = 0;
623 			if ((s = strchr(s, ':'))) {
624 				*s = 0;
625 				iri->password = s + 1;
626 			}
627 			s = p + 1;
628 		}
629 		if (*s == '[') {
630 			p = strrchr(s, ']');
631 			if (p) {
632 				iri->host = s + 1;
633 				*p = 0;
634 				s = p + 1;
635 			} else {
636 				// something is broken
637 				iri->host = s + 1;
638 				s += strlen(s);
639 			}
640 		} else {
641 			iri->host = s;
642 			while (*s && *s != ':')
643 				s++;
644 		}
645 		if (*s == ':') {
646 			if (c_isdigit(s[1])) {
647 				int port = atoi(s + 1);
648 				if (port > 0 && port < 65536) {
649 					iri->port = (uint16_t) port;
650 					iri->port_given = true;
651 				}
652 			}
653 		}
654 		*s = 0;
655 	}
656 
657 	// now unescape all components (not interested in display, userinfo, password right now)
658 
659 	if (iri->host) {
660 		wget_strtolower((char *)iri->host);
661 		if (wget_str_needs_encoding(iri->host)) {
662 			if ((s = wget_str_to_utf8(iri->host, encoding))) {
663 				iri->host = s;
664 				iri->host_allocated = true;
665 			}
666 		}
667 		if ((p = (char *)wget_str_to_ascii(iri->host)) != iri->host) {
668 			if (iri->host_allocated)
669 				xfree(iri->host);
670 			iri->host = p;
671 			iri->host_allocated = true;
672 		}
673 
674 		// Finally, if the host is a literal IPv4 or IPv6 address, mark it as so
675 		if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV4) || wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6))
676 			iri->is_ip_address = true;
677 	}
678 
679 	if (!iri->host) {
680 		error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
681 		wget_iri_free(&iri);
682 		return NULL;
683 	}
684 
685 	if (iri->path && wget_str_needs_encoding(iri->path)) {
686 		if ((s = wget_str_to_utf8(iri->path, encoding))) {
687 			iri->path = s;
688 			iri->path_allocated = true;
689 		}
690 	}
691 
692 	if (iri->query && wget_str_needs_encoding(iri->query)) {
693 		if ((s = wget_str_to_utf8(iri->query, encoding))) {
694 			iri->query = s;
695 			iri->query_allocated = true;
696 		}
697 	}
698 
699 	if (iri->fragment && wget_str_needs_encoding(iri->fragment)) {
700 		if ((s = wget_str_to_utf8(iri->fragment, encoding))) {
701 			iri->fragment = s;
702 			iri->fragment_allocated = true;
703 		}
704 	}
705 
706 /*
707 	debug_printf("scheme=%s\n",iri->scheme);
708 	debug_printf("host=%s\n",iri->host);
709 	debug_printf("path=%s\n",iri->path);
710 	debug_printf("query=%s\n",iri->query);
711 	debug_printf("fragment=%s\n",iri->fragment);
712 */
713 
714 	return iri;
715 }
716 
717 /**
718  * \param[in] iri An IRI
719  * \return A new IRI, with the exact same contents as the provided one.
720  *
721  * Clone the provided IRI.
722  */
wget_iri_clone(const wget_iri * iri)723 wget_iri *wget_iri_clone(const wget_iri *iri)
724 {
725 	if (!iri || !iri->uri)
726 		return NULL;
727 
728 	size_t slen = strlen(iri->uri);
729 	wget_iri *clone = wget_malloc(sizeof(wget_iri) + (slen + 1) + iri->msize);
730 
731 	if (!clone)
732 		return NULL;
733 
734 	memcpy(clone, iri, sizeof(wget_iri));
735 	clone->uri = memcpy(((char *)clone) + sizeof(wget_iri), iri->uri, slen + 1);
736 	memcpy((char *)clone->uri + slen + 1, (char *)iri->uri + slen + 1, iri->msize);
737 	clone->uri_allocated = 0;
738 
739 	clone->connection_part = wget_strdup(iri->connection_part);
740 
741 	// adjust pointers
742 	if (iri->host_allocated)
743 		clone->host = wget_strdup(iri->host);
744 	else
745 		clone->host = iri->host ? (char *)clone + (size_t) (iri->host - (const char *)iri) : NULL;
746 
747 	clone->display = iri->display ? (char *)clone + (size_t) (iri->display - (const char *)iri): NULL;
748 	// not adjust scheme, it is a pointer to a static string
749 	clone->userinfo = iri->userinfo ? (char *)clone + (size_t) (iri->userinfo - (const char *)iri): NULL;
750 	clone->password = iri->password ? (char *)clone + (size_t) (iri->password - (const char *)iri): NULL;
751 
752 	if (iri->path_allocated)
753 		clone->path = wget_strdup(iri->path);
754 	else
755 		clone->path = iri->path ? (char *)clone + (size_t) (iri->path - (const char *)iri): NULL;
756 
757 	if (iri->query_allocated)
758 		clone->query = wget_strdup(iri->query);
759 	else
760 		clone->query = iri->query ? (char *)clone + (size_t) (iri->query - (const char *)iri): NULL;
761 
762 	if (iri->fragment_allocated)
763 		clone->fragment = wget_strdup(iri->fragment);
764 	else
765 		clone->fragment = iri->fragment ? (char *)clone + (size_t) (iri->fragment - (const char *)iri): NULL;
766 
767 	return clone;
768 }
769 
770 /**
771  * \param[in] iri An IRI
772  * \param[in] buf A buffer, where the resulting string will be put
773  * \return The contents of the buffer \p buf
774  *
775  * Append the connection part of the IRI \p iri to \p buf.
776  *
777  * The connection part is formed by the scheme, the hostname, and optionally the port. For example:
778  *
779  *     https://localhost:8080
780  *     https://www.example.com
781  *
782  * It may be of the form `https://example.com:8080` if the port was provided when creating the IRI
783  * or of the form `https://example.com` otherwise.
784  */
wget_iri_get_connection_part(const wget_iri * iri,wget_buffer * buf)785 const char *wget_iri_get_connection_part(const wget_iri *iri, wget_buffer *buf)
786 {
787 	if (iri) {
788 		if (iri->port_given) {
789 			wget_buffer_printf_append(buf, "%s://%s:%hu", schemes[iri->scheme].name, iri->host, iri->port);
790 		} else {
791 			wget_buffer_printf_append(buf, "%s://%s", schemes[iri->scheme].name, iri->host);
792 		}
793 	}
794 
795 	return buf->data;
796 }
797 
// Normalize a path in place: resolve /../, remove /./, collapse double slashes
// and strip leading slashes as well as leading ./ and ../ segments.
// Normalization stops at the first '?' or '#', the rest is kept as is.
// Returns the length of the resulting string.

static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path)
{
	char *out = path;      // write position
	const char *in = path; // read position, never behind 'out'

	debug_printf("path %s ->\n", path);

	// skip ./ and ../ at the beginning of the path
	for (;;) {
		if (in[0] == '/') {
			in++;
			continue;
		}

		if (in[0] != '.')
			break;

		if (in[1] == '/') {
			in += 2; // "./"
		} else if (in[1] == '\0') {
			in++; // "." at the end
		} else if (in[1] == '.' && in[2] == '/') {
			in += 3; // "../"
		} else if (in[1] == '.' && in[2] == '\0') {
			in += 2; // ".." at the end
		} else
			break;
	}

	// normalize path but stop at query or fragment
	while (*in && *in != '?' && *in != '#') {
		if (*in != '/') {
			*out++ = *in++;
			continue;
		}

		if (in[1] == '.') {
			if (!strncmp(in, "/../", 4)) {
				// go one level up
				in += 3;
				while (out > path && *--out != '/')
					;
			} else if (!strcmp(in, "/..")) {
				// go one level up, but keep the trailing slash
				in += 3;
				while (out > path && *--out != '/')
					;
				if (out > path)
					*out++ = '/';
			} else if (!strncmp(in, "/./", 3)) {
				in += 2;
			} else if (!strcmp(in, "/.")) {
				in += 2;
				if (out > path)
					*out++ = '/';
			} else {
				// just a name that starts with a dot
				*out++ = *in++;
			}
		} else if (out == path) {
			in++; // avoid leading slash
		} else if (in[1] == '/') {
			in++; // double slash to single slash
		} else {
			*out++ = *in++;
		}
	}

	if (out == in) {
		// nothing was removed so far, the rest is already in place
		out += strlen(out);
	} else {
		// move the rest (query and/or fragment) down
		while (*in)
			*out++ = *in++;

		*out = 0;
	}

	debug_printf("     %s\n", path);

	return (size_t) (out - path);
}
871 
872 // create an absolute URI from a base + relative URI
873 
874 //char *iri_relative_to_absolute(IRI *iri, const char *tag, const char *val, size_t len, char *dst, size_t dst_size)
875 /**
876  * \param[in] base A base IRI
877  * \param[in] val A path, or another URI
878  * \param[in] len Length of the string \p val or -1
879  * \param[in] buf Destination buffer, where the result will be copied.
880  * \return A new URI (string) which is based on the base IRI \p base provided, or NULL in case of error.
881  *
882  * Calculates a new URI which is based on the provided IRI \p base.
883  *
884  * Taking the IRI \p base as a starting point, a new URI is created with the path \p val, which may be
885  * a relative or absolute path, or even a whole URI. The result is returned as a string, and if the buffer
886  * \p buf is provided, it is also placed there.
887  *
888  * If \p val is an absolute path (it begins with a `/`), it is normalized first. Then the provided IRI's
889  * path is replaced by that new path. If it's a relative path, the file name of the \p base IRI's path
890  * is replaced by that path. Finally, if \p val begins with a scheme (such as `https://`) that string is returned
891  * untouched, and placed in the buffer if provided.
892  *
893  * If \p base is NULL, then \p val must itself be an absolute URI. Likewise, if \p buf is NULL,
894  * then \p val must also be an absolute URI.
895  *
896  * if \p len is `-1`, the length of \p val will be the result from `strlen(val)`.
897  */
wget_iri_relative_to_abs(const wget_iri * base,const char * val,size_t len,wget_buffer * buf)898 const char *wget_iri_relative_to_abs(const wget_iri *base, const char *val, size_t len, wget_buffer *buf)
899 {
900 	debug_printf("*url = %.*s\n", (int)len, val);
901 
902 	if (len == (size_t) -1)
903 		len = strlen(val);
904 
905 	if (*val == '/') {
906 		if (base) {
907 			char path[len + 1];
908 
909 			// strlcpy or snprintf are ineffective here since they do strlen(val), which might be large
910 			wget_strscpy(path, val, len + 1);
911 
912 			if (len >= 2 && val[1] == '/') {
913 				char *p;
914 
915 				// absolute URI without scheme: //authority/path...
916 				if ((p = strchr(path + 2, '/')))
917 					normalize_path(p + 1);
918 
919 				wget_buffer_strcpy(buf, schemes[base->scheme].name);
920 				wget_buffer_strcat(buf, ":");
921 				wget_buffer_strcat(buf, path);
922 				debug_printf("*1 %s\n", buf->data);
923 			} else {
924 				// absolute path
925 				normalize_path(path);
926 
927 				wget_buffer_reset(buf);
928 				wget_iri_get_connection_part(base, buf);
929 				wget_buffer_strcat(buf, "/");
930 				wget_buffer_strcat(buf, path);
931 				debug_printf("*2 %s\n", buf->data);
932 			}
933 		} else {
934 			return NULL;
935 		}
936 	} else {
937 		// see if URI begins with a scheme:
938 		if (memchr(val, ':', len)) {
939 			// absolute URI
940 			if (buf) {
941 				wget_buffer_memcpy(buf, val, len);
942 				debug_printf("*3 %s\n", buf->data);
943 			} else {
944 				debug_printf("*3 %s\n", val);
945 				return val;
946 			}
947 		} else if (base) {
948 			// relative path
949 			const char *lastsep = base->path ? strrchr(base->path, '/') : NULL;
950 			wget_buffer_reset(buf);
951 			wget_iri_get_connection_part(base, buf);
952 			wget_buffer_strcat(buf, "/");
953 
954 			size_t tmp_len = buf->length;
955 
956 			if (lastsep)
957 				wget_buffer_memcat(buf, base->path, lastsep - base->path + 1);
958 
959 			if (len)
960 				wget_buffer_memcat(buf, val, len);
961 
962 			buf->length = normalize_path(buf->data + tmp_len) + tmp_len;
963 
964 			debug_printf("*4 %s %zu\n", buf->data, buf->length);
965 		} else if (val[len] == 0) {
966 			return val;
967 		} else {
968 			return NULL;
969 		}
970 	}
971 
972 	return likely(buf) ? buf->data : NULL;
973 }
974 
975 /**
976  * \param[in] base The base IRI
977  * \param[in] url A relative/absolute path (or a URI) to be appended to \p base
978  * \param[in] encoding The encoding of \p url (e.g. "utf-8" or "iso-8859-1")
979  * \return A new IRI
980  *
981  * Generate a new IRI by using the provided IRI \p base as a base and the path \p url.
982  *
983  * This is equivalent to:
984  *
985  *     wget_iri *iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, strlen(url), NULL), encoding);
986  *     return iri;
987  *
988  * As such, \p url can be a relative or absolute path, or another URI.
989  *
990  * If \p base is NULL, then the parameter \p url must itself be an absolute URI.
991  */
wget_iri_parse_base(const wget_iri * base,const char * url,const char * encoding)992 wget_iri *wget_iri_parse_base(const wget_iri *base, const char *url, const char *encoding)
993 {
994 	wget_iri *iri;
995 
996 	if (base) {
997 		wget_buffer buf;
998 		char sbuf[256];
999 
1000 		wget_buffer_init(&buf, sbuf, sizeof(sbuf));
1001 		iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, (size_t) -1, &buf), encoding);
1002 		wget_buffer_deinit(&buf);
1003 	} else {
1004 		// no base: just check URL for being an absolute URI
1005 		iri = wget_iri_parse(wget_iri_relative_to_abs(NULL, url, (size_t) -1, NULL), encoding);
1006 	}
1007 
1008 	return iri;
1009 }
1010 
1011 // RFC conform comparison as described in https://tools.ietf.org/html/rfc2616#section-3.2.3
1012 /**
1013  * \param[in] iri1 An IRI
1014  * \param[in] iri2 Another IRI
1015  * \return 0 if both IRIs are equal according to RFC 2616 or a non-zero value otherwise
1016  *
1017  * Compare two IRIs.
1018  *
1019  * Comparison is performed according to [RFC 2616, sect. 3.2.3](https://tools.ietf.org/html/rfc2616#section-3.2.3).
1020  *
1021  * This function uses wget_strcasecmp() to compare the various parts of the IRIs so a non-zero negative return value
1022  * indicates that \p iri1 is less than \p iri2, whereas a positive value indicates \p iri1 is greater than \p iri2.
1023  */
wget_iri_compare(wget_iri * iri1,wget_iri * iri2)1024 int wget_iri_compare(wget_iri *iri1, wget_iri *iri2)
1025 {
1026 	int n;
1027 
1028 	if (!iri1) {
1029 		if (!iri2)
1030 			return 0;
1031 		else
1032 			return -1;
1033 	} else if (!iri2)
1034 		return 1;
1035 
1036 //	info_printf("iri %p %p %s:%s %s:%s\n",iri1,iri2,iri1->scheme,iri1->port,iri2->scheme,iri2->port);
1037 
1038 /*
1039 	if (!iri1->path) {
1040 //		if (iri2->path && strcmp(iri2->path, "/"))
1041 		if (iri2->path)
1042 			return -1;
1043 	}
1044 	else if (!iri2->path) {
1045 //		if (iri1->path && strcmp(iri1->path, "/"))
1046 		if (iri1->path)
1047 			return 1;
1048 	}
1049 */
1050 	if ((n = wget_strcasecmp(iri1->path, iri2->path)))
1051 		return n;
1052 
1053 	if ((n = wget_strcasecmp(iri1->query, iri2->query)))
1054 		return n;
1055 
1056 	if (iri1->scheme != iri2->scheme)
1057 		return iri1->scheme < iri2->scheme ? -1 : 1;
1058 
1059 	if ((n = iri1->port - iri2->port))
1060 		return n;
1061 
1062 	// host is already lowercase, no need to call strcasecmp()
1063 	if ((n = strcmp(iri1->host, iri2->host)))
1064 		return n;
1065 
1066 	// if ((n = wget_strcasecmp(iri1->fragment, iri2->fragment)))
1067 	//		return n;
1068 
1069 	return 0;
1070 }
1071 
1072 /**
1073  * \param[in] src A string, whose reserved characters are to be percent-encoded
1074  * \param[in] buf A buffer where the result will be copied.
1075  * \return The contents of the buffer \p buf after \p src has been encoded.
1076  *
1077  * Escapes (using percent-encoding) all the reserved characters in the string \p src.
1078  *
1079  * If \p src is NULL, the contents of the buffer \p buf are returned. \p buf cannot be NULL.
1080  */
wget_iri_escape(const char * src,wget_buffer * buf)1081 const char *wget_iri_escape(const char *src, wget_buffer *buf)
1082 {
1083 	const char *begin;
1084 
1085 	if (!src)
1086 		return buf->data;
1087 
1088 	for (begin = src; *src; src++) {
1089 		if (!iri_isunreserved(*src)) {
1090 			if (begin != src)
1091 				wget_buffer_memcat(buf, begin, src - begin);
1092 			begin = src + 1;
1093 			wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1094 		}
1095 	}
1096 
1097 	if (begin != src)
1098 		wget_buffer_memcat(buf, begin, src - begin);
1099 
1100 	return buf->data;
1101 }
1102 
1103 /**
1104  * \param[in] src A string, whose reserved characters are to be percent-encoded
1105  * \param[in] buf A buffer where the result will be copied.
1106  * \return The contents of the buffer \p buf after \p src has been encoded
1107  * as described in https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1.
1108  *
1109  * Escapes the path part of the URI suitable for GET/POST requests (origin-form).
1110  *   origin-form    = absolute-path [ "?" query ]
1111  *   path-absolute = "/" [ segment-nz *( "/" segment ) ]
1112  *   segment-nz    = 1*pchar
1113  *   segment       = *pchar
1114  *   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1115  */
wget_iri_escape_path(const char * src,wget_buffer * buf)1116 const char *wget_iri_escape_path(const char *src, wget_buffer *buf)
1117 {
1118 	const char *begin;
1119 
1120 	for (begin = src; *src; src++) {
1121 		if (!(iri_isunreserved(*src) || iri_issubdelim(*src) || *src == '/' || *src == ':' || *src == '@')) {
1122 			if (begin != src)
1123 				wget_buffer_memcat(buf, begin, src - begin);
1124 			begin = src + 1;
1125 			wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1126 		}
1127 	}
1128 
1129 	if (begin != src)
1130 		wget_buffer_memcat(buf, begin, src - begin);
1131 
1132 	return buf->data;
1133 }
1134 
1135 /**
1136  * \param[in] src A string, whose reserved characters are to be percent-encoded
1137  * \param[in] buf A buffer where the result will be copied.
1138  * \return The contents of the buffer \p buf after \p src has been encoded.
1139  *
1140  * Escapes (using percent-encoding) all the reserved characters in the string \p src
1141  * (just like wget_iri_escape()), but **excluding the equal sign `=` and the ampersand `&`**.
1142  * This function is thus ideally suited for query parts of URIs.
1143  */
wget_iri_escape_query(const char * src,wget_buffer * buf)1144 const char *wget_iri_escape_query(const char *src, wget_buffer *buf)
1145 {
1146 	const char *begin;
1147 
1148 	for (begin = src; *src; src++) {
1149 		if (!iri_isunreserved(*src) && *src != '=' && *src != '&') {
1150 			if (begin != src)
1151 				wget_buffer_memcat(buf, begin, src - begin);
1152 			begin = src + 1;
1153 			if (*src == ' ')
1154 				wget_buffer_memcat(buf, "+", 1);
1155 			else
1156 				wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1157 		}
1158 	}
1159 
1160 	if (begin != src)
1161 		wget_buffer_memcat(buf, begin, src - begin);
1162 
1163 	return buf->data;
1164 }
1165 
1166 /**
1167  * \param[in] iri An IRI
1168  * \param[in] buf A buffer, where the resulting string will be put
1169  * \return The contents of the buffer \p buf
1170  *
1171  * Return the host part of the provided IRI. It is placed in the buffer \p buf
1172  * and also returned as a `const char *`.
1173  *
1174  * The host is escaped using wget_iri_escape().
1175  */
wget_iri_get_escaped_host(const wget_iri * iri,wget_buffer * buf)1176 const char *wget_iri_get_escaped_host(const wget_iri *iri, wget_buffer *buf)
1177 {
1178 	return wget_iri_escape(iri->host, buf);
1179 }
1180 
1181 /**
1182  * \param[in] iri An IRI
1183  * \param[in] buf A buffer, where the resulting string will be put
1184  * \return The contents of the buffer \p buf
1185  *
1186  * Return the resource string, suitable for use in HTTP requests.
1187  * Details:
1188  *   https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1
1189  *   https://datatracker.ietf.org/doc/html/rfc7230#section-2.7
1190  *   https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
1191  *
1192  * The resource string is comprised of the path, plus the query part, if present. Example:
1193  *
1194  *     /foo/bar/?param_1=one&param_2=two
1195  *
1196  * Both the path and the query are escaped using wget_iri_escape_path() and
1197  * wget_iri_escape_query(), respectively.
1198  *
1199  * The resulting string is placed in the buffer \p buf and also returned as a `const char *`.
1200  */
wget_iri_get_escaped_resource(const wget_iri * iri,wget_buffer * buf)1201 const char *wget_iri_get_escaped_resource(const wget_iri *iri, wget_buffer *buf)
1202 {
1203 	if (iri->path)
1204 		wget_iri_escape_path(iri->path, buf);
1205 
1206 	// Do not actually escape the query field. This part of the URL *MAY*
1207 	// contain reserved characters which should be passed on as-is and without
1208 	// escaping them. This is according to the rules laid out in RFC 2616 and
1209 	// RFC 7230. But we have to replace spaces in any case.
1210 	if (iri->query) {
1211 		wget_buffer_memcat(buf, "?", 1);
1212 		for (const char *p = iri->query; *p; p++)
1213 			if (*p == ' ')
1214 				wget_buffer_memcat(buf, "%20", 3);
1215 			else
1216 				wget_buffer_memcat(buf, p, 1);
1217 	}
1218 
1219 	return buf->data;
1220 }
1221 
1222 /**
1223  * \param[in] iri An IRI
1224  * \param[in] buf A buffer, where the resulting string will be put
1225  * \param[in] encoding Character set the string should be converted to
1226  * \return The contents of the buffer \p buf
1227  *
1228  * Get the path part of the provided IRI.
1229  *
1230  * The path is appended to \p buf. If \p buf is non-empty and does not end with
1231  * a path separator (`/`), then one is added before the path is appended to \p
1232  * buf.
1233  *
1234  * If \p encoding is provided, this function will try to convert the path (which is originally
1235  * in UTF-8) to that encoding.
1236  */
1237 
wget_iri_get_path(const wget_iri * iri,wget_buffer * buf,const char * encoding)1238 char *wget_iri_get_path(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1239 {
1240 	if (buf->length != 0 && buf->data[buf->length - 1] != '/')
1241 		wget_buffer_memcat(buf, "/", 1);
1242 
1243 	if (iri->path) {
1244 		if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1245 			char *fname;
1246 
1247 			if ((fname = wget_utf8_to_str(iri->path, encoding))) {
1248 				wget_buffer_strcat(buf, fname);
1249 				xfree(fname);
1250 			} else {
1251 				// conversion failed, keep original string
1252 				wget_buffer_strcat(buf, iri->path);
1253 			}
1254 		} else {
1255 			wget_buffer_strcat(buf, iri->path);
1256 		}
1257 	}
1258 
1259 	if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1260 		wget_buffer_memcat(buf, default_page, default_page_length);
1261 
1262 	return buf->data;
1263 }
1264 
1265 /**
1266  * \param[in] iri An IRI
1267  * \param[in] buf A buffer, where the resulting string will be put
1268  * \param[in] encoding Character set the string should be converted to
1269  * \return The contents of the buffer \p buf
1270  *
1271  * Take the query part, and escape the path separators (`/`), so that it can be used as part
1272  * of a filename.
1273  *
1274  * The resulting string will be placed in the buffer \p buf and also returned as a `const char *`.
1275  * If the provided IRI has no query part, then the original contents of \p buf are returned and \p buf
1276  * is kept untouched.
1277  *
1278  * If \p encoding is provided, this function will try to convert the query (which is originally
1279  * in UTF-8) to that encoding.
1280  */
wget_iri_get_query_as_filename(const wget_iri * iri,wget_buffer * buf,const char * encoding)1281 char *wget_iri_get_query_as_filename(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1282 {
1283 	if (iri->query) {
1284 		const char *query;
1285 		int allocated = 0;
1286 
1287 		wget_buffer_memcat(buf, "?", 1);
1288 
1289 		if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1290 			if ((query = wget_utf8_to_str(iri->query, encoding)))
1291 				allocated = 1;
1292 			else
1293 				query = iri->query;
1294 		} else {
1295 			query = iri->query;
1296 		}
1297 
1298 		int slashes = 0;
1299 		const char *src = query;
1300 
1301 		// count slashes in query string
1302 		while ((src = strchr(src, '/'))) {
1303 			slashes++;
1304 			src++;
1305 		}
1306 
1307 		if (slashes) {
1308 			// escape slashes to use query as part of a filename
1309 			const char *begin;
1310 
1311 			for (src = begin = query; *src; src++) {
1312 				if (*src == '/') {
1313 					if (begin != src)
1314 						wget_buffer_memcat(buf, begin, src - begin);
1315 					begin = src + 1;
1316 					wget_buffer_memcat(buf, "%2F", 3);
1317 				}
1318 			}
1319 
1320 			if (begin != src)
1321 				wget_buffer_memcat(buf, begin, src - begin);
1322 		} else {
1323 			wget_buffer_strcat(buf, query);
1324 		}
1325 
1326 		if (allocated)
1327 			xfree(query);
1328 	}
1329 
1330 	return buf->data;
1331 }
1332 
1333 /**
1334  * \param[in] iri An IRI
1335  * \param[in] buf A buffer, where the resulting string will be put
1336  * \param[in] encoding Character set the string should be converted to
1337  * \return The contents of the buffer \p buf
1338  *
1339  * Get the filename of the path of the provided IRI.
1340  *
1341  * This is similar to wget_iri_get_path(), but instead of returning the whole path
1342  * it only returns the substring after the last occurrence of `/`. In other words, the
1343  * filename of the path.
1344  *
1345  * This is also known as the "basename" in the UNIX world, and the output of this function
1346  * would be equivalent to the output of the `basename(1)` tool.
1347  *
1348  * The path is copied into \p buf if it's empty. If the buffer \p buf is not empty,
1349  * it is appended to it after a path separator (`/`).
1350  *
1351  * If \p encoding is provided, this function will try to convert the path (which is originally
1352  * in UTF-8) to that encoding.
1353  */
wget_iri_get_basename(const wget_iri * iri,wget_buffer * buf,const char * encoding,int flags)1354 char *wget_iri_get_basename(const wget_iri *iri, wget_buffer *buf, const char *encoding, int flags)
1355 {
1356 	if (iri->path) {
1357 		char *fname;
1358 
1359 		if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1360 			char *p;
1361 
1362 			if ((p = strrchr(iri->path, '/'))) {
1363 				if (!(fname = wget_utf8_to_str(p + 1, encoding)))
1364 					wget_buffer_strcat(buf, p + 1); // conversion failed, keep original string
1365 			} else {
1366 				if (!(fname = wget_utf8_to_str(iri->path, encoding)))
1367 					wget_buffer_strcat(buf, iri->path); // conversion failed, keep original string
1368 			}
1369 
1370 			if (fname) {
1371 				// conversion succeeded
1372 				wget_buffer_strcat(buf, fname);
1373 				xfree(fname);
1374 			}
1375 		} else {
1376 			if ((fname = strrchr(iri->path, '/')))
1377 				wget_buffer_strcat(buf, fname + 1);
1378 			else
1379 				wget_buffer_strcat(buf, iri->path);
1380 		}
1381 	}
1382 
1383 	if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1384 		wget_buffer_memcat(buf, default_page, default_page_length);
1385 
1386 	if (flags & WGET_IRI_WITH_QUERY)
1387 		return wget_iri_get_query_as_filename(iri, buf, encoding);
1388 
1389 	return buf->data;
1390 }
1391 
1392 // escaping: see https://tools.ietf.org/html/rfc2396#2 following (especially 2.4.2)
1393 /*const char *iri_escape(const char *uri)
1394 {
1395 	int esc = 0;
1396 	const char *p;
1397 
1398 	for (p = uri; *p; p++) {
1399 		if (*p == '%') {
1400 			if ((isxdigit(p[1]) && isxdigit(p[2])) || p[1] == '%')
1401 				return uri; // assume that URI is already escaped
1402 			esc++;
1403 		} else if ()
1404 	}
1405 }
1406 */
1407 
wget_iri_set_defaultpage(const char * page)1408 void wget_iri_set_defaultpage(const char *page)
1409 {
1410 	default_page = page;
1411 	default_page_length = default_page ? strlen(default_page) : 0;
1412 }
1413 
1414 /**
1415  * \param scheme The scheme for the new default port
1416  * \param port The new default port value for the given scheme
1417  * \return 0: success  -1: Unknown scheme
1418  *
1419  * Set the default \p port for the given \p scheme.
1420  */
wget_iri_set_defaultport(wget_iri_scheme scheme,uint16_t port)1421 int wget_iri_set_defaultport(wget_iri_scheme scheme, uint16_t port)
1422 {
1423 	if ((unsigned) scheme < countof(schemes)) {
1424 		schemes[scheme].port = port;
1425 		return 0;
1426 	}
1427 
1428 	return -1;
1429 }
1430 
1431 /**
1432  * \param[in] iri An IRI
1433  * \param[in] scheme A scheme, such as `http` or `https`.
1434  * \return The original scheme of IRI (ie. before the replacement)
1435  *
1436  * Set the scheme of the provided IRI. The IRI's original scheme
1437  * is replaced by the new one.
1438  *
1439  * If the IRI was using a default port (such as 80 for HTTP or 443 for HTTPS)
1440  * that port is modified as well to match the default port of the new scheme.
1441  * Otherwise the port is left untouched.
1442  */
wget_iri_set_scheme(wget_iri * iri,wget_iri_scheme scheme)1443 wget_iri_scheme wget_iri_set_scheme(wget_iri *iri, wget_iri_scheme scheme)
1444 {
1445 	wget_iri_scheme old_scheme = iri->scheme;
1446 
1447 	if ((unsigned) scheme < countof(schemes) && iri->scheme != scheme) {
1448 		iri->scheme = scheme;
1449 
1450 		// If the IRI is using the default port, also change it
1451 		if (iri->port == schemes[old_scheme].port)
1452 			iri->port = schemes[scheme].port;
1453 
1454 		size_t old_scheme_len = strlen(schemes[old_scheme].name);
1455 
1456 		if (strncmp(iri->uri, schemes[old_scheme].name, old_scheme_len) == 0 && iri->uri[old_scheme_len] == ':') {
1457 			char *new_uri = wget_aprintf("%s%s",  schemes[iri->scheme].name, iri->uri + old_scheme_len);
1458 			if (iri->uri_allocated)
1459 				xfree(iri->uri);
1460 			iri->uri = new_uri;
1461 			iri->uri_allocated = true;
1462 		}
1463 	}
1464 
1465 	return old_scheme;
1466 }
1467 
1468 /** @} */
1469